muscle-3.8.31.orig/0000755000175000017500000000000011656150152013375 5ustar kratzcharlesmuscle-3.8.31.orig/drawtree.cpp0000644000175000017500000000162711352261666015732 0ustar kratzcharles#include "muscle.h" #include "tree.h" /*** Simple tree drawing algorithm. y coordinate of node is index in depth-first traversal. x coordinate is distance from root. ***/ static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex) { const unsigned uRoot = tree.GetRootNodeIndex(); unsigned uDist = 0; while (uNodeIndex != uRoot) { ++uDist; uNodeIndex = tree.GetParent(uNodeIndex); } return uDist; } static void DrawNode(const Tree &tree, unsigned uNodeIndex) { if (!tree.IsLeaf(uNodeIndex)) DrawNode(tree, tree.GetLeft(uNodeIndex)); unsigned uDist = DistFromRoot(tree, uNodeIndex); for (unsigned i = 0; i < 5*uDist; ++i) Log(" "); Log("%d\n", uNodeIndex); if (!tree.IsLeaf(uNodeIndex)) DrawNode(tree, tree.GetRight(uNodeIndex)); } void DrawTree(const Tree &tree) { unsigned uRoot = tree.GetRootNodeIndex(); DrawNode(tree, uRoot); } muscle-3.8.31.orig/progressivealign.cpp0000644000175000017500000000372211352261667017477 0ustar kratzcharles#include "muscle.h" #include #include "tree.h" #include "seqvect.h" #include "profile.h" #include "msa.h" #include "pwpath.h" #include "distfunc.h" #define TRACE 0 void ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a) { assert(GuideTree.IsRooted()); #if TRACE Log("GuideTree:\n"); GuideTree.LogMe(); #endif const unsigned uSeqCount = v.Length(); const unsigned uNodeCount = 2*uSeqCount - 1; ProgNode *ProgNodes = new ProgNode[uNodeCount]; unsigned uJoin = 0; unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); SetProgressDesc("Align node"); do { if (GuideTree.IsLeaf(uTreeNodeIndex)) { if (uTreeNodeIndex >= uNodeCount) Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount); ProgNode &Node = ProgNodes[uTreeNodeIndex]; unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); if (uId >= uSeqCount) Quit("Seq index out of range"); 
const Seq &s = *(v[uId]); Node.m_MSA.FromSeq(s); Node.m_MSA.SetSeqId(0, uId); Node.m_uLength = Node.m_MSA.GetColCount(); } else { Progress(uJoin, uSeqCount - 1); ++uJoin; const unsigned uMergeNodeIndex = uTreeNodeIndex; ProgNode &Parent = ProgNodes[uMergeNodeIndex]; const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex); const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); ProgNode &Node1 = ProgNodes[uLeft]; ProgNode &Node2 = ProgNodes[uRight]; PWPath Path; AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path); Parent.m_uLength = Parent.m_MSA.GetColCount(); Node1.m_MSA.Clear(); Node2.m_MSA.Clear(); } uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); ProgressStepsDone(); unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; a.Copy(RootProgNode.m_MSA); delete[] ProgNodes; ProgNodes = 0; } muscle-3.8.31.orig/treefrommsa.cpp0000644000175000017500000000441011352261673016430 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "tree.h" #include "clust.h" #include "clustsetmsa.h" #include "distcalc.h" static void SaveMSADist(const MSA &msa, MSADist &d, const char *FileName) { FILE *f = fopen(FileName, "w"); if (f == 0) Quit("Cannot create %s", FileName); unsigned n = msa.GetSeqCount(); for (unsigned i = 0; i < n; ++i) { fprintf(f, "%10.10s ", msa.GetSeqName(i)); for (unsigned j = 0; j < n; ++j) fprintf(f, " %9g", d.ComputeDist(msa, i, j)); fprintf(f, "\n"); } fclose(f); } static void TreeFromMSA_NJ(const MSA &msa, Tree &tree, CLUSTER Cluster, DISTANCE Distance, const char *SaveFileName) { MSADist MD(Distance); ClustSetMSA Set(msa, MD); if (SaveFileName != 0) SaveMSADist(msa, MD, SaveFileName); Clust C; C.Create(Set, Cluster); tree.FromClust(C); } static void SaveDC(const DistCalcMSA &DC, const char *FileName) { FILE *f = fopen(FileName, "w"); if (f == 0) Quit("Cannot create %s", FileName); unsigned n = DC.GetCount(); fprintf(f, 
"%u\n", n); float *Dist = new float[n]; for (unsigned i = 0; i < n; ++i) { fprintf(f, "%10.10s ", DC.GetName(i)); DC.CalcDistRange(i, Dist); for (unsigned j = 0; j < i; ++j) fprintf(f, " %9g", Dist[j]); fprintf(f, "\n"); } fclose(f); } static void TreeFromMSA_UPGMA(const MSA &msa, Tree &tree, CLUSTER Cluster, DISTANCE Distance, const char *SaveFileName) { LINKAGE Linkage = LINKAGE_Undefined; switch (Cluster) { case CLUSTER_UPGMA: Linkage = LINKAGE_Avg; break; case CLUSTER_UPGMAMin: Linkage = LINKAGE_Min; break; case CLUSTER_UPGMAMax: Linkage = LINKAGE_Max; break; case CLUSTER_UPGMB: Linkage = LINKAGE_Biased; break; default: Quit("TreeFromMSA_UPGMA, CLUSTER_%u not supported", Cluster); } DistCalcMSA DC; DC.Init(msa, Distance); if (SaveFileName != 0) SaveDC(DC, SaveFileName); UPGMA2(DC, tree, Linkage); } void TreeFromMSA(const MSA &msa, Tree &tree, CLUSTER Cluster, DISTANCE Distance, ROOT Root, const char *SaveFileName) { if (CLUSTER_NeighborJoining == Cluster) TreeFromMSA_NJ(msa, tree, Cluster, Distance, SaveFileName); else TreeFromMSA_UPGMA(msa, tree, Cluster, Distance, SaveFileName); FixRoot(tree, Root); } muscle-3.8.31.orig/distcalc.cpp0000644000175000017500000000357111352261673015701 0ustar kratzcharles#include "muscle.h" #include "distfunc.h" #include "distcalc.h" #include "msa.h" void DistCalcDF::Init(const DistFunc &DF) { m_ptrDF = &DF; } void DistCalcDF::CalcDistRange(unsigned i, dist_t Dist[]) const { for (unsigned j = 0; j < i; ++j) Dist[j] = m_ptrDF->GetDist(i, j); } unsigned DistCalcDF::GetCount() const { return m_ptrDF->GetCount(); } unsigned DistCalcDF::GetId(unsigned i) const { return m_ptrDF->GetId(i); } const char *DistCalcDF::GetName(unsigned i) const { return m_ptrDF->GetName(i); } void DistCalcMSA::Init(const MSA &msa, DISTANCE Distance) { m_ptrMSA = &msa; m_Distance = Distance; } void DistCalcMSA::CalcDistRange(unsigned i, dist_t Dist[]) const { for (unsigned j = 0; j < i; ++j) { switch (m_Distance) { case DISTANCE_PctIdKimura: { const float 
PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j); Dist[j] = (float) KimuraDist(PctId); break; } case DISTANCE_PctIdLog: { const float PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j); Dist[j] = (float) PctIdToMAFFTDist(PctId); break; } case DISTANCE_ScoreDist: { double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2); Dist[j] = (float) GetScoreDist(*m_ptrMSA, i, j); continue; } case DISTANCE_Edit: { const float PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j); if (PctId > 1.0) Quit("Internal error, DISTANCE_Edit, pct id=%.3g", PctId); Dist[j] = (float) 1.0 - PctId; break; } default: Quit("DistCalcMSA: Invalid DISTANCE_%u", m_Distance); } } } unsigned DistCalcMSA::GetCount() const { return m_ptrMSA->GetSeqCount(); } unsigned DistCalcMSA::GetId(unsigned i) const { return m_ptrMSA->GetSeqId(i); } const char *DistCalcMSA::GetName(unsigned i) const { return m_ptrMSA->GetSeqName(i); } muscle-3.8.31.orig/diaglist.cpp0000644000175000017500000002202311352261667015707 0ustar kratzcharles#include "muscle.h" #include "diaglist.h" #include "pwpath.h" #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? 
(x) : (y)) void DiagList::Add(const Diag &d) { if (m_uCount == MAX_DIAGS) Quit("DiagList::Add, overflow %u", m_uCount); m_Diags[m_uCount] = d; ++m_uCount; } void DiagList::Add(unsigned uStartPosA, unsigned uStartPosB, unsigned uLength) { Diag d; d.m_uStartPosA = uStartPosA; d.m_uStartPosB = uStartPosB; d.m_uLength = uLength; Add(d); } const Diag &DiagList::Get(unsigned uIndex) const { if (uIndex >= m_uCount) Quit("DiagList::Get(%u), count=%u", uIndex, m_uCount); return m_Diags[uIndex]; } void DiagList::LogMe() const { Log("DiagList::LogMe, count=%u\n", m_uCount); Log(" n StartA StartB Length\n"); Log("--- ------ ------ ------\n"); for (unsigned n = 0; n < m_uCount; ++n) { const Diag &d = m_Diags[n]; Log("%3u %6u %6u %6u\n", n, d.m_uStartPosA, d.m_uStartPosB, d.m_uLength); } } void DiagList::FromPath(const PWPath &Path) { Clear(); const unsigned uEdgeCount = Path.GetEdgeCount(); unsigned uLength = 0; unsigned uStartPosA; unsigned uStartPosB; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); // Typical cases if (Edge.cType == 'M') { if (0 == uLength) { uStartPosA = Edge.uPrefixLengthA - 1; uStartPosB = Edge.uPrefixLengthB - 1; } ++uLength; } else { if (uLength >= g_uMinDiagLength) Add(uStartPosA, uStartPosB, uLength); uLength = 0; } } // Special case for last edge if (uLength >= g_uMinDiagLength) Add(uStartPosA, uStartPosB, uLength); } bool DiagList::NonZeroIntersection(const Diag &d) const { for (unsigned n = 0; n < m_uCount; ++n) { const Diag &d2 = m_Diags[n]; if (DiagOverlap(d, d2) > 0) return true; } return false; } // DialogOverlap returns the length of the overlapping // section of the two diagonals along the diagonals // themselves; in other words, the length of // the intersection of the two sets of cells in // the matrix. unsigned DiagOverlap(const Diag &d1, const Diag &d2) { // Determine where the diagonals intersect the A // axis (extending them if required). 
If they // intersect at different points, they do not // overlap. Coordinates on a diagonal are // given by B = A + c where c is the value of // A at the intersection with the A axis. // Hence, c = B - A for any point on the diagonal. int c1 = (int) d1.m_uStartPosB - (int) d1.m_uStartPosA; int c2 = (int) d2.m_uStartPosB - (int) d2.m_uStartPosA; if (c1 != c2) return 0; assert(DiagOverlapA(d1, d2) == DiagOverlapB(d1, d2)); return DiagOverlapA(d1, d2); } // DialogOverlapA returns the length of the overlapping // section of the projection of the two diagonals onto // the A axis. unsigned DiagOverlapA(const Diag &d1, const Diag &d2) { unsigned uMaxStart = MAX(d1.m_uStartPosA, d2.m_uStartPosA); unsigned uMinEnd = MIN(d1.m_uStartPosA + d1.m_uLength - 1, d2.m_uStartPosA + d2.m_uLength - 1); int iLength = (int) uMinEnd - (int) uMaxStart + 1; if (iLength < 0) return 0; return (unsigned) iLength; } // DialogOverlapB returns the length of the overlapping // section of the projection of the two diagonals onto // the B axis. unsigned DiagOverlapB(const Diag &d1, const Diag &d2) { unsigned uMaxStart = MAX(d1.m_uStartPosB, d2.m_uStartPosB); unsigned uMinEnd = MIN(d1.m_uStartPosB + d1.m_uLength - 1, d2.m_uStartPosB + d2.m_uLength - 1); int iLength = (int) uMinEnd - (int) uMaxStart + 1; if (iLength < 0) return 0; return (unsigned) iLength; } // Returns true if the two diagonals can be on the // same path through the DP matrix. If DiagCompatible // returns false, they cannot be in the same path // and hence "contradict" each other. bool DiagCompatible(const Diag &d1, const Diag &d2) { if (DiagOverlap(d1, d2) > 0) return true; return 0 == DiagOverlapA(d1, d2) && 0 == DiagOverlapB(d1, d2); } // Returns the length of the "break" between two diagonals. 
unsigned DiagBreak(const Diag &d1, const Diag &d2) { int c1 = (int) d1.m_uStartPosB - (int) d1.m_uStartPosA; int c2 = (int) d2.m_uStartPosB - (int) d2.m_uStartPosA; if (c1 != c2) return 0; int iMaxStart = MAX(d1.m_uStartPosA, d2.m_uStartPosA); int iMinEnd = MIN(d1.m_uStartPosA + d1.m_uLength - 1, d2.m_uStartPosA + d1.m_uLength - 1); int iBreak = iMaxStart - iMinEnd - 1; if (iBreak < 0) return 0; return (unsigned) iBreak; } // Merge diagonals that are continuations of each other with // short breaks of up to length g_uMaxDiagBreak. // In a sorted list of diagonals, we only have to check // consecutive entries. void MergeDiags(DiagList &DL) { return; #if DEBUG if (!DL.IsSorted()) Quit("MergeDiags: !IsSorted"); #endif // TODO: Fix this! // Breaks must be with no offset (no gaps) const unsigned uCount = DL.GetCount(); if (uCount <= 1) return; DiagList NewList; Diag MergedDiag; const Diag *ptrPrev = &DL.Get(0); for (unsigned i = 1; i < uCount; ++i) { const Diag *ptrDiag = &DL.Get(i); unsigned uBreakLength = DiagBreak(*ptrPrev, *ptrDiag); if (uBreakLength <= g_uMaxDiagBreak) { MergedDiag.m_uStartPosA = ptrPrev->m_uStartPosA; MergedDiag.m_uStartPosB = ptrPrev->m_uStartPosB; MergedDiag.m_uLength = ptrPrev->m_uLength + ptrDiag->m_uLength + uBreakLength; ptrPrev = &MergedDiag; } else { NewList.Add(*ptrPrev); ptrPrev = ptrDiag; } } NewList.Add(*ptrPrev); DL.Copy(NewList); } void DiagList::DeleteIncompatible() { assert(IsSorted()); if (m_uCount < 2) return; bool *bFlagForDeletion = new bool[m_uCount]; for (unsigned i = 0; i < m_uCount; ++i) bFlagForDeletion[i] = false; for (unsigned i = 0; i < m_uCount; ++i) { const Diag &di = m_Diags[i]; for (unsigned j = i + 1; j < m_uCount; ++j) { const Diag &dj = m_Diags[j]; // Verify sorted correctly assert(di.m_uStartPosA <= dj.m_uStartPosA); // If two diagonals are incompatible and // one is is much longer than the other, // keep the longer one. 
if (!DiagCompatible(di, dj)) { if (di.m_uLength > dj.m_uLength*4) bFlagForDeletion[j] = true; else if (dj.m_uLength > di.m_uLength*4) bFlagForDeletion[i] = true; else { bFlagForDeletion[i] = true; bFlagForDeletion[j] = true; } } } } for (unsigned i = 0; i < m_uCount; ++i) { const Diag &di = m_Diags[i]; if (bFlagForDeletion[i]) continue; for (unsigned j = i + 1; j < m_uCount; ++j) { const Diag &dj = m_Diags[j]; if (bFlagForDeletion[j]) continue; // Verify sorted correctly assert(di.m_uStartPosA <= dj.m_uStartPosA); // If sort order in B different from sorted order in A, // either diags are incompatible or we detected a repeat // or permutation. if (di.m_uStartPosB >= dj.m_uStartPosB || !DiagCompatible(di, dj)) { bFlagForDeletion[i] = true; bFlagForDeletion[j] = true; } } } unsigned uNewCount = 0; Diag *NewDiags = new Diag[m_uCount]; for (unsigned i = 0; i < m_uCount; ++i) { if (bFlagForDeletion[i]) continue; const Diag &d = m_Diags[i]; NewDiags[uNewCount] = d; ++uNewCount; } memcpy(m_Diags, NewDiags, uNewCount*sizeof(Diag)); m_uCount = uNewCount; delete[] NewDiags; } void DiagList::Copy(const DiagList &DL) { Clear(); unsigned uCount = DL.GetCount(); for (unsigned i = 0; i < uCount; ++i) Add(DL.Get(i)); } // Check if sorted in increasing order of m_uStartPosA bool DiagList::IsSorted() const { return true; unsigned uCount = GetCount(); for (unsigned i = 1; i < uCount; ++i) if (m_Diags[i-1].m_uStartPosA > m_Diags[i].m_uStartPosA) return false; return true; } // Sort in increasing order of m_uStartPosA // Dumb bubble sort, but don't care about speed // because don't get long lists. 
void DiagList::Sort() { if (m_uCount < 2) return; bool bContinue = true; while (bContinue) { bContinue = false; for (unsigned i = 0; i < m_uCount - 1; ++i) { if (m_Diags[i].m_uStartPosA > m_Diags[i+1].m_uStartPosA) { Diag Tmp = m_Diags[i]; m_Diags[i] = m_Diags[i+1]; m_Diags[i+1] = Tmp; bContinue = true; } } } } //void TestDiag() // { // Diag d1; // Diag d2; // Diag d3; // // d1.m_uStartPosA = 0; // d1.m_uStartPosB = 1; // d1.m_uLength = 32; // // d2.m_uStartPosA = 55; // d2.m_uStartPosB = 70; // d2.m_uLength = 36; // // d3.m_uStartPosA = 102; // d3.m_uStartPosB = 122; // d3.m_uLength = 50; // // DiagList DL; // DL.Add(d1); // DL.Add(d2); // DL.Add(d3); // // Log("Before DeleteIncompatible:\n"); // DL.LogMe(); // DL.DeleteIncompatible(); // // Log("After DeleteIncompatible:\n"); // DL.LogMe(); // // MergeDiags(DL); // Log("After Merge:\n"); // DL.LogMe(); // // DPRegionList RL; // DiagListToDPRegionList(DL, RL, 200, 200); // RL.LogMe(); // } muscle-3.8.31.orig/scorehistory.cpp0000644000175000017500000000473611352261600016642 0ustar kratzcharles#include "muscle.h" #include "scorehistory.h" #include #define TRACE 0 ScoreHistory::ScoreHistory(unsigned uIters, unsigned uNodeCount) { m_uNodeCount = uNodeCount; m_uIters = uIters; m_Score = new SCORE *[uIters]; m_bScoreSet = new bool *[uIters]; for (unsigned n = 0; n < uIters; ++n) { m_Score[n] = new SCORE[uNodeCount*2]; m_bScoreSet[n] = new bool[uNodeCount*2]; memset(m_bScoreSet[n], 0, uNodeCount*2*sizeof(bool)); } } ScoreHistory::~ScoreHistory() { for (unsigned n = 0; n < m_uIters; ++n) { delete[] m_Score[n]; delete[] m_bScoreSet[n]; } delete[] m_Score; delete[] m_bScoreSet; } bool ScoreHistory::SetScore(unsigned uIter, unsigned uNodeIndex, bool bRight, SCORE Score) { #if TRACE Log("ScoreHistory::SetScore(Iter=%u Node=%u Right=%d Score=%g)\n", uIter, uNodeIndex, bRight, Score); #endif if (uIter >= m_uIters) Quit("ScoreHistory::SetScore-1"); if (uNodeIndex >= m_uNodeCount) Quit("ScoreHistory::SetScore-2"); const unsigned 
uIndex = uNodeIndex*2 + bRight; for (unsigned n = 1; n < uIter; ++n) { const unsigned uPrevIter = n - 1; if (!m_bScoreSet[uPrevIter][uIndex]) { LogMe(); Quit("ScoreHistory::SetScore-3"); } if (m_Score[uPrevIter][uIndex] == Score) { ProgressStepsDone(); #if TRACE Log("Oscillating\n"); #endif return true; } } m_Score[uIter][uIndex] = Score; m_bScoreSet[uIter][uIndex] = true; return false; } void ScoreHistory::LogMe() const { Log("ScoreHistory\n"); Log("Iter Node Right Score\n"); Log("---- ---- ----- ---------\n"); for (unsigned uIter = 0; uIter < m_uIters; ++uIter) { bool bAnySet = false; for (unsigned n = 0; n < m_uNodeCount*2; ++n) if (m_bScoreSet[uIter][n]) { bAnySet = true; break; } if (!bAnySet) return; for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { const unsigned uBase = 2*uNodeIndex; if (m_bScoreSet[uIter][uBase]) Log("%4u %4u F %9.3f\n", uIter, uNodeIndex, m_Score[uIter][uBase]); if (m_bScoreSet[uIter][uBase+1]) Log("%4u %4u T %9.3f\n", uIter, uNodeIndex, m_Score[uIter][uBase+1]); } } } SCORE ScoreHistory::GetScore(unsigned uIter, unsigned uNodeIndex, bool bReverse, bool bRight) const { const unsigned uIndex = uNodeIndex*2 + bRight; if (!m_bScoreSet[uIter][uIndex]) Quit("ScoreHistory::GetScore"); return m_Score[uIter][uIndex]; } muscle-3.8.31.orig/gonnet.cpp0000644000175000017500000005745511352261600015405 0ustar kratzcharles#include "muscle.h" #include "gonnet.h" #define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ { A/4.0, C/4.0, D/4.0, E/4.0, F/4.0, G/4.0, H/4.0, I/4.0, K/4.0, L/4.0, M/4.0, N/4.0, P/4.0, Q/4.0, R/4.0, S/4.0, T/4.0, V/4.0, W/4.0, Y/4.0 }, static double Gonnet80[20][20] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 1990, 1140, 930, 1070, 600, 1130, 850, 810, 940, 810, 980, 900, 1080, 1020, 880, 1380, 1190, 1180, 370, 590) // A ROW( 1140, 2780, 310, 300, 850, 630, 810, 700, 360, 690, 850, 690, 310, 480, 640, 1090, 900, 1030, 810, 920) // C ROW( 930, 310, 2200, 1550, 130, 980, 1070, 180, 
1030, 150, 360, 1450, 820, 1150, 800, 1100, 1000, 350, 0, 550) // D ROW( 1070, 300, 1550, 2120, 220, 770, 1070, 510, 1280, 490, 710, 1110, 890, 1470, 1010, 1050, 970, 730, 260, 500) // E ROW( 600, 850, 130, 220, 2380, 90, 980, 1090, 350, 1310, 1270, 490, 310, 540, 340, 470, 620, 930, 1400, 1730) // F ROW( 1130, 630, 980, 770, 90, 2210, 710, 100, 740, 200, 410, 1060, 660, 800, 810, 1080, 720, 380, 430, 300) // G ROW( 850, 810, 1070, 1070, 980, 710, 2510, 600, 1120, 670, 860, 1330, 790, 1380, 1140, 990, 1000, 590, 810, 1450) // H ROW( 810, 700, 180, 510, 1090, 100, 600, 2100, 650, 1460, 1490, 530, 490, 640, 530, 620, 960, 1650, 610, 770) // I ROW( 940, 360, 1030, 1280, 350, 740, 1120, 650, 2090, 660, 870, 1220, 870, 1410, 1570, 1040, 1090, 700, 350, 640) // K ROW( 810, 690, 150, 490, 1310, 200, 670, 1460, 660, 2010, 1550, 450, 660, 850, 660, 600, 750, 1270, 800, 890) // L ROW( 980, 850, 360, 710, 1270, 410, 860, 1490, 870, 1550, 2410, 620, 460, 1050, 710, 830, 990, 1250, 790, 870) // M ROW( 900, 690, 1450, 1110, 490, 1060, 1330, 530, 1220, 450, 620, 2210, 760, 1180, 1020, 1290, 1170, 550, 380, 850) // N ROW( 1080, 310, 820, 890, 310, 660, 790, 490, 870, 660, 460, 760, 2380, 1000, 790, 1100, 1040, 670, 120, 480) // P ROW( 1020, 480, 1150, 1470, 540, 800, 1380, 640, 1410, 850, 1050, 1180, 1000, 2190, 1350, 1090, 1060, 730, 620, 710) // Q ROW( 880, 640, 800, 1010, 340, 810, 1140, 530, 1570, 660, 710, 1020, 790, 1350, 2210, 970, 970, 640, 830, 740) // R ROW( 1380, 1090, 1100, 1050, 470, 1080, 990, 620, 1040, 600, 830, 1290, 1100, 1090, 970, 2020, 1490, 810, 520, 780) // S ROW( 1190, 900, 1000, 970, 620, 720, 1000, 960, 1090, 750, 990, 1170, 1040, 1060, 970, 1490, 2050, 1150, 370, 660) // T ROW( 1180, 1030, 350, 730, 930, 380, 590, 1650, 700, 1270, 1250, 550, 670, 730, 640, 810, 1150, 2040, 440, 770) // V ROW( 370, 810, 0, 260, 1400, 430, 810, 610, 350, 800, 790, 380, 120, 620, 830, 520, 370, 440, 2970, 1470) // W ROW( 590, 920, 550, 500, 1730, 300, 1450, 770, 640, 890, 
870, 850, 480, 710, 740, 780, 660, 770, 1470, 2470) // Y }; static double Gonnet120[20][20] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 1550, 950, 780, 870, 480, 930, 700, 690, 770, 660, 790, 760, 900, 840, 730, 1120, 980, 960, 280, 480) // A ROW( 950, 2400, 270, 280, 700, 510, 650, 600, 320, 570, 700, 550, 280, 400, 510, 890, 750, 850, 670, 760) // C ROW( 780, 270, 1780, 1310, 90, 820, 890, 160, 880, 140, 320, 1220, 680, 970, 690, 910, 830, 310, 0, 430) // D ROW( 870, 280, 1310, 1680, 180, 650, 900, 410, 1070, 390, 560, 950, 740, 1210, 860, 870, 810, 580, 180, 400) // E ROW( 480, 700, 90, 180, 1980, 40, 820, 930, 290, 1110, 1070, 380, 240, 430, 280, 380, 490, 790, 1230, 1510) // F ROW( 930, 510, 820, 650, 40, 1860, 590, 90, 620, 140, 310, 890, 550, 660, 660, 900, 610, 310, 300, 220) // G ROW( 700, 650, 890, 900, 820, 590, 2060, 480, 940, 540, 680, 1100, 650, 1130, 950, 820, 820, 490, 680, 1220) // H ROW( 690, 600, 160, 410, 930, 90, 480, 1680, 520, 1240, 1250, 410, 400, 530, 430, 520, 790, 1380, 500, 650) // I ROW( 770, 320, 880, 1070, 290, 620, 940, 520, 1650, 520, 690, 1010, 720, 1160, 1320, 860, 900, 570, 280, 520) // K ROW( 660, 570, 140, 390, 1110, 140, 540, 1240, 520, 1620, 1300, 350, 520, 660, 520, 490, 620, 1090, 670, 760) // L ROW( 790, 700, 320, 560, 1070, 310, 680, 1250, 690, 1300, 1910, 500, 400, 820, 580, 670, 800, 1060, 650, 740) // M ROW( 760, 550, 1220, 950, 380, 890, 1100, 410, 1010, 350, 500, 1760, 640, 970, 860, 1060, 960, 460, 280, 680) // N ROW( 900, 280, 680, 740, 240, 550, 650, 400, 720, 520, 400, 640, 2010, 820, 660, 910, 860, 540, 70, 370) // P ROW( 840, 400, 970, 1210, 430, 660, 1130, 530, 1160, 660, 820, 970, 820, 1700, 1120, 890, 870, 600, 470, 580) // Q ROW( 730, 510, 690, 860, 280, 660, 950, 430, 1320, 520, 580, 860, 660, 1120, 1790, 810, 800, 520, 660, 590) // R ROW( 1120, 890, 910, 870, 380, 900, 820, 520, 860, 490, 670, 1060, 910, 890, 810, 1560, 1220, 680, 390, 610) // S ROW( 980, 750, 830, 810, 490, 610, 820, 790, 900, 
620, 800, 960, 860, 870, 800, 1220, 1600, 930, 290, 540) // T ROW( 960, 850, 310, 580, 790, 310, 490, 1380, 570, 1090, 1060, 460, 540, 600, 520, 680, 930, 1610, 370, 630) // V ROW( 280, 670, 0, 180, 1230, 300, 680, 500, 280, 670, 650, 280, 70, 470, 660, 390, 290, 370, 2620, 1290) // W ROW( 480, 760, 430, 400, 1510, 220, 1220, 650, 520, 760, 740, 680, 370, 580, 590, 610, 540, 630, 1290, 2070) // Y }; static SCORE Gonnet160[20][20] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 1240, 810, 670, 740, 400, 800, 600, 600, 660, 560, 660, 660, 770, 710, 620, 940, 830, 790, 230, 410) // A ROW( 810, 2130, 250, 260, 600, 440, 550, 530, 300, 490, 590, 470, 260, 360, 430, 760, 640, 720, 570, 650) // C ROW( 670, 250, 1480, 1120, 80, 710, 770, 160, 770, 130, 280, 1040, 590, 840, 620, 780, 720, 290, 0, 360) // D ROW( 740, 260, 1120, 1370, 160, 570, 770, 350, 910, 330, 470, 830, 640, 1010, 750, 750, 700, 480, 140, 340) // E ROW( 400, 600, 80, 160, 1690, 20, 710, 810, 250, 970, 920, 310, 200, 370, 250, 330, 420, 700, 1100, 1340) // F ROW( 800, 440, 710, 570, 20, 1600, 510, 80, 540, 110, 260, 760, 480, 570, 570, 770, 540, 260, 230, 180) // G ROW( 600, 550, 770, 770, 710, 510, 1710, 410, 800, 460, 570, 930, 560, 950, 810, 700, 700, 430, 590, 1050) // H ROW( 600, 530, 160, 350, 810, 80, 410, 1370, 430, 1080, 1070, 340, 350, 460, 370, 450, 660, 1180, 440, 580) // I ROW( 660, 300, 770, 910, 250, 540, 800, 430, 1330, 440, 570, 860, 620, 980, 1130, 740, 760, 480, 240, 430) // K ROW( 560, 490, 130, 330, 970, 110, 460, 1080, 440, 1350, 1120, 300, 430, 540, 430, 420, 540, 950, 580, 670) // L ROW( 660, 590, 280, 470, 920, 260, 570, 1070, 570, 1120, 1540, 420, 360, 660, 490, 550, 670, 920, 560, 650) // M ROW( 660, 470, 1040, 830, 310, 760, 930, 340, 860, 300, 420, 1430, 560, 830, 740, 890, 810, 400, 230, 560) // N ROW( 770, 260, 590, 640, 200, 480, 560, 350, 620, 430, 360, 560, 1740, 700, 570, 780, 740, 460, 40, 300) // P ROW( 710, 360, 840, 1010, 370, 570, 950, 460, 980, 540, 660, 830, 
700, 1340, 950, 760, 740, 510, 380, 490) // Q ROW( 620, 430, 620, 750, 250, 570, 810, 370, 1130, 430, 490, 740, 570, 950, 1490, 690, 690, 440, 540, 490) // R ROW( 940, 760, 780, 750, 330, 770, 700, 450, 740, 420, 550, 890, 780, 760, 690, 1220, 1010, 580, 310, 500) // S ROW( 830, 640, 720, 700, 420, 540, 700, 660, 760, 540, 670, 810, 740, 740, 690, 1010, 1280, 780, 240, 460) // T ROW( 790, 720, 290, 480, 700, 260, 430, 1180, 480, 950, 920, 400, 460, 510, 440, 580, 780, 1310, 330, 540) // V ROW( 230, 570, 0, 140, 1100, 230, 590, 440, 240, 580, 560, 230, 40, 380, 540, 310, 240, 330, 2360, 1160) // W ROW( 410, 650, 360, 340, 1340, 180, 1050, 580, 430, 670, 650, 560, 300, 490, 490, 500, 460, 540, 1160, 1780) // Y }; double Gonnet16[21][21] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 124, 81, 67, 74, 40, 80, 60, 60, 66, 56, 66, 66, 77, 71, 62, 94, 83, 79, 23, 41) // A ROW( 81, 213, 25, 26, 60, 44, 55, 53, 30, 49, 59, 47, 26, 36, 43, 76, 64, 72, 57, 65) // C ROW( 67, 25, 148, 112, 8, 71, 77, 16, 77, 13, 28, 104, 59, 84, 62, 78, 72, 29, 0, 36) // D ROW( 74, 26, 112, 137, 16, 57, 77, 35, 91, 33, 47, 83, 64, 101, 75, 75, 70, 48, 14, 34) // E ROW( 40, 60, 8, 16, 169, 2, 71, 81, 25, 97, 92, 31, 20, 37, 25, 33, 42, 70, 110, 134) // F ROW( 80, 44, 71, 57, 2, 160, 51, 8, 54, 11, 26, 76, 48, 57, 57, 77, 54, 26, 23, 18) // G ROW( 60, 55, 77, 77, 71, 51, 171, 41, 80, 46, 57, 93, 56, 95, 81, 70, 70, 43, 59, 105) // H ROW( 60, 53, 16, 35, 81, 8, 41, 137, 43, 108, 107, 34, 35, 46, 37, 45, 66, 118, 44, 58) // I ROW( 66, 30, 77, 91, 25, 54, 80, 43, 133, 44, 57, 86, 62, 98, 113, 74, 76, 48, 24, 43) // K ROW( 56, 49, 13, 33, 97, 11, 46, 108, 44, 135, 112, 30, 43, 54, 43, 42, 54, 95, 58, 67) // L ROW( 66, 59, 28, 47, 92, 26, 57, 107, 57, 112, 154, 42, 36, 66, 49, 55, 67, 92, 56, 65) // M ROW( 66, 47, 104, 83, 31, 76, 93, 34, 86, 30, 42, 143, 56, 83, 74, 89, 81, 40, 23, 56) // N ROW( 77, 26, 59, 64, 20, 48, 56, 35, 62, 43, 36, 56, 174, 70, 57, 78, 74, 46, 4, 30) // P ROW( 71, 36, 
84, 101, 37, 57, 95, 46, 98, 54, 66, 83, 70, 134, 95, 76, 74, 51, 38, 49) // Q ROW( 62, 43, 62, 75, 25, 57, 81, 37, 113, 43, 49, 74, 57, 95, 149, 69, 69, 44, 54, 49) // R ROW( 94, 76, 78, 75, 33, 77, 70, 45, 74, 42, 55, 89, 78, 76, 69, 122, 101, 58, 31, 50) // S ROW( 83, 64, 72, 70, 42, 54, 70, 66, 76, 54, 67, 81, 74, 74, 69, 101, 128, 78, 24, 46) // T ROW( 79, 72, 29, 48, 70, 26, 43, 118, 48, 95, 92, 40, 46, 51, 44, 58, 78, 131, 33, 54) // V ROW( 23, 57, 0, 14, 110, 23, 59, 44, 24, 58, 56, 23, 4, 38, 54, 31, 24, 33, 236, 116) // W ROW( 41, 65, 36, 34, 134, 18, 105, 58, 43, 67, 65, 56, 30, 49, 49, 50, 46, 54, 116, 178) // Y }; static double Gonnet250[20][20] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 760, 570, 490, 520, 290, 570, 440, 440, 480, 400, 450, 490, 550, 500, 460, 630, 580, 530, 160, 300) // A ROW( 570, 1670, 200, 220, 440, 320, 390, 410, 240, 370, 430, 340, 210, 280, 300, 530, 470, 520, 420, 470) // C ROW( 490, 200, 990, 790, 70, 530, 560, 140, 570, 120, 220, 740, 450, 610, 490, 570, 520, 230, 0, 240) // D ROW( 520, 220, 790, 880, 130, 440, 560, 250, 640, 240, 320, 610, 470, 690, 560, 540, 510, 330, 90, 250) // E ROW( 290, 440, 70, 130, 1220, 0, 510, 620, 190, 720, 680, 210, 140, 260, 200, 240, 300, 530, 880, 1030) // F ROW( 570, 320, 530, 440, 0, 1180, 380, 70, 410, 80, 170, 560, 360, 420, 420, 560, 410, 190, 120, 120) // G ROW( 440, 390, 560, 560, 510, 380, 1120, 300, 580, 330, 390, 640, 410, 640, 580, 500, 490, 320, 440, 740) // H ROW( 440, 410, 140, 250, 620, 70, 300, 920, 310, 800, 770, 240, 260, 330, 280, 340, 460, 830, 340, 450) // I ROW( 480, 240, 570, 640, 190, 410, 580, 310, 840, 310, 380, 600, 460, 670, 790, 530, 530, 350, 170, 310) // K ROW( 400, 370, 120, 240, 720, 80, 330, 800, 310, 920, 800, 220, 290, 360, 300, 310, 390, 700, 450, 520) // L ROW( 450, 430, 220, 320, 680, 170, 390, 770, 380, 800, 950, 300, 280, 420, 350, 380, 460, 680, 420, 500) // M ROW( 490, 340, 740, 610, 210, 560, 640, 240, 600, 220, 300, 900, 430, 590, 550, 
610, 570, 300, 160, 380) // N ROW( 550, 210, 450, 470, 140, 360, 410, 260, 460, 290, 280, 430, 1280, 500, 430, 560, 530, 340, 20, 210) // P ROW( 500, 280, 610, 690, 260, 420, 640, 330, 670, 360, 420, 590, 500, 790, 670, 540, 520, 370, 250, 350) // Q ROW( 460, 300, 490, 560, 200, 420, 580, 280, 790, 300, 350, 550, 430, 670, 990, 500, 500, 320, 360, 340) // R ROW( 630, 530, 570, 540, 240, 560, 500, 340, 530, 310, 380, 610, 560, 540, 500, 740, 670, 420, 190, 330) // S ROW( 580, 470, 520, 510, 300, 410, 490, 460, 530, 390, 460, 570, 530, 520, 500, 670, 770, 520, 170, 330) // T ROW( 530, 520, 230, 330, 530, 190, 320, 830, 350, 700, 680, 300, 340, 370, 320, 420, 520, 860, 260, 410) // V ROW( 160, 420, 0, 90, 880, 120, 440, 340, 170, 450, 420, 160, 20, 250, 360, 190, 170, 260, 1940, 930) // W ROW( 300, 470, 240, 250, 1030, 120, 740, 450, 310, 520, 500, 380, 210, 350, 340, 330, 330, 410, 930, 1300) // Y }; static double Gonnet350[20][20] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 450, 390, 350, 360, 210, 400, 310, 310, 340, 280, 310, 350, 380, 350, 330, 410, 390, 350, 110, 210) // A ROW( 390, 1280, 160, 180, 320, 230, 270, 300, 190, 280, 310, 240, 170, 210, 220, 360, 330, 370, 310, 340) // C ROW( 350, 160, 640, 540, 50, 390, 400, 110, 410, 100, 160, 500, 330, 430, 370, 400, 370, 170, 0, 170) // D ROW( 360, 180, 540, 550, 100, 330, 390, 180, 440, 170, 220, 440, 350, 460, 410, 380, 360, 230, 60, 180) // E ROW( 210, 320, 50, 100, 860, 0, 360, 460, 140, 530, 490, 150, 100, 190, 150, 170, 220, 400, 700, 770) // F ROW( 400, 230, 390, 330, 0, 860, 280, 60, 310, 50, 120, 400, 280, 310, 310, 400, 300, 140, 50, 80) // G ROW( 310, 270, 400, 390, 360, 280, 680, 220, 400, 240, 270, 430, 300, 420, 410, 350, 340, 240, 320, 500) // H ROW( 310, 300, 110, 180, 460, 60, 220, 620, 220, 570, 540, 170, 190, 240, 200, 240, 320, 570, 260, 340) // I ROW( 340, 190, 410, 440, 140, 310, 400, 220, 530, 210, 260, 420, 330, 450, 530, 370, 370, 250, 120, 210) // K ROW( 280, 280, 100, 170, 
530, 50, 240, 570, 210, 630, 560, 160, 200, 240, 210, 220, 280, 510, 340, 400) // L ROW( 310, 310, 160, 220, 490, 120, 270, 540, 260, 560, 580, 210, 210, 280, 240, 260, 310, 490, 320, 370) // M ROW( 350, 240, 500, 440, 150, 400, 430, 170, 420, 160, 210, 550, 320, 410, 390, 410, 390, 220, 110, 250) // N ROW( 380, 170, 330, 350, 100, 280, 300, 190, 330, 200, 210, 320, 910, 350, 310, 390, 370, 240, 10, 150) // P ROW( 350, 210, 430, 460, 190, 310, 420, 240, 450, 240, 280, 410, 350, 470, 450, 370, 360, 260, 160, 240) // Q ROW( 330, 220, 370, 410, 150, 310, 410, 200, 530, 210, 240, 390, 310, 450, 630, 360, 350, 230, 230, 230) // R ROW( 410, 360, 400, 380, 170, 400, 350, 240, 370, 220, 260, 410, 390, 370, 360, 450, 430, 290, 130, 230) // S ROW( 390, 330, 370, 360, 220, 300, 340, 320, 370, 280, 310, 390, 370, 360, 350, 430, 460, 350, 120, 230) // T ROW( 350, 370, 170, 230, 400, 140, 240, 570, 250, 510, 490, 220, 240, 260, 230, 290, 350, 560, 210, 310) // V ROW( 110, 310, 0, 60, 700, 50, 320, 260, 120, 340, 320, 110, 10, 160, 230, 130, 120, 210, 1590, 740) // W ROW( 210, 340, 170, 180, 770, 80, 500, 340, 210, 400, 370, 250, 150, 240, 230, 230, 230, 310, 740, 920) // Y }; const t_ROW *GetGonnetMatrix(unsigned N) { switch (N) { case 80: return Gonnet80; case 120: return Gonnet120; //case 16: // return Gonnet16; //case 160: // return Gonnet160; case 250: return Gonnet250; case 350: return Gonnet350; } Quit("Invalid Gonnet%u", N); return 0; } //SCORE GetGonnetGapOpen(unsigned N) // { // switch (N) // { // case 80: // return -639; // case 120: // return -863; // case 160: // return -611; // case 250: // return -308; // case 350: // return -158; // } // Quit("Invalid Gonnet%u", N); // return 0; // } SCORE GetGonnetGapOpen(unsigned N) { switch (N) { case 80: return -1000; case 120: return -800; case 160: return -700; case 250: return -200; case 350: return -175; } Quit("Invalid Gonnet%u", N); return 0; } SCORE GetGonnetGapExtend(unsigned N) { switch (N) { case 80: return 350; case 
120: return 200; case 160: return 175; case 250: return 20; case 350: return 20; } Quit("Invalid Gonnet%u", N); return 0; } //double GonnetLookup[400][400]; // //static bool InitGonnetLookup() // { // for (unsigned i = 0; i < 400; ++i) // { // const unsigned A1 = i/20; // const unsigned A2 = i%20; // for (unsigned j = 0; j <= i; ++j) // { // const unsigned B1 = j/20; // const unsigned B2 = j%20; // // const double s00 = Gonnet16[A1][B1]; // const double s01 = Gonnet16[A1][B2]; // const double s10 = Gonnet16[A2][B1]; // const double s11 = Gonnet16[A2][B2]; // // GonnetLookup[i][j] = GonnetLookup[j][i] = (s00 + s01 + s10 + s11)/4; // } // } // return true; // } // //static bool bGonnetLookupInitialized = InitGonnetLookup(); muscle-3.8.31.orig/bittraceback.cpp0000644000175000017500000000640211352261666016527 0ustar kratzcharles#include "muscle.h" #include "pwpath.h" #define TRACE 0 static char XlatEdgeType(char c) { if ('E' == c) return 'D'; if ('J' == c) return 'I'; return c; } static const char *BitsToStr(char Bits) { static char Str[] = "xM xD xI"; switch (Bits & BIT_xM) { case BIT_MM: Str[0] = 'M'; break; case BIT_DM: Str[0] = 'D'; break; case BIT_IM: Str[0] = 'I'; break; } switch (Bits & BIT_xD) { case BIT_MD: Str[3] = 'M'; break; case BIT_DD: Str[3] = 'D'; break; } switch (Bits & BIT_xI) { case BIT_MI: Str[6] = 'M'; break; case BIT_II: Str[6] = 'I'; break; } return Str; } static inline char XChar(char Bits, char cType) { switch (cType) { case 'M': { switch (Bits & BIT_xM) { case BIT_MM: return 'M'; case BIT_DM: return 'D'; case BIT_IM: return 'I'; #if DOUBLE_AFFINE case BIT_EM: return 'E'; case BIT_JM: return 'J'; #endif } Quit("Huh!?"); return '?'; } case 'D': { switch (Bits & BIT_xD) { case BIT_MD: return 'M'; case BIT_DD: return 'D'; } Quit("Huh!?"); return '?'; } case 'I': { switch (Bits & BIT_xI) { case BIT_MI: return 'M'; case BIT_II: return 'I'; } Quit("Huh!?"); return '?'; } #if DOUBLE_AFFINE case 'E': { switch (Bits & BIT_xE) { case BIT_ME: return 'M'; 
case BIT_EE: return 'E'; } Quit("Huh!?"); return '?'; } case 'J': { switch (Bits & BIT_xJ) { case BIT_MJ: return 'M'; case BIT_JJ: return 'J'; } Quit("Huh!?"); return '?'; } #endif default: Quit("Huh?"); return '?'; } } void BitTraceBack(char **TraceBack, unsigned uLengthA, unsigned uLengthB, char LastEdge, PWPath &Path) { #if TRACE Log("BitTraceBack\n"); #endif Path.Clear(); PWEdge Edge; Edge.uPrefixLengthA = uLengthA; Edge.uPrefixLengthB = uLengthB; char Bits = TraceBack[uLengthA][uLengthB]; Edge.cType = LastEdge; for (;;) { #if TRACE Log("Prepend %c%d.%d\n", Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); #endif char cSave = Edge.cType; Edge.cType = XlatEdgeType(cSave); Path.PrependEdge(Edge); Edge.cType = cSave; unsigned PLA = Edge.uPrefixLengthA; unsigned PLB = Edge.uPrefixLengthB; char Bits = TraceBack[PLA][PLB]; char NextEdgeType = XChar(Bits, Edge.cType); #if TRACE Log("XChar(%s, %c) = %c\n", BitsToStr(Bits), Edge.cType, NextEdgeType); #endif switch (Edge.cType) { case 'M': { if (Edge.uPrefixLengthA == 0) Quit("BitTraceBack MA=0"); if (Edge.uPrefixLengthB == 0) Quit("BitTraceBack MA=0"); --(Edge.uPrefixLengthA); --(Edge.uPrefixLengthB); break; } case 'D': case 'E': { if (Edge.uPrefixLengthA == 0) Quit("BitTraceBack DA=0"); --(Edge.uPrefixLengthA); break; } case 'I': case 'J': { if (Edge.uPrefixLengthB == 0) Quit("BitTraceBack IB=0"); --(Edge.uPrefixLengthB); break; } default: Quit("BitTraceBack: Invalid edge %c", Edge); } if (0 == Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB) break; Edge.cType = NextEdgeType; } #if TRACE Path.LogMe(); #endif } muscle-3.8.31.orig/globalsosx.cpp0000644000175000017500000000345011366144056016264 0ustar kratzcharles#ifdef __MACH__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include const double DEFAULT_RAM = 1e9; const double DEFAULT_MEM_USE = 1e6; double GetNAN() { static unsigned long nan[2]={0xffffffff, 0x7fffffff}; 
// Returns the value of the "hw.memsize" sysctl as a double, or DEFAULT_RAM
// if the query fails. A successful non-zero result is cached so the sysctl
// is issued only once; failures are not cached and are retried on the next
// call.
double GetRAMSize()
	{
	static double CACHED_RAM = 0;
	if (CACHED_RAM != 0)
		return CACHED_RAM;

	uint64_t MemSize = 0;
	size_t Len = sizeof(MemSize);
	if (sysctlbyname("hw.memsize", &MemSize, &Len, NULL, 0) < 0)
		return DEFAULT_RAM;

	// Previously the cache variable was tested but never written, so every
	// call repeated the sysctl.
	CACHED_RAM = (double) MemSize;
	return CACHED_RAM;
	}
&p1.GetEdge(uEdgeIndex1); Edge2 = &p2.GetEdge(uEdgeIndex2); #if TRACE Log("e1[%u] PLA%u PLB%u %c, e2[%u] PLA%u PLB %u %c DC1=%u DC2=%u\n", uEdgeIndex1, Edge1->uPrefixLengthA, Edge1->uPrefixLengthB, Edge1->cType, uEdgeIndex2, Edge2->uPrefixLengthA, Edge2->uPrefixLengthB, Edge2->cType, uDiffCount1, uDiffCount2); #endif if (Edge1->uPrefixLengthA == Edge2->uPrefixLengthA && Edge1->uPrefixLengthB == Edge2->uPrefixLengthB) { if (!Edge1->Equal(*Edge2)) { Edges1[uDiffCount1++] = uEdgeIndex1; Edges2[uDiffCount2++] = uEdgeIndex2; } ++uEdgeIndex1; ++uEdgeIndex2; } else if (Edge2->uPrefixLengthA < Edge1->uPrefixLengthA || Edge2->uPrefixLengthB < Edge1->uPrefixLengthB) Edges2[uDiffCount2++] = uEdgeIndex2++; else if (Edge1->uPrefixLengthA < Edge2->uPrefixLengthA || Edge1->uPrefixLengthB < Edge2->uPrefixLengthB) Edges1[uDiffCount1++] = uEdgeIndex1++; if (uEdgeCount1 == uEdgeIndex1) { while (uEdgeIndex2 < uEdgeCount2) Edges2[uDiffCount2++] = uEdgeIndex2++; goto Done; } if (uEdgeCount2 == uEdgeIndex2) { while (uEdgeIndex1 < uEdgeCount1) Edges1[uDiffCount1++] = uEdgeIndex1++; goto Done; } if (uEdgeIndex1 == uEdgeIndexTop1 && uEdgeIndex2 == uEdgeIndexTop2) Quit("DiffPaths stuck"); } Done:; #if TRACE Log("DiffCount1=%u (%u %u)\n", uDiffCount1, uEdgeCount1, uEdgeCount2); Log("Diffs1="); for (unsigned i = 0; i < uDiffCount1; ++i) { const PWEdge e = p1.GetEdge(Edges1[i]); Log(" %u=%c%u.%u", Edges1[i], e.cType, e.uPrefixLengthA, e.uPrefixLengthB); } Log("\n"); Log("DiffCount2=%u\n", uDiffCount2); Log("Diffs2="); for (unsigned i = 0; i < uDiffCount2; ++i) { const PWEdge e = p2.GetEdge(Edges2[i]); Log(" %u=%c%u.%u", Edges2[i], e.cType, e.uPrefixLengthA, e.uPrefixLengthB); } Log("\n"); #endif *ptruDiffCount1 = uDiffCount1; *ptruDiffCount2 = uDiffCount2; } void TestDiffPaths() { PWPath p1; PWPath p2; p1.AppendEdge('M', 1, 1); p1.AppendEdge('M', 2, 2); p1.AppendEdge('M', 3, 3); p2.AppendEdge('M', 1, 1); p2.AppendEdge('D', 2, 1); p2.AppendEdge('I', 2, 2); p2.AppendEdge('M', 3, 3); unsigned 
Edges1[64]; unsigned Edges2[64]; unsigned uDiffCount1; unsigned uDiffCount2; DiffPaths(p1, p2, Edges1, &uDiffCount1, Edges2, &uDiffCount2); } muscle-3.8.31.orig/realigndiffse.cpp0000644000175000017500000000763711352261667016727 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "tree.h" #include "profile.h" #include "pwpath.h" #include "seqvect.h" #include "estring.h" #define TRACE 0 void DeleteProgNode(ProgNode &Node) { delete[] Node.m_Prof; delete[] Node.m_EstringL; delete[] Node.m_EstringR; Node.m_Prof = 0; Node.m_EstringL = 0; Node.m_EstringR = 0; } static void MakeNode(ProgNode &OldNode, ProgNode &NewNode, bool bSwapLR) { if (bSwapLR) { NewNode.m_EstringL = OldNode.m_EstringR; NewNode.m_EstringR = OldNode.m_EstringL; } else { NewNode.m_EstringL = OldNode.m_EstringL; NewNode.m_EstringR = OldNode.m_EstringR; } NewNode.m_Prof = OldNode.m_Prof; NewNode.m_uLength = OldNode.m_uLength; NewNode.m_Weight = OldNode.m_Weight; OldNode.m_Prof = 0; OldNode.m_EstringL = 0; OldNode.m_EstringR = 0; } void RealignDiffsE(const MSA &msaIn, const SeqVect &v, const Tree &NewTree, const Tree &OldTree, const unsigned uNewNodeIndexToOldNodeIndex[], MSA &msaOut, ProgNode *OldProgNodes) { assert(OldProgNodes != 0); const unsigned uNodeCount = NewTree.GetNodeCount(); if (uNodeCount%2 == 0) Quit("RealignDiffs: Expected odd number of nodes"); const unsigned uMergeCount = (uNodeCount - 1)/2; ProgNode *NewProgNodes = new ProgNode[uNodeCount]; for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex) { if (NODE_CHANGED == uNewNodeIndexToOldNodeIndex[uNewNodeIndex]) continue; unsigned uOldNodeIndex = uNewNodeIndexToOldNodeIndex[uNewNodeIndex]; assert(uNewNodeIndex < uNodeCount); assert(uOldNodeIndex < uNodeCount); ProgNode &NewNode = NewProgNodes[uNewNodeIndex]; ProgNode &OldNode = OldProgNodes[uOldNodeIndex]; bool bSwapLR = false; if (!NewTree.IsLeaf(uNewNodeIndex)) { unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex); unsigned uNewRight = 
NewTree.GetRight(uNewNodeIndex); unsigned uOld = uNewNodeIndexToOldNodeIndex[uNewNodeIndex]; unsigned uOldLeft = OldTree.GetLeft(uOld); unsigned uOldRight = OldTree.GetRight(uOld); assert(uOldLeft < uNodeCount && uOldRight < uNodeCount); if (uOldLeft != uNewNodeIndexToOldNodeIndex[uNewLeft]) { assert(uOldLeft == uNewNodeIndexToOldNodeIndex[uNewRight]); bSwapLR = true; } } MakeNode(OldNode, NewNode, bSwapLR); #if TRACE Log("MakeNode old=%u new=%u swap=%d length=%u weight=%.3g\n", uOldNodeIndex, uNewNodeIndex, bSwapLR, NewNode.m_uLength, NewNode.m_Weight); #endif } unsigned uJoin = 0; SetProgressDesc("Refine tree"); for (unsigned uNewNodeIndex = NewTree.FirstDepthFirstNode(); NULL_NEIGHBOR != uNewNodeIndex; uNewNodeIndex = NewTree.NextDepthFirstNode(uNewNodeIndex)) { if (NODE_CHANGED != uNewNodeIndexToOldNodeIndex[uNewNodeIndex]) continue; Progress(uJoin, uMergeCount - 1); ++uJoin; const unsigned uMergeNodeIndex = uNewNodeIndex; ProgNode &Parent = NewProgNodes[uMergeNodeIndex]; const unsigned uLeft = NewTree.GetLeft(uNewNodeIndex); const unsigned uRight = NewTree.GetRight(uNewNodeIndex); ProgNode &Node1 = NewProgNodes[uLeft]; ProgNode &Node2 = NewProgNodes[uRight]; AlignTwoProfs( Node1.m_Prof, Node1.m_uLength, Node1.m_Weight, Node2.m_Prof, Node2.m_uLength, Node2.m_Weight, Parent.m_Path, &Parent.m_Prof, &Parent.m_uLength); PathToEstrings(Parent.m_Path, &Parent.m_EstringL, &Parent.m_EstringR); Parent.m_Weight = Node1.m_Weight + Node2.m_Weight; delete[] Node1.m_Prof; delete[] Node2.m_Prof; Node1.m_Prof = 0; Node2.m_Prof = 0; } ProgressStepsDone(); if (g_bBrenner) MakeRootMSABrenner((SeqVect &) v, NewTree, NewProgNodes, msaOut); else MakeRootMSA(v, NewTree, NewProgNodes, msaOut); #if DEBUG AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut); #endif for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) DeleteProgNode(NewProgNodes[uNodeIndex]); delete[] NewProgNodes; } muscle-3.8.31.orig/muscle.cpp0000644000175000017500000000507411352261666015405 0ustar 
kratzcharles#include "muscle.h" #include "msa.h" #include "seqvect.h" #include "msa.h" #include "tree.h" #include "profile.h" void MUSCLE(SeqVect &v, MSA &msaOut) { const unsigned uSeqCount = v.Length(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = v.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); v.FixAlpha(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) { SetPPScore(PPSCORE_SPN); g_Distance1 = DISTANCE_Kmer4_6; } unsigned uMaxL = 0; unsigned uTotL = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned L = v.GetSeq(uSeqIndex).Length(); uTotL += L; if (L > uMaxL) uMaxL = L; } SetIter(1); g_bDiags = g_bDiags1; SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); MSA::SetIdCount(uSeqCount); //// Initialize sequence ids. //// From this point on, ids must somehow propogate from here. 
// for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) // v.SetSeqId(uSeqIndex, uSeqIndex); if (uSeqCount > 1) MHackStart(v); if (0 == uSeqCount) { msaOut.Clear(); return; } if (1 == uSeqCount && ALPHA_Amino == Alpha) { const Seq &s = v.GetSeq(0); msaOut.FromSeq(s); return; } // First iteration Tree GuideTree; TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1); SetMuscleTree(GuideTree); ProgNode *ProgNodes = 0; if (g_bLow) ProgNodes = ProgressiveAlignE(v, GuideTree, msaOut); else ProgressiveAlign(v, GuideTree, msaOut); SetCurrentAlignment(msaOut); if (1 == g_uMaxIters || 2 == uSeqCount) { MHackEnd(msaOut); return; } g_bDiags = g_bDiags2; SetIter(2); if (g_bLow) { if (0 != g_uMaxTreeRefineIters) RefineTreeE(msaOut, v, GuideTree, ProgNodes); } else RefineTree(msaOut, GuideTree); extern void DeleteProgNode(ProgNode &Node); const unsigned uNodeCount = GuideTree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) DeleteProgNode(ProgNodes[uNodeIndex]); delete[] ProgNodes; ProgNodes = 0; SetSeqWeightMethod(g_SeqWeight2); SetMuscleTree(GuideTree); if (g_bAnchors) RefineVert(msaOut, GuideTree, g_uMaxIters - 2); else RefineHoriz(msaOut, GuideTree, g_uMaxIters - 2, false, false); MHackEnd(msaOut); } muscle-3.8.31.orig/glbalignns.cpp0000644000175000017500000002235511352261600016222 0ustar kratzcharles#include "muscle.h" #include "profile.h" #include "pwpath.h" struct DP_MEMORY { unsigned uLength; SCORE *GapOpenA; SCORE *GapOpenB; SCORE *GapCloseA; SCORE *GapCloseB; SCORE *MPrev; SCORE *MCurr; SCORE *MWork; SCORE *DPrev; SCORE *DCurr; SCORE *DWork; SCORE **ScoreMxB; unsigned **SortOrderA; unsigned *uDeletePos; FCOUNT **FreqsA; int **TraceBack; }; static struct DP_MEMORY DPM; static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) { // Max prefix length unsigned uLength = (uLengthA > uLengthB ? 
uLengthA : uLengthB) + 1; if (uLength < DPM.uLength) return; // Add 256 to allow for future expansion and // round up to next multiple of 32. uLength += 256; uLength += 32 - uLength%32; const unsigned uOldLength = DPM.uLength; if (uOldLength > 0) { for (unsigned i = 0; i < uOldLength; ++i) { delete[] DPM.TraceBack[i]; delete[] DPM.FreqsA[i]; delete[] DPM.SortOrderA[i]; } for (unsigned n = 0; n < 20; ++n) delete[] DPM.ScoreMxB[n]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.uDeletePos; delete[] DPM.GapOpenA; delete[] DPM.GapOpenB; delete[] DPM.GapCloseA; delete[] DPM.GapCloseB; delete[] DPM.SortOrderA; delete[] DPM.FreqsA; delete[] DPM.ScoreMxB; delete[] DPM.TraceBack; } DPM.uLength = uLength; DPM.GapOpenA = new SCORE[uLength]; DPM.GapOpenB = new SCORE[uLength]; DPM.GapCloseA = new SCORE[uLength]; DPM.GapCloseB = new SCORE[uLength]; DPM.SortOrderA = new unsigned*[uLength]; DPM.FreqsA = new FCOUNT*[uLength]; DPM.ScoreMxB = new SCORE*[20]; DPM.MPrev = new SCORE[uLength]; DPM.MCurr = new SCORE[uLength]; DPM.MWork = new SCORE[uLength]; DPM.DPrev = new SCORE[uLength]; DPM.DCurr = new SCORE[uLength]; DPM.DWork = new SCORE[uLength]; DPM.uDeletePos = new unsigned[uLength]; DPM.TraceBack = new int*[uLength]; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) DPM.ScoreMxB[uLetter] = new SCORE[uLength]; for (unsigned i = 0; i < uLength; ++i) { DPM.SortOrderA[i] = new unsigned[20]; DPM.FreqsA[i] = new FCOUNT[20]; DPM.TraceBack[i] = new int[uLength]; } } SCORE GlobalAlignNS(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; AllocDPMem(uLengthA, uLengthB); SCORE *GapOpenA = DPM.GapOpenA; SCORE *GapOpenB = DPM.GapOpenB; SCORE *GapCloseA = DPM.GapCloseA; SCORE *GapCloseB = DPM.GapCloseB; unsigned **SortOrderA = DPM.SortOrderA; FCOUNT **FreqsA = DPM.FreqsA; SCORE 
**ScoreMxB = DPM.ScoreMxB; SCORE *MPrev = DPM.MPrev; SCORE *MCurr = DPM.MCurr; SCORE *MWork = DPM.MWork; SCORE *DPrev = DPM.DPrev; SCORE *DCurr = DPM.DCurr; SCORE *DWork = DPM.DWork; unsigned *uDeletePos = DPM.uDeletePos; int **TraceBack = DPM.TraceBack; for (unsigned i = 0; i < uLengthA; ++i) { GapOpenA[i] = PA[i].m_scoreGapOpen; GapCloseA[i] = PA[i].m_scoreGapClose; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; } } for (unsigned j = 0; j < uLengthB; ++j) { GapOpenB[j] = PB[j].m_scoreGapOpen; GapCloseB[j] = PB[j].m_scoreGapClose; } for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { for (unsigned j = 0; j < uLengthB; ++j) ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; } for (unsigned i = 0; i < uPrefixCountA; ++i) memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); // Special case for i=0 unsigned **ptrSortOrderA = SortOrderA; FCOUNT **ptrFreqsA = FreqsA; assert(ptrSortOrderA == &(SortOrderA[0])); assert(ptrFreqsA == &(FreqsA[0])); TraceBack[0][0] = 0; SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][0]; } MPrev[0] = scoreSum - g_scoreCenter; // D(0,0) is -infinity (requires I->D). DPrev[0] = MINUS_INFINITY; for (unsigned j = 1; j < uLengthB; ++j) { // Only way to get M(0, j) looks like this: // A ----X // B XXXXX // 0 j // So gap-open at j=0, gap-close at j-1. 
SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][j]; } MPrev[j] = scoreSum - g_scoreCenter + GapOpenB[0] + GapCloseB[j-1]; TraceBack[0][j] = -(int) j; // Assume no D->I transitions, then can't be a delete if only // one letter from A. DPrev[j] = MINUS_INFINITY; } SCORE IPrev_j_1; for (unsigned i = 1; i < uLengthA; ++i) { ++ptrSortOrderA; ++ptrFreqsA; assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); SCORE *ptrMCurr_j = MCurr; memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); const FCOUNT *FreqsAi = *ptrFreqsA; const unsigned *SortOrderAi = *ptrSortOrderA; const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20; const SCORE *ptrMCurrMax = MCurr + uLengthB; for (const unsigned *ptrSortOrderAi = SortOrderAi; ptrSortOrderAi != ptrSortOrderAiEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; SCORE *NSBR_Letter = ScoreMxB[uLetter]; const FCOUNT fcLetter = FreqsAi[uLetter]; if (0 == fcLetter) break; SCORE *ptrNSBR = NSBR_Letter; for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) *ptrMCurr += fcLetter*(*ptrNSBR++); } for (unsigned j = 0; j < uLengthB; ++j) MCurr[j] -= g_scoreCenter; ptrMCurr_j = MCurr; unsigned *ptrDeletePos = uDeletePos; // Special case for j=0 // Only way to get M(i, 0) looks like this: // 0 i // A XXXXX // B ----X // So gap-open at i=0, gap-close at i-1. 
assert(ptrMCurr_j == &(MCurr[0])); *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; ++ptrMCurr_j; int *ptrTraceBack_ij = TraceBack[i]; *ptrTraceBack_ij++ = (int) i; SCORE *ptrMPrev_j = MPrev; SCORE *ptrDPrev = DPrev; SCORE d = *ptrDPrev; SCORE DNew = *ptrMPrev_j + GapOpenA[i]; if (DNew > d) { d = DNew; *ptrDeletePos = i; } SCORE *ptrDCurr = DCurr; assert(ptrDCurr == &(DCurr[0])); *ptrDCurr = d; // Can't have an insert if no letters from B IPrev_j_1 = MINUS_INFINITY; unsigned uInsertPos; const SCORE scoreGapOpenAi = GapOpenA[i]; const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; for (unsigned j = 1; j < uLengthB; ++j) { // Here, MPrev_j is preserved from previous // iteration so with current i,j is M[i-1][j-1] SCORE MPrev_j = *ptrMPrev_j; SCORE INew = MPrev_j + GapOpenB[j]; if (INew > IPrev_j_1) { IPrev_j_1 = INew; uInsertPos = j; } SCORE scoreMax = MPrev_j; assert(ptrDPrev == &(DPrev[j-1])); SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; if (scoreD > scoreMax) { scoreMax = scoreD; assert(ptrDeletePos == &(uDeletePos[j-1])); *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; assert(*ptrTraceBack_ij > 0); } ++ptrDeletePos; SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; if (scoreI > scoreMax) { scoreMax = scoreI; *ptrTraceBack_ij = (int) uInsertPos - (int) j; assert(*ptrTraceBack_ij < 0); } assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); *ptrMCurr_j += scoreMax; assert(ptrMCurr_j == &(MCurr[j])); ++ptrMCurr_j; MPrev_j = *(++ptrMPrev_j); assert(ptrDPrev == &(DPrev[j])); SCORE d = *ptrDPrev; SCORE DNew = MPrev_j + scoreGapOpenAi; if (DNew > d) { d = DNew; assert(ptrDeletePos == &uDeletePos[j]); *ptrDeletePos = i; } assert(ptrDCurr + 1 == &(DCurr[j])); *(++ptrDCurr) = d; ++ptrTraceBack_ij; } Rotate(MPrev, MCurr, MWork); Rotate(DPrev, DCurr, DWork); } // Special case for i=uLengthA SCORE IPrev = MINUS_INFINITY; unsigned uInsertPos; for (unsigned j = 1; j < uLengthB; ++j) { SCORE INew = MPrev[j-1] + GapOpenB[j]; if (INew > IPrev) { uInsertPos = j; IPrev 
= INew; } } // Special case for i=uLengthA, j=uLengthB SCORE scoreMax = MPrev[uLengthB-1]; int iTraceBack = 0; SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; if (scoreD > scoreMax) { scoreMax = scoreD; iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; } SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; if (scoreI > scoreMax) { scoreMax = scoreI; iTraceBack = (int) uInsertPos - (int) uLengthB; } TraceBack[uLengthA][uLengthB] = iTraceBack; TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); return scoreMax; } muscle-3.8.31.orig/savebest.cpp0000644000175000017500000000264611352261673015731 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "textfile.h" #include MSA *ptrBestMSA; static const char *pstrOutputFileName; void SetOutputFileName(const char *out) { pstrOutputFileName = out; } void SetCurrentAlignment(MSA &msa) { ptrBestMSA = &msa; } void SaveCurrentAlignment() { static bool bCalled = false; if (bCalled) { fprintf(stderr, "\nRecursive call to SaveCurrentAlignment, giving up attempt to save.\n"); exit(EXIT_FatalError); } if (0 == ptrBestMSA) { fprintf(stderr, "\nAlignment not completed, cannot save.\n"); Log("Alignment not completed, cannot save.\n"); exit(EXIT_FatalError); } if (0 == pstrOutputFileName) { fprintf(stderr, "\nOutput file name not specified, cannot save.\n"); exit(EXIT_FatalError); } fprintf(stderr, "\nSaving current alignment ...\n"); TextFile fileOut(pstrOutputFileName, true); ptrBestMSA->ToFASTAFile(fileOut); fprintf(stderr, "Current alignment saved to \"%s\".\n", pstrOutputFileName); Log("Current alignment saved to \"%s\".\n", pstrOutputFileName); } void CheckMaxTime() { if (0 == g_ulMaxSecs) return; time_t Now = time(0); time_t ElapsedSecs = Now - GetStartTime(); if (ElapsedSecs <= (time_t) g_ulMaxSecs) return; Log("Max time %s exceeded, elapsed seconds = %ul\n", MaxSecsToStr(), ElapsedSecs); SaveCurrentAlignment(); exit(EXIT_Success); } 
// Fixed-capacity list of DPRegion records. Storage is an in-object array of
// MAX_DPREGIONS entries; m_uCount is the number of entries in use.
class DPRegionList
	{
public:
	// Starts out empty.
	DPRegionList()
		{
		m_uCount = 0;
		}
	~DPRegionList()
		{
		Free();
		}

public:
// Creation
	// Empties the list (resets the count; the array itself is not touched).
	void Clear()
		{
		Free();
		}
	// Defined outside this header.
	void Add(const DPRegion &r);

// Accessors
	// Number of regions currently stored.
	unsigned GetCount() const
		{
		return m_uCount;
		}
	// Returns the region at uIndex; the index is range-checked with assert
	// only.
	const DPRegion &Get(unsigned uIndex) const
		{
		assert(uIndex < m_uCount);
		return m_DPRegions[uIndex];
		}

// Diagnostics
	void LogMe() const;

private:
	// Nothing is heap-allocated, so freeing is just resetting the count.
	void Free()
		{
		m_uCount = 0;
		}

private:
	unsigned m_uCount;
	DPRegion m_DPRegions[MAX_DPREGIONS];
	};
The only drawback is a large table size, but space is readily available and the only additional time cost is in resetting the table to zero, which can be done quickly with memset or by keeping a list of the k-mers that were found (should test to see which is faster, and may vary by compiler and processor type). It also has the minor advantage that we don't need to convert the alphabet. Fractional identity d is estimated as follows. F = fractional k-mer count if F is 0: F = 0.01 Y = log(0.02 + F) d = -4.1 + 4.12*Y The constant 0.02 was chosen to make the relationship between Y and D linear. The constants -4.1 and 4.12 were chosen to fit a straight line to the scatterplot of Y vs D. ***/ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) const unsigned K = 4; const unsigned N = 20; const unsigned N_2 = 20*20; const unsigned N_3 = 20*20*20; const unsigned N_4 = 20*20*20*20; const unsigned TABLE_SIZE = N_4; // For debug output const char *KmerToStr(unsigned Kmer) { static char s[5]; unsigned c3 = (Kmer/N_3)%N; unsigned c2 = (Kmer/N_2)%N; unsigned c1 = (Kmer/N)%N; unsigned c0 = Kmer%N; s[0] = LetterToChar(c3); s[1] = LetterToChar(c2); s[2] = LetterToChar(c1); s[3] = LetterToChar(c0); return s; } void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[]) { #if TRACE Log("CountKmers\n"); #endif memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte)); const byte *ptrKmerStart = s; const byte *ptrKmerEnd = s + 4; const byte *ptrSeqEnd = s + uSeqLength; unsigned c3 = s[0]*N_3; unsigned c2 = s[1]*N_2; unsigned c1 = s[2]*N; unsigned c0 = s[3]; unsigned Kmer = c3 + c2 + c1 + c0; for (;;) { assert(Kmer < TABLE_SIZE); #if TRACE Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer)); #endif ++(KmerCounts[Kmer]); if (ptrKmerEnd == ptrSeqEnd) break; // Compute k-mer as function of previous k-mer: // 1. Subtract first letter from previous k-mer. // 2. Multiply by N. // 3. Add next letter. 
c3 = (*ptrKmerStart++) * N_3; Kmer = (Kmer - c3)*N; Kmer += *ptrKmerEnd++; } } unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength, const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2) { byte KmerCounts2[TABLE_SIZE]; CountKmers(Seq2, uSeqLength2, KmerCounts2); const byte *ptrKmerStart = Seq; const byte *ptrKmerEnd = Seq + 4; const byte *ptrSeqEnd = Seq + uSeqLength; unsigned c3 = Seq[0]*N_3; unsigned c2 = Seq[1]*N_2; unsigned c1 = Seq[2]*N; unsigned c0 = Seq[3]; unsigned Kmer = c3 + c2 + c1 + c0; unsigned uCommonCount = 0; for (;;) { assert(Kmer < TABLE_SIZE); const byte Count1 = KmerCounts1[Kmer]; const byte Count2 = KmerCounts2[Kmer]; uCommonCount += MIN(Count1, Count2); // Hack so we don't double-count KmerCounts2[Kmer] = 0; if (ptrKmerEnd == ptrSeqEnd) break; // Compute k-mer as function of previous k-mer: // 1. Subtract first letter from previous k-mer. // 2. Multiply by N. // 3. Add next letter. c3 = (*ptrKmerStart++) * N_3; Kmer = (Kmer - c3)*N; Kmer += *ptrKmerEnd++; } return uCommonCount; } static void SeqToLetters(const Seq &s, byte Letters[]) { const unsigned uSeqLength = s.Length(); for (unsigned uCol = 0; uCol < uSeqLength; ++uCol) { char c = s.GetChar(uCol); // Ugly hack. My k-mer counting code isn't wild-card // aware. Arbitrarily replace wildcards by a specific // amino acid. 
if (IsWildcardChar(c)) c = 'A'; *Letters++ = CharToLetter(c); } } void FastDistKmer(const SeqVect &v, DistFunc &DF) { byte KmerCounts[TABLE_SIZE]; const unsigned uSeqCount = v.GetSeqCount(); DF.SetCount(uSeqCount); if (0 == uSeqCount) return; // Initialize distance matrix to zero for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) DF.SetDist(uSeq1, uSeq2, 0); } unsigned uMaxLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const Seq &s = v.GetSeq(uSeqIndex); unsigned uSeqLength = s.Length(); if (uSeqLength > uMaxLength) uMaxLength = uSeqLength; } if (0 == uMaxLength) return; byte *Seq1Letters = new byte[uMaxLength]; byte *Seq2Letters = new byte[uMaxLength]; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1) { const Seq &s1 = v.GetSeq(uSeqIndex1); const unsigned uSeqLength1 = s1.Length(); SeqToLetters(s1, Seq1Letters); CountKmers(Seq1Letters, uSeqLength1, KmerCounts); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { const Seq &s2 = v.GetSeq(uSeqIndex2); const unsigned uSeqLength2 = s2.Length(); SeqToLetters(s2, Seq2Letters); unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1, KmerCounts, Seq2Letters, uSeqLength2); unsigned uMinLength = MIN(uSeqLength1, uSeqLength2); double F = (double) uCommonKmerCount / (uMinLength - K + 1); if (0.0 == F) F = 0.01; double Y = log(0.02 + F); double EstimatedPctId = Y/4.12 + 0.995; double KD = KimuraDist(EstimatedPctId); // DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD); DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F)); #if TRACE Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n", uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD); #endif } } delete[] Seq1Letters; delete[] Seq2Letters; } muscle-3.8.31.orig/phyfromclust.cpp0000644000175000017500000000526711352261612016647 0ustar kratzcharles#include "muscle.h" 
#include "tree.h"
#include "clust.h"

// Allocate the per-node arrays with room for uCacheCount nodes and set
// every entry to its "not set" value.
void Tree::InitCache(unsigned uCacheCount)
{
    m_uCacheCount = uCacheCount;

    m_uNeighbor1 = new unsigned[m_uCacheCount];
    m_uNeighbor2 = new unsigned[m_uCacheCount];
    m_uNeighbor3 = new unsigned[m_uCacheCount];

    m_Ids = new unsigned[m_uCacheCount];

    m_dEdgeLength1 = new double[m_uCacheCount];
    m_dEdgeLength2 = new double[m_uCacheCount];
    m_dEdgeLength3 = new double[m_uCacheCount];
    m_dHeight = new double[m_uCacheCount];

    m_bHasEdgeLength1 = new bool[m_uCacheCount];
    m_bHasEdgeLength2 = new bool[m_uCacheCount];
    m_bHasEdgeLength3 = new bool[m_uCacheCount];
    m_bHasHeight = new bool[m_uCacheCount];

    m_ptrName = new char *[m_uCacheCount];

    // Initialize every allocated entry. The bound is the cache size, not
    // m_uNodeCount: the arrays hold m_uCacheCount entries, so using the
    // node count would leave entries uninitialized when it is smaller and
    // write out of bounds when it is larger.
    for (unsigned uNodeIndex = 0; uNodeIndex < m_uCacheCount; ++uNodeIndex)
    {
        m_uNeighbor1[uNodeIndex] = NULL_NEIGHBOR;
        m_uNeighbor2[uNodeIndex] = NULL_NEIGHBOR;
        m_uNeighbor3[uNodeIndex] = NULL_NEIGHBOR;
        m_bHasEdgeLength1[uNodeIndex] = false;
        m_bHasEdgeLength2[uNodeIndex] = false;
        m_bHasEdgeLength3[uNodeIndex] = false;
        m_bHasHeight[uNodeIndex] = false;
        m_dEdgeLength1[uNodeIndex] = dInsane;
        m_dEdgeLength2[uNodeIndex] = dInsane;
        m_dEdgeLength3[uNodeIndex] = dInsane;
        m_dHeight[uNodeIndex] = dInsane;
        m_ptrName[uNodeIndex] = 0;
        m_Ids[uNodeIndex] = uInsane;
    }
}

// Rebuild this tree from a clustering result. Node indexes are shared
// with C: neighbor 1 is the parent, neighbors 2 and 3 are the left and
// right children. Leaves receive the name and id reported by C.
void Tree::FromClust(Clust &C)
{
    Clear();

    m_uNodeCount = C.GetNodeCount();
    InitCache(m_uNodeCount);

    // Cluster is always rooted. An unrooted cluster
    // is represented by a pseudo-root, which we fix later.
    m_bRooted = true;
    const unsigned uRoot = C.GetRootNodeIndex();
    m_uRootNodeIndex = uRoot;
    m_uNeighbor1[uRoot] = NULL_NEIGHBOR;
    m_bHasEdgeLength1[uRoot] = false;

    for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex)
    {
        if (C.IsLeaf(uNodeIndex))
        {
            const char *ptrName = C.GetNodeName(uNodeIndex);
            m_ptrName[uNodeIndex] = strsave(ptrName);
            m_Ids[uNodeIndex] = C.GetNodeId(uNodeIndex);
            continue;
        }

        const unsigned uLeft = C.GetLeftIndex(uNodeIndex);
        const unsigned uRight = C.GetRightIndex(uNodeIndex);

        const double dLeftLength = C.GetLength(uLeft);
        const double dRightLength = C.GetLength(uRight);

        m_uNeighbor2[uNodeIndex] = uLeft;
        m_uNeighbor3[uNodeIndex] = uRight;

        // Each edge is recorded at both of its end points.
        m_dEdgeLength1[uLeft] = dLeftLength;
        m_dEdgeLength1[uRight] = dRightLength;

        m_uNeighbor1[uLeft] = uNodeIndex;
        m_uNeighbor1[uRight] = uNodeIndex;

        m_bHasEdgeLength1[uLeft] = true;
        m_bHasEdgeLength1[uRight] = true;

        m_dEdgeLength2[uNodeIndex] = dLeftLength;
        m_dEdgeLength3[uNodeIndex] = dRightLength;

        m_bHasEdgeLength2[uNodeIndex] = true;
        m_bHasEdgeLength3[uNodeIndex] = true;
    }
    Validate();
}

// ---- archive member boundary; tar header residue kept verbatim below ----
// muscle-3.8.31.orig/clust.h0000644000175000017500000001103511352261612014675 0ustar kratzcharles
#ifndef Clust_h
#define Clust_h

class Clust;
class ClustNode;
class ClustSet;
class Phylip;
class SortedNode;

const unsigned RB_NIL = ((unsigned) 0xfff0);

// One node of a cluster tree. The destructor releases m_uLeafIndexes;
// the node pointers are not deleted here.
class ClustNode
{
public:
    ClustNode()
    {
        m_uIndex = uInsane;
        m_uSize = uInsane;
        m_dLength = (float) dInsane;
        m_ptrLeft = 0;
        m_ptrRight = 0;
        m_ptrParent = 0;
        m_ptrNextCluster = 0;
        m_ptrPrevCluster = 0;
        m_uLeafIndexes = 0;
    }
    ~ClustNode()
    {
        delete[] m_uLeafIndexes;
    }
    unsigned m_uIndex;
    unsigned m_uSize;
    float m_dLength;
    ClustNode *m_ptrLeft;
    ClustNode *m_ptrRight;
    ClustNode *m_ptrParent;
    ClustNode *m_ptrNextCluster;
    ClustNode *m_ptrPrevCluster;
    unsigned *m_uLeafIndexes;
};

// Cluster tree built from a ClustSet by Create().
class Clust
{
public:
    Clust();
    virtual ~Clust();

    void Create(ClustSet &Set, CLUSTER Method);

    unsigned GetLeafCount() const;

    unsigned GetClusterCount() const;
    unsigned GetClusterSize(unsigned uNodeIndex) const;
    unsigned GetLeaf(unsigned uClusterIndex, unsigned uLeafIndex) const;

    unsigned GetNodeCount() const { return 2*m_uLeafCount - 1; }
    const ClustNode &GetRoot() const { return m_Nodes[GetRootNodeIndex()]; }
    unsigned GetRootNodeIndex() const { return m_uNodeCount - 1; }

    const ClustNode &GetNode(unsigned uNodeIndex) const;
    bool IsLeaf(unsigned uNodeIndex) const;
    unsigned GetLeftIndex(unsigned uNodeIndex) const;
    unsigned GetRightIndex(unsigned uNodeIndex) const;
    float GetLength(unsigned uNodeIndex) const;
    float GetHeight(unsigned uNodeIndex) const;
    const char *GetNodeName(unsigned uNodeIndex) const;
    unsigned GetNodeId(unsigned uNodeIndex) const;

    JOIN GetJoinStyle() const { return m_JoinStyle; }
    LINKAGE GetCentroidStyle() const { return m_CentroidStyle; }

    void SetDist(unsigned uIndex1, unsigned uIndex2, float dDist);
    float GetDist(unsigned uIndex1, unsigned uIndex2) const;

    void ToPhylip(Phylip &tree);

    void LogMe() const;

//private:
    void SetLeafCount(unsigned uLeafCount);

    void CreateCluster();
    void JoinNodes(unsigned uLeftNodeIndex, unsigned uRightNodeIndex,
      float dLeftLength, float dRightLength, unsigned uNewNodeIndex);

    void ChooseJoin(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
      float *ptrdLeftLength, float *ptrdRightLength);
    void ChooseJoinNeighborJoining(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
      float *ptrdLeftLength, float *ptrdRightLength);
    void ChooseJoinNearestNeighbor(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
      float *ptrdLeftLength, float *ptrdRightLength);

    float ComputeDist(unsigned uNewNodeIndex, unsigned uNodeIndex);
    float ComputeDistAverageLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
    float ComputeDistMinLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
    float ComputeDistMaxLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
    float ComputeDistNeighborJoining(unsigned uNewNewIndex, unsigned uNodeIndex);
    float ComputeDistMAFFT(unsigned uNewNewIndex, unsigned uNodeIndex);

    float Calc_r(unsigned uNodeIndex) const;

    unsigned VectorIndex(unsigned uIndex1, unsigned uIndex2) const;

    unsigned GetFirstCluster() const;
    unsigned GetNextCluster(unsigned uNodeIndex) const;

    float ComputeMetric(unsigned uIndex1, unsigned uIndex2) const;
    float ComputeMetricNearestNeighbor(unsigned i, unsigned j) const;
    float ComputeMetricNeighborJoining(unsigned i, unsigned j) const;

    void InitMetric(unsigned uMaxNodeIndex);
    void InsertMetric(unsigned uIndex1, unsigned uIndex2, float dMetric);
    float GetMinMetric(unsigned *ptruIndex1, unsigned *ptruIndex2) const;
    float GetMinMetricBruteForce(unsigned *ptruIndex1, unsigned *ptruIndex2) const;
    void DeleteMetric(unsigned uIndex);
    void DeleteMetric(unsigned uIndex1, unsigned uIndex2);
    void ListMetric() const;

    void DeleteFromClusterList(unsigned uNodeIndex);
    void AddToClusterList(unsigned uNodeIndex);

    void RBDelete(unsigned RBNode);
    unsigned RBInsert(unsigned i, unsigned j, float fMetric);

    unsigned RBNext(unsigned RBNode) const;
    unsigned RBPrev(unsigned RBNode) const;
    unsigned RBMin(unsigned RBNode) const;
    unsigned RBMax(unsigned RBNode) const;

    void ValidateRB(const char szMsg[] = 0) const;
    void ValidateRBNode(unsigned Node, const char szMsg[]) const;

//private:
    JOIN m_JoinStyle;
    LINKAGE m_CentroidStyle;
    ClustNode *m_Nodes;
    unsigned *m_ClusterIndexToNodeIndex;
    unsigned *m_NodeIndexToClusterIndex;
    unsigned m_uLeafCount;
    unsigned m_uNodeCount;
    unsigned m_uClusterCount;
    unsigned m_uTriangularMatrixSize;
    float *m_dDist;
    ClustSet *m_ptrSet;
    ClustNode *m_ptrClusterList;
};

#endif // Clust_h

// ---- archive member boundary; tar header residue kept verbatim below ----
// muscle-3.8.31.orig/subfam.cpp0000644000175000017500000002303111366141374015361 0ustar kratzcharles
#include "muscle.h"
#include "tree.h"
#include "textfile.h" // for test only
#include "msa.h"
#include "seqvect.h"
#include "profile.h"
#ifndef _MSC_VER
// The header name was lost from this line in the text; <unistd.h> is the
// POSIX header that declares unlink, which the original comment names.
#include <unistd.h> // for unlink
#endif

#define TRACE 0

/***
Find subfamilies from tree by following criteria:

(a) number of leaves <= max,
(b) is monophyletic, i.e. most recent common ancestor is parent
of no more than one subfamily.
***/ static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount, unsigned SubFams[], unsigned &uSubFamCount) { if (tree.IsLeaf(uNodeIndex)) return 1; unsigned uLeft = tree.GetLeft(uNodeIndex); unsigned uRight = tree.GetRight(uNodeIndex); unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount); unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount); unsigned uLeafCount = uLeftCount + uRightCount; if (uLeftCount + uRightCount > uMaxLeafCount) { if (uLeftCount <= uMaxLeafCount) SubFams[uSubFamCount++] = uLeft; if (uRightCount <= uMaxLeafCount) SubFams[uSubFamCount++] = uRight; } else if (tree.IsRoot(uNodeIndex)) { if (uSubFamCount != 0) Quit("Error in SubFamRecurse"); SubFams[uSubFamCount++] = uNodeIndex; } return uLeafCount; } void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount) { *ptruSubFamCount = 0; SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount); #if TRACE { Log("\n"); Log("Tree:\n"); tree.LogMe(); //void DrawTree(const Tree &tree); //DrawTree(tree); Log("\n"); Log("%d subfams:\n", *ptruSubFamCount); for (unsigned i = 0; i < *ptruSubFamCount; ++i) Log(" %d=%d", i, SubFams[i]); Log("\n"); } #endif } //unsigned SubFams[9999]; //unsigned uSubFamCount; // //static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex) // { // const unsigned uRoot = tree.GetRootNodeIndex(); // unsigned uDist = 0; // while (uNodeIndex != uRoot) // { // ++uDist; // uNodeIndex = tree.GetParent(uNodeIndex); // } // return uDist; // } // //static void DrawNode(const Tree &tree, unsigned uNodeIndex) // { // if (!tree.IsLeaf(uNodeIndex)) // DrawNode(tree, tree.GetLeft(uNodeIndex)); // // unsigned uDist = DistFromRoot(tree, uNodeIndex); // for (unsigned i = 0; i < 5*uDist; ++i) // Log(" "); // Log("%d", uNodeIndex); // for (unsigned i = 0; i < uSubFamCount; ++i) // if (uNodeIndex == SubFams[i]) // { // 
Log("*"); // break; // } // Log("\n"); // // if (!tree.IsLeaf(uNodeIndex)) // DrawNode(tree, tree.GetRight(uNodeIndex)); // } // //static void DrawTree(const Tree &tree) // { // unsigned uRoot = tree.GetRootNodeIndex(); // DrawNode(tree, uRoot); // } // //void TestSubFams(const char *FileName) // { // Tree tree; // TextFile f(FileName); // tree.FromFile(f); // SubFam(tree, 5, SubFams, &uSubFamCount); // DrawTree(tree); // } static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[]) { if (tree.IsLeaf(uNodeIndex)) return; unsigned uLeft = tree.GetLeft(uNodeIndex); unsigned uRight = tree.GetRight(uNodeIndex); NodeInSubFam[uLeft] = true; NodeInSubFam[uRight] = true; SetInFam(tree, uLeft, NodeInSubFam); SetInFam(tree, uRight, NodeInSubFam); } void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex, MSA &msaOut) { const unsigned uSeqCount = vAll.GetSeqCount(); const char *InTmp = "asf_in.tmp"; const char *OutTmp = "asf_out.tmp"; unsigned *Leaves = new unsigned[uSeqCount]; unsigned uLeafCount; GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount); SeqVect v; for (unsigned i = 0; i < uLeafCount; ++i) { unsigned uLeafNodeIndex = Leaves[i]; unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex); Seq &s = vAll.GetSeqById(uId); v.AppendSeq(s); } #if TRACE { Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount); for (unsigned i = 0; i < uLeafCount; ++i) Log(" %s", v.GetSeqName(i)); Log("\n"); } #endif TextFile fIn(InTmp, true); v.ToFASTAFile(fIn); fIn.Close(); char CmdLine[4096]; sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp); // sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp); int NotUsed = system(CmdLine); TextFile fOut(OutTmp); msaOut.FromFile(fOut); for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex) { const char *Name = msaOut.GetSeqName(uSeqIndex); unsigned uId = vAll.GetSeqIdFromName(Name); msaOut.SetSeqId(uSeqIndex, uId); } unlink(InTmp); unlink(OutTmp); delete[] Leaves; } 
void ProgAlignSubFams() { MSA msaOut; SetOutputFileName(g_pstrOutFileName); SetInputFileName(g_pstrInFileName); SetMaxIters(g_uMaxIters); SetSeqWeightMethod(g_SeqWeight1); TextFile fileIn(g_pstrInFileName); SeqVect v; v.FromFASTAFile(fileIn); const unsigned uSeqCount = v.Length(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = v.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); v.FixAlpha(); PTR_SCOREMATRIX UserMatrix = 0; if (0 != g_pstrMatrixFileName) { const char *FileName = g_pstrMatrixFileName; const char *Path = getenv("MUSCLE_MXPATH"); if (Path != 0) { size_t n = strlen(Path) + 1 + strlen(FileName) + 1; char *NewFileName = new char[n]; sprintf(NewFileName, "%s/%s", Path, FileName); FileName = NewFileName; } TextFile File(FileName); UserMatrix = ReadMx(File); g_Alpha = ALPHA_Amino; g_PPScore = PPSCORE_SP; } SetPPScore(); if (0 != UserMatrix) g_ptrScoreMatrix = UserMatrix; if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) { SetPPScore(PPSCORE_SPN); g_Distance1 = DISTANCE_Kmer4_6; } unsigned uMaxL = 0; unsigned uTotL = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned L = v.GetSeq(uSeqIndex).Length(); uTotL += L; if (L > uMaxL) uMaxL = L; } SetIter(1); g_bDiags = g_bDiags1; SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); SetMuscleSeqVect(v); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. 
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) v.SetSeqId(uSeqIndex, uSeqIndex); if (uSeqCount > 1) MHackStart(v); if (0 == uSeqCount) { msaOut.Clear(); return; } if (1 == uSeqCount && ALPHA_Amino == Alpha) { const Seq &s = v.GetSeq(0); msaOut.FromSeq(s); return; } Tree GuideTree; TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1); SetMuscleTree(GuideTree); MSA msa; if (g_bLow) { ProgNode *ProgNodes = 0; ProgNodes = ProgressiveAlignE(v, GuideTree, msa); delete[] ProgNodes; } else ProgressiveAlign(v, GuideTree, msa); SetCurrentAlignment(msa); TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2); SetMuscleTree(GuideTree); unsigned *SubFams = new unsigned[uSeqCount]; unsigned uSubFamCount; SubFam(GuideTree, g_uMaxSubFamCount, SubFams, &uSubFamCount); SetProgressDesc("Align node"); const unsigned uNodeCount = 2*uSeqCount - 1; ProgNode *ProgNodes = new ProgNode[uNodeCount]; bool *NodeIsSubFam = new bool[uNodeCount]; bool *NodeInSubFam = new bool[uNodeCount]; for (unsigned i = 0; i < uNodeCount; ++i) { NodeIsSubFam[i] = false; NodeInSubFam[i] = false; } for (unsigned i = 0; i < uSubFamCount; ++i) { unsigned uNodeIndex = SubFams[i]; assert(uNodeIndex < uNodeCount); NodeIsSubFam[uNodeIndex] = true; SetInFam(GuideTree, uNodeIndex, NodeInSubFam); } unsigned uJoin = 0; unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); do { if (NodeIsSubFam[uTreeNodeIndex]) { #if TRACE Log("Node %d: align subfam\n", uTreeNodeIndex); #endif ProgNode &Node = ProgNodes[uTreeNodeIndex]; AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA); Node.m_uLength = Node.m_MSA.GetColCount(); } else if (!NodeInSubFam[uTreeNodeIndex]) { #if TRACE Log("Node %d: align two subfams\n", uTreeNodeIndex); #endif Progress(uJoin, uSubFamCount - 1); ++uJoin; const unsigned uMergeNodeIndex = uTreeNodeIndex; ProgNode &Parent = ProgNodes[uMergeNodeIndex]; const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex); const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); 
ProgNode &Node1 = ProgNodes[uLeft]; ProgNode &Node2 = ProgNodes[uRight]; PWPath Path; AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path); Parent.m_uLength = Parent.m_MSA.GetColCount(); Node1.m_MSA.Clear(); Node2.m_MSA.Clear(); } else { #if TRACE Log("Node %d: in subfam\n", uTreeNodeIndex); #endif ; } uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); ProgressStepsDone(); unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; TextFile fOut(g_pstrOutFileName, true); MHackEnd(RootProgNode.m_MSA); RootProgNode.m_MSA.ToFile(fOut); delete[] NodeInSubFam; delete[] NodeIsSubFam; delete[] ProgNodes; delete[] SubFams; ProgNodes = 0; NodeInSubFam = 0; NodeIsSubFam = 0; SubFams = 0; } muscle-3.8.31.orig/msa.cpp0000644000175000017500000005054411352261667014700 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "textfile.h" #include "seq.h" #include const unsigned DEFAULT_SEQ_LENGTH = 500; unsigned MSA::m_uIdCount = 0; MSA::MSA() { m_uSeqCount = 0; m_uColCount = 0; m_szSeqs = 0; m_szNames = 0; m_Weights = 0; m_IdToSeqIndex = 0; m_SeqIndexToId = 0; m_uCacheSeqCount = 0; m_uCacheSeqLength = 0; } MSA::~MSA() { Free(); } void MSA::Free() { for (unsigned n = 0; n < m_uSeqCount; ++n) { delete[] m_szSeqs[n]; delete[] m_szNames[n]; } delete[] m_szSeqs; delete[] m_szNames; delete[] m_Weights; delete[] m_IdToSeqIndex; delete[] m_SeqIndexToId; m_uSeqCount = 0; m_uColCount = 0; m_szSeqs = 0; m_szNames = 0; m_Weights = 0; m_IdToSeqIndex = 0; m_SeqIndexToId = 0; } void MSA::SetSize(unsigned uSeqCount, unsigned uColCount) { Free(); m_uSeqCount = uSeqCount; m_uCacheSeqLength = uColCount; m_uColCount = 0; if (0 == uSeqCount && 0 == uColCount) return; m_szSeqs = new char *[uSeqCount]; m_szNames = new char *[uSeqCount]; m_Weights = new WEIGHT[uSeqCount]; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { m_szSeqs[uSeqIndex] = new char[uColCount+1]; 
m_szNames[uSeqIndex] = 0; #if DEBUG m_Weights[uSeqIndex] = BTInsane; memset(m_szSeqs[uSeqIndex], '?', uColCount); #endif m_szSeqs[uSeqIndex][uColCount] = 0; } if (m_uIdCount > 0) { m_IdToSeqIndex = new unsigned[m_uIdCount]; m_SeqIndexToId = new unsigned[m_uSeqCount]; #if DEBUG memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); #endif } } void MSA::LogMe() const { if (0 == GetColCount()) { Log("MSA empty\n"); return; } const unsigned uColsPerLine = 50; unsigned uLinesPerSeq = (GetColCount() - 1)/uColsPerLine + 1; for (unsigned n = 0; n < uLinesPerSeq; ++n) { unsigned i; unsigned iStart = n*uColsPerLine; unsigned iEnd = GetColCount(); if (iEnd - iStart + 1 > uColsPerLine) iEnd = iStart + uColsPerLine; Log(" "); for (i = iStart; i < iEnd; ++i) Log("%u", i%10); Log("\n"); Log(" "); for (i = iStart; i + 9 < iEnd; i += 10) Log("%-10u", i); if (n == uLinesPerSeq - 1) Log(" %-10u", GetColCount()); Log("\n"); for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { Log("%12.12s", m_szNames[uSeqIndex]); if (m_Weights[uSeqIndex] != BTInsane) Log(" (%5.3f)", m_Weights[uSeqIndex]); else Log(" "); Log(" "); for (i = iStart; i < iEnd; ++i) Log("%c", GetChar(uSeqIndex, i)); if (0 != m_SeqIndexToId) Log(" [%5u]", m_SeqIndexToId[uSeqIndex]); Log("\n"); } Log("\n\n"); } } char MSA::GetChar(unsigned uSeqIndex, unsigned uIndex) const { // TODO: Performance cost? if (uSeqIndex >= m_uSeqCount || uIndex >= m_uColCount) Quit("MSA::GetChar(%u/%u,%u/%u)", uSeqIndex, m_uSeqCount, uIndex, m_uColCount); char c = m_szSeqs[uSeqIndex][uIndex]; // assert(IsLegalChar(c)); return c; } unsigned MSA::GetLetter(unsigned uSeqIndex, unsigned uIndex) const { // TODO: Performance cost? 
char c = GetChar(uSeqIndex, uIndex); unsigned uLetter = CharToLetter(c); if (uLetter >= 20) { char c = ' '; if (uSeqIndex < m_uSeqCount && uIndex < m_uColCount) c = m_szSeqs[uSeqIndex][uIndex]; Quit("MSA::GetLetter(%u/%u, %u/%u)='%c'/%u", uSeqIndex, m_uSeqCount, uIndex, m_uColCount, c, uLetter); } return uLetter; } unsigned MSA::GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const { // TODO: Performance cost? char c = GetChar(uSeqIndex, uIndex); unsigned uLetter = CharToLetterEx(c); return uLetter; } void MSA::SetSeqName(unsigned uSeqIndex, const char szName[]) { if (uSeqIndex >= m_uSeqCount) Quit("MSA::SetSeqName(%u, %s), count=%u", uSeqIndex, m_uSeqCount); delete[] m_szNames[uSeqIndex]; int n = (int) strlen(szName) + 1; m_szNames[uSeqIndex] = new char[n]; memcpy(m_szNames[uSeqIndex], szName, n); } const char *MSA::GetSeqName(unsigned uSeqIndex) const { if (uSeqIndex >= m_uSeqCount) Quit("MSA::GetSeqName(%u), count=%u", uSeqIndex, m_uSeqCount); return m_szNames[uSeqIndex]; } bool MSA::IsGap(unsigned uSeqIndex, unsigned uIndex) const { char c = GetChar(uSeqIndex, uIndex); return IsGapChar(c); } bool MSA::IsWildcard(unsigned uSeqIndex, unsigned uIndex) const { char c = GetChar(uSeqIndex, uIndex); return IsWildcardChar(c); } void MSA::SetChar(unsigned uSeqIndex, unsigned uIndex, char c) { if (uSeqIndex >= m_uSeqCount || uIndex > m_uCacheSeqLength) Quit("MSA::SetChar(%u,%u)", uSeqIndex, uIndex); if (uIndex == m_uCacheSeqLength) { const unsigned uNewCacheSeqLength = m_uCacheSeqLength + DEFAULT_SEQ_LENGTH; for (unsigned n = 0; n < m_uSeqCount; ++n) { char *ptrNewSeq = new char[uNewCacheSeqLength+1]; memcpy(ptrNewSeq, m_szSeqs[n], m_uCacheSeqLength); memset(ptrNewSeq + m_uCacheSeqLength, '?', DEFAULT_SEQ_LENGTH); ptrNewSeq[uNewCacheSeqLength] = 0; delete[] m_szSeqs[n]; m_szSeqs[n] = ptrNewSeq; } m_uColCount = uIndex; m_uCacheSeqLength = uNewCacheSeqLength; } if (uIndex >= m_uColCount) m_uColCount = uIndex + 1; m_szSeqs[uSeqIndex][uIndex] = c; } void 
MSA::GetSeq(unsigned uSeqIndex, Seq &seq) const { assert(uSeqIndex < m_uSeqCount); seq.Clear(); for (unsigned n = 0; n < m_uColCount; ++n) if (!IsGap(uSeqIndex, n)) { char c = GetChar(uSeqIndex, n); if (!isalpha(c)) Quit("Invalid character '%c' in sequence", c); c = toupper(c); seq.push_back(c); } const char *ptrName = GetSeqName(uSeqIndex); seq.SetName(ptrName); } bool MSA::HasGap() const { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) for (unsigned n = 0; n < GetColCount(); ++n) if (IsGap(uSeqIndex, n)) return true; return false; } bool MSA::IsLegalLetter(unsigned uLetter) const { return uLetter < 20; } void MSA::SetSeqCount(unsigned uSeqCount) { Free(); SetSize(uSeqCount, DEFAULT_SEQ_LENGTH); } void MSA::CopyCol(unsigned uFromCol, unsigned uToCol) { assert(uFromCol < GetColCount()); assert(uToCol < GetColCount()); if (uFromCol == uToCol) return; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char c = GetChar(uSeqIndex, uFromCol); SetChar(uSeqIndex, uToCol, c); } } void MSA::Copy(const MSA &msa) { Free(); const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); SetSize(uSeqCount, uColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { SetSeqName(uSeqIndex, msa.GetSeqName(uSeqIndex)); const unsigned uId = msa.GetSeqId(uSeqIndex); SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msa.GetChar(uSeqIndex, uColIndex); SetChar(uSeqIndex, uColIndex, c); } } } bool MSA::IsGapColumn(unsigned uColIndex) const { assert(GetSeqCount() > 0); for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) if (!IsGap(uSeqIndex, uColIndex)) return false; return true; } bool MSA::GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) if (0 == stricmp(ptrSeqName, GetSeqName(uSeqIndex))) { *ptruSeqIndex = uSeqIndex; return true; } 
return false; } void MSA::DeleteCol(unsigned uColIndex) { assert(uColIndex < m_uColCount); size_t n = m_uColCount - uColIndex; if (n > 0) { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { char *ptrSeq = m_szSeqs[uSeqIndex]; memmove(ptrSeq + uColIndex, ptrSeq + uColIndex + 1, n); } } --m_uColCount; } void MSA::DeleteColumns(unsigned uColIndex, unsigned uColCount) { for (unsigned n = 0; n < uColCount; ++n) DeleteCol(uColIndex); } void MSA::FromFile(TextFile &File) { FromFASTAFile(File); } // Weights sum to 1, WCounts sum to NIC WEIGHT MSA::GetSeqWeight(unsigned uSeqIndex) const { assert(uSeqIndex < m_uSeqCount); WEIGHT w = m_Weights[uSeqIndex]; if (w == wInsane) Quit("Seq weight not set"); return w; } void MSA::SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const { assert(uSeqIndex < m_uSeqCount); m_Weights[uSeqIndex] = w; } void MSA::NormalizeWeights(WEIGHT wDesiredTotal) const { WEIGHT wTotal = 0; for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) wTotal += m_Weights[uSeqIndex]; if (0 == wTotal) return; const WEIGHT f = wDesiredTotal/wTotal; for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) m_Weights[uSeqIndex] *= f; } void MSA::CalcWeights() const { Quit("Calc weights not implemented"); } static void FmtChar(char c, unsigned uWidth) { Log("%c", c); for (unsigned n = 0; n < uWidth - 1; ++n) Log(" "); } static void FmtInt(unsigned u, unsigned uWidth) { static char szStr[1024]; assert(uWidth < sizeof(szStr)); if (u > 0) sprintf(szStr, "%u", u); else strcpy(szStr, "."); Log(szStr); unsigned n = (unsigned) strlen(szStr); if (n < uWidth) for (unsigned i = 0; i < uWidth - n; ++i) Log(" "); } static void FmtInt0(unsigned u, unsigned uWidth) { static char szStr[1024]; assert(uWidth < sizeof(szStr)); sprintf(szStr, "%u", u); Log(szStr); unsigned n = (unsigned) strlen(szStr); if (n < uWidth) for (unsigned i = 0; i < uWidth - n; ++i) Log(" "); } static void FmtPad(unsigned n) { for (unsigned i = 0; i < n; ++i) Log(" "); } 
void MSA::FromSeq(const Seq &s) { unsigned uSeqLength = s.Length(); SetSize(1, uSeqLength); SetSeqName(0, s.GetName()); if (0 != m_SeqIndexToId) SetSeqId(0, s.GetId()); for (unsigned n = 0; n < uSeqLength; ++n) SetChar(0, n, s[n]); } unsigned MSA::GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const { assert(uSeqIndex < GetSeqCount()); assert(uColIndex < GetColCount()); unsigned uCol = 0; for (unsigned n = 0; n <= uColIndex; ++n) if (!IsGap(uSeqIndex, n)) ++uCol; return uCol; } void MSA::CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex) { assert(uToSeqIndex < m_uSeqCount); const unsigned uColCount = msaFrom.GetColCount(); assert(m_uColCount == uColCount || (0 == m_uColCount && uColCount <= m_uCacheSeqLength)); memcpy(m_szSeqs[uToSeqIndex], msaFrom.GetSeqBuffer(uFromSeqIndex), uColCount); SetSeqName(uToSeqIndex, msaFrom.GetSeqName(uFromSeqIndex)); if (0 == m_uColCount) m_uColCount = uColCount; } const char *MSA::GetSeqBuffer(unsigned uSeqIndex) const { assert(uSeqIndex < m_uSeqCount); return m_szSeqs[uSeqIndex]; } void MSA::DeleteSeq(unsigned uSeqIndex) { assert(uSeqIndex < m_uSeqCount); delete m_szSeqs[uSeqIndex]; delete m_szNames[uSeqIndex]; const unsigned uBytesToMove = (m_uSeqCount - uSeqIndex)*sizeof(char *); if (uBytesToMove > 0) { memmove(m_szSeqs + uSeqIndex, m_szSeqs + uSeqIndex + 1, uBytesToMove); memmove(m_szNames + uSeqIndex, m_szNames + uSeqIndex + 1, uBytesToMove); } --m_uSeqCount; delete[] m_Weights; m_Weights = 0; } bool MSA::IsEmptyCol(unsigned uColIndex) const { const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (!IsGap(uSeqIndex, uColIndex)) return false; return true; } //void MSA::DeleteEmptyCols(bool bProgress) // { // unsigned uColCount = GetColCount(); // for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) // { // if (IsEmptyCol(uColIndex)) // { // if (bProgress) // { // Log("Deleting col %u of %u\n", uColIndex, uColCount); // printf("Deleting 
col %u of %u\n", uColIndex, uColCount); // } // DeleteCol(uColIndex); // --uColCount; // } // } // } unsigned MSA::AlignedColIndexToColIndex(unsigned uAlignedColIndex) const { Quit("MSA::AlignedColIndexToColIndex not implemented"); return 0; } WEIGHT MSA::GetTotalSeqWeight() const { WEIGHT wTotal = 0; const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) wTotal += m_Weights[uSeqIndex]; return wTotal; } bool MSA::SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2, unsigned uSeqIndex2) { Seq s1; Seq s2; a1.GetSeq(uSeqIndex1, s1); a2.GetSeq(uSeqIndex2, s2); s1.StripGaps(); s2.StripGaps(); return s1.EqIgnoreCase(s2); } unsigned MSA::GetSeqLength(unsigned uSeqIndex) const { assert(uSeqIndex < GetSeqCount()); const unsigned uColCount = GetColCount(); unsigned uLength = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) if (!IsGap(uSeqIndex, uColIndex)) ++uLength; return uLength; } void MSA::GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrPWID, unsigned *ptruPosCount) const { assert(uSeqIndex1 < GetSeqCount()); assert(uSeqIndex2 < GetSeqCount()); unsigned uSameCount = 0; unsigned uPosCount = 0; const unsigned uColCount = GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { char c1 = GetChar(uSeqIndex1, uColIndex); if (IsGapChar(c1)) continue; char c2 = GetChar(uSeqIndex2, uColIndex); if (IsGapChar(c2)) continue; ++uPosCount; if (c1 == c2) ++uSameCount; } *ptruPosCount = uPosCount; if (uPosCount > 0) *ptrPWID = 100.0 * (double) uSameCount / (double) uPosCount; else *ptrPWID = 0; } void MSA::UnWeight() { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) m_Weights[uSeqIndex] = BTInsane; } unsigned MSA::UniqueResidueTypes(unsigned uColIndex) const { assert(uColIndex < GetColCount()); unsigned Counts[MAX_ALPHA]; memset(Counts, 0, sizeof(Counts)); const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; 
++uSeqIndex) { if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) continue; const unsigned uLetter = GetLetter(uSeqIndex, uColIndex); ++(Counts[uLetter]); } unsigned uUniqueCount = 0; for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) if (Counts[uLetter] > 0) ++uUniqueCount; return uUniqueCount; } double MSA::GetOcc(unsigned uColIndex) const { unsigned uGapCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) ++uGapCount; unsigned uSeqCount = GetSeqCount(); return (double) (uSeqCount - uGapCount) / (double) uSeqCount; } void MSA::ToFile(TextFile &File) const { if (g_bMSF) ToMSFFile(File); else if (g_bAln) ToAlnFile(File); else if (g_bHTML) ToHTMLFile(File); else if (g_bPHYS) ToPhySequentialFile(File); else if (g_bPHYI) ToPhyInterleavedFile(File); else ToFASTAFile(File); if (0 != g_pstrScoreFileName) WriteScoreFile(*this); } bool MSA::ColumnHasGap(unsigned uColIndex) const { const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) return true; return false; } void MSA::SetIdCount(unsigned uIdCount) { //if (m_uIdCount != 0) // Quit("MSA::SetIdCount: may only be called once"); if (m_uIdCount > 0) { if (uIdCount > m_uIdCount) Quit("MSA::SetIdCount: cannot increase count"); return; } m_uIdCount = uIdCount; } void MSA::SetSeqId(unsigned uSeqIndex, unsigned uId) { assert(uSeqIndex < m_uSeqCount); assert(uId < m_uIdCount); if (0 == m_SeqIndexToId) { if (0 == m_uIdCount) Quit("MSA::SetSeqId, SetIdCount has not been called"); m_IdToSeqIndex = new unsigned[m_uIdCount]; m_SeqIndexToId = new unsigned[m_uSeqCount]; memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); } m_SeqIndexToId[uSeqIndex] = uId; m_IdToSeqIndex[uId] = uSeqIndex; } unsigned MSA::GetSeqIndex(unsigned uId) const { assert(uId < m_uIdCount); assert(0 != m_IdToSeqIndex); unsigned 
uSeqIndex = m_IdToSeqIndex[uId]; assert(uSeqIndex < m_uSeqCount); return uSeqIndex; } bool MSA::GetSeqIndex(unsigned uId, unsigned *ptruIndex) const { for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { if (uId == m_SeqIndexToId[uSeqIndex]) { *ptruIndex = uSeqIndex; return true; } } return false; } unsigned MSA::GetSeqId(unsigned uSeqIndex) const { assert(uSeqIndex < m_uSeqCount); unsigned uId = m_SeqIndexToId[uSeqIndex]; assert(uId < m_uIdCount); return uId; } bool MSA::WeightsSet() const { return BTInsane != m_Weights[0]; } void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount, MSA &msaOut) { const unsigned uColCount = msaIn.GetColCount(); msaOut.SetSize(uIdCount, uColCount); for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uIdCount; ++uSeqIndexOut) { const unsigned uId = Ids[uSeqIndexOut]; const unsigned uSeqIndexIn = msaIn.GetSeqIndex(uId); const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); msaOut.SetSeqId(uSeqIndexOut, uId); msaOut.SetSeqName(uSeqIndexOut, ptrName); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); msaOut.SetChar(uSeqIndexOut, uColIndex, c); } } } // Caller must allocate ptrSeq and ptrLabel as new char[n]. 
void MSA::AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel) { if (m_uSeqCount > m_uCacheSeqCount) Quit("Internal error MSA::AppendSeq"); if (m_uSeqCount == m_uCacheSeqCount) ExpandCache(m_uSeqCount + 4, uSeqLength); m_szSeqs[m_uSeqCount] = ptrSeq; m_szNames[m_uSeqCount] = ptrLabel; ++m_uSeqCount; } void MSA::ExpandCache(unsigned uSeqCount, unsigned uColCount) { if (m_IdToSeqIndex != 0 || m_SeqIndexToId != 0 || uSeqCount < m_uSeqCount) Quit("Internal error MSA::ExpandCache"); if (m_uSeqCount > 0 && uColCount != m_uColCount) Quit("Internal error MSA::ExpandCache, ColCount changed"); char **NewSeqs = new char *[uSeqCount]; char **NewNames = new char *[uSeqCount]; WEIGHT *NewWeights = new WEIGHT[uSeqCount]; for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { NewSeqs[uSeqIndex] = m_szSeqs[uSeqIndex]; NewNames[uSeqIndex] = m_szNames[uSeqIndex]; NewWeights[uSeqIndex] = m_Weights[uSeqIndex]; } for (unsigned uSeqIndex = m_uSeqCount; uSeqIndex < uSeqCount; ++uSeqIndex) { char *Seq = new char[uColCount]; NewSeqs[uSeqIndex] = Seq; #if DEBUG memset(Seq, '?', uColCount); #endif } delete[] m_szSeqs; delete[] m_szNames; delete[] m_Weights; m_szSeqs = NewSeqs; m_szNames = NewNames; m_Weights = NewWeights; m_uCacheSeqCount = uSeqCount; m_uCacheSeqLength = uColCount; m_uColCount = uColCount; } void MSA::FixAlpha() { ClearInvalidLetterWarning(); for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { for (unsigned uColIndex = 0; uColIndex < m_uColCount; ++uColIndex) { char c = GetChar(uSeqIndex, uColIndex); if (!IsResidueChar(c) && !IsGapChar(c)) { char w = GetWildcardChar(); // Warning("Invalid letter '%c', replaced by '%c'", c, w); InvalidLetterWarning(c, w); SetChar(uSeqIndex, uColIndex, w); } } } ReportInvalidLetters(); } ALPHA MSA::GuessAlpha() const { // If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap // letters belong to the nucleotide alphabet, guess nucleo. // Otherwise amino. 
const unsigned CHAR_COUNT = 100; const unsigned MIN_NUCLEO_PCT = 95; const unsigned uSeqCount = GetSeqCount(); const unsigned uColCount = GetColCount(); if (0 == uSeqCount) return ALPHA_Amino; unsigned uDNACount = 0; unsigned uRNACount = 0; unsigned uTotal = 0; unsigned i = 0; for (;;) { unsigned uSeqIndex = i/uColCount; if (uSeqIndex >= uSeqCount) break; unsigned uColIndex = i%uColCount; ++i; char c = GetChar(uSeqIndex, uColIndex); if (IsGapChar(c)) continue; if (IsDNA(c)) ++uDNACount; if (IsRNA(c)) ++uRNACount; ++uTotal; if (uTotal >= CHAR_COUNT) break; } if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_RNA; if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_DNA; return ALPHA_Amino; } muscle-3.8.31.orig/fasta2.cpp0000644000175000017500000000460611352261673015273 0ustar kratzcharles#include "muscle.h" #include #include //const int BUFFER_BYTES = 16*1024; const int BUFFER_BYTES = 128; const int CR = '\r'; const int NL = '\n'; #define ADD(c) \ { \ if (Pos >= BufferLength) \ { \ const int NewBufferLength = BufferLength + BUFFER_BYTES; \ char *NewBuffer = new char[NewBufferLength]; \ memcpy(NewBuffer, Buffer, BufferLength); \ delete[] Buffer; \ Buffer = NewBuffer; \ BufferLength = NewBufferLength; \ } \ Buffer[Pos++] = c; \ } // Get next sequence from file. 
char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, bool DeleteGaps) { unsigned BufferLength = 0; unsigned Pos = 0; char *Buffer = 0; int c = fgetc(f); if (EOF == c) return 0; if ('>' != c) Quit("Invalid file format, expected '>' to start FASTA label"); for (;;) { int c = fgetc(f); if (EOF == c) Quit("End-of-file or input error in FASTA label"); // NL or CR terminates label if (NL == c || CR == c) break; // All other characters added to label ADD(c) } // Nul-terminate label ADD(0) *ptrLabel = Buffer; BufferLength = 0; Pos = 0; Buffer = 0; int PreviousChar = NL; for (;;) { int c = fgetc(f); if (EOF == c) { if (feof(f)) break; else if (ferror(f)) Quit("Error reading FASTA file, ferror=TRUE feof=FALSE errno=%d %s", errno, strerror(errno)); else Quit("Error reading FASTA file, fgetc=EOF feof=FALSE ferror=FALSE errno=%d %s", errno, strerror(errno)); } if ('>' == c) { if (NL == PreviousChar || CR == PreviousChar) { ungetc(c, f); break; } else Quit("Unexpected '>' in FASTA sequence data"); } else if (isspace(c)) ; else if (IsGapChar(c)) { if (!DeleteGaps) ADD(c) } else if (isalpha(c)) { c = toupper(c); ADD(c) } else if (isprint(c)) { Warning("Invalid character '%c' in FASTA sequence data, ignored", c); continue; } else { Warning("Invalid byte hex %02x in FASTA sequence data, ignored", (unsigned char) c); continue; } PreviousChar = c; } if (0 == Pos) return GetFastaSeq(f, ptrSeqLength, ptrLabel, DeleteGaps); *ptrSeqLength = Pos; return Buffer; } muscle-3.8.31.orig/progalign.cpp0000644000175000017500000001260611352261667016077 0ustar kratzcharles#include "muscle.h" #include "tree.h" #include "seqvect.h" #include "profile.h" #include "msa.h" #include "pwpath.h" #include "distfunc.h" #include "textfile.h" #include "estring.h" #define TRACE 0 #define VALIDATE 0 #define TRACE_LENGTH_DELTA 0 static void LogLeafNames(const Tree &tree, unsigned uNodeIndex) { const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Leaves = new unsigned[uNodeCount]; unsigned 
uLeafCount; GetLeaves(tree, uNodeIndex, Leaves, &uLeafCount); for (unsigned i = 0; i < uLeafCount; ++i) { if (i > 0) Log(","); Log("%s", tree.GetLeafName(Leaves[i])); } delete[] Leaves; } ProgNode *ProgressiveAlignE(const SeqVect &v, const Tree &GuideTree, MSA &a) { assert(GuideTree.IsRooted()); #if TRACE Log("GuideTree:\n"); GuideTree.LogMe(); #endif const unsigned uSeqCount = v.Length(); const unsigned uNodeCount = 2*uSeqCount - 1; const unsigned uIterCount = uSeqCount - 1; WEIGHT *Weights = new WEIGHT[uSeqCount]; CalcClustalWWeights(GuideTree, Weights); ProgNode *ProgNodes = new ProgNode[uNodeCount]; unsigned uJoin = 0; unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); SetProgressDesc("Align node"); do { if (GuideTree.IsLeaf(uTreeNodeIndex)) { if (uTreeNodeIndex >= uNodeCount) Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount); ProgNode &Node = ProgNodes[uTreeNodeIndex]; unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); if (uId >= uSeqCount) Quit("Seq index out of range"); const Seq &s = *(v[uId]); Node.m_MSA.FromSeq(s); Node.m_MSA.SetSeqId(0, uId); Node.m_uLength = Node.m_MSA.GetColCount(); Node.m_Weight = Weights[uId]; // TODO: Term gaps settable Node.m_Prof = ProfileFromMSA(Node.m_MSA); Node.m_EstringL = 0; Node.m_EstringR = 0; #if TRACE Log("Leaf id=%u\n", uId); Log("MSA=\n"); Node.m_MSA.LogMe(); Log("Profile (from MSA)=\n"); ListProfile(Node.m_Prof, Node.m_uLength, &Node.m_MSA); #endif } else { Progress(uJoin, uSeqCount - 1); ++uJoin; const unsigned uMergeNodeIndex = uTreeNodeIndex; ProgNode &Parent = ProgNodes[uMergeNodeIndex]; const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex); const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); if (g_bVerbose) { Log("Align: ("); LogLeafNames(GuideTree, uLeft); Log(") ("); LogLeafNames(GuideTree, uRight); Log(")\n"); } ProgNode &Node1 = ProgNodes[uLeft]; ProgNode &Node2 = ProgNodes[uRight]; #if TRACE Log("AlignTwoMSAs:\n"); #endif AlignTwoProfs( Node1.m_Prof, Node1.m_uLength, 
Node1.m_Weight, Node2.m_Prof, Node2.m_uLength, Node2.m_Weight, Parent.m_Path, &Parent.m_Prof, &Parent.m_uLength); #if TRACE_LENGTH_DELTA { unsigned L = Node1.m_uLength; unsigned R = Node2.m_uLength; unsigned P = Parent.m_Path.GetEdgeCount(); unsigned Max = L > R ? L : R; unsigned d = P - Max; Log("LD%u;%u;%u;%u\n", L, R, P, d); } #endif PathToEstrings(Parent.m_Path, &Parent.m_EstringL, &Parent.m_EstringR); Parent.m_Weight = Node1.m_Weight + Node2.m_Weight; #if VALIDATE { #if TRACE Log("AlignTwoMSAs:\n"); #endif PWPath TmpPath; AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, TmpPath); ProfPos *P1 = ProfileFromMSA(Node1.m_MSA, true); ProfPos *P2 = ProfileFromMSA(Node2.m_MSA, true); unsigned uLength = Parent.m_MSA.GetColCount(); ProfPos *TmpProf = ProfileFromMSA(Parent.m_MSA, true); #if TRACE Log("Node1 MSA=\n"); Node1.m_MSA.LogMe(); Log("Node1 prof=\n"); ListProfile(Node1.m_Prof, Node1.m_MSA.GetColCount(), &Node1.m_MSA); Log("Node1 prof (from MSA)=\n"); ListProfile(P1, Node1.m_MSA.GetColCount(), &Node1.m_MSA); AssertProfsEq(Node1.m_Prof, Node1.m_uLength, P1, Node1.m_MSA.GetColCount()); Log("Node2 prof=\n"); ListProfile(Node2.m_Prof, Node2.m_MSA.GetColCount(), &Node2.m_MSA); Log("Node2 MSA=\n"); Node2.m_MSA.LogMe(); Log("Node2 prof (from MSA)=\n"); ListProfile(P2, Node2.m_MSA.GetColCount(), &Node2.m_MSA); AssertProfsEq(Node2.m_Prof, Node2.m_uLength, P2, Node2.m_MSA.GetColCount()); TmpPath.AssertEqual(Parent.m_Path); Log("Parent MSA=\n"); Parent.m_MSA.LogMe(); Log("Parent prof=\n"); ListProfile(Parent.m_Prof, Parent.m_uLength, &Parent.m_MSA); Log("Parent prof (from MSA)=\n"); ListProfile(TmpProf, Parent.m_MSA.GetColCount(), &Parent.m_MSA); #endif // TRACE AssertProfsEq(Parent.m_Prof, Parent.m_uLength, TmpProf, Parent.m_MSA.GetColCount()); delete[] P1; delete[] P2; delete[] TmpProf; } #endif // VALIDATE Node1.m_MSA.Clear(); Node2.m_MSA.Clear(); // Don't delete profiles, may need them for tree refinement. 
//delete[] Node1.m_Prof; //delete[] Node2.m_Prof; //Node1.m_Prof = 0; //Node2.m_Prof = 0; } uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); ProgressStepsDone(); if (g_bBrenner) MakeRootMSABrenner((SeqVect &) v, GuideTree, ProgNodes, a); else MakeRootMSA(v, GuideTree, ProgNodes, a); #if VALIDATE { unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; AssertMSAEq(a, RootProgNode.m_MSA); } #endif delete[] Weights; return ProgNodes; } muscle-3.8.31.orig/enums.h0000644000175000017500000000376111352261673014710 0ustar kratzcharles// enums.h // Define enum types. // Exploit macro hacks to avoid lots of repetetive typing. // Generally I am opposed to macro hacks because of the // highly obscure code that results, but in this case it // makes maintenance much easier and less error-prone. // The idea is that this file can be included in different // places with different definitions of s (Start), c (Case) // and e (End). See types.h. 
s(ALPHA) c(ALPHA, Amino) c(ALPHA, DNA) c(ALPHA, RNA) e(ALPHA) s(SEQTYPE) c(SEQTYPE, Protein) c(SEQTYPE, DNA) c(SEQTYPE, RNA) c(SEQTYPE, Auto) e(SEQTYPE) s(ROOT) c(ROOT, Pseudo) c(ROOT, MidLongestSpan) c(ROOT, MinAvgLeafDist) e(ROOT) s(CLUSTER) c(CLUSTER, UPGMA) c(CLUSTER, UPGMAMax) c(CLUSTER, UPGMAMin) c(CLUSTER, UPGMB) c(CLUSTER, NeighborJoining) e(CLUSTER) s(JOIN) c(JOIN, NearestNeighbor) c(JOIN, NeighborJoining) e(JOIN) s(LINKAGE) c(LINKAGE, Min) c(LINKAGE, Avg) c(LINKAGE, Max) c(LINKAGE, NeighborJoining) c(LINKAGE, Biased) e(LINKAGE) s(DISTANCE) c(DISTANCE, Kmer6_6) c(DISTANCE, Kmer20_3) c(DISTANCE, Kmer20_4) c(DISTANCE, Kbit20_3) c(DISTANCE, Kmer4_6) c(DISTANCE, PctIdKimura) c(DISTANCE, PctIdLog) c(DISTANCE, PWKimura) c(DISTANCE, PWScoreDist) c(DISTANCE, ScoreDist) c(DISTANCE, Edit) e(DISTANCE) s(PPSCORE) c(PPSCORE, LE) c(PPSCORE, SP) c(PPSCORE, SV) c(PPSCORE, SPN) e(PPSCORE) s(SEQWEIGHT) c(SEQWEIGHT, None) c(SEQWEIGHT, Henikoff) c(SEQWEIGHT, HenikoffPB) c(SEQWEIGHT, GSC) c(SEQWEIGHT, ClustalW) c(SEQWEIGHT, ThreeWay) e(SEQWEIGHT) s(OBJSCORE) c(OBJSCORE, SP) // Sum of Pairs of sequences c(OBJSCORE, DP) // Dynamic Programming score c(OBJSCORE, XP) // Cross Pairs = sum of pairs between two MSAs c(OBJSCORE, PS) // sum of Prof-Seq score for all seqs in MSA c(OBJSCORE, SPF) // sum of pairs, fast approximation c(OBJSCORE, SPM) // sp if <= 100 seqs, spf otherwise e(OBJSCORE) s(TERMGAPS) c(TERMGAPS, Full) c(TERMGAPS, Half) c(TERMGAPS, Ext) e(TERMGAPS) #undef s #undef c #undef e muscle-3.8.31.orig/objscore2.cpp0000644000175000017500000003304511352261667016005 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "profile.h" #include "objscore.h" #define TRACE 0 #define TRACE_SEQPAIR 0 #define TEST_SPFAST 0 extern SCOREMATRIX VTML_LA; extern SCOREMATRIX PAM200; extern SCOREMATRIX PAM200NoCenter; extern SCOREMATRIX VTML_SP; extern SCOREMATRIX VTML_SPNoCenter; extern SCOREMATRIX NUC_SP; SCORE g_SPScoreLetters; SCORE g_SPScoreGaps; static SCORE TermGapScore(bool 
Gap) { switch (g_TermGaps) { case TERMGAPS_Full: return 0; case TERMGAPS_Half: if (Gap) return g_scoreGapOpen/2; return 0; case TERMGAPS_Ext: if (Gap) return g_scoreGapExtend; return 0; } Quit("TermGapScore?!"); return 0; } SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2) { const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPairLetters, different lengths"); #if TRACE_SEQPAIR { Log("\n"); Log("ScoreSeqPairLetters\n"); MSA msaTmp; msaTmp.SetSize(2, uColCount); msaTmp.CopySeq(0, msa1, uSeqIndex1); msaTmp.CopySeq(1, msa2, uSeqIndex2); msaTmp.LogMe(); } #endif SCORE scoreLetters = 0; SCORE scoreGaps = 0; bool bGapping1 = false; bool bGapping2 = false; unsigned uColStart = 0; bool bLeftTermGap = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bLeftTermGap = true; uColStart = uColIndex; break; } } unsigned uColEnd = uColCount - 1; bool bRightTermGap = false; for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bRightTermGap = true; uColEnd = (unsigned) iColIndex; break; } } #if TRACE_SEQPAIR Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); #endif for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) { unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter1 >= g_AlphaSize) continue; unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex); if (uLetter2 >= g_AlphaSize) continue; SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2]; scoreLetters += scoreMatch; } return scoreLetters; } SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1, const 
MSA &msa2, unsigned uSeqIndex2) { const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPairGaps, different lengths"); #if TRACE_SEQPAIR { Log("\n"); Log("ScoreSeqPairGaps\n"); MSA msaTmp; msaTmp.SetSize(2, uColCount); msaTmp.CopySeq(0, msa1, uSeqIndex1); msaTmp.CopySeq(1, msa2, uSeqIndex2); msaTmp.LogMe(); } #endif SCORE scoreGaps = 0; bool bGapping1 = false; bool bGapping2 = false; unsigned uColStart = 0; bool bLeftTermGap = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bLeftTermGap = true; uColStart = uColIndex; break; } } unsigned uColEnd = uColCount - 1; bool bRightTermGap = false; for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bRightTermGap = true; uColEnd = (unsigned) iColIndex; break; } } #if TRACE_SEQPAIR Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); #endif for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (bGap1 && bGap2) continue; if (bGap1) { if (!bGapping1) { #if TRACE_SEQPAIR Log("Gap open seq 1 col %d\n", uColIndex); #endif if (uColIndex == uColStart) scoreGaps += TermGapScore(true); else scoreGaps += g_scoreGapOpen; bGapping1 = true; } else scoreGaps += g_scoreGapExtend; continue; } else if (bGap2) { if (!bGapping2) { #if TRACE_SEQPAIR Log("Gap open seq 2 col %d\n", uColIndex); #endif if (uColIndex == uColStart) scoreGaps += TermGapScore(true); else scoreGaps += g_scoreGapOpen; bGapping2 = true; } else scoreGaps += g_scoreGapExtend; continue; } bGapping1 = false; bGapping2 = false; } if 
(bGapping1 || bGapping2) { scoreGaps -= g_scoreGapOpen; scoreGaps += TermGapScore(true); } return scoreGaps; } // The usual sum-of-pairs objective score: sum the score // of the alignment of each pair of sequences. SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[]) { #if TRACE Log("==================ObjScoreSP==============\n"); Log("msa=\n"); msa.LogMe(); #endif g_SPScoreLetters = 0; g_SPScoreGaps = 0; if (0 != MatchScore) { const unsigned uColCount = msa.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) MatchScore[uColIndex] = 0; } const unsigned uSeqCount = msa.GetSeqCount(); SCORE scoreTotal = 0; unsigned uPairCount = 0; #if TRACE Log("Seq1 Seq2 wt1 wt2 Letters Gaps Unwt.Score Wt.Score Total\n"); Log("---- ---- ------ ------ ---------- ---------- ---------- ---------- ----------\n"); #endif for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scorePair = scoreLetters + scoreGaps; ++uPairCount; scoreTotal += w*scorePair; g_SPScoreLetters += w*scoreLetters; g_SPScoreGaps += w*scoreGaps; #if TRACE Log("%4d %4d %6.3f %6.3f %10.2f %10.2f %10.2f %10.2f %10.2f >%s >%s\n", uSeqIndex1, uSeqIndex2, w1, w2, scoreLetters, scoreGaps, scorePair, scorePair*w1*w2, scoreTotal, msa.GetSeqName(uSeqIndex1), msa.GetSeqName(uSeqIndex2)); #endif } } #if TEST_SPFAST { SCORE f = ObjScoreSPFast(msa); Log("Fast = %.6g\n", f); Log("Brute = %.6g\n", scoreTotal); if (BTEq(f, scoreTotal)) Log("Agree\n"); else Log("** DISAGREE **\n"); } #endif // return scoreTotal / uPairCount; return scoreTotal; } // Objective score defined as the dynamic programming score. 
// Input is two alignments, which must be of the same length. // Result is the same profile-profile score that is optimized // by dynamic programming. SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[]) { const unsigned uColCount = msa1.GetColCount(); if (msa2.GetColCount() != uColCount) Quit("ObjScoreDP, must be same length"); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const ProfPos *PA = ProfileFromMSA(msa1); const ProfPos *PB = ProfileFromMSA(msa2); return ObjScoreDP_Profs(PA, PB, uColCount1, MatchScore); } SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount, SCORE MatchScore[]) { //#if TRACE // Log("Profile 1:\n"); // ListProfile(PA, uColCount, &msa1); // // Log("Profile 2:\n"); // ListProfile(PB, uColCount, &msa2); //#endif SCORE scoreTotal = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const ProfPos &PPA = PA[uColIndex]; const ProfPos &PPB = PB[uColIndex]; SCORE scoreGap = 0; SCORE scoreMatch = 0; // If gapped column... 
if (PPA.m_bAllGaps && PPB.m_bAllGaps) scoreGap = 0; else if (PPA.m_bAllGaps) { if (uColCount - 1 == uColIndex || !PA[uColIndex+1].m_bAllGaps) scoreGap = PPB.m_scoreGapClose; if (0 == uColIndex || !PA[uColIndex-1].m_bAllGaps) scoreGap += PPB.m_scoreGapOpen; //if (0 == scoreGap) // scoreGap = PPB.m_scoreGapExtend; } else if (PPB.m_bAllGaps) { if (uColCount - 1 == uColIndex || !PB[uColIndex+1].m_bAllGaps) scoreGap = PPA.m_scoreGapClose; if (0 == uColIndex || !PB[uColIndex-1].m_bAllGaps) scoreGap += PPA.m_scoreGapOpen; //if (0 == scoreGap) // scoreGap = PPA.m_scoreGapExtend; } else scoreMatch = ScoreProfPos2(PPA, PPB); if (0 != MatchScore) MatchScore[uColIndex] = scoreMatch; scoreTotal += scoreMatch + scoreGap; extern bool g_bTracePPScore; extern MSA *g_ptrPPScoreMSA1; extern MSA *g_ptrPPScoreMSA2; if (g_bTracePPScore) { const MSA &msa1 = *g_ptrPPScoreMSA1; const MSA &msa2 = *g_ptrPPScoreMSA2; const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); for (unsigned n = 0; n < uSeqCount1; ++n) Log("%c", msa1.GetChar(n, uColIndex)); Log(" "); for (unsigned n = 0; n < uSeqCount2; ++n) Log("%c", msa2.GetChar(n, uColIndex)); Log(" %10.3f", scoreMatch); if (scoreGap != 0) Log(" %10.3f", scoreGap); Log("\n"); } } delete[] PA; delete[] PB; return scoreTotal; } // Objective score defined as the sum of profile-sequence // scores for each sequence in the alignment. The profile // is computed from the entire alignment, so this includes // the score of each sequence against itself. This is to // avoid recomputing the profile each time, so we reduce // complexity but introduce a questionable approximation. // The goal is to see if we can exploit the apparent // improvement in performance of log-expectation score // over the usual sum-of-pairs by optimizing this // objective score in the iterative refinement stage. 
// Only defined for the log-expectation profile score (fatal otherwise).
// Each sequence is scored against the profile of the whole alignment:
// gap columns contribute the profile's open/close scores at the ends of
// a gap run, wildcard columns contribute nothing, letters contribute
// the profile's score for that letter. The per-sequence totals are
// combined using the sequence weights. If MatchScore is non-null it
// receives the weighted per-column letter scores.
SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[])
	{
	if (g_PPScore != PPSCORE_LE)
		Quit("FastScoreMSA_LASimple: LA");

	const unsigned uSeqCount = msa.GetSeqCount();
	const unsigned uColCount = msa.GetColCount();

	const ProfPos *Prof = ProfileFromMSA(msa);

	if (0 != MatchScore)
		{
		for (unsigned uCol = 0; uCol < uColCount; ++uCol)
			MatchScore[uCol] = 0;
		}

	SCORE scoreSum = 0;
	for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq)
		{
		const WEIGHT w = msa.GetSeqWeight(uSeq);
		SCORE scoreThisSeq = 0;
		for (unsigned uCol = 0; uCol < uColCount; ++uCol)
			{
			const ProfPos &PP = Prof[uCol];

			if (msa.IsGap(uSeq, uCol))
				{
				// A gap run opens at the first column or after a
				// non-gap, and closes at the last column or before one.
				const bool bOpen = (0 == uCol || !msa.IsGap(uSeq, uCol - 1));
				const bool bClose = (uColCount - 1 == uCol || !msa.IsGap(uSeq, uCol + 1));

				if (bOpen)
					scoreThisSeq += PP.m_scoreGapOpen;
				if (bClose)
					scoreThisSeq += PP.m_scoreGapClose;
				//if (!bOpen && !bClose)
				//	scoreThisSeq += PP.m_scoreGapExtend;
				continue;
				}

			if (msa.IsWildcard(uSeq, uCol))
				continue;

			const unsigned uLetter = msa.GetLetter(uSeq, uCol);
			const SCORE scoreMatch = PP.m_AAScores[uLetter];
			if (0 != MatchScore)
				MatchScore[uCol] += w*scoreMatch;
			scoreThisSeq += scoreMatch;
			}
		scoreSum += w*scoreThisSeq;
		}

	delete[] Prof;
	return scoreSum;
	}

// The XP score is the sum of the score of each pair of
// sequences between two profiles which are aligned to
// each other. Notice that for two given profiles aligned
// in different ways, the difference in XP score must be
// the same as the difference in SP score because the
// score of a pair of sequences in one profile doesn't
// depend on the alignment.
SCORE ObjScoreXP(const MSA &msa1, const MSA &msa2) { const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount1 != uColCount2) Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); #if TRACE Log(" Score Weight Weight Total\n"); Log("---------- ------ ------ ----------\n"); #endif SCORE scoreTotal = 0; unsigned uPairCount = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) { const WEIGHT w1 = msa1.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) { const WEIGHT w2 = msa2.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqIndex1, msa2, uSeqIndex2); SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqIndex1, msa2, uSeqIndex2); SCORE scorePair = scoreLetters + scoreGaps; scoreTotal += w1*w2*scorePair; ++uPairCount; #if TRACE Log("%10.2f %6.3f %6.3f %10.2f >%s >%s\n", scorePair, w1, w2, scorePair*w1*w2, msa1.GetSeqName(uSeqIndex1), msa2.GetSeqName(uSeqIndex2)); #endif } } if (0 == uPairCount) Quit("0 == uPairCount"); #if TRACE Log("msa1=\n"); msa1.LogMe(); Log("msa2=\n"); msa2.LogMe(); Log("XP=%g\n", scoreTotal); #endif // return scoreTotal / uPairCount; return scoreTotal; } muscle-3.8.31.orig/dpreglist.h0000644000175000017500000000202311352261667015547 0ustar kratzcharles#ifndef dpreglist_h #define dpreglist_h #include "diaglist.h" enum DPREGIONTYPE { DPREGIONTYPE_Unknown, DPREGIONTYPE_Diag, DPREGIONTYPE_Rect }; struct DPRegion { DPREGIONTYPE m_Type; union { Diag m_Diag; Rect m_Rect; }; }; const unsigned MAX_DPREGIONS = 1024; class DPRegionList { public: DPRegionList() { m_uCount = 0; } ~DPRegionList() { Free(); } public: // Creation void Clear() { Free(); } void Add(const DPRegion &r); // Accessors unsigned GetCount() const { return m_uCount; } const DPRegion &Get(unsigned uIndex) const { 
assert(uIndex < m_uCount); return m_DPRegions[uIndex]; } unsigned GetDPArea() const; // Diagnostics void LogMe() const; private: void Free() { m_uCount = 0; } private: unsigned m_uCount; DPRegion m_DPRegions[MAX_DPREGIONS]; }; void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL, unsigned uLengthA, unsigned uLengthB); #endif // dpreglist_h muscle-3.8.31.orig/nwdasimple.cpp0000644000175000017500000003060511352261666016256 0ustar kratzcharles#include "muscle.h" #include #include "pwpath.h" #include "profile.h" #include #define TRACE 0 bool g_bKeepSimpleDP; SCORE *g_DPM; SCORE *g_DPD; SCORE *g_DPE; SCORE *g_DPI; SCORE *g_DPJ; char *g_TBM; char *g_TBD; char *g_TBE; char *g_TBI; char *g_TBJ; #if DOUBLE_AFFINE static char XlatEdgeType(char c) { if ('E' == c) return 'D'; if ('J' == c) return 'I'; return c; } static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -100000) return " *"; sprintf(str, "%6.1f", s); return str; } static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %6c", TBM(uPrefixLengthA, uPrefixLengthB)); Log("\n"); } } static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, 
c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } SCORE NWDASimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; // Allocate DP matrices const size_t LM = uPrefixCountA*uPrefixCountB; SCORE *DPL_ = new SCORE[LM]; SCORE *DPM_ = new SCORE[LM]; SCORE *DPD_ = new SCORE[LM]; SCORE *DPE_ = new SCORE[LM]; SCORE *DPI_ = new SCORE[LM]; SCORE *DPJ_ = new SCORE[LM]; char *TBM_ = new char[LM]; char *TBD_ = new char[LM]; char *TBE_ = new char[LM]; char *TBI_ = new char[LM]; char *TBJ_ = new char[LM]; memset(TBM_, '?', LM); memset(TBD_, '?', LM); memset(TBE_, '?', LM); memset(TBI_, '?', LM); memset(TBJ_, '?', LM); DPM(0, 0) = 0; DPD(0, 0) = MINUS_INFINITY; DPE(0, 0) = MINUS_INFINITY; DPI(0, 0) = MINUS_INFINITY; DPJ(0, 0) = MINUS_INFINITY; DPM(1, 0) = MINUS_INFINITY; DPD(1, 0) = PA[0].m_scoreGapOpen; DPE(1, 0) = PA[0].m_scoreGapOpen2; TBD(1, 0) = 'D'; TBE(1, 0) = 'E'; DPI(1, 0) = MINUS_INFINITY; DPJ(1, 0) = MINUS_INFINITY; DPM(0, 1) = MINUS_INFINITY; DPD(0, 1) = MINUS_INFINITY; DPE(0, 1) = MINUS_INFINITY; DPI(0, 1) = PB[0].m_scoreGapOpen; DPJ(0, 1) = PB[0].m_scoreGapOpen2; TBI(0, 1) = 'I'; TBJ(0, 1) = 'J'; // Empty prefix of B is special case for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { DPM(uPrefixLengthA, 0) = MINUS_INFINITY; DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend; DPE(uPrefixLengthA, 0) = DPE(uPrefixLengthA - 1, 0) + g_scoreGapExtend2; TBD(uPrefixLengthA, 0) = 'D'; TBE(uPrefixLengthA, 0) = 'E'; 
DPI(uPrefixLengthA, 0) = MINUS_INFINITY; DPJ(uPrefixLengthA, 0) = MINUS_INFINITY; } // Empty prefix of A is special case for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { DPM(0, uPrefixLengthB) = MINUS_INFINITY; DPD(0, uPrefixLengthB) = MINUS_INFINITY; DPE(0, uPrefixLengthB) = MINUS_INFINITY; DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend; DPJ(0, uPrefixLengthB) = DPJ(0, uPrefixLengthB - 1) + g_scoreGapExtend2; TBI(0, uPrefixLengthB) = 'I'; TBJ(0, uPrefixLengthB) = 'J'; } // Special case to agree with NWFast, no D-I transitions so... DPD(uLengthA, 0) = MINUS_INFINITY; DPE(uLengthA, 0) = MINUS_INFINITY; // DPI(0, uLengthB) = MINUS_INFINITY; // DPJ(0, uLengthB) = MINUS_INFINITY; // ============ // Main DP loop // ============ SCORE scoreGapCloseB = MINUS_INFINITY; SCORE scoreGapClose2B = MINUS_INFINITY; for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreGapCloseA = MINUS_INFINITY; SCORE scoreGapClose2A = MINUS_INFINITY; for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; { // Match M=LetterA+LetterB SCORE scoreLL = ScoreProfPos2(PPA, PPB); DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL; SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1); SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA; SCORE scoreEM = DPE(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2A; SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB; SCORE scoreJM = DPJ(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2B; SCORE scoreBest; if (scoreMM >= scoreDM && scoreMM >= scoreEM && scoreMM >= scoreIM && scoreMM >= scoreJM) { scoreBest = scoreMM; TBM(uPrefixLengthA, uPrefixLengthB) = 'M'; } else if (scoreDM >= scoreMM && scoreDM >= scoreEM && scoreDM >= scoreIM && scoreDM >= scoreJM) { scoreBest = scoreDM; TBM(uPrefixLengthA, 
uPrefixLengthB) = 'D'; } else if (scoreEM >= scoreMM && scoreEM >= scoreDM && scoreEM >= scoreIM && scoreEM >= scoreJM) { scoreBest = scoreEM; TBM(uPrefixLengthA, uPrefixLengthB) = 'E'; } else if (scoreIM >= scoreMM && scoreIM >= scoreDM && scoreIM >= scoreEM && scoreIM >= scoreJM) { scoreBest = scoreIM; TBM(uPrefixLengthA, uPrefixLengthB) = 'I'; } else { assert(scoreJM >= scoreMM && scoreJM >= scoreDM && scoreJM >= scoreEM && scoreJM >= scoreIM); scoreBest = scoreJM; TBM(uPrefixLengthA, uPrefixLengthB) = 'J'; } DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; } { // Delete D=LetterA+GapB SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen; SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend; SCORE scoreBest; if (scoreMD >= scoreDD) { scoreBest = scoreMD; TBD(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreDD >= scoreMD); scoreBest = scoreDD; TBD(uPrefixLengthA, uPrefixLengthB) = 'D'; } DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; } { // Delete E=LetterA+GapB SCORE scoreME = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen2; SCORE scoreEE = DPE(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend2; SCORE scoreBest; if (scoreME >= scoreEE) { scoreBest = scoreME; TBE(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreEE >= scoreME); scoreBest = scoreEE; TBE(uPrefixLengthA, uPrefixLengthB) = 'E'; } DPE(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert I=GapA+LetterB { SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB - 1].m_scoreGapOpen; SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend; SCORE scoreBest; if (scoreMI >= scoreII) { scoreBest = scoreMI; TBI(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreII > scoreMI); scoreBest = scoreII; TBI(uPrefixLengthA, uPrefixLengthB) = 'I'; } DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert J=GapA+LetterB { SCORE scoreMJ = 
DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB - 1].m_scoreGapOpen2; SCORE scoreJJ = DPJ(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend2; SCORE scoreBest; if (scoreMJ >= scoreJJ) { scoreBest = scoreMJ; TBJ(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreJJ > scoreMJ); scoreBest = scoreJJ; TBJ(uPrefixLengthA, uPrefixLengthB) = 'J'; } DPJ(uPrefixLengthA, uPrefixLengthB) = scoreBest; } scoreGapCloseA = PPA.m_scoreGapClose; scoreGapClose2A = PPA.m_scoreGapClose2; } scoreGapCloseB = PPB.m_scoreGapClose; scoreGapClose2B = PPB.m_scoreGapClose2; } #if TRACE Log("\n"); Log("DA Simple DPL:\n"); ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPM:\n"); ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPD:\n"); ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPE:\n"); ListDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPI:\n"); ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPJ:\n"); ListDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBM:\n"); ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBD:\n"); ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBE:\n"); ListTB(TBE_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBI:\n"); ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBJ:\n"); ListTB(TBJ_, PA, PB, uPrefixCountA, uPrefixCountB); #endif // Trace-back // ========== Path.Clear(); // Find last edge SCORE M = DPM(uLengthA, uLengthB); SCORE D = DPD(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose; SCORE E = DPE(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose2; SCORE I = DPI(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose; SCORE J = DPJ(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose2; char cEdgeType = '?'; SCORE BestScore = M; cEdgeType = 'M'; if (D > 
BestScore) { cEdgeType = 'D'; BestScore = D; } if (E > BestScore) { cEdgeType = 'E'; BestScore = E; } if (I > BestScore) { cEdgeType = 'I'; BestScore = I; } if (J > BestScore) { cEdgeType = 'J'; BestScore = J; } #if TRACE Log("DA Simple: MAB=%.4g DAB=%.4g EAB=%.4g IAB=%.4g JAB=%.4g best=%c\n", M, D, E, I, J, cEdgeType); #endif unsigned PLA = uLengthA; unsigned PLB = uLengthB; for (;;) { PWEdge Edge; Edge.cType = XlatEdgeType(cEdgeType); Edge.uPrefixLengthA = PLA; Edge.uPrefixLengthB = PLB; #if TRACE Log("Prepend %c%d.%d\n", Edge.cType, PLA, PLB); #endif Path.PrependEdge(Edge); switch (cEdgeType) { case 'M': assert(PLA > 0); assert(PLB > 0); cEdgeType = TBM(PLA, PLB); --PLA; --PLB; break; case 'D': assert(PLA > 0); cEdgeType = TBD(PLA, PLB); --PLA; break; case 'E': assert(PLA > 0); cEdgeType = TBE(PLA, PLB); --PLA; break; case 'I': assert(PLB > 0); cEdgeType = TBI(PLA, PLB); --PLB; break; case 'J': assert(PLB > 0); cEdgeType = TBJ(PLA, PLB); --PLB; break; default: Quit("Invalid edge %c", cEdgeType); } if (0 == PLA && 0 == PLB) break; } Path.Validate(); // SCORE Score = TraceBack(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_, Path); #if TRACE SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path); Path.LogMe(); Log("Score = %s Path = %s\n", LocalScoreToStr(BestScore), LocalScoreToStr(scorePath)); #endif if (g_bKeepSimpleDP) { g_DPM = DPM_; g_DPD = DPD_; g_DPE = DPE_; g_DPI = DPI_; g_DPJ = DPJ_; g_TBM = TBM_; g_TBD = TBD_; g_TBE = TBE_; g_TBI = TBI_; g_TBJ = TBJ_; } else { delete[] DPM_; delete[] DPD_; delete[] DPE_; delete[] DPI_; delete[] DPJ_; delete[] TBM_; delete[] TBD_; delete[] TBE_; delete[] TBI_; delete[] TBJ_; } return BestScore; } #endif // DOUBLE_AFFINE muscle-3.8.31.orig/nwdasmall.cpp0000644000175000017500000005367111352261666016105 0ustar kratzcharles#include "muscle.h" #include #include "pwpath.h" #include "profile.h" #include #if DOUBLE_AFFINE // NW double affine small memory, term gaps fully penalized // (so up to caller to adjust in 
profile if desired). #define TRACE 0 #define MIN(x, y) ((x) < (y) ? (x) : (y)) #if TRACE extern bool g_bKeepSimpleDP; extern SCORE *g_DPM; extern SCORE *g_DPD; extern SCORE *g_DPE; extern SCORE *g_DPI; extern SCORE *g_DPJ; extern char *g_TBM; extern char *g_TBD; extern char *g_TBE; extern char *g_TBI; extern char *g_TBJ; #endif #if TRACE #define ALLOC_TRACE() \ const SCORE UNINIT = MINUS_INFINITY; \ const size_t LM = uPrefixCountA*uPrefixCountB; \ \ SCORE *DPM_ = new SCORE[LM]; \ SCORE *DPD_ = new SCORE[LM]; \ SCORE *DPE_ = new SCORE[LM]; \ SCORE *DPI_ = new SCORE[LM]; \ SCORE *DPJ_ = new SCORE[LM]; \ \ char *TBM_ = new char[LM]; \ char *TBD_ = new char[LM]; \ char *TBE_ = new char[LM]; \ char *TBI_ = new char[LM]; \ char *TBJ_ = new char[LM]; \ \ memset(TBM_, '?', LM); \ memset(TBD_, '?', LM); \ memset(TBE_, '?', LM); \ memset(TBI_, '?', LM); \ memset(TBJ_, '?', LM); \ \ for (unsigned i = 0; i <= uLengthA; ++i) \ for (unsigned j = 0; j <= uLengthB; ++j) \ { \ DPM(i, j) = UNINIT; \ DPD(i, j) = UNINIT; \ DPE(i, j) = UNINIT; \ DPI(i, j) = UNINIT; \ DPJ(i, j) = UNINIT; \ } #else #define ALLOC_TRACE() #endif #if TRACE #define SetDPM(i, j, x) DPM(i, j) = x #define SetDPD(i, j, x) DPD(i, j) = x #define SetDPE(i, j, x) DPE(i, j) = x #define SetDPI(i, j, x) DPI(i, j) = x #define SetDPJ(i, j, x) DPJ(i, j) = x #define SetTBM(i, j, x) TBM(i, j) = x #define SetTBD(i, j, x) TBD(i, j) = x #define SetTBE(i, j, x) TBE(i, j) = x #define SetTBI(i, j, x) TBI(i, j) = x #define SetTBJ(i, j, x) TBJ(i, j) = x #else #define SetDPM(i, j, x) /* empty */ #define SetDPD(i, j, x) /* empty */ #define SetDPE(i, j, x) /* empty */ #define SetDPI(i, j, x) /* empty */ #define SetDPJ(i, j, x) /* empty */ #define SetTBM(i, j, x) /* empty */ #define SetTBD(i, j, x) /* empty */ #define SetTBE(i, j, x) /* empty */ #define SetTBI(i, j, x) /* empty */ #define SetTBJ(i, j, x) /* empty */ #endif #define RECURSE_D(i, j) \ { \ SCORE DD = DRow[j] + e; \ SCORE MD = MPrev[j] + PA[i-1].m_scoreGapOpen;\ if (DD > 
MD) \ { \ DRow[j] = DD; \ SetTBD(i, j, 'D'); \ } \ else \ { \ DRow[j] = MD; \ SetBitTBD(TB, i, j, 'M'); \ SetTBD(i, j, 'M'); \ } \ SetDPD(i, j, DRow[j]); \ } #define RECURSE_E(i, j) \ { \ SCORE EE = ERow[j] + e2; \ SCORE ME = MPrev[j] + PA[i-1].m_scoreGapOpen2;\ if (EE > ME) \ { \ ERow[j] = EE; \ SetTBE(i, j, 'E'); \ } \ else \ { \ ERow[j] = ME; \ SetBitTBE(TB, i, j, 'M'); \ SetTBE(i, j, 'M'); \ } \ SetDPE(i, j, ERow[j]); \ } #define RECURSE_D_ATerm(j) RECURSE_D(uLengthA, j) #define RECURSE_E_ATerm(j) RECURSE_E(uLengthA, j) #define RECURSE_D_BTerm(j) RECURSE_D(i, uLengthB) #define RECURSE_E_BTerm(j) RECURSE_E(i, uLengthB) #define RECURSE_I(i, j) \ { \ Iij += e; \ SCORE MI = MCurr[j-1] + PB[j-1].m_scoreGapOpen;\ if (MI >= Iij) \ { \ Iij = MI; \ SetBitTBI(TB, i, j, 'M'); \ SetTBI(i, j, 'M'); \ } \ else \ SetTBI(i, j, 'I'); \ SetDPI(i, j, Iij); \ } #define RECURSE_J(i, j) \ { \ Jij += e2; \ SCORE MJ = MCurr[j-1] + PB[j-1].m_scoreGapOpen2;\ if (MJ >= Jij) \ { \ Jij = MJ; \ SetBitTBJ(TB, i, j, 'M'); \ SetTBJ(i, j, 'M'); \ } \ else \ SetTBJ(i, j, 'I'); \ SetDPJ(i, j, Jij); \ } #define RECURSE_I_ATerm(j) RECURSE_I(uLengthA, j) #define RECURSE_J_ATerm(j) RECURSE_J(uLengthA, j) #define RECURSE_I_BTerm(j) RECURSE_I(i, uLengthB) #define RECURSE_J_BTerm(j) RECURSE_J(i, uLengthB) #define RECURSE_M(i, j) \ { \ SCORE Best = MCurr[j]; /* MM */ \ SetTBM(i+1, j+1, 'M'); \ SetBitTBM(TB, i+1, j+1, 'M'); \ \ SCORE DM = DRow[j] + PA[i-1].m_scoreGapClose; \ if (DM > Best) \ { \ Best = DM; \ SetTBM(i+1, j+1, 'D'); \ SetBitTBM(TB, i+1, j+1, 'D'); \ } \ \ SCORE EM = ERow[j] + PA[i-1].m_scoreGapClose2; \ if (EM > Best) \ { \ Best = EM; \ SetTBM(i+1, j+1, 'E'); \ SetBitTBM(TB, i+1, j+1, 'E'); \ } \ \ SCORE IM = Iij + PB[j-1].m_scoreGapClose; \ if (IM > Best) \ { \ Best = IM; \ SetTBM(i+1, j+1, 'I'); \ SetBitTBM(TB, i+1, j+1, 'I'); \ } \ \ SCORE JM = Jij + PB[j-1].m_scoreGapClose2; \ if (JM > Best) \ { \ Best = JM; \ SetTBM(i+1, j+1, 'J'); \ SetBitTBM(TB, i+1, j+1, 'J'); \ } \ MNext[j+1] += 
Best; \ SetDPM(i+1, j+1, MNext[j+1]); \ } #if TRACE static bool LocalEq(BASETYPE b1, BASETYPE b2) { if (b1 < -100000 && b2 < -100000) return true; double diff = fabs(b1 - b2); if (diff < 0.0001) return true; double sum = fabs(b1) + fabs(b2); return diff/sum < 0.005; } static char Get_M_Char(char Bits) { switch (Bits & BIT_xM) { case BIT_MM: return 'M'; case BIT_DM: return 'D'; case BIT_EM: return 'E'; case BIT_IM: return 'I'; case BIT_JM: return 'J'; } Quit("Huh?"); return '?'; } static char Get_D_Char(char Bits) { return (Bits & BIT_xD) ? 'M' : 'D'; } static char Get_E_Char(char Bits) { return (Bits & BIT_xE) ? 'M' : 'E'; } static char Get_I_Char(char Bits) { return (Bits & BIT_xI) ? 'M' : 'I'; } static char Get_J_Char(char Bits) { return (Bits & BIT_xJ) ? 'M' : 'J'; } static bool DPEq(char c, SCORE *g_DP, SCORE *DPD_, unsigned uPrefixCountA, unsigned uPrefixCountB) { if (0 == g_DP) { Log("***DPDIFF*** DP%c=NULL\n", c); return true; } SCORE *DPM_ = g_DP; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) if (!LocalEq(DPM(i, j), DPD(i, j))) { Log("***DPDIFF*** DP%c(%d, %d) Simple = %.2g, Small = %.2g\n", c, i, j, DPM(i, j), DPD(i, j)); return false; } return true; } static bool CompareTB(char **TB, char *TBM_, char *TBD_, char *TBE_, char *TBI_, char *TBJ_, unsigned uPrefixCountA, unsigned uPrefixCountB) { if (!g_bKeepSimpleDP) return true; SCORE *DPM_ = g_DPM; bool Eq = true; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBM(i, j); char c2 = Get_M_Char(TB[i][j]); if (c1 != '?' && c1 != c2 && DPM(i, j) > -100000) { Log("TBM(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto D; } } D: SCORE *DPD_ = g_DPD; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBD(i, j); char c2 = Get_D_Char(TB[i][j]); if (c1 != '?' 
&& c1 != c2 && DPD(i, j) > -100000) { Log("TBD(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto E; } } E: SCORE *DPE_ = g_DPE; if (0 == TBE_) goto I; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBE(i, j); char c2 = Get_E_Char(TB[i][j]); if (c1 != '?' && c1 != c2 && DPE(i, j) > -100000) { Log("TBE(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto I; } } I: SCORE *DPI_ = g_DPI; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBI(i, j); char c2 = Get_I_Char(TB[i][j]); if (c1 != '?' && c1 != c2 && DPI(i, j) > -100000) { Log("TBI(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto J; } } J: SCORE *DPJ_ = g_DPJ; if (0 == DPJ_) goto Done; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBJ(i, j); char c2 = Get_J_Char(TB[i][j]); if (c1 != '?' && c1 != c2 && DPJ(i, j) > -100000) { Log("TBJ(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto Done; } } Done: if (Eq) Log("TB success\n"); return Eq; } static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -100000) return " *"; sprintf(str, "%6.1f", s); return str; } static void LogDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } static void 
LogBitTB(char **TB, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); Log("Bit TBM:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_M_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBD:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_D_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBE:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_E_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBI:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_I_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBJ:\n"); for (unsigned 
uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_J_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } } static void ListTB(char *TBM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = TBM(uPrefixLengthA, uPrefixLengthB); Log(" %6c", c); } Log("\n"); } } static const char *BitsToStr(char Bits) { static char Str[32]; sprintf(Str, "%cM %cD %cE %cI %cJ", Get_M_Char(Bits), Get_D_Char(Bits), Get_E_Char(Bits), Get_I_Char(Bits), Get_J_Char(Bits)); } #endif // TRACE static inline void SetBitTBM(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MM; break; case 'D': Bit = BIT_DM; break; #if DOUBLE_AFFINE case 'E': Bit = BIT_EM; break; case 'I': Bit = BIT_IM; break; case 'J': Bit = BIT_JM; break; #endif default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xM; TB[i][j] |= Bit; } static inline void SetBitTBD(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MD; break; case 'D': Bit = BIT_DD; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xD; TB[i][j] |= Bit; } static inline void SetBitTBI(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MI; break; case 'I': Bit = 
BIT_II; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xI; TB[i][j] |= Bit; } #if DOUBLE_AFFINE static inline void SetBitTBE(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_ME; break; case 'E': Bit = BIT_EE; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xE; TB[i][j] |= Bit; } static inline void SetBitTBJ(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MJ; break; case 'J': Bit = BIT_JJ; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xJ; TB[i][j] |= Bit; } #endif #if TRACE #define LogMatrices() \ { \ Log("Bit DPM:\n"); \ LogDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPD:\n"); \ LogDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPE:\n"); \ LogDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPI:\n"); \ LogDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPJ:\n"); \ LogDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit TB:\n"); \ LogBitTB(TB, PA, PB, uPrefixCountA, uPrefixCountB); \ bool Same; \ Same = DPEq('M', g_DPM, DPM_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPM success\n"); \ Same = DPEq('D', g_DPD, DPD_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPD success\n"); \ Same = DPEq('E', g_DPE, DPE_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPE success\n"); \ Same = DPEq('I', g_DPI, DPI_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPI success\n"); \ Same = DPEq('J', g_DPJ, DPJ_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPJ success\n"); \ CompareTB(TB, g_TBM, g_TBD, g_TBE, g_TBI, g_TBJ, uPrefixCountA, uPrefixCountB);\ } #else #define LogMatrices() /* empty */ #endif SCORE NWDASmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); ProfPos *pa0 = (ProfPos *) PA; ProfPos *pb0 = (ProfPos *) PB; ProfPos *paa = (ProfPos *) (PA + uLengthA - 1); ProfPos *pbb = (ProfPos *) (PB + uLengthB - 1); pa0->m_scoreGapOpen *= 
-1; pb0->m_scoreGapOpen *= -1; paa->m_scoreGapClose *= -1; pbb->m_scoreGapClose *= -1; pa0->m_scoreGapOpen2 *= -1; pb0->m_scoreGapOpen2 *= -1; paa->m_scoreGapClose2 *= -1; pbb->m_scoreGapClose2 *= -1; const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; const SCORE e = g_scoreGapExtend; const SCORE e2 = g_scoreGapExtend2; const SCORE min_e = MIN(g_scoreGapExtend, g_scoreGapExtend2); ALLOC_TRACE() SCORE *MCurr = new SCORE[uPrefixCountB]; SCORE *MNext = new SCORE[uPrefixCountB]; SCORE *MPrev = new SCORE[uPrefixCountB]; SCORE *DRow = new SCORE[uPrefixCountB]; SCORE *ERow = new SCORE[uPrefixCountB]; char **TB = new char *[uPrefixCountA]; for (unsigned i = 0; i < uPrefixCountA; ++i) { TB[i] = new char [uPrefixCountB]; memset(TB[i], 0, uPrefixCountB); } SCORE Iij = MINUS_INFINITY; SetDPI(0, 0, Iij); SCORE Jij = MINUS_INFINITY; SetDPJ(0, 0, Jij); Iij = PB[0].m_scoreGapOpen; SetDPI(0, 1, Iij); Jij = PB[0].m_scoreGapOpen2; SetDPJ(0, 1, Jij); for (unsigned j = 2; j <= uLengthB; ++j) { Iij += e; Jij += e2; SetDPI(0, j, Iij); SetDPJ(0, j, Jij); SetTBI(0, j, 'I'); SetTBJ(0, j, 'J'); } for (unsigned j = 0; j <= uLengthB; ++j) { DRow[j] = MINUS_INFINITY; ERow[j] = MINUS_INFINITY; SetDPD(0, j, DRow[j]); SetDPE(0, j, ERow[j]); SetTBD(0, j, 'D'); SetTBE(0, j, 'E'); } MPrev[0] = 0; SetDPM(0, 0, MPrev[0]); for (unsigned j = 1; j <= uLengthB; ++j) { MPrev[j] = MINUS_INFINITY; SetDPM(0, j, MPrev[j]); } MCurr[0] = MINUS_INFINITY; SetDPM(1, 0, MCurr[0]); MCurr[1] = ScoreProfPos2(PA[0], PB[0]); SetDPM(1, 1, MCurr[1]); SetBitTBM(TB, 1, 1, 'M'); SetTBM(1, 1, 'M'); for (unsigned j = 2; j <= uLengthB; ++j) { SCORE M = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen + (j - 2)*e + PB[j-2].m_scoreGapClose; SCORE M2 = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen2 + (j - 2)*e2 + PB[j-2].m_scoreGapClose2; if (M >= M2) { MCurr[j] = M; SetBitTBM(TB, 1, j, 'I'); SetTBM(1, j, 'I'); } else { MCurr[j] = M2; SetBitTBM(TB, 1, j, 'J'); SetTBM(1, j, 'J'); } 
SetDPM(1, j, MCurr[j]); } // Main DP loop for (unsigned i = 1; i < uLengthA; ++i) { Iij = MINUS_INFINITY; Jij = MINUS_INFINITY; SetDPI(i, 0, Iij); SetDPJ(i, 0, Jij); DRow[0] = PA[0].m_scoreGapOpen + (i - 1)*e; ERow[0] = PA[0].m_scoreGapOpen2 + (i - 1)*e2; SetDPD(i, 0, DRow[0]); SetDPE(i, 0, ERow[0]); MCurr[0] = MINUS_INFINITY; if (i == 1) { MCurr[1] = ScoreProfPos2(PA[0], PB[0]); SetBitTBM(TB, i, 1, 'M'); SetTBM(i, 1, 'M'); } else { SCORE M = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen + (i - 2)*e + PA[i-2].m_scoreGapClose; SCORE M2 = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen2 + (i - 2)*e2 + PA[i-2].m_scoreGapClose2; if (M >= M2) { MCurr[1] = M; SetBitTBM(TB, i, 1, 'D'); SetTBM(i, 1, 'D'); } else { MCurr[1] = M2; SetBitTBM(TB, i, 1, 'E'); SetTBM(i, 1, 'E'); } } SetDPM(i, 0, MCurr[0]); SetDPM(i, 1, MCurr[1]); for (unsigned j = 1; j < uLengthB; ++j) MNext[j+1] = ScoreProfPos2(PA[i], PB[j]); for (unsigned j = 1; j < uLengthB; ++j) { RECURSE_D(i, j) RECURSE_E(i, j) RECURSE_I(i, j) RECURSE_J(i, j) RECURSE_M(i, j) } // Special case for j=uLengthB RECURSE_D_BTerm(i) RECURSE_E_BTerm(i) RECURSE_I_BTerm(i) RECURSE_J_BTerm(i) // Prev := Curr, Curr := Next, Next := Prev Rotate(MPrev, MCurr, MNext); } // Special case for i=uLengthA MCurr[0] = MINUS_INFINITY; SCORE M = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e + PA[0].m_scoreGapOpen + PA[uLengthA-2].m_scoreGapClose; SCORE M2 = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e + PA[0].m_scoreGapOpen + PA[uLengthA-2].m_scoreGapClose; if (M >= M2) { MCurr[1] = M; SetBitTBM(TB, uLengthA, 1, 'D'); SetTBM(uLengthA, 1, 'D'); } else { MCurr[1] = M2; SetBitTBM(TB, uLengthA, 1, 'E'); SetTBM(uLengthA, 1, 'D'); } SetDPM(uLengthA, 0, MCurr[0]); SetDPM(uLengthA, 1, MCurr[1]); DRow[0] = MINUS_INFINITY; ERow[0] = MINUS_INFINITY; SetDPD(uLengthA, 0, DRow[0]); SetDPE(uLengthA, 0, ERow[0]); for (unsigned j = 1; j <= uLengthB; ++j) { RECURSE_D_ATerm(j); RECURSE_E_ATerm(j); } Iij = MINUS_INFINITY; Jij = 
MINUS_INFINITY; for (unsigned j = 1; j <= uLengthB; ++j) { RECURSE_I_ATerm(j) RECURSE_J_ATerm(j) } LogMatrices(); SCORE MAB = MCurr[uLengthB]; SCORE DAB = DRow[uLengthB] + PA[uLengthA-1].m_scoreGapClose; SCORE EAB = ERow[uLengthB] + PA[uLengthA-1].m_scoreGapClose2; SCORE IAB = Iij + PB[uLengthB-1].m_scoreGapClose; SCORE JAB = Jij + PB[uLengthB-1].m_scoreGapClose2; SCORE Score = MAB; char cEdgeType = 'M'; if (DAB > Score) { Score = DAB; cEdgeType = 'D'; } if (EAB > Score) { Score = EAB; cEdgeType = 'E'; } if (IAB > Score) { Score = IAB; cEdgeType = 'I'; } if (JAB > Score) { Score = JAB; cEdgeType = 'J'; } #if TRACE Log(" Small: MAB=%.4g DAB=%.4g EAB=%.4g IAB=%.4g JAB=%.4g best=%c\n", MAB, DAB, EAB, IAB, JAB, cEdgeType); #endif BitTraceBack(TB, uLengthA, uLengthB, cEdgeType, Path); #if DBEUG Path.Validate(); #endif delete[] MCurr; delete[] MNext; delete[] MPrev; delete[] DRow; delete[] ERow; for (unsigned i = 0; i < uPrefixCountA; ++i) delete[] TB[i]; delete[] TB; return 0; } #endif // DOUBLE_AFFINE muscle-3.8.31.orig/profilefrommsa.cpp0000644000175000017500000001702411352261667017141 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "profile.h" #define TRACE 0 static void LogF(FCOUNT f) { if (f > -0.00001 && f < 0.00001) Log(" "); else Log(" %5.3f", f); } static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -1e10 || s > 1e10) return " *"; sprintf(str, "%5.1f", s); return str; } #if DOUBLE_AFFINE void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA) { Log(" Pos Occ LL LG GL GG Open Close Open2 Clos2\n"); Log(" --- --- -- -- -- -- ---- ----- ----- -----\n"); for (unsigned n = 0; n < uLength; ++n) { const ProfPos &PP = Prof[n]; Log("%5u", n); LogF(PP.m_fOcc); LogF(PP.m_LL); LogF(PP.m_LG); LogF(PP.m_GL); LogF(PP.m_GG); Log(" %s", LocalScoreToStr(-PP.m_scoreGapOpen)); Log(" %s", LocalScoreToStr(-PP.m_scoreGapClose)); Log(" %s", LocalScoreToStr(-PP.m_scoreGapOpen2)); Log(" %s", 
LocalScoreToStr(-PP.m_scoreGapClose2)); if (0 != ptrMSA) { const unsigned uSeqCount = ptrMSA->GetSeqCount(); Log(" "); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%c", ptrMSA->GetChar(uSeqIndex, n)); } Log("\n"); } Log("\n"); Log(" Pos G"); for (unsigned n = 0; n < g_AlphaSize; ++n) Log(" %c", LetterExToChar(n)); Log("\n"); Log(" --- -"); for (unsigned n = 0; n < g_AlphaSize; ++n) Log(" -----"); Log("\n"); for (unsigned n = 0; n < uLength; ++n) { const ProfPos &PP = Prof[n]; Log("%5u", n); if (-1 == PP.m_uResidueGroup) Log(" -", PP.m_uResidueGroup); else Log(" %d", PP.m_uResidueGroup); for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) { FCOUNT f = PP.m_fcCounts[uLetter]; if (f == 0.0) Log(" "); else Log(" %5.3f", f); } if (0 != ptrMSA) { const unsigned uSeqCount = ptrMSA->GetSeqCount(); Log(" "); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%c", ptrMSA->GetChar(uSeqIndex, n)); } Log("\n"); } } #endif // DOUBLE_AFFINE #if SINGLE_AFFINE void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA) { Log(" Pos Occ LL LG GL GG Open Close\n"); Log(" --- --- -- -- -- -- ---- -----\n"); for (unsigned n = 0; n < uLength; ++n) { const ProfPos &PP = Prof[n]; Log("%5u", n); LogF(PP.m_fOcc); LogF(PP.m_LL); LogF(PP.m_LG); LogF(PP.m_GL); LogF(PP.m_GG); Log(" %5.1f", -PP.m_scoreGapOpen); Log(" %5.1f", -PP.m_scoreGapClose); if (0 != ptrMSA) { const unsigned uSeqCount = ptrMSA->GetSeqCount(); Log(" "); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%c", ptrMSA->GetChar(uSeqIndex, n)); } Log("\n"); } Log("\n"); Log(" Pos G"); for (unsigned n = 0; n < g_AlphaSize; ++n) Log(" %c", LetterExToChar(n)); Log("\n"); Log(" --- -"); for (unsigned n = 0; n < g_AlphaSize; ++n) Log(" -----"); Log("\n"); for (unsigned n = 0; n < uLength; ++n) { const ProfPos &PP = Prof[n]; Log("%5u", n); if (-1 == PP.m_uResidueGroup) Log(" -", PP.m_uResidueGroup); else Log(" %d", PP.m_uResidueGroup); for (unsigned 
uLetter = 0; uLetter < g_AlphaSize; ++uLetter) { FCOUNT f = PP.m_fcCounts[uLetter]; if (f == 0.0) Log(" "); else Log(" %5.3f", f); } if (0 != ptrMSA) { const unsigned uSeqCount = ptrMSA->GetSeqCount(); Log(" "); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%c", ptrMSA->GetChar(uSeqIndex, n)); } Log("\n"); } } #endif void SortCounts(const FCOUNT fcCounts[], unsigned SortOrder[]) { static unsigned InitialSortOrder[MAX_ALPHA] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 }; memcpy(SortOrder, InitialSortOrder, g_AlphaSize*sizeof(unsigned)); bool bAny = true; while (bAny) { bAny = false; for (unsigned n = 0; n < g_AlphaSize - 1; ++n) { unsigned i1 = SortOrder[n]; unsigned i2 = SortOrder[n+1]; if (fcCounts[i1] < fcCounts[i2]) { SortOrder[n+1] = i1; SortOrder[n] = i2; bAny = true; } } } } static unsigned AminoGroupFromFCounts(const FCOUNT fcCounts[]) { bool bAny = false; unsigned uConsensusResidueGroup = RESIDUE_GROUP_MULTIPLE; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { if (0 == fcCounts[uLetter]) continue; const unsigned uResidueGroup = ResidueGroup[uLetter]; if (bAny) { if (uResidueGroup != uConsensusResidueGroup) return RESIDUE_GROUP_MULTIPLE; } else { bAny = true; uConsensusResidueGroup = uResidueGroup; } } return uConsensusResidueGroup; } static unsigned NucleoGroupFromFCounts(const FCOUNT fcCounts[]) { bool bAny = false; unsigned uConsensusResidueGroup = RESIDUE_GROUP_MULTIPLE; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) { if (0 == fcCounts[uLetter]) continue; const unsigned uResidueGroup = uLetter; if (bAny) { if (uResidueGroup != uConsensusResidueGroup) return RESIDUE_GROUP_MULTIPLE; } else { bAny = true; uConsensusResidueGroup = uResidueGroup; } } return uConsensusResidueGroup; } unsigned ResidueGroupFromFCounts(const FCOUNT fcCounts[]) { switch (g_Alpha) { case ALPHA_Amino: return AminoGroupFromFCounts(fcCounts); case ALPHA_DNA: case ALPHA_RNA: return NucleoGroupFromFCounts(fcCounts); } 
Quit("ResidueGroupFromFCounts: bad alpha"); return 0; } ProfPos *ProfileFromMSA(const MSA &a) { const unsigned uSeqCount = a.GetSeqCount(); const unsigned uColCount = a.GetColCount(); // Yuck -- cast away const (inconsistent design here). SetMSAWeightsMuscle((MSA &) a); ProfPos *Pos = new ProfPos[uColCount]; unsigned uHydrophobicRunLength = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { ProfPos &PP = Pos[uColIndex]; PP.m_bAllGaps = a.IsGapColumn(uColIndex); FCOUNT fcGapStart; FCOUNT fcGapEnd; FCOUNT fcGapExtend; FCOUNT fOcc; a.GetFractionalWeightedCounts(uColIndex, g_bNormalizeCounts, PP.m_fcCounts, &fcGapStart, &fcGapEnd, &fcGapExtend, &fOcc, &PP.m_LL, &PP.m_LG, &PP.m_GL, &PP.m_GG); PP.m_fOcc = fOcc; SortCounts(PP.m_fcCounts, PP.m_uSortOrder); PP.m_uResidueGroup = ResidueGroupFromFCounts(PP.m_fcCounts); for (unsigned i = 0; i < g_AlphaSize; ++i) { SCORE scoreSum = 0; for (unsigned j = 0; j < g_AlphaSize; ++j) scoreSum += PP.m_fcCounts[j]*(*g_ptrScoreMatrix)[i][j]; PP.m_AAScores[i] = scoreSum; } SCORE sStartOcc = (SCORE) (1.0 - fcGapStart); SCORE sEndOcc = (SCORE) (1.0 - fcGapEnd); PP.m_fcStartOcc = sStartOcc; PP.m_fcEndOcc = sEndOcc; PP.m_scoreGapOpen = sStartOcc*g_scoreGapOpen/2; PP.m_scoreGapClose = sEndOcc*g_scoreGapOpen/2; #if DOUBLE_AFFINE PP.m_scoreGapOpen2 = sStartOcc*g_scoreGapOpen2/2; PP.m_scoreGapClose2 = sEndOcc*g_scoreGapOpen2/2; #endif // PP.m_scoreGapExtend = (SCORE) ((1.0 - fcGapExtend)*scoreGapExtend); #if PAF if (ALHPA_Amino == g_Alpha && sStartOcc > 0.5) { extern SCORE PAFactor(const FCOUNT fcCounts[]); SCORE paf = PAFactor(PP.m_fcCounts); PP.m_scoreGapOpen *= paf; PP.m_scoreGapClose *= paf; } #endif } #if HYDRO if (ALPHA_Amino == g_Alpha) Hydro(Pos, uColCount); #endif #if TRACE { Log("ProfileFromMSA\n"); ListProfile(Pos, uColCount, &a); } #endif return Pos; } muscle-3.8.31.orig/ppscore.cpp0000644000175000017500000000402711352261667015566 0ustar kratzcharles#include "muscle.h" #include "textfile.h" #include "msa.h" #include 
"tree.h" #include "profile.h" #include "objscore.h" bool g_bTracePPScore = false; MSA *g_ptrPPScoreMSA1 = 0; MSA *g_ptrPPScoreMSA2 = 0; static ProfPos *ProfileFromMSALocal(MSA &msa, Tree &tree) { const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); TreeFromMSA(msa, tree, g_Cluster2, g_Distance2, g_Root1); SetMuscleTree(tree); return ProfileFromMSA(msa); } void PPScore() { if (0 == g_pstrFileName1 || 0 == g_pstrFileName2) Quit("-ppscore needs -in1 and -in2"); SetSeqWeightMethod(g_SeqWeight1); TextFile file1(g_pstrFileName1); TextFile file2(g_pstrFileName2); MSA msa1; MSA msa2; msa1.FromFile(file1); msa2.FromFile(file2); const unsigned uLength1 = msa1.GetColCount(); const unsigned uLength2 = msa2.GetColCount(); if (uLength1 != uLength2) Quit("Profiles must have the same length"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = msa1.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); msa1.FixAlpha(); msa2.FixAlpha(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) SetPPScore(PPSCORE_SPN); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); const unsigned uMaxSeqCount = (uSeqCount1 > uSeqCount2 ? 
uSeqCount1 : uSeqCount2); MSA::SetIdCount(uMaxSeqCount); Tree tree1; Tree tree2; ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1); ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2); g_bTracePPScore = true; g_ptrPPScoreMSA1 = &msa1; g_ptrPPScoreMSA2 = &msa2; SCORE Score = ObjScoreDP_Profs(Prof1, Prof2, uLength1); Log("Score=%.4g\n", Score); printf("Score=%.4g\n", Score); } muscle-3.8.31.orig/posgap.cpp0000644000175000017500000000541711352261673015405 0ustar kratzcharles#include "muscle.h" //// Pascaralle and Argos gap factors //// after Table 1 in Thompson et. al. ClustalW NAR paper. //static double PAFFacs[20] = // { // 1.13, // A // 1.13, // C // 0.96, // D // 1.31, // E // 1.20, // F // 0.61, // G // 1.00, // H // 1.32, // I // 0.96, // K // 1.21, // L // 1.29, // M // 0.62, // N // 0.74, // P // 1.07, // Q // 0.72, // R // 0.76, // S // 0.89, // T // 1.25, // V // 1.00, // Y // 1.23, // W // }; // //// (Not used: does not appear to work well). //SCORE PAFactor(const FCOUNT fcCounts[]) // { // if (ALPHA_Amino != g_Alpha) // Quit("PAFFactor: requires amino acid sequence"); // // FCOUNT fLetterCount = 0; // double dSum = 0; // for (unsigned uLetter = 0; uLetter < 20; ++uLetter) // { // const FCOUNT fCount = fcCounts[uLetter]; // dSum += fCount*PAFFacs[uLetter]; // fLetterCount += fCount; // } // if (0 == fLetterCount) // return 0.5; // return (SCORE) (dSum/fLetterCount); // } //static bool Hydrophilic[20] = // { // false, // A // false, // C // true, // D // true, // E // false, // F // true, // G // false, // H // false, // I // true, // K // false, // L // false, // M // true, // N // true, // P // true, // Q // true, // R // true, // S // false, // T // false, // V // false, // Y // false, // W // }; // //bool IsHydrophilic(const FCOUNT fcCounts[]) // { // if (ALPHA_Amino != g_Alpha) // Quit("IsHydrophilic: requires amino acid sequence"); // // for (unsigned uLetter = 0; uLetter < 20; ++uLetter) // if (fcCounts[uLetter] > 0 && !Hydrophilic[uLetter]) // return 
false; // return true; // } // //bool IsHydrophilic(const unsigned uCounts[]) // { // if (ALPHA_Amino != g_Alpha) // Quit("IsHydrophilic: requires amino acid sequence"); // // for (unsigned uLetter = 0; uLetter < 20; ++uLetter) // if (uCounts[uLetter] > 0 && !Hydrophilic[uLetter]) // return false; // return true; // } // LIVCATMFYWHK // Venn Pascaralla B&T Me // L y y y // I y y y // V y y y // C y n // A y y y // T N n // M y y y // F y y y // Y n n // W y n // H n n // K n n static bool Hydrophobic[20] = { true, // A true, // C false, // D false, // E true, // F false, // G true, // H true, // I false, // K true, // L true, // M false, // N false, // P false, // Q false, // R false, // S true, // T true, // V true, // Y true, // W }; bool IsHydrophobic(const FCOUNT fcCounts[]) { if (ALPHA_Amino != g_Alpha) Quit("IsHydrophobic: requires amino acid sequence"); for (unsigned uLetter = 0; uLetter < 20; ++uLetter) if (fcCounts[uLetter] > 0.0 && !Hydrophobic[uLetter]) return false; return true; } muscle-3.8.31.orig/scoredist.cpp0000644000175000017500000000665211352261672016114 0ustar kratzcharles#include #include #include "muscle.h" #include "msa.h" #include "distfunc.h" #include "msa.h" #include "seqvect.h" #include "pwpath.h" // ScoreDist // E. Sonnhammer & V. Hollich, Scoredist: A simple and robust protein sequence // distance estimator, BMC Bioinformatics 2005, 6:108. 
extern int BLOSUM62[20][20]; extern double BLOSUM62_Expected; static const double Dayhoff_CalibrationFactor = 1.3370; static const double JTT_CalibrationFactor = 1.2873; static const double MV_CalibrationFactor = 1.1775; static const double LARGE_D = 3.0; static double CalibrationFactor = JTT_CalibrationFactor; // Similarity score static double Sigma(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2, unsigned *ptrLength) { unsigned Length = 0; double Score = 0; const unsigned ColCount = msa.GetColCount(); for (unsigned ColIndex = 0; ColIndex < ColCount; ++ColIndex) { unsigned Letter1 = msa.GetLetterEx(SeqIndex1, ColIndex); unsigned Letter2 = msa.GetLetterEx(SeqIndex2, ColIndex); if (Letter1 >= 20 || Letter2 >= 20) continue; ++Length; Score += BLOSUM62[Letter1][Letter2]; } *ptrLength = Length; return Score; } // Normalized score static double Sigma_N(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2) { unsigned Length = UINT_MAX; double Score = Sigma(msa, SeqIndex1, SeqIndex2, &Length); double RandomScore = Length*BLOSUM62_Expected; return Score - RandomScore; } // Upper limit static double Sigma_U(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2, unsigned *ptrLength) { double Score11 = Sigma(msa, SeqIndex1, SeqIndex1, ptrLength); double Score22 = Sigma(msa, SeqIndex2, SeqIndex2, ptrLength); return (Score11 + Score22)/2; } // Normalized upper limit static double Sigma_UN(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2) { unsigned Length = UINT_MAX; double Score = Sigma_U(msa, SeqIndex1, SeqIndex2, &Length); double RandomScore = Length*BLOSUM62_Expected; return Score - RandomScore; } double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2) { if (g_Alpha != ALPHA_Amino) Quit("Scoredist is only for amino acid sequences"); double s_N = Sigma_N(msa, SeqIndex1, SeqIndex2); double s_UN = Sigma_UN(msa, SeqIndex1, SeqIndex2); double d = 0.0; if (s_UN != 0) { double Ratio = s_N/s_UN; if (Ratio < 0.001) d = LARGE_D; else d = 
-log(Ratio); } return d*CalibrationFactor; } void DistPWScoreDist(const SeqVect &v, DistFunc &DF) { SEQWEIGHT SeqWeightSave = GetSeqWeightMethod(); SetSeqWeightMethod(SEQWEIGHT_Henikoff); const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; unsigned uCount = 0; SetProgressDesc("PW ScoreDist"); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const Seq &s1 = v.GetSeq(uSeqIndex1); MSA msa1; msa1.FromSeq(s1); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) { if (0 == uCount%20) Progress(uCount, uPairCount); ++uCount; const Seq &s2 = v.GetSeq(uSeqIndex2); MSA msa2; msa2.FromSeq(s2); PWPath Path; MSA msaOut; AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false); float d = (float) GetScoreDist(msaOut, 0, 1); DF.SetDist(uSeqIndex1, uSeqIndex2, d); } } ProgressStepsDone(); SetSeqWeightMethod(SeqWeightSave); } muscle-3.8.31.orig/edgelist.h0000644000175000017500000000077311352261600015347 0ustar kratzcharles#ifndef EdgeList_h #define EdgeList_h class EdgeList { public: EdgeList(); virtual ~EdgeList(); public: void Clear(); void Add(unsigned uNode1, unsigned uNode2); unsigned GetCount() const; void GetEdge(unsigned uIndex, unsigned *ptruNode1, unsigned *ptruNode2) const; void Copy(const EdgeList &rhs); void LogMe() const; private: void Expand(); private: unsigned m_uCount; unsigned m_uCacheSize; unsigned *m_uNode1; unsigned *m_uNode2; }; #endif // EdgeList_h muscle-3.8.31.orig/finddiagsn.cpp0000644000175000017500000000660111352261667016221 0ustar kratzcharles#include "muscle.h" #include "profile.h" #include "diaglist.h" #define TRACE 0 #define pow4(i) (1 << (2*i)) // 4^i = 2^(2*i) const unsigned K = 7; const unsigned KTUPS = pow4(K); static unsigned TuplePos[KTUPS]; static char *TupleToStr(int t) { static char s[K]; for (int i = 0; i < K; ++i) { unsigned Letter = (t/(pow4(i)))%4; assert(Letter >= 0 && Letter < 4); s[K-i-1] = LetterToChar(Letter); } return s; } static 
unsigned GetTuple(const ProfPos *PP, unsigned uPos) { unsigned t = 0; for (unsigned i = 0; i < K; ++i) { const unsigned uLetter = PP[uPos+i].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == uLetter) return EMPTY; t = t*4 + uLetter; } return t; } void FindDiagsNuc(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY, unsigned uLengthY, DiagList &DL) { if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha) Quit("FindDiagsNuc: requires nucleo alphabet"); DL.Clear(); // 16 is arbitrary slop, no principled reason for this. if (uLengthX < K + 16 || uLengthY < K + 16) return; // Set A to shorter profile, B to longer const ProfPos *PA; const ProfPos *PB; unsigned uLengthA; unsigned uLengthB; bool bSwap; if (uLengthX < uLengthY) { bSwap = false; PA = PX; PB = PY; uLengthA = uLengthX; uLengthB = uLengthY; } else { bSwap = true; PA = PY; PB = PX; uLengthA = uLengthY; uLengthB = uLengthX; } #if TRACE Log("FindDiagsNuc(LengthA=%d LengthB=%d\n", uLengthA, uLengthB); #endif // Build tuple map for the longer profile, B if (uLengthB < K) Quit("FindDiags: profile too short"); memset(TuplePos, EMPTY, sizeof(TuplePos)); for (unsigned uPos = 0; uPos < uLengthB - K; ++uPos) { const unsigned uTuple = GetTuple(PB, uPos); if (EMPTY == uTuple) continue; TuplePos[uTuple] = uPos; } // Find matches for (unsigned uPosA = 0; uPosA < uLengthA - K; ++uPosA) { const unsigned uTuple = GetTuple(PA, uPosA); if (EMPTY == uTuple) continue; const unsigned uPosB = TuplePos[uTuple]; if (EMPTY == uPosB) continue; // This tuple is found in both profiles unsigned uStartPosA = uPosA; unsigned uStartPosB = uPosB; // Try to extend the match forwards unsigned uEndPosA = uPosA + K - 1; unsigned uEndPosB = uPosB + K - 1; for (;;) { if (uLengthA - 1 == uEndPosA || uLengthB - 1 == uEndPosB) break; const unsigned uAAGroupA = PA[uEndPosA+1].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == uAAGroupA) break; const unsigned uAAGroupB = PB[uEndPosB+1].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == uAAGroupB) break; if (uAAGroupA 
!= uAAGroupB) break; ++uEndPosA; ++uEndPosB; } uPosA = uEndPosA; #if TRACE { Log("Match: A %4u-%4u ", uStartPosA, uEndPosA); for (unsigned n = uStartPosA; n <= uEndPosA; ++n) Log("%c", LetterToChar(PA[n].m_uResidueGroup)); Log("\n"); Log(" B %4u-%4u ", uStartPosB, uEndPosB); for (unsigned n = uStartPosB; n <= uEndPosB; ++n) Log("%c", LetterToChar(PB[n].m_uResidueGroup)); Log("\n"); } #endif const unsigned uLength = uEndPosA - uStartPosA + 1; assert(uEndPosB - uStartPosB + 1 == uLength); if (uLength >= g_uMinDiagLength) { if (bSwap) DL.Add(uStartPosB, uStartPosA, uLength); else DL.Add(uStartPosA, uStartPosB, uLength); } } } muscle-3.8.31.orig/traceback.cpp0000644000175000017500000001230411352261626016022 0ustar kratzcharles#include "muscle.h" #include "profile.h" #include "pwpath.h" #include #define TRACE 0 #define EQ(a, b) (fabs(a-b) < 0.1) SCORE TraceBack(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, PWPath &Path) { #if TRACE Log("\n"); Log("TraceBack LengthA=%u LengthB=%u\n", uLengthA, uLengthB); #endif assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; Path.Clear(); unsigned uPrefixLengthA = uLengthA; unsigned uPrefixLengthB = uLengthB; const SCORE scoreM = DPM(uPrefixLengthA, uPrefixLengthB); SCORE scoreD = DPD(uPrefixLengthA, uPrefixLengthB); SCORE scoreI = DPI(uPrefixLengthA, uPrefixLengthB); const ProfPos &LastPPA = PA[uLengthA - 1]; const ProfPos &LastPPB = PB[uLengthB - 1]; scoreD += LastPPA.m_scoreGapClose; scoreI += LastPPB.m_scoreGapClose; char cEdgeType = cInsane; SCORE scoreMax; if (scoreM >= scoreD && scoreM >= scoreI) { scoreMax = scoreM; cEdgeType = 'M'; } else if (scoreD >= scoreM && scoreD >= scoreI) { scoreMax = scoreD; cEdgeType = 'D'; } else { assert(scoreI >= scoreM && scoreI >= scoreD); scoreMax = scoreI; cEdgeType = 'I'; } for (;;) { if ('S' == cEdgeType) break; PWEdge Edge; 
Edge.cType = cEdgeType; Edge.uPrefixLengthA = uPrefixLengthA; Edge.uPrefixLengthB = uPrefixLengthB; Path.PrependEdge(Edge); char cPrevEdgeType; unsigned uPrevPrefixLengthA = uPrefixLengthA; unsigned uPrevPrefixLengthB = uPrefixLengthB; switch (cEdgeType) { case 'M': { assert(uPrefixLengthA > 0); assert(uPrefixLengthB > 0); const ProfPos &PPA = PA[uPrefixLengthA - 1]; const ProfPos &PPB = PB[uPrefixLengthB - 1]; const SCORE Score = DPM(uPrefixLengthA, uPrefixLengthB); const SCORE scoreMatch = ScoreProfPos2(PPA, PPB); SCORE scoreSM; if (1 == uPrefixLengthA && 1 == uPrefixLengthB) scoreSM = scoreMatch; else scoreSM = MINUS_INFINITY; SCORE scoreMM = MINUS_INFINITY; SCORE scoreDM = MINUS_INFINITY; SCORE scoreIM = MINUS_INFINITY; if (uPrefixLengthA > 1 && uPrefixLengthB > 1) scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1) + scoreMatch; if (uPrefixLengthA > 1) { SCORE scoreTransDM = PA[uPrefixLengthA-2].m_scoreGapClose; scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransDM + scoreMatch; } if (uPrefixLengthB > 1) { SCORE scoreTransIM = PB[uPrefixLengthB-2].m_scoreGapClose; scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransIM + scoreMatch; } if (EQ(scoreMM, Score)) cPrevEdgeType = 'M'; else if (EQ(scoreDM, Score)) cPrevEdgeType = 'D'; else if (EQ(scoreIM, Score)) cPrevEdgeType = 'I'; else if (EQ(scoreSM, Score)) cPrevEdgeType = 'S'; else Quit("TraceBack: failed to match M score=%g M=%g D=%g I=%g S=%g", Score, scoreMM, scoreDM, scoreIM, scoreSM); --uPrevPrefixLengthA; --uPrevPrefixLengthB; break; } case 'D': { assert(uPrefixLengthA > 0); const SCORE Score = DPD(uPrefixLengthA, uPrefixLengthB); SCORE scoreMD = MINUS_INFINITY; SCORE scoreDD = MINUS_INFINITY; SCORE scoreSD = MINUS_INFINITY; if (uPrefixLengthB == 0) { if (uPrefixLengthA == 1) scoreSD = PA[0].m_scoreGapOpen; else scoreSD = DPD(uPrefixLengthA - 1, 0); } if (uPrefixLengthA > 1) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; SCORE scoreTransMD = PPA.m_scoreGapOpen; scoreMD = 
DPM(uPrefixLengthA-1, uPrefixLengthB) + scoreTransMD; scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB); } if (EQ(Score, scoreMD)) cPrevEdgeType = 'M'; else if (EQ(Score, scoreDD)) cPrevEdgeType = 'D'; else if (EQ(Score, scoreSD)) cPrevEdgeType = 'S'; else Quit("TraceBack: failed to match D"); --uPrevPrefixLengthA; break; } case 'I': { assert(uPrefixLengthB > 0); const SCORE Score = DPI(uPrefixLengthA, uPrefixLengthB); SCORE scoreMI = MINUS_INFINITY; SCORE scoreII = MINUS_INFINITY; SCORE scoreSI = MINUS_INFINITY; if (uPrefixLengthA == 0) { if (uPrefixLengthB == 1) scoreSI = PB[0].m_scoreGapOpen; else scoreSI = DPI(0, uPrefixLengthB - 1); } if (uPrefixLengthB > 1) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreTransMI = PPB.m_scoreGapOpen; scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + scoreTransMI; scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1); } if (EQ(Score, scoreMI)) cPrevEdgeType = 'M'; else if (EQ(Score, scoreII)) cPrevEdgeType = 'I'; else if (EQ(Score, scoreSI)) cPrevEdgeType = 'S'; else Quit("TraceBack: failed to match I"); --uPrevPrefixLengthB; break; } default: assert(false); } #if TRACE Log("Edge %c%c%u.%u", cPrevEdgeType, cEdgeType, uPrefixLengthA, uPrefixLengthB); Log("\n"); #endif cEdgeType = cPrevEdgeType; uPrefixLengthA = uPrevPrefixLengthA; uPrefixLengthB = uPrevPrefixLengthB; } return scoreMax; } muscle-3.8.31.orig/svnversion.h0000644000175000017500000000000511367133036015756 0ustar kratzcharles"31" muscle-3.8.31.orig/cluster.cpp0000644000175000017500000002052011352261600015553 0ustar kratzcharles#include "muscle.h" #include "cluster.h" #include "distfunc.h" static inline float Min(float d1, float d2) { return d1 < d2 ? d1 : d2; } static inline float Max(float d1, float d2) { return d1 > d2 ? 
d1 : d2; } static inline float Mean(float d1, float d2) { return (float) ((d1 + d2)/2.0); } #if _DEBUG void ClusterTree::Validate(unsigned uNodeCount) { unsigned n; ClusterNode *pNode; unsigned uDisjointListCount = 0; for (pNode = m_ptrDisjoints; pNode; pNode = pNode->GetNextDisjoint()) { ClusterNode *pPrev = pNode->GetPrevDisjoint(); ClusterNode *pNext = pNode->GetNextDisjoint(); if (0 != pPrev) { if (pPrev->GetNextDisjoint() != pNode) { Log("Prev->This mismatch, prev=\n"); pPrev->LogMe(); Log("This=\n"); pNode->LogMe(); Quit("ClusterTree::Validate()"); } } else { if (pNode != m_ptrDisjoints) { Log("[%u]->prev = 0 but != m_ptrDisjoints=%d\n", pNode->GetIndex(), m_ptrDisjoints ? m_ptrDisjoints->GetIndex() : 0xffffffff); pNode->LogMe(); Quit("ClusterTree::Validate()"); } } if (0 != pNext) { if (pNext->GetPrevDisjoint() != pNode) { Log("Next->This mismatch, next=\n"); pNext->LogMe(); Log("This=\n"); pNode->LogMe(); Quit("ClusterTree::Validate()"); } } ++uDisjointListCount; if (uDisjointListCount > m_uNodeCount) Quit("Loop in disjoint list"); } unsigned uParentlessNodeCount = 0; for (n = 0; n < uNodeCount; ++n) if (0 == m_Nodes[n].GetParent()) ++uParentlessNodeCount; if (uDisjointListCount != uParentlessNodeCount) Quit("Disjoints = %u Parentless = %u\n", uDisjointListCount, uParentlessNodeCount); } #else // !_DEBUG #define Validate(uNodeCount) // empty #endif void ClusterNode::LogMe() const { unsigned uClusterSize = GetClusterSize(); Log("[%02u] w=%5.3f CW=%5.3f LBW=%5.3f RBW=%5.3f LWT=%5.3f RWT=%5.3f L=%02d R=%02d P=%02d NxDj=%02d PvDj=%02d Sz=%02d {", m_uIndex, m_dWeight, GetClusterWeight(), GetLeftBranchWeight(), GetRightBranchWeight(), GetLeftWeight(), GetRightWeight(), m_ptrLeft ? m_ptrLeft->GetIndex() : 0xffffffff, m_ptrRight ? m_ptrRight->GetIndex() : 0xffffffff, m_ptrParent ? m_ptrParent->GetIndex() : 0xffffffff, m_ptrNextDisjoint ? m_ptrNextDisjoint->GetIndex() : 0xffffffff, m_ptrPrevDisjoint ? 
m_ptrPrevDisjoint->GetIndex() : 0xffffffff, uClusterSize); for (unsigned i = 0; i < uClusterSize; ++i) Log(" %u", GetClusterLeaf(i)->GetIndex()); Log(" }\n"); } // How many leaves in the sub-tree under this node? unsigned ClusterNode::GetClusterSize() const { unsigned uLeafCount = 0; if (0 == m_ptrLeft && 0 == m_ptrRight) return 1; if (0 != m_ptrLeft) uLeafCount += m_ptrLeft->GetClusterSize(); if (0 != m_ptrRight) uLeafCount += m_ptrRight->GetClusterSize(); assert(uLeafCount > 0); return uLeafCount; } double ClusterNode::GetClusterWeight() const { double dWeight = 0.0; if (0 != m_ptrLeft) dWeight += m_ptrLeft->GetClusterWeight(); if (0 != m_ptrRight) dWeight += m_ptrRight->GetClusterWeight(); return dWeight + GetWeight(); } double ClusterNode::GetLeftBranchWeight() const { const ClusterNode *ptrLeft = GetLeft(); if (0 == ptrLeft) return 0.0; return GetWeight() - ptrLeft->GetWeight(); } double ClusterNode::GetRightBranchWeight() const { const ClusterNode *ptrRight = GetRight(); if (0 == ptrRight) return 0.0; return GetWeight() - ptrRight->GetWeight(); } double ClusterNode::GetRightWeight() const { const ClusterNode *ptrRight = GetRight(); if (0 == ptrRight) return 0.0; return ptrRight->GetClusterWeight() + GetWeight(); } double ClusterNode::GetLeftWeight() const { const ClusterNode *ptrLeft = GetLeft(); if (0 == ptrLeft) return 0.0; return ptrLeft->GetClusterWeight() + GetWeight(); } // Return n'th leaf in the sub-tree under this node. 
const ClusterNode *ClusterNode::GetClusterLeaf(unsigned uLeafIndex) const { if (0 != m_ptrLeft) { if (0 == m_ptrRight) return this; unsigned uLeftLeafCount = m_ptrLeft->GetClusterSize(); if (uLeafIndex < uLeftLeafCount) return m_ptrLeft->GetClusterLeaf(uLeafIndex); assert(uLeafIndex >= uLeftLeafCount); return m_ptrRight->GetClusterLeaf(uLeafIndex - uLeftLeafCount); } if (0 == m_ptrRight) return this; return m_ptrRight->GetClusterLeaf(uLeafIndex); } void ClusterTree::DeleteFromDisjoints(ClusterNode *ptrNode) { ClusterNode *ptrPrev = ptrNode->GetPrevDisjoint(); ClusterNode *ptrNext = ptrNode->GetNextDisjoint(); if (0 != ptrPrev) ptrPrev->SetNextDisjoint(ptrNext); else m_ptrDisjoints = ptrNext; if (0 != ptrNext) ptrNext->SetPrevDisjoint(ptrPrev); #if _DEBUG // not algorithmically necessary, but improves clarity // and supports Validate(). ptrNode->SetPrevDisjoint(0); ptrNode->SetNextDisjoint(0); #endif } void ClusterTree::AddToDisjoints(ClusterNode *ptrNode) { ptrNode->SetNextDisjoint(m_ptrDisjoints); ptrNode->SetPrevDisjoint(0); if (0 != m_ptrDisjoints) m_ptrDisjoints->SetPrevDisjoint(ptrNode); m_ptrDisjoints = ptrNode; } ClusterTree::ClusterTree() { m_ptrDisjoints = 0; m_Nodes = 0; m_uNodeCount = 0; } ClusterTree::~ClusterTree() { delete[] m_Nodes; } void ClusterTree::LogMe() const { Log("Disjoints=%d\n", m_ptrDisjoints ? m_ptrDisjoints->GetIndex() : 0xffffffff); for (unsigned i = 0; i < m_uNodeCount; ++i) { m_Nodes[i].LogMe(); } } ClusterNode *ClusterTree::GetRoot() const { return &m_Nodes[m_uNodeCount - 1]; } // This is the UPGMA algorithm as described in Durbin et al. p166. 
void ClusterTree::Create(const DistFunc &Dist) { unsigned i; m_uLeafCount = Dist.GetCount(); m_uNodeCount = 2*m_uLeafCount - 1; delete[] m_Nodes; m_Nodes = new ClusterNode[m_uNodeCount]; for (i = 0; i < m_uNodeCount; ++i) m_Nodes[i].SetIndex(i); for (i = 0; i < m_uLeafCount - 1; ++i) m_Nodes[i].SetNextDisjoint(&m_Nodes[i+1]); for (i = 1; i < m_uLeafCount; ++i) m_Nodes[i].SetPrevDisjoint(&m_Nodes[i-1]); m_ptrDisjoints = &m_Nodes[0]; // Log("Initial state\n"); // LogMe(); // Log("\n"); DistFunc ClusterDist; ClusterDist.SetCount(m_uNodeCount); double dMaxDist = 0.0; for (i = 0; i < m_uLeafCount; ++i) for (unsigned j = 0; j < m_uLeafCount; ++j) { float dDist = Dist.GetDist(i, j); ClusterDist.SetDist(i, j, dDist); } Validate(m_uLeafCount); // Iteration. N-1 joins needed to create a binary tree from N leaves. for (unsigned uJoinIndex = m_uLeafCount; uJoinIndex < m_uNodeCount; ++uJoinIndex) { // Find closest pair of clusters unsigned uIndexClosest1; unsigned uIndexClosest2; bool bFound = false; double dDistClosest = 9e99; for (ClusterNode *ptrNode1 = m_ptrDisjoints; ptrNode1; ptrNode1 = ptrNode1->GetNextDisjoint()) { for (ClusterNode *ptrNode2 = ptrNode1->GetNextDisjoint(); ptrNode2; ptrNode2 = ptrNode2->GetNextDisjoint()) { unsigned i1 = ptrNode1->GetIndex(); unsigned i2 = ptrNode2->GetIndex(); double dDist = ClusterDist.GetDist(i1, i2); if (dDist < dDistClosest) { bFound = true; dDistClosest = dDist; uIndexClosest1 = i1; uIndexClosest2 = i2; } } } assert(bFound); ClusterNode &Join = m_Nodes[uJoinIndex]; ClusterNode &Child1 = m_Nodes[uIndexClosest1]; ClusterNode &Child2 = m_Nodes[uIndexClosest2]; Join.SetLeft(&Child1); Join.SetRight(&Child2); Join.SetWeight(dDistClosest); Child1.SetParent(&Join); Child2.SetParent(&Join); DeleteFromDisjoints(&Child1); DeleteFromDisjoints(&Child2); AddToDisjoints(&Join); // Log("After join %d %d\n", uIndexClosest1, uIndexClosest2); // LogMe(); // Calculate distance of every remaining disjoint cluster to the // new cluster created by the 
join for (ClusterNode *ptrNode = m_ptrDisjoints; ptrNode; ptrNode = ptrNode->GetNextDisjoint()) { unsigned uNodeIndex = ptrNode->GetIndex(); float dDist1 = ClusterDist.GetDist(uNodeIndex, uIndexClosest1); float dDist2 = ClusterDist.GetDist(uNodeIndex, uIndexClosest2); float dDist = Min(dDist1, dDist2); ClusterDist.SetDist(uJoinIndex, uNodeIndex, dDist); } Validate(uJoinIndex+1); } GetRoot()->GetClusterWeight(); // LogMe(); } muscle-3.8.31.orig/clustsetdf.h0000644000175000017500000000176211352261612015731 0ustar kratzcharles#ifndef ClustSetDF_h #define ClustSetDF_h class MSA; class Clust; #include "clustset.h" #include "distfunc.h" #include "msa.h" class ClustSetDF : public ClustSet { public: ClustSetDF(const DistFunc &DF) : m_ptrDF(&DF) { } public: virtual unsigned GetLeafCount() { return m_ptrDF->GetCount(); } virtual const char *GetLeafName(unsigned uNodeIndex) { return m_ptrDF->GetName(uNodeIndex); } virtual unsigned GetLeafId(unsigned uNodeIndex) { return m_ptrDF->GetId(uNodeIndex); } virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex, unsigned uRightNodeIndex, unsigned uJoinedNodeIndex, double *ptrdLeftLength, double *ptrdRightLength) { Quit("ClustSetDF::JoinNodes, should never be called"); } virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1, unsigned uNodeIndex2) { return m_ptrDF->GetDist(uNodeIndex1, uNodeIndex2); } private: const DistFunc *m_ptrDF; }; #endif // ClustSetDF_h muscle-3.8.31.orig/phy2.cpp0000644000175000017500000001657711352261600014775 0ustar kratzcharles#include "muscle.h" #include "tree.h" #define TRACE 0 // Return false when done bool PhyEnumEdges(const Tree &tree, PhyEnumEdgeState &ES) { unsigned uNode1 = uInsane; if (!ES.m_bInit) { if (tree.GetNodeCount() <= 1) { ES.m_uNodeIndex1 = NULL_NEIGHBOR; ES.m_uNodeIndex2 = NULL_NEIGHBOR; return false; } uNode1 = tree.FirstDepthFirstNode(); ES.m_bInit = true; } else { uNode1 = tree.NextDepthFirstNode(ES.m_uNodeIndex1); if (NULL_NEIGHBOR == uNode1) return false; if 
(tree.IsRooted() && tree.IsRoot(uNode1)) { uNode1 = tree.NextDepthFirstNode(uNode1); if (NULL_NEIGHBOR == uNode1) return false; } } unsigned uNode2 = tree.GetParent(uNode1); ES.m_uNodeIndex1 = uNode1; ES.m_uNodeIndex2 = uNode2; return true; } bool PhyEnumEdgesR(const Tree &tree, PhyEnumEdgeState &ES) { unsigned uNode1 = uInsane; if (!ES.m_bInit) { if (tree.GetNodeCount() <= 1) { ES.m_uNodeIndex1 = NULL_NEIGHBOR; ES.m_uNodeIndex2 = NULL_NEIGHBOR; return false; } uNode1 = tree.FirstDepthFirstNodeR(); ES.m_bInit = true; } else { uNode1 = tree.NextDepthFirstNodeR(ES.m_uNodeIndex1); if (NULL_NEIGHBOR == uNode1) return false; if (tree.IsRooted() && tree.IsRoot(uNode1)) { uNode1 = tree.NextDepthFirstNode(uNode1); if (NULL_NEIGHBOR == uNode1) return false; } } unsigned uNode2 = tree.GetParent(uNode1); ES.m_uNodeIndex1 = uNode1; ES.m_uNodeIndex2 = uNode2; return true; } static void GetLeavesSubtree(const Tree &tree, unsigned uNodeIndex1, const unsigned uNodeIndex2, unsigned Leaves[], unsigned *ptruCount) { if (tree.IsLeaf(uNodeIndex1)) { Leaves[*ptruCount] = uNodeIndex1; ++(*ptruCount); return; } const unsigned uLeft = tree.GetFirstNeighbor(uNodeIndex1, uNodeIndex2); const unsigned uRight = tree.GetSecondNeighbor(uNodeIndex1, uNodeIndex2); if (NULL_NEIGHBOR != uLeft) GetLeavesSubtree(tree, uLeft, uNodeIndex1, Leaves, ptruCount); if (NULL_NEIGHBOR != uRight) GetLeavesSubtree(tree, uRight, uNodeIndex1, Leaves, ptruCount); } static void PhyGetLeaves(const Tree &tree, unsigned uNodeIndex1, unsigned uNodeIndex2, unsigned Leaves[], unsigned *ptruCount) { *ptruCount = 0; GetLeavesSubtree(tree, uNodeIndex1, uNodeIndex2, Leaves, ptruCount); } bool PhyEnumBiParts(const Tree &tree, PhyEnumEdgeState &ES, unsigned Leaves1[], unsigned *ptruCount1, unsigned Leaves2[], unsigned *ptruCount2) { bool bOk = PhyEnumEdges(tree, ES); if (!bOk) { *ptruCount1 = 0; *ptruCount2 = 0; return false; } // Special case: in a rooted tree, both edges from the root // give the same bipartition, so skip one 
of them. if (tree.IsRooted() && tree.IsRoot(ES.m_uNodeIndex2) && tree.GetRight(ES.m_uNodeIndex2) == ES.m_uNodeIndex1) { bOk = PhyEnumEdges(tree, ES); if (!bOk) return false; } PhyGetLeaves(tree, ES.m_uNodeIndex1, ES.m_uNodeIndex2, Leaves1, ptruCount1); PhyGetLeaves(tree, ES.m_uNodeIndex2, ES.m_uNodeIndex1, Leaves2, ptruCount2); if (*ptruCount1 + *ptruCount2 != tree.GetLeafCount()) Quit("PhyEnumBiParts %u + %u != %u", *ptruCount1, *ptruCount2, tree.GetLeafCount()); #if DEBUG { for (unsigned i = 0; i < *ptruCount1; ++i) { if (!tree.IsLeaf(Leaves1[i])) Quit("PhyEnumByParts: not leaf"); for (unsigned j = 0; j < *ptruCount2; ++j) { if (!tree.IsLeaf(Leaves2[j])) Quit("PhyEnumByParts: not leaf"); if (Leaves1[i] == Leaves2[j]) Quit("PhyEnumByParts: dupe"); } } } #endif return true; } #if 0 void TestBiPart() { SetListFileName("c:\\tmp\\lobster.log", false); Tree tree; TextFile fileIn("c:\\tmp\\test.phy"); tree.FromFile(fileIn); tree.LogMe(); const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Leaves1 = new unsigned[uNodeCount]; unsigned *Leaves2 = new unsigned[uNodeCount]; PhyEnumEdgeState ES; bool bDone = false; for (;;) { unsigned uCount1 = uInsane; unsigned uCount2 = uInsane; bool bOk = PhyEnumBiParts(tree, ES, Leaves1, &uCount1, Leaves2, &uCount2); Log("PEBP=%d ES.Init=%d ES.ni1=%d ES.ni2=%d\n", bOk, ES.m_bInit, ES.m_uNodeIndex1, ES.m_uNodeIndex2); if (!bOk) break; Log("\n"); Log("Part1: "); for (unsigned n = 0; n < uCount1; ++n) Log(" %d(%s)", Leaves1[n], tree.GetLeafName(Leaves1[n])); Log("\n"); Log("Part2: "); for (unsigned n = 0; n < uCount2; ++n) Log(" %d(%s)", Leaves2[n], tree.GetLeafName(Leaves2[n])); Log("\n"); } } #endif static void GetLeavesSubtreeExcluding(const Tree &tree, unsigned uNodeIndex, unsigned uExclude, unsigned Leaves[], unsigned *ptruCount) { if (uNodeIndex == uExclude) return; if (tree.IsLeaf(uNodeIndex)) { Leaves[*ptruCount] = uNodeIndex; ++(*ptruCount); return; } const unsigned uLeft = tree.GetLeft(uNodeIndex); const unsigned uRight = 
tree.GetRight(uNodeIndex); if (NULL_NEIGHBOR != uLeft) GetLeavesSubtreeExcluding(tree, uLeft, uExclude, Leaves, ptruCount); if (NULL_NEIGHBOR != uRight) GetLeavesSubtreeExcluding(tree, uRight, uExclude, Leaves, ptruCount); } void GetLeavesExcluding(const Tree &tree, unsigned uNodeIndex, unsigned uExclude, unsigned Leaves[], unsigned *ptruCount) { *ptruCount = 0; GetLeavesSubtreeExcluding(tree, uNodeIndex, uExclude, Leaves, ptruCount); } void GetInternalNodesInHeightOrder(const Tree &tree, unsigned NodeIndexes[]) { const unsigned uNodeCount = tree.GetNodeCount(); if (uNodeCount < 3) Quit("GetInternalNodesInHeightOrder: %u nodes, none are internal", uNodeCount); const unsigned uInternalNodeCount = (uNodeCount - 1)/2; double *Heights = new double[uInternalNodeCount]; unsigned uIndex = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (tree.IsLeaf(uNodeIndex)) continue; NodeIndexes[uIndex] = uNodeIndex; Heights[uIndex] = tree.GetNodeHeight(uNodeIndex); ++uIndex; } if (uIndex != uInternalNodeCount) Quit("Internal error: GetInternalNodesInHeightOrder"); // Simple but slow bubble sort (probably don't care about speed here) bool bDone = false; while (!bDone) { bDone = true; for (unsigned i = 0; i < uInternalNodeCount - 1; ++i) { if (Heights[i] > Heights[i+1]) { double dTmp = Heights[i]; Heights[i] = Heights[i+1]; Heights[i+1] = dTmp; unsigned uTmp = NodeIndexes[i]; NodeIndexes[i] = NodeIndexes[i+1]; NodeIndexes[i+1] = uTmp; bDone = false; } } } #if TRACE Log("Internal node index Height\n"); Log("------------------- --------\n"); // 1234567890123456789 123456789 for (unsigned n = 0; n < uInternalNodeCount; ++n) Log("%19u %9.3f\n", NodeIndexes[n], Heights[n]); #endif delete[] Heights; } void ApplyMinEdgeLength(Tree &tree, double dMinEdgeLength) { const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { const unsigned uNeighborCount = tree.GetNeighborCount(uNodeIndex); for (unsigned n = 
0; n < uNeighborCount; ++n) { const unsigned uNeighborNodeIndex = tree.GetNeighbor(uNodeIndex, n); if (!tree.HasEdgeLength(uNodeIndex, uNeighborNodeIndex)) continue; if (tree.GetEdgeLength(uNodeIndex, uNeighborNodeIndex) < dMinEdgeLength) tree.SetEdgeLength(uNodeIndex, uNeighborNodeIndex, dMinEdgeLength); } } } muscle-3.8.31.orig/alpha.h0000644000175000017500000000376011352261667014650 0ustar kratzcharles#ifndef alpha_h #define alpha_h bool StrHasAmino(const char *Str); bool StrHasGap(const char *Str); void ClearInvalidLetterWarning(); void InvalidLetterWarning(char c, char w); void ReportInvalidLetters(); extern unsigned g_CharToLetter[]; extern unsigned g_CharToLetterEx[]; extern char g_LetterToChar[]; extern char g_LetterExToChar[]; extern char g_UnalignChar[]; extern char g_AlignChar[]; extern bool g_IsWildcardChar[]; extern bool g_IsResidueChar[]; #define CharToLetter(c) (g_CharToLetter[(unsigned char) (c)]) #define CharToLetterEx(c) (g_CharToLetterEx[(unsigned char) (c)]) #define LetterToChar(u) (g_LetterToChar[u]) #define LetterExToChar(u) (g_LetterExToChar[u]) #define IsResidueChar(c) (g_IsResidueChar[(unsigned char) (c)]) #define IsGapChar(c) ('-' == (c) || '.' 
== (c)) #define IsWildcardChar(c) (g_IsWildcardChar[(unsigned char) (c)]) #define AlignChar(c) (g_AlignChar[(unsigned char) (c)]) #define UnalignChar(c) (g_UnalignChar[(unsigned char) (c)]) // AX=Amino alphabet with eXtensions (B, Z and X) enum AX { AX_A, AX_C, AX_D, AX_E, AX_F, AX_G, AX_H, AX_I, AX_K, AX_L, AX_M, AX_N, AX_P, AX_Q, AX_R, AX_S, AX_T, AX_V, AX_W, AX_Y, AX_X, // Any AX_B, // D or N AX_Z, // E or Q AX_GAP, }; const unsigned AX_COUNT = AX_GAP + 1; // NX=Nucleotide alphabet with extensions enum NX { NX_A, NX_C, NX_G, NX_T, NX_U = NX_T, NX_M, // AC NX_R, // AG NX_W, // AT NX_S, // CG NX_Y, // CT NX_K, // GT NX_V, // ACG NX_H, // ACT NX_D, // AGT NX_B, // CGT NX_X, // GATC NX_N, // GATC NX_GAP }; const unsigned NX_COUNT = NX_GAP + 1; const unsigned MAX_ALPHA = 20; const unsigned MAX_ALPHA_EX = AX_COUNT; const unsigned MAX_CHAR = 256; extern ALPHA g_Alpha; extern unsigned g_AlphaSize; void SetAlpha(ALPHA Alpha); char GetWildcardChar(); bool IsNucleo(char c); bool IsDNA(char c); bool IsRNA(char c); #endif // alpha_h muscle-3.8.31.orig/readmx.cpp0000644000175000017500000000630111352261673015365 0ustar kratzcharles#include "muscle.h" #include "textfile.h" #define TRACE 0 const int MAX_LINE = 4096; const int MAX_HEADINGS = 32; static char Heading[MAX_HEADINGS]; static unsigned HeadingCount = 0; static float Mx[32][32]; static void LogMx() { Log("Matrix\n"); Log(" "); for (int i = 0; i < 20; ++i) Log(" %c", LetterToChar(i)); Log("\n"); for (int i = 0; i < 20; ++i) { Log("%c ", LetterToChar(i)); for (int j = 0; j < 20; ++j) Log("%5.1f", Mx[i][j]); Log("\n"); } Log("\n"); } static unsigned MxCharToLetter(char c) { for (unsigned Letter = 0; Letter < HeadingCount; ++Letter) if (Heading[Letter] == c) return Letter; Quit("Letter '%c' has no heading", c); return 0; } PTR_SCOREMATRIX ReadMx(TextFile &File) { // Find column headers char Line[MAX_LINE]; for (;;) { bool EndOfFile = File.GetLine(Line, sizeof(Line)); if (EndOfFile) Quit("Premature EOF in matrix file"); if 
(Line[0] == '#') continue; else if (Line[0] == ' ') break; else Quit("Invalid line in matrix file: '%s'", Line); } // Read column headers HeadingCount = 0; for (char *p = Line; *p; ++p) { char c = *p; if (!isspace(c)) Heading[HeadingCount++] = c; } if (HeadingCount > 0 && Heading[HeadingCount-1] == '*') --HeadingCount; if (HeadingCount < 20) Quit("Error in matrix file: < 20 headers, line='%s'", Line); #if TRACE { Log("ReadMx\n"); Log("%d headings: ", HeadingCount); for (unsigned i = 0; i < HeadingCount; ++i) Log("%c", Heading[i]); Log("\n"); } #endif // Zero out matrix for (int i = 0; i < MAX_ALPHA; ++i) for (int j = 0; j < MAX_ALPHA; ++j) Mx[i][j] = 0.0; // Read data lines for (unsigned RowIndex = 0; RowIndex < HeadingCount; ++RowIndex) { bool EndOfFile = File.GetTrimLine(Line, sizeof(Line)); if (EndOfFile) Quit("Premature EOF in matrix file"); #if TRACE Log("Line=%s\n", Line); #endif if (Line[0] == '#') continue; char c = Line[0]; #if TRACE Log("Row char=%c\n", c); #endif if (!IsResidueChar(c)) continue; unsigned RowLetter = CharToLetter(c); if (RowLetter >= 20) continue; #if TRACE Log("Row letter = %u\n", RowLetter); #endif char *p = Line + 1; char *maxp = p + strlen(Line); for (unsigned Col = 0; Col < HeadingCount - 1; ++Col) { if (p >= maxp) Quit("Too few fields in line of matrix file: '%s'", Line); while (isspace(*p)) ++p; char *Value = p; while (!isspace(*p)) ++p; float v = (float) atof(Value); char HeaderChar = Heading[Col]; if (IsResidueChar(HeaderChar)) { unsigned ColLetter = CharToLetter(HeaderChar); if (ColLetter >= 20) continue; Mx[RowLetter][ColLetter] = v; } p += 1; } } // Sanity check for symmetry for (int i = 0; i < 20; ++i) for (int j = 0; j < i; ++j) { if (Mx[i][j] != Mx[j][i]) { Warning("Matrix is not symmetrical, %c->%c=%g, %c->%c=%g", CharToLetter(i), CharToLetter(j), Mx[i][j], CharToLetter(j), CharToLetter(i), Mx[j][i]); goto ExitLoop; } } ExitLoop:; if (g_bVerbose) LogMx(); return &Mx; } 
muscle-3.8.31.orig/mpam200.cpp0000644000175000017500000001757411352261626015275 0ustar kratzcharles#include "muscle.h" const float PAM_200_CENTER = (float) 20.0; #define v(x) ((float) x + PAM_200_CENTER) #define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), \ v(M), v(N), v(P), v(Q), v(R), v(S), v(T), v(V), v(W), v(Y) }, float PAM200[32][32] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 388, -0, 34, 32, -202, 159, -88, 89, -55, -67, 19, 86, 186, -34, -32, 237, 273, 171, -326, -239) // A ROW( -0, 1170, -248, -315, 74, -14, 43, -151, -204, -196, -132, -49, -142, -215, 29, 165, -7, -69, 179, 313) // C ROW( 34, -248, 625, 496, -419, 148, 78, -245, 55, -361, -255, 332, -169, 122, -64, 45, -13, -167, -438, -148) // D ROW( 32, -315, 496, 610, -480, 125, 25, -245, 175, -327, -242, 166, -141, 279, 34, -30, -56, -150, -386, -305) // E ROW( -202, 74, -419, -480, 888, -407, 62, 80, -443, 320, 67, -236, -180, -294, -327, -51, -173, 31, -1, 584) // F ROW( 159, -14, 148, 125, -407, 662, -114, -216, -34, -324, -246, 79, -77, -68, 97, 155, 21, -93, -58, -349) // G ROW( -88, 43, 78, 25, 62, -114, 766, -205, 144, -92, -152, 238, 66, 368, 257, 35, -35, -217, -201, 468) // H ROW( 89, -151, -245, -245, 80, -216, -205, 554, -224, 288, 391, -114, -115, -222, -208, -19, 162, 469, -274, -153) // I ROW( -55, -204, 55, 175, -443, -34, 144, -224, 632, -249, -118, 186, -86, 315, 466, 2, 19, -227, -216, -264) // K ROW( -67, -196, -361, -327, 320, -324, -92, 288, -249, 591, 369, -223, 53, -86, -170, -69, -41, 239, -66, -29) // L ROW( 19, -132, -255, -242, 67, -246, -152, 391, -118, 369, 756, -131, -98, -124, -129, -49, 129, 331, -229, -182) // M ROW( 86, -49, 332, 166, -236, 79, 238, -114, 186, -223, -131, 516, -21, 88, 73, 240, 168, -118, -379, -8) // N ROW( 186, -142, -169, -141, -180, -77, 66, -115, -86, 53, -98, -21, 736, 122, 5, 221, 139, -75, -373, -226) // P ROW( -34, -215, 122, 279, -294, -68, 368, 
-222, 315, -86, -124, 88, 122, 635, 301, -13, -35, -195, -243, -73) // Q ROW( -32, 29, -64, 34, -327, 97, 257, -208, 466, -170, -129, 73, 5, 301, 606, 28, -4, -201, 104, -133) // R ROW( 237, 165, 45, -30, -51, 155, 35, -19, 2, -69, -49, 240, 221, -13, 28, 353, 259, 8, -213, -55) // S ROW( 273, -7, -13, -56, -173, 21, -35, 162, 19, -41, 129, 168, 139, -35, -4, 259, 422, 143, -343, -190) // T ROW( 171, -69, -167, -150, 31, -93, -217, 469, -227, 239, 331, -118, -75, -195, -201, 8, 143, 505, -245, -197) // V ROW( -326, 179, -438, -386, -1, -58, -201, -274, -216, -66, -229, -379, -373, -243, 104, -213, -343, -245, 1475, 63) // W ROW( -239, 313, -148, -305, 584, -349, 468, -153, -264, -29, -182, -8, -226, -73, -133, -55, -190, -197, 63, 979) // Y }; #undef v #define v(x) ((float) x) #define RNC(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), \ v(M), v(N), v(P), v(Q), v(R), v(S), v(T), v(V), v(W), v(Y) }, float PAM200NoCenter[32][32] = { // A C D E F G H I K L // M N P Q R S T V W Y RNC( 388, -0, 34, 32, -202, 159, -88, 89, -55, -67, 19, 86, 186, -34, -32, 237, 273, 171, -326, -239) // A RNC( -0, 1170, -248, -315, 74, -14, 43, -151, -204, -196, -132, -49, -142, -215, 29, 165, -7, -69, 179, 313) // C RNC( 34, -248, 625, 496, -419, 148, 78, -245, 55, -361, -255, 332, -169, 122, -64, 45, -13, -167, -438, -148) // D RNC( 32, -315, 496, 610, -480, 125, 25, -245, 175, -327, -242, 166, -141, 279, 34, -30, -56, -150, -386, -305) // E RNC( -202, 74, -419, -480, 888, -407, 62, 80, -443, 320, 67, -236, -180, -294, -327, -51, -173, 31, -1, 584) // F RNC( 159, -14, 148, 125, -407, 662, -114, -216, -34, -324, -246, 79, -77, -68, 97, 155, 21, -93, -58, -349) // G RNC( -88, 43, 78, 25, 62, -114, 766, -205, 144, -92, -152, 238, 66, 368, 257, 35, -35, -217, -201, 468) // H RNC( 89, -151, -245, -245, 80, -216, -205, 554, -224, 288, 391, -114, -115, -222, -208, -19, 162, 469, -274, -153) // I RNC( -55, -204, 55, 175, 
-443, -34, 144, -224, 632, -249, -118, 186, -86, 315, 466, 2, 19, -227, -216, -264) // K RNC( -67, -196, -361, -327, 320, -324, -92, 288, -249, 591, 369, -223, 53, -86, -170, -69, -41, 239, -66, -29) // L RNC( 19, -132, -255, -242, 67, -246, -152, 391, -118, 369, 756, -131, -98, -124, -129, -49, 129, 331, -229, -182) // M RNC( 86, -49, 332, 166, -236, 79, 238, -114, 186, -223, -131, 516, -21, 88, 73, 240, 168, -118, -379, -8) // N RNC( 186, -142, -169, -141, -180, -77, 66, -115, -86, 53, -98, -21, 736, 122, 5, 221, 139, -75, -373, -226) // P RNC( -34, -215, 122, 279, -294, -68, 368, -222, 315, -86, -124, 88, 122, 635, 301, -13, -35, -195, -243, -73) // Q RNC( -32, 29, -64, 34, -327, 97, 257, -208, 466, -170, -129, 73, 5, 301, 606, 28, -4, -201, 104, -133) // R RNC( 237, 165, 45, -30, -51, 155, 35, -19, 2, -69, -49, 240, 221, -13, 28, 353, 259, 8, -213, -55) // S RNC( 273, -7, -13, -56, -173, 21, -35, 162, 19, -41, 129, 168, 139, -35, -4, 259, 422, 143, -343, -190) // T RNC( 171, -69, -167, -150, 31, -93, -217, 469, -227, 239, 331, -118, -75, -195, -201, 8, 143, 505, -245, -197) // V RNC( -326, 179, -438, -386, -1, -58, -201, -274, -216, -66, -229, -379, -373, -243, 104, -213, -343, -245, 1475, 63) // W RNC( -239, 313, -148, -305, 584, -349, 468, -153, -264, -29, -182, -8, -226, -73, -133, -55, -190, -197, 63, 979) // Y }; muscle-3.8.31.orig/params.cpp0000644000175000017500000004132211352261676015375 0ustar kratzcharles#include "muscle.h" #include "objscore.h" #include "profile.h" #include "enumopts.h" const double DEFAULT_MAX_MB_FRACT = 0.8; SCORE g_scoreCenter = 0; SCORE g_scoreGapExtend = 0; SCORE g_scoreGapOpen2 = MINUS_INFINITY; SCORE g_scoreGapExtend2 = MINUS_INFINITY; SCORE g_scoreGapAmbig = 0; SCORE g_scoreAmbigFactor = 0; extern SCOREMATRIX VTML_LA; extern SCOREMATRIX PAM200; extern SCOREMATRIX PAM200NoCenter; extern SCOREMATRIX VTML_SP; extern SCOREMATRIX VTML_SPNoCenter; extern SCOREMATRIX NUC_SP; PTR_SCOREMATRIX g_ptrScoreMatrix; const char 
*g_pstrInFileName = "-"; const char *g_pstrOutFileName = "-"; const char *g_pstrFASTAOutFileName = 0; const char *g_pstrMSFOutFileName = 0; const char *g_pstrClwOutFileName = 0; const char *g_pstrClwStrictOutFileName = 0; const char *g_pstrHTMLOutFileName = 0; const char *g_pstrPHYIOutFileName = 0; const char *g_pstrPHYSOutFileName = 0; const char *g_pstrDistMxFileName1 = 0; const char *g_pstrDistMxFileName2 = 0; const char *g_pstrFileName1 = 0; const char *g_pstrFileName2 = 0; const char *g_pstrSPFileName = 0; const char *g_pstrMatrixFileName = 0; const char *g_pstrUseTreeFileName = 0; bool g_bUseTreeNoWarn = false; const char *g_pstrComputeWeightsFileName; const char *g_pstrScoreFileName; const char *g_pstrProf1FileName = 0; const char *g_pstrProf2FileName = 0; unsigned g_uSmoothWindowLength = 7; unsigned g_uAnchorSpacing = 32; unsigned g_uMaxTreeRefineIters = 1; unsigned g_uRefineWindow = 200; unsigned g_uWindowFrom = 0; unsigned g_uWindowTo = 0; unsigned g_uSaveWindow = uInsane; unsigned g_uWindowOffset = 0; unsigned g_uMaxSubFamCount = 5; unsigned g_uHydrophobicRunLength = 5; float g_dHydroFactor = (float) 1.2; unsigned g_uMinDiagLength = 24; // TODO alpha -- should depend on alphabet? 
unsigned g_uMaxDiagBreak = 1; unsigned g_uDiagMargin = 5; float g_dSUEFF = (float) 0.1; bool g_bPrecompiledCenter = true; bool g_bNormalizeCounts = false; bool g_bDiags1 = false; bool g_bDiags2 = false; bool g_bAnchors = true; bool g_bQuiet = false; bool g_bVerbose = false; bool g_bRefine = false; bool g_bRefineW = false; bool g_bProfDB = false; bool g_bLow = false; bool g_bSW = false; bool g_bClusterOnly = false; bool g_bProfile = false; bool g_bPPScore = false; bool g_bBrenner = false; bool g_bDimer = false; bool g_bVersion = false; bool g_bStable = false; bool g_bFASTA = false; bool g_bPAS = false; bool g_bTomHydro = false; bool g_bMakeTree = false; #if DEBUG bool g_bCatchExceptions = false; #else bool g_bCatchExceptions = true; #endif bool g_bMSF = false; bool g_bAln = false; bool g_bClwStrict = false; bool g_bHTML = false; bool g_bPHYI = false; bool g_bPHYS = false; unsigned g_uMaxIters = 8; unsigned long g_ulMaxSecs = 0; unsigned g_uMaxMB = 500; PPSCORE g_PPScore = PPSCORE_LE; OBJSCORE g_ObjScore = OBJSCORE_SPM; SEQWEIGHT g_SeqWeight1 = SEQWEIGHT_ClustalW; SEQWEIGHT g_SeqWeight2 = SEQWEIGHT_ClustalW; DISTANCE g_Distance1 = DISTANCE_Kmer6_6; DISTANCE g_Distance2 = DISTANCE_PctIdKimura; CLUSTER g_Cluster1 = CLUSTER_UPGMB; CLUSTER g_Cluster2 = CLUSTER_UPGMB; ROOT g_Root1 = ROOT_Pseudo; ROOT g_Root2 = ROOT_Pseudo; bool g_bDiags; SEQTYPE g_SeqType = SEQTYPE_Auto; TERMGAPS g_TermGaps = TERMGAPS_Half; //------------------------------------------------------ // These parameters depending on the chosen prof-prof // score (g_PPScore), initialized to "Undefined". 
float g_dSmoothScoreCeil = fInsane; float g_dMinBestColScore = fInsane; float g_dMinSmoothScore = fInsane; SCORE g_scoreGapOpen = fInsane; //------------------------------------------------------ static unsigned atou(const char *s) { return (unsigned) atoi(s); } const char *MaxSecsToStr() { if (0 == g_ulMaxSecs) return "(No limit)"; return SecsToStr(g_ulMaxSecs); } void ListParams() { Log("\n"); Log("%s\n", MUSCLE_LONG_VERSION); Log("http://www.drive5.com/muscle\n"); Log("\n"); Log("Profile-profile score %s\n", PPSCOREToStr(g_PPScore)); Log("Max iterations %u\n", g_uMaxIters); Log("Max trees %u\n", g_uMaxTreeRefineIters); Log("Max time %s\n", MaxSecsToStr()); Log("Max MB %u\n", g_uMaxMB); Log("Gap open %g\n", g_scoreGapOpen); Log("Gap extend (dimer) %g\n", g_scoreGapExtend); Log("Gap ambig factor %g\n", g_scoreAmbigFactor); Log("Gap ambig penalty %g\n", g_scoreGapAmbig); Log("Center (LE) %g\n", g_scoreCenter); Log("Term gaps %s\n", TERMGAPSToStr(g_TermGaps)); Log("Smooth window length %u\n", g_uSmoothWindowLength); Log("Refine window length %u\n", g_uRefineWindow); Log("Min anchor spacing %u\n", g_uAnchorSpacing); Log("Min diag length (lambda) %u\n", g_uMinDiagLength); Log("Diag margin (mu) %u\n", g_uDiagMargin); Log("Min diag break %u\n", g_uMaxDiagBreak); Log("Hydrophobic window %u\n", g_uHydrophobicRunLength); Log("Hydrophobic gap factor %g\n", g_dHydroFactor); Log("Smooth score ceiling %g\n", g_dSmoothScoreCeil); Log("Min best col score %g\n", g_dMinBestColScore); Log("Min anchor score %g\n", g_dMinSmoothScore); Log("SUEFF %g\n", g_dSUEFF); Log("Brenner root MSA %s\n", BoolToStr(g_bBrenner)); Log("Normalize counts %s\n", BoolToStr(g_bNormalizeCounts)); Log("Diagonals (1) %s\n", BoolToStr(g_bDiags1)); Log("Diagonals (2) %s\n", BoolToStr(g_bDiags2)); Log("Anchors %s\n", BoolToStr(g_bAnchors)); Log("MSF output format %s\n", BoolToStr(g_bMSF)); Log("Phylip interleaved %s\n", BoolToStr(g_bPHYI)); Log("Phylip sequential %s\n", BoolToStr(g_bPHYS)); Log("ClustalW 
output format %s\n", BoolToStr(g_bAln)); Log("Catch exceptions %s\n", BoolToStr(g_bCatchExceptions)); Log("Quiet %s\n", BoolToStr(g_bQuiet)); Log("Refine %s\n", BoolToStr(g_bRefine)); Log("ProdfDB %s\n", BoolToStr(g_bProfDB)); Log("Low complexity profiles %s\n", BoolToStr(g_bLow)); Log("Objective score %s\n", OBJSCOREToStr(g_ObjScore)); Log("Distance method (1) %s\n", DISTANCEToStr(g_Distance1)); Log("Clustering method (1) %s\n", CLUSTERToStr(g_Cluster1)); Log("Root method (1) %s\n", ROOTToStr(g_Root1)); Log("Sequence weighting (1) %s\n", SEQWEIGHTToStr(g_SeqWeight1)); Log("Distance method (2) %s\n", DISTANCEToStr(g_Distance2)); Log("Clustering method (2) %s\n", CLUSTERToStr(g_Cluster2)); Log("Root method (2) %s\n", ROOTToStr(g_Root2)); Log("Sequence weighting (2) %s\n", SEQWEIGHTToStr(g_SeqWeight2)); Log("\n"); } static void SetDefaultsLE() { g_ptrScoreMatrix = &VTML_LA; //g_scoreGapOpen = (SCORE) -3.00; //g_scoreCenter = (SCORE) -0.55; g_scoreGapOpen = (SCORE) -2.9; g_scoreCenter = (SCORE) -0.52; g_bNormalizeCounts = true; //g_dSmoothScoreCeil = 5.0; //g_dMinBestColScore = 4.0; //g_dMinSmoothScore = 2.0; g_dSmoothScoreCeil = 3.0; g_dMinBestColScore = 2.0; g_dMinSmoothScore = 1.0; g_Distance1 = DISTANCE_Kmer6_6; g_Distance2 = DISTANCE_PctIdKimura; } static void SetDefaultsSP() { g_ptrScoreMatrix = &PAM200; g_scoreGapOpen = -1439; g_scoreCenter = 0.0; // center pre-added into score mx g_bNormalizeCounts = false; g_dSmoothScoreCeil = 200.0; g_dMinBestColScore = 300.0; g_dMinSmoothScore = 125.0; g_Distance1 = DISTANCE_Kmer6_6; g_Distance2 = DISTANCE_PctIdKimura; } static void SetDefaultsSV() { g_ptrScoreMatrix = &VTML_SP; g_scoreGapOpen = -300; g_scoreCenter = 0.0; // center pre-added into score mx g_bNormalizeCounts = false; g_dSmoothScoreCeil = 90.0; g_dMinBestColScore = 130.0; g_dMinSmoothScore = 40.0; g_Distance1 = DISTANCE_Kmer6_6; g_Distance2 = DISTANCE_PctIdKimura; } //static void SetDefaultsSPN() // { // g_ptrScoreMatrix = &NUC_SP; // // g_scoreGapOpen = 
-400;
//	g_scoreCenter = 0.0;	// center pre-added into score mx
//
//	g_bNormalizeCounts = false;
//
//	g_dSmoothScoreCeil = 999.0;		// disable
//	g_dMinBestColScore = 90;
//	g_dMinSmoothScore = 90;
//
//	g_Distance1 = DISTANCE_Kmer4_6;
//	g_Distance2 = DISTANCE_PctIdKimura;
//	}

// Defaults chosen by SetPPDefaultParams for PPSCORE_SPN when g_Alpha is
// ALPHA_DNA: NUC_SP matrix, raw counts, 4-mer first-pass distance.
static void SetDefaultsSPN_DNA()
	{
	g_ptrScoreMatrix = &NUC_SP;

	g_scoreGapOpen = -400;
	g_scoreCenter = 0.0;	// center pre-added into score mx
	g_scoreGapExtend = 0.0;

	g_bNormalizeCounts = false;

	g_dSmoothScoreCeil = 999.0;		// disable
	g_dMinBestColScore = 90;
	g_dMinSmoothScore = 90;

	g_Distance1 = DISTANCE_Kmer4_6;
	g_Distance2 = DISTANCE_PctIdKimura;
	}

// Defaults chosen by SetPPDefaultParams for PPSCORE_SPN when g_Alpha is
// ALPHA_RNA; differs from the DNA set in gap-open and center values.
static void SetDefaultsSPN_RNA()
	{
	g_ptrScoreMatrix = &NUC_SP;

	g_scoreGapOpen = -420;
	g_scoreCenter = -300;	// total center = NUC_EXTEND - 300
	g_scoreGapExtend = 0.0;

	g_bNormalizeCounts = false;

	g_dSmoothScoreCeil = 999.0;		// disable
	g_dMinBestColScore = 90;
	g_dMinSmoothScore = 90;

	g_Distance1 = DISTANCE_Kmer4_6;
	g_Distance2 = DISTANCE_PctIdKimura;
	}

// If FlagOpt(OptName) reports the flag as set, stores bValueIfFlagSet in
// *ptrParam; otherwise *ptrParam is left unchanged.
static void FlagParam(const char *OptName, bool *ptrParam, bool bValueIfFlagSet)
	{
	bool bIsSet = FlagOpt(OptName);
	if (bIsSet)
		*ptrParam = bValueIfFlagSet;
	}

// If ValueOpt(OptName) returns a non-null string, stores that pointer in
// *ptrptrParam (no copy is made); otherwise leaves it unchanged.
static void StrParam(const char *OptName, const char **ptrptrParam)
	{
	const char *opt = ValueOpt(OptName);
	if (0 != opt)
		*ptrptrParam = opt;
	}

// As StrParam, but converts the option text with atof() and stores a float.
static void FloatParam(const char *OptName, float *ptrParam)
	{
	const char *opt = ValueOpt(OptName);
	if (0 != opt)
		*ptrParam = (float) atof(opt);
	}

// As StrParam, but converts the option text with atou() and stores an unsigned.
static void UintParam(const char *OptName, unsigned *ptrParam)
	{
	const char *opt = ValueOpt(OptName);
	if (0 != opt)
		*ptrParam = atou(opt);
	}

// Looks the option's text up in the table Opts and stores the matching
// iValue in *Param. The table is walked until an entry with a null pstrOpt,
// at which point Quit() is called; the comparison uses stricmp().
// Does nothing when the option was not given.
static void EnumParam(const char *OptName, EnumOpt *Opts, int *Param)
	{
	const char *Value = ValueOpt(OptName);
	if (0 == Value)
		return;

	for (;;)
		{
		// Null pstrOpt marks the end of the table: no entry matched.
		if (0 == Opts->pstrOpt)
			Quit("Invalid parameter -%s %s", OptName, Value);
		if (0 == stricmp(Value, Opts->pstrOpt))
			{
			*Param = Opts->iValue;
			return;
			}
		++Opts;
		}
	}

// Applies the default parameter set that matches g_PPScore (and, for
// PPSCORE_SPN, g_Alpha). Quit() is called for unrecognised values.
static void SetPPDefaultParams()
	{
	switch (g_PPScore)
		{
	case PPSCORE_SP:
		SetDefaultsSP();
		break;

	case PPSCORE_LE:
		SetDefaultsLE();
		break;

	case PPSCORE_SV:
		SetDefaultsSV();
		break;

	case PPSCORE_SPN:
		switch (g_Alpha)
			{
		case ALPHA_DNA:
			SetDefaultsSPN_DNA();
			break;
		case ALPHA_RNA:
			SetDefaultsSPN_RNA();
			break;
		default:
			Quit("Invalid alpha %d", g_Alpha);
			}
		break;

	default:
		Quit("Invalid g_PPScore");
		}
	}

// Overrides the score-dependent defaults with any values given as options.
// "Distance" sets both passes; "Distance1"/"Distance2" are applied afterwards
// and so take precedence for their own pass.
static void SetPPCommandLineParams()
	{
	FloatParam("GapOpen", &g_scoreGapOpen);
	FloatParam("GapOpen2", &g_scoreGapOpen2);
	FloatParam("GapExtend", &g_scoreGapExtend);
	FloatParam("GapExtend2", &g_scoreGapExtend2);
	FloatParam("GapAmbig", &g_scoreAmbigFactor);
	FloatParam("Center", &g_scoreCenter);
	FloatParam("SmoothScoreCeil", &g_dSmoothScoreCeil);
	FloatParam("MinBestColScore", &g_dMinBestColScore);
	FloatParam("MinSmoothScore", &g_dMinSmoothScore);

	EnumParam("Distance", DISTANCE_Opts, (int *) &g_Distance1);
	EnumParam("Distance", DISTANCE_Opts, (int *) &g_Distance2);

	EnumParam("Distance1", DISTANCE_Opts, (int *) &g_Distance1);
	EnumParam("Distance2", DISTANCE_Opts, (int *) &g_Distance2);
	}

// Selects the profile-profile score and loads its parameters.
// When bRespectFlagOpts is true the SP/LE/SV/SPN flags (first one set wins,
// in that order) replace g_PPScore. The choice is then reconciled with
// g_Alpha: nucleotide alphabets force PPSCORE_SPN, and PPSCORE_SPN with the
// amino alphabet falls back to PPSCORE_LE. Finally defaults are applied,
// then option overrides, and the result is logged if g_bVerbose.
void SetPPScore(bool bRespectFlagOpts)
	{
	if (bRespectFlagOpts)
		{
		if (FlagOpt("SP"))
			g_PPScore = PPSCORE_SP;
		else if (FlagOpt("LE"))
			g_PPScore = PPSCORE_LE;
		else if (FlagOpt("SV"))
			g_PPScore = PPSCORE_SV;
		else if (FlagOpt("SPN"))
			g_PPScore = PPSCORE_SPN;
		}

	switch (g_PPScore)
		{
	case PPSCORE_LE:
	case PPSCORE_SP:
	case PPSCORE_SV:
		if (ALPHA_RNA == g_Alpha || ALPHA_DNA == g_Alpha)
			g_PPScore = PPSCORE_SPN;
		break;
	case PPSCORE_SPN:
		if (ALPHA_Amino == g_Alpha)
			g_PPScore = PPSCORE_LE;
		break;
		}

	SetPPDefaultParams();
	SetPPCommandLineParams();

	if (g_bVerbose)
		ListParams();
	}

// Sets g_PPScore to p and re-runs the selection above.
// NOTE(review): the flag options are still respected, so a SP/LE/SV/SPN
// flag overrides p -- confirm that this is intended.
void SetPPScore(PPSCORE p)
	{
	g_PPScore = p;
	SetPPScore(true);
	}

// Converts the "MaxHours" option, if given and non-zero, to seconds in
// g_ulMaxSecs; otherwise g_ulMaxSecs is left as it was.
static void SetMaxSecs()
	{
	float fMaxHours = 0.0;
	FloatParam("MaxHours", &fMaxHours);
	if (0.0 == fMaxHours)
		return;
	g_ulMaxSecs = (unsigned long) (fMaxHours*60*60);
	}

// True when pass-1 weighting is ClustalW and either only one iteration is
// requested or pass-2 weighting is ClustalW as well.
static bool CanDoLowComplexity()
	{
	if (g_SeqWeight1 != SEQWEIGHT_ClustalW)
		return false;
	if (1 == g_uMaxIters)
		return true;
	return g_SeqWeight2 == SEQWEIGHT_ClustalW;
	}

// True when no input was named: the input file name is still "-" and
// neither g_pstrFileName1 nor g_pstrSPFileName has been set.
bool MissingCommand()
	{
	if (strcmp(g_pstrInFileName, "-"))
		return false;
	if (0 != g_pstrFileName1)
		return false;
	if (0 != g_pstrSPFileName)
		return false;
	return true;
	}

// Copies option values into the option globals. Each helper call leaves its
// target untouched when the option is absent, so the initialisers above act
// as defaults. Where two calls write the same global, the later call wins.
void SetParams()
	{
	SetMaxSecs();

	StrParam("in", &g_pstrInFileName);
	StrParam("out", &g_pstrOutFileName);

	StrParam("FASTAOut", &g_pstrFASTAOutFileName);
	StrParam("ClwOut", &g_pstrClwOutFileName);
	StrParam("ClwStrictOut", &g_pstrClwStrictOutFileName);
	StrParam("HTMLOut", &g_pstrHTMLOutFileName);
	StrParam("PHYIOut", &g_pstrPHYIOutFileName);
	StrParam("PHYSOut", &g_pstrPHYSOutFileName);
	StrParam("MSFOut", &g_pstrMSFOutFileName);

	StrParam("in1", &g_pstrFileName1);
	StrParam("in2", &g_pstrFileName2);

	StrParam("Matrix", &g_pstrMatrixFileName);
	StrParam("SPScore", &g_pstrSPFileName);

	// "UseTree_NoWarn" is read first so that its presence can set the
	// no-warn flag; a plain "UseTree" given as well replaces the file name.
	StrParam("UseTree_NoWarn", &g_pstrUseTreeFileName);
	if (0 != g_pstrUseTreeFileName)
		g_bUseTreeNoWarn = true;

	StrParam("UseTree", &g_pstrUseTreeFileName);
	StrParam("ComputeWeights", &g_pstrComputeWeightsFileName);
	StrParam("ScoreFile", &g_pstrScoreFileName);
	StrParam("DistMx1", &g_pstrDistMxFileName1);
	StrParam("DistMx2", &g_pstrDistMxFileName2);

	FlagParam("Core", &g_bCatchExceptions, false);
	FlagParam("NoCore", &g_bCatchExceptions, true);

	FlagParam("Diags1", &g_bDiags1, true);
	FlagParam("Diags2", &g_bDiags2, true);

	// "Diags" is shorthand for both Diags1 and Diags2.
	bool Diags = false;
	FlagParam("Diags", &Diags, true);
	if (Diags)
		{
		g_bDiags1 = true;
		g_bDiags2 = true;
		}

	FlagParam("Anchors", &g_bAnchors, true);
	FlagParam("NoAnchors", &g_bAnchors, false);

	FlagParam("Quiet", &g_bQuiet, true);
	FlagParam("Verbose", &g_bVerbose, true);
	FlagParam("Version", &g_bVersion, true);
	FlagParam("Stable", &g_bStable, true);
	FlagParam("Group", &g_bStable, false);
	FlagParam("Refine", &g_bRefine, true);
	FlagParam("RefineW", &g_bRefineW, true);
	FlagParam("ProfDB", &g_bProfDB, true);
	FlagParam("SW", &g_bSW, true);
	FlagParam("ClusterOnly", &g_bClusterOnly, true);
	FlagParam("Profile", &g_bProfile, true);
	FlagParam("PPScore", &g_bPPScore, true);
	FlagParam("Brenner", &g_bBrenner, true);
	FlagParam("Dimer", &g_bDimer, true);

	FlagParam("MSF", &g_bMSF, true);
	FlagParam("PHYI", &g_bPHYI, true);
	FlagParam("PHYS", &g_bPHYS, true);
	FlagParam("clw", &g_bAln, true);
	FlagParam("HTML", &g_bHTML, true);
	FlagParam("FASTA", &g_bFASTA, true);
	FlagParam("PAS", &g_bPAS, true);
	FlagParam("MakeTree", &g_bMakeTree, true);

	if (g_bStable)
		Quit("-stable not supported in this version of muscle");

	// "clwstrict" implies ClustalW output plus the strict variant.
	bool b = false;
	FlagParam("clwstrict", &b, true);
	if (b)
		{
		g_bAln = true;
		g_bClwStrict = true;
		}

	UintParam("MaxIters", &g_uMaxIters);
	UintParam("MaxTrees", &g_uMaxTreeRefineIters);
	UintParam("SmoothWindow", &g_uSmoothWindowLength);
	UintParam("RefineWindow", &g_uRefineWindow);
	UintParam("FromWindow", &g_uWindowFrom);
	UintParam("ToWindow", &g_uWindowTo);
	UintParam("SaveWindow", &g_uSaveWindow);
	UintParam("WindowOffset", &g_uWindowOffset);
	UintParam("AnchorSpacing", &g_uAnchorSpacing);
	UintParam("DiagLength", &g_uMinDiagLength);
	UintParam("DiagMargin", &g_uDiagMargin);
	UintParam("DiagBreak", &g_uMaxDiagBreak);
	UintParam("MaxSubFam", &g_uMaxSubFamCount);

	UintParam("Hydro", &g_uHydrophobicRunLength);
	// TomHydro overrides any "Hydro" value just read.
	FlagParam("TomHydro", &g_bTomHydro, true);
	if (g_bTomHydro)
		g_uHydrophobicRunLength = 0;

	FloatParam("SUEFF", &g_dSUEFF);
	FloatParam("HydroFactor", &g_dHydroFactor);

	EnumParam("ObjScore", OBJSCORE_Opts, (int *) &g_ObjScore);
	EnumParam("TermGaps", TERMGAPS_Opts, (int *) &g_TermGaps);

	// Unsuffixed option sets both passes; the "1"/"2" forms follow and win.
	EnumParam("Weight", SEQWEIGHT_Opts, (int *) &g_SeqWeight1);
	EnumParam("Weight", SEQWEIGHT_Opts, (int *) &g_SeqWeight2);

	EnumParam("Weight1", SEQWEIGHT_Opts, (int *) &g_SeqWeight1);
	EnumParam("Weight2", SEQWEIGHT_Opts, (int *) &g_SeqWeight2);

	EnumParam("Cluster", CLUSTER_Opts, (int *) &g_Cluster1);
	EnumParam("Cluster", CLUSTER_Opts, (int *) &g_Cluster2);

	EnumParam("Cluster1", CLUSTER_Opts, (int *) &g_Cluster1);
	EnumParam("Cluster2", CLUSTER_Opts, (int *) &g_Cluster2);

	EnumParam("Root1", ROOT_Opts, (int *) &g_Root1);
	EnumParam("Root2", ROOT_Opts, (int *) &g_Root2);

	EnumParam("SeqType", SEQTYPE_Opts, (int *) &g_SeqType);

	// NOTE(review): uses whatever g_scoreGapOpen and g_scoreAmbigFactor hold
	// at this point; both are also written by the SetPPScore path -- confirm
	// the intended call order.
	g_scoreGapAmbig = g_scoreGapOpen*g_scoreAmbigFactor;
	g_bLow =
CanDoLowComplexity(); if (g_bDimer) g_bPrecompiledCenter = false; UintParam("MaxMB", &g_uMaxMB); if (0 == ValueOpt("MaxMB")) g_uMaxMB = (unsigned) (GetRAMSizeMB()*DEFAULT_MAX_MB_FRACT); } muscle-3.8.31.orig/progress.cpp0000644000175000017500000000656211367132113015752 0ustar kratzcharles#include "muscle.h" #include #include // Functions that provide visible feedback to the user // that progress is being made. static unsigned g_uIter = 0; // Main MUSCLE iteration 1, 2.. static unsigned g_uLocalMaxIters = 0; // Max iters static FILE *g_fProgress = stderr; // Default to standard error static char g_strFileName[32]; // File name static time_t g_tLocalStart; // Start time static char g_strDesc[32]; // Description static bool g_bWipeDesc = false; static int g_nPrevDescLength; static unsigned g_uTotalSteps; const char *ElapsedTimeAsStr() { time_t Now = time(0); unsigned long ElapsedSecs = (unsigned long) (Now - g_tLocalStart); return SecsToStr(ElapsedSecs); } const char *MemToStr(double MB) { if (MB < 0) return ""; static char Str[16]; static double MaxMB = 0; static double RAMMB = 0; if (RAMMB == 0) RAMMB = GetRAMSizeMB(); if (MB > MaxMB) MaxMB = MB; double Pct = (MaxMB*100.0)/RAMMB; if (Pct > 100) Pct = 100; sprintf(Str, "%.0f MB(%.0f%%)", MaxMB, Pct); return Str; } void SetInputFileName(const char *pstrFileName) { NameFromPath(pstrFileName, g_strFileName, sizeof(g_strFileName)); } void SetSeqStats(unsigned uSeqCount, unsigned uMaxL, unsigned uAvgL) { if (g_bQuiet) return; fprintf(g_fProgress, "%s %u seqs, max length %u, avg length %u\n", g_strFileName, uSeqCount, uMaxL, uAvgL); if (g_bVerbose) Log("%u seqs, max length %u, avg length %u\n", uSeqCount, uMaxL, uAvgL); } void SetStartTime() { time(&g_tLocalStart); } unsigned long GetStartTime() { return (unsigned long) g_tLocalStart; } void SetIter(unsigned uIter) { g_uIter = uIter; } void IncIter() { ++g_uIter; } void SetMaxIters(unsigned uMaxIters) { g_uLocalMaxIters = uMaxIters; } void SetProgressDesc(const char 
szDesc[]) { strncpy(g_strDesc, szDesc, sizeof(g_strDesc)); g_strDesc[sizeof(g_strDesc) - 1] = 0; } static void Wipe(int n) { for (int i = 0; i < n; ++i) fprintf(g_fProgress, " "); } void Progress(const char *szFormat, ...) { CheckMaxTime(); if (g_bQuiet) return; double MB = GetMemUseMB(); char szStr[4096]; va_list ArgList; va_start(ArgList, szFormat); vsprintf(szStr, szFormat, ArgList); fprintf(g_fProgress, "%8.8s %12s %s", ElapsedTimeAsStr(), MemToStr(MB), szStr); fprintf(g_fProgress, "\n"); fflush(g_fProgress); } void Progress(unsigned uStep, unsigned uTotalSteps) { CheckMaxTime(); if (g_bQuiet) return; double dPct = ((uStep + 1)*100.0)/uTotalSteps; double MB = GetMemUseMB(); fprintf(g_fProgress, "%8.8s %12s Iter %3u %6.2f%% %s", ElapsedTimeAsStr(), MemToStr(MB), g_uIter, dPct, g_strDesc); if (g_bWipeDesc) { int n = g_nPrevDescLength - (int) strlen(g_strDesc); Wipe(n); g_bWipeDesc = false; } fprintf(g_fProgress, "\r"); g_uTotalSteps = uTotalSteps; } void ProgressStepsDone() { CheckMaxTime(); if (g_bVerbose) { double MB = GetMemUseMB(); Log("Elapsed time %8.8s Peak memory use %12s Iteration %3u %s\n", ElapsedTimeAsStr(), MemToStr(MB), g_uIter, g_strDesc); } if (g_bQuiet) return; Progress(g_uTotalSteps - 1, g_uTotalSteps); fprintf(g_fProgress, "\n"); g_bWipeDesc = true; g_nPrevDescLength = (int) strlen(g_strDesc); } muscle-3.8.31.orig/tree.h0000644000175000017500000002253111352261612014505 0ustar kratzcharles#ifndef tree_h #define tree_h #include class Clust; const unsigned NULL_NEIGHBOR = UINT_MAX; enum NEWICK_TOKEN_TYPE { NTT_Unknown, // Returned from Tree::GetToken: NTT_Lparen, NTT_Rparen, NTT_Colon, NTT_Comma, NTT_Semicolon, NTT_String, // Following are never returned from Tree::GetToken: NTT_SingleQuotedString, NTT_DoubleQuotedString, NTT_Comment }; class Tree { public: Tree() { m_uNodeCount = 0; m_uCacheCount = 0; m_uNeighbor1 = 0; m_uNeighbor2 = 0; m_uNeighbor3 = 0; m_dEdgeLength1 = 0; m_dEdgeLength2 = 0; m_dEdgeLength3 = 0; m_dHeight = 0; m_bHasEdgeLength1 
= 0; m_bHasEdgeLength2 = 0; m_bHasEdgeLength3 = 0; m_bHasHeight = 0; m_ptrName = 0; m_Ids = 0; } virtual ~Tree() { Clear(); } void Clear() { for (unsigned n = 0; n < m_uNodeCount; ++n) free(m_ptrName[n]); m_uNodeCount = 0; m_uCacheCount = 0; delete[] m_uNeighbor1; delete[] m_uNeighbor2; delete[] m_uNeighbor3; delete[] m_dEdgeLength1; delete[] m_dEdgeLength2; delete[] m_dEdgeLength3; delete[] m_bHasEdgeLength1; delete[] m_bHasEdgeLength2; delete[] m_bHasEdgeLength3; delete[] m_ptrName; delete[] m_Ids; delete[] m_bHasHeight; delete[] m_dHeight; m_uNeighbor1 = 0; m_uNeighbor2 = 0; m_uNeighbor3 = 0; m_dEdgeLength1 = 0; m_dEdgeLength2 = 0; m_dEdgeLength3 = 0; m_ptrName = 0; m_Ids = 0; m_uRootNodeIndex = 0; m_bHasHeight = 0; m_dHeight = 0; m_bRooted = false; } // Creation and manipulation void CreateRooted(); void CreateUnrooted(double dEdgeLength); void FromFile(TextFile &File); void FromClust(Clust &C); void Copy(const Tree &tree); void Create(unsigned uLeafCount, unsigned uRoot, const unsigned Left[], const unsigned Right[], const float LeftLength[], const float RightLength[], const unsigned LeafIds[], char *LeafNames[]); unsigned AppendBranch(unsigned uExistingNodeIndex); void SetLeafName(unsigned uNodeIndex, const char *ptrName); void SetLeafId(unsigned uNodeIndex, unsigned uId); void SetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2, double dLength); void RootUnrootedTree(unsigned uNodeIndex1, unsigned uNodeIndex2); void RootUnrootedTree(ROOT Method); void UnrootByDeletingRoot(); // Saving to file void ToFile(TextFile &File) const; // Accessor functions unsigned GetNodeCount() const { return m_uNodeCount; } unsigned GetLeafCount() const { if (m_bRooted) { assert(m_uNodeCount%2 == 1); return (m_uNodeCount + 1)/2; } else { assert(m_uNodeCount%2 == 0); return (m_uNodeCount + 2)/2; } } unsigned GetNeighbor(unsigned uNodeIndex, unsigned uNeighborSubscript) const; unsigned GetNeighbor1(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); return 
m_uNeighbor1[uNodeIndex];
		}

	unsigned GetNeighbor2(unsigned uNodeIndex) const
		{
		assert(uNodeIndex < m_uNodeCount);
		return m_uNeighbor2[uNodeIndex];
		}

	unsigned GetNeighbor3(unsigned uNodeIndex) const
		{
		assert(uNodeIndex < m_uNodeCount);
		return m_uNeighbor3[uNodeIndex];
		}

	// Rooted trees only: the parent is kept in the neighbor-1 slot.
	unsigned GetParent(unsigned uNodeIndex) const
		{
		assert(m_bRooted && uNodeIndex < m_uNodeCount);
		return m_uNeighbor1[uNodeIndex];
		}

	bool IsRooted() const
		{
		return m_bRooted;
		}

	// Rooted trees only: the left child is kept in the neighbor-2 slot.
	unsigned GetLeft(unsigned uNodeIndex) const
		{
		assert(m_bRooted && uNodeIndex < m_uNodeCount);
		return m_uNeighbor2[uNodeIndex];
		}

	// Rooted trees only: the right child is kept in the neighbor-3 slot.
	unsigned GetRight(unsigned uNodeIndex) const
		{
		assert(m_bRooted && uNodeIndex < m_uNodeCount);
		return m_uNeighbor3[uNodeIndex];
		}

	const char *GetName(unsigned uNodeIndex) const
		{
		assert(uNodeIndex < m_uNodeCount);
		return m_ptrName[uNodeIndex];
		}

	unsigned GetRootNodeIndex() const
		{
		assert(m_bRooted);
		return m_uRootNodeIndex;
		}

	// Counts how many of the three neighbor slots are not NULL_NEIGHBOR.
	// NOTE(review): unlike the accessors above, uNodeIndex is not asserted
	// to be in range here.
	unsigned GetNeighborCount(unsigned uNodeIndex) const
		{
		const unsigned n1 = m_uNeighbor1[uNodeIndex];
		const unsigned n2 = m_uNeighbor2[uNodeIndex];
		const unsigned n3 = m_uNeighbor3[uNodeIndex];
		return (NULL_NEIGHBOR != n1) + (NULL_NEIGHBOR != n2) + (NULL_NEIGHBOR != n3);
		}

	// A node is a leaf when it has exactly one neighbor; the only node of a
	// single-node tree also counts as a leaf.
	bool IsLeaf(unsigned uNodeIndex) const
		{
		assert(uNodeIndex < m_uNodeCount);
		if (1 == m_uNodeCount)
			return true;
		return 1 == GetNeighborCount(uNodeIndex);
		}

	// Always false for an unrooted tree.
	bool IsRoot(unsigned uNodeIndex) const
		{
		return IsRooted() && m_uRootNodeIndex == uNodeIndex;
		}

	unsigned GetLeafId(unsigned uNodeIndex) const;
	unsigned GetLeafNodeIndex(const char *ptrName) const;
	bool IsEdge(unsigned uNodeIndex1, unsigned uNodeIndex2) const;
	bool HasEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const;
	double GetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const;
	const char *GetLeafName(unsigned uNodeIndex) const;
	unsigned GetNeighborSubscript(unsigned uNodeIndex, unsigned uNeighborIndex) const;
	double GetNodeHeight(unsigned uNodeIndex) const;

// Depth-first traversal
	unsigned FirstDepthFirstNode() const;
	unsigned NextDepthFirstNode(unsigned uNodeIndex) const;

	unsigned FirstDepthFirstNodeR() const;
	unsigned NextDepthFirstNodeR(unsigned uNodeIndex) const;

// Equivalent of GetLeft/Right in unrooted tree, works in rooted tree too.
	unsigned GetFirstNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const;
	unsigned GetSecondNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const;

// Getting parent node in unrooted tree defined iff leaf
	unsigned GetLeafParent(unsigned uNodeIndex) const;

// Misc
	const char *NTTStr(NEWICK_TOKEN_TYPE NTT) const;
	void FindCenterByLongestSpan(unsigned *ptrNodeIndex1,
	  unsigned *ptrNodeIndex2) const;
	void PruneTree(const Tree &tree, unsigned Subfams[],
	  unsigned uSubfamCount);
	unsigned LeafIndexToNodeIndex(unsigned uLeafIndex) const;

// Debugging & trouble-shooting support
	void Validate() const;
	void ValidateNode(unsigned uNodeIndex) const;
	void AssertAreNeighbors(unsigned uNodeIndex1, unsigned uNodeIndex2) const;
	void LogMe() const;

private:
	unsigned UnrootFromFile();
	// Same as GetToken, but also logs the token type and text.
	NEWICK_TOKEN_TYPE GetTokenVerbose(TextFile &File, char szToken[],
	  unsigned uBytes) const
		{
		NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, uBytes);
		Log("GetToken %10.10s %s\n", NTTStr(NTT), szToken);
		return NTT;
		}

	void InitCache(unsigned uCacheCount);
	void ExpandCache();
	NEWICK_TOKEN_TYPE GetToken(TextFile &File, char szToken[], unsigned uBytes) const;
	bool GetGroupFromFile(TextFile &File, unsigned uNodeIndex, double *ptrdEdgeLength);
	unsigned GetLeafCountUnrooted(unsigned uNodeIndex1, unsigned uNodeIndex2,
	  double *ptrdTotalDistance) const;
	void ToFileNodeRooted(TextFile &File, unsigned uNodeIndex) const;
	void ToFileNodeUnrooted(TextFile &File, unsigned uNodeIndex, unsigned uParent) const;
	void OrientParent(unsigned uNodeIndex, unsigned uParentNodeIndex);
	double FromClustNode(const Clust &C, unsigned uClustNodeIndex, unsigned uPhyNodeIndex);
	unsigned GetAnyNonLeafNode() const;

// Yuck. Data is made public for the convenience of Tree::Copy.
// There has to be a better way.
// NOTE(review): the class owns these arrays through raw pointers and
// declares no copy constructor or assignment operator in the visible code;
// a compiler-generated copy would share the arrays -- confirm that copies
// are only ever made through Copy().
public:
	unsigned m_uNodeCount;
	unsigned m_uCacheCount;
	unsigned *m_uNeighbor1;
	unsigned *m_uNeighbor2;
	unsigned *m_uNeighbor3;
	double *m_dEdgeLength1;
	double *m_dEdgeLength2;
	double *m_dEdgeLength3;
	double *m_dHeight;
	bool *m_bHasEdgeLength1;
	bool *m_bHasEdgeLength2;
	bool *m_bHasEdgeLength3;
	bool *m_bHasHeight;
	unsigned *m_Ids;
	char **m_ptrName;
	bool m_bRooted;
	unsigned m_uRootNodeIndex;
	};

// Iteration state for PhyEnumBiParts / PhyEnumBiPartsR; starts out
// uninitialised with both node indexes set to NULL_NEIGHBOR.
struct PhyEnumEdgeState
	{
	PhyEnumEdgeState()
		{
		m_bInit = false;
		m_uNodeIndex1 = NULL_NEIGHBOR;
		m_uNodeIndex2 = NULL_NEIGHBOR;
		}
	bool m_bInit;
	unsigned m_uNodeIndex1;
	unsigned m_uNodeIndex2;
	};

// All bits set, i.e. the same value as NULL_NEIGHBOR (UINT_MAX).
const unsigned NODE_CHANGED = (unsigned) (~0);

extern bool PhyEnumBiParts(const Tree &tree, PhyEnumEdgeState &ES,
  unsigned Leaves1[], unsigned *ptruCount1,
  unsigned Leaves2[], unsigned *ptruCount2);

extern bool PhyEnumBiPartsR(const Tree &tree, PhyEnumEdgeState &ES,
  unsigned Leaves1[], unsigned *ptruCount1,
  unsigned Leaves2[], unsigned *ptruCount2);

extern void ClusterByHeight(const Tree &tree, double dMaxHeight, unsigned Subtrees[],
  unsigned *ptruSubtreeCount);

void ClusterBySubfamCount(const Tree &tree, unsigned uSubfamCount,
  unsigned Subfams[], unsigned *ptruSubfamCount);

void GetLeaves(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[],
  unsigned *ptruLeafCount);

void GetLeavesExcluding(const Tree &tree, unsigned uNodeIndex,
  unsigned uExclude, unsigned Leaves[], unsigned *ptruCount);

void GetInternalNodesInHeightOrder(const Tree &tree, unsigned NodeIndexes[]);

void ApplyMinEdgeLength(Tree &tree, double dMinEdgeLength);

void LeafIndexesToLeafNames(const Tree &tree, const unsigned Leaves[], unsigned uCount,
 char *Names[]);

void LeafIndexesToIds(const Tree &tree, const unsigned Leaves[], unsigned uCount,
  unsigned Ids[]);

void MSASeqSubset(const MSA &msaIn, char *Names[], unsigned uSeqCount,
  MSA &msaOut);

void DiffTrees(const Tree &Tree1, const Tree &Tree2, Tree &Diffs,
  unsigned IdToDiffsLeafNodeIndex[]);

void DiffTreesE(const Tree &NewTree, const Tree &OldTree,
  unsigned
NewNodeIndexToOldNodeIndex[]); void FindRoot(const Tree &tree, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2, ROOT RootMethod); void FixRoot(Tree &tree, ROOT RootMethod); #endif // tree_h muscle-3.8.31.orig/edgelist.cpp0000644000175000017500000000326711352261600015703 0ustar kratzcharles#include "muscle.h" #include "edgelist.h" EdgeList::EdgeList() { m_uNode1 = 0; m_uNode2 = 0; m_uCount = 0; m_uCacheSize = 0; } EdgeList::~EdgeList() { Clear(); } void EdgeList::Clear() { delete[] m_uNode1; delete[] m_uNode2; m_uNode1 = 0; m_uNode2 = 0; m_uCount = 0; m_uCacheSize = 0; } void EdgeList::Add(unsigned uNode1, unsigned uNode2) { if (m_uCount <= m_uCacheSize) Expand(); m_uNode1[m_uCount] = uNode1; m_uNode2[m_uCount] = uNode2; ++m_uCount; } unsigned EdgeList::GetCount() const { return m_uCount; } void EdgeList::GetEdge(unsigned uIndex, unsigned *ptruNode1, unsigned *ptruNode2) const { if (uIndex > m_uCount) Quit("EdgeList::GetEdge(%u) count=%u", uIndex, m_uCount); *ptruNode1 = m_uNode1[uIndex]; *ptruNode2 = m_uNode2[uIndex]; } void EdgeList::Copy(const EdgeList &rhs) { Clear(); const unsigned uCount = rhs.GetCount(); for (unsigned n = 0; n < uCount; ++n) { unsigned uNode1; unsigned uNode2; rhs.GetEdge(n, &uNode1, &uNode2); Add(uNode1, uNode2); } } void EdgeList::Expand() { unsigned uNewCacheSize = m_uCacheSize + 512; unsigned *NewNode1 = new unsigned[uNewCacheSize]; unsigned *NewNode2 = new unsigned[uNewCacheSize]; if (m_uCount > 0) { memcpy(NewNode1, m_uNode1, m_uCount*sizeof(unsigned)); memcpy(NewNode2, m_uNode2, m_uCount*sizeof(unsigned)); } delete[] m_uNode1; delete[] m_uNode2; m_uNode1 = NewNode1; m_uNode2 = NewNode2; m_uCacheSize = uNewCacheSize; } void EdgeList::LogMe() const { for (unsigned n = 0; n < m_uCount; ++n) { if (n > 0) Log(" "); Log("%u->%u", m_uNode1[n], m_uNode2[n]); } Log("\n"); } muscle-3.8.31.orig/glbalignsp.cpp0000644000175000017500000002235511352261617016234 0ustar kratzcharles#include "muscle.h" #include 
"profile.h" #include "pwpath.h" struct DP_MEMORY { unsigned uLength; SCORE *GapOpenA; SCORE *GapOpenB; SCORE *GapCloseA; SCORE *GapCloseB; SCORE *MPrev; SCORE *MCurr; SCORE *MWork; SCORE *DPrev; SCORE *DCurr; SCORE *DWork; SCORE **ScoreMxB; unsigned **SortOrderA; unsigned *uDeletePos; FCOUNT **FreqsA; int **TraceBack; }; static struct DP_MEMORY DPM; static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) { // Max prefix length unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; if (uLength < DPM.uLength) return; // Add 256 to allow for future expansion and // round up to next multiple of 32. uLength += 256; uLength += 32 - uLength%32; const unsigned uOldLength = DPM.uLength; if (uOldLength > 0) { for (unsigned i = 0; i < uOldLength; ++i) { delete[] DPM.TraceBack[i]; delete[] DPM.FreqsA[i]; delete[] DPM.SortOrderA[i]; } for (unsigned n = 0; n < 20; ++n) delete[] DPM.ScoreMxB[n]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.uDeletePos; delete[] DPM.GapOpenA; delete[] DPM.GapOpenB; delete[] DPM.GapCloseA; delete[] DPM.GapCloseB; delete[] DPM.SortOrderA; delete[] DPM.FreqsA; delete[] DPM.ScoreMxB; delete[] DPM.TraceBack; } DPM.uLength = uLength; DPM.GapOpenA = new SCORE[uLength]; DPM.GapOpenB = new SCORE[uLength]; DPM.GapCloseA = new SCORE[uLength]; DPM.GapCloseB = new SCORE[uLength]; DPM.SortOrderA = new unsigned*[uLength]; DPM.FreqsA = new FCOUNT*[uLength]; DPM.ScoreMxB = new SCORE*[20]; DPM.MPrev = new SCORE[uLength]; DPM.MCurr = new SCORE[uLength]; DPM.MWork = new SCORE[uLength]; DPM.DPrev = new SCORE[uLength]; DPM.DCurr = new SCORE[uLength]; DPM.DWork = new SCORE[uLength]; DPM.uDeletePos = new unsigned[uLength]; DPM.TraceBack = new int*[uLength]; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) DPM.ScoreMxB[uLetter] = new SCORE[uLength]; for (unsigned i = 0; i < uLength; ++i) { DPM.SortOrderA[i] = new unsigned[20]; DPM.FreqsA[i] = new FCOUNT[20]; 
		DPM.TraceBack[i] = new int[uLength];
		}
	}

// Align profile PA (uLengthA positions) against profile PB (uLengthB
// positions), filling Path via TraceBackToPath and returning the best score.
// Only two rows of the M and D vectors are kept (Prev/Curr, swapped with
// Rotate); the full TraceBack matrix is kept in the shared DPM buffers.
SCORE GlobalAlignSP(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
  unsigned uLengthB, PWPath &Path)
	{
	const unsigned uPrefixCountA = uLengthA + 1;
	const unsigned uPrefixCountB = uLengthB + 1;

	AllocDPMem(uLengthA, uLengthB);

	// Local aliases for the shared scratch buffers.
	SCORE *GapOpenA = DPM.GapOpenA;
	SCORE *GapOpenB = DPM.GapOpenB;
	SCORE *GapCloseA = DPM.GapCloseA;
	SCORE *GapCloseB = DPM.GapCloseB;

	unsigned **SortOrderA = DPM.SortOrderA;
	FCOUNT **FreqsA = DPM.FreqsA;
	SCORE **ScoreMxB = DPM.ScoreMxB;
	SCORE *MPrev = DPM.MPrev;
	SCORE *MCurr = DPM.MCurr;
	SCORE *MWork = DPM.MWork;

	SCORE *DPrev = DPM.DPrev;
	SCORE *DCurr = DPM.DCurr;
	SCORE *DWork = DPM.DWork;
	unsigned *uDeletePos = DPM.uDeletePos;

	int **TraceBack = DPM.TraceBack;

	// Copy per-position data out of the profiles into flat arrays.
	for (unsigned i = 0; i < uLengthA; ++i)
		{
		GapOpenA[i] = PA[i].m_scoreGapOpen;
		GapCloseA[i] = PA[i].m_scoreGapClose;

		for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
			{
			SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter];
			FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter];
			}
		}

	for (unsigned j = 0; j < uLengthB; ++j)
		{
		GapOpenB[j] = PB[j].m_scoreGapOpen;
		GapCloseB[j] = PB[j].m_scoreGapClose;
		}

	// ScoreMxB is indexed [letter][position in B].
	for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
		{
		for (unsigned j = 0; j < uLengthB; ++j)
			ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter];
		}

	for (unsigned i = 0; i < uPrefixCountA; ++i)
		memset(TraceBack[i], 0, uPrefixCountB*sizeof(int));

// Special case for i=0
	unsigned **ptrSortOrderA = SortOrderA;
	FCOUNT **ptrFreqsA = FreqsA;
	assert(ptrSortOrderA == &(SortOrderA[0]));
	assert(ptrFreqsA == &(FreqsA[0]));
	TraceBack[0][0] = 0;

	// Sum frequency*score over the letters of A[0] in sort order; the loop
	// stops at the first letter with zero frequency.
	SCORE scoreSum = 0;
	unsigned *ptrSortOrderAi = SortOrderA[0];
	const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20;
	FCOUNT *ptrFreqsAi = FreqsA[0];
	for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi)
		{
		const unsigned uLetter = *ptrSortOrderAi;
		const FCOUNT fcLetter = ptrFreqsAi[uLetter];
		if (0 == fcLetter)
			break;
		scoreSum += fcLetter*ScoreMxB[uLetter][0];
		}
	MPrev[0] = scoreSum - g_scoreCenter;

// D(0,0) is -infinity (requires I->D).
	DPrev[0] = MINUS_INFINITY;

	for (unsigned j = 1; j < uLengthB; ++j)
		{
	// Only way to get M(0, j) looks like this:
	// A ----X
	// B XXXXX
	// 0 j
	// So gap-open at j=0, gap-close at j-1.
		SCORE scoreSum = 0;
		unsigned *ptrSortOrderAi = SortOrderA[0];
		const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20;
		FCOUNT *ptrFreqsAi = FreqsA[0];
		for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi)
			{
			const unsigned uLetter = *ptrSortOrderAi;
			const FCOUNT fcLetter = ptrFreqsAi[uLetter];
			if (0 == fcLetter)
				break;
			scoreSum += fcLetter*ScoreMxB[uLetter][j];
			}
		MPrev[j] = scoreSum - g_scoreCenter + GapOpenB[0] + GapCloseB[j-1];
		// Negative trace-back entries are written on the insert path.
		TraceBack[0][j] = -(int) j;

	// Assume no D->I transitions, then can't be a delete if only
	// one letter from A.
		DPrev[j] = MINUS_INFINITY;
		}

	SCORE IPrev_j_1;
	for (unsigned i = 1; i < uLengthA; ++i)
		{
		++ptrSortOrderA;
		++ptrFreqsA;
		assert(ptrSortOrderA == &(SortOrderA[i]));
		assert(ptrFreqsA == &(FreqsA[i]));

		// First fill MCurr[j] with the letter-pair score of A[i] vs B[j]
		// for every j; the best predecessor score is added further down.
		SCORE *ptrMCurr_j = MCurr;
		memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE));
		const FCOUNT *FreqsAi = *ptrFreqsA;

		const unsigned *SortOrderAi = *ptrSortOrderA;
		const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20;
		const SCORE *ptrMCurrMax = MCurr + uLengthB;
		for (const unsigned *ptrSortOrderAi = SortOrderAi;
		  ptrSortOrderAi != ptrSortOrderAiEnd;
		  ++ptrSortOrderAi)
			{
			const unsigned uLetter = *ptrSortOrderAi;
			SCORE *NSBR_Letter = ScoreMxB[uLetter];
			const FCOUNT fcLetter = FreqsAi[uLetter];
			if (0 == fcLetter)
				break;
			SCORE *ptrNSBR = NSBR_Letter;
			for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr)
				*ptrMCurr += fcLetter*(*ptrNSBR++);
			}

		for (unsigned j = 0; j < uLengthB; ++j)
			MCurr[j] -= g_scoreCenter;

		ptrMCurr_j = MCurr;
		unsigned *ptrDeletePos = uDeletePos;

	// Special case for j=0
	// Only way to get M(i, 0) looks like this:
	// 0 i
	// A XXXXX
	// B ----X
	// So gap-open at i=0, gap-close at i-1.
		assert(ptrMCurr_j == &(MCurr[0]));
		*ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1];

		++ptrMCurr_j;

		// Positive trace-back entries are written on the delete path.
		int *ptrTraceBack_ij = TraceBack[i];
		*ptrTraceBack_ij++ = (int) i;

		SCORE *ptrMPrev_j = MPrev;
		SCORE *ptrDPrev = DPrev;
		SCORE d = *ptrDPrev;
		SCORE DNew = *ptrMPrev_j + GapOpenA[i];
		if (DNew > d)
			{
			d = DNew;
			*ptrDeletePos = i;
			}

		SCORE *ptrDCurr = DCurr;

		assert(ptrDCurr == &(DCurr[0]));
		*ptrDCurr = d;

	// Can't have an insert if no letters from B
		IPrev_j_1 = MINUS_INFINITY;

		unsigned uInsertPos;
		const SCORE scoreGapOpenAi = GapOpenA[i];
		const SCORE scoreGapCloseAi_1 = GapCloseA[i-1];

		for (unsigned j = 1; j < uLengthB; ++j)
			{
		// Here, MPrev_j is preserved from previous
		// iteration so with current i,j is M[i-1][j-1]
			SCORE MPrev_j = *ptrMPrev_j;
			SCORE INew = MPrev_j + GapOpenB[j];
			if (INew > IPrev_j_1)
				{
				IPrev_j_1 = INew;
				uInsertPos = j;
				}

			// Pick the best of match (trace-back left at 0), delete and
			// insert as predecessor of M(i,j).
			SCORE scoreMax = MPrev_j;

			assert(ptrDPrev == &(DPrev[j-1]));
			SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1;
			if (scoreD > scoreMax)
				{
				scoreMax = scoreD;
				assert(ptrDeletePos == &(uDeletePos[j-1]));
				*ptrTraceBack_ij = (int) i - (int) *ptrDeletePos;
				assert(*ptrTraceBack_ij > 0);
				}
			++ptrDeletePos;

			SCORE scoreI = IPrev_j_1 + GapCloseB[j-1];
			if (scoreI > scoreMax)
				{
				scoreMax = scoreI;
				*ptrTraceBack_ij = (int) uInsertPos - (int) j;
				assert(*ptrTraceBack_ij < 0);
				}

			assert(ptrSortOrderA == &(SortOrderA[i]));
			assert(ptrFreqsA == &(FreqsA[i]));

			*ptrMCurr_j += scoreMax;
			assert(ptrMCurr_j == &(MCurr[j]));
			++ptrMCurr_j;

			MPrev_j = *(++ptrMPrev_j);
			assert(ptrDPrev == &(DPrev[j]));
			SCORE d = *ptrDPrev;
			SCORE DNew = MPrev_j + scoreGapOpenAi;
			if (DNew > d)
				{
				d = DNew;
				assert(ptrDeletePos == &uDeletePos[j]);
				*ptrDeletePos = i;
				}
			assert(ptrDCurr + 1 == &(DCurr[j]));
			*(++ptrDCurr) = d;

			++ptrTraceBack_ij;
			}

		Rotate(MPrev, MCurr, MWork);
		Rotate(DPrev, DCurr, DWork);
		}

// Special case for i=uLengthA
	SCORE IPrev = MINUS_INFINITY;

	// NOTE(review): uInsertPos stays unset if no INew exceeds MINUS_INFINITY
	// in the loop below -- confirm it cannot then be read afterwards.
	unsigned uInsertPos;

	for (unsigned j = 1; j < uLengthB; ++j)
		{
		SCORE INew = MPrev[j-1] + GapOpenB[j];
		if (INew > IPrev)
			{
			uInsertPos = j;
			IPrev
= INew; } } // Special case for i=uLengthA, j=uLengthB SCORE scoreMax = MPrev[uLengthB-1]; int iTraceBack = 0; SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; if (scoreD > scoreMax) { scoreMax = scoreD; iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; } SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; if (scoreI > scoreMax) { scoreMax = scoreI; iTraceBack = (int) uInsertPos - (int) uLengthB; } TraceBack[uLengthA][uLengthB] = iTraceBack; TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); return scoreMax; } muscle-3.8.31.orig/globals.cpp0000644000175000017500000001361711366141374015540 0ustar kratzcharles#if WIN32 #include #include #endif #include "muscle.h" #include #include #include #include #include #include #include #include #ifndef MAX_PATH #define MAX_PATH 260 #endif static char g_strListFileName[MAX_PATH]; static bool g_bListFileAppend = false; static SEQWEIGHT g_SeqWeight = SEQWEIGHT_Undefined; void SetSeqWeightMethod(SEQWEIGHT Method) { g_SeqWeight = Method; } SEQWEIGHT GetSeqWeightMethod() { return g_SeqWeight; } void SetListFileName(const char *ptrListFileName, bool bAppend) { assert(strlen(ptrListFileName) < MAX_PATH); strcpy(g_strListFileName, ptrListFileName); g_bListFileAppend = bAppend; } void Log(const char szFormat[], ...) { if (0 == g_strListFileName[0]) return; static FILE *f = NULL; const char *mode; if (g_bListFileAppend) mode = "a"; else mode = "w"; if (NULL == f) f = _fsopen(g_strListFileName, mode, _SH_DENYNO); if (NULL == f) { perror(g_strListFileName); exit(EXIT_NotStarted); } char szStr[4096]; va_list ArgList; va_start(ArgList, szFormat); vsprintf(szStr, szFormat, ArgList); fprintf(f, "%s", szStr); fflush(f); } const char *GetTimeAsStr() { static char szStr[32]; time_t t; time(&t); struct tm *ptmCurrentTime = localtime(&t); strcpy(szStr, asctime(ptmCurrentTime)); assert('\n' == szStr[24]); szStr[24] = 0; return szStr; } // Exit immediately with error message, printf-style. void Quit(const char szFormat[], ...) 
{ va_list ArgList; char szStr[4096]; va_start(ArgList, szFormat); vsprintf(szStr, szFormat, ArgList); fprintf(stderr, "\n*** ERROR *** %s\n", szStr); Log("\n*** FATAL ERROR *** "); Log("%s\n", szStr); Log("Stopped %s\n", GetTimeAsStr()); #ifdef WIN32 if (IsDebuggerPresent()) { int iBtn = MessageBox(NULL, szStr, "muscle", MB_ICONERROR | MB_OKCANCEL); if (IDCANCEL == iBtn) Break(); } #endif exit(EXIT_FatalError); } void Warning(const char szFormat[], ...) { va_list ArgList; char szStr[4096]; va_start(ArgList, szFormat); vsprintf(szStr, szFormat, ArgList); fprintf(stderr, "\n*** WARNING *** %s\n", szStr); Log("\n*** WARNING *** %s\n", szStr); } // Remove leading and trailing blanks from string void TrimBlanks(char szStr[]) { TrimLeadingBlanks(szStr); TrimTrailingBlanks(szStr); } void TrimLeadingBlanks(char szStr[]) { size_t n = strlen(szStr); while (szStr[0] == ' ') { memmove(szStr, szStr+1, n); szStr[--n] = 0; } } void TrimTrailingBlanks(char szStr[]) { size_t n = strlen(szStr); while (n > 0 && szStr[n-1] == ' ') szStr[--n] = 0; } bool Verbose() { return true; } SCORE StrToScore(const char *pszStr) { return (SCORE) atof(pszStr); } void StripWhitespace(char szStr[]) { unsigned uOutPos = 0; unsigned uInPos = 0; while (char c = szStr[uInPos++]) if (' ' != c && '\t' != c && '\n' != c && '\r' != c) szStr[uOutPos++] = c; szStr[uOutPos] = 0; } void StripGaps(char szStr[]) { unsigned uOutPos = 0; unsigned uInPos = 0; while (char c = szStr[uInPos++]) if ('-' != c) szStr[uOutPos++] = c; szStr[uOutPos] = 0; } bool IsValidSignedInteger(const char *Str) { if (0 == strlen(Str)) return false; if ('+' == *Str || '-' == *Str) ++Str; while (char c = *Str++) if (!isdigit(c)) return false; return true; } bool IsValidInteger(const char *Str) { if (0 == strlen(Str)) return false; while (char c = *Str++) if (!isdigit(c)) return false; return true; } // Is c valid as first character in an identifier? 
bool isidentf(char c) { return isalpha(c) || '_' == c; } // Is c valid character in an identifier? bool isident(char c) { return isalpha(c) || isdigit(c) || '_' == c; } bool IsValidIdentifier(const char *Str) { if (!isidentf(Str[0])) return false; while (char c = *Str++) if (!isident(c)) return false; return true; } void SetLogFile() { const char *strFileName = ValueOpt("loga"); if (0 != strFileName) g_bListFileAppend = true; else strFileName = ValueOpt("log"); if (0 == strFileName) return; strcpy(g_strListFileName, strFileName); } // Get filename, stripping any extension and directory parts. void NameFromPath(const char szPath[], char szName[], unsigned uBytes) { if (0 == uBytes) return; const char *pstrLastSlash = strrchr(szPath, '/'); const char *pstrLastBackslash = strrchr(szPath, '\\'); const char *pstrLastDot = strrchr(szPath, '.'); const char *pstrLastSep = pstrLastSlash > pstrLastBackslash ? pstrLastSlash : pstrLastBackslash; const char *pstrBegin = pstrLastSep ? pstrLastSep + 1 : szPath; const char *pstrEnd = pstrLastDot ? pstrLastDot - 1 : szPath + strlen(szPath); unsigned uNameLength = (unsigned) (pstrEnd - pstrBegin + 1); if (uNameLength > uBytes - 1) uNameLength = uBytes - 1; memcpy(szName, pstrBegin, uNameLength); szName[uNameLength] = 0; } char *strsave(const char *s) { char *ptrCopy = strdup(s); if (0 == ptrCopy) Quit("Out of memory"); return ptrCopy; } bool IsValidFloatChar(char c) { return isdigit(c) || '.' == c || 'e' == c || 'E' == c || 'd' == c || 'D' == c || '.' 
== c || '+' == c || '-' == c; } void Call_MY_ASSERT(const char *file, int line, bool b, const char *msg) { if (b) return; Quit("%s(%d): MY_ASSERT(%s)", file, line, msg); } static size_t g_MemTotal; void MemPlus(size_t Bytes, char *Where) { g_MemTotal += Bytes; Log("+%10u %6u %6u %s\n", (unsigned) Bytes, (unsigned) GetMemUseMB(), (unsigned) (g_MemTotal/1000000), Where); } void MemMinus(size_t Bytes, char *Where) { g_MemTotal -= Bytes; Log("-%10u %6u %6u %s\n", (unsigned) Bytes, (unsigned) GetMemUseMB(), (unsigned) (g_MemTotal/1000000), Where); } muscle-3.8.31.orig/Makefile0000644000175000017500000000003511367660321015036 0ustar kratzcharlesmuscle: chmod +x ./mk ./mk muscle-3.8.31.orig/phy4.cpp0000644000175000017500000002074711352261600014771 0ustar kratzcharles#include "muscle.h" #include "tree.h" #include #define TRACE 0 void ClusterByHeight(const Tree &tree, double dMaxHeight, unsigned Subtrees[], unsigned *ptruSubtreeCount) { if (!tree.IsRooted()) Quit("ClusterByHeight: requires rooted tree"); #if TRACE Log("ClusterByHeight, max height=%g\n", dMaxHeight); #endif unsigned uSubtreeCount = 0; const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (tree.IsRoot(uNodeIndex)) continue; unsigned uParent = tree.GetParent(uNodeIndex); double dHeight = tree.GetNodeHeight(uNodeIndex); double dParentHeight = tree.GetNodeHeight(uParent); #if TRACE Log("Node %3u Height %5.2f ParentHeight %5.2f\n", uNodeIndex, dHeight, dParentHeight); #endif if (dParentHeight > dMaxHeight && dHeight <= dMaxHeight) { Subtrees[uSubtreeCount] = uNodeIndex; #if TRACE Log("Subtree[%u]=%u\n", uSubtreeCount, uNodeIndex); #endif ++uSubtreeCount; } } *ptruSubtreeCount = uSubtreeCount; } static void ClusterBySubfamCount_Iteration(const Tree &tree, unsigned Subfams[], unsigned uCount) { // Find highest child node of current set of subfamilies. 
double dHighestHeight = -1e20; int iParentSubscript = -1; for (int n = 0; n < (int) uCount; ++n) { const unsigned uNodeIndex = Subfams[n]; if (tree.IsLeaf(uNodeIndex)) continue; const unsigned uLeft = tree.GetLeft(uNodeIndex); const double dHeightLeft = tree.GetNodeHeight(uLeft); if (dHeightLeft > dHighestHeight) { dHighestHeight = dHeightLeft; iParentSubscript = n; } const unsigned uRight = tree.GetRight(uNodeIndex); const double dHeightRight = tree.GetNodeHeight(uRight); if (dHeightRight > dHighestHeight) { dHighestHeight = dHeightRight; iParentSubscript = n; } } if (-1 == iParentSubscript) Quit("CBSFCIter: failed to find highest child"); const unsigned uNodeIndex = Subfams[iParentSubscript]; const unsigned uLeft = tree.GetLeft(uNodeIndex); const unsigned uRight = tree.GetRight(uNodeIndex); // Delete parent by replacing with left child Subfams[iParentSubscript] = uLeft; // Append right child to list Subfams[uCount] = uRight; #if TRACE { Log("Iter %3u:", uCount); for (unsigned n = 0; n < uCount; ++n) Log(" %u", Subfams[n]); Log("\n"); } #endif } // Divide a tree containing N leaves into k families by // cutting the tree at a horizontal line at some height. // Each internal node defines a height for the cut, // considering all internal nodes enumerates all distinct // cuts. Visit internal nodes in decreasing order of height. // Visiting the node corresponds to moving the horizontal // line down to cut the tree at the height of that node. // We consider the cut to be "infinitestimally below" // the node, so the effect is to remove the current node // from the list of subfamilies and add its two children. // We must visit a parent before its children (so care may // be needed to handle zero edge lengths properly). // We assume that N is small, and write dumb O(N^2) code. // More efficient strategies are possible for large N // by maintaining a list of nodes sorted by height. 
void ClusterBySubfamCount(const Tree &tree, unsigned uSubfamCount, unsigned Subfams[], unsigned *ptruSubfamCount) { const unsigned uNodeCount = tree.GetNodeCount(); const unsigned uLeafCount = (uNodeCount + 1)/2; // Special case: empty tree if (0 == uNodeCount) { *ptruSubfamCount = 0; return; } // Special case: more subfamilies than leaves if (uSubfamCount >= uLeafCount) { for (unsigned n = 0; n < uLeafCount; ++n) Subfams[n] = n; *ptruSubfamCount = uLeafCount; return; } // Initialize list of subfamilies to be root Subfams[0] = tree.GetRootNodeIndex(); // Iterate for (unsigned i = 1; i < uSubfamCount; ++i) ClusterBySubfamCount_Iteration(tree, Subfams, i); *ptruSubfamCount = uSubfamCount; } static void GetLeavesRecurse(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[], unsigned &uLeafCount /* in-out */) { if (tree.IsLeaf(uNodeIndex)) { Leaves[uLeafCount] = uNodeIndex; ++uLeafCount; return; } const unsigned uLeft = tree.GetLeft(uNodeIndex); const unsigned uRight = tree.GetRight(uNodeIndex); GetLeavesRecurse(tree, uLeft, Leaves, uLeafCount); GetLeavesRecurse(tree, uRight, Leaves, uLeafCount); } void GetLeaves(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[], unsigned *ptruLeafCount) { unsigned uLeafCount = 0; GetLeavesRecurse(tree, uNodeIndex, Leaves, uLeafCount); *ptruLeafCount = uLeafCount; } void Tree::PruneTree(const Tree &tree, unsigned Subfams[], unsigned uSubfamCount) { if (!tree.IsRooted()) Quit("Tree::PruneTree: requires rooted tree"); Clear(); m_uNodeCount = 2*uSubfamCount - 1; InitCache(m_uNodeCount); const unsigned uUnprunedNodeCount = tree.GetNodeCount(); unsigned *uUnprunedToPrunedIndex = new unsigned[uUnprunedNodeCount]; unsigned *uPrunedToUnprunedIndex = new unsigned[m_uNodeCount]; for (unsigned n = 0; n < uUnprunedNodeCount; ++n) uUnprunedToPrunedIndex[n] = NULL_NEIGHBOR; for (unsigned n = 0; n < m_uNodeCount; ++n) uPrunedToUnprunedIndex[n] = NULL_NEIGHBOR; // Create mapping between unpruned and pruned node indexes unsigned 
uInternalNodeIndex = uSubfamCount; for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uUnprunedNodeIndex = Subfams[uSubfamIndex]; uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uSubfamIndex; uPrunedToUnprunedIndex[uSubfamIndex] = uUnprunedNodeIndex; for (;;) { uUnprunedNodeIndex = tree.GetParent(uUnprunedNodeIndex); if (tree.IsRoot(uUnprunedNodeIndex)) break; // Already visited this node? if (NULL_NEIGHBOR != uUnprunedToPrunedIndex[uUnprunedNodeIndex]) break; uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uInternalNodeIndex; uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedNodeIndex; ++uInternalNodeIndex; } } const unsigned uUnprunedRootIndex = tree.GetRootNodeIndex(); uUnprunedToPrunedIndex[uUnprunedRootIndex] = uInternalNodeIndex; uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedRootIndex; #if TRACE { Log("Pruned to unpruned:\n"); for (unsigned i = 0; i < m_uNodeCount; ++i) Log(" [%u]=%u", i, uPrunedToUnprunedIndex[i]); Log("\n"); Log("Unpruned to pruned:\n"); for (unsigned i = 0; i < uUnprunedNodeCount; ++i) { unsigned n = uUnprunedToPrunedIndex[i]; if (n != NULL_NEIGHBOR) Log(" [%u]=%u", i, n); } Log("\n"); } #endif if (uInternalNodeIndex != m_uNodeCount - 1) Quit("Tree::PruneTree, Internal error"); // Nodes 0, 1 ... 
are the leaves for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { char szName[32]; sprintf(szName, "Subfam_%u", uSubfamIndex + 1); m_ptrName[uSubfamIndex] = strsave(szName); } for (unsigned uPrunedNodeIndex = uSubfamCount; uPrunedNodeIndex < m_uNodeCount; ++uPrunedNodeIndex) { unsigned uUnprunedNodeIndex = uPrunedToUnprunedIndex[uPrunedNodeIndex]; const unsigned uUnprunedLeft = tree.GetLeft(uUnprunedNodeIndex); const unsigned uUnprunedRight = tree.GetRight(uUnprunedNodeIndex); const unsigned uPrunedLeft = uUnprunedToPrunedIndex[uUnprunedLeft]; const unsigned uPrunedRight = uUnprunedToPrunedIndex[uUnprunedRight]; const double dLeftLength = tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedLeft); const double dRightLength = tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedRight); m_uNeighbor2[uPrunedNodeIndex] = uPrunedLeft; m_uNeighbor3[uPrunedNodeIndex] = uPrunedRight; m_dEdgeLength1[uPrunedLeft] = dLeftLength; m_dEdgeLength1[uPrunedRight] = dRightLength; m_uNeighbor1[uPrunedLeft] = uPrunedNodeIndex; m_uNeighbor1[uPrunedRight] = uPrunedNodeIndex; m_bHasEdgeLength1[uPrunedLeft] = true; m_bHasEdgeLength1[uPrunedRight] = true; m_dEdgeLength2[uPrunedNodeIndex] = dLeftLength; m_dEdgeLength3[uPrunedNodeIndex] = dRightLength; m_bHasEdgeLength2[uPrunedNodeIndex] = true; m_bHasEdgeLength3[uPrunedNodeIndex] = true; } m_uRootNodeIndex = uUnprunedToPrunedIndex[uUnprunedRootIndex]; m_bRooted = true; Validate(); delete[] uUnprunedToPrunedIndex; } void LeafIndexesToIds(const Tree &tree, const unsigned Leaves[], unsigned uCount, unsigned Ids[]) { for (unsigned n = 0; n < uCount; ++n) Ids[n] = tree.GetLeafId(Leaves[n]); } muscle-3.8.31.orig/params.h0000644000175000017500000000556411352261673015047 0ustar kratzcharles#ifndef params_h #define params_h extern const char *g_pstrInFileName; extern const char *g_pstrOutFileName; extern const char *g_pstrFASTAOutFileName; extern const char *g_pstrMSFOutFileName; extern const char *g_pstrClwOutFileName; extern 
const char *g_pstrClwStrictOutFileName; extern const char *g_pstrHTMLOutFileName; extern const char *g_pstrPHYIOutFileName; extern const char *g_pstrPHYSOutFileName; extern const char *g_pstrDistMxFileName1; extern const char *g_pstrDistMxFileName2; extern const char *g_pstrFileName1; extern const char *g_pstrFileName2; extern const char *g_pstrSPFileName; extern const char *g_pstrMatrixFileName; extern const char *g_pstrUseTreeFileName; extern bool g_bUseTreeNoWarn; extern const char *g_pstrComputeWeightsFileName; extern const char *g_pstrScoreFileName; extern SCORE g_scoreGapOpen; extern SCORE g_scoreCenter; extern SCORE g_scoreGapExtend; extern SCORE g_scoreGapAmbig; #if DOUBLE_AFFINE extern SCORE g_scoreGapOpen2; extern SCORE g_scoreGapExtend2; #endif extern unsigned g_uSmoothWindowLength; extern unsigned g_uAnchorSpacing; extern unsigned g_uMaxTreeRefineIters; extern unsigned g_uMinDiagLength; extern unsigned g_uMaxDiagBreak; extern unsigned g_uDiagMargin; extern unsigned g_uRefineWindow; extern unsigned g_uWindowFrom; extern unsigned g_uWindowTo; extern unsigned g_uSaveWindow; extern unsigned g_uWindowOffset; extern unsigned g_uMaxSubFamCount; extern unsigned g_uHydrophobicRunLength; extern float g_dHydroFactor; extern float g_dSmoothScoreCeil; extern float g_dMinBestColScore; extern float g_dMinSmoothScore; extern float g_dSUEFF; extern bool g_bPrecompiledCenter; extern bool g_bNormalizeCounts; extern bool g_bDiags1; extern bool g_bDiags2; extern bool g_bDiags; extern bool g_bAnchors; extern bool g_bCatchExceptions; extern bool g_bMSF; extern bool g_bAln; extern bool g_bClwStrict; extern bool g_bHTML; extern bool g_bPHYI; extern bool g_bPHYS; extern bool g_bQuiet; extern bool g_bVerbose; extern bool g_bRefine; extern bool g_bRefineW; extern bool g_bRefineX; extern bool g_bLow; extern bool g_bSW; extern bool g_bClusterOnly; extern bool g_bProfile; extern bool g_bProfDB; extern bool g_bPPScore; extern bool g_bBrenner; extern bool g_bDimer; extern bool 
g_bVersion;
extern bool g_bStable;
extern bool g_bFASTA;
extern bool g_bPAS;
extern bool g_bTomHydro;
extern bool g_bMakeTree;

extern PPSCORE g_PPScore;
extern OBJSCORE g_ObjScore;

extern DISTANCE g_Distance1;
extern CLUSTER g_Cluster1;
extern ROOT g_Root1;
extern SEQWEIGHT g_SeqWeight1;

extern DISTANCE g_Distance2;
extern CLUSTER g_Cluster2;
extern ROOT g_Root2;
extern SEQWEIGHT g_SeqWeight2;

extern unsigned g_uMaxIters;
extern unsigned long g_ulMaxSecs;
extern unsigned g_uMaxMB;

extern SEQTYPE g_SeqType;
extern TERMGAPS g_TermGaps;

#endif // params_h
muscle-3.8.31.orig/writescorefile.cpp0000644000175000017500000000323211352261666017135 0ustar kratzcharles#include "muscle.h"
#include "msa.h"
#include

// Pair-score tables indexed by two letter codes; defined elsewhere.
extern float VTML_SP[32][32];
extern float NUC_SP[32][32];

// Average pair score of column uCol: the mean of the table score over all
// pairs of sequences that both have a non-gap letter with code below
// g_AlphaSize in that column. Returns 0 when no such pair exists.
static double GetColScore(const MSA &msa, unsigned uCol)
	{
	const unsigned uSeqCount = msa.GetSeqCount();
	unsigned uPairCount = 0;
	double dSum = 0.0;
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		if (msa.IsGap(uSeq1, uCol))
			continue;
		unsigned uLetter1 = msa.GetLetterEx(uSeq1, uCol);
		// Letter codes at or above g_AlphaSize are left out of the average.
		if (uLetter1 >= g_AlphaSize)
			continue;
		for (unsigned uSeq2 = uSeq1 + 1; uSeq2 < uSeqCount; ++uSeq2)
			{
			if (msa.IsGap(uSeq2, uCol))
				continue;
			unsigned uLetter2 = msa.GetLetterEx(uSeq2, uCol);
			if (uLetter2 >= g_AlphaSize)
				continue;
			double Score;
			// Amino acids use VTML_SP, DNA and RNA use NUC_SP.
			switch (g_Alpha)
				{
			case ALPHA_Amino:
				Score = VTML_SP[uLetter1][uLetter2];
				break;
			case ALPHA_DNA:
			case ALPHA_RNA:
				Score = NUC_SP[uLetter1][uLetter2];
				break;
			default:
				Quit("GetColScore: invalid alpha=%d", g_Alpha);
				}
			dSum += Score;
			++uPairCount;
			}
		}
	if (0 == uPairCount)
		return 0;
	return dSum / uPairCount;
	}

// Write one line per alignment column to the file named by
// g_pstrScoreFileName: the column score followed by the characters of that
// column in every sequence. Quits if the file cannot be opened.
void WriteScoreFile(const MSA &msa)
	{
	FILE *f = fopen(g_pstrScoreFileName, "w");
	if (0 == f)
		Quit("Cannot open score file '%s' errno=%d", g_pstrScoreFileName, errno);

	const unsigned uColCount = msa.GetColCount();
	const unsigned uSeqCount = msa.GetSeqCount();
	for (unsigned uCol = 0; uCol < uColCount; ++uCol)
		{
		double Score = GetColScore(msa, uCol);
		fprintf(f, "%10.3f ", Score);
		for
 (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq)
			{
			char c = msa.GetChar(uSeq, uCol);
			fprintf(f, "%c", c);
			}
		fprintf(f, "\n");
		}
	fclose(f);
	}
muscle-3.8.31.orig/glbalndimer.cpp0000644000175000017500000002415511352261667016377 0ustar kratzcharles#include "muscle.h"
#include
#include // for sprintf
#include "pwpath.h"
#include "profile.h"
#include "gapscoredimer.h"

#define TRACE 0

// Forward declaration; defined after GlobalAlignDimer.
static SCORE TraceBackDimer( const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_,
  const char *TBM_, const char *TBD_, const char *TBI_,
  unsigned uLengthA, unsigned uLengthB, PWPath &Path);

// Format a score for the trace listings; MINUS_INFINITY is shown as '*'.
// Returns a pointer to a static buffer that the next call overwrites.
static const char *LocalScoreToStr(SCORE s)
	{
	static char str[16];
	if (MINUS_INFINITY == s)
		return " *";
	sprintf(str, "%6.3g", s);
	return str;
	}

#if TRACE
// Log one DP matrix as a table, with the consensus character of each
// profile position as row and column headings.
static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
  unsigned uPrefixCountA, unsigned uPrefixCountB)
	{
	Log(" ");
	for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
		{
		char c = ' ';
		if (uPrefixLengthB > 0)
			c = ConsensusChar(PB[uPrefixLengthB - 1]);
		Log(" %4u:%c", uPrefixLengthB, c);
		}
	Log("\n");
	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
		{
		char c = ' ';
		if (uPrefixLengthA > 0)
			c = ConsensusChar(PA[uPrefixLengthA - 1]);
		Log("%4u:%c ", uPrefixLengthA, c);
		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
			Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB)));
		Log("\n");
		}
	}

// Log one trace-back matrix as a table of edge characters.
static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB,
  unsigned uPrefixCountA, unsigned uPrefixCountB)
	{
	Log(" ");
	for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
		Log("%2d", uPrefixLengthB);
	Log("\n");
	Log(" ");
	for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
		{
		char c = ' ';
		if (uPrefixLengthB > 0)
			c = ConsensusChar(PB[uPrefixLengthB - 1]);
		Log(" %c", c);
		}
	Log("\n");
	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
		{
		char c
 = ' ';
		if (uPrefixLengthA > 0)
			c = ConsensusChar(PA[uPrefixLengthA - 1]);
		Log("%4u:%c ", uPrefixLengthA, c);
		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
			Log(" %c", TBM(uPrefixLengthA, uPrefixLengthB));
		Log("\n");
		}
	}
#endif // TRACE

// Profile position handed to the gap-score functions where a real position
// is not available (the first row and column of the DP matrices in
// GlobalAlignDimer).
static ProfPos PPTerm;

// Fill in PPTerm; returns true so it can run as a static initializer.
static bool InitializePPTerm()
	{
	PPTerm.m_bAllGaps = false;
	PPTerm.m_LL = 1;
	PPTerm.m_LG = 0;
	PPTerm.m_GL = 0;
	PPTerm.m_GG = 0;
	PPTerm.m_fOcc = 1;
	return true;
	}
// Exists only to run InitializePPTerm during static initialization.
static bool PPTermInitialized = InitializePPTerm();

// Score two profile positions for PPSCORE_LE: the natural log of the
// frequency-weighted score sum, scaled by the product of the two m_fOcc
// values. A sum of exactly 0 yields the fixed value -2.5.
// NOTE(review): a negative sum would reach logf unchecked -- confirm the
// m_AAScores used in this mode are never negative.
static SCORE ScoreProfPosDimerLE(const ProfPos &PPA, const ProfPos &PPB)
	{
	SCORE Score = 0;
	// Letters are visited in PPA's sort order; the loop stops at the first
	// letter with zero frequency.
	for (unsigned n = 0; n < 20; ++n)
		{
		const unsigned uLetter = PPA.m_uSortOrder[n];
		const FCOUNT fcLetter = PPA.m_fcCounts[uLetter];
		if (0 == fcLetter)
			break;
		Score += fcLetter*PPB.m_AAScores[uLetter];
		}
	if (0 == Score)
		return -2.5;
	SCORE logScore = logf(Score);
	return (SCORE) (logScore*(PPA.m_fOcc * PPB.m_fOcc));
	}

// Score two profile positions for PPSCORE_SP / PPSCORE_SV: the plain
// frequency-weighted score sum.
static SCORE ScoreProfPosDimerPSP(const ProfPos &PPA, const ProfPos &PPB)
	{
	SCORE Score = 0;
	for (unsigned n = 0; n < 20; ++n)
		{
		const unsigned uLetter = PPA.m_uSortOrder[n];
		const FCOUNT fcLetter = PPA.m_fcCounts[uLetter];
		if (0 == fcLetter)
			break;
		Score += fcLetter*PPB.m_AAScores[uLetter];
		}
	return Score;
	}

// Dispatch on g_PPScore to the matching scoring function; quits on any
// other value.
static SCORE ScoreProfPosDimer(const ProfPos &PPA, const ProfPos &PPB)
	{
	switch (g_PPScore)
		{
	case PPSCORE_LE:
		return ScoreProfPosDimerLE(PPA, PPB);

	case PPSCORE_SP:
	case PPSCORE_SV:
		return ScoreProfPosDimerPSP(PPA, PPB);
		}
	Quit("Invalid g_PPScore");
	return 0;
	}

// Global alignment dynamic programming
// This variant optimizes the profile-profile SP score under the
// dimer approximation.
// Aligns profile PA (uLengthA positions) with PB (uLengthB positions), both
// of which must be non-empty. Three score matrices (M = letter/letter,
// D = letter in A against gap in B, I = gap in A against letter in B) and
// three matching trace-back matrices are allocated, filled, traced back into
// Path and freed again. Returns the best score in the final cell.
// NOTE(review): the DPM/DPD/DPI/TBM/TBD/TBI accessors are macros defined
// outside this view; they appear to index the *_ arrays using uPrefixCountA.
SCORE GlobalAlignDimer(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
  unsigned uLengthB, PWPath &Path)
	{
	assert(uLengthB > 0 && uLengthA > 0);

	const unsigned uPrefixCountA = uLengthA + 1;
	const unsigned uPrefixCountB = uLengthB + 1;

// Allocate DP matrices
	const size_t LM = uPrefixCountA*uPrefixCountB;
	SCORE *DPM_ = new SCORE[LM];
	SCORE *DPD_ = new SCORE[LM];
	SCORE *DPI_ = new SCORE[LM];

	char *TBM_ = new char[LM];
	char *TBD_ = new char[LM];
	char *TBI_ = new char[LM];

	// Cells (0,0), (1,0) and (0,1). In the trace-back matrices 'S' marks a
	// cell where a path can start and '?' a cell that cannot be reached.
	DPM(0, 0) = 0;
	DPD(0, 0) = MINUS_INFINITY;
	DPI(0, 0) = MINUS_INFINITY;

	TBM(0, 0) = 'S';
	TBD(0, 0) = '?';
	TBI(0, 0) = '?';

	DPM(1, 0) = MINUS_INFINITY;
	DPD(1, 0) = GapScoreMD(PA[0], PPTerm);
	DPI(1, 0) = MINUS_INFINITY;

	TBM(1, 0) = '?';
	TBD(1, 0) = 'S';
	TBI(1, 0) = '?';

	DPM(0, 1) = MINUS_INFINITY;
	DPD(0, 1) = MINUS_INFINITY;
	DPI(0, 1) = GapScoreMI(PPTerm, PB[0]);

	TBM(0, 1) = '?';
	TBD(0, 1) = '?';
	TBI(0, 1) = 'S';

// Empty prefix of B is special case
	for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
		{
	// M=LetterA+LetterB, impossible with empty prefix
		DPM(uPrefixLengthA, 0) = MINUS_INFINITY;
		TBM(uPrefixLengthA, 0) = '?';

	// D=LetterA+GapB
		DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) +
		  GapScoreDD(PA[uPrefixLengthA - 1], PPTerm);
		TBD(uPrefixLengthA, 0) = 'D';

	// I=GapA+LetterB, impossible with empty prefix
		DPI(uPrefixLengthA, 0) = MINUS_INFINITY;
		TBI(uPrefixLengthA, 0) = '?';
		}

// Empty prefix of A is special case
	for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
		{
	// M=LetterA+LetterB, impossible with empty prefix
		DPM(0, uPrefixLengthB) = MINUS_INFINITY;
		TBM(0, uPrefixLengthB) = '?';

	// D=LetterA+GapB, impossible with empty prefix
		DPD(0, uPrefixLengthB) = MINUS_INFINITY;
		TBD(0, uPrefixLengthB) = '?';

	// I=GapA+LetterB
		DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) +
		  GapScoreII(PPTerm, PB[uPrefixLengthB - 1]);
		TBI(0, uPrefixLengthB) = 'I';
		}

// ============
// Main DP loop
// ============
	for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
		{
		const ProfPos &PPB = PB[uPrefixLengthB - 1];
		for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
			{
			const ProfPos &PPA = PA[uPrefixLengthA - 1];
			// In each of the three blocks below, the best of the M, D and I
			// predecessors is kept and its letter recorded for trace-back.
			// Ties keep the earlier candidate in M, D, I order.
			{
		// Match M=LetterA+LetterB
			SCORE scoreLL = ScoreProfPosDimer(PPA, PPB);

			SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreMM(PPA, PPB);
			SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreDM(PPA, PPB);
			SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreIM(PPA, PPB);

			SCORE scoreBest = scoreMM;
			char c = 'M';
			if (scoreDM > scoreBest)
				{
				scoreBest = scoreDM;
				c = 'D';
				}
			if (scoreIM > scoreBest)
				{
				scoreBest = scoreIM;
				c = 'I';
				}

			DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL;
			TBM(uPrefixLengthA, uPrefixLengthB) = c;
			}
			{
		// Delete D=LetterA+GapB
			SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + GapScoreMD(PPA, PPB);
			SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + GapScoreDD(PPA, PPB);
			SCORE scoreID = DPI(uPrefixLengthA-1, uPrefixLengthB) + GapScoreID(PPA, PPB);

			SCORE scoreBest = scoreMD;
			char c = 'M';
			if (scoreDD > scoreBest)
				{
				scoreBest = scoreDD;
				c = 'D';
				}
			if (scoreID > scoreBest)
				{
				scoreBest = scoreID;
				c = 'I';
				}

			DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest;
			TBD(uPrefixLengthA, uPrefixLengthB) = c;
			}
			{
		// Insert I=GapA+LetterB
			SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + GapScoreMI(PPA, PPB);
			SCORE scoreDI = DPD(uPrefixLengthA, uPrefixLengthB-1) + GapScoreDI(PPA, PPB);
			SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + GapScoreII(PPA, PPB);

			SCORE scoreBest = scoreMI;
			char c = 'M';
			if (scoreDI > scoreBest)
				{
				scoreBest = scoreDI;
				c = 'D';
				}
			if (scoreII > scoreBest)
				{
				scoreBest = scoreII;
				c = 'I';
				}

			DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest;
			TBI(uPrefixLengthA, uPrefixLengthB) = c;
			}
			}
		}

#if TRACE
	Log("DPM:\n");
	ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB);
	Log("DPD:\n");
	ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB);
	Log("DPI:\n");
	ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB);
	Log("TBM:\n");
	ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB);
	Log("TBD:\n");
	ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB);
	Log("TBI:\n");
	ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB);
#endif

	SCORE Score = TraceBackDimer(DPM_, DPD_, DPI_,
	  TBM_, TBD_, TBI_,
	  uLengthA, uLengthB, Path);

#if TRACE
	Log("GlobalAlignDimer score = %.3g\n", Score);
#endif

	delete[] DPM_;
	delete[] DPD_;
	delete[] DPI_;

	delete[] TBM_;
	delete[] TBD_;
	delete[] TBI_;

	return Score;
	}

// Recover the alignment path from the matrices filled by GlobalAlignDimer.
// Starts in the best-scoring of the three states at (uLengthA, uLengthB),
// then repeatedly prepends the current edge to Path and follows the
// trace-back letter of that state until both prefix lengths reach 0.
// Returns the score of the starting cell. Quits on an edge letter other
// than 'M', 'D' or 'I'.
static SCORE TraceBackDimer( const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_,
  const char *TBM_, const char *TBD_, const char *TBI_,
  unsigned uLengthA, unsigned uLengthB, PWPath &Path)
	{
	// Not referenced by name below; presumably used inside the DPM/TBM
	// accessor macros -- confirm.
	const unsigned uPrefixCountA = uLengthA + 1;

	unsigned uPrefixLengthA = uLengthA;
	unsigned uPrefixLengthB = uLengthB;

	char cEdge = 'M';
	SCORE scoreMax = DPM(uLengthA, uLengthB);
	if (DPD(uLengthA, uLengthB) > scoreMax)
		{
		scoreMax = DPD(uLengthA, uLengthB);
		cEdge = 'D';
		}
	if (DPI(uLengthA, uLengthB) > scoreMax)
		{
		scoreMax = DPI(uLengthA, uLengthB);
		cEdge = 'I';
		}

	for (;;)
		{
		if (0 == uPrefixLengthA && 0 == uPrefixLengthB)
			break;

		PWEdge Edge;
		Edge.cType = cEdge;
		Edge.uPrefixLengthA = uPrefixLengthA;
		Edge.uPrefixLengthB = uPrefixLengthB;
		Path.PrependEdge(Edge);

#if TRACE
		Log("PLA=%u PLB=%u Edge=%c\n", uPrefixLengthA, uPrefixLengthB, cEdge);
#endif
		// The next edge type is read from the current cell before stepping
		// back: M steps back in both profiles, D in A only, I in B only.
		switch (cEdge)
			{
		case 'M':
			assert(uPrefixLengthA > 0 && uPrefixLengthB > 0);
			cEdge = TBM(uPrefixLengthA, uPrefixLengthB);
			--uPrefixLengthA;
			--uPrefixLengthB;
			break;
		case 'D':
			assert(uPrefixLengthA > 0);
			cEdge = TBD(uPrefixLengthA, uPrefixLengthB);
			--uPrefixLengthA;
			break;
		case 'I':
			assert(uPrefixLengthB > 0);
			cEdge = TBI(uPrefixLengthA, uPrefixLengthB);
			--uPrefixLengthB;
			break;
		default:
			Quit("Invalid edge PLA=%u PLB=%u %c", uPrefixLengthA, uPrefixLengthB, cEdge);
			}
		}
#if TRACE
	Path.LogMe();
#endif
	return scoreMax;
	}
muscle-3.8.31.orig/muscle.h0000644000175000017500000002704511367131123015042 0ustar kratzcharles#if DEBUG &&
!_DEBUG #define _DEBUG 1 #endif #if _DEBUG && !DEBUG #define DEBUG 1 #endif #if _MSC_VER #define TIMING 0 #endif #define VER_3_52 0 #ifdef _MSC_VER // Miscrosoft compiler #pragma warning(disable : 4800) // int-bool conversion #pragma warning(disable : 4996) // deprecated names like strdup, isatty. #endif extern const char *MUSCLE_LONG_VERSION; #define SHORT_VERSION "3.8" #include #include #include #include #include #define DOUBLE_AFFINE 0 #define SINGLE_AFFINE 1 #define PAF 0 #include "types.h" #include "intmath.h" #include "alpha.h" #include "params.h" #ifndef _WIN32 #define stricmp strcasecmp #define strnicmp strncasecmp #define _snprintf snprintf #define _fsopen(name, mode, share) fopen((name), (mode)) #endif #if DEBUG #undef assert #define assert(b) Call_MY_ASSERT(__FILE__, __LINE__, b, #b) void Call_MY_ASSERT(const char *file, int line, bool b, const char *msg); #else #define assert(exp) ((void)0) #endif extern int g_argc; extern char **g_argv; #define Rotate(a, b, c) { SCORE *tmp = a; a = b; b = c; c = tmp; } const double VERY_LARGE_DOUBLE = 1e20; extern unsigned g_uTreeSplitNode1; extern unsigned g_uTreeSplitNode2; // Number of elements in array a[] #define countof(a) (sizeof(a)/sizeof(a[0])) // Maximum of two of any type #define Max2(a, b) ((a) > (b) ? (a) : (b)) // Maximum of three of any type #define Max3(a, b, c) Max2(Max2(a, b), c) // Minimum of two of any type #define Min2(a, b) ((a) < (b) ? (a) : (b)) // Maximum of four of any type #define Max4(a, b, c, d) Max2(Max2(a, b), Max2(c, d)) const double VERY_NEGATIVE_DOUBLE = -9e29; const float VERY_NEGATIVE_FLOAT = (float) -9e29; const double BLOSUM_DIST = 0.62; // todo settable // insane value for uninitialized variables const unsigned uInsane = 8888888; const int iInsane = 8888888; const SCORE scoreInsane = 8888888; const char cInsane = (char) 0xcd; // int 3 instruction, used e.g. for unint. 
memory const double dInsane = VERY_NEGATIVE_DOUBLE; const float fInsane = VERY_NEGATIVE_FLOAT; const char INVALID_STATE = '*'; const BASETYPE BTInsane = (BASETYPE) dInsane; const WEIGHT wInsane = BTInsane; extern double g_dNAN; extern unsigned long g_tStart; void Quit(const char szFormat[], ...); void Warning(const char szFormat[], ...); void TrimBlanks(char szStr[]); void TrimLeadingBlanks(char szStr[]); void TrimTrailingBlanks(char szStr[]); void Log(const char szFormat[], ...); bool Verbose(); const char *ScoreToStr(SCORE Score); const char *ScoreToStrL(SCORE Score); SCORE StrToScore(const char *pszStr); void Break(); double VecSum(const double v[], unsigned n); bool IsValidInteger(const char *Str); bool IsValidSignedInteger(const char *Str); bool IsValidIdentifier(const char *Str); bool IsValidFloatChar(char c); bool isident(char c); bool isidentf(char c); void TreeFromSeqVect(const SeqVect &c, Tree &tree, CLUSTER Cluster, DISTANCE Distance, ROOT Root, const char *SaveFileName = 0); void TreeFromMSA(const MSA &msa, Tree &tree, CLUSTER Cluster, DISTANCE Distance, ROOT Root, const char *SaveFileName = 0); void StripGaps(char szStr[]); void StripWhitespace(char szStr[]); const char *GetTimeAsStr(); unsigned CalcBLOSUMWeights(MSA &Aln, ClusterTree &BlosumCluster); void CalcGSCWeights(MSA &Aln, const ClusterTree &BlosumCluster); void AssertNormalized(const PROB p[]); void AssertNormalizedOrZero(const PROB p[]); void AssertNormalized(const double p[]); bool VectorIsZero(const double dValues[], unsigned n); void VectorSet(double dValues[], unsigned n, double d); bool VectorIsZero(const float dValues[], unsigned n); void VectorSet(float dValues[], unsigned n, float d); // @@TODO should be "not linux" #if _WIN32 double log2(double x); // Defined in on Linux #endif double pow2(double x); double lnTolog2(double ln); double lp2(double x); SCORE SumLog(SCORE x, SCORE y); SCORE SumLog(SCORE x, SCORE y, SCORE z); SCORE SumLog(SCORE w, SCORE x, SCORE y, SCORE z); double 
lp2Fast(double x); double SumLogFast(double x, double y); double SumLogFast(double x, double y, double z); double SumLogFast(double w, double x, double y, double z); void chkmem(const char szMsg[] = ""); void Normalize(PROB p[], unsigned n); void Normalize(PROB p[], unsigned n, double dRequiredTotal); void NormalizeUnlessZero(PROB p[], unsigned n); void DebugPrintf(const char szFormat[], ...); void SetListFileName(const char *ptrListFileName, bool bAppend); void ModelFromAlign(const char *strInputFileName, const char *strModelFileName, double dMaxNIC); double GetMemUseMB(); double GetRAMSizeMB(); double GetPeakMemUseMB(); void CheckMemUse(); const char *ElapsedTimeAsString(); char *SecsToHHMMSS(long lSecs, char szStr[]); double GetCPUGHz(); SCORE GetBlosum62(unsigned uLetterA, unsigned uLetterB); SCORE GetBlosum62d(unsigned uLetterA, unsigned uLetterB); SCORE GetBlosum50(unsigned uLetterA, unsigned uLetterB); void AssertNormalizedDist(const PROB p[], unsigned N); void CmdLineError(const char *Format, ...); void Fatal(const char *Format, ...); void InitCmd(); void ExecCommandLine(int argc, char *argv[]); void DoCmd(); void SetLogFile(); void NameFromPath(const char szPath[], char szName[], unsigned uBytes); char *strsave(const char *s); void DistKmer20_3(const SeqVect &v, DistFunc &DF); void DistKbit20_3(const SeqVect &v, DistFunc &DF); void DistKmer6_6(const SeqVect &v, DistFunc &DF); void DistKmer4_6(const SeqVect &v, DistFunc &DF); void DistPWKimura(const SeqVect &v, DistFunc &DF); void FastDistKmer(const SeqVect &v, DistFunc &DF); void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, DistFunc &DF); double PctIdToMAFFTDist(double dPctId); double KimuraDist(double dPctId); void SetFastParams(); void AssertProfsEq(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB); void ValidateMuscleIds(const MSA &msa); void ValidateMuscleIds(const Tree &tree); void TraceBackToPath(int **TraceBack, unsigned uLengthA, unsigned uLengthB, PWPath &Path); 
void BitTraceBack(char **TraceBack, unsigned uLengthA, unsigned uLengthB, char LastEdge, PWPath &Path); SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path, bool bLockLeft = false, bool bLockRight = false); SCORE AlignTwoProfs( const ProfPos *PA, unsigned uLengthA, WEIGHT wA, const ProfPos *PB, unsigned uLengthB, WEIGHT wB, PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut); void AlignTwoProfsGivenPath(const PWPath &Path, const ProfPos *PA, unsigned uLengthA, WEIGHT wA, const ProfPos *PB, unsigned uLengthB, WEIGHT wB, ProfPos **ptrPOut, unsigned *ptruLengthOut); void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB, MSA &msaCombined); void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB, MSA &msaCombined); SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const PWPath &Path); SCORE GlobalAlignDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignSP(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignSPN(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignLE(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); void CalcThreeWayWeights(const Tree &tree, unsigned uNode1, unsigned uNode2, WEIGHT *Weights); SCORE GlobalAlignSS(const Seq &seqA, const Seq &seqB, PWPath &Path); bool RefineHoriz(MSA &msaIn, const Tree &tree, unsigned uIters, bool bLockLeft, bool bLockRight); bool RefineVert(MSA &msaIn, const Tree &tree, unsigned uIters); SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); void SetInputFileName(const char *pstrFileName); void SetIter(unsigned 
uIter); void IncIter(); void SetMaxIters(unsigned uMaxIters); void Progress(unsigned uStep, unsigned uTotalSteps); void Progress(const char *szFormat, ...); void SetStartTime(); void ProgressStepsDone(); void SetProgressDesc(const char szDesc[]); void SetSeqStats(unsigned uSeqCount, unsigned uMaxL, unsigned uAvgL); void SetNewHandler(); void SaveCurrentAlignment(); void SetCurrentAlignment(MSA &msa); void SetOutputFileName(const char *out); #if DEBUG void SetMuscleSeqVect(SeqVect &v); void SetMuscleInputMSA(MSA &msa); void ValidateMuscleIds(const MSA &msa); void ValidateMuscleIds(const Tree &tree); #else #define SetMuscleSeqVect(x) /* empty */ #define SetMuscleInputMSA(x) /* empty */ #define ValidateMuscleIds(x) /* empty */ #endif void ProcessArgVect(int argc, char *argv[]); void ProcessArgStr(const char *Str); void Usage(); void SetParams(); void SortCounts(const FCOUNT fcCounts[], unsigned SortOrder[]); unsigned ResidueGroupFromFCounts(const FCOUNT fcCounts[]); FCOUNT SumCounts(const FCOUNT Counts[]); bool FlagOpt(const char *Name); const char *ValueOpt(const char *Name); void DoMuscle(); void ProfDB(); void DoSP(); void ProgAlignSubFams(); void Run(); void ListParams(); void OnException(); void SetSeqWeightMethod(SEQWEIGHT Method); SEQWEIGHT GetSeqWeightMethod(); WEIGHT GetMuscleSeqWeightById(unsigned uId); void ListDiagSavings(); void CheckMaxTime(); const char *MaxSecsToStr(); unsigned long GetStartTime(); void ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a); ProgNode *ProgressiveAlignE(const SeqVect &v, const Tree &GuideTree, MSA &a); void CalcDistRangeKmer6_6(const MSA &msa, unsigned uRow, float Dist[]); void CalcDistRangeKmer20_3(const MSA &msa, unsigned uRow, float Dist[]); void CalcDistRangeKmer20_4(const MSA &msa, unsigned uRow, float Dist[]); void CalcDistRangePctIdKimura(const MSA &msa, unsigned uRow, float Dist[]); void CalcDistRangePctIdLog(const MSA &msa, unsigned uRow, float Dist[]); void MakeRootMSA(const SeqVect &v, const Tree 
&GuideTree, ProgNode Nodes[], MSA &a); void MakeRootMSABrenner(SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a); void Refine(); void Local(); void Profile(); void PPScore(); void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage); char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, bool DeleteGaps = true); SCORE SW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); void TraceBackSW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, unsigned uPrefixLengthAMax, unsigned uPrefixLengthBMax, PWPath &Path); void DiffPaths(const PWPath &p1, const PWPath &p2, unsigned Edges1[], unsigned *ptruDiffCount1, unsigned Edges2[], unsigned *ptruDiffCount2); void SetPPScore(bool bRespectFlagOpts = true); void SetPPScore(PPSCORE p); SCORE GlobalAlignDimer(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); bool MissingCommand(); void Credits(); void ProfileProfile(MSA &msa1, MSA &msa2, MSA &msaOut); void MHackStart(SeqVect &v); void MHackEnd(MSA &msa); void WriteScoreFile(const MSA &msa); char ConsensusChar(const ProfPos &PP); void Stabilize(const MSA &msa, MSA &msaStable); void MuscleOutput(MSA &msa); PTR_SCOREMATRIX ReadMx(TextFile &File); void MemPlus(size_t Bytes, char *Where); void MemMinus(size_t Bytes, char *Where); muscle-3.8.31.orig/clustset.h0000644000175000017500000000110011352261612015401 0ustar kratzcharles#ifndef ClustSet_h #define ClustSet_h enum JOIN; enum LINKAGE; class Clust; class ClustSet { public: virtual unsigned GetLeafCount() = 0; virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1, unsigned uNodeIndex2) = 0; virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex, unsigned uRightNodeIndex, unsigned uJoinedNodeIndex, double *ptrdLeftLength, double *ptrdRightLength) = 0; virtual const char *GetLeafName(unsigned uNodeIndex) = 0; virtual unsigned 
GetLeafId(unsigned uNodeIndex) = 0; }; #endif // ClustSet_h muscle-3.8.31.orig/seqvect.cpp0000644000175000017500000001433011352261667015563 0ustar kratzcharles#include "muscle.h" #include "seqvect.h" #include "textfile.h" #include "msa.h" const size_t MAX_FASTA_LINE = 16000; SeqVect::~SeqVect() { Clear(); } void SeqVect::Clear() { for (size_t n = 0; n < size(); ++n) delete (*this)[n]; } void SeqVect::ToFASTAFile(TextFile &File) const { unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); ptrSeq->ToFASTAFile(File); } } void SeqVect::FromFASTAFile(TextFile &File) { Clear(); FILE *f = File.GetStdioFile(); for (;;) { char *Label; unsigned uLength; char *SeqData = GetFastaSeq(f, &uLength, &Label); if (0 == SeqData) return; Seq *ptrSeq = new Seq; for (unsigned i = 0; i < uLength; ++i) { char c = SeqData[i]; ptrSeq->push_back(c); } ptrSeq->SetName(Label); push_back(ptrSeq); delete[] SeqData; delete[] Label; } } void SeqVect::PadToMSA(MSA &msa) { unsigned uSeqCount = Length(); if (0 == uSeqCount) { msa.Clear(); return; } unsigned uLongestSeqLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); unsigned uColCount = ptrSeq->Length(); if (uColCount > uLongestSeqLength) uLongestSeqLength = uColCount; } msa.SetSize(uSeqCount, uLongestSeqLength); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); msa.SetSeqName(uSeqIndex, ptrSeq->GetName()); unsigned uColCount = ptrSeq->Length(); unsigned uColIndex; for (uColIndex = 0; uColIndex < uColCount; ++uColIndex) { char c = ptrSeq->at(uColIndex); msa.SetChar(uSeqIndex, uColIndex, c); } while (uColIndex < uLongestSeqLength) msa.SetChar(uSeqIndex, uColIndex++, '.'); } } void SeqVect::Copy(const SeqVect &rhs) { clear(); unsigned uSeqCount = rhs.Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = rhs.at(uSeqIndex); Seq *ptrSeqCopy = 
new Seq; ptrSeqCopy->Copy(*ptrSeq); push_back(ptrSeqCopy); } } void SeqVect::StripGaps() { unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); ptrSeq->StripGaps(); } } void SeqVect::StripGapsAndWhitespace() { unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); ptrSeq->StripGapsAndWhitespace(); } } void SeqVect::ToUpper() { unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); ptrSeq->ToUpper(); } } bool SeqVect::FindName(const char *ptrName, unsigned *ptruIndex) const { unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const Seq *ptrSeq = at(uSeqIndex); if (0 == stricmp(ptrSeq->GetName(), ptrName)) { *ptruIndex = uSeqIndex; return true; } } return false; } void SeqVect::AppendSeq(const Seq &s) { Seq *ptrSeqCopy = new Seq; ptrSeqCopy->Copy(s); push_back(ptrSeqCopy); } void SeqVect::LogMe() const { unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const Seq *ptrSeq = at(uSeqIndex); ptrSeq->LogMe(); } } const char *SeqVect::GetSeqName(unsigned uSeqIndex) const { assert(uSeqIndex < size()); const Seq *ptrSeq = at(uSeqIndex); return ptrSeq->GetName(); } unsigned SeqVect::GetSeqId(unsigned uSeqIndex) const { assert(uSeqIndex < size()); const Seq *ptrSeq = at(uSeqIndex); return ptrSeq->GetId(); } unsigned SeqVect::GetSeqIdFromName(const char *Name) const { const unsigned uSeqCount = GetSeqCount(); for (unsigned i = 0; i < uSeqCount; ++i) { if (!strcmp(Name, GetSeqName(i))) return GetSeqId(i); } Quit("SeqVect::GetSeqIdFromName(%s): not found", Name); return 0; } Seq &SeqVect::GetSeqById(unsigned uId) { const unsigned uSeqCount = GetSeqCount(); for (unsigned i = 0; i < uSeqCount; ++i) { if (GetSeqId(i) == uId) return GetSeq(i); } 
Quit("SeqVect::GetSeqIdByUd(%d): not found", uId); return (Seq &) *((Seq *) 0); } unsigned SeqVect::GetSeqLength(unsigned uSeqIndex) const { assert(uSeqIndex < size()); const Seq *ptrSeq = at(uSeqIndex); return ptrSeq->Length(); } Seq &SeqVect::GetSeq(unsigned uSeqIndex) { assert(uSeqIndex < size()); return *at(uSeqIndex); } const Seq &SeqVect::GetSeq(unsigned uSeqIndex) const { assert(uSeqIndex < size()); return *at(uSeqIndex); } void SeqVect::SetSeqId(unsigned uSeqIndex, unsigned uId) { assert(uSeqIndex < size()); Seq *ptrSeq = at(uSeqIndex); return ptrSeq->SetId(uId); } ALPHA SeqVect::GuessAlpha() const { // If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap // letters belong to the nucleotide alphabet, guess nucleo. // Otherwise amino. const unsigned CHAR_COUNT = 100; const unsigned MIN_NUCLEO_PCT = 95; const unsigned uSeqCount = GetSeqCount(); if (0 == uSeqCount) return ALPHA_Amino; unsigned uSeqIndex = 0; unsigned uPos = 0; unsigned uSeqLength = GetSeqLength(0); unsigned uDNACount = 0; unsigned uRNACount = 0; unsigned uTotal = 0; const Seq *ptrSeq = &GetSeq(0); for (;;) { while (uPos >= uSeqLength) { ++uSeqIndex; if (uSeqIndex >= uSeqCount) break; ptrSeq = &GetSeq(uSeqIndex); uSeqLength = ptrSeq->Length(); uPos = 0; } if (uSeqIndex >= uSeqCount) break; char c = ptrSeq->at(uPos++); if (IsGapChar(c)) continue; if (IsDNA(c)) ++uDNACount; if (IsRNA(c)) ++uRNACount; ++uTotal; if (uTotal >= CHAR_COUNT) break; } if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_DNA; if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_RNA; return ALPHA_Amino; } void SeqVect::FixAlpha() { ClearInvalidLetterWarning(); unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); ptrSeq->FixAlpha(); } ReportInvalidLetters(); } muscle-3.8.31.orig/releases.txt0000644000175000017500000000031311352262310015727 0ustar kratzcharlesver=2.01 rev=1 ver=2.10 rev=3 ver=3.00 
rev=5 ver=3.20 rev=7 ver=3.30 rev=9 ver=3.41 rev=11 ver=3.40 rev=12 ver=3.51 rev=14 ver=3.52 rev=16 ver=3.50 rev=17 ver=3.60 rev=19 ver=3.70 rev=21 ver=3.80 rev=22 muscle-3.8.31.orig/scorehistory.h0000644000175000017500000000077411352261600016305 0ustar kratzcharles#ifndef ScoreHistory_h #define ScoreHistory_h class ScoreHistory { public: ScoreHistory(unsigned uIters, unsigned uInternalNodeCount); ~ScoreHistory(); bool SetScore(unsigned uIter, unsigned uInternalNodeIndex, bool bRight, SCORE Score); void LogMe() const; SCORE GetScore(unsigned uIter, unsigned uInternalNodeIndex, bool bReversed, bool bRight) const; private: SCORE **m_Score; bool **m_bScoreSet; unsigned m_uIters; unsigned m_uNodeCount; }; #endif // ScoreHistory_h muscle-3.8.31.orig/nwdasimple2.cpp0000644000175000017500000003403511352261666016341 0ustar kratzcharles#include "muscle.h" #include "pwpath.h" #include "profile.h" #if DOUBLE_AFFINE #define TRACE 0 extern bool g_bKeepSimpleDP; extern SCORE *g_DPM; extern SCORE *g_DPD; extern SCORE *g_DPE; extern SCORE *g_DPI; extern SCORE *g_DPJ; extern char *g_TBM; extern char *g_TBD; extern char *g_TBE; extern char *g_TBI; extern char *g_TBJ; static char XlatEdgeType(char c) { if ('E' == c) return 'D'; if ('J' == c) return 'I'; return c; } static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -100000) return " *"; sprintf(str, "%6.1f", s); return str; } static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; 
uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %6c", TBM(uPrefixLengthA, uPrefixLengthB)); Log("\n"); } } static void ListDPM(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { SCORE x = (uPrefixLengthA + uPrefixLengthB)*g_scoreGapExtend; SCORE s = DPM(uPrefixLengthA, uPrefixLengthB) - x; Log(" %s", LocalScoreToStr(s)); } Log("\n"); } } extern SCORE ScoreProfPos2(const ProfPos &PP, const ProfPos &PPB); SCORE NWDASimple2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; // Allocate DP matrices const size_t LM = 
uPrefixCountA*uPrefixCountB; SCORE *DPM_ = new SCORE[LM]; SCORE *DPD_ = new SCORE[LM]; SCORE *DPE_ = new SCORE[LM]; SCORE *DPI_ = new SCORE[LM]; SCORE *DPJ_ = new SCORE[LM]; SCORE *DPL_ = new SCORE[LM]; char *TBM_ = new char[LM]; char *TBD_ = new char[LM]; char *TBE_ = new char[LM]; char *TBI_ = new char[LM]; char *TBJ_ = new char[LM]; memset(DPM_, 0, LM*sizeof(SCORE)); memset(DPD_, 0, LM*sizeof(SCORE)); memset(DPE_, 0, LM*sizeof(SCORE)); memset(DPI_, 0, LM*sizeof(SCORE)); memset(DPJ_, 0, LM*sizeof(SCORE)); // memset(DPL_, 0, LM*sizeof(SCORE)); memset(TBM_, '?', LM); memset(TBD_, '?', LM); memset(TBE_, '?', LM); memset(TBI_, '?', LM); memset(TBJ_, '?', LM); DPM(0, 0) = 0; DPD(0, 0) = MINUS_INFINITY; DPE(0, 0) = MINUS_INFINITY; DPI(0, 0) = MINUS_INFINITY; DPJ(0, 0) = MINUS_INFINITY; DPM(1, 0) = MINUS_INFINITY; DPD(1, 0) = PA[0].m_scoreGapOpen; DPE(1, 0) = PA[0].m_scoreGapOpen2; DPI(1, 0) = MINUS_INFINITY; DPJ(1, 0) = MINUS_INFINITY; DPM(0, 1) = MINUS_INFINITY; DPD(0, 1) = MINUS_INFINITY; DPE(0, 1) = MINUS_INFINITY; DPI(0, 1) = PB[0].m_scoreGapOpen; DPJ(0, 1) = PB[0].m_scoreGapOpen2; // Empty prefix of B is special case for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { // M=LetterA+LetterB, impossible with empty prefix DPM(uPrefixLengthA, 0) = MINUS_INFINITY; // D=LetterA+GapB DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend; TBD(uPrefixLengthA, 0) = 'D'; DPE(uPrefixLengthA, 0) = DPE(uPrefixLengthA - 1, 0) + g_scoreGapExtend2; TBE(uPrefixLengthA, 0) = 'E'; // I=GapA+LetterB, impossible with empty prefix DPI(uPrefixLengthA, 0) = MINUS_INFINITY; DPJ(uPrefixLengthA, 0) = MINUS_INFINITY; } // Empty prefix of A is special case for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { // M=LetterA+LetterB, impossible with empty prefix DPM(0, uPrefixLengthB) = MINUS_INFINITY; // D=LetterA+GapB, impossible with empty prefix DPD(0, uPrefixLengthB) = MINUS_INFINITY; DPE(0, uPrefixLengthB) = 
MINUS_INFINITY; // I=GapA+LetterB DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend; TBI(0, uPrefixLengthB) = 'I'; DPJ(0, uPrefixLengthB) = DPJ(0, uPrefixLengthB - 1) + g_scoreGapExtend2; TBJ(0, uPrefixLengthB) = 'J'; } // ============ // Main DP loop // ============ for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreGapCloseB; if (uPrefixLengthB == 1) scoreGapCloseB = MINUS_INFINITY; else scoreGapCloseB = PB[uPrefixLengthB-2].m_scoreGapClose; SCORE scoreGapClose2B; if (uPrefixLengthB == 1) scoreGapClose2B = MINUS_INFINITY; else scoreGapClose2B = PB[uPrefixLengthB-2].m_scoreGapClose2; for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; { // Match M=LetterA+LetterB SCORE scoreLL = ScoreProfPos2(PPA, PPB); DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL; SCORE scoreGapCloseA; if (uPrefixLengthA == 1) scoreGapCloseA = MINUS_INFINITY; else scoreGapCloseA = PA[uPrefixLengthA-2].m_scoreGapClose; SCORE scoreGapClose2A; if (uPrefixLengthA == 1) scoreGapClose2A = MINUS_INFINITY; else scoreGapClose2A = PA[uPrefixLengthA-2].m_scoreGapClose2; SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1); SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA; SCORE scoreEM = DPE(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2A; SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB; SCORE scoreJM = DPJ(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2B; SCORE scoreBest; if (scoreMM >= scoreDM && scoreMM >= scoreIM && scoreMM >= scoreEM && scoreMM >= scoreJM) { scoreBest = scoreMM; TBM(uPrefixLengthA, uPrefixLengthB) = 'M'; } else if (scoreDM >= scoreMM && scoreDM >= scoreIM && scoreDM >= scoreEM && scoreDM >= scoreJM) { scoreBest = scoreDM; TBM(uPrefixLengthA, uPrefixLengthB) = 'D'; } else if (scoreEM >= scoreMM && scoreEM >= scoreIM && scoreEM >= 
scoreDM && scoreEM >= scoreJM) { scoreBest = scoreEM; TBM(uPrefixLengthA, uPrefixLengthB) = 'E'; } else if (scoreIM >= scoreMM && scoreIM >= scoreDM && scoreIM >= scoreEM && scoreIM >= scoreJM) { scoreBest = scoreIM; TBM(uPrefixLengthA, uPrefixLengthB) = 'I'; } else if (scoreJM >= scoreMM && scoreJM >= scoreDM && scoreJM >= scoreEM && scoreJM >= scoreIM) { scoreBest = scoreJM; TBM(uPrefixLengthA, uPrefixLengthB) = 'J'; } else Quit("Max failed (M)"); DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; } { // Delete D=LetterA+GapB SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen; SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend; SCORE scoreBest; if (scoreMD >= scoreDD) { scoreBest = scoreMD; TBD(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreDD >= scoreMD); scoreBest = scoreDD; TBD(uPrefixLengthA, uPrefixLengthB) = 'D'; } DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; } { // Delete E=LetterA+GapB SCORE scoreME = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen2; SCORE scoreEE = DPE(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend2; SCORE scoreBest; if (scoreME >= scoreEE) { scoreBest = scoreME; TBE(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreEE >= scoreME); scoreBest = scoreEE; TBE(uPrefixLengthA, uPrefixLengthB) = 'E'; } DPE(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert I=GapA+LetterB { SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB-1].m_scoreGapOpen; SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend; SCORE scoreBest; if (scoreMI >= scoreII) { scoreBest = scoreMI; TBI(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreII > scoreMI); scoreBest = scoreII; TBI(uPrefixLengthA, uPrefixLengthB) = 'I'; } DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert J=GapA+LetterB { SCORE scoreMJ = DPM(uPrefixLengthA, uPrefixLengthB-1) + 
PB[uPrefixLengthB-1].m_scoreGapOpen2; SCORE scoreJJ = DPJ(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend2; SCORE scoreBest; if (scoreMJ > scoreJJ) { scoreBest = scoreMJ; TBJ(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreJJ >= scoreMJ); scoreBest = scoreJJ; TBJ(uPrefixLengthA, uPrefixLengthB) = 'J'; } DPJ(uPrefixLengthA, uPrefixLengthB) = scoreBest; } } } // Special case: close gaps at end of alignment DPD(uLengthA, uLengthB) += PA[uLengthA-1].m_scoreGapClose; DPE(uLengthA, uLengthB) += PA[uLengthA-1].m_scoreGapClose2; DPI(uLengthA, uLengthB) += PB[uLengthB-1].m_scoreGapClose; DPJ(uLengthA, uLengthB) += PB[uLengthB-1].m_scoreGapClose2; #if TRACE Log("DPL:\n"); ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPM:\n"); ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPD:\n"); ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPE:\n"); ListDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPI:\n"); ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPJ:\n"); ListDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBM:\n"); ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBD:\n"); ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBE:\n"); ListTB(TBE_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBI:\n"); ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBJ:\n"); ListTB(TBJ_, PA, PB, uPrefixCountA, uPrefixCountB); #endif // ========== // Trace-back // ========== Path.Clear(); // Find last edge char cEdgeType = '?'; SCORE BestScore = MINUS_INFINITY; SCORE M = DPM(uLengthA, uLengthB); SCORE D = DPD(uLengthA, uLengthB); SCORE E = DPE(uLengthA, uLengthB); SCORE I = DPI(uLengthA, uLengthB); SCORE J = DPJ(uLengthA, uLengthB); if (M >= D && M >= E && M >= I && M >= J) { cEdgeType = 'M'; BestScore = M; } else if (D >= M && D >= E && D >= I && D >= J) { cEdgeType = 'D'; BestScore = D; } else if (E >= M && E >= D && E >= I && E >= J) { cEdgeType = 'E'; BestScore = E; } else if (I >= M && I >= D && I >= 
E && I >= J) { cEdgeType = 'I'; BestScore = I; } else if (J >= M && J >= D && J >= E && J >= I) { cEdgeType = 'J'; BestScore = J; } else Quit("Bad max"); unsigned PLA = uLengthA; unsigned PLB = uLengthB; unsigned ECount = 0; unsigned JCount = 0; for (;;) { #if TRACE Log("TraceBack: %c%u.%u\n", cEdgeType, PLA, PLB); #endif PWEdge Edge; Edge.cType = XlatEdgeType(cEdgeType); Edge.uPrefixLengthA = PLA; Edge.uPrefixLengthB = PLB; Path.PrependEdge(Edge); switch (cEdgeType) { case 'M': assert(PLA > 0); assert(PLB > 0); cEdgeType = TBM(PLA, PLB); --PLA; --PLB; break; case 'D': assert(PLA > 0); cEdgeType = TBD(PLA, PLB); --PLA; break; case 'E': ++ECount; assert(PLA > 0); cEdgeType = TBE(PLA, PLB); --PLA; break; case 'I': assert(PLB > 0); cEdgeType = TBI(PLA, PLB); --PLB; break; case 'J': ++JCount; assert(PLB > 0); cEdgeType = TBJ(PLA, PLB); --PLB; break; default: Quit("Invalid edge %c", cEdgeType); } if (0 == PLA && 0 == PLB) break; } //if (ECount > 0 || JCount > 0) // fprintf(stderr, "E=%d J=%d\n", ECount, JCount); Path.Validate(); if (Path.GetMatchCount() + Path.GetDeleteCount() != uLengthA) Quit("Path count A"); if (Path.GetMatchCount() + Path.GetInsertCount() != uLengthB) Quit("Path count B"); if (g_bKeepSimpleDP) { g_DPM = DPM_; g_DPD = DPD_; g_DPE = DPE_; g_DPI = DPI_; g_DPJ = DPJ_; g_TBM = TBM_; g_TBD = TBD_; g_TBE = TBE_; g_TBI = TBI_; g_TBJ = TBJ_; } else { delete[] DPM_; delete[] DPD_; delete[] DPE_; delete[] DPI_; delete[] DPJ_; delete[] TBM_; delete[] TBD_; delete[] TBE_; delete[] TBI_; delete[] TBJ_; } #if TRACE Log("BestScore=%.6g\n", BestScore); #endif return BestScore; } #endif // DOUBLE_AFFINE muscle-3.8.31.orig/threewaywt.cpp0000644000175000017500000002426111352261600016303 0ustar kratzcharles#include "muscle.h" #include "tree.h" #include #define TRACE 0 /*** Sequence weights derived from a tree using Gotoh's three-way method. Gotoh (1995) CABIOS 11(5), 543-51. Each edge e is assigned a weight w(e). 
Consider first a tree with three leaves A, B and C having branch
lengths a, b and c, as follows.

            B
            |
            b
            |
    A---a---R---c---C

The internal node is denoted by R. Define:

    S = (bc + ca + ab)
    x = bc(a + b)(a + c)
    y = a(b + c)FS

Here F is a tunable normalization factor which is approximately
1.0. Then the edge weight for AR is computed as:

    w(AR) = sqrt(x/y)

Similar expressions for the other edges follow by symmetry.

For a tree with more than three edges, the weight of an edge
that ends in a leaf is computed from the three-way tree that
includes the edge and its two neighbors. The weight of an
internal edge is computed as the product of the weights for
that edge derived from the two three-way subtrees that include
that edge.

For example, consider the following tree.

           B
           |
        A--R--V--C
              |
              D

Here, w(RV) is computed as the product of the two values for
w(RV) derived from the three-way trees with leaves ABV and RCD
respectively.

The calculation is done using "Gotoh lengths", not the real edge
lengths. The Gotoh length G of a directed edge is calculated
recursively as:

    G = d + LR/(L + R)

where d is the length of the edge, and L and R are the Gotoh
lengths of the left and right edges adjoining the terminal end
of the edge. If the edge terminates on a leaf, then G=d.

Pairwise sequence weights are computed as the product of edge
weights on the path that connects their leaves.

If the tree is split into two subtrees by deleting a given edge e,
then the pairwise weights factorize. For operations on profiles
formed from the two subtrees, it is possible to assign a weight
to a sequence as the product of edge weights on a path from e to
its leaf.
***/

// The xxxUnrooted functions present a rooted tree as
// if it had been unrooted by deleting the root node.
static unsigned GetFirstNeighborUnrooted(const Tree &tree, unsigned uNode1, unsigned uNode2) { if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) Quit("GetFirstNeighborUnrooted, should never be called with root"); if (!tree.IsEdge(uNode1, uNode2)) { if (!tree.IsRoot(tree.GetParent(uNode1)) || !tree.IsRoot(tree.GetParent(uNode2))) Quit("GetFirstNeighborUnrooted, not edge"); const unsigned uRoot = tree.GetRootNodeIndex(); return tree.GetFirstNeighbor(uNode1, uRoot); } unsigned uNeighbor = tree.GetFirstNeighbor(uNode1, uNode2); if (tree.IsRoot(uNeighbor)) return tree.GetFirstNeighbor(uNeighbor, uNode1); return uNeighbor; } static unsigned GetSecondNeighborUnrooted(const Tree &tree, unsigned uNode1, unsigned uNode2) { if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) Quit("GetFirstNeighborUnrooted, should never be called with root"); if (!tree.IsEdge(uNode1, uNode2)) { if (!tree.IsRoot(tree.GetParent(uNode1)) || !tree.IsRoot(tree.GetParent(uNode2))) Quit("GetFirstNeighborUnrooted, not edge"); const unsigned uRoot = tree.GetRootNodeIndex(); return tree.GetSecondNeighbor(uNode1, uRoot); } unsigned uNeighbor = tree.GetSecondNeighbor(uNode1, uNode2); if (tree.IsRoot(uNeighbor)) return tree.GetFirstNeighbor(uNeighbor, uNode1); return uNeighbor; } static unsigned GetNeighborUnrooted(const Tree &tree, unsigned uNode1, unsigned uSub) { unsigned uNeighbor = tree.GetNeighbor(uNode1, uSub); if (tree.IsRoot(uNeighbor)) return tree.GetFirstNeighbor(uNeighbor, uNode1); return uNeighbor; } static unsigned GetNeighborSubscriptUnrooted(const Tree &tree, unsigned uNode1, unsigned uNode2) { if (tree.IsEdge(uNode1, uNode2)) return tree.GetNeighborSubscript(uNode1, uNode2); if (!tree.IsRoot(tree.GetParent(uNode1)) || !tree.IsRoot(tree.GetParent(uNode2))) Quit("GetNeighborSubscriptUnrooted, not edge"); for (unsigned uSub = 0; uSub < 3; ++uSub) if (GetNeighborUnrooted(tree, uNode1, uSub) == uNode2) return uSub; Quit("GetNeighborSubscriptUnrooted, not a neighbor"); return NULL_NEIGHBOR; } static 
double GetEdgeLengthUnrooted(const Tree &tree, unsigned uNode1, unsigned uNode2) { if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) Quit("GetEdgeLengthUnrooted, should never be called with root"); if (!tree.IsEdge(uNode1, uNode2)) { if (!tree.IsRoot(tree.GetParent(uNode1)) || !tree.IsRoot(tree.GetParent(uNode2))) Quit("GetEdgeLengthUnrooted, not edge"); const unsigned uRoot = tree.GetRootNodeIndex(); return tree.GetEdgeLength(uNode1, uRoot) + tree.GetEdgeLength(uNode2, uRoot); } return tree.GetEdgeLength(uNode1, uNode2); } double GetGotohLength(const Tree &tree, unsigned R, unsigned A) { double dThis = GetEdgeLengthUnrooted(tree, R, A); // Enforce non-negative edge lengths if (dThis < 0) dThis = 0; if (tree.IsLeaf(A)) return dThis; const unsigned uFirst = GetFirstNeighborUnrooted(tree, A, R); const unsigned uSecond = GetSecondNeighborUnrooted(tree, A, R); const double dFirst = GetGotohLength(tree, A, uFirst); const double dSecond = GetGotohLength(tree, A, uSecond); const double dSum = dFirst + dSecond; const double dThird = dSum == 0 ? 0 : (dFirst*dSecond)/dSum; return dThis + dThird; } // Return weight of edge A-R in three-way subtree that has // leaves A,B,C and internal node R. static double GotohWeightThreeWay(const Tree &tree, unsigned A, unsigned B, unsigned C, unsigned R) { const double F = 1.0; if (tree.IsLeaf(R)) Quit("GotohThreeWay: R must be internal node"); double a = GetGotohLength(tree, R, A); double b = GetGotohLength(tree, R, B); double c = GetGotohLength(tree, R, C); double S = b*c + c*a + a*b; double x = b*c*(a + b)*(a + c); double y = a*(b + c)*F*S; // y is zero iff all three branch lengths are zero. 
if (y < 0.001) return 1.0; return sqrt(x/y); } static double GotohWeightEdge(const Tree &tree, unsigned uNodeIndex1, unsigned uNodeIndex2) { double w1 = 1.0; double w2 = 1.0; if (!tree.IsLeaf(uNodeIndex1)) { unsigned R = uNodeIndex1; unsigned A = uNodeIndex2; unsigned B = GetFirstNeighborUnrooted(tree, R, A); unsigned C = GetSecondNeighborUnrooted(tree, R, A); w1 = GotohWeightThreeWay(tree, A, B, C, R); } if (!tree.IsLeaf(uNodeIndex2)) { unsigned R = uNodeIndex2; unsigned A = uNodeIndex1; unsigned B = GetFirstNeighborUnrooted(tree, R, A); unsigned C = GetSecondNeighborUnrooted(tree, R, A); w2 = GotohWeightThreeWay(tree, A, B, C, R); } return w1*w2; } void CalcThreeWayEdgeWeights(const Tree &tree, WEIGHT **EdgeWeights) { const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex1 = 0; uNodeIndex1 < uNodeCount; ++uNodeIndex1) { if (tree.IsRoot(uNodeIndex1)) continue; for (unsigned uSub1 = 0; uSub1 < 3; ++uSub1) { const unsigned uNodeIndex2 = GetNeighborUnrooted(tree, uNodeIndex1, uSub1); if (NULL_NEIGHBOR == uNodeIndex2) continue; // Avoid computing same edge twice in reversed order if (uNodeIndex2 < uNodeIndex1) continue; const WEIGHT w = (WEIGHT) GotohWeightEdge(tree, uNodeIndex1, uNodeIndex2); const unsigned uSub2 = GetNeighborSubscriptUnrooted(tree, uNodeIndex2, uNodeIndex1); #if DEBUG { assert(uNodeIndex2 == GetNeighborUnrooted(tree, uNodeIndex1, uSub1)); assert(uNodeIndex1 == GetNeighborUnrooted(tree, uNodeIndex2, uSub2)); const WEIGHT wRev = (WEIGHT) GotohWeightEdge(tree, uNodeIndex2, uNodeIndex1); if (!BTEq(w, wRev)) Quit("CalcThreeWayWeights: rev check failed %g %g", w, wRev); } #endif EdgeWeights[uNodeIndex1][uSub1] = w; EdgeWeights[uNodeIndex2][uSub2] = w; } } } static void SetSeqWeights(const Tree &tree, unsigned uNode1, unsigned uNode2, double dPathWeight, WEIGHT *Weights) { if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) Quit("SetSeqWeights, should never be called with root"); const double dThisLength = GetEdgeLengthUnrooted(tree, uNode1, 
uNode2); if (tree.IsLeaf(uNode2)) { const unsigned Id = tree.GetLeafId(uNode2); Weights[Id] = (WEIGHT) (dPathWeight + dThisLength); return; } const unsigned uFirst = GetFirstNeighborUnrooted(tree, uNode2, uNode1); const unsigned uSecond = GetSecondNeighborUnrooted(tree, uNode2, uNode1); dPathWeight *= dThisLength; SetSeqWeights(tree, uNode2, uFirst, dPathWeight, Weights); SetSeqWeights(tree, uNode2, uSecond, dPathWeight, Weights); } void CalcThreeWayWeights(const Tree &tree, unsigned uNode1, unsigned uNode2, WEIGHT *Weights) { #if TRACE Log("CalcThreeWayEdgeWeights\n"); tree.LogMe(); #endif if (tree.IsRoot(uNode1)) uNode1 = tree.GetFirstNeighbor(uNode1, uNode2); else if (tree.IsRoot(uNode2)) uNode2 = tree.GetFirstNeighbor(uNode2, uNode1); const unsigned uNodeCount = tree.GetNodeCount(); WEIGHT **EdgeWeights = new WEIGHT *[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) EdgeWeights[uNodeIndex] = new WEIGHT[3]; CalcThreeWayEdgeWeights(tree, EdgeWeights); #if TRACE { Log("Node1 Node2 Length Gotoh EdgeWt\n"); Log("----- ----- ------ ------ ------\n"); for (unsigned uNodeIndex1 = 0; uNodeIndex1 < uNodeCount; ++uNodeIndex1) { if (tree.IsRoot(uNodeIndex1)) continue; for (unsigned uSub1 = 0; uSub1 < 3; ++uSub1) { const unsigned uNodeIndex2 = GetNeighborUnrooted(tree, uNodeIndex1, uSub1); if (NULL_NEIGHBOR == uNodeIndex2) continue; if (uNodeIndex2 < uNodeIndex1) continue; const WEIGHT ew = EdgeWeights[uNodeIndex1][uSub1]; const double d = GetEdgeLengthUnrooted(tree, uNodeIndex1, uNodeIndex2); const double g = GetGotohLength(tree, uNodeIndex1, uNodeIndex2); Log("%5u %5u %6.3f %6.3f %6.3f\n", uNodeIndex1, uNodeIndex2, d, g, ew); } } } #endif SetSeqWeights(tree, uNode1, uNode2, 0.0, Weights); SetSeqWeights(tree, uNode2, uNode1, 0.0, Weights); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) delete[] EdgeWeights[uNodeIndex]; delete[] EdgeWeights; } 
muscle-3.8.31.orig/refinesubfams.cpp0000644000175000017500000001355711352261620016741 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "tree.h" #include "clust.h" #include "profile.h" #include "pwpath.h" #define TRACE 0 static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[], unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa); // Identify subfamilies in a tree. // Returns array of internal node indexes, one for each subfamily. // First try is to select groups by height (which should approximate // minimum percent identity), if this gives too many subfamilies then // we cut at a point that gives the maximum allowed number of subfams. static void GetSubfams(const Tree &tree, double dMaxHeight, unsigned uMaxSubfamCount, unsigned **ptrptrSubfams, unsigned *ptruSubfamCount) { const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Subfams = new unsigned[uNodeCount]; unsigned uSubfamCount; ClusterByHeight(tree, dMaxHeight, Subfams, &uSubfamCount); if (uSubfamCount > uMaxSubfamCount) ClusterBySubfamCount(tree, uMaxSubfamCount, Subfams, &uSubfamCount); *ptrptrSubfams = Subfams; *ptruSubfamCount = uSubfamCount; } static void LogSubfams(const Tree &tree, const unsigned Subfams[], unsigned uSubfamCount) { const unsigned uNodeCount = tree.GetNodeCount(); Log("%u subfamilies found\n", uSubfamCount); Log("Subfam Sequence\n"); Log("------ --------\n"); unsigned *Leaves = new unsigned[uNodeCount]; for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uSubfamNodeIndex = Subfams[uSubfamIndex]; unsigned uLeafCount; GetLeaves(tree, uSubfamNodeIndex, Leaves, &uLeafCount); for (unsigned uLeafIndex = 0; uLeafIndex < uLeafCount; ++uLeafIndex) Log("%6u %s\n", uSubfamIndex + 1, tree.GetLeafName(Leaves[uLeafIndex])); Log("\n"); } delete[] Leaves; } bool RefineSubfams(MSA &msa, const Tree &tree, unsigned uIters) { const unsigned uSeqCount = msa.GetSeqCount(); if (uSeqCount < 3) return false; const double 
dMaxHeight = 0.6; const unsigned uMaxSubfamCount = 16; const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Subfams; unsigned uSubfamCount; GetSubfams(tree, dMaxHeight, uMaxSubfamCount, &Subfams, &uSubfamCount); assert(uSubfamCount <= uSeqCount); if (g_bVerbose) LogSubfams(tree, Subfams, uSubfamCount); MSA *SubfamMSAs = new MSA[uSubfamCount]; unsigned *Leaves = new unsigned[uSeqCount]; unsigned *Ids = new unsigned[uSeqCount]; bool bAnyChanges = false; for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uSubfam = Subfams[uSubfamIndex]; unsigned uLeafCount; GetLeaves(tree, uSubfam, Leaves, &uLeafCount); assert(uLeafCount <= uSeqCount); LeafIndexesToIds(tree, Leaves, uLeafCount, Ids); MSA &msaSubfam = SubfamMSAs[uSubfamIndex]; MSASubsetByIds(msa, Ids, uLeafCount, msaSubfam); DeleteGappedCols(msaSubfam); #if TRACE Log("Subfam %u MSA=\n", uSubfamIndex); msaSubfam.LogMe(); #endif if (msaSubfam.GetSeqCount() <= 2) continue; // TODO ///////////////////////////////////////// // Try using existing tree, may actually hurt to // re-estimate, may also be a waste of CPU & mem. 
///////////////////////////////////////////////// Tree SubfamTree; TreeFromMSA(msaSubfam, SubfamTree, g_Cluster2, g_Distance2, g_Root2); bool bAnyChangesThisSubfam; if (g_bAnchors) bAnyChangesThisSubfam = RefineVert(msaSubfam, SubfamTree, uIters); else bAnyChangesThisSubfam = RefineHoriz(msaSubfam, SubfamTree, uIters, false, false); #if TRACE Log("Subfam %u Changed %d\n", uSubfamIndex, bAnyChangesThisSubfam); #endif if (bAnyChangesThisSubfam) bAnyChanges = true; } if (bAnyChanges) ProgressiveAlignSubfams(tree, Subfams, uSubfamCount, SubfamMSAs, msa); delete[] Leaves; delete[] Subfams; delete[] SubfamMSAs; return bAnyChanges; } static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[], unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa) { const unsigned uNodeCount = tree.GetNodeCount(); bool *Ready = new bool[uNodeCount]; MSA **MSAs = new MSA *[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { Ready[uNodeIndex] = false; MSAs[uNodeIndex] = 0; } for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uNodeIndex = Subfams[uSubfamIndex]; Ready[uNodeIndex] = true; MSA *ptrMSA = new MSA; // TODO: Wasteful copy, needs re-design ptrMSA->Copy(SubfamMSAs[uSubfamIndex]); MSAs[uNodeIndex] = ptrMSA; } for (unsigned uNodeIndex = tree.FirstDepthFirstNode(); NULL_NEIGHBOR != uNodeIndex; uNodeIndex = tree.NextDepthFirstNode(uNodeIndex)) { if (tree.IsLeaf(uNodeIndex)) continue; unsigned uRight = tree.GetRight(uNodeIndex); unsigned uLeft = tree.GetLeft(uNodeIndex); if (!Ready[uRight] || !Ready[uLeft]) continue; MSA *ptrLeft = MSAs[uLeft]; MSA *ptrRight = MSAs[uRight]; assert(ptrLeft != 0 && ptrRight != 0); MSA *ptrParent = new MSA; PWPath Path; AlignTwoMSAs(*ptrLeft, *ptrRight, *ptrParent, Path); MSAs[uNodeIndex] = ptrParent; Ready[uNodeIndex] = true; Ready[uLeft] = false; Ready[uRight] = false; delete MSAs[uLeft]; delete MSAs[uRight]; MSAs[uLeft] = 0; MSAs[uRight] = 0; } #if DEBUG { unsigned 
uReadyCount = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (Ready[uNodeIndex]) { assert(tree.IsRoot(uNodeIndex)); ++uReadyCount; assert(0 != MSAs[uNodeIndex]); } else assert(0 == MSAs[uNodeIndex]); } assert(1 == uReadyCount); } #endif const unsigned uRoot = tree.GetRootNodeIndex(); MSA *ptrRootAlignment = MSAs[uRoot]; msa.Copy(*ptrRootAlignment); delete ptrRootAlignment; #if TRACE Log("After refine subfamilies, root alignment=\n"); msa.LogMe(); #endif } muscle-3.8.31.orig/typetostr.cpp0000644000175000017500000000264211366141374016166 0ustar kratzcharles#include "muscle.h" #include const char *SecsToStr(unsigned long Secs) { static char Str[16]; long hh, mm, ss; hh = Secs/(60*60); mm = (Secs/60)%60; ss = Secs%60; sprintf(Str, "%02ld:%02ld:%02ld", hh, mm, ss); return Str; } const char *BoolToStr(bool b) { return b ? "True" : "False"; } const char *ScoreToStr(SCORE Score) { if (MINUS_INFINITY >= Score) return " *"; // Hack to use "circular" buffer so when called multiple // times in a printf-like argument list it works OK. const int iBufferCount = 16; const int iBufferLength = 16; static char szStr[iBufferCount*iBufferLength]; static int iBufferIndex = 0; iBufferIndex = (iBufferIndex + 1)%iBufferCount; char *pStr = szStr + iBufferIndex*iBufferLength; sprintf(pStr, "%8g", Score); return pStr; } // Left-justified version of ScoreToStr const char *ScoreToStrL(SCORE Score) { if (MINUS_INFINITY >= Score) return "*"; // Hack to use "circular" buffer so when called multiple // times in a printf-like argument list it works OK. 
const int iBufferCount = 16; const int iBufferLength = 16; static char szStr[iBufferCount*iBufferLength]; static int iBufferIndex = 0; iBufferIndex = (iBufferIndex + 1)%iBufferCount; char *pStr = szStr + iBufferIndex*iBufferLength; sprintf(pStr, "%.3g", Score); return pStr; } const char *WeightToStr(WEIGHT w) { return ScoreToStr(w); } muscle-3.8.31.orig/tracebackopt.cpp0000644000175000017500000000266711352261600016550 0ustar kratzcharles#include "muscle.h" #include "pwpath.h" void TraceBackToPath(int **TraceBack, unsigned uLengthA, unsigned uLengthB, PWPath &Path) { Path.Clear(); PWEdge Edge; Edge.uPrefixLengthA = uLengthA; Edge.uPrefixLengthB = uLengthB; for (;;) { if (0 == Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB) break; int iDelta = TraceBack[Edge.uPrefixLengthA][Edge.uPrefixLengthB]; #if TRACE Log("TraceBack[%u][%u] = %d\n", Edge.uPrefixLengthA, Edge.uPrefixLengthB, iDelta); #endif if (0 == iDelta) { assert(Edge.uPrefixLengthA > 0); assert(Edge.uPrefixLengthB > 0); Edge.cType = 'M'; Path.PrependEdge(Edge); --(Edge.uPrefixLengthA); --(Edge.uPrefixLengthB); continue; } else if (iDelta > 0) { Edge.cType = 'D'; while (iDelta-- > 0) { assert(Edge.uPrefixLengthA > 0); Path.PrependEdge(Edge); --(Edge.uPrefixLengthA); } } else if (iDelta < 0) { Edge.cType = 'I'; while (iDelta++ < 0) { assert(Edge.uPrefixLengthB > 0); Path.PrependEdge(Edge); --(Edge.uPrefixLengthB); } } if (0 == Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB) break; assert(Edge.uPrefixLengthA > 0); assert(Edge.uPrefixLengthB > 0); Edge.cType = 'M'; Path.PrependEdge(Edge); --(Edge.uPrefixLengthA); --(Edge.uPrefixLengthB); } #if TRACE Log("TraceBackToPath "); Path.LogMe(); #endif } muscle-3.8.31.orig/README.txt0000644000175000017500000000166111352261633015077 0ustar kratzcharlesMUSCLE v3.0 source code README ------------------------------ http://www.drive5.com/muscle This version of MUSCLE was built and tested on two platforms: Windows XP and Red Hat Linux 8.0. 
On Windows, I used Microsoft Visual C++ .Net, which I find to be the best C++ compile / edit / test environment I've tried on any platform. The Microsoft project file is muscle.vcproj. The Linux make file is Makefile. This is a very simple-minded make file (because I am a Linux development novice), so should be easy to understand. By default, it uses shared libraries, but I found this to give problems when copying between different Linux versions. The fix was to use the linker flag -lm static (commented out), which gives a much bigger but more portable binary. The posted binary was linked with static libraries. The source code was not written to be maintained by anyone but me, so the usual apologies and caveats apply. Bob Edgar, January 2004 muscle-3.8.31.orig/pwpath.cpp0000644000175000017500000002316511352261623015412 0ustar kratzcharles#include "muscle.h" #include "pwpath.h" #include "seq.h" #include "textfile.h" #include "msa.h" PWPath::PWPath() { m_uArraySize = 0; m_uEdgeCount = 0; m_Edges = 0; } PWPath::~PWPath() { Clear(); } void PWPath::Clear() { delete[] m_Edges; m_Edges = 0; m_uArraySize = 0; m_uEdgeCount = 0; } void PWPath::ExpandPath(unsigned uAdditionalEdgeCount) { PWEdge *OldPath = m_Edges; unsigned uEdgeCount = m_uArraySize + uAdditionalEdgeCount; m_Edges = new PWEdge[uEdgeCount]; m_uArraySize = uEdgeCount; if (m_uEdgeCount > 0) memcpy(m_Edges, OldPath, m_uEdgeCount*sizeof(PWEdge)); delete[] OldPath; } void PWPath::AppendEdge(const PWEdge &Edge) { if (0 == m_uArraySize || m_uEdgeCount + 1 == m_uArraySize) ExpandPath(200); m_Edges[m_uEdgeCount] = Edge; ++m_uEdgeCount; } void PWPath::AppendEdge(char cType, unsigned uPrefixLengthA, unsigned uPrefixLengthB) { PWEdge e; e.uPrefixLengthA = uPrefixLengthA; e.uPrefixLengthB = uPrefixLengthB; e.cType = cType; AppendEdge(e); } void PWPath::PrependEdge(const PWEdge &Edge) { if (0 == m_uArraySize || m_uEdgeCount + 1 == m_uArraySize) ExpandPath(1000); if (m_uEdgeCount > 0) memmove(m_Edges + 1, m_Edges, 
sizeof(PWEdge)*m_uEdgeCount); m_Edges[0] = Edge; ++m_uEdgeCount; } const PWEdge &PWPath::GetEdge(unsigned uEdgeIndex) const { assert(uEdgeIndex < m_uEdgeCount); return m_Edges[uEdgeIndex]; } void PWPath::Validate() const { const unsigned uEdgeCount = GetEdgeCount(); if (0 == uEdgeCount) return; const PWEdge &FirstEdge = GetEdge(0); const PWEdge &LastEdge = GetEdge(uEdgeCount - 1); unsigned uStartA = FirstEdge.uPrefixLengthA; unsigned uStartB = FirstEdge.uPrefixLengthB; if (FirstEdge.cType != 'I') --uStartA; if (FirstEdge.cType != 'D') --uStartB; unsigned uPrefixLengthA = FirstEdge.uPrefixLengthA; unsigned uPrefixLengthB = FirstEdge.uPrefixLengthB; for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = GetEdge(uEdgeIndex); switch (Edge.cType) { case 'M': if (uPrefixLengthA + 1 != Edge.uPrefixLengthA) Quit("PWPath::Validate MA %u", uPrefixLengthA); if (uPrefixLengthB + 1 != Edge.uPrefixLengthB) Quit("PWPath::Validate MB %u", uPrefixLengthB); ++uPrefixLengthA; ++uPrefixLengthB; break; case 'D': if (uPrefixLengthA + 1 != Edge.uPrefixLengthA) Quit("PWPath::Validate DA %u", uPrefixLengthA); if (uPrefixLengthB != Edge.uPrefixLengthB) Quit("PWPath::Validate DB %u", uPrefixLengthB); ++uPrefixLengthA; break; case 'I': if (uPrefixLengthA != Edge.uPrefixLengthA) Quit("PWPath::Validate IA %u", uPrefixLengthA); if (uPrefixLengthB + 1 != Edge.uPrefixLengthB) Quit("PWPath::Validate IB %u", uPrefixLengthB); ++uPrefixLengthB; break; } } } void PWPath::LogMe() const { for (unsigned uEdgeIndex = 0; uEdgeIndex < GetEdgeCount(); ++uEdgeIndex) { const PWEdge &Edge = GetEdge(uEdgeIndex); if (uEdgeIndex > 0) Log(" "); Log("%c%d.%d", Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); if ((uEdgeIndex > 0 && uEdgeIndex%10 == 0) || uEdgeIndex == GetEdgeCount() - 1) Log("\n"); } } void PWPath::Copy(const PWPath &Path) { Clear(); const unsigned uEdgeCount = Path.GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const 
PWEdge &Edge = Path.GetEdge(uEdgeIndex); AppendEdge(Edge); } } void PWPath::FromMSAPair(const MSA &msaA, const MSA &msaB) { const unsigned uColCount = msaA.GetColCount(); if (uColCount != msaB.GetColCount()) Quit("PWPath::FromMSAPair, lengths differ"); Clear(); unsigned uPrefixLengthA = 0; unsigned uPrefixLengthB = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bIsGapA = msaA.IsGapColumn(uColIndex); bool bIsGapB = msaB.IsGapColumn(uColIndex); PWEdge Edge; char cType; if (!bIsGapA && !bIsGapB) { cType = 'M'; ++uPrefixLengthA; ++uPrefixLengthB; } else if (bIsGapA && !bIsGapB) { cType = 'I'; ++uPrefixLengthB; } else if (!bIsGapA && bIsGapB) { cType = 'D'; ++uPrefixLengthA; } else { assert(bIsGapB && bIsGapA); continue; } Edge.cType = cType; Edge.uPrefixLengthA = uPrefixLengthA; Edge.uPrefixLengthB = uPrefixLengthB; AppendEdge(Edge); } } // Very similar to HMMPath::FromFile, should consolidate. void PWPath::FromFile(TextFile &File) { Clear(); char szToken[1024]; File.GetTokenX(szToken, sizeof(szToken)); if (0 != strcmp(szToken, "Path")) Quit("Invalid path file (Path)"); File.GetTokenX(szToken, sizeof(szToken)); if (0 != strcmp(szToken, "edges")) Quit("Invalid path file (edges)"); File.GetTokenX(szToken, sizeof(szToken)); if (!IsValidInteger(szToken)) Quit("Invalid path file (edges value)"); const unsigned uEdgeCount = (unsigned) atoi(szToken); unsigned uEdgeIndex = 0; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { // index File.GetTokenX(szToken, sizeof(szToken)); if (!IsValidInteger(szToken)) Quit("Invalid path file, invalid index '%s'", szToken); unsigned n = (unsigned) atoi(szToken); if (n != uEdgeIndex) Quit("Invalid path file, expecting edge %u got %u", uEdgeIndex, n); // type File.GetTokenX(szToken, sizeof(szToken)); if (1 != strlen(szToken)) Quit("Invalid path file, expecting state, got '%s'", szToken); const char cType = szToken[0]; if ('M' != cType && 'D' != cType && cType != 'I' && 'S' != cType) 
Quit("Invalid path file, expecting state, got '%c'", cType); // prefix length A File.GetTokenX(szToken, sizeof(szToken)); if (!IsValidInteger(szToken)) Quit("Invalid path file, bad prefix length A '%s'", szToken); const unsigned uPrefixLengthA = (unsigned) atoi(szToken); // prefix length B File.GetTokenX(szToken, sizeof(szToken)); if (!IsValidInteger(szToken)) Quit("Invalid path file, bad prefix length B '%s'", szToken); const unsigned uPrefixLengthB = (unsigned) atoi(szToken); PWEdge Edge; Edge.cType = cType; Edge.uPrefixLengthA = uPrefixLengthA; Edge.uPrefixLengthB = uPrefixLengthB; AppendEdge(Edge); } File.GetTokenX(szToken, sizeof(szToken)); if (0 != strcmp(szToken, "//")) Quit("Invalid path file (//)"); } void PWPath::ToFile(TextFile &File) const { const unsigned uEdgeCount = GetEdgeCount(); File.PutString("Path\n"); File.PutFormat("edges %u\n", uEdgeCount); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = GetEdge(uEdgeIndex); File.PutFormat("%u %c %u %u\n", uEdgeIndex, Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); } File.PutString("//\n"); } void PWPath::AssertEqual(const PWPath &Path) const { const unsigned uEdgeCount = GetEdgeCount(); if (uEdgeCount != Path.GetEdgeCount()) { Log("PWPath::AssertEqual, this=\n"); LogMe(); Log("\nOther path=\n"); Path.LogMe(); Log("\n"); Quit("PWPath::AssertEqual, Edge count different %u %u\n", uEdgeCount, Path.GetEdgeCount()); } for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &e1 = GetEdge(uEdgeIndex); const PWEdge &e2 = Path.GetEdge(uEdgeIndex); if (e1.cType != e2.cType || e1.uPrefixLengthA != e2.uPrefixLengthA || e1.uPrefixLengthB != e2.uPrefixLengthB) { Log("PWPath::AssertEqual, this=\n"); LogMe(); Log("\nOther path=\n"); Path.LogMe(); Log("\n"); Log("This edge %c%u.%u, other edge %c%u.%u\n", e1.cType, e1.uPrefixLengthA, e1.uPrefixLengthB, e2.cType, e2.uPrefixLengthA, e2.uPrefixLengthB); Quit("PWPath::AssertEqual, edge %u different\n", 
uEdgeIndex); } } } bool PWPath::Equal(const PWPath &Path) const { const unsigned uEdgeCount = GetEdgeCount(); if (uEdgeCount != Path.GetEdgeCount()) return false; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &e1 = GetEdge(uEdgeIndex); const PWEdge &e2 = Path.GetEdge(uEdgeIndex); if (e1.cType != e2.cType || e1.uPrefixLengthA != e2.uPrefixLengthA || e1.uPrefixLengthB != e2.uPrefixLengthB) return false; } return true; } unsigned PWPath::GetMatchCount() const { unsigned uMatchCount = 0; const unsigned uEdgeCount = GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &e = GetEdge(uEdgeIndex); if ('M' == e.cType) ++uMatchCount; } return uMatchCount; } unsigned PWPath::GetInsertCount() const { unsigned uInsertCount = 0; const unsigned uEdgeCount = GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &e = GetEdge(uEdgeIndex); if ('I' == e.cType) ++uInsertCount; } return uInsertCount; } unsigned PWPath::GetDeleteCount() const { unsigned uDeleteCount = 0; const unsigned uEdgeCount = GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &e = GetEdge(uEdgeIndex); if ('D' == e.cType) ++uDeleteCount; } return uDeleteCount; } void PWPath::FromStr(const char Str[]) { Clear(); unsigned uPrefixLengthA = 0; unsigned uPrefixLengthB = 0; while (char c = *Str++) { switch (c) { case 'M': ++uPrefixLengthA; ++uPrefixLengthB; break; case 'D': ++uPrefixLengthA; break; case 'I': ++uPrefixLengthB; break; default: Quit("PWPath::FromStr, invalid state %c", c); } AppendEdge(c, uPrefixLengthA, uPrefixLengthB); } } muscle-3.8.31.orig/intmath.cpp0000644000175000017500000001674411352261676015570 0ustar kratzcharles#include "muscle.h" #include PROB ScoreToProb(SCORE Score) { if (MINUS_INFINITY >= Score) return 0.0; return (PROB) pow(2.0, (double) Score/INTSCALE); } //#if 0 //static const double log2e = log2(exp(1.0)); // 
//double lnTolog2(double ln) // { // return ln*log2e; // } // //double log2(double x) // { // if (0 == x) // return MINUS_INFINITY; // // static const double dInvLn2 = 1.0/log(2.0); //// Multiply by inverse of log(2) just in case multiplication //// is faster than division. // return log(x)*dInvLn2; // } //#endif //SCORE ProbToScore(PROB Prob) // { // if (0.0 == Prob) // return MINUS_INFINITY; //// return (SCORE) floor(INTSCALE*log2(Prob)); // return (SCORE) log2(Prob); // } WEIGHT DoubleToWeight(double d) { assert(d >= 0); return (WEIGHT) (INTSCALE*d); } double WeightToDouble(WEIGHT w) { return (double) w / (double) INTSCALE; } SCORE DoubleToScore(double d) { return (SCORE)(d*(double) INTSCALE); } bool ScoreEq(SCORE s1, SCORE s2) { return BTEq(s1, s2); } static bool BTEq2(BASETYPE b1, BASETYPE b2) { double diff = fabs(b1 - b2); if (diff < 0.0001) return true; double sum = fabs(b1) + fabs(b2); return diff/sum < 0.005; } bool BTEq(double b1, double b2) { return BTEq2((BASETYPE) b1, (BASETYPE) b2); } //const double dLn2 = log(2.0); //// pow2(x)=2^x //double pow2(double x) // { // if (MINUS_INFINITY == x) // return 0; // return exp(x*dLn2); // } //// lp2(x) = log2(1 + 2^-x), x >= 0 //double lp2(double x) // { // return log2(1 + pow2(-x)); // } // SumLog(x, y) = log2(2^x + 2^y) //SCORE SumLog(SCORE x, SCORE y) // { // return (SCORE) log2(pow2(x) + pow2(y)); // } // //// SumLog(x, y, z) = log2(2^x + 2^y + 2^z) //SCORE SumLog(SCORE x, SCORE y, SCORE z) // { // return (SCORE) log2(pow2(x) + pow2(y) + pow2(z)); // } // //// SumLog(w, x, y, z) = log2(2^w + 2^x + 2^y + 2^z) //SCORE SumLog(SCORE w, SCORE x, SCORE y, SCORE z) // { // return (SCORE) log2(pow2(w) + pow2(x) + pow2(y) + pow2(z)); // } //SCORE lp2Fast(SCORE x) // { // assert(x >= 0); // const int iTableSize = 1000; // const double dRange = 20.0; // const double dScale = dRange/iTableSize; // static SCORE dValue[iTableSize]; // static bool bInit = false; // if (!bInit) // { // for (int i = 0; i < iTableSize; ++i) // 
dValue[i] = (SCORE) lp2(i*dScale); // bInit = true; // } // if (x >= dRange) // return 0.0; // int i = (int) (x/dScale); // assert(i >= 0 && i < iTableSize); // SCORE dResult = dValue[i]; // assert(BTEq(dResult, lp2(x))); // return dResult; // } // //// SumLog(x, y) = log2(2^x + 2^y) //SCORE SumLogFast(SCORE x, SCORE y) // { // if (MINUS_INFINITY == x) // { // if (MINUS_INFINITY == y) // return MINUS_INFINITY; // return y; // } // else if (MINUS_INFINITY == y) // return x; // // SCORE dResult; // if (x > y) // dResult = x + lp2Fast(x-y); // else // dResult = y + lp2Fast(y-x); // assert(SumLog(x, y) == dResult); // return dResult; // } // //SCORE SumLogFast(SCORE x, SCORE y, SCORE z) // { // SCORE dResult = SumLogFast(x, SumLogFast(y, z)); // assert(SumLog(x, y, z) == dResult); // return dResult; // } //SCORE SumLogFast(SCORE w, SCORE x, SCORE y, SCORE z) // { // SCORE dResult = SumLogFast(SumLogFast(w, x), SumLogFast(y, z)); // assert(SumLog(w, x, y, z) == dResult); // return dResult; // } double VecSum(const double v[], unsigned n) { double dSum = 0.0; for (unsigned i = 0; i < n; ++i) dSum += v[i]; return dSum; } void Normalize(PROB p[], unsigned n) { unsigned i; PROB dSum = 0.0; for (i = 0; i < n; ++i) dSum += p[i]; if (0.0 == dSum) Quit("Normalize, sum=0"); for (i = 0; i < n; ++i) p[i] /= dSum; } void NormalizeUnlessZero(PROB p[], unsigned n) { unsigned i; PROB dSum = 0.0; for (i = 0; i < n; ++i) dSum += p[i]; if (0.0 == dSum) return; for (i = 0; i < n; ++i) p[i] /= dSum; } void Normalize(PROB p[], unsigned n, double dRequiredTotal) { unsigned i; double dSum = 0.0; for (i = 0; i < n; ++i) dSum += p[i]; if (0.0 == dSum) Quit("Normalize, sum=0"); double dFactor = dRequiredTotal / dSum; for (i = 0; i < n; ++i) p[i] *= (PROB) dFactor; } bool VectorIsZero(const double dValues[], unsigned n) { for (unsigned i = 0; i < n; ++i) if (dValues[i] != 0.0) return false; return true; } void VectorSet(double dValues[], unsigned n, double d) { for (unsigned i = 0; i < n; ++i) 
dValues[i] = d; } bool VectorIsZero(const float dValues[], unsigned n) { for (unsigned i = 0; i < n; ++i) if (dValues[i] != 0.0) return false; return true; } void VectorSet(float dValues[], unsigned n, float d) { for (unsigned i = 0; i < n; ++i) dValues[i] = d; } double Correl(const double P[], const double Q[], unsigned uCount) { double dSumP = 0.0; double dSumQ = 0.0; for (unsigned n = 0; n < uCount; ++n) { dSumP += P[n]; dSumQ += Q[n]; } const double dMeanP = dSumP/uCount; const double dMeanQ = dSumQ/uCount; double dSum1 = 0.0; double dSum2 = 0.0; double dSum3 = 0.0; for (unsigned n = 0; n < uCount; ++n) { const double dDiffP = P[n] - dMeanP; const double dDiffQ = Q[n] - dMeanQ; dSum1 += dDiffP*dDiffQ; dSum2 += dDiffP*dDiffP; dSum3 += dDiffQ*dDiffQ; } if (0 == dSum1) return 0; const double dCorrel = dSum1 / sqrt(dSum2*dSum3); return dCorrel; } float Correl(const float P[], const float Q[], unsigned uCount) { float dSumP = 0.0; float dSumQ = 0.0; for (unsigned n = 0; n < uCount; ++n) { dSumP += P[n]; dSumQ += Q[n]; } const float dMeanP = dSumP/uCount; const float dMeanQ = dSumQ/uCount; float dSum1 = 0.0; float dSum2 = 0.0; float dSum3 = 0.0; for (unsigned n = 0; n < uCount; ++n) { const float dDiffP = P[n] - dMeanP; const float dDiffQ = Q[n] - dMeanQ; dSum1 += dDiffP*dDiffQ; dSum2 += dDiffP*dDiffP; dSum3 += dDiffQ*dDiffQ; } if (0 == dSum1) return 0; const float dCorrel = dSum1 / (float) sqrt(dSum2*dSum3); return dCorrel; } // Simple (but slow) function to compute Pearson ranks // that allows for ties. Correctness and simplicity // are priorities over speed here. 
void Rank(const float P[], float Ranks[], unsigned uCount) { for (unsigned n = 0; n < uCount; ++n) { unsigned uNumberGreater = 0; unsigned uNumberEqual = 0; unsigned uNumberLess = 0; double dValue = P[n]; for (unsigned i = 0; i < uCount; ++i) { double v = P[i]; if (v == dValue) ++uNumberEqual; else if (v < dValue) ++uNumberLess; else ++uNumberGreater; } assert(uNumberEqual >= 1); assert(uNumberEqual + uNumberLess + uNumberGreater == uCount); Ranks[n] = (float) (1 + uNumberLess + (uNumberEqual - 1)/2.0); } } void Rank(const double P[], double Ranks[], unsigned uCount) { for (unsigned n = 0; n < uCount; ++n) { unsigned uNumberGreater = 0; unsigned uNumberEqual = 0; unsigned uNumberLess = 0; double dValue = P[n]; for (unsigned i = 0; i < uCount; ++i) { double v = P[i]; if (v == dValue) ++uNumberEqual; else if (v < dValue) ++uNumberLess; else ++uNumberGreater; } assert(uNumberEqual >= 1); assert(uNumberEqual + uNumberLess + uNumberGreater == uCount); Ranks[n] = (double) (1 + uNumberLess + (uNumberEqual - 1)/2.0); } } FCOUNT SumCounts(const FCOUNT Counts[]) { FCOUNT Sum = 0; for (int i = 0; i < 20; ++i) Sum += Counts[i]; return Sum; } muscle-3.8.31.orig/subfams.cpp0000644000175000017500000000254511352261666015555 0ustar kratzcharles#include "muscle.h" #include "distfunc.h" const float INFINITY = float(1e29); const unsigned NILL = uInsane; static float *ShortestPathEstimate; static unsigned *Predecessor; static void GetMostDistantPair(DistFunc &DF, unsigned *ptrIndex1, unsigned *ptrIndex2) { const unsigned uNodeCount = DF.GetCount(); if (uNodeCount < 2) Quit("GetMostDistantPair: < 2 seqs"); float MaxDist = -1; unsigned Index1 = uInsane; unsigned Index2 = uInsane; for (unsigned i = 0; i < uNodeCount; ++i) { for (unsigned j = i + 1; j < uNodeCount; ++j) { float d = DF.GetDist(i, j); if (d > MaxDist) { MaxDist = d; Index1 = i; Index2 = j; } } } assert(Index1 != uInsane); assert(Index2 != uInsane); *ptrIndex1 = Index1; *ptrIndex2 = Index2; } static void 
InitializeSingleSource(DistFunc &DF, unsigned uIndex) { const unsigned uNodeCount = 0; for (unsigned i = 0; i < uNodeCount; ++i) { ShortestPathEstimate[i] = INFINITY; Predecessor[i] = NILL; } ShortestPathEstimate[uIndex] = 0; } static void Relax(DistFunc &DF, unsigned u, unsigned v) { float w = DF.GetDist(u, v); float d = ShortestPathEstimate[u] + w; if (ShortestPathEstimate[v] > d) { ShortestPathEstimate[v] = d; Predecessor[v] = u; } } void ShortestPath(DistFunc &DF, unsigned uIndex) { } muscle-3.8.31.orig/enumtostr.cpp0000644000175000017500000000120411352261600016130 0ustar kratzcharles#include "muscle.h" #include static char szMsg[64]; // Define XXXToStr(XXX x) functions for each enum type XXX. #define s(t) const char *t##ToStr(t x) { switch (x) { case t##_Undefined: return "Undefined"; #define c(t, x) case t##_##x: return #x; #define e(t) } sprintf(szMsg, #t "_%d", x); return szMsg; } #include "enums.h" // Define StrToXXX(const char *Str) functions for each enum type XXX. #define s(t) t StrTo##t(const char *Str) { if (0) ; #define c(t, x) else if (0 == stricmp(#x, Str)) return t##_##x; #define e(t) Quit("Invalid value %s for type %s", Str, #t); return t##_Undefined; } #include "enums.h" muscle-3.8.31.orig/gatest.cpp0000644000175000017500000000145311352261615015373 0ustar kratzcharles#include "muscle.h" #include "pwpath.h" #include "timing.h" #include "textfile.h" #include "msa.h" #include "profile.h" SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { if (g_bDiags) return GlobalAlignDiags(PA, uLengthA, PB, uLengthB, Path); else return GlobalAlignNoDiags(PA, uLengthA, PB, uLengthB, Path); } SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { switch (g_PPScore) { case PPSCORE_LE: return GlobalAlignLA(PA, uLengthA, PB, uLengthB, Path); case PPSCORE_SP: return GlobalAlignNS(PA, uLengthA, PB, uLengthB, Path); case PPSCORE_SV: return 
GlobalAlignSimple(PA, uLengthA, PB, uLengthB, Path); } return 0; } muscle-3.8.31.orig/scoremx.cpp0000644000175000017500000000154111352261636015565 0ustar kratzcharles#include "muscle.h" #include "profile.h" extern SCOREMATRIX VTML_LA; extern SCOREMATRIX PAM200; extern SCOREMATRIX PAM200NoCenter; extern SCOREMATRIX VTML_SP; extern SCOREMATRIX VTML_SPNoCenter; extern SCOREMATRIX NUC_SP; PTR_SCOREMATRIX g_ptrScoreMatrix; void SetScoreMatrix() { switch (g_PPScore) { case PPSCORE_LE: g_ptrScoreMatrix = &VTML_LA; break; case PPSCORE_SP: if (g_bPrecompiledCenter) g_ptrScoreMatrix = &PAM200; else g_ptrScoreMatrix = &PAM200NoCenter; break; case PPSCORE_SV: if (g_bPrecompiledCenter) g_ptrScoreMatrix = &VTML_SP; else g_ptrScoreMatrix = &VTML_SPNoCenter; break; case PPSCORE_SPN: if (g_bPrecompiledCenter) g_ptrScoreMatrix = &NUC_SP; else Quit("SPN requires precompiled center"); break; default: Quit("Invalid g_PPScore"); } } muscle-3.8.31.orig/scoregaps.cpp0000644000175000017500000001063211352261666016077 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "objscore.h" #define TRACE 0 struct GAPINFO { GAPINFO *Next; unsigned Start; unsigned End; }; static GAPINFO **g_Gaps; static GAPINFO *g_FreeList; static unsigned g_MaxSeqCount; static unsigned g_MaxColCount; static unsigned g_ColCount; static bool *g_ColDiff; static GAPINFO *NewGapInfo() { if (0 == g_FreeList) { const int NEWCOUNT = 256; GAPINFO *NewList = new GAPINFO[NEWCOUNT]; g_FreeList = &NewList[0]; for (int i = 0; i < NEWCOUNT-1; ++i) NewList[i].Next = &NewList[i+1]; NewList[NEWCOUNT-1].Next = 0; } GAPINFO *GI = g_FreeList; g_FreeList = g_FreeList->Next; return GI; } static void FreeGapInfo(GAPINFO *GI) { GI->Next = g_FreeList; g_FreeList = GI; } // TODO: This could be much faster, no need to look // at all columns. 
static void FindIntersectingGaps(const MSA &msa, unsigned SeqIndex) { const unsigned ColCount = msa.GetColCount(); bool InGap = false; bool Intersects = false; unsigned Start = uInsane; for (unsigned Col = 0; Col <= ColCount; ++Col) { bool Gap = ((Col != ColCount) && msa.IsGap(SeqIndex, Col)); if (Gap) { if (!InGap) { InGap = true; Start = Col; } if (g_ColDiff[Col]) Intersects = true; } else if (InGap) { InGap = false; if (Intersects) { GAPINFO *GI = NewGapInfo(); GI->Start = Start; GI->End = Col - 1; GI->Next = g_Gaps[SeqIndex]; g_Gaps[SeqIndex] = GI; } Intersects = false; } } } static SCORE Penalty(unsigned Length, bool Term) { if (0 == Length) return 0; SCORE s1 = g_scoreGapOpen + g_scoreGapExtend*(Length - 1); #if DOUBLE_AFFINE SCORE s2 = g_scoreGapOpen2 + g_scoreGapExtend2*(Length - 1); if (s1 > s2) return s1; return s2; #else return s1; #endif } //static SCORE ScorePair(unsigned Seq1, unsigned Seq2) // { //#if TRACE // { // Log("ScorePair(%d,%d)\n", Seq1, Seq2); // Log("Gaps seq 1: "); // for (GAPINFO *GI = g_Gaps[Seq1]; GI; GI = GI->Next) // Log(" %d-%d", GI->Start, GI->End); // Log("\n"); // Log("Gaps seq 2: "); // for (GAPINFO *GI = g_Gaps[Seq2]; GI; GI = GI->Next) // Log(" %d-%d", GI->Start, GI->End); // Log("\n"); // } //#endif // return 0; // } SCORE ScoreGaps(const MSA &msa, const unsigned DiffCols[], unsigned DiffColCount) { #if TRACE { Log("ScoreGaps\n"); Log("DiffCols "); for (unsigned i = 0; i < DiffColCount; ++i) Log(" %u", DiffCols[i]); Log("\n"); Log("msa=\n"); msa.LogMe(); Log("\n"); } #endif const unsigned SeqCount = msa.GetSeqCount(); const unsigned ColCount = msa.GetColCount(); g_ColCount = ColCount; if (SeqCount > g_MaxSeqCount) { delete[] g_Gaps; g_MaxSeqCount = SeqCount + 256; g_Gaps = new GAPINFO *[g_MaxSeqCount]; } memset(g_Gaps, 0, SeqCount*sizeof(GAPINFO *)); if (ColCount > g_MaxColCount) { delete[] g_ColDiff; g_MaxColCount = ColCount + 256; g_ColDiff = new bool[g_MaxColCount]; } memset(g_ColDiff, 0, g_ColCount*sizeof(bool)); for 
(unsigned i = 0; i < DiffColCount; ++i) { unsigned Col = DiffCols[i]; assert(Col < ColCount); g_ColDiff[Col] = true; } for (unsigned SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) FindIntersectingGaps(msa, SeqIndex); #if TRACE { Log("\n"); Log("Intersecting gaps:\n"); Log(" "); for (unsigned Col = 0; Col < ColCount; ++Col) Log("%c", g_ColDiff[Col] ? '*' : ' '); Log("\n"); Log(" "); for (unsigned Col = 0; Col < ColCount; ++Col) Log("%d", Col%10); Log("\n"); for (unsigned Seq = 0; Seq < SeqCount; ++Seq) { Log("%3d: ", Seq); for (unsigned Col = 0; Col < ColCount; ++Col) Log("%c", msa.GetChar(Seq, Col)); Log(" :: "); for (GAPINFO *GI = g_Gaps[Seq]; GI; GI = GI->Next) Log(" (%d,%d)", GI->Start, GI->End); Log(" >%s\n", msa.GetSeqName(Seq)); } Log("\n"); } #endif SCORE Score = 0; for (unsigned Seq1 = 0; Seq1 < SeqCount; ++Seq1) { const WEIGHT w1 = msa.GetSeqWeight(Seq1); for (unsigned Seq2 = Seq1 + 1; Seq2 < SeqCount; ++Seq2) { const WEIGHT w2 = msa.GetSeqWeight(Seq2); // const SCORE Pair = ScorePair(Seq1, Seq2); const SCORE Pair = ScoreSeqPairGaps(msa, Seq1, msa, Seq2); Score += w1*w2*Pair; #if TRACE Log("Seq1=%u Seq2=%u ScorePair=%.4g w1=%.4g w2=%.4g Sum=%.4g\n", Seq1, Seq2, Pair, w1, w2, Score); #endif } } return Score; } muscle-3.8.31.orig/clustsetmsa.h0000644000175000017500000000235611352261600016115 0ustar kratzcharles#ifndef ClustSetMSA_h #define ClustSetMSA_h class MSA; class Clust; #include "clustset.h" #include "msadist.h" // Distance matrix based set. // Computes distances between leaves, never between // joined clusters (leaves this to distance matrix method). 
class ClustSetMSA : public ClustSet { public: ClustSetMSA(const MSA &msa, MSADist &MD) : m_ptrMSA(&msa), m_ptrMSADist(&MD) { } public: virtual unsigned GetLeafCount() { return m_ptrMSA->GetSeqCount(); } virtual const char *GetLeafName(unsigned uNodeIndex) { return m_ptrMSA->GetSeqName(uNodeIndex); } virtual unsigned GetLeafId(unsigned uNodeIndex) { return m_ptrMSA->GetSeqId(uNodeIndex); } virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex, unsigned uRightNodeIndex, unsigned uJoinedNodeIndex, double *ptrdLeftLength, double *ptrdRightLength) { Quit("ClustSetMSA::JoinNodes, should never be called"); } virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1, unsigned uNodeIndex2) { return m_ptrMSADist->ComputeDist(*m_ptrMSA, uNodeIndex1, uNodeIndex2); } public: const MSA &GetMSA(); private: const MSA *m_ptrMSA; MSADist *m_ptrMSADist; }; #endif // ClustSetMSA_h muscle-3.8.31.orig/refinetreee.cpp0000644000175000017500000000224211352261673016402 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "tree.h" #include "profile.h" #include #define TRACE 0 void RefineTreeE(MSA &msa, const SeqVect &v, Tree &tree, ProgNode *ProgNodes) { const unsigned uSeqCount = msa.GetSeqCount(); if (tree.GetLeafCount() != uSeqCount) Quit("Refine tree, tree has different number of nodes"); if (uSeqCount < 3) return; #if DEBUG ValidateMuscleIds(msa); ValidateMuscleIds(tree); #endif const unsigned uNodeCount = tree.GetNodeCount(); unsigned *uNewNodeIndexToOldNodeIndex= new unsigned[uNodeCount]; Tree Tree2; TreeFromMSA(msa, Tree2, g_Cluster2, g_Distance2, g_Root2, g_pstrDistMxFileName2); #if DEBUG ValidateMuscleIds(Tree2); #endif DiffTreesE(Tree2, tree, uNewNodeIndexToOldNodeIndex); unsigned uRoot = Tree2.GetRootNodeIndex(); if (NODE_CHANGED == uNewNodeIndexToOldNodeIndex[uRoot]) { MSA msa2; RealignDiffsE(msa, v, Tree2, tree, uNewNodeIndexToOldNodeIndex, msa2, ProgNodes); tree.Copy(Tree2); msa.Copy(msa2); #if DEBUG ValidateMuscleIds(msa2); #endif } delete[] 
uNewNodeIndexToOldNodeIndex; SetCurrentAlignment(msa); ProgressStepsDone(); } muscle-3.8.31.orig/difftreese.cpp0000644000175000017500000001612711352261611016224 0ustar kratzcharles#include "muscle.h" #include "tree.h" #define TRACE 0 /*** Algorithm to compare two trees, X and Y. A node x in X and node y in Y are defined to be similar iff the set of leaves in the subtree under x is identical to the set of leaves under y. A node is defined to be changed iff it is not similar to any node in the other tree. Nodes x and y are defined to be married iff every node in the subtree under x is similar to a node in the subtree under y. Married nodes are considered to be equal. The subtrees under two married nodes can at most differ by exchanges of left and right branches, which we do not consider to be significant here. A node is changed iff it is not married. If a node is changed, then it has a dissimilar node in its subtree, and it follows immediately from the definition of marriage that its parent is also a bachelor. Hence all nodes on the path from a changed node to the root are changed. We assume the trees have the same set of leaves, so every leaf is trivially both similar and married to the same leaf in the opposite tree. Changed nodes are therefore always internal (i.e., non-leaf) nodes. Example: -----A -----k ----j -----B --i -----C ------D -----A -----p ----n -----B --m -----D ------C The following pairs of internal nodes are similar. Nodes Set of leaves ----- ------------- k,p A,B i,m A,B,C,D Changed nodes in the first tree are i and j, changed nodes in the second tree are m and n. Node k and p are married, but i and m are not (because j and n are changed). The diffs are C, D and k. To achieve O(N) we avoid traversing a given subtree multiple times and also avoid comparing lists of leaves. We visit nodes in depth-first order (i.e., a node is visited before its parent). If either child of a node is changed, we flag it as changed. 
If both children of the node we are visiting are married, we check whether the spouses of those children have the same parent in the other tree. If the parents are different, the current node is a bachelor. If they have the same parent, then the node we are visiting is the spouse of that parent. We assign this newly identified married couple a unique integer id. The id of a node is in one-to-one correspondence with the set of leaves in its subtree. Two nodes have the same set of leaves iff they have the same id. Changed nodes do not get an id. ***/ void DiffTreesE(const Tree &NewTree, const Tree &OldTree, unsigned NewNodeIndexToOldNodeIndex[]) { #if TRACE Log("DiffTreesE NewTree:\n"); NewTree.LogMe(); Log("\n"); Log("OldTree:\n"); OldTree.LogMe(); #endif if (!NewTree.IsRooted() || !OldTree.IsRooted()) Quit("DiffTrees: requires rooted trees"); const unsigned uNodeCount = NewTree.GetNodeCount(); const unsigned uOldNodeCount = OldTree.GetNodeCount(); const unsigned uLeafCount = NewTree.GetLeafCount(); const unsigned uOldLeafCount = OldTree.GetLeafCount(); if (uNodeCount != uOldNodeCount || uLeafCount != uOldLeafCount) Quit("DiffTreesE: different node counts"); { unsigned *IdToOldNodeIndex = new unsigned[uNodeCount]; for (unsigned uOldNodeIndex = 0; uOldNodeIndex < uNodeCount; ++uOldNodeIndex) { if (OldTree.IsLeaf(uOldNodeIndex)) { unsigned Id = OldTree.GetLeafId(uOldNodeIndex); IdToOldNodeIndex[Id] = uOldNodeIndex; } } // Initialize NewNodeIndexToOldNodeIndex[] // All internal nodes are marked as changed, but may be updated later. 
for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex) { if (NewTree.IsLeaf(uNewNodeIndex)) { unsigned uId = NewTree.GetLeafId(uNewNodeIndex); assert(uId < uLeafCount); unsigned uOldNodeIndex = IdToOldNodeIndex[uId]; assert(uOldNodeIndex < uNodeCount); NewNodeIndexToOldNodeIndex[uNewNodeIndex] = uOldNodeIndex; } else NewNodeIndexToOldNodeIndex[uNewNodeIndex] = NODE_CHANGED; } delete[] IdToOldNodeIndex; } // Depth-first traversal of tree. // The order guarantees that a node is visited before // its parent is visited. for (unsigned uNewNodeIndex = NewTree.FirstDepthFirstNode(); NULL_NEIGHBOR != uNewNodeIndex; uNewNodeIndex = NewTree.NextDepthFirstNode(uNewNodeIndex)) { if (NewTree.IsLeaf(uNewNodeIndex)) continue; // If either child is changed, flag this node as changed and continue. unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex); unsigned uOldLeft = NewNodeIndexToOldNodeIndex[uNewLeft]; if (NODE_CHANGED == uOldLeft) { NewNodeIndexToOldNodeIndex[uNewLeft] = NODE_CHANGED; continue; } unsigned uNewRight = NewTree.GetRight(uNewNodeIndex); unsigned uOldRight = NewNodeIndexToOldNodeIndex[uNewRight]; if (NODE_CHANGED == NewNodeIndexToOldNodeIndex[uNewRight]) { NewNodeIndexToOldNodeIndex[uNewRight] = NODE_CHANGED; continue; } unsigned uOldParentLeft = OldTree.GetParent(uOldLeft); unsigned uOldParentRight = OldTree.GetParent(uOldRight); if (uOldParentLeft == uOldParentRight) NewNodeIndexToOldNodeIndex[uNewNodeIndex] = uOldParentLeft; else NewNodeIndexToOldNodeIndex[uNewNodeIndex] = NODE_CHANGED; } #if TRACE { Log("NewToOld "); for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex) { Log(" [%3u]=", uNewNodeIndex); if (NODE_CHANGED == NewNodeIndexToOldNodeIndex[uNewNodeIndex]) Log(" X"); else Log("%3u", NewNodeIndexToOldNodeIndex[uNewNodeIndex]); if ((uNewNodeIndex+1)%8 == 0) Log("\n "); } Log("\n"); } #endif #if DEBUG { for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex) { unsigned uOld = 
NewNodeIndexToOldNodeIndex[uNewNodeIndex]; if (NewTree.IsLeaf(uNewNodeIndex)) { if (uOld >= uNodeCount) { Log("NewNode=%u uOld=%u > uNodeCount=%u\n", uNewNodeIndex, uOld, uNodeCount); Quit("Diff check failed"); } unsigned uIdNew = NewTree.GetLeafId(uNewNodeIndex); unsigned uIdOld = OldTree.GetLeafId(uOld); if (uIdNew != uIdOld) { Log("NewNode=%u uOld=%u IdNew=%u IdOld=%u\n", uNewNodeIndex, uOld, uIdNew, uIdOld); Quit("Diff check failed"); } continue; } if (NODE_CHANGED == uOld) continue; unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex); unsigned uNewRight = NewTree.GetRight(uNewNodeIndex); unsigned uOldLeft = OldTree.GetLeft(uOld); unsigned uOldRight = OldTree.GetRight(uOld); unsigned uNewLeftPartner = NewNodeIndexToOldNodeIndex[uNewLeft]; unsigned uNewRightPartner = NewNodeIndexToOldNodeIndex[uNewRight]; bool bSameNotRotated = (uNewLeftPartner == uOldLeft && uNewRightPartner == uOldRight); bool bSameRotated = (uNewLeftPartner == uOldRight && uNewRightPartner == uOldLeft); if (!bSameNotRotated && !bSameRotated) { Log("NewNode=%u NewL=%u NewR=%u\n", uNewNodeIndex, uNewLeft, uNewRight); Log("OldNode=%u OldL=%u OldR=%u\n", uOld, uOldLeft, uOldRight); Log("NewLPartner=%u NewRPartner=%u\n", uNewLeftPartner, uNewRightPartner); Quit("Diff check failed"); } } } #endif } muscle-3.8.31.orig/tomhydro.cpp0000644000175000017500000000465611352261672015764 0ustar kratzcharles#include "muscle.h" #include "profile.h" // Original: //HYDROPHILIC_CONTEXT 0 6 -0.3969495574 //HYDROPHILIC_CONTEXT 1 6 -0.9407126603 //HYDROPHILIC_CONTEXT 2 6 -0.4968150972 //HYDROPHILIC_CONTEXT 3 6 -0.271646023 //HYDROPHILIC_CONTEXT 4 6 0.006990406416 //HYDROPHILIC_CONTEXT 5 6 0.1381111256 //HYDROPHILIC_CONTEXT 6 6 0.2541439872 // Blosum62: //HYDROPHILIC_CONTEXT 0 6 -0.2448419585 //HYDROPHILIC_CONTEXT 1 6 -0.8734889946 //HYDROPHILIC_CONTEXT 2 6 -0.5724336598 //HYDROPHILIC_CONTEXT 3 6 -0.2670439975 //HYDROPHILIC_CONTEXT 4 6 0.004844647323 //HYDROPHILIC_CONTEXT 5 6 0.1812057148 //HYDROPHILIC_CONTEXT 6 6 
0.1036540864 static SCORE Factors[7] = { (SCORE) -0.2448419585, (SCORE) -0.8734889946, (SCORE) -0.5724336598, (SCORE) -0.2670439975, (SCORE) 0.004844647323, (SCORE) 0.1812057148, (SCORE) 0.1036540864 }; static bool Hydrophilic[20] = { false, // A false, // C true, // D true, // E false, // F true, // G false, // H false, // I true, // K false, // L false, // M true, // N true, // P true, // Q true, // R true, // S false, // T false, // V false, // Y false, // W }; bool IsHydrophilic(const FCOUNT fcCounts[]) { for (unsigned uLetter = 0; uLetter < 20; ++uLetter) if (fcCounts[uLetter] > 0.0 && Hydrophilic[uLetter]) return false; return true; } static double HydrophilicFraction(const FCOUNT fcCounts[]) { double TotalAll = 0.0; double TotalHydrophilic = 0.0; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { FCOUNT Freq = fcCounts[uLetter]; TotalAll += Freq; if (Hydrophilic[uLetter]) TotalHydrophilic += Freq; } return TotalHydrophilic / TotalAll; } void TomHydro(ProfPos *Prof, unsigned uLength) { if (ALPHA_Amino != g_Alpha) return; if (uLength < 6) return; for (unsigned uColIndex = 3; uColIndex < uLength - 2; ++uColIndex) { // 6-residue window: // xxxxxx // AARNCARNGTAGCATNAC // AARN----------TNAC double dCount = 0.0; for (unsigned uColIndexW = uColIndex - 3; uColIndexW < uColIndex + 3; ++uColIndexW) { const ProfPos &PP = Prof[uColIndexW]; dCount += HydrophilicFraction(PP.m_fcCounts); } // Round to nearest integer unsigned uCount = (unsigned) (dCount + 0.5); if (uCount > 6) uCount = 6; SCORE dFactor = Factors[uCount]; ProfPos &PP = Prof[uColIndex]; PP.m_scoreGapOpen += dFactor; PP.m_scoreGapClose += dFactor; } } muscle-3.8.31.orig/redblack.cpp0000644000175000017500000002432711352261600015652 0ustar kratzcharles#include "muscle.h" #include "clust.h" void Clust::InsertMetric(unsigned uIndex1, unsigned uIndex2, float dMetric) { RBInsert(uIndex1, uIndex2, dMetric); } void Clust::DeleteMetric(unsigned uIndex) { for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != 
uInsane; uNodeIndex = GetNextCluster(uNodeIndex)) { if (uIndex == uNodeIndex) continue; DeleteMetric(uIndex, uNodeIndex); } } void Clust::InitMetric(unsigned uMaxNodeIndex) { m_uRBNodeCount = m_uTriangularMatrixSize; m_RBParent = new unsigned[m_uRBNodeCount]; m_RBLeft = new unsigned[m_uRBNodeCount]; m_RBRight = new unsigned[m_uRBNodeCount]; m_RBi = new ushort[m_uRBNodeCount]; m_RBj = new ushort[m_uRBNodeCount]; m_RBMetric = new float[m_uRBNodeCount]; m_RBColor = new bool[m_uRBNodeCount]; m_RBRoot = RB_NIL; #if DEBUG { // Initialize fields to invalid values so we have a chance // catch attempts to use them if they're not properly set. unsigned InvalidNode = m_uRBNodeCount + 1; for (unsigned Node = 0; Node < m_uRBNodeCount; ++Node) { m_RBParent[Node] = InvalidNode; m_RBLeft[Node] = InvalidNode; m_RBRight[Node] = InvalidNode; m_RBi[Node] = InvalidNode; m_RBj[Node] = InvalidNode; } } #endif } void Clust::ListMetric() const { Log("Red-black tree root=%u\n", m_RBRoot); Log("\n"); Log(" Node Parent Left Right Color i j Metric\n"); Log("----- ------ ----- ----- ----- ----- ----- ------\n"); if (RB_NIL == m_RBRoot) return; unsigned Count = 0; unsigned Start = RBMin(m_RBRoot); for (unsigned Node = Start; RB_NIL != Node; Node = RBNext(Node)) { Log("%5u", Node); if (RB_NIL != m_RBParent[Node]) Log(" %6u", m_RBParent[Node]); else Log(" "); if (RB_NIL != m_RBLeft[Node]) Log(" %5u", m_RBLeft[Node]); else Log(" "); if (RB_NIL != m_RBRight[Node]) Log(" %5u", m_RBRight[Node]); else Log(" "); Log(" %s %5u %5u %g\n", m_RBColor[Node] ? " Red" : "Black", m_RBi[Node], m_RBj[Node], m_RBMetric[Node]); if (++Count > m_uRBNodeCount) { Log(" ** LOOP ** \n"); break; } } } // If there is a left subtree, predecessor is the // largest key found under the left branch. Otherwise, // is first node in path to root that is a right child. 
unsigned Clust::RBPrev(unsigned Node) const { assert(Node < m_uRBNodeCount); unsigned Left = m_RBLeft[Node]; if (RB_NIL != Left) return RBMax(Left); for (;;) { unsigned Parent = m_RBParent[Node]; if (RB_NIL == Parent) return RB_NIL; if (m_RBRight[Parent] == Node) return Parent; Node = Parent; } } // If there is a right subtree, sucessor is the // smallest key found under the right branch. Otherwise, // is first node in path to root that is a left child. unsigned Clust::RBNext(unsigned Node) const { if (Node >= m_uRBNodeCount) Quit("RBNext(%u)", Node); assert(Node < m_uRBNodeCount); unsigned Right = m_RBRight[Node]; if (RB_NIL != Right) return RBMin(Right); for (;;) { unsigned Parent = m_RBParent[Node]; if (RB_NIL == Parent) return RB_NIL; if (m_RBLeft[Parent] == Node) return Parent; Node = Parent; } } // Minimum is in leftmost leaf unsigned Clust::RBMin(unsigned RBNode) const { assert(RB_NIL != RBNode); for (;;) { unsigned Left = m_RBLeft[RBNode]; if (RB_NIL == Left) return RBNode; RBNode = Left; } } // Maximum is in rightmost leaf unsigned Clust::RBMax(unsigned RBNode) const { assert(RB_NIL != RBNode); for (;;) { unsigned Right = m_RBRight[RBNode]; if (RB_NIL == Right) return RBNode; RBNode = Right; } } void Clust::DeleteMetric(unsigned uIndex1, unsigned uIndex2) { unsigned RBNode = (unsigned) VectorIndex(uIndex1, uIndex2); RBDelete(RBNode); } void Clust::RBDelete(unsigned Node) { #if DEBUG ValidateRB(); //Log("@@ Before RBDelete(%u)\n", Node); //ListMetric(); #endif unsigned Left = m_RBLeft[Node]; unsigned Right = m_RBRight[Node]; unsigned Parent = m_RBParent[Node]; // If one or two nil children, splice out this node. if (RB_NIL == Left || RB_NIL == Right) { // Log("@@ One child\n"); // Child is non-NIL child, or NIL if none. unsigned Child = (Left != RB_NIL ? Left : Right); // Special case if root if (RB_NIL == Parent) { assert(Node == m_RBRoot); m_RBRoot = Child; if (RB_NIL != Child) m_RBParent[Child] = RB_NIL; return; } // Typical case. 
// Update parent->child link if (m_RBLeft[Parent] == Node) m_RBLeft[Parent] = Child; else { assert(m_RBRight[Parent] == Node); m_RBRight[Parent] = Child; } // Update child->parent link if (RB_NIL != Child) m_RBParent[Child] = Parent; #if DEBUG //Log("@@ After RBDelete(%u)\n", Node); //ListMetric(); ValidateRB(); #endif return; } //Log("@@ RBDelete(%u) Tricky case\n", Node); //ListMetric(); // Trickier case, node has two children. assert(Left != RB_NIL && Right != RB_NIL); // We're going to splice out successor node from its // current position and insert it in place of node // to be deleted. // Successor cannot be nil because there is a right child. unsigned Next = RBNext(Node); assert(Next != RB_NIL); // The successor of a node with two children is // guaranteed to have no more than one child. unsigned NextLeft = m_RBLeft[Next]; unsigned NextRight = m_RBRight[Next]; assert(RB_NIL == NextLeft || RB_NIL == NextRight); // Successor of node with two children cannot be the root. unsigned NextParent = m_RBParent[Next]; assert(RB_NIL != NextParent); // Ugly special case if successor is right child if (Next == Right) { #if DEBUG //Log("@@ Before RBDelete(%u) (tricky next==right)\n", Node); //ListMetric(); #endif m_RBParent[Next] = Parent; if (RB_NIL == Parent) { m_RBRoot = Next; m_RBParent[Next] = RB_NIL; } else { if (m_RBLeft[Parent] == Node) m_RBLeft[Parent] = Next; else { assert(m_RBRight[Parent] == Node); m_RBRight[Parent] = Next; } } m_RBLeft[Next] = Left; if (RB_NIL != Left) m_RBParent[Left] = Next; #if DEBUG //Log("@@ After RBDelete(%u) (tricky next==right)\n", Node); //ListMetric(); ValidateRB(); #endif return; } // Set NextChild either to the one child of successor, or nil. unsigned NextChild = (NextLeft != RB_NIL ? 
NextLeft : NextRight); // Splice successor from its current position if (m_RBLeft[NextParent] == Next) m_RBLeft[NextParent] = NextChild; else { assert(m_RBRight[NextParent] == Next); m_RBRight[NextParent] = NextChild; } if (RB_NIL != NextChild) m_RBParent[NextChild] = NextParent; // Insert successor into position currently held by node // to be deleted. if (RB_NIL == Parent) { m_RBRoot = Next; m_RBParent[Next] = RB_NIL; } else { if (m_RBLeft[Parent] == Node) m_RBLeft[Parent] = Next; else { assert(m_RBRight[Parent] == Node); m_RBRight[Parent] = Next; } } m_RBLeft[Next] = Left; m_RBRight[Next] = Right; m_RBParent[Next] = Parent; m_RBParent[Left] = Next; m_RBParent[Right] = Next; #if DEBUG //Log("@@ After RBDelete(%u)\n", Node); //ListMetric(); ValidateRB(); #endif } unsigned Clust::RBInsert(unsigned i, unsigned j, float fMetric) { #if DEBUG ValidateRB(); #endif unsigned NewNode = VectorIndex(i, j); m_RBMetric[NewNode] = fMetric; m_RBi[NewNode] = i; m_RBj[NewNode] = j; // New node is always inserted as a leaf. // Proof that this is possible is found in algorithm // textbooks (I forget the argument). 
m_RBLeft[NewNode] = RB_NIL; m_RBRight[NewNode] = RB_NIL; unsigned NewParent = RB_NIL; unsigned Node = m_RBRoot; unsigned uCount = 0; while (RB_NIL != Node) { NewParent = Node; if (fMetric < m_RBMetric[Node]) Node = m_RBLeft[Node]; else Node = m_RBRight[Node]; ++uCount; if (uCount > m_uRBNodeCount) Quit("Infinite loop in RBInsert"); } m_RBParent[NewNode] = NewParent; if (RB_NIL == NewParent) m_RBRoot = NewNode; else { if (fMetric < m_RBMetric[NewParent]) m_RBLeft[NewParent] = NewNode; else m_RBRight[NewParent] = NewNode; } #if DEBUG { unsigned Next = RBNext(NewNode); if (Next != RB_NIL) assert(NewNode == RBPrev(Next)); unsigned Prev = RBPrev(NewNode); if (Prev != RB_NIL) assert(NewNode == RBNext(Prev)); ValidateRB(); } #endif return NewNode; } void Clust::ValidateRBNode(unsigned Node, const char szMsg[]) const { if (RB_NIL == Node) return; unsigned Parent = m_RBParent[Node]; unsigned Left = m_RBLeft[Node]; unsigned Right = m_RBRight[Node]; unsigned Next = RBNext(Node); unsigned Prev = RBPrev(Node); if (RB_NIL != Next && RBPrev(Next) != Node) { ListMetric(); Quit("ValidateRB(%s) Node=%u Next=%u Prev(Next)=%u", szMsg, Node, Next, RBPrev(Next)); } if (RB_NIL != Prev && RBNext(Prev) != Node) { ListMetric(); Quit("ValidateRB(%s) Node=%u Prev=%u Next(Prev)=%u", szMsg, Node, Prev, RBNext(Prev)); } if (RB_NIL != Parent) { if (m_RBLeft[Parent] != Node && m_RBRight[Parent] != Node) { ListMetric(); Quit("ValidateRB(%s): Parent %u not linked to child %u\n", szMsg, Parent, Node); } } if (RB_NIL != Left) { if (m_RBParent[Left] != Node) { ListMetric(); Quit("ValidateRB(%s): Left child %u not linked to parent %u\n", szMsg, Left, Node); } } if (RB_NIL != Right) { if (m_RBParent[Right] != Node) { ListMetric(); Quit("ValidateRB(%s): Right child %u not linked to parent %u\n", szMsg, Right, Node); } } ValidateRBNode(Left, szMsg); ValidateRBNode(Right, szMsg); } void Clust::ValidateRB(const char szMsg[]) const { if (RB_NIL == m_RBRoot) return; ValidateRBNode(m_RBRoot, szMsg); unsigned 
Node = RBMin(m_RBRoot); for (;;) { unsigned Next = RBNext(Node); if (RB_NIL == Next) break; if (m_RBMetric[Node] > m_RBMetric[Next]) { ListMetric(); Quit("ValidateRBNode(%s): metric out of order %u=%g %u=%g", szMsg, Node, m_RBMetric[Node], Next, m_RBMetric[Next]); } Node = Next; } } muscle-3.8.31.orig/alpha.cpp0000644000175000017500000001304311352261673015173 0ustar kratzcharles#include "muscle.h" #include /*** From Bioperl docs: Extended DNA / RNA alphabet ------------------------------------------ Symbol Meaning Nucleic Acid ------------------------------------------ A A Adenine C C Cytosine G G Guanine T T Thymine U U Uracil M A or C R A or G W A or T S C or G Y C or T K G or T V A or C or G H A or C or T D A or G or T B C or G or T X G or A or T or C N G or A or T or C IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE: Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030. ***/ unsigned g_CharToLetter[MAX_CHAR]; unsigned g_CharToLetterEx[MAX_CHAR]; char g_LetterToChar[MAX_ALPHA]; char g_LetterExToChar[MAX_ALPHA_EX]; char g_UnalignChar[MAX_CHAR]; char g_AlignChar[MAX_CHAR]; bool g_IsWildcardChar[MAX_CHAR]; bool g_IsResidueChar[MAX_CHAR]; ALPHA g_Alpha = ALPHA_Undefined; unsigned g_AlphaSize = 0; #define Res(c, Letter) \ { \ const unsigned char Upper = (unsigned char) toupper(c); \ const unsigned char Lower = (unsigned char) tolower(c); \ g_CharToLetter[Upper] = Letter; \ g_CharToLetter[Lower] = Letter; \ g_CharToLetterEx[Upper] = Letter; \ g_CharToLetterEx[Lower] = Letter; \ g_LetterToChar[Letter] = Upper; \ g_LetterExToChar[Letter] = Upper; \ g_IsResidueChar[Upper] = true; \ g_IsResidueChar[Lower] = true; \ g_AlignChar[Upper] = Upper; \ g_AlignChar[Lower] = Upper; \ g_UnalignChar[Upper] = Lower; \ g_UnalignChar[Lower] = Lower; \ } #define Wild(c, Letter) \ { \ const unsigned char Upper = (unsigned char) toupper(c); \ const unsigned char Lower = (unsigned char) tolower(c); \ g_CharToLetterEx[Upper] = Letter; \ g_CharToLetterEx[Lower] = Letter; \ 
g_LetterExToChar[Letter] = Upper; \ g_IsResidueChar[Upper] = true; \ g_IsResidueChar[Lower] = true; \ g_AlignChar[Upper] = Upper; \ g_AlignChar[Lower] = Upper; \ g_UnalignChar[Upper] = Lower; \ g_UnalignChar[Lower] = Lower; \ g_IsWildcardChar[Lower] = true; \ g_IsWildcardChar[Upper] = true; \ } static unsigned GetAlphaSize(ALPHA Alpha) { switch (Alpha) { case ALPHA_Amino: return 20; case ALPHA_RNA: case ALPHA_DNA: return 4; } Quit("Invalid Alpha=%d", Alpha); return 0; } static void InitArrays() { memset(g_CharToLetter, 0xff, sizeof(g_CharToLetter)); memset(g_CharToLetterEx, 0xff, sizeof(g_CharToLetterEx)); memset(g_LetterToChar, '?', sizeof(g_LetterToChar)); memset(g_LetterExToChar, '?', sizeof(g_LetterExToChar)); memset(g_AlignChar, '?', sizeof(g_UnalignChar)); memset(g_UnalignChar, '?', sizeof(g_UnalignChar)); memset(g_IsWildcardChar, 0, sizeof(g_IsWildcardChar)); } static void SetGapChar(char c) { unsigned char u = (unsigned char) c; g_CharToLetterEx[u] = AX_GAP; g_LetterExToChar[AX_GAP] = u; g_AlignChar[u] = u; g_UnalignChar[u] = u; } static void SetAlphaDNA() { Res('A', NX_A) Res('C', NX_C) Res('G', NX_G) Res('T', NX_T) Wild('M', NX_M) Wild('R', NX_R) Wild('W', NX_W) Wild('S', NX_S) Wild('Y', NX_Y) Wild('K', NX_K) Wild('V', NX_V) Wild('H', NX_H) Wild('D', NX_D) Wild('B', NX_B) Wild('X', NX_X) Wild('N', NX_N) } static void SetAlphaRNA() { Res('A', NX_A) Res('C', NX_C) Res('G', NX_G) Res('U', NX_U) Res('T', NX_T) Wild('M', NX_M) Wild('R', NX_R) Wild('W', NX_W) Wild('S', NX_S) Wild('Y', NX_Y) Wild('K', NX_K) Wild('V', NX_V) Wild('H', NX_H) Wild('D', NX_D) Wild('B', NX_B) Wild('X', NX_X) Wild('N', NX_N) } static void SetAlphaAmino() { Res('A', AX_A) Res('C', AX_C) Res('D', AX_D) Res('E', AX_E) Res('F', AX_F) Res('G', AX_G) Res('H', AX_H) Res('I', AX_I) Res('K', AX_K) Res('L', AX_L) Res('M', AX_M) Res('N', AX_N) Res('P', AX_P) Res('Q', AX_Q) Res('R', AX_R) Res('S', AX_S) Res('T', AX_T) Res('V', AX_V) Res('W', AX_W) Res('Y', AX_Y) Wild('B', AX_B) Wild('X', AX_X) 
Wild('Z', AX_Z) } void SetAlpha(ALPHA Alpha) { InitArrays(); SetGapChar('.'); SetGapChar('-'); switch (Alpha) { case ALPHA_Amino: SetAlphaAmino(); break; case ALPHA_DNA: SetAlphaDNA(); case ALPHA_RNA: SetAlphaRNA(); break; default: Quit("Invalid Alpha=%d", Alpha); } g_AlphaSize = GetAlphaSize(Alpha); g_Alpha = Alpha; if (g_bVerbose) Log("Alphabet %s\n", ALPHAToStr(g_Alpha)); } char GetWildcardChar() { switch (g_Alpha) { case ALPHA_Amino: return 'X'; case ALPHA_DNA: case ALPHA_RNA: return 'N'; default: Quit("Invalid Alpha=%d", g_Alpha); } return '?'; } bool IsNucleo(char c) { return strchr("ACGTURYNacgturyn", c) != 0; } bool IsDNA(char c) { return strchr("AGCTNagctn", c) != 0; } bool IsRNA(char c) { return strchr("AGCUNagcun", c) != 0; } static char InvalidLetters[256]; static int InvalidLetterCount = 0; void ClearInvalidLetterWarning() { memset(InvalidLetters, 0, 256); } void InvalidLetterWarning(char c, char w) { InvalidLetters[(unsigned char) c] = 1; ++InvalidLetterCount; } void ReportInvalidLetters() { if (0 == InvalidLetterCount) return; char Str[257]; memset(Str, 0, 257); int n = 0; for (int i = 0; i < 256; ++i) { if (InvalidLetters[i]) Str[n++] = (char) i; } Warning("Assuming %s (see -seqtype option), invalid letters found: %s", ALPHAToStr(g_Alpha), Str); } muscle-3.8.31.orig/msadist.h0000644000175000017500000000146511352261673015224 0ustar kratzcharles#ifndef MSADist_h #define MSADist_h #include double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2); class MSADist { public: MSADist(DISTANCE Distance) { m_Distance = Distance; } double ComputeDist(const MSA &msa, unsigned uSeqIndex1, unsigned uSeqIndex2) { if (m_Distance == DISTANCE_ScoreDist) return GetScoreDist(msa, uSeqIndex1, uSeqIndex2); double dPctId = msa.GetPctIdentityPair(uSeqIndex1, uSeqIndex2); switch(m_Distance) { case DISTANCE_PctIdKimura: return KimuraDist(dPctId); case DISTANCE_PctIdLog: if (dPctId < 0.05) dPctId = 0.05; return -log(dPctId); } Quit("MSADist::ComputeDist, 
invalid DISTANCE_%u", m_Distance); return 0; } private: DISTANCE m_Distance; }; #endif // MSADist_h muscle-3.8.31.orig/aligngivenpathsw.cpp0000644000175000017500000001617411352261617017466 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "pwpath.h" #include "profile.h" #define TRACE 0 static void AppendDelete(const MSA &msaA, unsigned &uColIndexA, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendDelete ColIxA=%u ColIxCmb=%u\n", uColIndexA, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA); msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, '-'); ++uColIndexCombined; ++uColIndexA; } static void AppendInsert(const MSA &msaB, unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendInsert ColIxB=%u ColIxCmb=%u\n", uColIndexB, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) msaCombined.SetChar(uSeqIndexA, uColIndexCombined, '-'); for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); } ++uColIndexCombined; ++uColIndexB; } static void AppendUnalignedTerminals(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA, const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendUnalignedTerminals ColIxA=%u ColIxB=%u ColIxCmb=%u\n", uColIndexA, uColIndexB, uColIndexCombined); #endif const unsigned uLengthA = msaA.GetColCount(); const unsigned uLengthB = msaB.GetColCount(); unsigned uNewColCount = 
uColCountA; if (uColCountB > uNewColCount) uNewColCount = uColCountB; for (unsigned n = 0; n < uColCountA; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA + n); c = UnalignChar(c); msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c); } } for (unsigned n = uColCountA; n < uNewColCount; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.'); } for (unsigned n = 0; n < uColCountB; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB + n); c = UnalignChar(c); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c); } } for (unsigned n = uColCountB; n < uNewColCount; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.'); } uColIndexCombined += uNewColCount; uColIndexA += uColCountA; uColIndexB += uColCountB; } static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB, unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n", uColIndexA, uColIndexB, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA); msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); } ++uColIndexA; ++uColIndexB; ++uColIndexCombined; } void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB, MSA &msaCombined) { msaCombined.Clear(); #if TRACE Log("AlignTwoMSAsGivenPathSW\n"); Log("Template A:\n"); msaA.LogMe(); Log("Template 
B:\n"); msaB.LogMe(); #endif const unsigned uColCountA = msaA.GetColCount(); const unsigned uColCountB = msaB.GetColCount(); const unsigned uSeqCountA = msaA.GetSeqCount(); const unsigned uSeqCountB = msaB.GetSeqCount(); msaCombined.SetSeqCount(uSeqCountA + uSeqCountB); // Copy sequence names into combined MSA for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA)); msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA)); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB)); msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB)); } unsigned uColIndexA = 0; unsigned uColIndexB = 0; unsigned uColIndexCombined = 0; const unsigned uEdgeCount = Path.GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); #if TRACE Log("\nEdge %u %c%u.%u\n", uEdgeIndex, Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); #endif const char cType = Edge.cType; const unsigned uPrefixLengthA = Edge.uPrefixLengthA; unsigned uColCountA = 0; if (uPrefixLengthA > 0) { const unsigned uNodeIndexA = uPrefixLengthA - 1; const unsigned uTplColIndexA = uNodeIndexA; if (uTplColIndexA > uColIndexA) uColCountA = uTplColIndexA - uColIndexA; } const unsigned uPrefixLengthB = Edge.uPrefixLengthB; unsigned uColCountB = 0; if (uPrefixLengthB > 0) { const unsigned uNodeIndexB = uPrefixLengthB - 1; const unsigned uTplColIndexB = uNodeIndexB; if (uTplColIndexB > uColIndexB) uColCountB = uTplColIndexB - uColIndexB; } AppendUnalignedTerminals(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); switch (cType) { case 'M': { assert(uPrefixLengthA > 0); assert(uPrefixLengthB > 0); const unsigned uColA = uPrefixLengthA - 1; const unsigned uColB = uPrefixLengthB - 1; 
assert(uColIndexA == uColA); assert(uColIndexB == uColB); AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } case 'D': { assert(uPrefixLengthA > 0); const unsigned uColA = uPrefixLengthA - 1; assert(uColIndexA == uColA); AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } case 'I': { assert(uPrefixLengthB > 0); const unsigned uColB = uPrefixLengthB - 1; assert(uColIndexB == uColB); AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } default: assert(false); } } unsigned uInsertColCountA = uColCountA - uColIndexA; unsigned uInsertColCountB = uColCountB - uColIndexB; AppendUnalignedTerminals(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB, uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); } muscle-3.8.31.orig/seqvect.h0000644000175000017500000000301611352261667015227 0ustar kratzcharles#ifndef SeqVect_h #define SeqVect_h #include #include "seq.h" typedef std::vector SeqVectBase; class SeqVect : public SeqVectBase { public: SeqVect() {} virtual ~SeqVect(); private: // Not implemented; prevent use of copy c'tor and assignment. 
SeqVect(const SeqVect &); SeqVect &operator=(const SeqVect &); public: void FromFile(TextFile &File) { FromFASTAFile(File); } void FromFASTAFile(TextFile &File); void ToFASTAFile(TextFile &File) const; void ToFile(TextFile &File) const { ToFASTAFile(File); } void PadToMSA(MSA &msa); void Copy(const SeqVect &rhs); void StripGaps(); void StripGapsAndWhitespace(); void ToUpper(); void Clear(); unsigned Length() const { return (unsigned) size(); } unsigned GetSeqCount() const { return (unsigned) size(); } void AppendSeq(const Seq &s); bool FindName(const char *ptrName, unsigned *ptruIndex) const; void LogMe() const; const char *GetSeqName(unsigned uSeqIndex) const; unsigned GetSeqId(unsigned uSeqIndex) const; unsigned GetSeqIdFromName(const char *Name) const; unsigned GetSeqLength(unsigned uSeqIndex) const; void SetSeqId(unsigned uSeqIndex, unsigned uId); Seq &GetSeq(unsigned uIndex); Seq &GetSeqById(unsigned uId); const Seq &GetSeq(unsigned uIndex) const; ALPHA GuessAlpha() const; void FixAlpha(); #ifndef _WIN32 reference at(size_type i) { return operator[](i); } const_reference at(size_type i) const { return operator[](i); } #endif }; #endif // SeqVect_h muscle-3.8.31.orig/maketree.cpp0000644000175000017500000000156011352261672015703 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "textfile.h" #include "tree.h" void DoMakeTree() { if (g_pstrInFileName == 0 || g_pstrOutFileName == 0) Quit("-maketree requires -in and -out "); SetStartTime(); SetSeqWeightMethod(g_SeqWeight1); TextFile MSAFile(g_pstrInFileName); MSA msa; msa.FromFile(MSAFile); unsigned uSeqCount = msa.GetSeqCount(); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. 
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); SetMuscleInputMSA(msa); Progress("%u sequences", uSeqCount); Tree tree; TreeFromMSA(msa, tree, g_Cluster2, g_Distance2, g_Root2); TextFile TreeFile(g_pstrOutFileName, true); tree.ToFile(TreeFile); Progress("Tree created"); } muscle-3.8.31.orig/dosp.cpp0000644000175000017500000000222311352261667015054 0ustar kratzcharles#include "muscle.h" #include "textfile.h" #include "msa.h" #include "objscore.h" #include "tree.h" #include "profile.h" void DoSP() { TextFile f(g_pstrSPFileName); MSA a; a.FromFile(f); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = a.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); a.FixAlpha(); SetPPScore(); const unsigned uSeqCount = a.GetSeqCount(); if (0 == uSeqCount) Quit("No sequences in input file %s", g_pstrSPFileName); MSA::SetIdCount(uSeqCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) a.SetSeqId(uSeqIndex, uSeqIndex); SetSeqWeightMethod(g_SeqWeight1); Tree tree; TreeFromMSA(a, tree, g_Cluster2, g_Distance2, g_Root2); SetMuscleTree(tree); SetMSAWeightsMuscle((MSA &) a); SCORE SP = ObjScoreSP(a); Log("File=%s;SP=%.4g\n", g_pstrSPFileName, SP); fprintf(stderr, "File=%s;SP=%.4g\n", g_pstrSPFileName, SP); } muscle-3.8.31.orig/seq.h0000644000175000017500000000351411352261636014344 0ustar kratzcharles#ifndef Seq_h #define Seq_h #include class TextFile; class MSA; typedef std::vector CharVect; class Seq : public CharVect { public: Seq() { m_ptrName = 0; // Start with moderate size to avoid // thrashing the heap. reserve(200); } virtual ~Seq() { delete[] m_ptrName; } private: // Not implemented; prevent use of copy c'tor and assignment. 
Seq(const Seq &); Seq &operator=(const Seq &); public: void Clear() { clear(); delete[] m_ptrName; m_ptrName = 0; m_uId = uInsane; } const char *GetName() const { return m_ptrName; } unsigned GetId() const { if (uInsane == m_uId) Quit("Seq::GetId, id not set"); return m_uId; } void SetId(unsigned uId) { m_uId = uId; } bool FromFASTAFile(TextFile &File); void ToFASTAFile(TextFile &File) const; void ExtractUngapped(MSA &msa) const; void FromString(const char *pstrSeq, const char *pstrName); void Copy(const Seq &rhs); void CopyReversed(const Seq &rhs); void StripGaps(); void StripGapsAndWhitespace(); void ToUpper(); void SetName(const char *ptrName); unsigned GetLetter(unsigned uIndex) const; unsigned Length() const { return (unsigned) size(); } bool Eq(const Seq &s) const; bool EqIgnoreCase(const Seq &s) const; bool EqIgnoreCaseAndGaps(const Seq &s) const; bool HasGap() const; unsigned GetUngappedLength() const; void LogMe() const; char GetChar(unsigned uIndex) const { return operator[](uIndex); } void SetChar(unsigned uIndex, char c) { operator[](uIndex) = c; } void AppendChar(char c) { push_back(c); } void FixAlpha(); #ifndef _WIN32 reference at(size_type i) { return operator[](i); } const_reference at(size_type i) const { return operator[](i); } #endif private: char *m_ptrName; unsigned m_uId; }; #endif // Seq.h muscle-3.8.31.orig/validateids.cpp0000644000175000017500000000617011352261673016402 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "tree.h" #include "seqvect.h" #if DEBUG static SeqVect *g_ptrMuscleSeqVect = 0; static MSA MuscleInputMSA; void SetMuscleInputMSA(MSA &msa) { MuscleInputMSA.Copy(msa); } void SetMuscleSeqVect(SeqVect &v) { g_ptrMuscleSeqVect = &v; } void ValidateMuscleIdsSeqVect(const MSA &msa) { const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const unsigned uId = msa.GetSeqId(uSeqIndex); const char *ptrNameMSA = msa.GetSeqName(uSeqIndex); const char *ptrName = 
g_ptrMuscleSeqVect->GetSeqName(uId); if (0 != strcmp(ptrNameMSA, ptrName)) Quit("ValidateMuscleIdsSeqVect, names don't match"); } } void ValidateMuscleIdsMSA(const MSA &msa) { const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const unsigned uId = msa.GetSeqId(uSeqIndex); const char *ptrNameMSA = msa.GetSeqName(uSeqIndex); const char *ptrName = MuscleInputMSA.GetSeqName(uId); if (0 != strcmp(ptrNameMSA, ptrName)) { Log("Input MSA:\n"); MuscleInputMSA.LogMe(); Log("MSA being tested:\n"); msa.LogMe(); Log("Id=%u\n", uId); Log("Input name=%s\n", ptrName); Log("Test name=%s\n", ptrNameMSA); Quit("ValidateMuscleIdsMSA, names don't match"); } } } void ValidateMuscleIds(const MSA &msa) { if (0 != g_ptrMuscleSeqVect) ValidateMuscleIdsSeqVect(msa); else if (0 != MuscleInputMSA.GetSeqCount()) ValidateMuscleIdsMSA(msa); else Quit("ValidateMuscleIds, ptrMuscleSeqVect=0 && 0 == MuscleInputMSA.SeqCount()"); } void ValidateMuscleIdsSeqVect(const Tree &tree) { const unsigned uSeqCount = g_ptrMuscleSeqVect->GetSeqCount(); const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!tree.IsLeaf(uNodeIndex)) continue; const unsigned uId = tree.GetLeafId(uNodeIndex); if (uId >= uSeqCount) { tree.LogMe(); Quit("Leaf with node index %u has id=%u, there are %u seqs", uNodeIndex, uId, uSeqCount); } const char *ptrNameTree = tree.GetLeafName(uNodeIndex); const char *ptrName = g_ptrMuscleSeqVect->GetSeqName(uId); if (0 != strcmp(ptrNameTree, ptrName)) Quit("ValidateMuscleIds: names don't match"); } } void ValidateMuscleIdsMSA(const Tree &tree) { const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!tree.IsLeaf(uNodeIndex)) continue; const unsigned uId = tree.GetLeafId(uNodeIndex); const char *ptrNameTree = tree.GetLeafName(uNodeIndex); const char *ptrName = MuscleInputMSA.GetSeqName(uId); if (0 != 
strcmp(ptrNameTree, ptrName)) Quit("ValidateMuscleIds: names don't match"); } } void ValidateMuscleIds(const Tree &tree) { if (0 != g_ptrMuscleSeqVect) ValidateMuscleIdsSeqVect(tree); else if (0 != MuscleInputMSA.GetSeqCount()) ValidateMuscleIdsMSA(tree); else Quit("ValidateMuscleIds, ptrMuscleSeqVect=0 && 0 == MuscleInputMSA.SeqCount"); } #endif muscle-3.8.31.orig/nucmx.cpp0000644000175000017500000000103411352261635015233 0ustar kratzcharles#include "muscle.h" // BLASTZ default parameters // open 400, extend 30, matrix as below const float NUC_EXTEND = 30; const float NUC_SP_CENTER = 2*NUC_EXTEND; #define v(x) ((float) x + NUC_SP_CENTER) #define ROW(A, C, G, T) \ { v(A), v(C), v(G), v(T) }, float NUC_SP[32][32] = { // A C G T ROW( 91, -114, -31, -123) // A ROW( -114, 100, -125, -31) // C ROW( -31, -125, 100, -114) // G ROW( -123, -31, -114, 91) // T }; muscle-3.8.31.orig/cluster.h0000644000175000017500000000445111352261600015225 0ustar kratzcharlesclass DistFunc; class ClusterNode { friend class ClusterTree; public: ClusterNode() { m_dWeight = 0.0; m_dWeight2 = 0.0; m_ptrLeft = 0; m_ptrRight = 0; m_ptrParent = 0; m_uIndex = 0; m_ptrPrevDisjoint = 0; m_ptrNextDisjoint = 0; } ~ClusterNode() {} public: unsigned GetIndex() const { return m_uIndex; } ClusterNode *GetLeft() const { return m_ptrLeft; } ClusterNode *GetRight() const { return m_ptrRight; } ClusterNode *GetParent() const { return m_ptrParent; } double GetWeight() const { return m_dWeight; } const ClusterNode *GetClusterLeaf(unsigned uLeafIndex) const; unsigned GetClusterSize() const; double GetClusterWeight() const; double GetLeftBranchWeight() const; double GetRightBranchWeight() const; double GetLeftWeight() const; double GetRightWeight() const; void LogMe() const; double GetWeight2() const { return m_dWeight2; } void SetWeight2(double dWeight2) { m_dWeight2 = dWeight2; } protected: void SetIndex(unsigned uIndex) { m_uIndex = uIndex; } void SetWeight(double dWeight) { m_dWeight = dWeight; } void 
SetLeft(ClusterNode *ptrLeft) { m_ptrLeft = ptrLeft; } void SetRight(ClusterNode *ptrRight) { m_ptrRight = ptrRight; } void SetParent(ClusterNode *ptrParent) { m_ptrParent = ptrParent; } void SetNextDisjoint(ClusterNode *ptrNode) { m_ptrNextDisjoint = ptrNode; } void SetPrevDisjoint(ClusterNode *ptrNode) { m_ptrPrevDisjoint = ptrNode; } ClusterNode *GetNextDisjoint() { return m_ptrNextDisjoint; } ClusterNode *GetPrevDisjoint() { return m_ptrPrevDisjoint; } private: double m_dWeight; double m_dWeight2; unsigned m_uIndex; ClusterNode *m_ptrLeft; ClusterNode *m_ptrRight; ClusterNode *m_ptrParent; ClusterNode *m_ptrNextDisjoint; ClusterNode *m_ptrPrevDisjoint; }; class ClusterTree { public: ClusterTree(); virtual ~ClusterTree(); void Create(const DistFunc &DF); ClusterNode *GetRoot() const; void LogMe() const; protected: void Join(ClusterNode *ptrNode1, ClusterNode *ptrNode2, ClusterNode *ptrJoin); void AddToDisjoints(ClusterNode *ptrNode); void DeleteFromDisjoints(ClusterNode *ptrNode); void Validate(unsigned uNodeCount); private: ClusterNode *m_ptrDisjoints; ClusterNode *m_Nodes; unsigned m_uNodeCount; unsigned m_uLeafCount; }; muscle-3.8.31.orig/msa2.cpp0000644000175000017500000003336011352261667014757 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "seqvect.h" #include "profile.h" #include "tree.h" // These global variables are a hack to allow the tree // dependent iteration code to communicate the edge // used to divide the tree. The three-way weighting // scheme needs to know this edge in order to compute // sequence weights. 
static const Tree *g_ptrMuscleTree = 0; unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR; unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR; void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize, FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd, FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc, FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const { const unsigned uSeqCount = GetSeqCount(); const unsigned uColCount = GetColCount(); memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT)); WEIGHT wTotal = 0; FCOUNT fGap = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const WEIGHT w = GetSeqWeight(uSeqIndex); if (IsGap(uSeqIndex, uColIndex)) { fGap += w; continue; } else if (IsWildcard(uSeqIndex, uColIndex)) { const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); switch (g_Alpha) { case ALPHA_Amino: switch (uLetter) { case AX_B: // D or N fcCounts[AX_D] += w/2; fcCounts[AX_N] += w/2; break; case AX_Z: // E or Q fcCounts[AX_E] += w/2; fcCounts[AX_Q] += w/2; break; default: // any { const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) fcCounts[uLetter] += f; break; } } break; case ALPHA_DNA: case ALPHA_RNA: switch (uLetter) { case AX_R: // G or A fcCounts[NX_G] += w/2; fcCounts[NX_A] += w/2; break; case AX_Y: // C or T/U fcCounts[NX_C] += w/2; fcCounts[NX_T] += w/2; break; default: // any const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) fcCounts[uLetter] += f; break; } break; default: Quit("Alphabet %d not supported", g_Alpha); } continue; } unsigned uLetter = GetLetter(uSeqIndex, uColIndex); fcCounts[uLetter] += w; wTotal += w; } *ptrfOcc = (float) (1.0 - fGap); if (bNormalize && wTotal > 0) { if (wTotal > 1.001) Quit("wTotal=%g\n", wTotal); for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) fcCounts[uLetter] /= wTotal; // AssertNormalized(fcCounts); } FCOUNT fcStartCount = 0; if (uColIndex == 0) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; 
++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcStartCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1)) fcStartCount += GetSeqWeight(uSeqIndex); } FCOUNT fcEndCount = 0; if (uColCount - 1 == uColIndex) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcEndCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1)) fcEndCount += GetSeqWeight(uSeqIndex); } FCOUNT LL = 0; FCOUNT LG = 0; FCOUNT GL = 0; FCOUNT GG = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { WEIGHT w = GetSeqWeight(uSeqIndex); bool bLetterHere = !IsGap(uSeqIndex, uColIndex); bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1)); if (bLetterHere) { if (bLetterPrev) LL += w; else GL += w; } else { if (bLetterPrev) LG += w; else GG += w; } } FCOUNT fcExtendCount = 0; if (uColIndex > 0 && uColIndex < GetColCount() - 1) for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) && IsGap(uSeqIndex, uColIndex + 1)) fcExtendCount += GetSeqWeight(uSeqIndex); *ptrfcLL = LL; *ptrfcLG = LG; *ptrfcGL = GL; *ptrfcGG = GG; *ptrfcGapStart = fcStartCount; *ptrfcGapEnd = fcEndCount; *ptrfcGapExtend = fcExtendCount; } // Return true if the given column has no gaps and all // its residues are in the same biochemical group. 
bool MSAColIsConservative(const MSA &msa, unsigned uColIndex) { extern unsigned ResidueGroup[]; const unsigned uSeqCount = msa.GetColCount(); if (0 == uSeqCount) Quit("MSAColIsConservative: empty alignment"); if (msa.IsGap(0, uColIndex)) return false; unsigned uLetter = msa.GetLetterEx(0, uColIndex); const unsigned uGroup = ResidueGroup[uLetter]; for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex) { if (msa.IsGap(uSeqIndex, uColIndex)) return false; uLetter = msa.GetLetter(uSeqIndex, uColIndex); if (ResidueGroup[uLetter] != uGroup) return false; } return true; } void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount, MSA &msaOut) { const unsigned uColCount = msaIn.GetColCount(); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex); msaOut.SetSeqName(uSeqIndex, ptrName); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex); msaOut.SetChar(uSeqIndex, uColIndex, c); } } } void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount, MSA &msaOut) { const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uInColCount = msaIn.GetColCount(); if (uFromColIndex + uColCount - 1 > uInColCount) Quit("MSAFromColRange, out of bounds"); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const char *ptrName = msaIn.GetSeqName(uSeqIndex); unsigned uId = msaIn.GetSeqId(uSeqIndex); msaOut.SetSeqName(uSeqIndex, ptrName); msaOut.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex); msaOut.SetChar(uSeqIndex, uColIndex, c); } } } void SeqVectFromMSA(const MSA &msa, SeqVect &v) { v.Clear(); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; 
uSeqIndex < uSeqCount; ++uSeqIndex) { Seq s; msa.GetSeq(uSeqIndex, s); s.StripGaps(); //if (0 == s.Length()) // continue; const char *ptrName = msa.GetSeqName(uSeqIndex); s.SetName(ptrName); v.AppendSeq(s); } } void DeleteGappedCols(MSA &msa) { unsigned uColIndex = 0; for (;;) { if (uColIndex >= msa.GetColCount()) break; if (msa.IsGapColumn(uColIndex)) msa.DeleteCol(uColIndex); else ++uColIndex; } } void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount, MSA &msaOut) { const unsigned uColCount = msaIn.GetColCount(); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut) { unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut]; const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); unsigned uId = msaIn.GetSeqId(uSeqIndexIn); msaOut.SetSeqName(uSeqIndexOut, ptrName); msaOut.SetSeqId(uSeqIndexOut, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); msaOut.SetChar(uSeqIndexOut, uColIndex, c); } } } void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2) { const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); if (uSeqCount1 != uSeqCount2) Quit("Seq count differs"); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex) { Seq seq1; msa1.GetSeq(uSeqIndex, seq1); unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); Seq seq2; msa2.GetSeq(uSeqIndex2, seq2); if (!seq1.EqIgnoreCaseAndGaps(seq2)) { Log("Input:\n"); seq1.LogMe(); Log("Output:\n"); seq2.LogMe(); Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex)); } } } void AssertMSAEq(const MSA &msa1, const MSA &msa2) { const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); if (uSeqCount1 != uSeqCount2) Quit("Seq count differs"); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex) { Seq seq1; msa1.GetSeq(uSeqIndex, seq1); 
unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); Seq seq2; msa2.GetSeq(uSeqIndex2, seq2); if (!seq1.Eq(seq2)) { Log("Input:\n"); seq1.LogMe(); Log("Output:\n"); seq2.LogMe(); Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex)); } } } void SetMSAWeightsMuscle(MSA &msa) { SEQWEIGHT Method = GetSeqWeightMethod(); switch (Method) { case SEQWEIGHT_None: msa.SetUniformWeights(); return; case SEQWEIGHT_Henikoff: msa.SetHenikoffWeights(); return; case SEQWEIGHT_HenikoffPB: msa.SetHenikoffWeightsPB(); return; case SEQWEIGHT_GSC: msa.SetGSCWeights(); return; case SEQWEIGHT_ClustalW: SetClustalWWeightsMuscle(msa); return; case SEQWEIGHT_ThreeWay: SetThreeWayWeightsMuscle(msa); return; } Quit("SetMSAWeightsMuscle, Invalid method=%d", Method); } static WEIGHT *g_MuscleWeights; static unsigned g_uMuscleIdCount; WEIGHT GetMuscleSeqWeightById(unsigned uId) { if (0 == g_MuscleWeights) Quit("g_MuscleWeights = 0"); if (uId >= g_uMuscleIdCount) Quit("GetMuscleSeqWeightById(%u): count=%u", uId, g_uMuscleIdCount); return g_MuscleWeights[uId]; } void SetMuscleTree(const Tree &tree) { g_ptrMuscleTree = &tree; if (SEQWEIGHT_ClustalW != GetSeqWeightMethod()) return; delete[] g_MuscleWeights; const unsigned uLeafCount = tree.GetLeafCount(); g_uMuscleIdCount = uLeafCount; g_MuscleWeights = new WEIGHT[uLeafCount]; CalcClustalWWeights(tree, g_MuscleWeights); } void SetClustalWWeightsMuscle(MSA &msa) { if (0 == g_MuscleWeights) Quit("g_MuscleWeights = 0"); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const unsigned uId = msa.GetSeqId(uSeqIndex); if (uId >= g_uMuscleIdCount) Quit("SetClustalWWeightsMuscle: id out of range"); msa.SetSeqWeight(uSeqIndex, g_MuscleWeights[uId]); } msa.NormalizeWeights((WEIGHT) 1.0); } #define LOCAL_VERBOSE 0 void SetThreeWayWeightsMuscle(MSA &msa) { if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2) { msa.SetHenikoffWeightsPB(); return; } 
const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount(); WEIGHT *Weights = new WEIGHT[uMuscleSeqCount]; CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2, Weights); const unsigned uMSASeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex) { const unsigned uId = msa.GetSeqId(uSeqIndex); if (uId >= uMuscleSeqCount) Quit("SetThreeWayWeightsMuscle: id out of range"); msa.SetSeqWeight(uSeqIndex, Weights[uId]); } #if LOCAL_VERBOSE { Log("SetThreeWayWeightsMuscle\n"); for (unsigned n = 0; n < uMSASeqCount; ++n) { const unsigned uId = msa.GetSeqId(n); Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]); } } #endif msa.NormalizeWeights((WEIGHT) 1.0); delete[] Weights; } // Append msa2 at the end of msa1 void MSAAppend(MSA &msa1, const MSA &msa2) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } } // "Catenate" two MSAs (by bad analogy with UNIX cat command). // msa1 and msa2 must have same sequence names, but possibly // in a different order. // msaCat is the combined alignment produce by appending // sequences in msa2 to sequences in msa1. 
void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; msaCat.SetSize(uSeqCount, uColCountCat); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex) { const char c = msa1.GetChar(uSeqIndex, uColIndex); msaCat.SetChar(uSeqIndex, uColIndex, c); } const char *ptrSeqName = msa1.GetSeqName(uSeqIndex); unsigned uSeqIndex2; msaCat.SetSeqName(uSeqIndex, ptrSeqName); bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2); if (bFound) { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } else { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } } muscle-3.8.31.orig/glbalignla.cpp0000644000175000017500000002425211352261612016177 0ustar kratzcharles#include "muscle.h" #include "profile.h" #include "pwpath.h" #define OCC 1 struct DP_MEMORY { unsigned uLength; SCORE *GapOpenA; SCORE *GapOpenB; SCORE *GapCloseA; SCORE *GapCloseB; SCORE *MPrev; SCORE *MCurr; SCORE *MWork; SCORE *DPrev; SCORE *DCurr; SCORE *DWork; SCORE **ScoreMxB; #if OCC FCOUNT *OccA; FCOUNT *OccB; #endif unsigned **SortOrderA; unsigned *uDeletePos; FCOUNT **FreqsA; int **TraceBack; }; static struct DP_MEMORY DPM; static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) { // Max prefix length unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; if (uLength < DPM.uLength) return; // Add 256 to allow for future expansion and // round up to next multiple of 32. 
uLength += 256; uLength += 32 - uLength%32; const unsigned uOldLength = DPM.uLength; if (uOldLength > 0) { for (unsigned i = 0; i < uOldLength; ++i) { delete[] DPM.TraceBack[i]; delete[] DPM.FreqsA[i]; delete[] DPM.SortOrderA[i]; } for (unsigned n = 0; n < 20; ++n) delete[] DPM.ScoreMxB[n]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.uDeletePos; delete[] DPM.GapOpenA; delete[] DPM.GapOpenB; delete[] DPM.GapCloseA; delete[] DPM.GapCloseB; delete[] DPM.SortOrderA; delete[] DPM.FreqsA; delete[] DPM.ScoreMxB; delete[] DPM.TraceBack; #if OCC delete[] DPM.OccA; delete[] DPM.OccB; #endif } DPM.uLength = uLength; DPM.GapOpenA = new SCORE[uLength]; DPM.GapOpenB = new SCORE[uLength]; DPM.GapCloseA = new SCORE[uLength]; DPM.GapCloseB = new SCORE[uLength]; #if OCC DPM.OccA = new FCOUNT[uLength]; DPM.OccB = new FCOUNT[uLength]; #endif DPM.SortOrderA = new unsigned*[uLength]; DPM.FreqsA = new FCOUNT*[uLength]; DPM.ScoreMxB = new SCORE*[20]; DPM.MPrev = new SCORE[uLength]; DPM.MCurr = new SCORE[uLength]; DPM.MWork = new SCORE[uLength]; DPM.DPrev = new SCORE[uLength]; DPM.DCurr = new SCORE[uLength]; DPM.DWork = new SCORE[uLength]; DPM.uDeletePos = new unsigned[uLength]; DPM.TraceBack = new int*[uLength]; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) DPM.ScoreMxB[uLetter] = new SCORE[uLength]; for (unsigned i = 0; i < uLength; ++i) { DPM.SortOrderA[i] = new unsigned[20]; DPM.FreqsA[i] = new FCOUNT[20]; DPM.TraceBack[i] = new int[uLength]; } } SCORE GlobalAlignLA(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; AllocDPMem(uLengthA, uLengthB); SCORE *GapOpenA = DPM.GapOpenA; SCORE *GapOpenB = DPM.GapOpenB; SCORE *GapCloseA = DPM.GapCloseA; SCORE *GapCloseB = DPM.GapCloseB; unsigned **SortOrderA = DPM.SortOrderA; FCOUNT **FreqsA = DPM.FreqsA; SCORE **ScoreMxB = 
DPM.ScoreMxB; SCORE *MPrev = DPM.MPrev; SCORE *MCurr = DPM.MCurr; SCORE *MWork = DPM.MWork; SCORE *DPrev = DPM.DPrev; SCORE *DCurr = DPM.DCurr; SCORE *DWork = DPM.DWork; #if OCC FCOUNT *OccA = DPM.OccA; FCOUNT *OccB = DPM.OccB; #endif unsigned *uDeletePos = DPM.uDeletePos; int **TraceBack = DPM.TraceBack; for (unsigned i = 0; i < uLengthA; ++i) { GapOpenA[i] = PA[i].m_scoreGapOpen; GapCloseA[i] = PA[i].m_scoreGapClose; #if OCC OccA[i] = PA[i].m_fOcc; #endif for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; } } for (unsigned j = 0; j < uLengthB; ++j) { GapOpenB[j] = PB[j].m_scoreGapOpen; GapCloseB[j] = PB[j].m_scoreGapClose; #if OCC OccB[j] = PB[j].m_fOcc; #endif } for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { for (unsigned j = 0; j < uLengthB; ++j) ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; } for (unsigned i = 0; i < uPrefixCountA; ++i) memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); // Special case for i=0 unsigned **ptrSortOrderA = SortOrderA; FCOUNT **ptrFreqsA = FreqsA; assert(ptrSortOrderA == &(SortOrderA[0])); assert(ptrFreqsA == &(FreqsA[0])); TraceBack[0][0] = 0; SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][0]; } if (0 == scoreSum) MPrev[0] = -2.5; else { #if OCC MPrev[0] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[0]; #else MPrev[0] = (logf(scoreSum) - g_scoreCenter); #endif } // D(0,0) is -infinity (requires I->D). DPrev[0] = MINUS_INFINITY; for (unsigned j = 1; j < uLengthB; ++j) { // Only way to get M(0, j) looks like this: // A ----X // B XXXXX // 0 j // So gap-open at j=0, gap-close at j-1. 
SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][j]; } if (0 == scoreSum) MPrev[j] = -2.5; else { #if OCC MPrev[j] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[j] + GapOpenB[0] + GapCloseB[j-1]; #else MPrev[j] = (logf(scoreSum) - g_scoreCenter) + GapOpenB[0] + GapCloseB[j-1]; #endif } TraceBack[0][j] = -(int) j; // Assume no D->I transitions, then can't be a delete if only // one letter from A. DPrev[j] = MINUS_INFINITY; } SCORE IPrev_j_1; for (unsigned i = 1; i < uLengthA; ++i) { ++ptrSortOrderA; ++ptrFreqsA; assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); SCORE *ptrMCurr_j = MCurr; memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); const FCOUNT *FreqsAi = *ptrFreqsA; const unsigned *SortOrderAi = *ptrSortOrderA; const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20; const SCORE *ptrMCurrMax = MCurr + uLengthB; for (const unsigned *ptrSortOrderAi = SortOrderAi; ptrSortOrderAi != ptrSortOrderAiEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; SCORE *NSBR_Letter = ScoreMxB[uLetter]; const FCOUNT fcLetter = FreqsAi[uLetter]; if (0 == fcLetter) break; SCORE *ptrNSBR = NSBR_Letter; for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) *ptrMCurr += fcLetter*(*ptrNSBR++); } #if OCC const FCOUNT OccAi = OccA[i]; #endif for (unsigned j = 0; j < uLengthB; ++j) { if (MCurr[j] == 0) MCurr[j] = -2.5; else #if OCC MCurr[j] = (logf(MCurr[j]) - g_scoreCenter)*OccAi*OccB[j]; #else MCurr[j] = (logf(MCurr[j]) - g_scoreCenter); #endif } ptrMCurr_j = MCurr; unsigned *ptrDeletePos = uDeletePos; // Special case for j=0 // Only way to get M(i, 0) looks like this: // 0 i // A XXXXX // B ----X // So gap-open at i=0, 
gap-close at i-1. assert(ptrMCurr_j == &(MCurr[0])); *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; ++ptrMCurr_j; int *ptrTraceBack_ij = TraceBack[i]; *ptrTraceBack_ij++ = (int) i; SCORE *ptrMPrev_j = MPrev; SCORE *ptrDPrev = DPrev; SCORE d = *ptrDPrev; SCORE DNew = *ptrMPrev_j + GapOpenA[i]; if (DNew > d) { d = DNew; *ptrDeletePos = i; } SCORE *ptrDCurr = DCurr; assert(ptrDCurr == &(DCurr[0])); *ptrDCurr = d; // Can't have an insert if no letters from B IPrev_j_1 = MINUS_INFINITY; unsigned uInsertPos; const SCORE scoreGapOpenAi = GapOpenA[i]; const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; for (unsigned j = 1; j < uLengthB; ++j) { // Here, MPrev_j is preserved from previous // iteration so with current i,j is M[i-1][j-1] SCORE MPrev_j = *ptrMPrev_j; SCORE INew = MPrev_j + GapOpenB[j]; if (INew > IPrev_j_1) { IPrev_j_1 = INew; uInsertPos = j; } SCORE scoreMax = MPrev_j; assert(ptrDPrev == &(DPrev[j-1])); SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; if (scoreD > scoreMax) { scoreMax = scoreD; assert(ptrDeletePos == &(uDeletePos[j-1])); *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; assert(*ptrTraceBack_ij > 0); } ++ptrDeletePos; SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; if (scoreI > scoreMax) { scoreMax = scoreI; *ptrTraceBack_ij = (int) uInsertPos - (int) j; assert(*ptrTraceBack_ij < 0); } assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); *ptrMCurr_j += scoreMax; assert(ptrMCurr_j == &(MCurr[j])); ++ptrMCurr_j; MPrev_j = *(++ptrMPrev_j); assert(ptrDPrev == &(DPrev[j])); SCORE d = *ptrDPrev; SCORE DNew = MPrev_j + scoreGapOpenAi; if (DNew > d) { d = DNew; assert(ptrDeletePos == &uDeletePos[j]); *ptrDeletePos = i; } assert(ptrDCurr + 1 == &(DCurr[j])); *(++ptrDCurr) = d; ++ptrTraceBack_ij; } Rotate(MPrev, MCurr, MWork); Rotate(DPrev, DCurr, DWork); } // Special case for i=uLengthA SCORE IPrev = MINUS_INFINITY; unsigned uInsertPos; for (unsigned j = 1; j < uLengthB; ++j) { SCORE INew = MPrev[j-1] + GapOpenB[j]; if (INew > IPrev) { 
// Number of distinct 6-tuple codes. Tuples are encoded in base 6 (see
// GetTuple), so codes lie in [0, 6^6) even though only the residue groups
// 0..4 are produced by ResidueGroup.
const unsigned TUPLE_COUNT = 6*6*6*6*6*6;

// Per-tuple occurrence counters for the two sequences being compared.
static unsigned char Count1[TUPLE_COUNT];
static unsigned char Count2[TUPLE_COUNT];

// Largest value a tuple counter can hold; counters saturate here.
static const unsigned char MAX_TUPLE_COUNTER = (unsigned char) ~0u;

// Nucleotide groups according to MAFFT (sextet5)
// 0 = A
// 1 = C
// 2 = G
// 3 = T
// 4 = other
static unsigned ResidueGroup[] =
{
    0,  // NX_A,
    1,  // NX_C,
    2,  // NX_G,
    3,  // NX_T/U
    4,  // NX_N,
    4,  // NX_R,
    4,  // NX_Y,
    4,  // NX_GAP
};
static unsigned uResidueGroupCount =
  sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);

// Format tuple code t as six base-6 digits, most significant first.
// Returns a pointer to a static buffer that is overwritten by the next
// call, so the result must be used (or copied) before calling again.
static char *TupleToStr(int t)
{
    static char s[7];   // six digits plus terminating NUL

    for (int i = 5; i >= 0; --i)
    {
        s[i] = (char) ('0' + t%6);
        t /= 6;
    }
    s[6] = '\0';
    return s;
}

// Encode the six letters uLetters[n] .. uLetters[n+5] as one tuple code.
// Each letter is mapped to its residue group and the groups are combined
// as base-6 digits, with uLetters[n] as the most significant digit.
// Every letter must be a valid index into ResidueGroup (asserted).
static unsigned GetTuple(const unsigned uLetters[], unsigned n)
{
    unsigned uTuple = 0;
    for (unsigned i = 0; i < 6; ++i)
    {
        const unsigned uLetter = uLetters[n + i];
        assert(uLetter < uResidueGroupCount);
        uTuple = uTuple*6 + ResidueGroup[uLetter];
    }
    return uTuple;
}

// Count how often each tuple code occurs among the uTupleCount tuples
// that start at positions 0 .. uTupleCount-1 of L. L must therefore hold
// at least uTupleCount + 5 letters when uTupleCount > 0.
// Count must have TUPLE_COUNT entries; it is cleared first.
// Counters are 8 bits wide and saturate at MAX_TUPLE_COUNTER rather than
// wrapping back to 0, which would make a very frequent tuple look absent.
static void CountTuples(const unsigned L[], unsigned uTupleCount,
  unsigned char Count[])
{
    memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));
    for (unsigned n = 0; n < uTupleCount; ++n)
    {
        const unsigned uTuple = GetTuple(L, n);
        if (Count[uTuple] != MAX_TUPLE_COUNTER)
            ++(Count[uTuple]);
    }
}
Letters[uSeq1]; CountTuples(L, uTupleCount, Count1); #if TRACE { Log("Seq1=%d\n", uSeq1); Log("Groups:\n"); for (unsigned n = 0; n < uSeqLength1; ++n) Log("%u", ResidueGroup[L[n]]); Log("\n"); Log("Tuples:\n"); ListCount(Count1); } #endif SetProgressDesc("K-mer dist pass 1"); for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2) { if (0 == uCount%500) Progress(uCount, uPairCount); ++uCount; Seq &seq2 = *(v[uSeq2]); const unsigned uSeqLength2 = seq2.Length(); if (uSeqLength2 < 5) { if (uSeq1 == uSeq2) DF.SetDist(uSeq1, uSeq2, 0); else DF.SetDist(uSeq1, uSeq2, 1); continue; } // First pass through seq 2 to count tuples const unsigned uTupleCount = uSeqLength2 - 5; const unsigned *L = Letters[uSeq2]; CountTuples(L, uTupleCount, Count2); #if TRACE Log("Seq2=%d Counts=\n", uSeq2); ListCount(Count2); #endif // Second pass to accumulate sum of shared tuples // MAFFT defines this as the sum over unique tuples // in seq2 of the minimum of the number of tuples found // in the two sequences. unsigned uSum = 0; for (unsigned n = 0; n < uTupleCount; ++n) { const unsigned uTuple = GetTuple(L, n); uSum += MIN(Count1[uTuple], Count2[uTuple]); // This is a hack to make sure each unique tuple counted only once. 
Count2[uTuple] = 0; } #if TRACE { Seq &s1 = *(v[uSeq1]); Seq &s2 = *(v[uSeq2]); const char *pName1 = s1.GetName(); const char *pName2 = s2.GetName(); Log("Common count %s(%d) - %s(%d) =%u\n", pName1, uSeq1, pName2, uSeq2, uSum); } #endif uCommonTupleCount[uSeq1][uSeq2] = uSum; uCommonTupleCount[uSeq2][uSeq1] = uSum; } } ProgressStepsDone(); uCount = 0; SetProgressDesc("K-mer dist pass 2"); for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { Seq &s1 = *(v[uSeq1]); const char *pName1 = s1.GetName(); double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1]; if (0 == dCommonTupleCount11) dCommonTupleCount11 = 1; DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { if (0 == uCount%500) Progress(uCount, uPairCount); ++uCount; double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2]; if (0 == dCommonTupleCount22) dCommonTupleCount22 = 1; const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2]) /dCommonTupleCount11; const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2]) /dCommonTupleCount22; // dMinDist is the value used for tree-building in MAFFT const double dMinDist = MIN(dDist1, dDist2); DF.SetDist(uSeq1, uSeq2, (float) dMinDist); //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist); //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId); // **** TODO **** why does this make score slightly worse?? 
#if DOUBLE_AFFINE

#define TRACE 0
#define TEST_SPFAST 0

// Affine penalty for a gap of uLength columns with open score g and
// extend score e: g + (uLength - 1)*e.
// Term is currently ignored; the terminal-gap variants below are
// disabled.
static SCORE GapPenalty(unsigned uLength, bool Term, SCORE g, SCORE e)
{
    //if (Term)
    //  {
    //  switch (g_TermGap)
    //      {
    //  case TERMGAP_Full:
    //      return g + (uLength - 1)*e;
    //  case TERMGAP_Half:
    //      return g/2 + (uLength - 1)*e;
    //  case TERMGAP_Ext:
    //      return uLength*e;
    //      }
    //  Quit("Bad termgap");
    //  }
    //else
    //  return g + (uLength - 1)*e;
    //return MINUS_INFINITY;
    return g + (uLength - 1)*e;
}

// Double-affine gap penalty: the larger of the penalties obtained from
// the two (open, extend) parameter pairs.
static SCORE GapPenalty(unsigned uLength, bool Term)
{
    SCORE s1 = GapPenalty(uLength, Term, g_scoreGapOpen, g_scoreGapExtend);
#if DOUBLE_AFFINE
    SCORE s2 = GapPenalty(uLength, Term, g_scoreGapOpen2, g_scoreGapExtend2);
    if (s1 > s2)
        return s1;
    return s2;
#else
    return s1;
#endif
}

// The pair currently being scored by ScoreSeqPair; read by LogGap.
static const MSA *g_ptrMSA1;
static const MSA *g_ptrMSA2;
static unsigned g_uSeqIndex1;
static unsigned g_uSeqIndex2;

// Trace helper: log a picture of the gap spanning columns uStart..uEnd of
// the current pair ('-' where exactly one sequence has a gap, '.' where
// both do) followed by its length, terminal flags and penalty.
// Quits if a column in the range has no gap in either sequence, or if
// the number of '-' columns differs from uGapLength.
static void LogGap(unsigned uStart, unsigned uEnd, unsigned uGapLength,
  bool bNTerm, bool bCTerm)
{
    Log("%16.16s ", "");
    for (unsigned i = 0; i < uStart; ++i)
        Log(" ");
    unsigned uMyLength = 0;
    for (unsigned i = uStart; i <= uEnd; ++i)
    {
        bool bGap1 = g_ptrMSA1->IsGap(g_uSeqIndex1, i);
        bool bGap2 = g_ptrMSA2->IsGap(g_uSeqIndex2, i);
        if (!bGap1 && !bGap2)
            Quit("Error -- neither gapping");
        if (bGap1 && bGap2)
            Log(".");
        else
        {
            ++uMyLength;
            Log("-");
        }
    }
    SCORE s = GapPenalty(uGapLength, bNTerm || bCTerm);
    Log(" L=%d N%d C%d s=%.3g", uGapLength, bNTerm, bCTerm, s);
    Log("\n");
    if (uMyLength != uGapLength)
        Quit("Lengths differ");
}

// Score the pairwise alignment induced by row uSeqIndex1 of msa1 and row
// uSeqIndex2 of msa2. Both alignments must have the same column count
// (Quits otherwise).
// *ptrLetters receives the sum of substitution scores over columns where
// neither row has a gap or wildcard; *ptrGaps receives the sum of gap
// penalties. Returns their total.
static SCORE ScoreSeqPair(const MSA &msa1, unsigned uSeqIndex1,
  const MSA &msa2, unsigned uSeqIndex2, SCORE *ptrLetters, SCORE *ptrGaps)
{
    g_ptrMSA1 = &msa1;
    g_ptrMSA2 = &msa2;
    g_uSeqIndex1 = uSeqIndex1;
    g_uSeqIndex2 = uSeqIndex2;

    const unsigned uColCount = msa1.GetColCount();
    const unsigned uColCount2 = msa2.GetColCount();
    if (uColCount != uColCount2)
        Quit("ScoreSeqPair, different lengths");

#if TRACE
    Log("ScoreSeqPair\n");
    Log("%16.16s ", msa1.GetSeqName(uSeqIndex1));
    for (unsigned i = 0; i < uColCount; ++i)
        Log("%c", msa1.GetChar(uSeqIndex1, i));
    Log("\n");
    Log("%16.16s ", msa2.GetSeqName(uSeqIndex2));
    // The second row comes from msa2 (its name is already taken from
    // msa2 above).
    for (unsigned i = 0; i < uColCount; ++i)
        Log("%c", msa2.GetChar(uSeqIndex2, i));
    Log("\n");
#endif

    SCORE scoreTotal = 0;

    // Substitution scores
    unsigned uFirstLetter1 = uInsane;
    unsigned uFirstLetter2 = uInsane;
    unsigned uLastLetter1 = uInsane;
    unsigned uLastLetter2 = uInsane;
    for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
    {
        bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
        bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
        bool bWildcard1 = msa1.IsWildcard(uSeqIndex1, uColIndex);
        bool bWildcard2 = msa2.IsWildcard(uSeqIndex2, uColIndex);

        // Track the first and last non-gap column of each row; used
        // below to recognize terminal gaps.
        if (!bGap1)
        {
            if (uInsane == uFirstLetter1)
                uFirstLetter1 = uColIndex;
            uLastLetter1 = uColIndex;
        }
        if (!bGap2)
        {
            if (uInsane == uFirstLetter2)
                uFirstLetter2 = uColIndex;
            uLastLetter2 = uColIndex;
        }

        if (bGap1 || bGap2 || bWildcard1 || bWildcard2)
            continue;

        unsigned uLetter1 = msa1.GetLetter(uSeqIndex1, uColIndex);
        unsigned uLetter2 = msa2.GetLetter(uSeqIndex2, uColIndex);

        SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2];
        scoreTotal += scoreMatch;
#if TRACE
        Log("%c <-> %c = %7.1f %10.1f\n",
          msa1.GetChar(uSeqIndex1, uColIndex),
          msa2.GetChar(uSeqIndex2, uColIndex),
          scoreMatch,
          scoreTotal);
#endif
    }
    *ptrLetters = scoreTotal;

    // Gap penalties
    unsigned uGapLength = uInsane;
    unsigned uGapStartCol = uInsane;
    bool bGapping1 = false;
    bool bGapping2 = false;
    for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
    {
        bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
        bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);

        // Columns where both rows are gapped do not count for this pair.
        if (bGap1 && bGap2)
            continue;

        if (bGapping1)
        {
            if (bGap1)
                ++uGapLength;
            else
            {
                bGapping1 = false;
                bool bNTerm = (uFirstLetter2 == uGapStartCol);
                bool bCTerm = (uLastLetter2 + 1 == uColIndex);
                SCORE scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm);
                scoreTotal += scoreGap;
#if TRACE
                LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm);
                Log("GAP %7.1f %10.1f\n",
                  scoreGap, scoreTotal);
#endif
            }
            continue;
        }
        else
        {
            if (bGap1)
            {
                uGapStartCol = uColIndex;
                bGapping1 = true;
                uGapLength = 1;
                continue;
            }
        }

        if (bGapping2)
        {
            if (bGap2)
                ++uGapLength;
            else
            {
                bGapping2 = false;
                bool bNTerm = (uFirstLetter1 == uGapStartCol);
                bool bCTerm = (uLastLetter1 + 1 == uColIndex);
                SCORE scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm);
                scoreTotal += scoreGap;
#if TRACE
                LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm);
                Log("GAP %7.1f %10.1f\n",
                  scoreGap, scoreTotal);
#endif
            }
        }
        else
        {
            if (bGap2)
            {
                uGapStartCol = uColIndex;
                bGapping2 = true;
                uGapLength = 1;
            }
        }
    }

    // A gap still open at the last column is scored as terminal.
    if (bGapping1 || bGapping2)
    {
        SCORE scoreGap = GapPenalty(uGapLength, true);
        scoreTotal += scoreGap;
#if TRACE
        LogGap(uGapStartCol, uColCount - 1, uGapLength, false, true);
        Log("GAP %7.1f %10.1f\n",
          scoreGap, scoreTotal);
#endif
    }
    *ptrGaps = scoreTotal - *ptrLetters;
    return scoreTotal;
}

// The usual sum-of-pairs objective score: sum the score
// of the alignment of each pair of sequences.
// Each pair's score is multiplied by the product of the two sequence
// weights. *ptrLetters and *ptrGaps receive the weighted substitution
// and gap components; the weighted total is returned.
SCORE ObjScoreDA(const MSA &msa, SCORE *ptrLetters, SCORE *ptrGaps)
{
    const unsigned uSeqCount = msa.GetSeqCount();
    SCORE scoreTotal = 0;

#if TRACE
    msa.LogMe();
    Log(" Score Weight Weight Total\n");
    Log("---------- ------ ------ ----------\n");
#endif

    SCORE TotalLetters = 0;
    SCORE TotalGaps = 0;
    for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
    {
        const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1);
        for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;
          ++uSeqIndex2)
        {
            const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2);
            const WEIGHT w = w1*w2;

            SCORE Letters;
            SCORE Gaps;
            SCORE scorePair = ScoreSeqPair(msa, uSeqIndex1, msa, uSeqIndex2,
              &Letters, &Gaps);

            scoreTotal += w*scorePair;
            TotalLetters += w*Letters;
            TotalGaps += w*Gaps;
#if TRACE
            Log("%10.2f %6.3f %6.3f %10.2f %d=%s %d=%s\n",
              scorePair,
              w1,
              w2,
              scorePair*w1*w2,
              uSeqIndex1,
              msa.GetSeqName(uSeqIndex1),
              uSeqIndex2,
              msa.GetSeqName(uSeqIndex2));
#endif
        }
    }
    *ptrLetters = TotalLetters;
    *ptrGaps = TotalGaps;
    return scoreTotal;
}

#endif  // DOUBLE_AFFINE
//typedef unsigned WEIGHT; typedef BASETYPE WEIGHT; // Type for a fractional weighted count stored as n*WEIGHT/N // where n=measured count (integer >= 0) and N is total for // the distribution (e.g., n=number of residues of a given // type in a column, N=number of residues in the column). // Hence values in an FCOUNT variable range from 0..INTSCALE // as an integer, representing "true" values 0.0 to 1.0. //typedef unsigned FCOUNT; typedef BASETYPE FCOUNT; // Representation of -infinity. Value should // be large and negative, but not so large // that adding a few of them overflows. // TODO: Multiplied by 10 to work around bug // when aligning Bali 1ckaA in ref4, which is // so long that B->Mmax got to -infinity, causing // traceback to fail. //const int MINUS_INFINITY = -10000000; const BASETYPE MINUS_INFINITY = (BASETYPE) -1e37; const BASETYPE PLUS_INFINITY = (BASETYPE) 1e37; // Probability relative to a null model typedef double RPROB; PROB ScoreToProb(SCORE Score); SCORE ProbToScore(PROB Prob); SCORE DoubleToScore(double d); WEIGHT DoubleToWeight(double d); double WeightToDouble(WEIGHT w); SCORE MulScoreWeight(SCORE Score, WEIGHT Weight); bool ScoreEq(SCORE s1, SCORE s2); bool BTEq(double b1, double b2); static double ScoreToDouble(SCORE Score) { return (double) Score / (double) INTSCALE; } #if 0 // In-line assembler for Result = (x*y)/z // Note that imul and idiv will do 64-bit arithmetic // on 32-bit operands, so this shouldn't overflow // Can't write this efficiently in C/C++ (would // often overlow 32 bits). 
#define MulDivAssign(Result, x, y, z) \ { \ int X = (x); \ int Y = (y); \ int Z = (z); \ _asm mov eax,X \ _asm imul Y \ _asm mov ecx,Z \ _asm idiv ecx \ _asm mov Result,eax \ } #else #define MulDivAssign(Result, x, y, z) Result = (((x)*(y))/(z)) #endif #define MulScoreWeight(r, s, w) MulDivAssign(r, s, w, INTSCALE) #define MulWeightWCount(r, wt, wc) MulDivAssign(r, wt, wc, INTSCALE) #define MulFCountScore(r, fc, sc) MulDivAssign(r, fc, sc, INTSCALE) #if _DEBUG static inline SCORE Add2(SCORE a, SCORE b) { if (MINUS_INFINITY == a) return MINUS_INFINITY; if (MINUS_INFINITY == b) return MINUS_INFINITY; SCORE sum = a + b; if (sum < MINUS_INFINITY) return MINUS_INFINITY; // assert(sum < OVERFLOW_WARN); return sum; } static inline SCORE Add3(SCORE a, SCORE b, SCORE c) { return Add2(Add2(a, b), c); } static inline SCORE Add4(SCORE a, SCORE b, SCORE c, SCORE d) { return Add2(Add2(a, b), Add2(c, d)); } static inline SCORE Add5(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e) { return Add3(Add2(a, b), Add2(c, d), e); } static inline SCORE Add6(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e, SCORE f) { return Add3(Add2(a, b), Add2(c, d), Add2(e, f)); } static inline SCORE Add7(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e, SCORE f, SCORE g) { return Add4(Add2(a, b), Add2(c, d), Add2(e, f), g); } static inline SCORE Mul2(SCORE a, SCORE b) { if (MINUS_INFINITY == a) return MINUS_INFINITY; if (MINUS_INFINITY == b) return MINUS_INFINITY; //__int64 prod = (__int64) a * (__int64) b; //assert((SCORE) prod == prod); //return (SCORE) prod; return a*b; } static inline SCORE Sub2(SCORE a, SCORE b) { if (MINUS_INFINITY == a) return MINUS_INFINITY; if (MINUS_INFINITY == b) return MINUS_INFINITY; SCORE diff = a - b; if (diff < MINUS_INFINITY) return MINUS_INFINITY; // assert(diff < OVERFLOW_WARN); return diff; } static inline SCORE Div2(SCORE a, int b) { if (MINUS_INFINITY == a) return MINUS_INFINITY; return a/b; } //static inline SCORE MulScoreWeight(SCORE s, WEIGHT w) // { // SCORE Prod = 
s*(SCORE) w; // assert(Prod < OVERFLOW_WARN); // extern void Log(const char Format[], ...); // if (Prod/(SCORE) w != s) // Log("**WARRNING MulScoreWeight Prod=%d w=%d Prod/w=%d s=%d\n", // Prod, // w, // Prod/(SCORE) w, // s); // assert(Prod/ (SCORE) w == s); // return Prod/INTSCALE; // } // //static inline WCOUNT MulWeightWCount(WEIGHT wt, WCOUNT wc) // { // return (wt*wc)/INTSCALE; // } #else #define Add2(a, b) ((a) + (b)) #define Sub2(a, b) ((MINUS_INFINITY == (a)) ? MINUS_INFINITY : ((a) - (b))) #define Div2(a, b) ((MINUS_INFINITY == (a)) ? MINUS_INFINITY : ((a) / (b))) #define Add3(a, b, c) ((a) + (b) + (c)) #define Add4(a, b, c, d) ((a) + (b) + (c) + (d)) #define Add5(a, b, c, d, e) ((a) + (b) + (c) + (d) + (e)) #define Add6(a, b, c, d, e, f) ((a) + (b) + (c) + (d) + (e) + (f)) #define Add7(a, b, c, d, e, f, g) ((a) + (b) + (c) + (d) + (e) + (f) + (g)) //#define MulScoreWeight(s, w) (((s)*(SCORE) (w))/INTSCALE) #define Mul2(a, b) ((a)*(b)) #endif //static inline SCORE MulFCountScore(FCOUNT fc, SCORE sc) // { //// Fast way to say "if (fc >= 2^15 || sc >= 2^15)": // if ((fc | sc) & 0xffff1000) // { // SCORE Score = ((fc+5)/10)*sc; // assert(Score < assert); // OVERFLOW_WARN(Score > MINUS_INFINITY); // return Score/(INTSCALE/10); // } // SCORE Score = fc*sc; // assert(Score < OVERFLOW_WARN); // assert(Score > MINUS_INFINITY); // return Score/INTSCALE; // } #endif // IntMath_h muscle-3.8.31.orig/phytofile.cpp0000644000175000017500000000406311352261600016101 0ustar kratzcharles#include "muscle.h" #include "tree.h" #include "textfile.h" unsigned Tree::GetAnyNonLeafNode() const { for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) if (!IsLeaf(uNodeIndex)) return uNodeIndex; return NULL_NEIGHBOR; } void Tree::ToFile(TextFile &File) const { if (IsRooted()) { ToFileNodeRooted(File, m_uRootNodeIndex); File.PutString(";\n"); return; } // Unrooted. 
unsigned uNodeIndex = GetAnyNonLeafNode(); File.PutString("(\n"); ToFileNodeUnrooted(File, m_uNeighbor1[uNodeIndex], uNodeIndex); File.PutString(",\n"); ToFileNodeUnrooted(File, m_uNeighbor2[uNodeIndex], uNodeIndex); File.PutString(",\n"); ToFileNodeUnrooted(File, m_uNeighbor3[uNodeIndex], uNodeIndex); File.PutString(");\n"); } void Tree::ToFileNodeUnrooted(TextFile &File, unsigned uNodeIndex, unsigned uParent) const { assert(!IsRooted()); bool bGroup = !IsLeaf(uNodeIndex); if (bGroup) File.PutString("(\n"); if (IsLeaf(uNodeIndex)) File.PutString(GetName(uNodeIndex)); else { ToFileNodeUnrooted(File, GetFirstNeighbor(uNodeIndex, uParent), uNodeIndex); File.PutString(",\n"); ToFileNodeUnrooted(File, GetSecondNeighbor(uNodeIndex, uParent), uNodeIndex); } if (bGroup) File.PutString(")"); if (HasEdgeLength(uNodeIndex, uParent)) File.PutFormat(":%g", GetEdgeLength(uNodeIndex, uParent)); File.PutString("\n"); } void Tree::ToFileNodeRooted(TextFile &File, unsigned uNodeIndex) const { assert(IsRooted()); bool bGroup = !IsLeaf(uNodeIndex) || IsRoot(uNodeIndex); if (bGroup) File.PutString("(\n"); if (IsLeaf(uNodeIndex)) File.PutString(GetName(uNodeIndex)); else { ToFileNodeRooted(File, GetLeft(uNodeIndex)); File.PutString(",\n"); ToFileNodeRooted(File, GetRight(uNodeIndex)); } if (bGroup) File.PutString(")"); if (!IsRoot(uNodeIndex)) { unsigned uParent = GetParent(uNodeIndex); if (HasEdgeLength(uNodeIndex, uParent)) File.PutFormat(":%g", GetEdgeLength(uNodeIndex, uParent)); } File.PutString("\n"); } muscle-3.8.31.orig/makerootmsab.cpp0000644000175000017500000000400511352261667016573 0ustar kratzcharles#include "muscle.h" #include "tree.h" #include "profile.h" #include "msa.h" #include "seqvect.h" #include "pwpath.h" static void DoSeq(Seq &s, unsigned uSeqIndex, const ProfPos *RootProf, unsigned uRootProfLength, MSA &msaOut) { MSA msaSeq; msaSeq.FromSeq(s); const unsigned uSeqLength = s.Length(); MSA msaDummy; msaDummy.SetSize(1, uRootProfLength); msaDummy.SetSeqId(0, 0); 
msaDummy.SetSeqName(0, "Dummy0"); for (unsigned uColIndex = 0; uColIndex < uRootProfLength; ++uColIndex) msaDummy.SetChar(0, uColIndex, '?'); ProfPos *SeqProf = ProfileFromMSA(msaSeq); for (unsigned uColIndex = 0; uColIndex < uSeqLength; ++uColIndex) { ProfPos &PP = SeqProf[uColIndex]; PP.m_scoreGapOpen = MINUS_INFINITY; PP.m_scoreGapClose = MINUS_INFINITY; } ProfPos *ProfOut; unsigned uLengthOut; PWPath Path; AlignTwoProfs(SeqProf, uSeqLength, 1.0, RootProf, uRootProfLength, 1.0, Path, &ProfOut, &uLengthOut); assert(uLengthOut = uRootProfLength); delete[] ProfOut; MSA msaCombined; AlignTwoMSAsGivenPath(Path, msaSeq, msaDummy, msaCombined); msaCombined.LogMe(); msaOut.SetSeqName(uSeqIndex, s.GetName()); msaOut.SetSeqId(uSeqIndex, s.GetId()); for (unsigned uColIndex = 0; uColIndex < uRootProfLength; ++uColIndex) msaOut.SetChar(uSeqIndex, uColIndex, msaCombined.GetChar(0, uColIndex)); } // Steven Brenner's O(NL^2) proposal for creating a root alignment // Align each sequence to the profile at the root. // Compare the e-string solution, which is O(NL log N). 
void MakeRootMSABrenner(SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a) { const unsigned uSeqCount = v.Length(); const unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const ProfPos *RootProfile = Nodes[uRootNodeIndex].m_Prof; const unsigned uRootColCount = Nodes[uRootNodeIndex].m_uLength; a.SetSize(uSeqCount, uRootColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) DoSeq(*v[uSeqIndex], uSeqIndex, RootProfile, uRootColCount, a); } muscle-3.8.31.orig/glbalignspn.cpp0000644000175000017500000002375511352261667016424 0ustar kratzcharles#include "muscle.h" #include "profile.h" #include "pwpath.h" struct DP_MEMORY { unsigned uLength; SCORE *GapOpenA; SCORE *GapOpenB; SCORE *GapCloseA; SCORE *GapCloseB; SCORE *MPrev; SCORE *MCurr; SCORE *MWork; SCORE *DPrev; SCORE *DCurr; SCORE *DWork; SCORE **ScoreMxB; unsigned **SortOrderA; unsigned *uDeletePos; FCOUNT **FreqsA; int **TraceBack; }; static struct DP_MEMORY DPM; void FreeDPMemSPN() { const unsigned uOldLength = DPM.uLength; if (0 == uOldLength) return; for (unsigned i = 0; i < uOldLength; ++i) { delete[] DPM.TraceBack[i]; delete[] DPM.FreqsA[i]; delete[] DPM.SortOrderA[i]; } for (unsigned n = 0; n < 4; ++n) delete[] DPM.ScoreMxB[n]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.uDeletePos; delete[] DPM.GapOpenA; delete[] DPM.GapOpenB; delete[] DPM.GapCloseA; delete[] DPM.GapCloseB; delete[] DPM.SortOrderA; delete[] DPM.FreqsA; delete[] DPM.ScoreMxB; delete[] DPM.TraceBack; } static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) { // Max prefix length unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; if (uLength < DPM.uLength) return; // Add 256 to allow for future expansion and // round up to next multiple of 32. 
uLength += 256; uLength += 32 - uLength%32; const unsigned uOldLength = DPM.uLength; if (uOldLength > 0) { for (unsigned i = 0; i < uOldLength; ++i) { delete[] DPM.TraceBack[i]; delete[] DPM.FreqsA[i]; delete[] DPM.SortOrderA[i]; } for (unsigned n = 0; n < 4; ++n) delete[] DPM.ScoreMxB[n]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.uDeletePos; delete[] DPM.GapOpenA; delete[] DPM.GapOpenB; delete[] DPM.GapCloseA; delete[] DPM.GapCloseB; delete[] DPM.SortOrderA; delete[] DPM.FreqsA; delete[] DPM.ScoreMxB; delete[] DPM.TraceBack; } DPM.uLength = uLength; DPM.GapOpenA = new SCORE[uLength]; DPM.GapOpenB = new SCORE[uLength]; DPM.GapCloseA = new SCORE[uLength]; DPM.GapCloseB = new SCORE[uLength]; DPM.SortOrderA = new unsigned*[uLength]; DPM.FreqsA = new FCOUNT*[uLength]; DPM.ScoreMxB = new SCORE*[4]; DPM.MPrev = new SCORE[uLength]; DPM.MCurr = new SCORE[uLength]; DPM.MWork = new SCORE[uLength]; DPM.DPrev = new SCORE[uLength]; DPM.DCurr = new SCORE[uLength]; DPM.DWork = new SCORE[uLength]; DPM.uDeletePos = new unsigned[uLength]; DPM.TraceBack = new int*[uLength]; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) DPM.ScoreMxB[uLetter] = new SCORE[uLength]; for (unsigned i = 0; i < uLength; ++i) { DPM.SortOrderA[i] = new unsigned[4]; DPM.FreqsA[i] = new FCOUNT[4]; DPM.TraceBack[i] = new int[uLength]; } } SCORE GlobalAlignSPN(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { if (ALPHA_DNA != g_Alpha || ALPHA_RNA == g_Alpha) Quit("GlobalAlignSPN: must be nucleo"); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; AllocDPMem(uLengthA, uLengthB); SCORE *GapOpenA = DPM.GapOpenA; SCORE *GapOpenB = DPM.GapOpenB; SCORE *GapCloseA = DPM.GapCloseA; SCORE *GapCloseB = DPM.GapCloseB; unsigned **SortOrderA = DPM.SortOrderA; FCOUNT **FreqsA = DPM.FreqsA; SCORE **ScoreMxB = DPM.ScoreMxB; SCORE *MPrev = DPM.MPrev; SCORE 
*MCurr = DPM.MCurr; SCORE *MWork = DPM.MWork; SCORE *DPrev = DPM.DPrev; SCORE *DCurr = DPM.DCurr; SCORE *DWork = DPM.DWork; unsigned *uDeletePos = DPM.uDeletePos; int **TraceBack = DPM.TraceBack; for (unsigned i = 0; i < uLengthA; ++i) { GapOpenA[i] = PA[i].m_scoreGapOpen; GapCloseA[i] = PA[i].m_scoreGapClose; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) { SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; } } for (unsigned j = 0; j < uLengthB; ++j) { GapOpenB[j] = PB[j].m_scoreGapOpen; GapCloseB[j] = PB[j].m_scoreGapClose; } for (unsigned uLetter = 0; uLetter < 4; ++uLetter) { for (unsigned j = 0; j < uLengthB; ++j) ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; } for (unsigned i = 0; i < uPrefixCountA; ++i) memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); // Special case for i=0 unsigned **ptrSortOrderA = SortOrderA; FCOUNT **ptrFreqsA = FreqsA; assert(ptrSortOrderA == &(SortOrderA[0])); assert(ptrFreqsA == &(FreqsA[0])); TraceBack[0][0] = 0; SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 4; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][0]; } MPrev[0] = scoreSum - g_scoreCenter; // D(0,0) is -infinity (requires I->D). DPrev[0] = MINUS_INFINITY; for (unsigned j = 1; j < uLengthB; ++j) { // Only way to get M(0, j) looks like this: // A ----X // B XXXXX // 0 j // So gap-open at j=0, gap-close at j-1. 
SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 4; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][j]; } MPrev[j] = scoreSum - g_scoreCenter + GapOpenB[0] + GapCloseB[j-1]; TraceBack[0][j] = -(int) j; // Assume no D->I transitions, then can't be a delete if only // one letter from A. DPrev[j] = MINUS_INFINITY; } SCORE IPrev_j_1; for (unsigned i = 1; i < uLengthA; ++i) { ++ptrSortOrderA; ++ptrFreqsA; assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); SCORE *ptrMCurr_j = MCurr; memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); const FCOUNT *FreqsAi = *ptrFreqsA; const unsigned *SortOrderAi = *ptrSortOrderA; const unsigned *ptrSortOrderAiEnd = SortOrderAi + 4; const SCORE *ptrMCurrMax = MCurr + uLengthB; for (const unsigned *ptrSortOrderAi = SortOrderAi; ptrSortOrderAi != ptrSortOrderAiEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; SCORE *NSBR_Letter = ScoreMxB[uLetter]; const FCOUNT fcLetter = FreqsAi[uLetter]; if (0 == fcLetter) break; SCORE *ptrNSBR = NSBR_Letter; for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) *ptrMCurr += fcLetter*(*ptrNSBR++); } for (unsigned j = 0; j < uLengthB; ++j) MCurr[j] -= g_scoreCenter; ptrMCurr_j = MCurr; unsigned *ptrDeletePos = uDeletePos; // Special case for j=0 // Only way to get M(i, 0) looks like this: // 0 i // A XXXXX // B ----X // So gap-open at i=0, gap-close at i-1. 
assert(ptrMCurr_j == &(MCurr[0])); *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; ++ptrMCurr_j; int *ptrTraceBack_ij = TraceBack[i]; *ptrTraceBack_ij++ = (int) i; SCORE *ptrMPrev_j = MPrev; SCORE *ptrDPrev = DPrev; SCORE d = *ptrDPrev; SCORE DNew = *ptrMPrev_j + GapOpenA[i]; if (DNew > d) { d = DNew; *ptrDeletePos = i; } SCORE *ptrDCurr = DCurr; assert(ptrDCurr == &(DCurr[0])); *ptrDCurr = d; // Can't have an insert if no letters from B IPrev_j_1 = MINUS_INFINITY; unsigned uInsertPos; const SCORE scoreGapOpenAi = GapOpenA[i]; const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; for (unsigned j = 1; j < uLengthB; ++j) { // Here, MPrev_j is preserved from previous // iteration so with current i,j is M[i-1][j-1] SCORE MPrev_j = *ptrMPrev_j; SCORE INew = MPrev_j + GapOpenB[j]; if (INew > IPrev_j_1) { IPrev_j_1 = INew; uInsertPos = j; } SCORE scoreMax = MPrev_j; assert(ptrDPrev == &(DPrev[j-1])); SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; if (scoreD > scoreMax) { scoreMax = scoreD; assert(ptrDeletePos == &(uDeletePos[j-1])); *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; assert(*ptrTraceBack_ij > 0); } ++ptrDeletePos; SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; if (scoreI > scoreMax) { scoreMax = scoreI; *ptrTraceBack_ij = (int) uInsertPos - (int) j; assert(*ptrTraceBack_ij < 0); } assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); *ptrMCurr_j += scoreMax; assert(ptrMCurr_j == &(MCurr[j])); ++ptrMCurr_j; MPrev_j = *(++ptrMPrev_j); assert(ptrDPrev == &(DPrev[j])); SCORE d = *ptrDPrev; SCORE DNew = MPrev_j + scoreGapOpenAi; if (DNew > d) { d = DNew; assert(ptrDeletePos == &uDeletePos[j]); *ptrDeletePos = i; } assert(ptrDCurr + 1 == &(DCurr[j])); *(++ptrDCurr) = d; ++ptrTraceBack_ij; } Rotate(MPrev, MCurr, MWork); Rotate(DPrev, DCurr, DWork); } // Special case for i=uLengthA SCORE IPrev = MINUS_INFINITY; unsigned uInsertPos; for (unsigned j = 1; j < uLengthB; ++j) { SCORE INew = MPrev[j-1] + GapOpenB[j]; if (INew > IPrev) { uInsertPos = j; IPrev 
= INew; } } // Special case for i=uLengthA, j=uLengthB SCORE scoreMax = MPrev[uLengthB-1]; int iTraceBack = 0; SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; if (scoreD > scoreMax) { scoreMax = scoreD; iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; } SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; if (scoreI > scoreMax) { scoreMax = scoreI; iTraceBack = (int) uInsertPos - (int) uLengthB; } TraceBack[uLengthA][uLengthB] = iTraceBack; TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); return scoreMax; } muscle-3.8.31.orig/clwwt.cpp0000644000175000017500000001150711352261620015241 0ustar kratzcharles#include "muscle.h" #include "tree.h" #include "msa.h" /*** Compute weights by the CLUSTALW method. Thompson, Higgins and Gibson (1994), CABIOS (10) 19-29; see also CLUSTALW paper. Weights are computed from the edge lengths of a rooted tree. Define the strength of an edge to be its length divided by the number of leaves under that edge. The weight of a sequence is then the sum of edge strengths on the path from the root to the leaf. Example. 
0.2 -----A 0.1 -x ------- B 0.7 --------y ----------- C 0.3 ----------z 0.4 -------------- D 0.8 Edge Length Leaves Strength ---- ----- ------ -------- xy 0.3 3 0.1 xA 0.2 1 0.2 yz 0.4 2 0.2 yB 0.1 1 0.1 zC 0.7 1 0.7 zD 0.8 1 0.8 Leaf Path Strengths Weight ---- ---- --------- ------ A xA 0.2 0.2 B xy-yB 0.1 + 0.1 0.2 C xy-yz-zC 0.1 + 0.2 + 0.7 1.0 D xy-yz-zD 0.1 + 0.2 + 0.8 1.1 ***/ #define TRACE 0 static unsigned CountLeaves(const Tree &tree, unsigned uNodeIndex, unsigned LeavesUnderNode[]) { if (tree.IsLeaf(uNodeIndex)) { LeavesUnderNode[uNodeIndex] = 1; return 1; } const unsigned uLeft = tree.GetLeft(uNodeIndex); const unsigned uRight = tree.GetRight(uNodeIndex); const unsigned uRightCount = CountLeaves(tree, uRight, LeavesUnderNode); const unsigned uLeftCount = CountLeaves(tree, uLeft, LeavesUnderNode); const unsigned uCount = uRightCount + uLeftCount; LeavesUnderNode[uNodeIndex] = uCount; return uCount; } void CalcClustalWWeights(const Tree &tree, WEIGHT Weights[]) { #if TRACE Log("CalcClustalWWeights\n"); tree.LogMe(); #endif const unsigned uLeafCount = tree.GetLeafCount(); if (0 == uLeafCount) return; else if (1 == uLeafCount) { Weights[0] = (WEIGHT) 1.0; return; } else if (2 == uLeafCount) { Weights[0] = (WEIGHT) 0.5; Weights[1] = (WEIGHT) 0.5; return; } if (!tree.IsRooted()) Quit("CalcClustalWWeights requires rooted tree"); const unsigned uNodeCount = tree.GetNodeCount(); unsigned *LeavesUnderNode = new unsigned[uNodeCount]; memset(LeavesUnderNode, 0, uNodeCount*sizeof(unsigned)); const unsigned uRootNodeIndex = tree.GetRootNodeIndex(); unsigned uLeavesUnderRoot = CountLeaves(tree, uRootNodeIndex, LeavesUnderNode); if (uLeavesUnderRoot != uLeafCount) Quit("WeightsFromTreee: Internal error, root count %u %u", uLeavesUnderRoot, uLeafCount); #if TRACE Log("Node Leaves Length Strength\n"); Log("---- ------ -------- --------\n"); // 1234 123456 12345678 12345678 #endif double *Strengths = new double[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < 
uNodeCount; ++uNodeIndex) { if (tree.IsRoot(uNodeIndex)) { Strengths[uNodeIndex] = 0.0; continue; } const unsigned uParent = tree.GetParent(uNodeIndex); const double dLength = tree.GetEdgeLength(uNodeIndex, uParent); const unsigned uLeaves = LeavesUnderNode[uNodeIndex]; const double dStrength = dLength / (double) uLeaves; Strengths[uNodeIndex] = dStrength; #if TRACE Log("%4u %6u %8g %8g\n", uNodeIndex, uLeaves, dLength, dStrength); #endif } #if TRACE Log("\n"); Log(" Seq Path..Weight\n"); Log("-------------------- ------------\n"); #endif for (unsigned n = 0; n < uLeafCount; ++n) { const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n); #if TRACE Log("%20.20s %4u ", tree.GetLeafName(uLeafNodeIndex), uLeafNodeIndex); #endif if (!tree.IsLeaf(uLeafNodeIndex)) Quit("CalcClustalWWeights: leaf"); double dWeight = 0; unsigned uNode = uLeafNodeIndex; while (!tree.IsRoot(uNode)) { dWeight += Strengths[uNode]; uNode = tree.GetParent(uNode); #if TRACE Log("->%u(%g)", uNode, Strengths[uNode]); #endif } if (dWeight < 0.0001) { #if TRACE Log("zero->one"); #endif dWeight = 1.0; } Weights[n] = (WEIGHT) dWeight; #if TRACE Log(" = %g\n", dWeight); #endif } delete[] Strengths; delete[] LeavesUnderNode; Normalize(Weights, uLeafCount); } void MSA::SetClustalWWeights(const Tree &tree) { const unsigned uSeqCount = GetSeqCount(); const unsigned uLeafCount = tree.GetLeafCount(); WEIGHT *Weights = new WEIGHT[uSeqCount]; CalcClustalWWeights(tree, Weights); for (unsigned n = 0; n < uLeafCount; ++n) { const WEIGHT w = Weights[n]; const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n); const unsigned uId = tree.GetLeafId(uLeafNodeIndex); const unsigned uSeqIndex = GetSeqIndex(uId); #if DEBUG if (GetSeqName(uSeqIndex) != tree.GetLeafName(uLeafNodeIndex)) Quit("MSA::SetClustalWWeights: names don't match"); #endif SetSeqWeight(uSeqIndex, w); } NormalizeWeights((WEIGHT) 1.0); delete[] Weights; } muscle-3.8.31.orig/options.cpp0000644000175000017500000001130511352261673015600 0ustar 
kratzcharles#include "muscle.h" #include struct VALUE_OPT { const char *m_pstrName; const char *m_pstrValue; }; struct FLAG_OPT { const char *m_pstrName; bool m_bSet; }; static VALUE_OPT ValueOpts[] = { "in", 0, "in1", 0, "in2", 0, "out", 0, "MaxIters", 0, "MaxHours", 0, "GapOpen", 0, "GapOpen2", 0, "GapExtend", 0, "GapExtend2", 0, "GapAmbig", 0, "Center", 0, "SmoothScoreCeil", 0, "MinBestColScore", 0, "MinSmoothScore", 0, "ObjScore", 0, "SmoothWindow", 0, "RefineWindow", 0, "FromWindow", 0, "ToWindow", 0, "SaveWindow", 0, "WindowOffset", 0, "FirstWindow", 0, "AnchorSpacing", 0, "Log", 0, "LogA", 0, "MaxTrees", 0, "SUEFF", 0, "Distance", 0, "Distance1", 0, "Distance2", 0, "Weight", 0, "Weight1", 0, "Weight2", 0, "Cluster", 0, "Cluster1", 0, "Cluster2", 0, "Root1", 0, "Root2", 0, "Tree1", 0, "Tree2", 0, "UseTree", 0, "UseTree_NoWarn", 0, "DiagLength", 0, "DiagMargin", 0, "DiagBreak", 0, "Hydro", 0, "HydroFactor", 0, "SPScore", 0, "SeqType", 0, "MaxMB", 0, "ComputeWeights", 0, "MaxSubFam", 0, "ScoreFile", 0, "TermGaps", 0, "FASTAOut", 0, "CLWOut", 0, "CLWStrictOut", 0, "HTMLOut", 0, "MSFOut", 0, "PHYIOut", 0, "PHYSOut", 0, "Matrix", 0, "DistMx1", 0, "DistMx2", 0, "Weight", 0, }; static int ValueOptCount = sizeof(ValueOpts)/sizeof(ValueOpts[0]); static FLAG_OPT FlagOpts[] = { "LE", false, "SP", false, "SV", false, "SPN", false, "Core", false, "NoCore", false, "Diags1", false, "Diags2", false, "Diags", false, "Quiet", false, "MSF", false, "Verbose", false, "Anchors", false, "NoAnchors", false, "Refine", false, "RefineW", false, "SW", false, "Profile", false, "PPScore", false, "ClusterOnly", false, "Brenner", false, "Dimer", false, "clw", false, "clwstrict", false, "HTML", false, "Version", false, "Stable", false, "Group", false, "FASTA", false, "ProfDB", false, "PAS", false, "PHYI", false, "PHYS", false, "TomHydro", false, "MakeTree", false, }; static int FlagOptCount = sizeof(FlagOpts)/sizeof(FlagOpts[0]); static bool TestSetFlagOpt(const char *Arg) { for (int i = 0; 
i < FlagOptCount; ++i) if (!stricmp(Arg, FlagOpts[i].m_pstrName)) { FlagOpts[i].m_bSet = true; return true; } return false; } static bool TestSetValueOpt(const char *Arg, const char *Value) { for (int i = 0; i < ValueOptCount; ++i) if (!stricmp(Arg, ValueOpts[i].m_pstrName)) { if (0 == Value) { fprintf(stderr, "Option -%s must have value\n", Arg); exit(EXIT_NotStarted); } ValueOpts[i].m_pstrValue = strsave(Value); return true; } return false; } bool FlagOpt(const char *Name) { for (int i = 0; i < FlagOptCount; ++i) if (!stricmp(Name, FlagOpts[i].m_pstrName)) return FlagOpts[i].m_bSet; Quit("FlagOpt(%s) invalid", Name); return false; } const char *ValueOpt(const char *Name) { for (int i = 0; i < ValueOptCount; ++i) if (!stricmp(Name, ValueOpts[i].m_pstrName)) return ValueOpts[i].m_pstrValue; Quit("ValueOpt(%s) invalid", Name); return 0; } void ProcessArgVect(int argc, char *argv[]) { for (int iArgIndex = 0; iArgIndex < argc; ) { const char *Arg = argv[iArgIndex]; if (Arg[0] != '-') { fprintf(stderr, "Command-line option \"%s\" must start with '-'\n", Arg); exit(EXIT_NotStarted); } const char *ArgName = Arg + 1; if (TestSetFlagOpt(ArgName)) { ++iArgIndex; continue; } char *Value = 0; if (iArgIndex < argc - 1) Value = argv[iArgIndex+1]; if (TestSetValueOpt(ArgName, Value)) { iArgIndex += 2; continue; } fprintf(stderr, "Invalid command line option \"%s\"\n", ArgName); Usage(); exit(EXIT_NotStarted); } } void ProcessArgStr(const char *ArgStr) { const int MAX_ARGS = 64; char *argv[MAX_ARGS]; if (0 == ArgStr) return; // Modifiable copy char *StrCopy = strsave(ArgStr); int argc = 0; bool bInArg = false; char *Str = StrCopy; while (char c = *Str) { if (isspace(c)) { *Str = 0; bInArg = false; } else if (!bInArg) { bInArg = true; if (argc >= MAX_ARGS) Quit("Too many args in MUSCLE_CMDLINE"); argv[argc++] = Str; } Str++; } ProcessArgVect(argc, argv); free(StrCopy); } void ListFlagOpts() { for (int i = 0; i < FlagOptCount; ++i) Log("%s %d\n", FlagOpts[i].m_pstrName, 
FlagOpts[i].m_bSet); } muscle-3.8.31.orig/cons.cpp0000644000175000017500000000611211352261636015046 0ustar kratzcharles/*** Conservation value for a column in an MSA is defined as the number of times the most common letter appears divided by the number of sequences. ***/ #include "muscle.h" #include "msa.h" #include double MSA::GetAvgCons() const { assert(GetSeqCount() > 0); double dSum = 0; unsigned uNonGapColCount = 0; for (unsigned uColIndex = 0; uColIndex < GetColCount(); ++uColIndex) { if (!IsGapColumn(uColIndex)) { dSum += GetCons(uColIndex); ++uNonGapColCount; } } assert(uNonGapColCount > 0); double dAvg = dSum / uNonGapColCount; assert(dAvg > 0 && dAvg <= 1); return dAvg; } double MSA::GetCons(unsigned uColIndex) const { unsigned Counts[MAX_ALPHA]; for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) Counts[uLetter] = 0; unsigned uMaxCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { if (IsGap(uSeqIndex, uColIndex)) continue; char c = GetChar(uSeqIndex, uColIndex); c = toupper(c); if ('X' == c || 'B' == c || 'Z' == c) continue; unsigned uLetter = GetLetter(uSeqIndex, uColIndex); unsigned uCount = Counts[uLetter] + 1; if (uCount > uMaxCount) uMaxCount = uCount; Counts[uLetter] = uCount; } // Cons is undefined for all-gap column if (0 == uMaxCount) { // assert(false); return 1; } double dCons = (double) uMaxCount / (double) GetSeqCount(); assert(dCons > 0 && dCons <= 1); return dCons; } // Perecent identity of a pair of sequences. // Positions with one or both gapped are ignored. 
double MSA::GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const { const unsigned uColCount = GetColCount(); unsigned uPosCount = 0; unsigned uSameCount = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c1 = GetChar(uSeqIndex1, uColIndex); const char c2 = GetChar(uSeqIndex2, uColIndex); if (IsGapChar(c1) || IsGapChar(c2)) continue; if (c1 == c2) ++uSameCount; ++uPosCount; } if (0 == uPosCount) return 0; return (double) uSameCount / (double) uPosCount; } // Perecent group identity of a pair of sequences. // Positions with one or both gapped are ignored. double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const { extern unsigned ResidueGroup[]; const unsigned uColCount = GetColCount(); unsigned uPosCount = 0; unsigned uSameCount = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { if (IsGap(uSeqIndex1, uColIndex)) continue; if (IsGap(uSeqIndex2, uColIndex)) continue; if (IsWildcard(uSeqIndex1, uColIndex)) continue; if (IsWildcard(uSeqIndex2, uColIndex)) continue; const unsigned uLetter1 = GetLetter(uSeqIndex1, uColIndex); const unsigned uLetter2 = GetLetter(uSeqIndex2, uColIndex); const unsigned uGroup1 = ResidueGroup[uLetter1]; const unsigned uGroup2 = ResidueGroup[uLetter2]; if (uGroup1 == uGroup2) ++uSameCount; ++uPosCount; } if (0 == uPosCount) return 0; return (double) uSameCount / (double) uPosCount; } muscle-3.8.31.orig/difftrees.cpp0000644000175000017500000002560611352261600016057 0ustar kratzcharles#include "muscle.h" #include "tree.h" #define TRACE 0 /*** Algorithm to compare two trees, X and Y. A node x in X and node y in Y are defined to be similar iff the set of leaves in the subtree under x is identical to the set of leaves under y. A node is defined to be dissimilar iff it is not similar to any node in the other tree. Nodes x and y are defined to be married iff every node in the subtree under x is similar to a node in the subtree under y. 
Married nodes are considered to be equal. The subtrees under two married nodes can at most differ by exchanges of left and right branches, which we do not consider to be significant here. A node is defined to be a bachelor iff it is not married. If a node is a bachelor, then it has a dissimilar node in its subtree, and it follows immediately from the definition of marriage that its parent is also a bachelor. Hence all nodes on the path from a bachelor node to the root are bachelors. We assume the trees have the same set of leaves, so every leaf is trivially both similar and married to the same leaf in the opposite tree. Bachelor nodes are therefore always internal (i.e., non-leaf) nodes. A node is defined to be a diff iff (a) it is married and (b) its parent is a bachelor. The subtree under a diff is maximally similar to the other tree. (In other words, you cannot extend the subtree without adding a bachelor). The set of diffs is the subset of the two trees that we consider to be identical. Example: -----A -----k ----j -----B --i -----C ------D -----A -----p ----n -----B --m -----D ------C The following pairs of internal nodes are similar. Nodes Set of leaves ----- ------------- k,p A,B i,m A,B,C,D Bachelors in the first tree are i and j, bachelors in the second tree are m and n. Node k and p are married, but i and m are not (because j and n are bachelors). The diffs are C, D and k. The set of bachelor nodes can be viewed as the internal nodes of a tree, the leaves of which are diffs. (To see that there can't be disjoint subtrees, note that the path from a diff to a root is all bachelor nodes, so there is always a path between two diffs that goes through the root). We call this tree the "diffs tree". There is a simple O(N) algorithm to build the diffs tree. To achieve O(N) we avoid traversing a given subtree multiple times and also avoid comparing lists of leaves. We visit nodes in depth-first order (i.e., a node is visited before its parent). 
If either child of a node is a bachelor, we flag it as a bachelor. If both children of the node we are visiting are married, we check whether the spouses of those children have the same parent in the other tree. If the parents are different, the current node is a bachelor. If they have the same parent, then the node we are visiting is the spouse of that parent. We assign this newly identified married couple a unique integer id. The id of a node is in one-to-one correspondence with the set of leaves in its subtree. Two nodes have the same set of leaves iff they have the same id. Bachelor nodes do not get an id. ***/ static void BuildDiffs(const Tree &tree, unsigned uTreeNodeIndex, const bool bIsDiff[], Tree &Diffs, unsigned uDiffsNodeIndex, unsigned IdToDiffsLeafNodeIndex[]) { #if TRACE Log("BuildDiffs(TreeNode=%u IsDiff=%d IsLeaf=%d)\n", uTreeNodeIndex, bIsDiff[uTreeNodeIndex], tree.IsLeaf(uTreeNodeIndex)); #endif if (bIsDiff[uTreeNodeIndex]) { unsigned uLeafCount = tree.GetLeafCount(); unsigned *Leaves = new unsigned[uLeafCount]; GetLeaves(tree, uTreeNodeIndex, Leaves, &uLeafCount); for (unsigned n = 0; n < uLeafCount; ++n) { const unsigned uLeafNodeIndex = Leaves[n]; const unsigned uId = tree.GetLeafId(uLeafNodeIndex); if (uId >= tree.GetLeafCount()) Quit("BuildDiffs, id out of range"); IdToDiffsLeafNodeIndex[uId] = uDiffsNodeIndex; #if TRACE Log(" Leaf id=%u DiffsNode=%u\n", uId, uDiffsNodeIndex); #endif } delete[] Leaves; return; } if (tree.IsLeaf(uTreeNodeIndex)) Quit("BuildDiffs: should never reach leaf"); const unsigned uTreeLeft = tree.GetLeft(uTreeNodeIndex); const unsigned uTreeRight = tree.GetRight(uTreeNodeIndex); const unsigned uDiffsLeft = Diffs.AppendBranch(uDiffsNodeIndex); const unsigned uDiffsRight = uDiffsLeft + 1; BuildDiffs(tree, uTreeLeft, bIsDiff, Diffs, uDiffsLeft, IdToDiffsLeafNodeIndex); BuildDiffs(tree, uTreeRight, bIsDiff, Diffs, uDiffsRight, IdToDiffsLeafNodeIndex); } void DiffTrees(const Tree &Tree1, const Tree &Tree2, Tree &Diffs, 
unsigned IdToDiffsLeafNodeIndex[]) { #if TRACE Log("Tree1:\n"); Tree1.LogMe(); Log("\n"); Log("Tree2:\n"); Tree2.LogMe(); #endif if (!Tree1.IsRooted() || !Tree2.IsRooted()) Quit("DiffTrees: requires rooted trees"); const unsigned uNodeCount = Tree1.GetNodeCount(); const unsigned uNodeCount2 = Tree2.GetNodeCount(); const unsigned uLeafCount = Tree1.GetLeafCount(); const unsigned uLeafCount2 = Tree2.GetLeafCount(); assert(uLeafCount == uLeafCount2); if (uNodeCount != uNodeCount2) Quit("DiffTrees: different node counts"); // Allocate tables so we can convert tree node index to // and from the unique id with a O(1) lookup. unsigned *NodeIndexToId1 = new unsigned[uNodeCount]; unsigned *IdToNodeIndex2 = new unsigned[uNodeCount]; bool *bIsBachelor1 = new bool[uNodeCount]; bool *bIsDiff1 = new bool[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { NodeIndexToId1[uNodeIndex] = uNodeCount; bIsBachelor1[uNodeIndex] = false; bIsDiff1[uNodeIndex] = false; // Use uNodeCount as value meaning "not set". IdToNodeIndex2[uNodeIndex] = uNodeCount; } // Initialize node index <-> id lookup tables for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (Tree1.IsLeaf(uNodeIndex)) { const unsigned uId = Tree1.GetLeafId(uNodeIndex); if (uId >= uNodeCount) Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)"); NodeIndexToId1[uNodeIndex] = uId; } if (Tree2.IsLeaf(uNodeIndex)) { const unsigned uId = Tree2.GetLeafId(uNodeIndex); if (uId >= uNodeCount) Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)"); IdToNodeIndex2[uId] = uNodeIndex; } } // Validity check. 
This verifies that the ids // pre-assigned to the leaves in Tree1 are unique // (note that the id= 20) continue; for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) { const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); unsigned uLetter2 = msa.GetLetterEx(uSeqIndex2, uColIndex); if (uLetter2 >= 20) continue; SCORE t = w1*w2*(*g_ptrScoreMatrix)[uLetter1][uLetter2]; #if TRACE Log("Check %c %c w1=%.3g w2=%.3g Mx=%.3g t=%.3g\n", LetterToCharAmino(uLetter1), LetterToCharAmino(uLetter2), w1, w2, (*g_ptrScoreMatrix)[uLetter1][uLetter2], t); #endif Sum += t; } } return Sum; } static SCORE SPGapFreqs(const FCOUNT Freqs[]) { #if TRACE Log("Freqs="); for (unsigned i = 0; i < 4; ++i) if (Freqs[i] != 0) Log(" %s=%.3g", GapTypeToStr(i), Freqs[i]); Log("\n"); #endif SCORE TotalOffDiag = 0; SCORE TotalDiag = 0; for (unsigned i = 0; i < 4; ++i) { const FCOUNT fi = Freqs[i]; if (0 == fi) continue; const float *Row = GapScoreMatrix[i]; SCORE diagt = fi*fi*Row[i]; TotalDiag += diagt; #if TRACE Log("SPFGaps %s %s + Mx=%.3g TotalDiag += %.3g\n", GapTypeToStr(i), GapTypeToStr(i), Row[i], diagt); #endif SCORE Sum = 0; for (unsigned j = 0; j < i; ++j) { SCORE t = Freqs[j]*Row[j]; #if TRACE if (Freqs[j] != 0) Log("SPFGaps %s %s + Mx=%.3g Sum += %.3g\n", GapTypeToStr(i), GapTypeToStr(j), Row[j], fi*t); #endif Sum += t; } TotalOffDiag += fi*Sum; } #if TRACE Log("SPFGap TotalOffDiag=%.3g + TotalDiag=%.3g = %.3g\n", TotalOffDiag, TotalDiag, TotalOffDiag + TotalDiag); #endif return TotalOffDiag*2 + TotalDiag; } static SCORE SPFreqs(const FCOUNT Freqs[]) { #if TRACE Log("Freqs="); for (unsigned i = 0; i < 20; ++i) if (Freqs[i] != 0) Log(" %c=%.3g", LetterToCharAmino(i), Freqs[i]); Log("\n"); #endif SCORE TotalOffDiag = 0; SCORE TotalDiag = 0; for (unsigned i = 0; i < 20; ++i) { const FCOUNT fi = Freqs[i]; if (0 == fi) continue; const float *Row = (*g_ptrScoreMatrix)[i]; SCORE diagt = fi*fi*Row[i]; TotalDiag += diagt; #if TRACE Log("SPF %c %c + Mx=%.3g TotalDiag += %.3g\n", 
LetterToCharAmino(i), LetterToCharAmino(i), Row[i], diagt); #endif SCORE Sum = 0; for (unsigned j = 0; j < i; ++j) { SCORE t = Freqs[j]*Row[j]; #if TRACE if (Freqs[j] != 0) Log("SPF %c %c + Mx=%.3g Sum += %.3g\n", LetterToCharAmino(i), LetterToCharAmino(j), Row[j], fi*t); #endif Sum += t; } TotalOffDiag += fi*Sum; } #if TRACE Log("SPF TotalOffDiag=%.3g + TotalDiag=%.3g = %.3g\n", TotalOffDiag, TotalDiag, TotalOffDiag + TotalDiag); #endif return TotalOffDiag*2 + TotalDiag; } static SCORE ObjScoreSPCol(const MSA &msa, unsigned uColIndex) { FCOUNT Freqs[20]; FCOUNT GapFreqs[4]; memset(Freqs, 0, sizeof(Freqs)); memset(GapFreqs, 0, sizeof(GapFreqs)); const unsigned uSeqCount = msa.GetSeqCount(); #if TRACE Log("Weights="); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log(" %u=%.3g", uSeqIndex, msa.GetSeqWeight(uSeqIndex)); Log("\n"); #endif SCORE SelfOverCount = 0; SCORE GapSelfOverCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { WEIGHT w = msa.GetSeqWeight(uSeqIndex); bool bGapThisCol = msa.IsGap(uSeqIndex, uColIndex); bool bGapPrevCol = (uColIndex == 0 ? 
false : msa.IsGap(uSeqIndex, uColIndex - 1)); int GapType = bGapThisCol + 2*bGapPrevCol; assert(GapType >= 0 && GapType < 4); GapFreqs[GapType] += w; SCORE gapt = w*w*GapScoreMatrix[GapType][GapType]; GapSelfOverCount += gapt; if (bGapThisCol) continue; unsigned uLetter = msa.GetLetterEx(uSeqIndex, uColIndex); if (uLetter >= 20) continue; Freqs[uLetter] += w; SCORE t = w*w*(*g_ptrScoreMatrix)[uLetter][uLetter]; #if TRACE Log("FastCol compute freqs & SelfOverCount %c w=%.3g M=%.3g SelfOverCount += %.3g\n", LetterToCharAmino(uLetter), w, (*g_ptrScoreMatrix)[uLetter][uLetter], t); #endif SelfOverCount += t; } SCORE SPF = SPFreqs(Freqs); SCORE Col = SPF - SelfOverCount; SCORE SPFGaps = SPGapFreqs(GapFreqs); SCORE ColGaps = SPFGaps - GapSelfOverCount; #if TRACE Log("SPF=%.3g - SelfOverCount=%.3g = %.3g\n", SPF, SelfOverCount, Col); Log("SPFGaps=%.3g - GapsSelfOverCount=%.3g = %.3g\n", SPFGaps, GapSelfOverCount, ColGaps); #endif return Col + ColGaps; } SCORE ObjScoreSPDimer(const MSA &msa) { static bool bGapScoreMatrixInit = false; if (!bGapScoreMatrixInit) InitGapScoreMatrix(); SCORE Total = 0; const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { SCORE Col = ObjScoreSPCol(msa, uColIndex); #if TRACE { SCORE ColCheck = SPColBrute(msa, uColIndex); Log("FastCol=%.3g CheckCol=%.3g\n", Col, ColCheck); } #endif Total += Col; } #if TRACE Log("Total/2 = %.3g (final result from fast)\n", Total/2); #endif return Total/2; } muscle-3.8.31.orig/fastclust.cpp0000644000175000017500000000334011352261673016115 0ustar kratzcharles#include "muscle.h" #include "seqvect.h" #include "distfunc.h" #include "clust.h" #include "clustsetdf.h" #include "tree.h" #include "clust.h" #include "distcalc.h" #include static void TreeFromSeqVect_NJ(const DistFunc &DF, CLUSTER Cluster, Tree &tree) { ClustSetDF CSD(DF); Clust C; C.Create(CSD, Cluster); tree.FromClust(C); } static void 
TreeFromSeqVect_UPGMA(const DistFunc &DF, CLUSTER Cluster, Tree &tree) { LINKAGE Linkage = LINKAGE_Undefined; switch (Cluster) { case CLUSTER_UPGMA: Linkage = LINKAGE_Avg; break; case CLUSTER_UPGMAMin: Linkage = LINKAGE_Min; break; case CLUSTER_UPGMAMax: Linkage = LINKAGE_Max; break; case CLUSTER_UPGMB: Linkage = LINKAGE_Biased; break; default: Quit("TreeFromSeqVect_UPGMA, CLUSTER_%u not supported", Cluster); } DistCalcDF DC; DC.Init(DF); UPGMA2(DC, tree, Linkage); } static void SaveDF(const SeqVect &v, DistFunc &d, const char *FileName) { FILE *f = fopen(FileName, "w"); if (f == 0) Quit("Cannot create %s", FileName); unsigned n = v.GetSeqCount(); fprintf(f, "%u\n", n); for (unsigned i = 0; i < n; ++i) { fprintf(f, "%10.10s ", v.GetSeqName(i)); for (unsigned j = 0; j < i; ++j) fprintf(f, " %9g", d.GetDist(i, j)); fprintf(f, "\n"); } fclose(f); } void TreeFromSeqVect(const SeqVect &v, Tree &tree, CLUSTER Cluster, DISTANCE Distance, ROOT Root, const char *SaveFileName) { DistFunc DF; DistUnaligned(v, Distance, DF); if (SaveFileName != 0) SaveDF(v, DF, SaveFileName); if (CLUSTER_NeighborJoining == Cluster) TreeFromSeqVect_NJ(DF, Cluster, tree); else TreeFromSeqVect_UPGMA(DF, Cluster, tree); FixRoot(tree, Root); } muscle-3.8.31.orig/unixio.h0000644000175000017500000000026411352261600015055 0ustar kratzcharles#ifdef WIN32 #include #include #else #include #include #endif #if !defined(WIN32) && !defined(O_BINARY) #define O_BINARY 0 #endif muscle-3.8.31.orig/profile.h0000644000175000017500000001064211352261673015215 0ustar kratzcharles#ifndef FastProf2_h #define FastProf2_h #include "msa.h" #include "pwpath.h" #include // for log function class DiagList; class WeightList; struct ProfPos { bool m_bAllGaps; unsigned m_uSortOrder[21]; FCOUNT m_fcCounts[20]; FCOUNT m_LL; FCOUNT m_LG; FCOUNT m_GL; FCOUNT m_GG; SCORE m_AAScores[20]; unsigned m_uResidueGroup; FCOUNT m_fOcc; FCOUNT m_fcStartOcc; FCOUNT m_fcEndOcc; SCORE m_scoreGapOpen; SCORE m_scoreGapClose; #if DOUBLE_AFFINE 
SCORE m_scoreGapOpen2; SCORE m_scoreGapClose2; #endif // SCORE m_scoreGapExtend; }; struct ProgNode { ProgNode() { m_Prof = 0; m_EstringL = 0; m_EstringR = 0; } MSA m_MSA; ProfPos *m_Prof; PWPath m_Path; short *m_EstringL; short *m_EstringR; unsigned m_uLength; WEIGHT m_Weight; }; extern unsigned ResidueGroup[]; const unsigned RESIDUE_GROUP_MULTIPLE = (unsigned) ~0; extern PTR_SCOREMATRIX g_ptrScoreMatrix; ProfPos *ProfileFromMSA(const MSA &a); SCORE TraceBack(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, PWPath &Path); SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); void ProgressiveAlign(const SeqVect &v, const Tree &tree, MSA &a); SCORE MSAPairSP(const MSA &msa1, const MSA &msa2); void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB, MSA &msaCombined); void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA = 0); SCORE ScoreProfPos2(const ProfPos &PPA, const ProfPos &PPB); SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const PWPath &Path); bool IsHydrophilic(const FCOUNT fcCounts[]); int PAM200_Letter(unsigned uLetter1, unsigned uLetter2); SCORE AverageMatchScore(const PWPath &Path, unsigned uEdgeIndex, unsigned uWindowLength); void WindowSmooth(const SCORE Score[], unsigned uCount, unsigned uWindowLength, SCORE SmoothScore[], double dCeil = 9e29); SCORE FastScoreMSA_LA(const MSA &msa, SCORE MatchScore[] = 0); SCORE FastScoreMSA_NS(const MSA &msa, SCORE MatchScore[] = 0); SCORE FastScoreMSA_SP(const MSA &msa, SCORE MatchScore[] = 0); bool RefineMSA(MSA &msa, const Tree &tree); SCORE MSAQScore(const MSA &msa, SCORE MatchScore[] = 0); bool RefineBiParts(MSA &msa, const Tree &tree, bool R); void FindAnchorCols(const MSA &msa, unsigned AnchorCols[], unsigned *ptruAnchorColCount); double PctIdToHeight(double dPctId); double 
PctIdToHeightKimura(double dPctId); double PctIdToHeightMAFFT(double dPctId); double PctIdToMAFFTDist(double dPctId); bool RefineBlocks(MSA &msa, const Tree &tree); bool RefineSubfams(MSA &msaIn, const Tree &tree, unsigned uIters); void SetMuscleTree(const Tree &tree); void CalcClustalWWeights(const Tree &tree, WEIGHT Weights[]); void RealignDiffs(const MSA &msaIn, const Tree &Diffs, const unsigned IdToDiffsTreeNodeIndex[], MSA &msaOut); void RealignDiffsE(const MSA &msaIn, const SeqVect &v, const Tree &NewTree, const Tree &OldTree, const unsigned uNewNodeIndexToOldNodeIndex[], MSA &msaOut, ProgNode *OldProgNodes); void RefineTree(MSA &msa, Tree &tree); void RefineTreeE(MSA &msa, const SeqVect &v, Tree &tree, ProgNode *ProgNodes); bool IsHydrophobic(const FCOUNT fcCounts[]); void Hydro(ProfPos *Prof, unsigned uLength); void SetTermGaps(const ProfPos *Prof, unsigned uLength); // Macros to simulate 2D matrices #define DPL(PLA, PLB) DPL_[(PLB)*uPrefixCountA + (PLA)] #define DPM(PLA, PLB) DPM_[(PLB)*uPrefixCountA + (PLA)] #define DPD(PLA, PLB) DPD_[(PLB)*uPrefixCountA + (PLA)] #define DPE(PLA, PLB) DPE_[(PLB)*uPrefixCountA + (PLA)] #define DPI(PLA, PLB) DPI_[(PLB)*uPrefixCountA + (PLA)] #define DPJ(PLA, PLB) DPJ_[(PLB)*uPrefixCountA + (PLA)] #define DPU(PLA, PLB) DPU_[(PLB)*uPrefixCountA + (PLA)] #define TBM(PLA, PLB) TBM_[(PLB)*uPrefixCountA + (PLA)] #define TBD(PLA, PLB) TBD_[(PLB)*uPrefixCountA + (PLA)] #define TBE(PLA, PLB) TBE_[(PLB)*uPrefixCountA + (PLA)] #define TBI(PLA, PLB) TBI_[(PLB)*uPrefixCountA + (PLA)] #define TBJ(PLA, PLB) TBJ_[(PLB)*uPrefixCountA + (PLA)] SCORE ScoreProfPos2LA(const ProfPos &PPA, const ProfPos &PPB); SCORE ScoreProfPos2NS(const ProfPos &PPA, const ProfPos &PPB); SCORE ScoreProfPos2SP(const ProfPos &PPA, const ProfPos &PPB); SCORE ScoreProfPos2SPN(const ProfPos &PPA, const ProfPos &PPB); #endif // FastProf_h muscle-3.8.31.orig/enumopts.cpp0000644000175000017500000000024011352261600015741 0ustar kratzcharles#include "muscle.h" #include 
"enumopts.h" #define s(t) EnumOpt t##_Opts[] = { #define c(t, x) #x, t##_##x, #define e(t) 0, 0 }; #include "enums.h" muscle-3.8.31.orig/muscle.vcproj0000644000175000017500000003636011367131123016116 0ustar kratzcharles muscle-3.8.31.orig/physeq.cpp0000644000175000017500000000513711352261666015426 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "textfile.h" const int BLOCKSIZE = 60; static char FixChar(char c) { switch (c) { case '(': case ')': case '[': case ']': case ':': case ';': case ',': return '_'; } if (!isprint(c)) return '_'; return c; } static void FixName(char Name[]) { while (char c = *Name) *Name++ = FixChar(c); } void MSA::ToPhySequentialFile(TextFile &File) const { const unsigned SeqCount = GetSeqCount(); const unsigned ColCount = GetColCount(); File.PutFormat("%d %d\n", SeqCount, ColCount); if (0 == ColCount) return; for (unsigned Seq = 0; Seq < SeqCount; ++Seq) { char Name[11]; const char *ptrName = GetSeqName(Seq); size_t n = strlen(ptrName); if (n > 10) n = 10; memcpy(Name, ptrName, n); Name[n] = 0; FixName(Name); File.PutFormat("%-10.10s", Name); int BlockIndex = 0; int Col = 0; for (;;) { const unsigned MaxCols = (BlockIndex == 0) ? (BLOCKSIZE - 10) : BLOCKSIZE; for (unsigned ColsThisBlock = 0; ColsThisBlock < MaxCols; ++ColsThisBlock) { if (Col == ColCount) break; if (ColsThisBlock%10 == 0 && (BlockIndex == 0 || ColsThisBlock > 0)) File.PutChar(' '); char c = GetChar(Seq, Col); if (isalpha(c)) c = toupper(c); File.PutChar(c); ++Col; } File.PutChar('\n'); if (Col == ColCount) break; ++BlockIndex; } } } void MSA::ToPhyInterleavedFile(TextFile &File) const { const unsigned SeqCount = GetSeqCount(); const unsigned ColCount = GetColCount(); File.PutFormat("%d %d\n", SeqCount, ColCount); if (0 == ColCount) return; int Col = 0; for (;;) { const unsigned ColBlockStart = Col; const unsigned MaxCols = (ColBlockStart == 0) ? 
(BLOCKSIZE - 10) : BLOCKSIZE; for (unsigned Seq = 0; Seq < SeqCount; ++Seq) { if (0 == ColBlockStart) { char Name[11]; const char *ptrName = GetSeqName(Seq); size_t n = strlen(ptrName); if (n > 10) n = 10; memcpy(Name, ptrName, n); Name[n] = 0; FixName(Name); File.PutFormat("%-10.10s", Name); } Col = ColBlockStart; for (unsigned ColsThisBlock = 0; ColsThisBlock < MaxCols; ++ColsThisBlock) { if (Col == ColCount) break; if (ColsThisBlock%10 == 0 && (0 == ColBlockStart || ColsThisBlock > 0)) File.PutChar(' '); char c = GetChar(Seq, Col); if (isalpha(c)) c = toupper(c); File.PutChar(c); ++Col; } File.PutChar('\n'); } if (Col == ColCount) break; File.PutChar('\n'); } } muscle-3.8.31.orig/muscleout.cpp0000644000175000017500000000415411352261673016131 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "params.h" #include "textfile.h" static void DoOutput(MSA &msa) { bool AnyOutput = false; // Value options if (g_pstrFASTAOutFileName) { TextFile File(g_pstrFASTAOutFileName, true); msa.ToFASTAFile(File); AnyOutput = true; } if (g_pstrMSFOutFileName) { TextFile File(g_pstrMSFOutFileName, true); msa.ToMSFFile(File); AnyOutput = true; } if (g_pstrClwOutFileName) { TextFile File(g_pstrClwOutFileName, true); msa.ToAlnFile(File); AnyOutput = true; } if (g_pstrClwStrictOutFileName) { g_bClwStrict = true; TextFile File(g_pstrClwStrictOutFileName, true); msa.ToAlnFile(File); AnyOutput = true; } if (g_pstrHTMLOutFileName) { TextFile File(g_pstrHTMLOutFileName, true); msa.ToHTMLFile(File); AnyOutput = true; } if (g_pstrPHYIOutFileName) { TextFile File(g_pstrPHYIOutFileName, true); msa.ToPhyInterleavedFile(File); AnyOutput = true; } if (g_pstrPHYSOutFileName) { TextFile File(g_pstrPHYSOutFileName, true); msa.ToPhySequentialFile(File); AnyOutput = true; } // Flag options, at most one used (because only one -out filename) TextFile fileOut(g_pstrOutFileName, true); if (g_bFASTA) { msa.ToFASTAFile(fileOut); AnyOutput = true; } else if (g_bMSF) { msa.ToMSFFile(fileOut); 
AnyOutput = true; } else if (g_bAln) { msa.ToAlnFile(fileOut); AnyOutput = true; } else if (g_bHTML) { msa.ToHTMLFile(fileOut); AnyOutput = true; } else if (g_bPHYI) { msa.ToPhyInterleavedFile(fileOut); AnyOutput = true; } else if (g_bPHYS) { msa.ToPhySequentialFile(fileOut); AnyOutput = true; } // If -out option was given but no flags, output as FASTA if (!AnyOutput) msa.ToFASTAFile(fileOut); fileOut.Close(); if (0 != g_pstrScoreFileName) WriteScoreFile(msa); } void MuscleOutput(MSA &msa) { MHackEnd(msa); if (g_bStable) { MSA msaStable; Stabilize(msa, msaStable); msa.Clear(); // save memory DoOutput(msaStable); } else DoOutput(msa); } muscle-3.8.31.orig/textfile.h0000644000175000017500000000306211352261673015377 0ustar kratzcharles#ifndef TextFile_h #define TextFile_h #include struct TEXTFILEPOS { unsigned uOffset; unsigned uLineNr; unsigned uColNr; }; const unsigned TextFileBufferSize = 256; class TextFile { private: // no default c'tor, not implemented TextFile(); public: virtual ~TextFile(); TextFile(const char szFileName[], bool bWrite = false); TextFile(FILE *ptrFile, const char *ptrFileName = "-"); void Close() { fclose(m_ptrFile); m_ptrFile = 0; } bool GetLine(char szLine[], unsigned uBytes); bool GetTrimLine(char szLine[], unsigned uBytes); void GetLineX(char szLine[], unsigned uBytes); bool GetToken(char szToken[], unsigned uBytes, const char szCharTokens[] = "{}"); void GetTokenX(char szToken[], unsigned uBytes, const char szCharTokens[] = "{}"); void Skip(); void SkipLine(); void SkipWhite(); bool SkipWhiteX(); void Rewind(); TEXTFILEPOS GetPos(); void SetPos(TEXTFILEPOS Pos); bool GetChar(char &c); void GetCharX(char &c); void GetNonblankChar(char &c); unsigned GetLineNr() { return m_uLineNr; } void PutString(const char szLine[]); void PutFormat(const char szFormat[], ...); void PutChar(char c); const char *GetFileName() { return m_ptrName; } void PushBack(int c) { m_cPushedBack = c; } FILE *GetStdioFile() const { return m_ptrFile; } private: void 
Init(FILE *ptrFile, const char *ptrFileName); private: FILE *m_ptrFile; unsigned m_uLineNr; unsigned m_uColNr; char *m_ptrName; bool m_bLastCharWasEOL; int m_cPushedBack; }; #endif // TextFile_h muscle-3.8.31.orig/aligntwoprofs.cpp0000644000175000017500000000147711352261667017017 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "profile.h" #include "pwpath.h" SCORE GlobalAlign4(ProfPos *PA, unsigned uLengthA, ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE AlignTwoProfs( const ProfPos *PA, unsigned uLengthA, WEIGHT wA, const ProfPos *PB, unsigned uLengthB, WEIGHT wB, PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut) { assert(uLengthA < 100000); assert(uLengthB < 100000); float r = (float) uLengthA/ (float) (uLengthB + 1); // +1 to prevent div 0 if (r < 1) r = 1/r; SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path); AlignTwoProfsGivenPath(Path, PA, uLengthB, wA/(wA + wB), PB, uLengthB, wB/(wA + wB), ptrPout, ptruLengthOut); #if HYDRO if (ALPHA_Amino == g_Alpha) Hydro(*ptrPout, *ptruLengthOut); #endif return Score; } muscle-3.8.31.orig/upgma2.cpp0000644000175000017500000002425611352261611015301 0ustar kratzcharles#include "muscle.h" #include "tree.h" #include "distcalc.h" // UPGMA clustering in O(N^2) time and space. #define TRACE 0 #define MIN(x, y) ((x) < (y) ? (x) : (y)) #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define AVG(x, y) (((x) + (y))/2) static unsigned g_uLeafCount; static unsigned g_uTriangleSize; static unsigned g_uInternalNodeCount; static unsigned g_uInternalNodeIndex; // Triangular distance matrix is g_Dist, which is allocated // as a one-dimensional vector of length g_uTriangleSize. // TriangleSubscript(i,j) maps row,column=i,j to the subscript // into this vector. // Row / column coordinates are a bit messy. // Initially they are leaf indexes 0..N-1. // But each time we create a new node (=new cluster, new subtree), // we re-use one of the two rows that become available (the children // of the new node). 
This saves memory. // We keep track of this through the g_uNodeIndex vector. static dist_t *g_Dist; // Distance to nearest neighbor in row i of distance matrix. // Subscript is distance matrix row. static dist_t *g_MinDist; // Nearest neighbor to row i of distance matrix. // Subscript is distance matrix row. static unsigned *g_uNearestNeighbor; // Node index of row i in distance matrix. // Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes. // Subscript is distance matrix row. static unsigned *g_uNodeIndex; // The following vectors are defined on internal nodes, // subscripts are internal node index 0..N-2. // For g_uLeft/Right, value is the node index 0 .. 2N-2 // because a child can be internal or leaf. static unsigned *g_uLeft; static unsigned *g_uRight; static dist_t *g_Height; static dist_t *g_LeftLength; static dist_t *g_RightLength; static inline unsigned TriangleSubscript(unsigned uIndex1, unsigned uIndex2) { #if DEBUG if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount) Quit("TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount); #endif unsigned v; if (uIndex1 >= uIndex2) v = uIndex2 + (uIndex1*(uIndex1 - 1))/2; else v = uIndex1 + (uIndex2*(uIndex2 - 1))/2; assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2); return v; } static void ListState() { Log("Dist matrix\n"); Log(" "); for (unsigned i = 0; i < g_uLeafCount; ++i) { if (uInsane == g_uNodeIndex[i]) continue; Log(" %5u", g_uNodeIndex[i]); } Log("\n"); for (unsigned i = 0; i < g_uLeafCount; ++i) { if (uInsane == g_uNodeIndex[i]) continue; Log("%5u ", g_uNodeIndex[i]); for (unsigned j = 0; j < g_uLeafCount; ++j) { if (uInsane == g_uNodeIndex[j]) continue; if (i == j) Log(" "); else { unsigned v = TriangleSubscript(i, j); Log("%5.2g ", g_Dist[v]); } } Log("\n"); } Log("\n"); Log(" i Node NrNb Dist\n"); Log("----- ----- ----- --------\n"); for (unsigned i = 0; i < g_uLeafCount; ++i) { if (uInsane == g_uNodeIndex[i]) continue; Log("%5u %5u %5u %8.3f\n", i, g_uNodeIndex[i], 
g_uNearestNeighbor[i], g_MinDist[i]); } Log("\n"); Log(" Node L R Height LLength RLength\n"); Log("----- ----- ----- ------ ------- -------\n"); for (unsigned i = 0; i <= g_uInternalNodeIndex; ++i) Log("%5u %5u %5u %6.2g %6.2g %6.2g\n", i, g_uLeft[i], g_uRight[i], g_Height[i], g_LeftLength[i], g_RightLength[i]); } void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage) { g_uLeafCount = DC.GetCount(); g_uTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2; g_uInternalNodeCount = g_uLeafCount - 1; g_Dist = new dist_t[g_uTriangleSize]; g_uNodeIndex = new unsigned[g_uLeafCount]; g_uNearestNeighbor = new unsigned[g_uLeafCount]; g_MinDist = new dist_t[g_uLeafCount]; unsigned *Ids = new unsigned [g_uLeafCount]; char **Names = new char *[g_uLeafCount]; g_uLeft = new unsigned[g_uInternalNodeCount]; g_uRight = new unsigned[g_uInternalNodeCount]; g_Height = new dist_t[g_uInternalNodeCount]; g_LeftLength = new dist_t[g_uInternalNodeCount]; g_RightLength = new dist_t[g_uInternalNodeCount]; for (unsigned i = 0; i < g_uLeafCount; ++i) { g_MinDist[i] = BIG_DIST; g_uNodeIndex[i] = i; g_uNearestNeighbor[i] = uInsane; Ids[i] = DC.GetId(i); Names[i] = strsave(DC.GetName(i)); } for (unsigned i = 0; i < g_uInternalNodeCount; ++i) { g_uLeft[i] = uInsane; g_uRight[i] = uInsane; g_LeftLength[i] = BIG_DIST; g_RightLength[i] = BIG_DIST; g_Height[i] = BIG_DIST; } // Compute initial NxN triangular distance matrix. // Store minimum distance for each full (not triangular) row. // Loop from 1, not 0, because "row" is 0, 1 ... i-1, // so nothing to do when i=0. 
for (unsigned i = 1; i < g_uLeafCount; ++i) { dist_t *Row = g_Dist + TriangleSubscript(i, 0); DC.CalcDistRange(i, Row); for (unsigned j = 0; j < i; ++j) { const dist_t d = Row[j]; if (d < g_MinDist[i]) { g_MinDist[i] = d; g_uNearestNeighbor[i] = j; } if (d < g_MinDist[j]) { g_MinDist[j] = d; g_uNearestNeighbor[j] = i; } } } #if TRACE Log("Initial state:\n"); ListState(); #endif for (g_uInternalNodeIndex = 0; g_uInternalNodeIndex < g_uLeafCount - 1; ++g_uInternalNodeIndex) { #if TRACE Log("\n"); Log("Internal node index %5u\n", g_uInternalNodeIndex); Log("-------------------------\n"); #endif // Find nearest neighbors unsigned Lmin = uInsane; unsigned Rmin = uInsane; dist_t dtMinDist = BIG_DIST; for (unsigned j = 0; j < g_uLeafCount; ++j) { if (uInsane == g_uNodeIndex[j]) continue; dist_t d = g_MinDist[j]; if (d < dtMinDist) { dtMinDist = d; Lmin = j; Rmin = g_uNearestNeighbor[j]; assert(uInsane != Rmin); assert(uInsane != g_uNodeIndex[Rmin]); } } assert(Lmin != uInsane); assert(Rmin != uInsane); assert(dtMinDist != BIG_DIST); #if TRACE Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n", Lmin, g_uNodeIndex[Lmin], Rmin, g_uNodeIndex[Rmin], dtMinDist); #endif // Compute distances to new node // New node overwrites row currently assigned to Lmin dist_t dtNewMinDist = BIG_DIST; unsigned uNewNearestNeighbor = uInsane; for (unsigned j = 0; j < g_uLeafCount; ++j) { if (j == Lmin || j == Rmin) continue; if (uInsane == g_uNodeIndex[j]) continue; const unsigned vL = TriangleSubscript(Lmin, j); const unsigned vR = TriangleSubscript(Rmin, j); const dist_t dL = g_Dist[vL]; const dist_t dR = g_Dist[vR]; dist_t dtNewDist; switch (Linkage) { case LINKAGE_Avg: dtNewDist = AVG(dL, dR); break; case LINKAGE_Min: dtNewDist = MIN(dL, dR); break; case LINKAGE_Max: dtNewDist = MAX(dL, dR); break; case LINKAGE_Biased: dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR); break; default: Quit("UPGMA2: Invalid LINKAGE_%u", Linkage); } // Nasty special case. 
// If nearest neighbor of j is Lmin or Rmin, then make the new // node (which overwrites the row currently occupied by Lmin) // the nearest neighbor. This situation can occur when there are // equal distances in the matrix. If we don't make this fix, // the nearest neighbor pointer for j would become invalid. // (We don't need to test for == Lmin, because in that case // the net change needed is zero due to the change in row // numbering). if (g_uNearestNeighbor[j] == Rmin) g_uNearestNeighbor[j] = Lmin; #if TRACE Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n", j, Lmin, dL, Rmin, dR, dtNewDist); #endif g_Dist[vL] = dtNewDist; if (dtNewDist < dtNewMinDist) { dtNewMinDist = dtNewDist; uNewNearestNeighbor = j; } } assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist); assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor); const unsigned v = TriangleSubscript(Lmin, Rmin); const dist_t dLR = g_Dist[v]; const dist_t dHeightNew = dLR/2; const unsigned uLeft = g_uNodeIndex[Lmin]; const unsigned uRight = g_uNodeIndex[Rmin]; const dist_t HeightLeft = uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount]; const dist_t HeightRight = uRight < g_uLeafCount ? 
0 : g_Height[uRight - g_uLeafCount]; g_uLeft[g_uInternalNodeIndex] = uLeft; g_uRight[g_uInternalNodeIndex] = uRight; g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft; g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight; g_Height[g_uInternalNodeIndex] = dHeightNew; // Row for left child overwritten by row for new node g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex; g_uNearestNeighbor[Lmin] = uNewNearestNeighbor; g_MinDist[Lmin] = dtNewMinDist; // Delete row for right child g_uNodeIndex[Rmin] = uInsane; #if TRACE Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n", g_uInternalNodeIndex, Lmin, Rmin); ListState(); #endif } unsigned uRoot = g_uLeafCount - 2; tree.Create(g_uLeafCount, uRoot, g_uLeft, g_uRight, g_LeftLength, g_RightLength, Ids, Names); #if TRACE tree.LogMe(); #endif delete[] g_Dist; delete[] g_uNodeIndex; delete[] g_uNearestNeighbor; delete[] g_MinDist; delete[] g_Height; delete[] g_uLeft; delete[] g_uRight; delete[] g_LeftLength; delete[] g_RightLength; for (unsigned i = 0; i < g_uLeafCount; ++i) free(Names[i]); delete[] Names; delete[] Ids; } class DistCalcTest : public DistCalc { virtual void CalcDistRange(unsigned i, dist_t Dist[]) const { static dist_t TestDist[5][5] = { 0, 2, 14, 14, 20, 2, 0, 14, 14, 20, 14, 14, 0, 4, 20, 14, 14, 4, 0, 20, 20, 20, 20, 20, 0, }; for (unsigned j = 0; j < i; ++j) Dist[j] = TestDist[i][j]; } virtual unsigned GetCount() const { return 5; } virtual unsigned GetId(unsigned i) const { return i; } virtual const char *GetName(unsigned i) const { return "name"; } }; void Test() { SetListFileName("c:\\tmp\\lobster.log", false); DistCalcTest DC; Tree tree; UPGMA2(DC, tree, LINKAGE_Avg); } muscle-3.8.31.orig/phyfromfile.cpp0000644000175000017500000001342611352261673016437 0ustar kratzcharles#include "muscle.h" #include "tree.h" #include "textfile.h" #define TRACE 0 // Tokens in Newick files are: // ( ) : , ; // string // 'string' // "string" // [ comment ] // // We can't safely distinguish between 
identifiers and floating point // numbers at the lexical level (because identifiers may be numeric, // or start with digits), so both edge lengths and identifiers are // returned as strings. const char *Tree::NTTStr(NEWICK_TOKEN_TYPE NTT) const { switch (NTT) { #define c(x) case NTT_##x: return #x; c(Unknown) c(Lparen) c(Rparen) c(Colon) c(Comma) c(Semicolon) c(String) c(SingleQuotedString) c(DoubleQuotedString) c(Comment) #undef c } return "??"; } NEWICK_TOKEN_TYPE Tree::GetToken(TextFile &File, char szToken[], unsigned uBytes) const { // Skip leading white space File.SkipWhite(); char c; File.GetCharX(c); // In case a single-character token szToken[0] = c; szToken[1] = 0; unsigned uBytesCopied = 0; NEWICK_TOKEN_TYPE TT; switch (c) { case '(': return NTT_Lparen; case ')': return NTT_Rparen; case ':': return NTT_Colon; case ';': return NTT_Semicolon; case ',': return NTT_Comma; case '\'': TT = NTT_SingleQuotedString; File.GetCharX(c); break; case '"': TT = NTT_DoubleQuotedString; File.GetCharX(c); break; case '[': TT = NTT_Comment; break; default: TT = NTT_String; break; } for (;;) { if (TT != NTT_Comment) { if (uBytesCopied < uBytes - 2) { szToken[uBytesCopied++] = c; szToken[uBytesCopied] = 0; } else Quit("Tree::GetToken: input buffer too small, token so far='%s'", szToken); } bool bEof = File.GetChar(c); if (bEof) return TT; switch (TT) { case NTT_String: if (0 != strchr("():;,", c)) { File.PushBack(c); return NTT_String; } if (isspace(c)) return NTT_String; break; case NTT_SingleQuotedString: if ('\'' == c) return NTT_String; break; case NTT_DoubleQuotedString: if ('"' == c) return NTT_String; break; case NTT_Comment: if (']' == c) return GetToken(File, szToken, uBytes); break; default: Quit("Tree::GetToken, invalid TT=%u", TT); } } } // NOTE: this hack must come after definition of Tree::GetToken. #if TRACE #define GetToken GetTokenVerbose #endif void Tree::FromFile(TextFile &File) { // Assume rooted. 
// If we discover that it is unrooted, will convert on the fly. CreateRooted(); double dEdgeLength; bool bEdgeLength = GetGroupFromFile(File, 0, &dEdgeLength); // Next token should be either ';' for rooted tree or ',' for unrooted. char szToken[16]; NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken)); // If rooted, all done. if (NTT_Semicolon == NTT) { if (bEdgeLength) Log(" *** Warning *** edge length on root group in Newick file %s\n", File.GetFileName()); Validate(); return; } if (NTT_Comma != NTT) Quit("Tree::FromFile, expected ';' or ',', got '%s'", szToken); const unsigned uThirdNode = UnrootFromFile(); bEdgeLength = GetGroupFromFile(File, uThirdNode, &dEdgeLength); if (bEdgeLength) SetEdgeLength(0, uThirdNode, dEdgeLength); Validate(); } // Return true if edge length for this group. bool Tree::GetGroupFromFile(TextFile &File, unsigned uNodeIndex, double *ptrdEdgeLength) { char szToken[1024]; NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken)); // Group is either leaf name or (left, right). if (NTT_String == NTT) { SetLeafName(uNodeIndex, szToken); #if TRACE Log("Group is leaf '%s'\n", szToken); #endif } else if (NTT_Lparen == NTT) { const unsigned uLeft = AppendBranch(uNodeIndex); const unsigned uRight = uLeft + 1; // Left sub-group... #if TRACE Log("Got '(', group is compound, expect left sub-group\n"); #endif double dEdgeLength; bool bLeftLength = GetGroupFromFile(File, uLeft, &dEdgeLength); #if TRACE if (bLeftLength) Log("Edge length for left sub-group: %.3g\n", dEdgeLength); else Log("No edge length for left sub-group\n"); #endif if (bLeftLength) SetEdgeLength(uNodeIndex, uLeft, dEdgeLength); // ... then comma ... #if TRACE Log("Expect comma\n"); #endif NTT = GetToken(File, szToken, sizeof(szToken)); if (NTT_Comma != NTT) Quit("Tree::GetGroupFromFile, expected ',', got '%s'", szToken); // ...then right sub-group... 
#if TRACE Log("Expect right sub-group\n"); #endif bool bRightLength = GetGroupFromFile(File, uRight, &dEdgeLength); if (bRightLength) SetEdgeLength(uNodeIndex, uRight, dEdgeLength); #if TRACE if (bRightLength) Log("Edge length for right sub-group: %.3g\n", dEdgeLength); else Log("No edge length for right sub-group\n"); #endif // ... then closing parenthesis. #if TRACE Log("Expect closing parenthesis (or comma if > 2-ary)\n"); #endif NTT = GetToken(File, szToken, sizeof(szToken)); if (NTT_Rparen == NTT) ; else if (NTT_Comma == NTT) { File.PushBack(','); return false; } else Quit("Tree::GetGroupFromFile, expected ')' or ',', got '%s'", szToken); } else Quit("Tree::GetGroupFromFile, expected '(' or leaf name, got '%s'", szToken); // Group may optionally be followed by edge length. bool bEof = File.SkipWhiteX(); if (bEof) return false; char c; File.GetCharX(c); #if TRACE Log("Character following group, could be colon, is '%c'\n", c); #endif if (':' == c) { NTT = GetToken(File, szToken, sizeof(szToken)); if (NTT_String != NTT) Quit("Tree::GetGroupFromFile, expected edge length, got '%s'", szToken); *ptrdEdgeLength = atof(szToken); return true; } File.PushBack(c); return false; } muscle-3.8.31.orig/refinew.cpp0000644000175000017500000001245011352261666015550 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "seqvect.h" #include "textfile.h" #define MEMDEBUG 0 #if MEMDEBUG #include #endif void MUSCLE(SeqVect &v, MSA &msaOut); // Append msa2 at the end of msa1 void AppendMSA(MSA &msa1, const MSA &msa2) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2; bool bFound = msa2.GetSeqIndex(uId, &uSeqIndex2); if (bFound) { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { 
const char c = msa2.GetChar(uSeqIndex2, uColIndex); msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } else { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } } static void SeqFromMSACols(const MSA &msa, unsigned uSeqIndex, unsigned uColFrom, unsigned uColTo, Seq &s) { s.Clear(); s.SetName(msa.GetSeqName(uSeqIndex)); s.SetId(msa.GetSeqId(uSeqIndex)); for (unsigned uColIndex = uColFrom; uColIndex <= uColTo; ++uColIndex) { char c = msa.GetChar(uSeqIndex, uColIndex); if (!IsGapChar(c)) s.AppendChar(c); } } static void SeqVectFromMSACols(const MSA &msa, unsigned uColFrom, unsigned uColTo, SeqVect &v) { v.Clear(); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq s; SeqFromMSACols(msa, uSeqIndex, uColFrom, uColTo, s); v.AppendSeq(s); } } void RefineW(const MSA &msaIn, MSA &msaOut) { const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uColCount = msaIn.GetColCount(); // Reserve same nr seqs, 20% more cols const unsigned uReserveColCount = (uColCount*120)/100; msaOut.SetSize(uSeqCount, uReserveColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { msaOut.SetSeqName(uSeqIndex, msaIn.GetSeqName(uSeqIndex)); msaOut.SetSeqId(uSeqIndex, msaIn.GetSeqId(uSeqIndex)); } const unsigned uWindowCount = (uColCount + g_uRefineWindow - 1)/g_uRefineWindow; if (0 == g_uWindowTo) g_uWindowTo = uWindowCount - 1; #if MEMDEBUG _CrtSetBreakAlloc(1560); #endif if (g_uWindowOffset > 0) { MSA msaTmp; MSAFromColRange(msaIn, 0, g_uWindowOffset, msaOut); } fprintf(stderr, "\n"); for (unsigned uWindowIndex = g_uWindowFrom; uWindowIndex <= g_uWindowTo; ++uWindowIndex) { fprintf(stderr, "Window %d of %d \r", uWindowIndex, uWindowCount); const unsigned uColFrom = g_uWindowOffset + uWindowIndex*g_uRefineWindow; unsigned uColTo = uColFrom + g_uRefineWindow - 1; if (uColTo >= uColCount) uColTo = uColCount - 1; assert(uColTo >= 
uColFrom); SeqVect v; SeqVectFromMSACols(msaIn, uColFrom, uColTo, v); #if MEMDEBUG _CrtMemState s1; _CrtMemCheckpoint(&s1); #endif MSA msaTmp; MUSCLE(v, msaTmp); AppendMSA(msaOut, msaTmp); if (uWindowIndex == g_uSaveWindow) { MSA msaInTmp; unsigned uOutCols = msaOut.GetColCount(); unsigned un = uColTo - uColFrom + 1; MSAFromColRange(msaIn, uColFrom, un, msaInTmp); char fn[256]; sprintf(fn, "win%d_inaln.tmp", uWindowIndex); TextFile fIn(fn, true); msaInTmp.ToFile(fIn); sprintf(fn, "win%d_inseqs.tmp", uWindowIndex); TextFile fv(fn, true); v.ToFile(fv); sprintf(fn, "win%d_outaln.tmp", uWindowIndex); TextFile fOut(fn, true); msaTmp.ToFile(fOut); } #if MEMDEBUG void FreeDPMemSPN(); FreeDPMemSPN(); _CrtMemState s2; _CrtMemCheckpoint(&s2); _CrtMemState s; _CrtMemDifference(&s, &s1, &s2); _CrtMemDumpStatistics(&s); _CrtMemDumpAllObjectsSince(&s1); exit(1); #endif //#if DEBUG // AssertMSAEqIgnoreCaseAndGaps(msaInTmp, msaTmp); //#endif } fprintf(stderr, "\n"); // AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);//@@uncomment! } void DoRefineW() { SetOutputFileName(g_pstrOutFileName); SetInputFileName(g_pstrInFileName); SetStartTime(); SetMaxIters(g_uMaxIters); SetSeqWeightMethod(g_SeqWeight1); TextFile fileIn(g_pstrInFileName); MSA msa; msa.FromFile(fileIn); const unsigned uSeqCount = msa.GetSeqCount(); if (0 == uSeqCount) Quit("No sequences in input file"); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. 
#ifndef ObjScore_h
#define ObjScore_h

// Objective-score entry points. This header holds declarations only; the
// types SCORE, MSA, ProfPos and PWPath must already be declared by the
// including file.

// Pairwise scores between row uSeqIndex1 of msa1 and row uSeqIndex2 of msa2
// (gap part and letter part respectively, judging by the names -- TODO
// confirm against the definitions).
SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1,
  const MSA &msa2, unsigned uSeqIndex2);

SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1,
  const MSA &msa2, unsigned uSeqIndex2);

SCORE ScoreGaps(const MSA &msa, const unsigned Cols[], unsigned ColCount);

// Objective score of msa using the scoring mode selected by g_ObjScore.
// In objscore.cpp the two index lists are only used by the DP and XP modes,
// where they pick the two sequence subsets that are scored against each
// other; the other modes score msa as a whole.
SCORE ObjScore(const MSA &msa, const unsigned SeqIndexes1[],
  unsigned uSeqCount1, const unsigned SeqIndexes2[], unsigned uSeqCount2);

// Same as ObjScore, but the two groups are given as sequence ids, which are
// translated to sequence indexes with MSA::GetSeqIndex first.
SCORE ObjScoreIds(const MSA &msa, const unsigned Ids1[],
  unsigned uCount1, const unsigned Ids2[], unsigned uCount2);

void GetLetterScores(const MSA &msa, SCORE LetterScores[]);

// Individual scoring modes dispatched to by ObjScore. MatchScore defaults
// to a null pointer; presumably an optional per-column output array --
// TODO confirm against the definitions.
SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[] = 0);
SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[] = 0);
SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[] = 0);
SCORE ObjScoreXP(const MSA &msa, const MSA &msa2);
SCORE ObjScoreSPDimer(const MSA &msa);

SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount,
  SCORE MatchScore[] = 0);

SCORE DiffObjScore(
  const MSA &msa1, const PWPath &Path1, const unsigned Edges1[], unsigned uEdgeCount1,
  const MSA &msa2, const PWPath &Path2, const unsigned Edges2[], unsigned uEdgeCount2);

#endif // ObjScore_h
muscle-3.8.31.orig/version.txt0000644000175000017500000000000411366122745015622 0ustar kratzcharles3.9 muscle-3.8.31.orig/glbalignss.cpp0000644000175000017500000001631011352261636016232 0ustar kratzcharles#include "muscle.h" #include "profile.h" #include "pwpath.h" #include "seq.h" extern SCOREMATRIX VTML_SP; // #define SUBST(i, j) Subst(seqA, seqB, i, j) #define SUBST(i, j) MxRowA[i][seqB.GetLetter(j)] static SCORE Subst(const Seq &seqA, const Seq &seqB, unsigned i, unsigned j) { assert(i < seqA.Length()); assert(j < seqB.Length()); unsigned uLetterA = seqA.GetLetter(i); unsigned uLetterB = seqB.GetLetter(j); return VTML_SP[uLetterA][uLetterB] + g_scoreCenter; } struct DP_MEMORY { unsigned uLength; SCORE *MPrev; SCORE *MCurr; SCORE *MWork; SCORE *DPrev; SCORE *DCurr; SCORE *DWork; SCORE **MxRowA; unsigned *LettersB; unsigned *uDeletePos; int **TraceBack; }; static struct DP_MEMORY DPM; static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) { // Max prefix length unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; if (uLength < DPM.uLength) return; // Add 256 to allow for future expansion and // round up to next multiple of 32. 
uLength += 256; uLength += 32 - uLength%32; const unsigned uOldLength = DPM.uLength; if (uOldLength > 0) { for (unsigned i = 0; i < uOldLength; ++i) delete[] DPM.TraceBack[i]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.MxRowA; delete[] DPM.LettersB; delete[] DPM.uDeletePos; delete[] DPM.TraceBack; } DPM.uLength = uLength; DPM.MPrev = new SCORE[uLength]; DPM.MCurr = new SCORE[uLength]; DPM.MWork = new SCORE[uLength]; DPM.DPrev = new SCORE[uLength]; DPM.DCurr = new SCORE[uLength]; DPM.DWork = new SCORE[uLength]; DPM.MxRowA = new SCORE *[uLength]; DPM.LettersB = new unsigned[uLength]; DPM.uDeletePos = new unsigned[uLength]; DPM.TraceBack = new int*[uLength]; for (unsigned i = 0; i < uLength; ++i) DPM.TraceBack[i] = new int[uLength]; } static void RowFromSeq(const Seq &s, SCORE *Row[]) { const unsigned uLength = s.Length(); for (unsigned i = 0; i < uLength; ++i) { char c = s.GetChar(i); unsigned uLetter = CharToLetter(c); if (uLetter < 20) Row[i] = VTML_SP[uLetter]; else Row[i] = VTML_SP[AX_X]; } } static void LettersFromSeq(const Seq &s, unsigned Letters[]) { const unsigned uLength = s.Length(); for (unsigned i = 0; i < uLength; ++i) { char c = s.GetChar(i); unsigned uLetter = CharToLetter(c); if (uLetter < 20) Letters[i] = uLetter; else Letters[i] = AX_X; } } SCORE GlobalAlignSS(const Seq &seqA, const Seq &seqB, PWPath &Path) { const unsigned uLengthA = seqA.Length(); const unsigned uLengthB = seqB.Length(); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; AllocDPMem(uLengthA, uLengthB); SCORE *MPrev = DPM.MPrev; SCORE *MCurr = DPM.MCurr; SCORE *MWork = DPM.MWork; SCORE *DPrev = DPM.DPrev; SCORE *DCurr = DPM.DCurr; SCORE *DWork = DPM.DWork; SCORE **MxRowA = DPM.MxRowA; unsigned *LettersB = DPM.LettersB; RowFromSeq(seqA, MxRowA); LettersFromSeq(seqB, LettersB); unsigned *uDeletePos = DPM.uDeletePos; int **TraceBack = DPM.TraceBack; #if 
DEBUG for (unsigned i = 0; i < uPrefixCountA; ++i) memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); #endif // Special case for i=0 TraceBack[0][0] = 0; MPrev[0] = MxRowA[0][LettersB[0]]; // D(0,0) is -infinity (requires I->D). DPrev[0] = MINUS_INFINITY; for (unsigned j = 1; j < uLengthB; ++j) { unsigned uLetterB = LettersB[j]; // Only way to get M(0, j) looks like this: // A ----X // B XXXXX // 0 j // So gap-open at j=0, gap-close at j-1. MPrev[j] = MxRowA[0][uLetterB] + g_scoreGapOpen/2; // term gaps half TraceBack[0][j] = -(int) j; // Assume no D->I transitions, then can't be a delete if only // one letter from A. DPrev[j] = MINUS_INFINITY; } SCORE IPrev_j_1; for (unsigned i = 1; i < uLengthA; ++i) { SCORE *ptrMCurr_j = MCurr; memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); const SCORE *RowA = MxRowA[i]; const SCORE *ptrRowA = MxRowA[i]; const SCORE *ptrMCurrEnd = ptrMCurr_j + uLengthB; unsigned *ptrLettersB = LettersB; for (; ptrMCurr_j != ptrMCurrEnd; ++ptrMCurr_j) { *ptrMCurr_j = RowA[*ptrLettersB]; ++ptrLettersB; } unsigned *ptrDeletePos = uDeletePos; // Special case for j=0 // Only way to get M(i, 0) looks like this: // 0 i // A XXXXX // B ----X // So gap-open at i=0, gap-close at i-1. 
ptrMCurr_j = MCurr; assert(ptrMCurr_j == &(MCurr[0])); *ptrMCurr_j += g_scoreGapOpen/2; // term gaps half ++ptrMCurr_j; int *ptrTraceBack_ij = TraceBack[i]; *ptrTraceBack_ij++ = (int) i; SCORE *ptrMPrev_j = MPrev; SCORE *ptrDPrev = DPrev; SCORE d = *ptrDPrev; SCORE DNew = *ptrMPrev_j + g_scoreGapOpen; if (DNew > d) { d = DNew; *ptrDeletePos = i; } SCORE *ptrDCurr = DCurr; assert(ptrDCurr == &(DCurr[0])); *ptrDCurr = d; // Can't have an insert if no letters from B IPrev_j_1 = MINUS_INFINITY; unsigned uInsertPos; for (unsigned j = 1; j < uLengthB; ++j) { // Here, MPrev_j is preserved from previous // iteration so with current i,j is M[i-1][j-1] SCORE MPrev_j = *ptrMPrev_j; SCORE INew = MPrev_j + g_scoreGapOpen; if (INew > IPrev_j_1) { IPrev_j_1 = INew; uInsertPos = j; } SCORE scoreMax = MPrev_j; assert(ptrDPrev == &(DPrev[j-1])); SCORE scoreD = *ptrDPrev++; if (scoreD > scoreMax) { scoreMax = scoreD; assert(ptrDeletePos == &(uDeletePos[j-1])); *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; assert(*ptrTraceBack_ij > 0); } ++ptrDeletePos; SCORE scoreI = IPrev_j_1; if (scoreI > scoreMax) { scoreMax = scoreI; *ptrTraceBack_ij = (int) uInsertPos - (int) j; assert(*ptrTraceBack_ij < 0); } *ptrMCurr_j += scoreMax; assert(ptrMCurr_j == &(MCurr[j])); ++ptrMCurr_j; MPrev_j = *(++ptrMPrev_j); assert(ptrDPrev == &(DPrev[j])); SCORE d = *ptrDPrev; SCORE DNew = MPrev_j + g_scoreGapOpen; if (DNew > d) { d = DNew; assert(ptrDeletePos == &uDeletePos[j]); *ptrDeletePos = i; } assert(ptrDCurr + 1 == &(DCurr[j])); *(++ptrDCurr) = d; ++ptrTraceBack_ij; } Rotate(MPrev, MCurr, MWork); Rotate(DPrev, DCurr, DWork); } // Special case for i=uLengthA SCORE IPrev = MINUS_INFINITY; unsigned uInsertPos; for (unsigned j = 1; j < uLengthB; ++j) { SCORE INew = MPrev[j-1]; if (INew > IPrev) { uInsertPos = j; IPrev = INew; } } // Special case for i=uLengthA, j=uLengthB SCORE scoreMax = MPrev[uLengthB-1]; int iTraceBack = 0; SCORE scoreD = DPrev[uLengthB-1] - g_scoreGapOpen/2; // term gaps half if 
(scoreD > scoreMax) { scoreMax = scoreD; iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; } SCORE scoreI = IPrev - g_scoreGapOpen/2; if (scoreI > scoreMax) { scoreMax = scoreI; iTraceBack = (int) uInsertPos - (int) uLengthB; } TraceBack[uLengthA][uLengthB] = iTraceBack; TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); return scoreMax; } muscle-3.8.31.orig/henikoffweight.cpp0000644000175000017500000000467611352261667017126 0ustar kratzcharles#include "muscle.h" #include "msa.h" /*** Compute Henikoff weights. Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights. J. Mol. Biol., 243(4):574-578. Award each different residue an equal share of the weight, and then to divide up that weight equally among the sequences sharing the same residue. So if in a position of a multiple alignment, r different residues are represented, a residue represented in only one sequence contributes a score of 1/r to that sequence, whereas a residue represented in s sequences contributes a score of 1/rs to each of the s sequences. For each sequence, the contributions from each position are summed to give a sequence weight. See also HenikoffWeightPB. 
***/ void MSA::CalcHenikoffWeightsCol(unsigned uColIndex) const { const unsigned uSeqCount = GetSeqCount(); // Compute letter counts in this column unsigned uLetterCount[MAX_ALPHA]; memset(uLetterCount, 0, sizeof(uLetterCount)); unsigned uDifferentLetterCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); if (uLetter >= 20) continue; unsigned uNewCount = uLetterCount[uLetter] + 1; uLetterCount[uLetter] = uNewCount; if (1 == uNewCount) ++uDifferentLetterCount; } // Compute weight contributions for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); if (uLetter >= 20) continue; const unsigned uCount = uLetterCount[uLetter]; unsigned uDenom = uCount*uDifferentLetterCount; if (uDenom == 0) continue; m_Weights[uSeqIndex] += (WEIGHT) (1.0/uDenom); } } void MSA::SetHenikoffWeights() const { const unsigned uColCount = GetColCount(); const unsigned uSeqCount = GetSeqCount(); if (0 == uSeqCount) return; else if (1 == uSeqCount) { m_Weights[0] = (WEIGHT) 1.0; return; } else if (2 == uSeqCount) { m_Weights[0] = (WEIGHT) 0.5; m_Weights[1] = (WEIGHT) 0.5; return; } for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) m_Weights[uSeqIndex] = 0.0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) CalcHenikoffWeightsCol(uColIndex); // Set all-gap seqs weight to 0 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGapSeq(uSeqIndex)) m_Weights[uSeqIndex] = 0.0; Normalize(m_Weights, uSeqCount); } muscle-3.8.31.orig/fastdistmafft.cpp0000644000175000017500000001651711352261667016761 0ustar kratzcharles#include "muscle.h" #include "distfunc.h" #include "seqvect.h" #include #define TRACE 0 #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? 
// Number of distinct 6-mers over the six residue groups (6^6).
const unsigned TUPLE_COUNT = 6*6*6*6*6*6;

// Per-tuple occurrence counters for the two sequences being compared.
static unsigned char Count1[TUPLE_COUNT];
static unsigned char Count2[TUPLE_COUNT];

// Amino acid groups according to MAFFT (sextet5)
// 0 =  A G P S T
// 1 =  I L M V
// 2 =  N D Q E B Z
// 3 =  R H K
// 4 =  F W Y
// 5 =  C
// 6 =  X . - U
// The table is indexed by letter code and yields the group number.
unsigned ResidueGroup[] =
{
	0,		// AX_A,
	5,		// AX_C,
	2,		// AX_D,
	2,		// AX_E,
	4,		// AX_F,
	0,		// AX_G,
	3,		// AX_H,
	1,		// AX_I,
	3,		// AX_K,
	1,		// AX_L,
	1,		// AX_M,
	2,		// AX_N,
	0,		// AX_P,
	2,		// AX_Q,
	3,		// AX_R,
	0,		// AX_S,
	0,		// AX_T,
	1,		// AX_V,
	4,		// AX_W,
	4,		// AX_Y,

	2,		// AX_B,	// D or N
	2,		// AX_Z,	// E or Q
	0,		// AX_X,	// Unknown

	// ******** TODO *************
	// This isn't the correct way of avoiding group 6
	0		// AX_GAP,
	// ******** TODO ******************
};
unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);

// Render tuple code t as six base-6 digits, most significant first.
// The result lives in a static buffer that is overwritten by the next call.
static char *TupleToStr(int t)
{
	static char szDigits[7];

	// Peel off the least significant digit first and fill right to left;
	// the seventh byte is never written and stays the terminating zero.
	for (int iPos = 5; iPos >= 0; --iPos)
	{
		szDigits[iPos] = '0' + t%6;
		t /= 6;
	}
	return szDigits;
}

// Encode the six letters uLetters[n..n+5] as a base-6 number built from
// their residue groups; the first letter is the most significant digit.
static unsigned GetTuple(const unsigned uLetters[], unsigned n)
{
	unsigned uCode = 0;
	for (unsigned uOffset = 0; uOffset < 6; ++uOffset)
	{
		const unsigned uLetter = uLetters[n + uOffset];
		assert(uLetter < uResidueGroupCount);
		uCode = uCode*6 + ResidueGroup[uLetter];
	}
	return uCode;
}
GetTuple(L, n); ++(Count[uTuple]); } } static void ListCount(const unsigned char Count[]) { for (unsigned n = 0; n < TUPLE_COUNT; ++n) { if (0 == Count[n]) continue; Log("%s %u\n", TupleToStr(n), Count[n]); } } void DistKmer6_6(const SeqVect &v, DistFunc &DF) { const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); if (0 == uSeqCount) return; // Initialize distance matrix to zero for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) DF.SetDist(uSeq1, uSeq2, 0); } // Convert to letters unsigned **Letters = new unsigned *[uSeqCount]; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); unsigned *L = new unsigned[uSeqLength]; Letters[uSeqIndex] = L; for (unsigned n = 0; n < uSeqLength; ++n) { char c = s[n]; L[n] = CharToLetterEx(c); assert(L[n] < uResidueGroupCount); } } unsigned **uCommonTupleCount = new unsigned *[uSeqCount]; for (unsigned n = 0; n < uSeqCount; ++n) { uCommonTupleCount[n] = new unsigned[uSeqCount]; memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned)); } const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; unsigned uCount = 0; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { Seq &seq1 = *(v[uSeq1]); const unsigned uSeqLength1 = seq1.Length(); if (uSeqLength1 < 5) continue; const unsigned uTupleCount = uSeqLength1 - 5; const unsigned *L = Letters[uSeq1]; CountTuples(L, uTupleCount, Count1); #if TRACE { Log("Seq1=%d\n", uSeq1); Log("Groups:\n"); for (unsigned n = 0; n < uSeqLength1; ++n) Log("%u", ResidueGroup[L[n]]); Log("\n"); Log("Tuples:\n"); ListCount(Count1); } #endif SetProgressDesc("K-mer dist pass 1"); for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2) { if (0 == uCount%500) Progress(uCount, uPairCount); ++uCount; Seq &seq2 = *(v[uSeq2]); const unsigned uSeqLength2 = seq2.Length(); if (uSeqLength2 < 5) { if (uSeq1 == uSeq2) DF.SetDist(uSeq1, uSeq2, 0); else 
DF.SetDist(uSeq1, uSeq2, 1); continue; } // First pass through seq 2 to count tuples const unsigned uTupleCount = uSeqLength2 - 5; const unsigned *L = Letters[uSeq2]; CountTuples(L, uTupleCount, Count2); #if TRACE Log("Seq2=%d Counts=\n", uSeq2); ListCount(Count2); #endif // Second pass to accumulate sum of shared tuples // MAFFT defines this as the sum over unique tuples // in seq2 of the minimum of the number of tuples found // in the two sequences. unsigned uSum = 0; for (unsigned n = 0; n < uTupleCount; ++n) { const unsigned uTuple = GetTuple(L, n); uSum += MIN(Count1[uTuple], Count2[uTuple]); // This is a hack to make sure each unique tuple counted only once. Count2[uTuple] = 0; } #if TRACE { Seq &s1 = *(v[uSeq1]); Seq &s2 = *(v[uSeq2]); const char *pName1 = s1.GetName(); const char *pName2 = s2.GetName(); Log("Common count %s(%d) - %s(%d) =%u\n", pName1, uSeq1, pName2, uSeq2, uSum); } #endif uCommonTupleCount[uSeq1][uSeq2] = uSum; uCommonTupleCount[uSeq2][uSeq1] = uSum; } } ProgressStepsDone(); uCount = 0; SetProgressDesc("K-mer dist pass 2"); for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { Seq &s1 = *(v[uSeq1]); const char *pName1 = s1.GetName(); double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1]; if (0 == dCommonTupleCount11) dCommonTupleCount11 = 1; DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { if (0 == uCount%500) Progress(uCount, uPairCount); ++uCount; double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2]; if (0 == dCommonTupleCount22) dCommonTupleCount22 = 1; const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2]) /dCommonTupleCount11; const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2]) /dCommonTupleCount22; // dMinDist is the value used for tree-building in MAFFT const double dMinDist = MIN(dDist1, dDist2); DF.SetDist(uSeq1, uSeq2, (float) dMinDist); //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist); 
//g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId); // **** TODO **** why does this make score slightly worse?? //const double dKimuraDist = KimuraDist(dEstimatedPctId); //DF.SetDist(uSeq1, uSeq2, dKimuraDist); } } ProgressStepsDone(); for (unsigned n = 0; n < uSeqCount; ++n) delete[] uCommonTupleCount[n]; delete[] uCommonTupleCount; delete[] Letters; } double PctIdToMAFFTDist(double dPctId) { if (dPctId < 0.05) dPctId = 0.05; double dDist = -log(dPctId); return dDist; } double PctIdToHeightMAFFT(double dPctId) { return PctIdToMAFFTDist(dPctId); } muscle-3.8.31.orig/blosum62.cpp0000644000175000017500000000377711352261672015573 0ustar kratzcharles#include "muscle.h" int BLOSUM62[20][20] = { // A C D E F G H I K L M N P Q R S T V W Y { 4, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, -2}, // A { 0, 9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2}, // C {-2, -3, 6, 2, -3, -1, -1, -3, -1, -4, -3, 1, -1, 0, -2, 0, -1, -3, -4, -3}, // D {-1, -4, 2, 5, -3, -2, 0, -3, 1, -3, -2, 0, -1, 2, 0, 0, -1, -2, -3, -2}, // E {-2, -2, -3, -3, 6, -3, -1, 0, -3, 0, 0, -3, -4, -3, -3, -2, -2, -1, 1, 3}, // F { 0, -3, -1, -2, -3, 6, -2, -4, -2, -4, -3, 0, -2, -2, -2, 0, -2, -3, -2, -3}, // G {-2, -3, -1, 0, -1, -2, 8, -3, -1, -3, -2, 1, -2, 0, 0, -1, -2, -3, -2, 2}, // H {-1, -1, -3, -3, 0, -4, -3, 4, -3, 2, 1, -3, -3, -3, -3, -2, -1, 3, -3, -1}, // I {-1, -3, -1, 1, -3, -2, -1, -3, 5, -2, -1, 0, -1, 1, 2, 0, -1, -2, -3, -2}, // K {-1, -1, -4, -3, 0, -4, -3, 2, -2, 4, 2, -3, -3, -2, -2, -2, -1, 1, -2, -1}, // L {-1, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5, -2, -2, 0, -1, -1, -1, 1, -1, -1}, // M {-2, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -2}, // N {-1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7, -1, -2, -1, -1, -2, -4, -3}, // P {-1, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5, 1, 0, -1, -2, -2, -1}, // Q {-1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5, -1, -1, -3, -3, -2}, // R { 1, -1, 0, 0, -2, 0, -1, -2, 0, 
-2, -1, 1, -1, 0, -1, 4, 1, -2, -3, -2}, // S { 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5, 0, -2, -2}, // T { 0, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4, -3, -1}, // V {-3, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, 2}, // W {-2, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, 7}, // Y }; double BLOSUM62_Expected = -0.5209; muscle-3.8.31.orig/distfunc.cpp0000644000175000017500000000457011352261600015720 0ustar kratzcharles#include "muscle.h" #include "distfunc.h" #include DistFunc::DistFunc() { m_Dists = 0; m_uCount = 0; m_uCacheCount = 0; m_Names = 0; m_Ids = 0; } DistFunc::~DistFunc() { if (0 != m_Names) { for (unsigned i = 0; i < m_uCount; ++i) free(m_Names[i]); } delete[] m_Dists; delete[] m_Names; delete[] m_Ids; } float DistFunc::GetDist(unsigned uIndex1, unsigned uIndex2) const { return m_Dists[VectorIndex(uIndex1, uIndex2)]; } unsigned DistFunc::GetCount() const { return m_uCount; } void DistFunc::SetCount(unsigned uCount) { m_uCount = uCount; if (uCount <= m_uCacheCount) return; delete[] m_Dists; m_Dists = new float[VectorLength()]; m_Names = new char *[m_uCount]; m_Ids = new unsigned[m_uCount]; m_uCacheCount = uCount; memset(m_Names, 0, m_uCount*sizeof(char *)); memset(m_Ids, 0xff, m_uCount*sizeof(unsigned)); memset(m_Dists, 0, VectorLength()*sizeof(float)); } void DistFunc::SetDist(unsigned uIndex1, unsigned uIndex2, float dDist) { m_Dists[VectorIndex(uIndex1, uIndex2)] = dDist; m_Dists[VectorIndex(uIndex2, uIndex1)] = dDist; } unsigned DistFunc::VectorIndex(unsigned uIndex1, unsigned uIndex2) const { assert(uIndex1 < m_uCount && uIndex2 < m_uCount); return uIndex1*m_uCount + uIndex2; } unsigned DistFunc::VectorLength() const { return m_uCount*m_uCount; } void DistFunc::SetName(unsigned uIndex, const char szName[]) { assert(uIndex < m_uCount); m_Names[uIndex] = strsave(szName); } void DistFunc::SetId(unsigned uIndex, unsigned uId) { assert(uIndex < m_uCount); 
m_Ids[uIndex] = uId; } const char *DistFunc::GetName(unsigned uIndex) const { assert(uIndex < m_uCount); return m_Names[uIndex]; } unsigned DistFunc::GetId(unsigned uIndex) const { assert(uIndex < m_uCount); return m_Ids[uIndex]; } void DistFunc::LogMe() const { Log("DistFunc::LogMe count=%u\n", m_uCount); Log(" "); for (unsigned i = 0; i < m_uCount; ++i) Log(" %7u", i); Log("\n"); Log(" "); for (unsigned i = 0; i < m_uCount; ++i) Log(" %7.7s", m_Names[i] ? m_Names[i] : ""); Log("\n"); for (unsigned i = 0; i < m_uCount; ++i) { Log("%4u %10.10s : ", i, m_Names[i] ? m_Names[i] : ""); for (unsigned j = 0; j <= i; ++j) Log(" %7.4g", GetDist(i, j)); Log("\n"); } } muscle-3.8.31.orig/objscore.cpp0000644000175000017500000000452711352261667015726 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "objscore.h" #include "profile.h" #include "timing.h" #if TIMING TICKS g_ticksObjScore = 0; #endif SCORE ObjScore(const MSA &msa, const unsigned SeqIndexes1[], unsigned uSeqCount1, const unsigned SeqIndexes2[], unsigned uSeqCount2) { #if TIMING TICKS t1 = GetClockTicks(); #endif const unsigned uSeqCount = msa.GetSeqCount(); OBJSCORE OS = g_ObjScore; if (g_ObjScore == OBJSCORE_SPM) { if (uSeqCount <= 100) OS = OBJSCORE_XP; else OS = OBJSCORE_SPF; } MSA msa1; MSA msa2; switch (OS) { case OBJSCORE_DP: case OBJSCORE_XP: MSAFromSeqSubset(msa, SeqIndexes1, uSeqCount1, msa1); MSAFromSeqSubset(msa, SeqIndexes2, uSeqCount2, msa2); SetMSAWeightsMuscle(msa1); SetMSAWeightsMuscle(msa2); break; case OBJSCORE_SP: case OBJSCORE_SPF: case OBJSCORE_PS: // Yuck -- casting away const (design flaw) SetMSAWeightsMuscle((MSA &) msa); break; } SCORE Score = 0; switch (OS) { case OBJSCORE_SP: Score = ObjScoreSP(msa); break; case OBJSCORE_DP: Score = ObjScoreDP(msa1, msa2); break; case OBJSCORE_XP: Score = ObjScoreXP(msa1, msa2); break; case OBJSCORE_PS: Score = ObjScorePS(msa); break; case OBJSCORE_SPF: Score = ObjScoreSPDimer(msa); break; default: Quit("Invalid g_ObjScore=%d", g_ObjScore); 
} #if TIMING TICKS t2 = GetClockTicks(); g_ticksObjScore += (t2 - t1); #endif return Score; } SCORE ObjScoreIds(const MSA &msa, const unsigned Ids1[], unsigned uCount1, const unsigned Ids2[], unsigned uCount2) { #if TIMING TICKS t1 = GetClockTicks(); #endif unsigned *SeqIndexes1 = new unsigned[uCount1]; unsigned *SeqIndexes2 = new unsigned[uCount2]; for (unsigned n = 0; n < uCount1; ++n) SeqIndexes1[n] = msa.GetSeqIndex(Ids1[n]); for (unsigned n = 0; n < uCount2; ++n) SeqIndexes2[n] = msa.GetSeqIndex(Ids2[n]); #if DOUBLE_AFFINE extern SCORE ObjScoreDA(const MSA &msa, SCORE *ptrLetters, SCORE *ptrGaps); SCORE Letters, Gaps; SCORE dObjScore = ObjScoreDA(msa, &Letters, &Gaps); delete[] SeqIndexes1; delete[] SeqIndexes2; #else SCORE dObjScore = ObjScore(msa, SeqIndexes1, uCount1, SeqIndexes2, uCount2); #endif #if TIMING TICKS t2 = GetClockTicks(); g_ticksObjScore += (t2 - t1); #endif return dObjScore; } muscle-3.8.31.orig/mk0000755000175000017500000000720511367133036013737 0ustar kratzcharles#!/bin/bash CPPNames='aligngivenpath aligngivenpathsw aligntwomsas aligntwoprofs aln alpha anchors bittraceback blosum62 blosumla clust cluster clwwt color cons diaglist diffobjscore diffpaths difftrees difftreese distcalc distfunc distpwkimura domuscle dosp dpreglist drawtree edgelist enumopts enumtostr estring fasta fasta2 fastclust fastdist fastdistjones fastdistkbit fastdistkmer fastdistmafft fastdistnuc fastscorepath2 finddiags finddiagsn glbalign glbalign352 glbaligndiag glbalignle glbalignsimple glbalignsp glbalignspn glbalignss glbalndimer globals globalslinux globalsosx globalsother globalswin32 gonnet henikoffweight henikoffweightpb html hydro intmath local main makerootmsa makerootmsab maketree mhack mpam200 msa msa2 msadistkimura msf muscle muscleout nucmx nwdasimple nwdasimple2 nwdasmall nwrec nwsmall objscore objscore2 objscoreda onexception options outweights pam200mafft params phy phy2 phy3 phy4 phyfromclust phyfromfile physeq phytofile posgap ppscore profdb profile 
profilefrommsa progalign progress progressivealign pwpath readmx realigndiffs realigndiffse refine refinehoriz refinesubfams refinetree refinetreee refinevert refinew savebest scoredist scoregaps scorehistory scorepp seq seqvect setblosumweights setgscweights setnewhandler spfast sptest stabilize subfam subfams sw termgaps textfile threewaywt tomhydro traceback tracebackopt tracebacksw treefrommsa typetostr upgma2 usage validateids vtml2 writescorefile' ObjNames='aligngivenpath.o aligngivenpathsw.o aligntwomsas.o aligntwoprofs.o aln.o alpha.o anchors.o bittraceback.o blosum62.o blosumla.o clust.o cluster.o clwwt.o color.o cons.o diaglist.o diffobjscore.o diffpaths.o difftrees.o difftreese.o distcalc.o distfunc.o distpwkimura.o domuscle.o dosp.o dpreglist.o drawtree.o edgelist.o enumopts.o enumtostr.o estring.o fasta.o fasta2.o fastclust.o fastdist.o fastdistjones.o fastdistkbit.o fastdistkmer.o fastdistmafft.o fastdistnuc.o fastscorepath2.o finddiags.o finddiagsn.o glbalign.o glbalign352.o glbaligndiag.o glbalignle.o glbalignsimple.o glbalignsp.o glbalignspn.o glbalignss.o glbalndimer.o globals.o globalslinux.o globalsosx.o globalsother.o globalswin32.o gonnet.o henikoffweight.o henikoffweightpb.o html.o hydro.o intmath.o local.o main.o makerootmsa.o makerootmsab.o maketree.o mhack.o mpam200.o msa.o msa2.o msadistkimura.o msf.o muscle.o muscleout.o nucmx.o nwdasimple.o nwdasimple2.o nwdasmall.o nwrec.o nwsmall.o objscore.o objscore2.o objscoreda.o onexception.o options.o outweights.o pam200mafft.o params.o phy.o phy2.o phy3.o phy4.o phyfromclust.o phyfromfile.o physeq.o phytofile.o posgap.o ppscore.o profdb.o profile.o profilefrommsa.o progalign.o progress.o progressivealign.o pwpath.o readmx.o realigndiffs.o realigndiffse.o refine.o refinehoriz.o refinesubfams.o refinetree.o refinetreee.o refinevert.o refinew.o savebest.o scoredist.o scoregaps.o scorehistory.o scorepp.o seq.o seqvect.o setblosumweights.o setgscweights.o setnewhandler.o spfast.o sptest.o 
stabilize.o subfam.o subfams.o sw.o termgaps.o textfile.o threewaywt.o tomhydro.o traceback.o tracebackopt.o tracebacksw.o treefrommsa.o typetostr.o upgma2.o usage.o validateids.o vtml2.o writescorefile.o' rm -f *.o muscle.make.stdout.txt muscle.make.stderr.txt for CPPName in $CPPNames do echo $CPPName >> /dev/tty g++ $ENV_GCC_OPTS -c -O3 -msse2 -mfpmath=sse -D_FILE_OFFSET_BITS=64 -DNDEBUG=1 $CPPName.cpp -o $CPPName.o >> muscle.make.stdout.txt 2>> muscle.make.stderr.txt done LINK_OPTS= if [ `uname -s` == Linux ] ; then LINK_OPTS=-static fi g++ $LINK_OPTS $ENV_LINK_OPTS -g -o muscle $ObjNames >> muscle.make.stdout.txt 2>> muscle.make.stderr.txt tail muscle.make.stderr.txt strip muscle ls -lh muscle sum muscle muscle-3.8.31.orig/make.err0000644000175000017500000000000011352261611015006 0ustar kratzcharlesmuscle-3.8.31.orig/finddiags.cpp0000644000175000017500000000711311352261636016036 0ustar kratzcharles#include "muscle.h" #include "profile.h" #include "diaglist.h" #define TRACE 0 const unsigned KTUP = 5; const unsigned KTUPS = 6*6*6*6*6; static unsigned TuplePos[KTUPS]; static char *TupleToStr(int t) { static char s[7]; int t1, t2, t3, t4, t5; t1 = t%6; t2 = (t/6)%6; t3 = (t/(6*6))%6; t4 = (t/(6*6*6))%6; t5 = (t/(6*6*6*6))%6; s[4] = '0' + t1; s[3] = '0' + t2; s[2] = '0' + t3; s[1] = '0' + t4; s[0] = '0' + t5; return s; } static unsigned GetTuple(const ProfPos *PP, unsigned uPos) { const unsigned t0 = PP[uPos].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == t0) return EMPTY; const unsigned t1 = PP[uPos+1].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == t1) return EMPTY; const unsigned t2 = PP[uPos+2].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == t2) return EMPTY; const unsigned t3 = PP[uPos+3].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == t3) return EMPTY; const unsigned t4 = PP[uPos+4].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == t4) return EMPTY; return t0 + t1*6 + t2*6*6 + t3*6*6*6 + t4*6*6*6*6; } void FindDiags(const ProfPos *PX, unsigned uLengthX, const 
ProfPos *PY, unsigned uLengthY, DiagList &DL) { if (ALPHA_Amino != g_Alpha) Quit("FindDiags: requires amino acid alphabet"); DL.Clear(); if (uLengthX < 12 || uLengthY < 12) return; // Set A to shorter profile, B to longer const ProfPos *PA; const ProfPos *PB; unsigned uLengthA; unsigned uLengthB; bool bSwap; if (uLengthX < uLengthY) { bSwap = false; PA = PX; PB = PY; uLengthA = uLengthX; uLengthB = uLengthY; } else { bSwap = true; PA = PY; PB = PX; uLengthA = uLengthY; uLengthB = uLengthX; } // Build tuple map for the longer profile, B if (uLengthB < KTUP) Quit("FindDiags: profile too short"); memset(TuplePos, EMPTY, sizeof(TuplePos)); for (unsigned uPos = 0; uPos < uLengthB - KTUP; ++uPos) { const unsigned uTuple = GetTuple(PB, uPos); if (EMPTY == uTuple) continue; TuplePos[uTuple] = uPos; } // Find matches for (unsigned uPosA = 0; uPosA < uLengthA - KTUP; ++uPosA) { const unsigned uTuple = GetTuple(PA, uPosA); if (EMPTY == uTuple) continue; const unsigned uPosB = TuplePos[uTuple]; if (EMPTY == uPosB) continue; // This tuple is found in both profiles unsigned uStartPosA = uPosA; unsigned uStartPosB = uPosB; // Try to extend the match forwards unsigned uEndPosA = uPosA + KTUP - 1; unsigned uEndPosB = uPosB + KTUP - 1; for (;;) { if (uLengthA - 1 == uEndPosA || uLengthB - 1 == uEndPosB) break; const unsigned uAAGroupA = PA[uEndPosA+1].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == uAAGroupA) break; const unsigned uAAGroupB = PB[uEndPosB+1].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == uAAGroupB) break; if (uAAGroupA != uAAGroupB) break; ++uEndPosA; ++uEndPosB; } uPosA = uEndPosA; #if TRACE { Log("Match: A %4u-%4u ", uStartPosA, uEndPosA); for (unsigned n = uStartPosA; n <= uEndPosA; ++n) Log("%c", 'A' + PA[n].m_uResidueGroup); Log("\n"); Log(" B %4u-%4u ", uStartPosB, uEndPosB); for (unsigned n = uStartPosB; n <= uEndPosB; ++n) Log("%c", 'A' + PB[n].m_uResidueGroup); Log("\n"); } #endif const unsigned uLength = uEndPosA - uStartPosA + 1; assert(uEndPosB - 
uStartPosB + 1 == uLength); if (uLength >= g_uMinDiagLength) { if (bSwap) DL.Add(uStartPosB, uStartPosA, uLength); else DL.Add(uStartPosA, uStartPosB, uLength); } } } muscle-3.8.31.orig/estring.h0000644000175000017500000000101711352261620015214 0ustar kratzcharles#ifndef pathsum_h #define pathsum_h void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB); void EstringsToPath(const short esA[], const short esB[], PWPath &Path); void MulEstrings(const short es1[], const short es2[], short esp[]); void EstringOp(const short es[], const Seq &sIn, Seq &sOut); unsigned EstringOp(const short es[], const Seq &sIn, MSA &a); void LogEstring(const short es[]); unsigned LengthEstring(const short es[]); short *EstringNewCopy(const short es[]); #endif // pathsum_h muscle-3.8.31.orig/termgaps.cpp0000644000175000017500000000143211352261666015731 0ustar kratzcharles#include "muscle.h" #include "profile.h" void SetTermGaps(const ProfPos *Prof, unsigned uLength) { if (0 == uLength) return; ProfPos *First = (ProfPos *) Prof; ProfPos *Last = (ProfPos *) (Prof + uLength - 1); switch (g_TermGaps) { case TERMGAPS_Full: break; case TERMGAPS_Half: // -infinity check for lock left/right if (First->m_scoreGapOpen != MINUS_INFINITY) First->m_scoreGapOpen = 0; if (uLength > 1 && Last->m_scoreGapClose != MINUS_INFINITY) Last->m_scoreGapClose = 0; case TERMGAPS_Ext: if (First->m_scoreGapOpen != MINUS_INFINITY) First->m_scoreGapOpen *= -1; if (uLength > 1 && Last->m_scoreGapClose != MINUS_INFINITY) Last->m_scoreGapClose *= -1; break; default: Quit("Invalid g_TermGaps"); } } muscle-3.8.31.orig/svnmods.h0000644000175000017500000000001111367133036015230 0ustar kratzcharles"export" muscle-3.8.31.orig/fastdistjones.cpp0000644000175000017500000001357311352261636016775 0ustar kratzcharles#include "muscle.h" #include "distfunc.h" #include "seqvect.h" #include const unsigned TRIPLE_COUNT = 20*20*20; struct TripleCount { unsigned m_uSeqCount; // How many sequences have this triple? 
unsigned short *m_Counts; // m_Counts[s] = nr of times triple found in seq s }; static TripleCount *TripleCounts; // WARNING: Sequences MUST be stripped of gaps and upper case! void DistKmer20_3(const SeqVect &v, DistFunc &DF) { const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); if (0 == uSeqCount) return; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) DF.SetDist(uSeq1, uSeq2, 0); } const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount); TripleCounts = (TripleCount *) malloc(uTripleArrayBytes); if (0 == TripleCounts) Quit("Not enough memory (TripleCounts)"); memset(TripleCounts, 0, uTripleArrayBytes); for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) { TripleCount &tc = *(TripleCounts + uWord); const unsigned uBytes = uSeqCount*sizeof(short); tc.m_Counts = (unsigned short *) malloc(uBytes); memset(tc.m_Counts, 0, uBytes); } for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos) { const unsigned uLetter1 = CharToLetterEx(s[uPos]); if (uLetter1 >= 20) continue; const unsigned uLetter2 = CharToLetterEx(s[uPos+1]); if (uLetter2 >= 20) continue; const unsigned uLetter3 = CharToLetterEx(s[uPos+2]); if (uLetter3 >= 20) continue; const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20; assert(uWord < TRIPLE_COUNT); TripleCount &tc = *(TripleCounts + uWord); const unsigned uOldCount = tc.m_Counts[uSeqIndex]; if (0 == uOldCount) ++(tc.m_uSeqCount); ++(tc.m_Counts[uSeqIndex]); } } #if TRACE { Log("TripleCounts\n"); unsigned uGrandTotal = 0; for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) { const TripleCount &tc = *(TripleCounts + uWord); if (0 == tc.m_uSeqCount) continue; const unsigned uLetter3 = uWord/(20*20); const unsigned uLetter2 = (uWord - uLetter3*20*20)/20; const unsigned uLetter1 = uWord%20; Log("Word %6u 
%c%c%c %6u", uWord, LetterToCharAmino(uLetter1), LetterToCharAmino(uLetter2), LetterToCharAmino(uLetter3), tc.m_uSeqCount); unsigned uSeqCountWithThisWord = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const unsigned uCount = tc.m_Counts[uSeqIndex]; if (uCount > 0) { ++uSeqCountWithThisWord; Log(" %u=%u", uSeqIndex, uCount); uGrandTotal += uCount; } } if (uSeqCountWithThisWord != tc.m_uSeqCount) Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord); Log("\n"); } unsigned uTotalBySeqLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); uTotalBySeqLength += uSeqLength - 2; } if (uGrandTotal != uTotalBySeqLength) Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength); } #endif const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned); unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes); for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) { const TripleCount &tc = *(TripleCounts + uWord); if (0 == tc.m_uSeqCount) continue; unsigned uSeqCountFound = 0; memset(SeqList, 0, uSeqListBytes); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { if (tc.m_Counts[uSeqIndex] > 0) { SeqList[uSeqCountFound] = uSeqIndex; ++uSeqCountFound; if (uSeqCountFound == tc.m_uSeqCount) break; } } assert(uSeqCountFound == tc.m_uSeqCount); for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1) { const unsigned uSeqIndex1 = SeqList[uSeq1]; const unsigned uCount1 = tc.m_Counts[uSeqIndex1]; for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { const unsigned uSeqIndex2 = SeqList[uSeq2]; const unsigned uCount2 = tc.m_Counts[uSeqIndex2]; const unsigned uMinCount = uCount1 < uCount2 ? 
uCount1 : uCount2; const double d = DF.GetDist(uSeqIndex1, uSeqIndex2); DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount)); } } } delete[] SeqList; free(TripleCounts); unsigned uDone = 0; const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0.0); const Seq &s1 = *(v[uSeq1]); const unsigned uLength1 = s1.Length(); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { const Seq &s2 = *(v[uSeq2]); const unsigned uLength2 = s2.Length(); unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2; if (uMinLength < 3) { DF.SetDist(uSeq1, uSeq2, 1.0); continue; } const double dTripleCount = DF.GetDist(uSeq1, uSeq2); if (dTripleCount == 0) { DF.SetDist(uSeq1, uSeq2, 1.0); continue; } double dNormalizedTripletScore = dTripleCount/(uMinLength - 2); //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore)); //if (dEstimatedPairwiseIdentity > 1) // dEstimatedPairwiseIdentity = 1; // DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity)); DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore); #if TRACE { Log("%s - %s Triplet count = %g Lengths %u, %u Estimated pwid = %g\n", s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2, dEstimatedPairwiseIdentity); } #endif if (uDone%1000 == 0) Progress(uDone, uTotal); } } ProgressStepsDone(); } muscle-3.8.31.orig/glbaligndiag.cpp0000644000175000017500000001032311352261667016513 0ustar kratzcharles#include "muscle.h" #include "dpreglist.h" #include "diaglist.h" #include "pwpath.h" #include "profile.h" #include "timing.h" #define TRACE 0 #define TRACE_PATH 0 #define LIST_DIAGS 0 static double g_dDPAreaWithoutDiags = 0.0; static double g_dDPAreaWithDiags = 0.0; static void OffsetPath(PWPath &Path, unsigned uOffsetA, unsigned uOffsetB) { const unsigned uEdgeCount = Path.GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); // 
Nasty hack -- poke new values back into path, circumventing class PWEdge &NonConstEdge = (PWEdge &) Edge; NonConstEdge.uPrefixLengthA += uOffsetA; NonConstEdge.uPrefixLengthB += uOffsetB; } } static void DiagToPath(const Diag &d, PWPath &Path) { Path.Clear(); const unsigned uLength = d.m_uLength; for (unsigned i = 0; i < uLength; ++i) { PWEdge Edge; Edge.cType = 'M'; Edge.uPrefixLengthA = d.m_uStartPosA + i + 1; Edge.uPrefixLengthB = d.m_uStartPosB + i + 1; Path.AppendEdge(Edge); } } static void AppendRegPath(PWPath &Path, const PWPath &RegPath) { const unsigned uRegEdgeCount = RegPath.GetEdgeCount(); for (unsigned uRegEdgeIndex = 0; uRegEdgeIndex < uRegEdgeCount; ++uRegEdgeIndex) { const PWEdge &RegEdge = RegPath.GetEdge(uRegEdgeIndex); Path.AppendEdge(RegEdge); } } SCORE GlobalAlignDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { #if LIST_DIAGS TICKS t1 = GetClockTicks(); #endif DiagList DL; if (ALPHA_Amino == g_Alpha) FindDiags(PA, uLengthA, PB, uLengthB, DL); else if (ALPHA_DNA == g_Alpha || ALPHA_RNA == g_Alpha) FindDiagsNuc(PA, uLengthA, PB, uLengthB, DL); else Quit("GlobalAlignDiags: bad alpha"); #if TRACE Log("GlobalAlignDiags, diag list:\n"); DL.LogMe(); #endif DL.Sort(); DL.DeleteIncompatible(); #if TRACE Log("After DeleteIncompatible:\n"); DL.LogMe(); #endif MergeDiags(DL); #if TRACE Log("After MergeDiags:\n"); DL.LogMe(); #endif DPRegionList RL; DiagListToDPRegionList(DL, RL, uLengthA, uLengthB); #if TRACE Log("RegionList:\n"); RL.LogMe(); #endif #if LIST_DIAGS { TICKS t2 = GetClockTicks(); unsigned uArea = RL.GetDPArea(); Log("ticks=%ld\n", (long) (t2 - t1)); Log("area=%u\n", uArea); } #endif g_dDPAreaWithoutDiags += uLengthA*uLengthB; double dDPAreaWithDiags = 0.0; const unsigned uRegionCount = RL.GetCount(); for (unsigned uRegionIndex = 0; uRegionIndex < uRegionCount; ++uRegionIndex) { const DPRegion &r = RL.Get(uRegionIndex); PWPath RegPath; if (DPREGIONTYPE_Diag == r.m_Type) { DiagToPath(r.m_Diag, 
RegPath); #if TRACE_PATH Log("DiagToPath, path=\n"); RegPath.LogMe(); #endif } else if (DPREGIONTYPE_Rect == r.m_Type) { const unsigned uRegStartPosA = r.m_Rect.m_uStartPosA; const unsigned uRegStartPosB = r.m_Rect.m_uStartPosB; const unsigned uRegLengthA = r.m_Rect.m_uLengthA; const unsigned uRegLengthB = r.m_Rect.m_uLengthB; const ProfPos *RegPA = PA + uRegStartPosA; const ProfPos *RegPB = PB + uRegStartPosB; dDPAreaWithDiags += uRegLengthA*uRegLengthB; GlobalAlignNoDiags(RegPA, uRegLengthA, RegPB, uRegLengthB, RegPath); #if TRACE_PATH Log("GlobalAlignNoDiags RegPath=\n"); RegPath.LogMe(); #endif OffsetPath(RegPath, uRegStartPosA, uRegStartPosB); #if TRACE_PATH Log("After offset path, RegPath=\n"); RegPath.LogMe(); #endif } else Quit("GlobalAlignDiags, Invalid region type %u", r.m_Type); AppendRegPath(Path, RegPath); #if TRACE_PATH Log("After AppendPath, path="); Path.LogMe(); #endif } #if TRACE { double dDPAreaWithoutDiags = uLengthA*uLengthB; Log("DP area with diags %.3g without %.3g pct saved %.3g %%\n", dDPAreaWithDiags, dDPAreaWithoutDiags, (1.0 - dDPAreaWithDiags/dDPAreaWithoutDiags)*100.0); } #endif g_dDPAreaWithDiags += dDPAreaWithDiags; return 0; } void ListDiagSavings() { if (!g_bVerbose || !g_bDiags) return; double dAreaSaved = g_dDPAreaWithoutDiags - g_dDPAreaWithDiags; double dPct = dAreaSaved*100.0/g_dDPAreaWithoutDiags; Log("DP area saved by diagonals %-4.1f%%\n", dPct); } muscle-3.8.31.orig/diffobjscore.cpp0000644000175000017500000001055511352261667016555 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "objscore.h" #include "profile.h" #define TRACE 0 #define COMPARE_3_52 0 #define BRUTE_LETTERS 0 static SCORE ScoreColLetters(const MSA &msa, unsigned uColIndex) { SCOREMATRIX &Mx = *g_ptrScoreMatrix; const unsigned uSeqCount = msa.GetSeqCount(); #if BRUTE_LETTERS SCORE BruteScore = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { unsigned uLetter1 = msa.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter1 >= 
g_AlphaSize) continue; WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { unsigned uLetter2 = msa.GetLetterEx(uSeqIndex2, uColIndex); if (uLetter2 >= g_AlphaSize) continue; WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); BruteScore += w1*w2*Mx[uLetter1][uLetter2]; } } #endif double N = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { WEIGHT w = msa.GetSeqWeight(uSeqIndex1); N += w; } if (N <= 0) return 0; FCOUNT Freqs[20]; memset(Freqs, 0, sizeof(Freqs)); SCORE Score = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { unsigned uLetter = msa.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter >= g_AlphaSize) continue; WEIGHT w = msa.GetSeqWeight(uSeqIndex1); Freqs[uLetter] += w; Score -= w*w*Mx[uLetter][uLetter]; } for (unsigned uLetter1 = 0; uLetter1 < g_AlphaSize; ++uLetter1) { const FCOUNT f1 = Freqs[uLetter1]; Score += f1*f1*Mx[uLetter1][uLetter1]; for (unsigned uLetter2 = uLetter1 + 1; uLetter2 < g_AlphaSize; ++uLetter2) { const FCOUNT f2 = Freqs[uLetter2]; Score += 2*f1*f2*Mx[uLetter1][uLetter2]; } } Score /= 2; #if BRUTE_LETTERS assert(BTEq(BruteScore, Score)); #endif return Score; } static SCORE ScoreLetters(const MSA &msa, const unsigned Edges[], unsigned uEdgeCount) { const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); // Letters SCORE Score = 0; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const unsigned uColIndex = Edges[uEdgeIndex]; assert(uColIndex < uColCount); Score += ScoreColLetters(msa, uColIndex); } return Score; } void GetLetterScores(const MSA &msa, SCORE Scores[]) { const unsigned uColCount = msa.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) Scores[uColIndex] = ScoreColLetters(msa, uColIndex); } SCORE DiffObjScore( const MSA &msa1, const PWPath &Path1, const unsigned Edges1[], unsigned uEdgeCount1, const MSA &msa2, const PWPath &Path2, 
const unsigned Edges2[], unsigned uEdgeCount2) { #if TRACE { Log("============DiffObjScore===========\n"); Log("msa1:\n"); msa1.LogMe(); Log("\n"); Log("Cols1: "); for (unsigned i = 0; i < uEdgeCount1; ++i) Log(" %u", Edges1[i]); Log("\n\n"); Log("msa2:\n"); msa2.LogMe(); Log("Cols2: "); for (unsigned i = 0; i < uEdgeCount2; ++i) Log(" %u", Edges2[i]); Log("\n\n"); } #endif #if COMPARE_3_52 extern SCORE g_SPScoreLetters; extern SCORE g_SPScoreGaps; SCORE SP1 = ObjScoreSP(msa1); SCORE SPLetters1 = g_SPScoreLetters; SCORE SPGaps1 = g_SPScoreGaps; SCORE SP2 = ObjScoreSP(msa2); SCORE SPLetters2 = g_SPScoreLetters; SCORE SPGaps2 = g_SPScoreGaps; SCORE SPDiffLetters = SPLetters2 - SPLetters1; SCORE SPDiffGaps = SPGaps2 - SPGaps1; SCORE SPDiff = SPDiffLetters + SPDiffGaps; #endif SCORE Letters1 = ScoreLetters(msa1, Edges1, uEdgeCount1); SCORE Letters2 = ScoreLetters(msa2, Edges2, uEdgeCount2); SCORE Gaps1 = ScoreGaps(msa1, Edges1, uEdgeCount1); SCORE Gaps2 = ScoreGaps(msa2, Edges2, uEdgeCount2); SCORE DiffLetters = Letters2 - Letters1; SCORE DiffGaps = Gaps2 - Gaps1; SCORE Diff = DiffLetters + DiffGaps; #if COMPARE_3_52 Log("ObjScoreSP Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n", SPLetters1, SPLetters2, SPDiffLetters); Log("DiffObjScore Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n", Letters1, Letters2, DiffLetters); Log("ObjScoreSP Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n", SPGaps1, SPGaps2, SPDiffGaps); Log("DiffObjScore Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n", Gaps1, Gaps2, DiffGaps); Log("SP diff=%.4g DiffObjScore Diff=%.4g\n", SPDiff, Diff); #endif return Diff; } muscle-3.8.31.orig/aligngivenpath.cpp0000644000175000017500000005254511352261667017123 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "pwpath.h" #include "profile.h" #define TRACE 0 static void LogPP(const ProfPos &PP) { Log("ResidueGroup %u\n", PP.m_uResidueGroup); Log("AllGaps %d\n", PP.m_bAllGaps); Log("Occ %.3g\n", PP.m_fOcc); Log("LL=%.3g LG=%.3g GL=%.3g GG=%.3g\n", PP.m_LL, PP.m_LG, 
PP.m_GL, PP.m_GG); Log("Freqs "); for (unsigned i = 0; i < 20; ++i) if (PP.m_fcCounts[i] > 0) Log("%c=%.3g ", LetterToChar(i), PP.m_fcCounts[i]); Log("\n"); } static void AssertProfPosEq(const ProfPos *PA, const ProfPos *PB, unsigned i) { const ProfPos &PPA = PA[i]; const ProfPos &PPB = PB[i]; #define eq(x) if (PPA.m_##x != PPB.m_##x) { LogPP(PPA); LogPP(PPB); Quit("AssertProfPosEq." #x); } #define be(x) if (!BTEq(PPA.m_##x, PPB.m_##x)) { LogPP(PPA); LogPP(PPB); Quit("AssertProfPosEq." #x); } eq(bAllGaps) eq(uResidueGroup) be(LL) be(LG) be(GL) be(GG) be(fOcc) be(scoreGapOpen) be(scoreGapClose) for (unsigned j = 0; j < 20; ++j) { #define eqj(x) if (PPA.m_##x != PPB.m_##x) Quit("AssertProfPosEq j=%u " #x, j); #define bej(x) if (!BTEq(PPA.m_##x, PPB.m_##x)) Quit("AssertProfPosEq j=%u " #x, j); bej(fcCounts[j]); // eqj(uSortOrder[j]) // may differ due to ties, don't check? bej(AAScores[j]) #undef eqj #undef bej } #undef eq #undef be } void AssertProfsEq(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB) { if (uLengthA != uLengthB) Quit("AssertProfsEq: lengths differ %u %u", uLengthA, uLengthB); for (unsigned i = 0; i < uLengthB; ++i) AssertProfPosEq(PA, PB, i); } #if DEBUG static void ValidateProf(const ProfPos *Prof, unsigned uLength) { for (unsigned i = 0; i < uLength; ++i) { const ProfPos &PP = Prof[i]; FCOUNT s1 = PP.m_LL + PP.m_LG + PP.m_GL + PP.m_GG; assert(BTEq(s1, 1.0)); if (i > 0) { const ProfPos &PPPrev = Prof[i-1]; FCOUNT s2 = PPPrev.m_LL + PPPrev.m_GL; FCOUNT s3 = PP.m_LL + PP.m_LG; assert(BTEq(s2, s3)); } if (i < uLength - 1) { const ProfPos &PPNext = Prof[i+1]; FCOUNT s4 = PP.m_LL + PP.m_GL; FCOUNT s5 = PPNext.m_LL + PPNext.m_LG; assert(BTEq(s4, s5)); } } } #else #define ValidateProf(Prof, Length) /* empty */ #endif static void ScoresFromFreqsPos(ProfPos *Prof, unsigned uLength, unsigned uPos) { ProfPos &PP = Prof[uPos]; SortCounts(PP.m_fcCounts, PP.m_uSortOrder); PP.m_uResidueGroup = ResidueGroupFromFCounts(PP.m_fcCounts); // 
"Occupancy" PP.m_fOcc = PP.m_LL + PP.m_GL; // Frequency of gap-opens in this position (i) // Gap open = letter in i-1 and gap in i // = iff LG in i FCOUNT fcOpen = PP.m_LG; // Frequency of gap-closes in this position // Gap close = gap in i and letter in i+1 // = iff GL in i+1 FCOUNT fcClose; if (uPos + 1 < uLength) fcClose = Prof[uPos + 1].m_GL; else fcClose = PP.m_GG + PP.m_LG; PP.m_scoreGapOpen = (SCORE) ((1.0 - fcOpen)*g_scoreGapOpen/2.0); PP.m_scoreGapClose = (SCORE) ((1.0 - fcClose)*g_scoreGapOpen/2.0); #if DOUBLE_AFFINE PP.m_scoreGapOpen2 = (SCORE) ((1.0 - fcOpen)*g_scoreGapOpen2/2.0); PP.m_scoreGapClose2 = (SCORE) ((1.0 - fcClose)*g_scoreGapOpen2/2.0); #endif for (unsigned i = 0; i < g_AlphaSize; ++i) { SCORE scoreSum = 0; for (unsigned j = 0; j < g_AlphaSize; ++j) scoreSum += PP.m_fcCounts[j]*(*g_ptrScoreMatrix)[i][j]; PP.m_AAScores[i] = scoreSum; } } void ProfScoresFromFreqs(ProfPos *Prof, unsigned uLength) { for (unsigned i = 0; i < uLength; ++i) ScoresFromFreqsPos(Prof, uLength, i); } static void AppendDelete(const MSA &msaA, unsigned &uColIndexA, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendDelete ColIxA=%u ColIxCmb=%u\n", uColIndexA, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA); msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, '-'); ++uColIndexCombined; ++uColIndexA; } static void AppendInsert(const MSA &msaB, unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendInsert ColIxB=%u ColIxCmb=%u\n", uColIndexB, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) msaCombined.SetChar(uSeqIndexA, uColIndexCombined, '-'); for (unsigned 
uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); } ++uColIndexCombined; ++uColIndexB; } static void AppendTplInserts(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA, const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendTplInserts ColIxA=%u ColIxB=%u ColIxCmb=%u\n", uColIndexA, uColIndexB, uColIndexCombined); #endif const unsigned uLengthA = msaA.GetColCount(); const unsigned uLengthB = msaB.GetColCount(); unsigned uNewColCount = uColCountA; if (uColCountB > uNewColCount) uNewColCount = uColCountB; for (unsigned n = 0; n < uColCountA; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA + n); c = UnalignChar(c); msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c); } } for (unsigned n = uColCountA; n < uNewColCount; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.'); } for (unsigned n = 0; n < uColCountB; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB + n); c = UnalignChar(c); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c); } } for (unsigned n = uColCountB; n < uNewColCount; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.'); } uColIndexCombined += uNewColCount; uColIndexA += uColCountA; uColIndexB += uColCountB; } static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB, unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n", 
uColIndexA, uColIndexB, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA); msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); } ++uColIndexA; ++uColIndexB; ++uColIndexCombined; } void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB, MSA &msaCombined) { msaCombined.Clear(); #if TRACE Log("FastAlignProfiles\n"); Log("Template A:\n"); msaA.LogMe(); Log("Template B:\n"); msaB.LogMe(); #endif const unsigned uColCountA = msaA.GetColCount(); const unsigned uColCountB = msaB.GetColCount(); const unsigned uSeqCountA = msaA.GetSeqCount(); const unsigned uSeqCountB = msaB.GetSeqCount(); msaCombined.SetSeqCount(uSeqCountA + uSeqCountB); // Copy sequence names into combined MSA for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA)); msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA)); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB)); msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB)); } unsigned uColIndexA = 0; unsigned uColIndexB = 0; unsigned uColIndexCombined = 0; const unsigned uEdgeCount = Path.GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); #if TRACE Log("\nEdge %u %c%u.%u\n", uEdgeIndex, Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); #endif const char cType = Edge.cType; const unsigned uPrefixLengthA = Edge.uPrefixLengthA; unsigned uColCountA = 0; if (uPrefixLengthA > 0) { const unsigned uNodeIndexA = uPrefixLengthA - 1; const unsigned uTplColIndexA = uNodeIndexA; 
if (uTplColIndexA > uColIndexA) uColCountA = uTplColIndexA - uColIndexA; } const unsigned uPrefixLengthB = Edge.uPrefixLengthB; unsigned uColCountB = 0; if (uPrefixLengthB > 0) { const unsigned uNodeIndexB = uPrefixLengthB - 1; const unsigned uTplColIndexB = uNodeIndexB; if (uTplColIndexB > uColIndexB) uColCountB = uTplColIndexB - uColIndexB; } // TODO: This code looks like a hangover from HMM estimation -- can we delete it? assert(uColCountA == 0); assert(uColCountB == 0); AppendTplInserts(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); switch (cType) { case 'M': { assert(uPrefixLengthA > 0); assert(uPrefixLengthB > 0); const unsigned uColA = uPrefixLengthA - 1; const unsigned uColB = uPrefixLengthB - 1; assert(uColIndexA == uColA); assert(uColIndexB == uColB); AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } case 'D': { assert(uPrefixLengthA > 0); const unsigned uColA = uPrefixLengthA - 1; assert(uColIndexA == uColA); AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } case 'I': { assert(uPrefixLengthB > 0); const unsigned uColB = uPrefixLengthB - 1; assert(uColIndexB == uColB); AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } default: assert(false); } } unsigned uInsertColCountA = uColCountA - uColIndexA; unsigned uInsertColCountB = uColCountB - uColIndexB; // TODO: This code looks like a hangover from HMM estimation -- can we delete it? 
assert(uInsertColCountA == 0); assert(uInsertColCountB == 0); AppendTplInserts(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB, uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); assert(msaCombined.GetColCount() == uEdgeCount); } static const ProfPos PPStart = { false, //m_bAllGaps; { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_uSortOrder[21]; { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_fcCounts[20]; 1.0, // m_LL; 0.0, // m_LG; 0.0, // m_GL; 0.0, // m_GG; { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_ALScores 0, // m_uResidueGroup; 1.0, // m_fOcc; 0.0, // m_fcStartOcc; 0.0, // m_fcEndOcc; 0.0, // m_scoreGapOpen; 0.0, // m_scoreGapClose; }; // MM // Ai–1 Ai Out // X X LL LL // X - LG LG // - X GL GL // - - GG GG // // Bj–1 Bj // X X LL LL // X - LG LG // - X GL GL // - - GG GG static void SetGapsMM( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wA*PPA.m_LL + wB*PPB.m_LL; PPO.m_LG = wA*PPA.m_LG + wB*PPB.m_LG; PPO.m_GL = wA*PPA.m_GL + wB*PPB.m_GL; PPO.m_GG = wA*PPA.m_GG + wB*PPB.m_GG; } // MD // Ai–1 Ai Out // X X LL LL // X - LG LG // - X GL GL // - - GG GG // // Bj (-) // X - ?L LG // - - ?G GG static void SetGapsMD( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? 
PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wA*PPA.m_LL; PPO.m_LG = wA*PPA.m_LG + wB*(PPB.m_LL + PPB.m_GL); PPO.m_GL = wA*PPA.m_GL; PPO.m_GG = wA*PPA.m_GG + wB*(PPB.m_LG + PPB.m_GG); } // DD // Ai–1 Ai Out // X X LL LL // X - LG LG // - X GL GL // - - GG GG // // (-) (-) // - - ?? GG static void SetGapsDD( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wA*PPA.m_LL; PPO.m_LG = wA*PPA.m_LG; PPO.m_GL = wA*PPA.m_GL; PPO.m_GG = wA*PPA.m_GG + wB; } // MI // Ai (-) Out // X - ?L LG // - - ?G GG // Bj–1 Bj // X X LL LL // X - LG LG // - X GL GL // - - GG GG static void SetGapsMI( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wB*PPB.m_LL; PPO.m_LG = wB*PPB.m_LG + wA*(PPA.m_LL + PPA.m_GL); PPO.m_GL = wB*PPB.m_GL; PPO.m_GG = wB*PPB.m_GG + wA*(PPA.m_LG + PPA.m_GG); } // DM // Ai–1 Ai Out // X X LL LL // X - LG LG // - X GL GL // - - GG GG // // (-) Bj // - X ?L GL // - - ?G GG static void SetGapsDM( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? 
PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wA*PPA.m_LL; PPO.m_LG = wA*PPA.m_LG; PPO.m_GL = wA*PPA.m_GL + wB*(PPB.m_LL + PPB.m_GL); PPO.m_GG = wA*PPA.m_GG + wB*(PPB.m_LG + PPB.m_GG); } // IM // (-) Ai Out // - X ?L GL // - - ?G GG // Bj–1 Bj // X X LL LL // X - LG LG // - X GL GL // - - GG GG static void SetGapsIM( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wB*PPB.m_LL; PPO.m_LG = wB*PPB.m_LG; PPO.m_GL = wB*PPB.m_GL + wA*(PPA.m_LL + PPA.m_GL); PPO.m_GG = wB*PPB.m_GG + wA*(PPA.m_LG + PPA.m_GG); } // ID // (-) Ai Out // - X ?L GL // - - ?G GG // Bj (-) // X - ?L LG // - - ?G GG static void SetGapsID( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = 0; PPO.m_LG = wB*PPB.m_GL + wB*PPB.m_LL; PPO.m_GL = wA*PPA.m_GL + wA*PPA.m_LL; PPO.m_GG = wA*(PPA.m_LG + PPA.m_GG) + wB*(PPB.m_LG + PPB.m_GG); } // DI // Ai (-) Out // X - ?L LG // - - ?G GG // (-) Bj // - X ?L GL // - - ?G GG static void SetGapsDI( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? 
PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = 0; PPO.m_LG = wA*PPA.m_GL + wA*PPA.m_LL; PPO.m_GL = wB*PPB.m_GL + wB*PPB.m_LL; PPO.m_GG = wA*(PPA.m_LG + PPA.m_GG) + wB*(PPB.m_LG + PPB.m_GG); } // II // (-) (-) Out // - - ?? GG // Bj–1 Bj // X X LL LL // X - LG LG // - X GL GL // - - GG GG static void SetGapsII( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wB*PPB.m_LL; PPO.m_LG = wB*PPB.m_LG; PPO.m_GL = wB*PPB.m_GL; PPO.m_GG = wB*PPB.m_GG + wA; } static void SetFreqs( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; if (g_bNormalizeCounts) { const FCOUNT fA = PPA.m_fOcc*wA/(wA + wB); const FCOUNT fB = PPB.m_fOcc*wB/(wA + wB); FCOUNT fTotal = 0; for (unsigned i = 0; i < 20; ++i) { const FCOUNT f = fA*PPA.m_fcCounts[i] + fB*PPB.m_fcCounts[i]; PPO.m_fcCounts[i] = f; fTotal += f; } if (fTotal > 0) for (unsigned i = 0; i < 20; ++i) PPO.m_fcCounts[i] /= fTotal; } else { for (unsigned i = 0; i < 20; ++i) PPO.m_fcCounts[i] = wA*PPA.m_fcCounts[i] + wB*PPB.m_fcCounts[i]; } } void AlignTwoProfsGivenPath(const PWPath &Path, const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos **ptrPOut, unsigned *ptruLengthOut) { #if TRACE Log("AlignTwoProfsGivenPath wA=%.3g wB=%.3g Path=\n", wA, wB); Path.LogMe(); #endif assert(BTEq(wA + wB, 1.0)); unsigned uColIndexA = 0; unsigned uColIndexB = 0; unsigned uColIndexOut = 0; const unsigned uEdgeCount = Path.GetEdgeCount(); ProfPos *POut = new 
ProfPos[uEdgeCount]; char cPrevType = 'M'; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); const char cType = Edge.cType; const unsigned uPrefixLengthA = Edge.uPrefixLengthA; const unsigned uPrefixLengthB = Edge.uPrefixLengthB; #if TRACE Log("\nEdge %u %c%u.%u ColA=%u ColB=%u\n", uEdgeIndex, Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB, uColIndexA, uColIndexB); #endif POut[uColIndexOut].m_bAllGaps = false; switch (cType) { case 'M': { assert(uPrefixLengthA > 0); assert(uPrefixLengthB > 0); SetFreqs( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); switch (cPrevType) { case 'M': SetGapsMM( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'D': SetGapsDM( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'I': SetGapsIM( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; default: Quit("Bad cPrevType"); } ++uColIndexA; ++uColIndexB; ++uColIndexOut; break; } case 'D': { assert(uPrefixLengthA > 0); SetFreqs( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, 0, POut, uColIndexOut); switch (cPrevType) { case 'M': SetGapsMD( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'D': SetGapsDD( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'I': SetGapsID( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; default: Quit("Bad cPrevType"); } ++uColIndexA; ++uColIndexOut; break; } case 'I': { assert(uPrefixLengthB > 0); SetFreqs( PA, uPrefixLengthA, 0, PB, uPrefixLengthB, wB, POut, uColIndexOut); switch (cPrevType) { case 'M': SetGapsMI( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'D': SetGapsDI( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'I': SetGapsII( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; default: 
Quit("Bad cPrevType"); } ++uColIndexB; ++uColIndexOut; break; } default: assert(false); } cPrevType = cType; } assert(uColIndexOut == uEdgeCount); ProfScoresFromFreqs(POut, uEdgeCount); ValidateProf(POut, uEdgeCount); *ptrPOut = POut; *ptruLengthOut = uEdgeCount; #if TRACE Log("AlignTwoProfsGivenPath:\n"); ListProfile(POut, uEdgeCount, 0); #endif } muscle-3.8.31.orig/nwsmall.cpp0000644000175000017500000003743411352261673015575 0ustar kratzcharles#include "muscle.h" #include #include "pwpath.h" #include "profile.h" #include // NW small memory #define TRACE 0 #if TRACE extern bool g_bKeepSimpleDP; extern SCORE *g_DPM; extern SCORE *g_DPD; extern SCORE *g_DPI; extern char *g_TBM; extern char *g_TBD; extern char *g_TBI; #endif #if TRACE #define ALLOC_TRACE() \ const SCORE UNINIT = MINUS_INFINITY; \ const size_t LM = uPrefixCountA*uPrefixCountB; \ \ SCORE *DPM_ = new SCORE[LM]; \ SCORE *DPD_ = new SCORE[LM]; \ SCORE *DPI_ = new SCORE[LM]; \ \ char *TBM_ = new char[LM]; \ char *TBD_ = new char[LM]; \ char *TBI_ = new char[LM]; \ \ memset(TBM_, '?', LM); \ memset(TBD_, '?', LM); \ memset(TBI_, '?', LM); \ \ for (unsigned i = 0; i <= uLengthA; ++i) \ for (unsigned j = 0; j <= uLengthB; ++j) \ { \ DPM(i, j) = UNINIT; \ DPD(i, j) = UNINIT; \ DPI(i, j) = UNINIT; \ } #else #define ALLOC_TRACE() #endif #if TRACE #define SetDPM(i, j, x) DPM(i, j) = x #define SetDPD(i, j, x) DPD(i, j) = x #define SetDPI(i, j, x) DPI(i, j) = x #define SetTBM(i, j, x) TBM(i, j) = x #define SetTBD(i, j, x) TBD(i, j) = x #define SetTBI(i, j, x) TBI(i, j) = x #else #define SetDPM(i, j, x) /* empty */ #define SetDPD(i, j, x) /* empty */ #define SetDPI(i, j, x) /* empty */ #define SetTBM(i, j, x) /* empty */ #define SetTBD(i, j, x) /* empty */ #define SetTBI(i, j, x) /* empty */ #endif #define RECURSE_D(i, j) \ { \ SCORE DD = DRow[j] + e; \ SCORE MD = MPrev[j] + PA[i-1].m_scoreGapOpen;\ if (DD > MD) \ { \ DRow[j] = DD; \ SetTBD(i, j, 'D'); \ } \ else \ { \ DRow[j] = MD; \ /* SetBitTBD(TB, i, j, 'M'); */ 
\ TBRow[j] &= ~BIT_xD; \ TBRow[j] |= BIT_MD; \ SetTBD(i, j, 'M'); \ } \ SetDPD(i, j, DRow[j]); \ } #define RECURSE_D_ATerm(j) RECURSE_D(uLengthA, j) #define RECURSE_D_BTerm(j) RECURSE_D(i, uLengthB) #define RECURSE_I(i, j) \ { \ Iij += e; \ SCORE MI = MCurr[j-1] + PB[j-1].m_scoreGapOpen;\ if (MI >= Iij) \ { \ Iij = MI; \ /* SetBitTBI(TB, i, j, 'M'); */ \ TBRow[j] &= ~BIT_xI; \ TBRow[j] |= BIT_MI; \ SetTBI(i, j, 'M'); \ } \ else \ SetTBI(i, j, 'I'); \ SetDPI(i, j, Iij); \ } #define RECURSE_I_ATerm(j) RECURSE_I(uLengthA, j) #define RECURSE_I_BTerm(j) RECURSE_I(i, uLengthB) #define RECURSE_M(i, j) \ { \ SCORE DM = DRow[j] + PA[i-1].m_scoreGapClose; \ SCORE IM = Iij + PB[j-1].m_scoreGapClose; \ SCORE MM = MCurr[j]; \ TB[i+1][j+1] &= ~BIT_xM; \ if (MM >= DM && MM >= IM) \ { \ MNext[j+1] += MM; \ SetDPM(i+1, j+1, MNext[j+1]); \ SetTBM(i+1, j+1, 'M'); \ /* SetBitTBM(TB, i+1, j+1, 'M'); */ \ TB[i+1][j+1] |= BIT_MM; \ } \ else if (DM >= MM && DM >= IM) \ { \ MNext[j+1] += DM; \ SetDPM(i+1, j+1, MNext[j+1]); \ SetTBM(i+1, j+1, 'D'); \ /* SetBitTBM(TB, i+1, j+1, 'D'); */ \ TB[i+1][j+1] |= BIT_DM; \ } \ else \ { \ assert(IM >= MM && IM >= DM); \ MNext[j+1] += IM; \ SetDPM(i+1, j+1, MNext[j+1]); \ SetTBM(i+1, j+1, 'I'); \ /* SetBitTBM(TB, i+1, j+1, 'I'); */ \ TB[i+1][j+1] |= BIT_IM; \ } \ } #if TRACE static bool LocalEq(BASETYPE b1, BASETYPE b2) { if (b1 < -100000 && b2 < -100000) return true; double diff = fabs(b1 - b2); if (diff < 0.0001) return true; double sum = fabs(b1) + fabs(b2); return diff/sum < 0.005; } static char Get_M_Char(char Bits) { switch (Bits & BIT_xM) { case BIT_MM: return 'M'; case BIT_DM: return 'D'; case BIT_IM: return 'I'; } Quit("Huh?"); return '?'; } static char Get_D_Char(char Bits) { return (Bits & BIT_xD) ? 'M' : 'D'; } static char Get_I_Char(char Bits) { return (Bits & BIT_xI) ? 
'M' : 'I'; } static bool DPEq(char c, SCORE *g_DP, SCORE *DPD_, unsigned uPrefixCountA, unsigned uPrefixCountB) { SCORE *DPM_ = g_DP; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) if (!LocalEq(DPM(i, j), DPD(i, j))) { Log("***DPDIFF*** DP%c(%d, %d) Simple = %.2g, Fast = %.2g\n", c, i, j, DPM(i, j), DPD(i, j)); return false; } return true; } static bool CompareTB(char **TB, char *TBM_, char *TBD_, char *TBI_, unsigned uPrefixCountA, unsigned uPrefixCountB) { SCORE *DPM_ = g_DPM; bool Eq = true; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBM(i, j); char c2 = Get_M_Char(TB[i][j]); if (c1 != '?' && c1 != c2 && DPM(i, j) > -100000) { Log("TBM(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto D; } } D: SCORE *DPD_ = g_DPD; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBD(i, j); char c2 = Get_D_Char(TB[i][j]); if (c1 != '?' && c1 != c2 && DPD(i, j) > -100000) { Log("TBD(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto I; } } I: SCORE *DPI_ = g_DPI; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBI(i, j); char c2 = Get_I_Char(TB[i][j]); if (c1 != '?' 
&& c1 != c2 && DPI(i, j) > -100000) { Log("TBI(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto Done; } } Done: if (Eq) Log("TB success\n"); return Eq; } static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -100000) return " *"; sprintf(str, "%6.1f", s); return str; } static void LogDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } static void LogBitTB(char **TB, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); Log("Bit TBM:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_M_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBD:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for 
(unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_D_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBI:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_I_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } } static void ListTB(char *TBM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = TBM(uPrefixLengthA, uPrefixLengthB); Log(" %6c", c); } Log("\n"); } } static const char *BitsToStr(char Bits) { static char Str[9]; sprintf(Str, "%cM %cD %cI", Get_M_Char(Bits), Get_D_Char(Bits), Get_I_Char(Bits)); } #endif // TRACE static inline void SetBitTBM(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MM; break; case 'D': Bit = BIT_DM; break; case 'I': Bit = BIT_IM; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xM; TB[i][j] |= Bit; } static inline void SetBitTBD(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MD; break; case 'D': Bit = BIT_DD; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xD; TB[i][j] |= Bit; } static inline void SetBitTBI(char **TB, unsigned i, unsigned j, 
char c) { char Bit; switch (c) { case 'M': Bit = BIT_MI; break; case 'I': Bit = BIT_II; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xI; TB[i][j] |= Bit; } #if TRACE #define LogMatrices() \ { \ Log("Bit DPM:\n"); \ LogDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPD:\n"); \ LogDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPI:\n"); \ LogDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit TB:\n"); \ LogBitTB(TB, PA, PB, uPrefixCountA, uPrefixCountB); \ bool Same; \ Same = DPEq('M', g_DPM, DPM_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPM success\n"); \ Same = DPEq('D', g_DPD, DPD_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPD success\n"); \ Same = DPEq('I', g_DPI, DPI_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPI success\n"); \ CompareTB(TB, g_TBM, g_TBD, g_TBI, uPrefixCountA, uPrefixCountB);\ } #else #define LogMatrices() /* empty */ #endif static unsigned uCachePrefixCountB; static unsigned uCachePrefixCountA; static SCORE *CacheMCurr; static SCORE *CacheMNext; static SCORE *CacheMPrev; static SCORE *CacheDRow; static char **CacheTB; static void AllocCache(unsigned uPrefixCountA, unsigned uPrefixCountB) { if (uPrefixCountA <= uCachePrefixCountA && uPrefixCountB <= uCachePrefixCountB) return; delete[] CacheMCurr; delete[] CacheMNext; delete[] CacheMPrev; delete[] CacheDRow; for (unsigned i = 0; i < uCachePrefixCountA; ++i) delete[] CacheTB[i]; delete[] CacheTB; uCachePrefixCountA = uPrefixCountA + 1024; uCachePrefixCountB = uPrefixCountB + 1024; CacheMCurr = new SCORE[uCachePrefixCountB]; CacheMNext = new SCORE[uCachePrefixCountB]; CacheMPrev = new SCORE[uCachePrefixCountB]; CacheDRow = new SCORE[uCachePrefixCountB]; CacheTB = new char *[uCachePrefixCountA]; for (unsigned i = 0; i < uCachePrefixCountA; ++i) CacheTB[i] = new char [uCachePrefixCountB]; } SCORE NWSmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { if (0 == uLengthB || 0 == uLengthA ) 
Quit("Internal error, NWSmall: length=0"); SetTermGaps(PA, uLengthA); SetTermGaps(PB, uLengthB); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; const SCORE e = g_scoreGapExtend; ALLOC_TRACE() AllocCache(uPrefixCountA, uPrefixCountB); SCORE *MCurr = CacheMCurr; SCORE *MNext = CacheMNext; SCORE *MPrev = CacheMPrev; SCORE *DRow = CacheDRow; char **TB = CacheTB; for (unsigned i = 0; i < uPrefixCountA; ++i) memset(TB[i], 0, uPrefixCountB); SCORE Iij = MINUS_INFINITY; SetDPI(0, 0, Iij); Iij = PB[0].m_scoreGapOpen; SetDPI(0, 1, Iij); for (unsigned j = 2; j <= uLengthB; ++j) { Iij += e; SetDPI(0, j, Iij); SetTBI(0, j, 'I'); } for (unsigned j = 0; j <= uLengthB; ++j) { DRow[j] = MINUS_INFINITY; SetDPD(0, j, DRow[j]); SetTBD(0, j, 'D'); } MPrev[0] = 0; SetDPM(0, 0, MPrev[0]); for (unsigned j = 1; j <= uLengthB; ++j) { MPrev[j] = MINUS_INFINITY; SetDPM(0, j, MPrev[j]); } MCurr[0] = MINUS_INFINITY; SetDPM(1, 0, MCurr[0]); MCurr[1] = ScoreProfPos2(PA[0], PB[0]); SetDPM(1, 1, MCurr[1]); SetBitTBM(TB, 1, 1, 'M'); SetTBM(1, 1, 'M'); for (unsigned j = 2; j <= uLengthB; ++j) { MCurr[j] = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen + (j - 2)*e + PB[j-2].m_scoreGapClose; SetDPM(1, j, MCurr[j]); SetBitTBM(TB, 1, j, 'I'); SetTBM(1, j, 'I'); } // Main DP loop for (unsigned i = 1; i < uLengthA; ++i) { char *TBRow = TB[i]; Iij = MINUS_INFINITY; SetDPI(i, 0, Iij); DRow[0] = PA[0].m_scoreGapOpen + (i - 1)*e; SetDPD(i, 0, DRow[0]); MCurr[0] = MINUS_INFINITY; if (i == 1) { MCurr[1] = ScoreProfPos2(PA[0], PB[0]); SetBitTBM(TB, i, 1, 'M'); SetTBM(i, 1, 'M'); } else { MCurr[1] = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen + (i - 2)*e + PA[i-2].m_scoreGapClose; SetBitTBM(TB, i, 1, 'D'); SetTBM(i, 1, 'D'); } SetDPM(i, 0, MCurr[0]); SetDPM(i, 1, MCurr[1]); for (unsigned j = 1; j < uLengthB; ++j) MNext[j+1] = ScoreProfPos2(PA[i], PB[j]); for (unsigned j = 1; j < uLengthB; ++j) { RECURSE_D(i, j) RECURSE_I(i, j) RECURSE_M(i, j) } // Special 
case for j=uLengthB RECURSE_D_BTerm(i) RECURSE_I_BTerm(i) // Prev := Curr, Curr := Next, Next := Prev Rotate(MPrev, MCurr, MNext); } // Special case for i=uLengthA char *TBRow = TB[uLengthA]; MCurr[0] = MINUS_INFINITY; if (uLengthA > 1) MCurr[1] = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e + PA[0].m_scoreGapOpen + PA[uLengthA-2].m_scoreGapClose; else MCurr[1] = ScoreProfPos2(PA[uLengthA-1], PB[0]) + PA[0].m_scoreGapOpen + PA[0].m_scoreGapClose; SetBitTBM(TB, uLengthA, 1, 'D'); SetTBM(uLengthA, 1, 'D'); SetDPM(uLengthA, 0, MCurr[0]); SetDPM(uLengthA, 1, MCurr[1]); DRow[0] = MINUS_INFINITY; SetDPD(uLengthA, 0, DRow[0]); for (unsigned j = 1; j <= uLengthB; ++j) RECURSE_D_ATerm(j); Iij = MINUS_INFINITY; for (unsigned j = 1; j <= uLengthB; ++j) RECURSE_I_ATerm(j) LogMatrices(); SCORE MAB = MCurr[uLengthB]; SCORE DAB = DRow[uLengthB]; SCORE IAB = Iij; SCORE Score = MAB; char cEdgeType = 'M'; if (DAB > Score) { Score = DAB; cEdgeType = 'D'; } if (IAB > Score) { Score = IAB; cEdgeType = 'I'; } #if TRACE Log(" Fast: MAB=%.4g DAB=%.4g IAB=%.4g best=%c\n", MAB, DAB, IAB, cEdgeType); #endif BitTraceBack(TB, uLengthA, uLengthB, cEdgeType, Path); #if DBEUG Path.Validate(); #endif return 0; } muscle-3.8.31.orig/clust.cpp0000644000175000017500000004460611352261612015242 0ustar kratzcharles#include "muscle.h" #include "clust.h" #include "clustset.h" #include #define TRACE 0 Clust::Clust() { m_Nodes = 0; m_uNodeCount = 0; m_uLeafCount = 0; m_uClusterCount = 0; m_JoinStyle = JOIN_Undefined; m_dDist = 0; m_uLeafCount = 0; m_ptrSet = 0; } Clust::~Clust() { delete[] m_Nodes; delete[] m_dDist; delete[] m_ClusterIndexToNodeIndex; } void Clust::Create(ClustSet &Set, CLUSTER Method) { m_ptrSet = &Set; SetLeafCount(Set.GetLeafCount()); switch (Method) { case CLUSTER_UPGMA: m_JoinStyle = JOIN_NearestNeighbor; m_CentroidStyle = LINKAGE_Avg; break; case CLUSTER_UPGMAMax: m_JoinStyle = JOIN_NearestNeighbor; m_CentroidStyle = LINKAGE_Max; break; case CLUSTER_UPGMAMin: m_JoinStyle = 
JOIN_NearestNeighbor; m_CentroidStyle = LINKAGE_Min; break; case CLUSTER_UPGMB: m_JoinStyle = JOIN_NearestNeighbor; m_CentroidStyle = LINKAGE_Biased; break; case CLUSTER_NeighborJoining: m_JoinStyle = JOIN_NeighborJoining; m_CentroidStyle = LINKAGE_NeighborJoining; break; default: Quit("Clust::Create, invalid method %d", Method); } if (m_uLeafCount <= 1) Quit("Clust::Create: no leaves"); m_uNodeCount = 2*m_uLeafCount - 1; m_Nodes = new ClustNode[m_uNodeCount]; m_ClusterIndexToNodeIndex = new unsigned[m_uLeafCount]; m_ptrClusterList = 0; for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { ClustNode &Node = m_Nodes[uNodeIndex]; Node.m_uIndex = uNodeIndex; if (uNodeIndex < m_uLeafCount) { Node.m_uSize = 1; Node.m_uLeafIndexes = new unsigned[1]; Node.m_uLeafIndexes[0] = uNodeIndex; AddToClusterList(uNodeIndex); } else Node.m_uSize = 0; } // Compute initial distance matrix between leaves SetProgressDesc("Build dist matrix"); unsigned uPairIndex = 0; const unsigned uPairCount = (m_uLeafCount*(m_uLeafCount - 1))/2; for (unsigned i = 0; i < m_uLeafCount; ++i) for (unsigned j = 0; j < i; ++j) { const float dDist = (float) m_ptrSet->ComputeDist(*this, i, j); SetDist(i, j, dDist); if (0 == uPairIndex%10000) Progress(uPairIndex, uPairCount); ++uPairIndex; } ProgressStepsDone(); // Call CreateCluster once for each internal node in the tree SetProgressDesc("Build guide tree"); m_uClusterCount = m_uLeafCount; const unsigned uInternalNodeCount = m_uNodeCount - m_uLeafCount; for (unsigned uNodeIndex = m_uLeafCount; uNodeIndex < m_uNodeCount; ++uNodeIndex) { unsigned i = uNodeIndex + 1 - m_uLeafCount; Progress(i, uInternalNodeCount); CreateCluster(); } ProgressStepsDone(); } void Clust::CreateCluster() { unsigned uLeftNodeIndex; unsigned uRightNodeIndex; float dLeftLength; float dRightLength; ChooseJoin(&uLeftNodeIndex, &uRightNodeIndex, &dLeftLength, &dRightLength); const unsigned uNewNodeIndex = m_uNodeCount - m_uClusterCount + 1; JoinNodes(uLeftNodeIndex, 
uRightNodeIndex, dLeftLength, dRightLength, uNewNodeIndex); #if TRACE Log("Merge New=%u L=%u R=%u Ld=%7.2g Rd=%7.2g\n", uNewNodeIndex, uLeftNodeIndex, uRightNodeIndex, dLeftLength, dRightLength); #endif // Compute distances to other clusters --m_uClusterCount; for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane; uNodeIndex = GetNextCluster(uNodeIndex)) { if (uNodeIndex == uLeftNodeIndex || uNodeIndex == uRightNodeIndex) continue; if (uNewNodeIndex == uNodeIndex) continue; const float dDist = ComputeDist(uNewNodeIndex, uNodeIndex); SetDist(uNewNodeIndex, uNodeIndex, dDist); } for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane; uNodeIndex = GetNextCluster(uNodeIndex)) { if (uNodeIndex == uLeftNodeIndex || uNodeIndex == uRightNodeIndex) continue; if (uNewNodeIndex == uNodeIndex) continue; #if REDLACK const float dMetric = ComputeMetric(uNewNodeIndex, uNodeIndex); InsertMetric(uNewNodeIndex, uNodeIndex, dMetric); #endif } } void Clust::ChooseJoin(unsigned *ptruLeftIndex, unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength) { switch (m_JoinStyle) { case JOIN_NearestNeighbor: ChooseJoinNearestNeighbor(ptruLeftIndex, ptruRightIndex, ptrdLeftLength, ptrdRightLength); return; case JOIN_NeighborJoining: ChooseJoinNeighborJoining(ptruLeftIndex, ptruRightIndex, ptrdLeftLength, ptrdRightLength); return; } Quit("Clust::ChooseJoin, Invalid join style %u", m_JoinStyle); } void Clust::ChooseJoinNearestNeighbor(unsigned *ptruLeftIndex, unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength) { const unsigned uClusterCount = GetClusterCount(); unsigned uMinLeftNodeIndex; unsigned uMinRightNodeIndex; GetMinMetric(&uMinLeftNodeIndex, &uMinRightNodeIndex); float dMinDist = GetDist(uMinLeftNodeIndex, uMinRightNodeIndex); const float dLeftHeight = GetHeight(uMinLeftNodeIndex); const float dRightHeight = GetHeight(uMinRightNodeIndex); *ptruLeftIndex = uMinLeftNodeIndex; *ptruRightIndex = uMinRightNodeIndex; 
*ptrdLeftLength = dMinDist/2 - dLeftHeight; *ptrdRightLength = dMinDist/2 - dRightHeight; } void Clust::ChooseJoinNeighborJoining(unsigned *ptruLeftIndex, unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength) { const unsigned uClusterCount = GetClusterCount(); //unsigned uMinLeftNodeIndex = uInsane; //unsigned uMinRightNodeIndex = uInsane; //float dMinD = PLUS_INFINITY; //for (unsigned i = GetFirstCluster(); i != uInsane; i = GetNextCluster(i)) // { // const float ri = Calc_r(i); // for (unsigned j = GetNextCluster(i); j != uInsane; j = GetNextCluster(j)) // { // const float rj = Calc_r(j); // const float dij = GetDist(i, j); // const float Dij = dij - (ri + rj); // if (Dij < dMinD) // { // dMinD = Dij; // uMinLeftNodeIndex = i; // uMinRightNodeIndex = j; // } // } // } unsigned uMinLeftNodeIndex; unsigned uMinRightNodeIndex; GetMinMetric(&uMinLeftNodeIndex, &uMinRightNodeIndex); const float dDistLR = GetDist(uMinLeftNodeIndex, uMinRightNodeIndex); const float rL = Calc_r(uMinLeftNodeIndex); const float rR = Calc_r(uMinRightNodeIndex); const float dLeftLength = (dDistLR + rL - rR)/2; const float dRightLength = (dDistLR - rL + rR)/2; *ptruLeftIndex = uMinLeftNodeIndex; *ptruRightIndex = uMinRightNodeIndex; *ptrdLeftLength = dLeftLength; *ptrdRightLength = dRightLength; } void Clust::JoinNodes(unsigned uLeftIndex, unsigned uRightIndex, float dLeftLength, float dRightLength, unsigned uNodeIndex) { ClustNode &Parent = m_Nodes[uNodeIndex]; ClustNode &Left = m_Nodes[uLeftIndex]; ClustNode &Right = m_Nodes[uRightIndex]; Left.m_dLength = dLeftLength; Right.m_dLength = dRightLength; Parent.m_ptrLeft = &Left; Parent.m_ptrRight = &Right; Left.m_ptrParent = &Parent; Right.m_ptrParent = &Parent; const unsigned uLeftSize = Left.m_uSize; const unsigned uRightSize = Right.m_uSize; const unsigned uParentSize = uLeftSize + uRightSize; Parent.m_uSize = uParentSize; assert(0 == Parent.m_uLeafIndexes); Parent.m_uLeafIndexes = new unsigned[uParentSize]; const unsigned 
uLeftBytes = uLeftSize*sizeof(unsigned); const unsigned uRightBytes = uRightSize*sizeof(unsigned); memcpy(Parent.m_uLeafIndexes, Left.m_uLeafIndexes, uLeftBytes); memcpy(Parent.m_uLeafIndexes + uLeftSize, Right.m_uLeafIndexes, uRightBytes); DeleteFromClusterList(uLeftIndex); DeleteFromClusterList(uRightIndex); AddToClusterList(uNodeIndex); } float Clust::Calc_r(unsigned uNodeIndex) const { const unsigned uClusterCount = GetClusterCount(); if (2 == uClusterCount) return 0; float dSum = 0; for (unsigned i = GetFirstCluster(); i != uInsane; i = GetNextCluster(i)) { if (i == uNodeIndex) continue; dSum += GetDist(uNodeIndex, i); } return dSum/(uClusterCount - 2); } float Clust::ComputeDist(unsigned uNewNodeIndex, unsigned uNodeIndex) { switch (m_CentroidStyle) { case LINKAGE_Avg: return ComputeDistAverageLinkage(uNewNodeIndex, uNodeIndex); case LINKAGE_Min: return ComputeDistMinLinkage(uNewNodeIndex, uNodeIndex); case LINKAGE_Max: return ComputeDistMaxLinkage(uNewNodeIndex, uNodeIndex); case LINKAGE_Biased: return ComputeDistMAFFT(uNewNodeIndex, uNodeIndex); case LINKAGE_NeighborJoining: return ComputeDistNeighborJoining(uNewNodeIndex, uNodeIndex); } Quit("Clust::ComputeDist, invalid centroid style %u", m_CentroidStyle); return (float) g_dNAN; } float Clust::ComputeDistMinLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex) { const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex); const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex); const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex); const float dDistR = GetDist(uRightNodeIndex, uNodeIndex); return (dDistL < dDistR ? dDistL : dDistR); } float Clust::ComputeDistMaxLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex) { const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex); const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex); const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex); const float dDistR = GetDist(uRightNodeIndex, uNodeIndex); return (dDistL > dDistR ? 
dDistL : dDistR); } float Clust::ComputeDistAverageLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex) { const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex); const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex); const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex); const float dDistR = GetDist(uRightNodeIndex, uNodeIndex); return (dDistL + dDistR)/2; } float Clust::ComputeDistNeighborJoining(unsigned uNewNodeIndex, unsigned uNodeIndex) { const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex); const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex); const float dDistLR = GetDist(uLeftNodeIndex, uRightNodeIndex); const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex); const float dDistR = GetDist(uRightNodeIndex, uNodeIndex); const float dDist = (dDistL + dDistR - dDistLR)/2; return dDist; } // This is a mysterious variant of UPGMA reverse-engineered from MAFFT source. float Clust::ComputeDistMAFFT(unsigned uNewNodeIndex, unsigned uNodeIndex) { const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex); const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex); const float dDistLR = GetDist(uLeftNodeIndex, uRightNodeIndex); const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex); const float dDistR = GetDist(uRightNodeIndex, uNodeIndex); const float dMinDistLR = (dDistL < dDistR ? 
dDistL : dDistR); const float dSumDistLR = dDistL + dDistR; const float dDist = dMinDistLR*(1 - g_dSUEFF) + dSumDistLR*g_dSUEFF/2; return dDist; } unsigned Clust::GetClusterCount() const { return m_uClusterCount; } void Clust::LogMe() const { Log("Clust %u leaves, %u nodes, %u clusters.\n", m_uLeafCount, m_uNodeCount, m_uClusterCount); Log("Distance matrix\n"); const unsigned uNodeCount = GetNodeCount(); Log(" "); for (unsigned i = 0; i < uNodeCount - 1; ++i) Log(" %7u", i); Log("\n"); Log(" "); for (unsigned i = 0; i < uNodeCount - 1; ++i) Log(" ------"); Log("\n"); for (unsigned i = 0; i < uNodeCount - 1; ++i) { Log("%4u: ", i); for (unsigned j = 0; j < i; ++j) Log(" %7.2g", GetDist(i, j)); Log("\n"); } Log("\n"); Log("Node Size Prnt Left Rght Length Name\n"); Log("---- ---- ---- ---- ---- ------ ----\n"); for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { const ClustNode &Node = m_Nodes[uNodeIndex]; Log("%4u %4u", uNodeIndex, Node.m_uSize); if (0 != Node.m_ptrParent) Log(" %4u", Node.m_ptrParent->m_uIndex); else Log(" "); if (0 != Node.m_ptrLeft) Log(" %4u", Node.m_ptrLeft->m_uIndex); else Log(" "); if (0 != Node.m_ptrRight) Log(" %4u", Node.m_ptrRight->m_uIndex); else Log(" "); if (uNodeIndex != m_uNodeCount - 1) Log(" %7.3g", Node.m_dLength); if (IsLeaf(uNodeIndex)) { const char *ptrName = GetNodeName(uNodeIndex); if (0 != ptrName) Log(" %s", ptrName); } if (GetRootNodeIndex() == uNodeIndex) Log(" [ROOT]"); Log("\n"); } } const ClustNode &Clust::GetNode(unsigned uNodeIndex) const { if (uNodeIndex >= m_uNodeCount) Quit("ClustNode::GetNode(%u) %u", uNodeIndex, m_uNodeCount); return m_Nodes[uNodeIndex]; } bool Clust::IsLeaf(unsigned uNodeIndex) const { return uNodeIndex < m_uLeafCount; } unsigned Clust::GetClusterSize(unsigned uNodeIndex) const { const ClustNode &Node = GetNode(uNodeIndex); return Node.m_uSize; } unsigned Clust::GetLeftIndex(unsigned uNodeIndex) const { const ClustNode &Node = GetNode(uNodeIndex); if (0 == Node.m_ptrLeft) 
Quit("Clust::GetLeftIndex: leaf"); return Node.m_ptrLeft->m_uIndex; } unsigned Clust::GetRightIndex(unsigned uNodeIndex) const { const ClustNode &Node = GetNode(uNodeIndex); if (0 == Node.m_ptrRight) Quit("Clust::GetRightIndex: leaf"); return Node.m_ptrRight->m_uIndex; } float Clust::GetLength(unsigned uNodeIndex) const { const ClustNode &Node = GetNode(uNodeIndex); return Node.m_dLength; } void Clust::SetLeafCount(unsigned uLeafCount) { if (uLeafCount <= 1) Quit("Clust::SetLeafCount(%u)", uLeafCount); m_uLeafCount = uLeafCount; const unsigned uNodeCount = GetNodeCount(); // Triangular matrix size excluding diagonal (all zeros in our case). m_uTriangularMatrixSize = (uNodeCount*(uNodeCount - 1))/2; m_dDist = new float[m_uTriangularMatrixSize]; } unsigned Clust::GetLeafCount() const { return m_uLeafCount; } unsigned Clust::VectorIndex(unsigned uIndex1, unsigned uIndex2) const { const unsigned uNodeCount = GetNodeCount(); if (uIndex1 >= uNodeCount || uIndex2 >= uNodeCount) Quit("DistVectorIndex(%u,%u) %u", uIndex1, uIndex2, uNodeCount); unsigned v; if (uIndex1 >= uIndex2) v = uIndex2 + (uIndex1*(uIndex1 - 1))/2; else v = uIndex1 + (uIndex2*(uIndex2 - 1))/2; assert(v < m_uTriangularMatrixSize); return v; } float Clust::GetDist(unsigned uIndex1, unsigned uIndex2) const { unsigned v = VectorIndex(uIndex1, uIndex2); return m_dDist[v]; } void Clust::SetDist(unsigned uIndex1, unsigned uIndex2, float dDist) { unsigned v = VectorIndex(uIndex1, uIndex2); m_dDist[v] = dDist; } float Clust::GetHeight(unsigned uNodeIndex) const { if (IsLeaf(uNodeIndex)) return 0; const unsigned uLeftIndex = GetLeftIndex(uNodeIndex); const unsigned uRightIndex = GetRightIndex(uNodeIndex); const float dLeftLength = GetLength(uLeftIndex); const float dRightLength = GetLength(uRightIndex); const float dLeftHeight = dLeftLength + GetHeight(uLeftIndex); const float dRightHeight = dRightLength + GetHeight(uRightIndex); return (dLeftHeight + dRightHeight)/2; } const char *Clust::GetNodeName(unsigned 
uNodeIndex) const { if (!IsLeaf(uNodeIndex)) Quit("Clust::GetNodeName, is not leaf"); return m_ptrSet->GetLeafName(uNodeIndex); } unsigned Clust::GetNodeId(unsigned uNodeIndex) const { if (uNodeIndex >= GetLeafCount()) return 0; return m_ptrSet->GetLeafId(uNodeIndex); } unsigned Clust::GetLeaf(unsigned uNodeIndex, unsigned uLeafIndex) const { const ClustNode &Node = GetNode(uNodeIndex); const unsigned uLeafCount = Node.m_uSize; if (uLeafIndex >= uLeafCount) Quit("Clust::GetLeaf, invalid index"); const unsigned uIndex = Node.m_uLeafIndexes[uLeafIndex]; if (uIndex >= m_uNodeCount) Quit("Clust::GetLeaf, index out of range"); return uIndex; } unsigned Clust::GetFirstCluster() const { if (0 == m_ptrClusterList) return uInsane; return m_ptrClusterList->m_uIndex; } unsigned Clust::GetNextCluster(unsigned uIndex) const { ClustNode *ptrNode = &m_Nodes[uIndex]; if (0 == ptrNode->m_ptrNextCluster) return uInsane; return ptrNode->m_ptrNextCluster->m_uIndex; } void Clust::DeleteFromClusterList(unsigned uNodeIndex) { assert(uNodeIndex < m_uNodeCount); ClustNode *ptrNode = &m_Nodes[uNodeIndex]; ClustNode *ptrPrev = ptrNode->m_ptrPrevCluster; ClustNode *ptrNext = ptrNode->m_ptrNextCluster; if (0 != ptrNext) ptrNext->m_ptrPrevCluster = ptrPrev; if (0 == ptrPrev) { assert(m_ptrClusterList == ptrNode); m_ptrClusterList = ptrNext; } else ptrPrev->m_ptrNextCluster = ptrNext; ptrNode->m_ptrNextCluster = 0; ptrNode->m_ptrPrevCluster = 0; } void Clust::AddToClusterList(unsigned uNodeIndex) { assert(uNodeIndex < m_uNodeCount); ClustNode *ptrNode = &m_Nodes[uNodeIndex]; if (0 != m_ptrClusterList) m_ptrClusterList->m_ptrPrevCluster = ptrNode; ptrNode->m_ptrNextCluster = m_ptrClusterList; ptrNode->m_ptrPrevCluster = 0; m_ptrClusterList = ptrNode; } float Clust::ComputeMetric(unsigned uIndex1, unsigned uIndex2) const { switch (m_JoinStyle) { case JOIN_NearestNeighbor: return ComputeMetricNearestNeighbor(uIndex1, uIndex2); case JOIN_NeighborJoining: return ComputeMetricNeighborJoining(uIndex1, 
uIndex2); } Quit("Clust::ComputeMetric"); return 0; } float Clust::ComputeMetricNeighborJoining(unsigned i, unsigned j) const { float ri = Calc_r(i); float rj = Calc_r(j); float dij = GetDist(i, j); float dMetric = dij - (ri + rj); return (float) dMetric; } float Clust::ComputeMetricNearestNeighbor(unsigned i, unsigned j) const { return (float) GetDist(i, j); } float Clust::GetMinMetricBruteForce(unsigned *ptruIndex1, unsigned *ptruIndex2) const { unsigned uMinLeftNodeIndex = uInsane; unsigned uMinRightNodeIndex = uInsane; float dMinMetric = PLUS_INFINITY; for (unsigned uLeftNodeIndex = GetFirstCluster(); uLeftNodeIndex != uInsane; uLeftNodeIndex = GetNextCluster(uLeftNodeIndex)) { for (unsigned uRightNodeIndex = GetNextCluster(uLeftNodeIndex); uRightNodeIndex != uInsane; uRightNodeIndex = GetNextCluster(uRightNodeIndex)) { float dMetric = ComputeMetric(uLeftNodeIndex, uRightNodeIndex); if (dMetric < dMinMetric) { dMinMetric = dMetric; uMinLeftNodeIndex = uLeftNodeIndex; uMinRightNodeIndex = uRightNodeIndex; } } } *ptruIndex1 = uMinLeftNodeIndex; *ptruIndex2 = uMinRightNodeIndex; return dMinMetric; } float Clust::GetMinMetric(unsigned *ptruIndex1, unsigned *ptruIndex2) const { return GetMinMetricBruteForce(ptruIndex1, ptruIndex2); } muscle-3.8.31.orig/profdb.cpp0000644000175000017500000000236211352261666015366 0ustar kratzcharles#include "muscle.h" #include "textfile.h" #include "seqvect.h" #include "distfunc.h" #include "msa.h" #include "tree.h" #include "clust.h" #include "profile.h" #include "clustsetmsa.h" void ProfDB() { SetOutputFileName(g_pstrOutFileName); SetInputFileName(g_pstrFileName2); SetStartTime(); TextFile file1(g_pstrFileName1); TextFile file2(g_pstrFileName2); SetMaxIters(g_uMaxIters); SetSeqWeightMethod(g_SeqWeight1); TextFile fileIn(g_pstrFileName1); MSA msa1; msa1.FromFile(fileIn); const unsigned uSeqCount1 = msa1.GetSeqCount(); if (0 == uSeqCount1) Quit("No sequences in input alignment"); SeqVect v; v.FromFASTAFile(file2); const unsigned 
uSeqCount2 = v.Length(); if (0 == uSeqCount2) Quit("No sequences in input alignment"); MSA::SetIdCount(uSeqCount1 + uSeqCount2); SetProgressDesc("Align sequence database to profile"); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount2; ++uSeqIndex) { Progress(uSeqIndex, uSeqCount2); Seq &s = *(v[uSeqIndex]); s.SetId(0); MSA msaTmp; msaTmp.FromSeq(s); MSA msaOut; ProfileProfile(msa1, msaTmp, msaOut); msa1.Copy(msaOut); } ProgressStepsDone(); TextFile fileOut(g_pstrOutFileName, true); msa1.ToFile(fileOut); } muscle-3.8.31.orig/dpreglist.cpp0000644000175000017500000000655311352261667016116 0ustar kratzcharles#include "muscle.h" #include "dpreglist.h" unsigned DPRegionList::GetDPArea() const { unsigned uArea = 0; for (unsigned i = 0; i < m_uCount; ++i) { const DPRegion &r = m_DPRegions[i]; if (DPREGIONTYPE_Rect == r.m_Type) uArea += r.m_Rect.m_uLengthA*r.m_Rect.m_uLengthB; } return uArea; } void DPRegionList::Add(const DPRegion &r) { if (m_uCount == MAX_DPREGIONS) Quit("DPRegionList::Add, overflow %d", m_uCount); m_DPRegions[m_uCount] = r; ++m_uCount; } void DPRegionList::LogMe() const { Log("DPRegionList::LogMe, count=%u\n", m_uCount); Log("Region Type StartA StartB EndA EndB\n"); Log("------ ---- ------ ------ ---- ----\n"); for (unsigned i = 0; i < m_uCount; ++i) { const DPRegion &r = m_DPRegions[i]; Log("%6u ", i); if (DPREGIONTYPE_Diag == r.m_Type) Log("Diag %6u %6u %6u %6u\n", r.m_Diag.m_uStartPosA, r.m_Diag.m_uStartPosB, r.m_Diag.m_uStartPosA + r.m_Diag.m_uLength - 1, r.m_Diag.m_uStartPosB + r.m_Diag.m_uLength - 1); else if (DPREGIONTYPE_Rect == r.m_Type) Log("Rect %6u %6u %6u %6u\n", r.m_Rect.m_uStartPosA, r.m_Rect.m_uStartPosB, r.m_Rect.m_uStartPosA + r.m_Rect.m_uLengthA - 1, r.m_Rect.m_uStartPosB + r.m_Rect.m_uLengthB - 1); else Log(" *** ERROR *** Type=%u\n", r.m_Type); } } void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL, unsigned uLengthA, unsigned uLengthB) { if (g_uDiagMargin > g_uMinDiagLength/2) Quit("Invalid parameters, diagmargin=%d 
must be <= 2*diaglength=%d", g_uDiagMargin, g_uMinDiagLength); unsigned uStartPosA = 0; unsigned uStartPosB = 0; const unsigned uDiagCount = DL.GetCount(); DPRegion r; for (unsigned uDiagIndex = 0; uDiagIndex < uDiagCount; ++uDiagIndex) { const Diag &d = DL.Get(uDiagIndex); assert(d.m_uLength >= g_uMinDiagLength); const unsigned uStartVertexA = d.m_uStartPosA + g_uDiagMargin - 1; const unsigned uStartVertexB = d.m_uStartPosB + g_uDiagMargin - 1; const unsigned uEndVertexA = d.m_uStartPosA + d.m_uLength - g_uDiagMargin; const unsigned uEndVertexB = d.m_uStartPosB + d.m_uLength - g_uDiagMargin; r.m_Type = DPREGIONTYPE_Rect; r.m_Rect.m_uStartPosA = uStartPosA; r.m_Rect.m_uStartPosB = uStartPosB; assert(uStartVertexA + 1 >= uStartPosA); assert(uStartVertexB + 1 >= uStartPosB); r.m_Rect.m_uLengthA = uStartVertexA + 1 - uStartPosA; r.m_Rect.m_uLengthB = uStartVertexB + 1 - uStartPosB; RL.Add(r); if (uEndVertexA > uStartVertexA + 1) { const unsigned uDiagLengthMinusCaps = uEndVertexA - uStartVertexA - 1; r.m_Type = DPREGIONTYPE_Diag; r.m_Diag.m_uStartPosA = uStartVertexA + 1; r.m_Diag.m_uStartPosB = uStartVertexB + 1; assert(uEndVertexA - uStartVertexA == uEndVertexB - uStartVertexB); r.m_Diag.m_uLength = uEndVertexA - uStartVertexA - 1; RL.Add(r); } uStartPosA = uEndVertexA; uStartPosB = uEndVertexB; } assert((int) uLengthA - (int) uStartPosA >= (int) g_uDiagMargin); assert((int) uLengthB - (int) uStartPosB >= (int) g_uDiagMargin); r.m_Type = DPREGIONTYPE_Rect; r.m_Rect.m_uStartPosA = uStartPosA; r.m_Rect.m_uStartPosB = uStartPosB; assert(uLengthA >= uStartPosA); assert(uLengthB >= uStartPosB); r.m_Rect.m_uLengthA = uLengthA - uStartPosA; r.m_Rect.m_uLengthB = uLengthB - uStartPosB; RL.Add(r); } muscle-3.8.31.orig/fastdistkbit.cpp0000644000175000017500000000540411352261636016602 0ustar kratzcharles#include "muscle.h" #include "distfunc.h" #include "seqvect.h" #include #define MIN(x, y) ((x) < (y) ? 
// Counts the bits that are set in BOTH bit vectors.
//
// Bits1 and Bits2 must each point to 1,000 bytes, i.e. one bit per k-mer
// of the 8,000 (20^3) possible 3-mers.
//
// The previous implementation packed the two bytes into one word and tested
// `b & 0x101`, which is non-zero when EITHER of the two bits is set, so it
// did not count common bits. Here the bytes are ANDed first and the set
// bits of the result are counted.
static unsigned CommonBitCount(const byte Bits1[], const byte Bits2[])
{
	const unsigned uByteCount = 1000;
	unsigned uCount = 0;
	for (unsigned uByteIndex = 0; uByteIndex < uByteCount; ++uByteIndex)
	{
		unsigned uCommon = (unsigned) (Bits1[uByteIndex] & Bits2[uByteIndex]);
		// Clear the lowest set bit on each pass: one iteration per common bit.
		while (uCommon != 0)
		{
			uCommon &= uCommon - 1;
			++uCount;
		}
	}
	return uCount;
}
const unsigned uBytes = uSeqCount*1000; byte *BitVector = new byte[uBytes]; memset(BitVector, 0, uBytes); SetProgressDesc("K-bit distance matrix"); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) SetKmerBitVector(*v[uSeqIndex], BitVector + uSeqIndex*1000); unsigned uDone = 0; const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const byte *Bits1 = BitVector + uSeqIndex1*1000; const unsigned uLength1 = v[uSeqIndex1]->Length(); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) { const byte *Bits2 = BitVector + uSeqIndex2*1000; const unsigned uLength2 = v[uSeqIndex2]->Length(); const float fCount = (float) CommonBitCount(Bits1, Bits2); // Distance measure = K / min(L1, L2) // K is number of distinct kmers that are found in both sequences const float fDist = fCount / MIN(uLength1, uLength2); DF.SetDist(uSeqIndex1, uSeqIndex2, fDist); if (uDone%10000 == 0) Progress(uDone, uTotal); ++uDone; } } ProgressStepsDone(); delete[] BitVector; } muscle-3.8.31.orig/profile.cpp0000644000175000017500000000577511352261673015563 0ustar kratzcharles#include "muscle.h" #include "textfile.h" #include "msa.h" #include "tree.h" #include "profile.h" #include "objscore.h" bool TreeNeededForWeighting(SEQWEIGHT s) { switch (s) { case SEQWEIGHT_ClustalW: case SEQWEIGHT_ThreeWay: return true; default: return false; } } static ProfPos *ProfileFromMSALocal(MSA &msa, Tree &tree) { const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); if (TreeNeededForWeighting(g_SeqWeight2)) { TreeFromMSA(msa, tree, g_Cluster2, g_Distance2, g_Root1); SetMuscleTree(tree); } return ProfileFromMSA(msa); } void ProfileProfile(MSA &msa1, MSA &msa2, MSA &msaOut) { //ALPHA Alpha = ALPHA_Undefined; //switch (g_SeqType) // { //case SEQTYPE_Auto: // Alpha = msa1.GuessAlpha(); // break; //case SEQTYPE_Protein: // Alpha = 
ALPHA_Amino; // break; //case SEQTYPE_DNA: // Alpha = ALPHA_DNA; // break; //case SEQTYPE_RNA: // Alpha = ALPHA_RNA; // break; //default: // Quit("Invalid SeqType"); // } //SetAlpha(Alpha); //msa1.FixAlpha(); //msa2.FixAlpha(); unsigned uLength1; unsigned uLength2; uLength1 = msa1.GetColCount(); uLength2 = msa2.GetColCount(); Tree tree1; Tree tree2; ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1); ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2); PWPath Path; ProfPos *ProfOut; unsigned uLengthOut; Progress("Aligning profiles"); AlignTwoProfs(Prof1, uLength1, 1.0, Prof2, uLength2, 1.0, Path, &ProfOut, &uLengthOut); Progress("Building output"); AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut); } // Do profile-profile alignment void Profile() { if (0 == g_pstrFileName1 || 0 == g_pstrFileName2) Quit("-profile needs -in1 and -in2"); SetSeqWeightMethod(g_SeqWeight1); TextFile file1(g_pstrFileName1); TextFile file2(g_pstrFileName2); MSA msa1; MSA msa2; MSA msaOut; Progress("Reading %s", g_pstrFileName1); msa1.FromFile(file1); Progress("%u seqs %u cols", msa1.GetSeqCount(), msa1.GetColCount()); Progress("Reading %s", g_pstrFileName2); msa2.FromFile(file2); Progress("%u seqs %u cols", msa2.GetSeqCount(), msa2.GetColCount()); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = msa1.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); msa1.FixAlpha(); msa2.FixAlpha(); SetPPScore(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) SetPPScore(PPSCORE_SPN); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); const unsigned uSumSeqCount = uSeqCount1 + uSeqCount2; MSA::SetIdCount(uSumSeqCount); ProfileProfile(msa1, msa2, msaOut); Progress("Writing output"); MuscleOutput(msaOut); } 
muscle-3.8.31.orig/vtml2.cpp0000644000175000017500000002417511352261626015160 0ustar kratzcharles#include "muscle.h" // Note: We use 32x32 arrays rather than 20x20 as this may give the compiler // optimizer an opportunity to make subscript arithmetic more efficient // (multiplying by 32 is same as shifting left by 5 bits). #define v(x) ((float) x) #define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ v(R), v(S), v(T), v(V), v(W), v(Y) }, // A C D E F G H I K L // M N P Q R S T V W Y // VTML200 float VTML_LA[32][32] = { ROW( 2.25080, 1.31180, 0.82704, 0.88740, 0.55520, 1.09860, 0.71673, 0.80805, 0.81213, 0.68712, 0.79105, 0.86777, 0.99328, 0.86644, 0.72821, 1.33924, 1.20373, 1.05956, 0.38107, 0.54373) // A ROW( 1.31180,15.79469, 0.39862, 0.42329, 0.49882, 0.65541, 0.67100, 0.97185, 0.46414, 0.55673, 0.90230, 0.63236, 0.54479, 0.47895, 0.56465, 1.18490, 0.99069, 1.21604, 0.28988, 0.91338) // C ROW( 0.82704, 0.39862, 4.18833, 2.06850, 0.25194, 0.90937, 1.01617, 0.32860, 1.03391, 0.31300, 0.42498, 1.80888, 0.81307, 1.20043, 0.63712, 1.03001, 0.88191, 0.43557, 0.26313, 0.37947) // D ROW( 0.88740, 0.42329, 2.06850, 3.08354, 0.33456, 0.77183, 0.94536, 0.43151, 1.35989, 0.45579, 0.53423, 1.15745, 0.82832, 1.66752, 0.84500, 0.98693, 0.88132, 0.54047, 0.24519, 0.52025) // E ROW( 0.55520, 0.49882, 0.25194, 0.33456, 6.08351, 0.30140, 1.02191, 1.10969, 0.37069, 1.50587, 1.41207, 0.42850, 0.41706, 0.48113, 0.41970, 0.56867, 0.57172, 0.91256, 2.02494, 3.44675) // F ROW( 1.09860, 0.65541, 0.90937, 0.77183, 0.30140, 5.62829, 0.64191, 0.28432, 0.67874, 0.30549, 0.37739, 1.01012, 0.60851, 0.65996, 0.63660, 1.03448, 0.68435, 0.40728, 0.36034, 0.35679) // G ROW( 0.71673, 0.67100, 1.01617, 0.94536, 1.02191, 0.64191, 6.05494, 0.50783, 1.03822, 0.60887, 0.55685, 1.28619, 0.72275, 1.41503, 1.24635, 0.93344, 0.83543, 0.54817, 0.81780, 1.81552) // H ROW( 0.80805, 0.97185, 0.32860, 
0.43151, 1.10969, 0.28432, 0.50783, 3.03766, 0.49310, 1.88886, 1.75039, 0.44246, 0.44431, 0.53213, 0.48153, 0.55603, 0.88168, 2.37367, 0.68494, 0.70035) // I ROW( 0.81213, 0.46414, 1.03391, 1.35989, 0.37069, 0.67874, 1.03822, 0.49310, 2.72883, 0.52739, 0.68244, 1.15671, 0.82911, 1.51333, 2.33521, 0.93858, 0.92730, 0.55467, 0.39944, 0.52549) // K ROW( 0.68712, 0.55673, 0.31300, 0.45579, 1.50587, 0.30549, 0.60887, 1.88886, 0.52739, 3.08540, 2.14480, 0.43539, 0.53630, 0.62771, 0.53025, 0.53468, 0.69924, 1.50372, 0.82822, 0.89854) // L ROW( 0.79105, 0.90230, 0.42498, 0.53423, 1.41207, 0.37739, 0.55685, 1.75039, 0.68244, 2.14480, 4.04057, 0.55603, 0.48415, 0.76770, 0.66775, 0.62409, 0.87759, 1.42742, 0.52278, 0.72067) // M ROW( 0.86777, 0.63236, 1.80888, 1.15745, 0.42850, 1.01012, 1.28619, 0.44246, 1.15671, 0.43539, 0.55603, 3.36000, 0.69602, 1.13490, 0.98603, 1.31366, 1.11252, 0.50603, 0.35810, 0.68349) // N ROW( 0.99328, 0.54479, 0.81307, 0.82832, 0.41706, 0.60851, 0.72275, 0.44431, 0.82911, 0.53630, 0.48415, 0.69602, 7.24709, 0.90276, 0.74827, 1.03719, 0.83014, 0.56795, 0.37867, 0.33127) // P ROW( 0.86644, 0.47895, 1.20043, 1.66752, 0.48113, 0.65996, 1.41503, 0.53213, 1.51333, 0.62771, 0.76770, 1.13490, 0.90276, 2.86937, 1.50116, 0.99561, 0.93103, 0.61085, 0.29926, 0.51971) // Q ROW( 0.72821, 0.56465, 0.63712, 0.84500, 0.41970, 0.63660, 1.24635, 0.48153, 2.33521, 0.53025, 0.66775, 0.98603, 0.74827, 1.50116, 4.28698, 0.84662, 0.80673, 0.51422, 0.47569, 0.59592) // R ROW( 1.33924, 1.18490, 1.03001, 0.98693, 0.56867, 1.03448, 0.93344, 0.55603, 0.93858, 0.53468, 0.62409, 1.31366, 1.03719, 0.99561, 0.84662, 2.13816, 1.52911, 0.67767, 0.45129, 0.66767) // S ROW( 1.20373, 0.99069, 0.88191, 0.88132, 0.57172, 0.68435, 0.83543, 0.88168, 0.92730, 0.69924, 0.87759, 1.11252, 0.83014, 0.93103, 0.80673, 1.52911, 2.58221, 0.98702, 0.31541, 0.57954) // T ROW( 1.05956, 1.21604, 0.43557, 0.54047, 0.91256, 0.40728, 0.54817, 2.37367, 0.55467, 1.50372, 1.42742, 0.50603, 0.56795, 0.61085, 
0.51422, 0.67767, 0.98702, 2.65580, 0.43419, 0.63805) // V ROW( 0.38107, 0.28988, 0.26313, 0.24519, 2.02494, 0.36034, 0.81780, 0.68494, 0.39944, 0.82822, 0.52278, 0.35810, 0.37867, 0.29926, 0.47569, 0.45129, 0.31541, 0.43419,31.39564, 2.51433) // W ROW( 0.54373, 0.91338, 0.37947, 0.52025, 3.44675, 0.35679, 1.81552, 0.70035, 0.52549, 0.89854, 0.72067, 0.68349, 0.33127, 0.51971, 0.59592, 0.66767, 0.57954, 0.63805, 2.51433, 7.50693) // Y }; const float VTML_SP_CENTER = (float) 22.0; #undef ROW #undef v #define v(x) ((float) (x + VTML_SP_CENTER)) #define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y, X) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ v(R), v(S), v(T), v(V), v(W), v(Y), v(X) }, // VTML 240 float VTML_SP[32][32] = { // A C D E F G H I K L M N P Q R S T V W Y X ROW( 58, 23, -12, -7, -44, 10, -23, -14, -14, -27, -17, -8, 1, -9, -22, 23, 15, 5, -74, -45, 0) // A ROW( 23, 224, -67, -63, -50, -30, -29, 1, -56, -41, -6, -33, -44, -53, -43, 15, 2, 18, -93, -6, 0) // C ROW( -12, -67, 111, 59,-104, -4, 4, -84, 6, -88, -65, 48, -13, 18, -29, 5, -7, -63,-105, -73, 0) // D ROW( -7, -63, 59, 85, -83, -17, -1, -63, 25, -60, -47, 15, -12, 40, -8, 1, -7, -47,-108, -51, 0) // E ROW( -44, -50,-104, -83, 144, -93, 4, 12, -74, 36, 30, -64, -67, -56, -65, -43, -41, -3, 63, 104, 0) // F ROW( 10, -30, -4, -17, -93, 140, -32, -95, -27, -91, -75, 4, -36, -29, -32, 5, -26, -68, -80, -79, 0) // G ROW( -23, -29, 4, -1, 4, -32, 137, -50, 6, -37, -42, 21, -23, 27, 19, -4, -12, -44, -13, 48, 0) // H ROW( -14, 1, -84, -63, 12, -95, -50, 86, -53, 53, 47, -62, -60, -47, -55, -43, -8, 69, -27, -24, 0) // I ROW( -14, -56, 6, 25, -74, -27, 6, -53, 75, -48, -30, 13, -12, 34, 68, -3, -4, -44, -71, -49, 0) // K ROW( -27, -41, -88, -60, 36, -91, -37, 53, -48, 88, 62, -63, -48, -36, -48, -47, -25, 36, -11, -4, 0) // L ROW( -17, -6, -65, -47, 30, -75, -42, 47, -30, 62, 103, -45, -54, -21, -31, -35, -9, 31, -46, -20, 0) // M ROW( -8, -33, 
48, 15, -64, 4, 21, -62, 13, -63, -45, 89, -25, 12, 2, 22, 10, -51, -79, -29, 0) // N ROW( 1, -44, -13, -12, -67, -36, -23, -60, -12, -48, -54, -25, 160, -6, -20, 5, -12, -42, -76, -83, 0) // P ROW( -9, -53, 18, 40, -56, -29, 27, -47, 34, -36, -21, 12, -6, 75, 34, 1, -4, -37, -92, -48, 0) // Q ROW( -22, -43, -29, -8, -65, -32, 19, -55, 68, -48, -31, 2, -20, 34, 113, -10, -14, -49, -58, -39, 0) // R ROW( 23, 15, 5, 1, -43, 5, -4, -43, -3, -47, -35, 22, 5, 1, -10, 53, 32, -28, -62, -31, 0) // S ROW( 15, 2, -7, -7, -41, -26, -12, -8, -4, -25, -9, 10, -12, -4, -14, 32, 68, 0, -87, -40, 0) // T ROW( 5, 18, -63, -47, -3, -68, -44, 69, -44, 36, 31, -51, -42, -37, -49, -28, 0, 74, -61, -32, 0) // V ROW( -74, -93,-105,-108, 63, -80, -13, -27, -71, -11, -46, -79, -76, -92, -58, -62, -87, -61, 289, 81, 0) // W ROW( -45, -6, -73, -51, 104, -79, 48, -24, -49, -4, -20, -29, -83, -48, -39, -31, -40, -32, 81, 162, 0) // Y ROW( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) // X }; #undef v #define v(x) ((float) (x)) #define RNC(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y, X) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ v(R), v(S), v(T), v(V), v(W), v(Y), v(X) }, float VTML_SPNoCenter[32][32] = { // A C D E F G H I K L M N P Q R S T V W Y X RNC( 58, 23, -12, -7, -44, 10, -23, -14, -14, -27, -17, -8, 1, -9, -22, 23, 15, 5, -74, -45, 0) // A RNC( 23, 224, -67, -63, -50, -30, -29, 1, -56, -41, -6, -33, -44, -53, -43, 15, 2, 18, -93, -6, 0) // C RNC( -12, -67, 111, 59,-104, -4, 4, -84, 6, -88, -65, 48, -13, 18, -29, 5, -7, -63,-105, -73, 0) // D RNC( -7, -63, 59, 85, -83, -17, -1, -63, 25, -60, -47, 15, -12, 40, -8, 1, -7, -47,-108, -51, 0) // E RNC( -44, -50,-104, -83, 144, -93, 4, 12, -74, 36, 30, -64, -67, -56, -65, -43, -41, -3, 63, 104, 0) // F RNC( 10, -30, -4, -17, -93, 140, -32, -95, -27, -91, -75, 4, -36, -29, -32, 5, -26, -68, -80, -79, 0) // G RNC( -23, -29, 4, -1, 4, -32, 137, -50, 6, -37, -42, 21, 
-23, 27, 19, -4, -12, -44, -13, 48, 0) // H RNC( -14, 1, -84, -63, 12, -95, -50, 86, -53, 53, 47, -62, -60, -47, -55, -43, -8, 69, -27, -24, 0) // I RNC( -14, -56, 6, 25, -74, -27, 6, -53, 75, -48, -30, 13, -12, 34, 68, -3, -4, -44, -71, -49, 0) // K RNC( -27, -41, -88, -60, 36, -91, -37, 53, -48, 88, 62, -63, -48, -36, -48, -47, -25, 36, -11, -4, 0) // L RNC( -17, -6, -65, -47, 30, -75, -42, 47, -30, 62, 103, -45, -54, -21, -31, -35, -9, 31, -46, -20, 0) // M RNC( -8, -33, 48, 15, -64, 4, 21, -62, 13, -63, -45, 89, -25, 12, 2, 22, 10, -51, -79, -29, 0) // N RNC( 1, -44, -13, -12, -67, -36, -23, -60, -12, -48, -54, -25, 160, -6, -20, 5, -12, -42, -76, -83, 0) // P RNC( -9, -53, 18, 40, -56, -29, 27, -47, 34, -36, -21, 12, -6, 75, 34, 1, -4, -37, -92, -48, 0) // Q RNC( -22, -43, -29, -8, -65, -32, 19, -55, 68, -48, -31, 2, -20, 34, 113, -10, -14, -49, -58, -39, 0) // R RNC( 23, 15, 5, 1, -43, 5, -4, -43, -3, -47, -35, 22, 5, 1, -10, 53, 32, -28, -62, -31, 0) // S RNC( 15, 2, -7, -7, -41, -26, -12, -8, -4, -25, -9, 10, -12, -4, -14, 32, 68, 0, -87, -40, 0) // T RNC( 5, 18, -63, -47, -3, -68, -44, 69, -44, 36, 31, -51, -42, -37, -49, -28, 0, 74, -61, -32, 0) // V RNC( -74, -93,-105,-108, 63, -80, -13, -27, -71, -11, -46, -79, -76, -92, -58, -62, -87, -61, 289, 81, 0) // W RNC( -45, -6, -73, -51, 104, -79, 48, -24, -49, -4, -20, -29, -83, -48, -39, -31, -40, -32, 81, 162, 0) // Y RNC( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) // X }; muscle-3.8.31.orig/stabilize.cpp0000644000175000017500000000115511352261673016075 0ustar kratzcharles#include "muscle.h" #include "msa.h" void Stabilize(const MSA &msa, MSA &msaStable) { const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); msaStable.SetSize(uSeqCount, uColCount); for (unsigned uId = 0; uId < uSeqCount; ++uId) { const unsigned uSeqIndex = msa.GetSeqIndex(uId); msaStable.SetSeqName(uId, msa.GetSeqName(uSeqIndex)); msaStable.SetSeqId(uSeqIndex, uId); for (unsigned 
uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msa.GetChar(uSeqIndex, uColIndex); msaStable.SetChar(uId, uColIndex, c); } } } muscle-3.8.31.orig/anchors.cpp0000644000175000017500000001407111352261673015545 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "objscore.h" #define TRACE 0 static void WindowSmooth(const SCORE Score[], unsigned uCount, unsigned uWindowLength, SCORE SmoothScore[], double dCeil) { #define Ceil(x) ((SCORE) ((x) > dCeil ? dCeil : (x))) if (1 != uWindowLength%2) Quit("WindowSmooth=%u must be odd", uWindowLength); if (uCount <= uWindowLength) { for (unsigned i = 0; i < uCount; ++i) SmoothScore[i] = 0; return; } const unsigned w2 = uWindowLength/2; for (unsigned i = 0; i < w2; ++i) { SmoothScore[i] = 0; SmoothScore[uCount - i - 1] = 0; } SCORE scoreWindowTotal = 0; for (unsigned i = 0; i < uWindowLength; ++i) { scoreWindowTotal += Ceil(Score[i]); } for (unsigned i = w2; ; ++i) { SmoothScore[i] = scoreWindowTotal/uWindowLength; if (i == uCount - w2 - 1) break; scoreWindowTotal -= Ceil(Score[i - w2]); scoreWindowTotal += Ceil(Score[i + w2 + 1]); } #undef Ceil } // Find columns that score above the given threshold. // A range of scores is defined between the average // and the maximum. The threshold is a fraction 0.0 .. 1.0 // within that range, where 0.0 is the average score // and 1.0 is the maximum score. // "Grade" is by analogy with grading on a curve. 
static void FindBestColsGrade(const SCORE Score[], unsigned uCount, double dThreshold, unsigned BestCols[], unsigned *ptruBestColCount) { SCORE scoreTotal = 0; for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) scoreTotal += Score[uIndex]; const SCORE scoreAvg = scoreTotal / uCount; SCORE scoreMax = MINUS_INFINITY; for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) if (Score[uIndex] > scoreMax) scoreMax = Score[uIndex]; unsigned uBestColCount = 0; for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) { const SCORE s = Score[uIndex]; const double dHeight = (s - scoreAvg)/(scoreMax - scoreAvg); if (dHeight >= dThreshold) { BestCols[uBestColCount] = uIndex; ++uBestColCount; } } *ptruBestColCount = uBestColCount; } // Best col only if all following criteria satisfied: // (1) Score >= min // (2) Smoothed score >= min // (3) No gaps. static void FindBestColsCombo(const MSA &msa, const SCORE Score[], const SCORE SmoothScore[], double dMinScore, double dMinSmoothScore, unsigned BestCols[], unsigned *ptruBestColCount) { const unsigned uColCount = msa.GetColCount(); unsigned uBestColCount = 0; for (unsigned uIndex = 0; uIndex < uColCount; ++uIndex) { if (Score[uIndex] < dMinScore) continue; if (SmoothScore[uIndex] < dMinSmoothScore) continue; if (msa.ColumnHasGap(uIndex)) continue; BestCols[uBestColCount] = uIndex; ++uBestColCount; } *ptruBestColCount = uBestColCount; } static void ListBestCols(const MSA &msa, const SCORE Score[], const SCORE SmoothScore[], unsigned BestCols[], unsigned uBestColCount) { const unsigned uColCount = msa.GetColCount(); const unsigned uSeqCount = msa.GetSeqCount(); Log("Col "); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%u", uSeqIndex%10); Log(" "); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { Log("%3u ", uColIndex); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%c", msa.GetChar(uSeqIndex, uColIndex)); Log(" %10.3f", Score[uColIndex]); Log(" %10.3f", 
// Thins out the list of best columns so that at most one anchor column is
// chosen per window of uWindowLength columns.
//
//   Scores              per-column scores, indexed by column
//   BestCols            candidate columns; assumed to be in ascending order
//                       (the differences below are unsigned)
//   uBestColCount       number of entries in BestCols
//   uWindowLength       window width, measured from the first candidate
//   AnchorCols          receives the chosen columns
//   ptruAnchorColCount  receives the number of chosen columns
//
// Starting from a candidate, all following candidates closer than
// uWindowLength form one group:
//   - one candidate in the group: it is chosen;
//   - two: the higher-scoring one is chosen (the second one on a tie);
//   - more than two: the one closest to the window center is chosen
//     (the earliest one on a tie).
//
// The previous code computed the window center but measured distances from
// the first column instead, and its scan stopped one entry short so the last
// column inside the window was never considered.
static void MergeBestCols(const SCORE Scores[], const unsigned BestCols[],
	unsigned uBestColCount, unsigned uWindowLength, unsigned AnchorCols[],
	unsigned *ptruAnchorColCount)
{
	unsigned uAnchorColCount = 0;
	for (unsigned n = 0; n < uBestColCount; /* advanced at end of body */)
	{
		const unsigned uFirstCol = BestCols[n];

		// Number of candidates after uFirstCol that fall inside its window.
		unsigned uCountWithinWindow = 0;
		for (unsigned i = n + 1; i < uBestColCount; ++i)
		{
			if (BestCols[i] - uFirstCol >= uWindowLength)
				break;
			++uCountWithinWindow;
		}

		unsigned uAnchorCol = uFirstCol;
		if (1 == uCountWithinWindow)
		{
			const unsigned uSecondCol = BestCols[n + 1];
			if (Scores[uFirstCol] > Scores[uSecondCol])
				uAnchorCol = uFirstCol;
			else
				uAnchorCol = uSecondCol;
		}
		else if (uCountWithinWindow > 1)
		{
			const unsigned uWindowCenter = uFirstCol + uWindowLength/2;

			// uFirstCol is itself a candidate; its distance to the center
			// is uWindowLength/2.
			unsigned uClosestDist = uWindowLength/2;
			for (unsigned i = n + 1; i <= n + uCountWithinWindow; ++i)
			{
				const unsigned uColIndex = BestCols[i];
				const unsigned uDist = (uColIndex > uWindowCenter) ?
					uColIndex - uWindowCenter : uWindowCenter - uColIndex;
				if (uDist < uClosestDist)
				{
					uAnchorCol = uColIndex;
					uClosestDist = uDist;
				}
			}
		}

		AnchorCols[uAnchorColCount] = uAnchorCol;
		++uAnchorColCount;

		// Skip the whole group that was just merged.
		n += uCountWithinWindow + 1;
	}
	*ptruAnchorColCount = uAnchorColCount;
}
g_uSmoothWindowLength, SmoothScore, g_dSmoothScoreCeil); unsigned uBestColCount; FindBestColsCombo(msa, MatchScore, SmoothScore, g_dMinBestColScore, g_dMinSmoothScore, BestCols, &uBestColCount); #if TRACE ListBestCols(msa, MatchScore, SmoothScore, BestCols, uBestColCount); #endif MergeBestCols(MatchScore, BestCols, uBestColCount, g_uAnchorSpacing, AnchorCols, ptruAnchorColCount); delete[] MatchScore; delete[] SmoothScore; delete[] BestCols; } muscle-3.8.31.orig/globalsother.cpp0000644000175000017500000000131111366144056016566 0ustar kratzcharles#include "muscle.h" #if !defined(__linux__) && !defined(_MSC_VER) && !defined(__MACH__) double GetNAN() { return 0.0; } double g_dNAN = GetNAN(); void chkmem(const char szMsg[]) { } void Break() { } char szCmdLine[4096]; const char *GetCmdLine() { return "muscle"; } double GetMemUseMB() { return 100.0; } void SaveCmdLine(int argc, char *argv[]) { for (int i = 0; i < argc; ++i) { if (i > 0) strcat(szCmdLine, " "); strcat(szCmdLine, argv[i]); } } double GetPeakMemUseMB() { return 100.0; } double GetCPUGHz() { return 2.0; } void CheckMemUse() { } double GetRAMSizeMB() { return 500.0; } #endif muscle-3.8.31.orig/types.h0000644000175000017500000000417311352261667014726 0ustar kratzcharles#ifndef types_h #define types_h typedef unsigned char byte; typedef unsigned short ushort; typedef float SCOREMATRIX[32][32]; typedef SCOREMATRIX *PTR_SCOREMATRIX; class MSA; class Seq; class ClusterTree; class DistFunc; class TextFile; class PWPath; class Tree; class SeqVect; class DistCalc; struct ProgNode; struct ProfPos; #if SINGLE_AFFINE // Compress M, D and I trace-back matrices into 4 bits enum { BIT_MM = 0x00, BIT_DM = 0x01, BIT_IM = 0x02, BIT_xM = 0x03, BIT_DD = 0x00, BIT_MD = 0x04, // ID not allowed BIT_xD = 0x04, BIT_II = 0x00, BIT_MI = 0x08, // DI not allowed BIT_xI = 0x08, }; #endif #if DOUBLE_AFFINE // Compress M, D, E, I and J trace-back matrices into 7 bits enum { BIT_MM = 0x00, BIT_DM = 0x01, BIT_EM = 0x02, BIT_IM = 0x03, BIT_JM 
= 0x04, BIT_xM = 0x07, BIT_DD = 0x00, BIT_MD = 0x08, // [EIJ]D not allowed BIT_xD = 0x08, BIT_EE = 0x00, BIT_ME = 0x10, // [DIJ]E not allowed BIT_xE = 0x10, BIT_II = 0x00, BIT_MI = 0x20, // [EDJ]I not allowed BIT_xI = 0x20, BIT_JJ = 0x00, BIT_MJ = 0x40, // [EDI]J not allowed BIT_xJ = 0x40, }; #endif enum EXIT { EXIT_Success = 0, EXIT_NotStarted = 1, EXIT_FatalError = 2, EXIT_Except = 3, }; enum NODECMP { NODECMP_Undefined = 0, NODECMP_Same = 0, // equivalent to node in old tree NODECMP_Diff = 1, // equivalent & parent is changed NODECMP_Changed = 2 // no equivalent node in old tree }; // Declare enums using macro hacks (see enums.h). #define s(t) enum t { t##_Undefined = 0, #define c(t, x) t##_##x, #define e(t) }; #include "enums.h" // Declare conversion function XXXToStr(XXX x) // for each enum type XXX. #define s(t) const char *t##ToStr(t x); #define c(t, x) /* empty */ #define e(t) /* empty */ #include "enums.h" // Declare conversion function StrToXXX(const char *Str) // for each enum type XXX. 
#define s(t) t StrTo##t(const char *Str); #define c(t, x) /* empty */ #define e(t) /* empty */ #include "enums.h" const char *BoolToStr(bool b); const char *SecsToStr(unsigned long Secs); #endif // types_h muscle-3.8.31.orig/color.cpp0000644000175000017500000001331211352261676015226 0ustar kratzcharles#include "muscle.h" #include "msa.h" static int Blosum62[23][23] = { // A B C D E F G H I K L M N P Q R S T V W X Y Z +4, -2, +0, -2, -1, -2, +0, -2, -1, -1, -1, -1, -2, -1, -1, -1, +1, +0, +0, -3, -1, -2, -1, // A -2, +6, -3, +6, +2, -3, -1, -1, -3, -1, -4, -3, +1, -1, +0, -2, +0, -1, -3, -4, -1, -3, +2, // B +0, -3, +9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -1, -2, -4, // C -2, +6, -3, +6, +2, -3, -1, -1, -3, -1, -4, -3, +1, -1, +0, -2, +0, -1, -3, -4, -1, -3, +2, // D -1, +2, -4, +2, +5, -3, -2, +0, -3, +1, -3, -2, +0, -1, +2, +0, +0, -1, -2, -3, -1, -2, +5, // E -2, -3, -2, -3, -3, +6, -3, -1, +0, -3, +0, +0, -3, -4, -3, -3, -2, -2, -1, +1, -1, +3, -3, // F +0, -1, -3, -1, -2, -3, +6, -2, -4, -2, -4, -3, +0, -2, -2, -2, +0, -2, -3, -2, -1, -3, -2, // G -2, -1, -3, -1, +0, -1, -2, +8, -3, -1, -3, -2, +1, -2, +0, +0, -1, -2, -3, -2, -1, +2, +0, // H -1, -3, -1, -3, -3, +0, -4, -3, +4, -3, +2, +1, -3, -3, -3, -3, -2, -1, +3, -3, -1, -1, -3, // I -1, -1, -3, -1, +1, -3, -2, -1, -3, +5, -2, -1, +0, -1, +1, +2, +0, -1, -2, -3, -1, -2, +1, // K -1, -4, -1, -4, -3, +0, -4, -3, +2, -2, +4, +2, -3, -3, -2, -2, -2, -1, +1, -2, -1, -1, -3, // L -1, -3, -1, -3, -2, +0, -3, -2, +1, -1, +2, +5, -2, -2, +0, -1, -1, -1, +1, -1, -1, -1, -2, // M -2, +1, -3, +1, +0, -3, +0, +1, -3, +0, -3, -2, +6, -2, +0, +0, +1, +0, -3, -4, -1, -2, +0, // N -1, -1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, +7, -1, -2, -1, -1, -2, -4, -1, -3, -1, // P -1, +0, -3, +0, +2, -3, -2, +0, -3, +1, -2, +0, +0, -1, +5, +1, +0, -1, -2, -2, -1, -1, +2, // Q -1, -2, -3, -2, +0, -3, -2, +0, -3, +2, -2, -1, +0, -2, +1, +5, -1, -1, -3, -3, -1, -2, +0, // R +1, +0, -1, +0, +0, -2, 
+0, -1, -2, +0, -2, -1, +1, -1, +0, -1, +4, +1, -2, -3, -1, -2, +0, // S +0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, +0, -1, -1, -1, +1, +5, +0, -2, -1, -2, -1, // T +0, -3, -1, -3, -2, -1, -3, -3, +3, -2, +1, +1, -3, -2, -2, -3, -2, +0, +4, -3, -1, -1, -2, // V -3, -4, -2, -4, -3, +1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3,+11, -1, +2, -3, // W -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // X -2, -3, -2, -3, -2, +3, -3, +2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, +2, -1, +7, -2, // Y -1, +2, -4, +2, +5, -3, -2, +0, -3, +1, -3, -2, +0, -1, +2, +0, +0, -1, -2, -3, -1, -2, +5, // Z }; static int toi_tab[26] = { 0, // A 1, // B 2, // C 3, // D 4, // E 5, // F 6, // G 7, // H 8, // I -1, // J 9, // K 10, // L 11, // M 12, // N -1, // O 13, // P 14, // Q 15, // R 16, // S 17, // T 17, // U 18, // V 19, // W 20, // X 21, // Y 22, // Z }; static int toi(char c) { c = toupper(c); return toi_tab[c - 'A']; } static int BlosumScore(char c1, char c2) { int i1 = toi(c1); int i2 = toi(c2); return Blosum62[i1][i2]; } /*** Consider a column with 5 As and 3 Bs. There are: 5x4 pairs of As. 3x2 pairs of Bs. 
5x3x2 AB pairs 8x7 = 5x4 + 3x2 + 5x3x2 pairs of letters ***/ static double BlosumScoreCol(const MSA &a, unsigned uColIndex) { int iCounts[23]; memset(iCounts, 0, sizeof(iCounts)); const unsigned uSeqCount = a.GetSeqCount(); unsigned uCharCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { char c = a.GetChar(uSeqIndex, uColIndex); if (IsGapChar(c)) continue; int iChar = toi(c); ++iCounts[iChar]; ++uCharCount; } if (uCharCount < 2) return -9; int iTotalScore = 0; for (int i1 = 0; i1 < 23; ++i1) { int iCounts1 = iCounts[i1]; iTotalScore += iCounts1*(iCounts1 - 1)*Blosum62[i1][i1]; for (int i2 = i1 + 1; i2 < 23; ++i2) iTotalScore += iCounts[i2]*iCounts1*2*Blosum62[i1][i2]; } int iPairCount = uCharCount*(uCharCount - 1); return (double) iTotalScore / (double) iPairCount; } /*** Consider a column with 5 As and 3 Bs. A residue of type Q scores: 5xAQ + 3xBQ ***/ static void AssignColorsCol(const MSA &a, unsigned uColIndex, int **Colors) { int iCounts[23]; memset(iCounts, 0, sizeof(iCounts)); const unsigned uSeqCount = a.GetSeqCount(); unsigned uCharCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { char c = a.GetChar(uSeqIndex, uColIndex); if (IsGapChar(c)) continue; int iChar = toi(c); ++iCounts[iChar]; ++uCharCount; } int iMostConservedType = -1; int iMostConservedCount = -1; for (unsigned i = 0; i < 23; ++i) { if (iCounts[i] > iMostConservedCount) { iMostConservedType = i; iMostConservedCount = iCounts[i]; } } double dColScore = BlosumScoreCol(a, uColIndex); int c; if (dColScore >= 3.0) c = 3; //else if (dColScore >= 1.0) // c = 2; else if (dColScore >= 0.2) c = 1; else c = 0; int Color[23]; for (unsigned uLetter = 0; uLetter < 23; ++uLetter) { double dScore = Blosum62[uLetter][iMostConservedType]; if (dScore >= dColScore) Color[uLetter] = c; else Color[uLetter] = 0; } for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { char c = a.GetChar(uSeqIndex, uColIndex); if (IsGapChar(c)) { 
Colors[uSeqIndex][uColIndex] = 0; continue; } int iLetter = toi(c); if (iLetter >= 0 && iLetter < 23) Colors[uSeqIndex][uColIndex] = Color[iLetter]; else Colors[uSeqIndex][uColIndex] = 0; } } void AssignColors(const MSA &a, int **Colors) { const unsigned uColCount = a.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) AssignColorsCol(a, uColIndex, Colors); } muscle-3.8.31.orig/msa.h0000644000175000017500000001413711352261667014343 0ustar kratzcharles#ifndef MSA_h #define MSA_h const int MAX_SEQ_NAME = 63; struct PathEdge; class TextFile; class Seq; class ClusterNode; class NodeCounts; class DataBuffer; class MSA { public: MSA(); virtual ~MSA(); public: // Ways to create an MSA void FromFile(TextFile &File); void FromFASTAFile(TextFile &File); void FromSeq(const Seq &s); void ToFile(TextFile &File) const; void ToFASTAFile(TextFile &File) const; void ToMSFFile(TextFile &File, const char *ptrComment = 0) const; void ToAlnFile(TextFile &File) const; void ToHTMLFile(TextFile &File) const; void ToPhySequentialFile(TextFile &File) const; void ToPhyInterleavedFile(TextFile &File) const; void SetSize(unsigned uSeqCount, unsigned uColCount); void SetSeqCount(unsigned uSeqCount); char GetChar(unsigned uSeqIndex, unsigned uIndex) const; unsigned GetLetter(unsigned uSeqIndex, unsigned uIndex) const; unsigned GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const; const char *GetSeqName(unsigned uSeqIndex) const; unsigned GetSeqId(unsigned uSeqIndex) const; unsigned GetSeqIndex(unsigned uId) const; bool GetSeqIndex(unsigned uId, unsigned *ptruIndex) const; double GetOcc(unsigned uColIndex) const; void GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize, FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd, FCOUNT *fcGapExtend, FCOUNT *ptrfOcc, FCOUNT *fcLL, FCOUNT *fcLG, FCOUNT *fcGL, FCOUNT *fcGG) const; bool IsGap(unsigned uSeqIndex, unsigned uColIndex) const; bool IsWildcard(unsigned uSeqIndex, unsigned uColIndex) const; bool 
IsGapColumn(unsigned uColIndex) const; bool ColumnHasGap(unsigned uColIndex) const; bool IsGapSeq(unsigned uSeqIndex) const; void SetChar(unsigned uSeqIndex, unsigned uColIndex, char c); void SetSeqName(unsigned uSeqIndex, const char szName[]); void SetSeqId(unsigned uSeqIndex, unsigned uId); bool HasGap() const; bool IsLegalLetter(unsigned uLetter) const; void GetSeq(unsigned uSeqIndex, Seq &seq) const; void Copy(const MSA &msa); double GetCons(unsigned uColIndex) const; double GetAvgCons() const; double GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const; bool GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const; void DeleteCol(unsigned uColIndex); void DeleteColumns(unsigned uColIndex, unsigned uColCount); void CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex); void DeleteSeq(unsigned uSeqIndex); // void DeleteEmptyCols(bool bProgress = false); bool IsEmptyCol(unsigned uColIndex) const; WEIGHT GetSeqWeight(unsigned uSeqIndex) const; WEIGHT GetTotalSeqWeight() const; void SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const; void NormalizeWeights(WEIGHT wTotal) const; bool WeightsSet() const; unsigned GetGCGCheckSum(unsigned uSeqIndex) const; ALPHA GuessAlpha() const; void FixAlpha(); unsigned UniqueResidueTypes(unsigned uColIndex) const; void UnWeight(); void GetNodeCounts(unsigned uAlignedColIndex, NodeCounts &Counts) const; void ValidateBreakMatrices() const; unsigned GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const; const char *GetSeqBuffer(unsigned uSeqIndex) const; unsigned AlignedColIndexToColIndex(unsigned uAlignedColIndex) const; unsigned GetSeqLength(unsigned uSeqIndex) const; void GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrdPWID, unsigned *ptruPosCount) const; void GetPairMap(unsigned uSeqIndex1, unsigned uSeqIndex2, int iMap1[], int iMap2[]) const; void LogMe() const; void ListWeights() const; void GapInfoToDataBuffer(DataBuffer &Buffer) const; void GapInfoFromDataBuffer(const 
DataBuffer &Buffer); double GetPctGroupIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const; void Clear() { Free(); } unsigned GetSeqCount() const { return m_uSeqCount; } unsigned GetColCount() const { return m_uColCount; } static bool SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2, unsigned uSeqIndex2); static void SetIdCount(unsigned uIdCount); private: friend void SetMSAWeightsMuscle(MSA &msa); friend void SetThreeWayWeightsMuscle(MSA &msa); void SetHenikoffWeightsPB() const; void SetHenikoffWeights() const; void SetGSCWeights() const; void SetUniformWeights() const; void SetClustalWWeights(const Tree &tree); void Free(); void AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel); void ExpandCache(unsigned uSeqCount, unsigned uColCount); void CalcWeights() const; void GetNameFromFASTAAnnotationLine(const char szLine[], char szName[], unsigned uBytes); void CopyCol(unsigned uFromCol, unsigned uToCol); unsigned CalcBLOSUMWeights(ClusterTree &BlosumCluster) const; void SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const; unsigned SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const; void SetSubtreeWeight2(const ClusterNode *ptrNode) const; void SetSubtreeGSCWeight(ClusterNode *ptrNode) const; void CalcHenikoffWeightsColPB(unsigned uColIndex) const; void CalcHenikoffWeightsCol(unsigned uColIndex) const; private: unsigned m_uSeqCount; unsigned m_uColCount; unsigned m_uCacheSeqLength; unsigned m_uCacheSeqCount; char **m_szSeqs; char **m_szNames; static unsigned m_uIdCount; unsigned *m_IdToSeqIndex; unsigned *m_SeqIndexToId; WEIGHT *m_Weights; }; void SeqVectFromMSA(const MSA &msa, SeqVect &v); void DeleteGappedCols(MSA &msa); void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount, MSA &msaOut); void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat); void MSAAppend(MSA &msa1, const MSA &msa2); void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned 
uSeqCount, MSA &msaOut); void AssertMSAEq(const MSA &msa1, const MSA &msa2); void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2); void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount, MSA &msaOut); void SetMSAWeightsMuscle(MSA &msa); void SetClustalWWeightsMuscle(MSA &msa); void SetThreeWayWeightsMuscle(MSA &msa); #endif // MSA_h muscle-3.8.31.orig/aln.cpp0000644000175000017500000001102311367131123014644 0ustar kratzcharles#include "muscle.h" #include #include #include "msa.h" #include "textfile.h" const unsigned uCharsPerLine = 60; const int MIN_NAME = 10; const int MAX_NAME = 32; static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex); void MSA::ToAlnFile(TextFile &File) const { if (g_bClwStrict) File.PutString("CLUSTAL W (1.81) multiple sequence alignment\n"); else { File.PutString("MUSCLE (" SHORT_VERSION ")" " multiple sequence alignment\n"); File.PutString("\n"); } int iLongestNameLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char *ptrName = GetSeqName(uSeqIndex); const char *ptrBlank = strchr(ptrName, ' '); int iLength; if (0 != ptrBlank) iLength = (int) (ptrBlank - ptrName); else iLength = (int) strlen(ptrName); if (iLength > iLongestNameLength) iLongestNameLength = iLength; } if (iLongestNameLength > MAX_NAME) iLongestNameLength = MAX_NAME; if (iLongestNameLength < MIN_NAME) iLongestNameLength = MIN_NAME; unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1; for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex) { File.PutString("\n"); unsigned uStartColIndex = uLineIndex*uCharsPerLine; unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1; if (uEndColIndex >= GetColCount()) uEndColIndex = GetColCount() - 1; char Name[MAX_NAME+1]; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char *ptrName = GetSeqName(uSeqIndex); const char *ptrBlank = strchr(ptrName, ' '); int iLength; if (0 != ptrBlank) iLength = (int) 
(ptrBlank - ptrName); else iLength = (int) strlen(ptrName); if (iLength > MAX_NAME) iLength = MAX_NAME; memset(Name, ' ', MAX_NAME); memcpy(Name, ptrName, iLength); Name[iLongestNameLength] = 0; File.PutFormat("%s ", Name); for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; ++uColIndex) { const char c = GetChar(uSeqIndex, uColIndex); File.PutFormat("%c", toupper(c)); } File.PutString("\n"); } memset(Name, ' ', MAX_NAME); Name[iLongestNameLength] = 0; File.PutFormat("%s ", Name); for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; ++uColIndex) { const char c = GetAlnConsensusChar(*this, uColIndex); File.PutChar(c); } File.PutString("\n"); } } static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex) { const unsigned uSeqCount = a.GetSeqCount(); unsigned BitMap = 0; unsigned Count = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uLetter = a.GetLetterEx(uSeqIndex, uColIndex); assert(uLetter < 32); unsigned Bit = (1 << uLetter); if (!(BitMap & Bit)) ++Count; BitMap |= Bit; } // '*' indicates positions which have a single, fully conserved residue if (1 == Count) return '*'; if (ALPHA_Amino != g_Alpha) return ' '; #define B(a) (1 << AX_##a) #define S2(a, b) S(B(a) | B(b)) #define S3(a, b, c) S(B(a) | B(b) | B(c)) #define S4(a, b, c, d) S(B(a) | B(b) | B(c) | B(d)) #define S(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return ':'; #define W3(a, b, c) W(B(a) | B(b) | B(c)) #define W4(a, b, c, d) W(B(a) | B(b) | B(c) | B(d)) #define W5(a, b, c, d, e) W(B(a) | B(b) | B(c) | B(d) | B(e)) #define W6(a, b, c, d, e, f) W(B(a) | B(b) | B(c) | B(d) | B(e) | B(f)) #define W(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return '.'; // ':' indicates that one of the following 'strong' // groups is fully conserved // STA // NEQK // NHQK // NDEQ // QHRK // MILV // MILF // HY // FYW // S3(S, T, A) S4(N, E, Q, K) S4(N, H, Q, K) S4(N, D, E, Q) S4(M, I, L, V) S4(M, I, L, F) S2(H, Y) S3(F, Y, W) // '.' 
indicates that one of the following 'weaker' // groups is fully conserved // CSA // ATV // SAG // STNK // STPA // SGND // SNDEQK // NDEQHK // NEQHRK // FVLIM // HFY W3(C, S, A) W3(A, T, V) W3(S, A, G) W4(S, T, N, K) W4(S, T, P, A) W4(S, G, N, D) W6(S, N, D, E, Q, K) W6(N, W, Q, H, R, K) W5(F, V, L, I, M) W3(H, F, Y) return ' '; } muscle-3.8.31.orig/html.cpp0000644000175000017500000000711011352261626015046 0ustar kratzcharles#include "muscle.h" #include #include #include "msa.h" #include "textfile.h" const unsigned uCharsPerLine = 60; const int MIN_NAME = 10; const int MAX_NAME = 32; extern void AssignColors(const MSA &a, int **Colors); static int **MakeColors(const MSA &a) { const unsigned uSeqCount = a.GetSeqCount(); const unsigned uColCount = a.GetColCount(); int **Colors = new int *[uSeqCount]; for (unsigned i = 0; i < uSeqCount; ++i) { Colors[i] = new int[uColCount]; memset(Colors[i], 0, uColCount*sizeof(int)); } AssignColors(a, Colors); return Colors; } static void ChangeColor(TextFile &File, int From, int To) { if (From == To) return; #define COLOR_WHITE "FFFFFF" #define COLOR_GRAY "C0C0C0" #define COLOR_BLACK "000000" #define COLOR_RED "FF0000" #define COLOR_GREEN "00FF00" #define COLOR_BLUE "5590FF" #define COLOR_LIGHTBLUE "77FFFF" #define X(c) File.PutString(""); switch (To) { case 0: X(COLOR_WHITE) break; case 1: X(COLOR_GRAY) break; case 2: X(COLOR_BLUE) break; case 3: X(COLOR_LIGHTBLUE) break; } } #define COLOR_WINDOW "FFEEE0" void MSA::ToHTMLFile(TextFile &File) const { File.PutString("\n"); File.PutString("\n"); File.PutString("
");

	int **Colors = MakeColors(*this);

	int iLongestNameLength = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
		{
		const char *ptrName = GetSeqName(uSeqIndex);
		const char *ptrBlank = strchr(ptrName, ' ');
		int iLength;
		if (0 != ptrBlank)
			iLength = (int) (ptrBlank - ptrName);
		else
			iLength = (int) strlen(ptrName);
		if (iLength > iLongestNameLength)
			iLongestNameLength = iLength;
		}
	if (iLongestNameLength > MAX_NAME)
		iLongestNameLength = MAX_NAME;
	if (iLongestNameLength < MIN_NAME)
		iLongestNameLength = MIN_NAME;

	unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1;
	int CurrentColor = -1;
	for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex)
		{
		File.PutString("\n");
		unsigned uStartColIndex = uLineIndex*uCharsPerLine;
		unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1;
		if (uEndColIndex >= GetColCount())
			uEndColIndex = GetColCount() - 1;
		char Name[MAX_NAME+1];
		for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
			{
			const char *ptrName = GetSeqName(uSeqIndex);
			const char *ptrBlank = strchr(ptrName, ' ');
			int iLength;
			if (0 != ptrBlank)
				iLength = (int) (ptrBlank - ptrName);
			else
				iLength = (int) strlen(ptrName);
			if (iLength > MAX_NAME)
				iLength = MAX_NAME;
			memset(Name, ' ', MAX_NAME);
			memcpy(Name, ptrName, iLength);
			Name[iLongestNameLength] = 0;

//			File.PutString("");
			CurrentColor = -1;
			File.PutString("");
			File.PutFormat("%s      ", Name);
			File.PutString("");
			for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
			  ++uColIndex)
				{
				const int Color = Colors[uSeqIndex][uColIndex];
				ChangeColor(File, CurrentColor, Color);
				CurrentColor = Color;
				const char c = GetChar(uSeqIndex, uColIndex);
				if (Color == 0)
					File.PutFormat("%c", tolower(c));
				else
					File.PutFormat("%c", toupper(c));
				}
			File.PutString("\n");
			}
		}
	File.PutString("\n");
	File.PutString("
\n"); File.PutString("\n"); File.PutString("\n"); } muscle-3.8.31.orig/hydro.cpp0000644000175000017500000000162711352261673015240 0ustar kratzcharles#include "muscle.h" #include "profile.h" extern void TomHydro(ProfPos *Prof, unsigned Length); // Apply hydrophobicity heuristic to a profile void Hydro(ProfPos *Prof, unsigned uLength) { if (ALPHA_Amino != g_Alpha) return; if (g_bTomHydro) { TomHydro(Prof, uLength); return; } if (0 == g_uHydrophobicRunLength) return; if (uLength <= g_uHydrophobicRunLength) return; unsigned uRunLength = 0; unsigned L2 = g_uHydrophobicRunLength/2; for (unsigned uColIndex = L2; uColIndex < uLength - L2; ++uColIndex) { ProfPos &PP = Prof[uColIndex]; bool bHydro = IsHydrophobic(PP.m_fcCounts); if (bHydro) { ++uRunLength; if (uRunLength >= g_uHydrophobicRunLength) { Prof[uColIndex-L2].m_scoreGapOpen *= (SCORE) g_dHydroFactor; Prof[uColIndex-L2].m_scoreGapClose *= (SCORE) g_dHydroFactor; } } else uRunLength = 0; } } muscle-3.8.31.orig/makerootmsa.cpp0000644000175000017500000001265611352261676016444 0ustar kratzcharles#include "muscle.h" #include "tree.h" #include "seqvect.h" #include "profile.h" #include "msa.h" #include "pwpath.h" #include "estring.h" #define TRACE 0 #define VALIDATE 0 static void PathSeq(const Seq &s, const PWPath &Path, bool bRight, Seq &sOut) { short *esA; short *esB; PathToEstrings(Path, &esA, &esB); const unsigned uSeqLength = s.Length(); const unsigned uEdgeCount = Path.GetEdgeCount(); sOut.Clear(); sOut.SetName(s.GetName()); unsigned uPos = 0; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); char cType = Edge.cType; if (bRight) { if (cType == 'I') cType = 'D'; else if (cType == 'D') cType = 'I'; } switch (cType) { case 'M': sOut.AppendChar(s[uPos++]); break; case 'D': sOut.AppendChar('-'); break; case 'I': sOut.AppendChar(s[uPos++]); break; default: Quit("PathSeq, invalid edge type %c", cType); } } } #if VALIDATE static void MakeRootSeq(const Seq 
&s, const Tree &GuideTree, unsigned uLeafNodeIndex, const ProgNode Nodes[], Seq &sRoot) { sRoot.Copy(s); unsigned uNodeIndex = uLeafNodeIndex; for (;;) { unsigned uParent = GuideTree.GetParent(uNodeIndex); if (NULL_NEIGHBOR == uParent) break; bool bRight = (GuideTree.GetLeft(uParent) == uNodeIndex); uNodeIndex = uParent; const PWPath &Path = Nodes[uNodeIndex].m_Path; Seq sTmp; PathSeq(sRoot, Path, bRight, sTmp); sTmp.SetId(0); sRoot.Copy(sTmp); } } #endif // VALIDATE static short *MakeRootSeqE(const Seq &s, const Tree &GuideTree, unsigned uLeafNodeIndex, const ProgNode Nodes[], Seq &sRoot, short *Estring1, short *Estring2) { short *EstringCurr = Estring1; short *EstringNext = Estring2; const unsigned uSeqLength = s.Length(); EstringCurr[0] = uSeqLength; EstringCurr[1] = 0; unsigned uNodeIndex = uLeafNodeIndex; for (;;) { unsigned uParent = GuideTree.GetParent(uNodeIndex); if (NULL_NEIGHBOR == uParent) break; bool bRight = (GuideTree.GetLeft(uParent) == uNodeIndex); uNodeIndex = uParent; const PWPath &Path = Nodes[uNodeIndex].m_Path; const short *EstringNode = bRight ? 
Nodes[uNodeIndex].m_EstringL : Nodes[uNodeIndex].m_EstringR; MulEstrings(EstringCurr, EstringNode, EstringNext); #if TRACE Log("\n"); Log("Curr="); LogEstring(EstringCurr); Log("\n"); Log("Node="); LogEstring(EstringNode); Log("\n"); Log("Prod="); LogEstring(EstringNext); Log("\n"); #endif short *EstringTmp = EstringNext; EstringNext = EstringCurr; EstringCurr = EstringTmp; } EstringOp(EstringCurr, s, sRoot); #if TRACE Log("Root estring="); LogEstring(EstringCurr); Log("\n"); Log("Root seq="); sRoot.LogMe(); #endif return EstringCurr; } static unsigned GetFirstNodeIndex(const Tree &tree) { if (g_bStable) return 0; return tree.FirstDepthFirstNode(); } static unsigned GetNextNodeIndex(const Tree &tree, unsigned uPrevNodeIndex) { if (g_bStable) { const unsigned uNodeCount = tree.GetNodeCount(); unsigned uNodeIndex = uPrevNodeIndex; for (;;) { ++uNodeIndex; if (uNodeIndex >= uNodeCount) return NULL_NEIGHBOR; if (tree.IsLeaf(uNodeIndex)) return uNodeIndex; } } unsigned uNodeIndex = uPrevNodeIndex; for (;;) { uNodeIndex = tree.NextDepthFirstNode(uNodeIndex); if (NULL_NEIGHBOR == uNodeIndex || tree.IsLeaf(uNodeIndex)) return uNodeIndex; } } void MakeRootMSA(const SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a) { #if TRACE Log("MakeRootMSA Tree="); GuideTree.LogMe(); #endif const unsigned uSeqCount = v.GetSeqCount(); unsigned uColCount = uInsane; unsigned uSeqIndex = 0; const unsigned uTreeNodeCount = GuideTree.GetNodeCount(); const unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const PWPath &RootPath = Nodes[uRootNodeIndex].m_Path; const unsigned uRootColCount = RootPath.GetEdgeCount(); const unsigned uEstringSize = uRootColCount + 1; short *Estring1 = new short[uEstringSize]; short *Estring2 = new short[uEstringSize]; SetProgressDesc("Root alignment"); unsigned uTreeNodeIndex = GetFirstNodeIndex(GuideTree); do { Progress(uSeqIndex, uSeqCount); unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); const Seq &s = *(v[uId]); Seq sRootE; short *es = 
MakeRootSeqE(s, GuideTree, uTreeNodeIndex, Nodes, sRootE, Estring1, Estring2); Nodes[uTreeNodeIndex].m_EstringL = EstringNewCopy(es); #if VALIDATE Seq sRoot; MakeRootSeq(s, GuideTree, uTreeNodeIndex, Nodes, sRoot); if (!sRoot.Eq(sRootE)) { Log("sRoot="); sRoot.LogMe(); Log("sRootE="); sRootE.LogMe(); Quit("Root seqs differ"); } #if TRACE Log("MakeRootSeq=\n"); sRoot.LogMe(); #endif #endif if (uInsane == uColCount) { uColCount = sRootE.Length(); a.SetSize(uSeqCount, uColCount); } else { assert(uColCount == sRootE.Length()); } a.SetSeqName(uSeqIndex, s.GetName()); a.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) a.SetChar(uSeqIndex, uColIndex, sRootE[uColIndex]); ++uSeqIndex; uTreeNodeIndex = GetNextNodeIndex(GuideTree, uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); delete[] Estring1; delete[] Estring2; ProgressStepsDone(); assert(uSeqIndex == uSeqCount); } muscle-3.8.31.orig/enumopts.h0000644000175000017500000000037411352261600015416 0ustar kratzcharles#ifndef enumopts_h #define enumopts_h struct EnumOpt { const char *pstrOpt; int iValue; }; #define s(t) extern EnumOpt t##_Opts[]; #define c(t, x) /* empty */ #define e(t) /* empty */ #include "enums.h" #endif // enumopts_h muscle-3.8.31.orig/outweights.cpp0000644000175000017500000000071511352261666016314 0ustar kratzcharles#include "muscle.h" #include "msa.h" void OutWeights(const char *FileName, const MSA &msa) { FILE *f = fopen(FileName, "w"); if (0 == f) Quit("Cannot open '%s'", FileName); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const char *Id = msa.GetSeqName(uSeqIndex); const WEIGHT w = msa.GetSeqWeight(uSeqIndex); fprintf(f, "%s\t%.3g\n", Id, w); } fclose(f); } muscle-3.8.31.orig/sw.cpp0000644000175000017500000001314711352261667014547 0ustar kratzcharles#include "muscle.h" #include #include "pwpath.h" #include "profile.h" #include // Textbook Smith-Waterman affine gap implementation. 
#define TRACE 0 static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (MINUS_INFINITY == s) return " *"; sprintf(str, "%6.2f", s); return str; } static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } SCORE SW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; // Allocate DP matrices const size_t LM = uPrefixCountA*uPrefixCountB; SCORE *DPM_ = new SCORE[LM]; SCORE *DPD_ = new SCORE[LM]; SCORE *DPI_ = new SCORE[LM]; DPM(0, 0) = 0; DPD(0, 0) = MINUS_INFINITY; DPI(0, 0) = MINUS_INFINITY; DPM(1, 0) = MINUS_INFINITY; DPD(1, 0) = MINUS_INFINITY; DPI(1, 0) = MINUS_INFINITY; DPM(0, 1) = MINUS_INFINITY; DPD(0, 1) = MINUS_INFINITY; DPI(0, 1) = MINUS_INFINITY; // Empty prefix of B is special case for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { // M=LetterA+LetterB, impossible with empty prefix DPM(uPrefixLengthA, 0) = MINUS_INFINITY; // D=LetterA+GapB, never optimal in local alignment with gap penalties DPD(uPrefixLengthA, 0) = MINUS_INFINITY; // I=GapA+LetterB, impossible with empty prefix DPI(uPrefixLengthA, 0) = MINUS_INFINITY; } // Empty prefix of A is special case for (unsigned uPrefixLengthB = 2; 
uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { // M=LetterA+LetterB, impossible with empty prefix DPM(0, uPrefixLengthB) = MINUS_INFINITY; // D=LetterA+GapB, impossible with empty prefix DPD(0, uPrefixLengthB) = MINUS_INFINITY; // I=GapA+LetterB, never optimal in local alignment with gap penalties DPI(0, uPrefixLengthB) = MINUS_INFINITY; } SCORE scoreMax = MINUS_INFINITY; unsigned uPrefixLengthAMax = uInsane; unsigned uPrefixLengthBMax = uInsane; // ============ // Main DP loop // ============ SCORE scoreGapCloseB = MINUS_INFINITY; for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreGapCloseA = MINUS_INFINITY; for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; { // Match M=LetterA+LetterB SCORE scoreLL = ScoreProfPos2(PPA, PPB); SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1); SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA; SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB; SCORE scoreBest; if (scoreMM >= scoreDM && scoreMM >= scoreIM) scoreBest = scoreMM; else if (scoreDM >= scoreMM && scoreDM >= scoreIM) scoreBest = scoreDM; else { assert(scoreIM >= scoreMM && scoreIM >= scoreDM); scoreBest = scoreIM; } if (scoreBest < 0) scoreBest = 0; scoreBest += scoreLL; if (scoreBest > scoreMax) { scoreMax = scoreBest; uPrefixLengthAMax = uPrefixLengthA; uPrefixLengthBMax = uPrefixLengthB; } DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest; } { // Delete D=LetterA+GapB SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen; SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB); SCORE scoreBest; if (scoreMD >= scoreDD) scoreBest = scoreMD; else { assert(scoreDD >= scoreMD); scoreBest = scoreDD; } DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert I=GapA+LetterB { SCORE scoreMI = DPM(uPrefixLengthA, 
uPrefixLengthB-1) + PB[uPrefixLengthB - 1].m_scoreGapOpen; SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1); SCORE scoreBest; if (scoreMI >= scoreII) scoreBest = scoreMI; else { assert(scoreII > scoreMI); scoreBest = scoreII; } DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; } scoreGapCloseA = PPA.m_scoreGapClose; } scoreGapCloseB = PPB.m_scoreGapClose; } #if TRACE Log("DPM:\n"); ListDP(DPM_, PA, PB, uPrefixLengthA, uPrefixLengthB); Log("DPD:\n"); ListDP(DPD_, PA, PB, uPrefixLengthA, uPrefixLengthB); Log("DPI:\n"); ListDP(DPI_, PA, PB, uPrefixLengthA, uPrefixLengthB); #endif assert(scoreMax == DPM(uPrefixLengthAMax, uPrefixLengthBMax)); TraceBackSW(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_, uPrefixLengthAMax, uPrefixLengthBMax, Path); #if TRACE SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path); Path.LogMe(); Log("Score = %s Path = %s\n", LocalScoreToStr(scoreMax), LocalScoreToStr(scorePath)); #endif delete[] DPM_; delete[] DPD_; delete[] DPI_; return scoreMax; } muscle-3.8.31.orig/glbalignle.cpp0000644000175000017500000002435211352261667016216 0ustar kratzcharles#include "muscle.h" #include "profile.h" #include "pwpath.h" #define OCC 1 struct DP_MEMORY { unsigned uLength; SCORE *GapOpenA; SCORE *GapOpenB; SCORE *GapCloseA; SCORE *GapCloseB; SCORE *MPrev; SCORE *MCurr; SCORE *MWork; SCORE *DPrev; SCORE *DCurr; SCORE *DWork; SCORE **ScoreMxB; #if OCC FCOUNT *OccA; FCOUNT *OccB; #endif unsigned **SortOrderA; unsigned *uDeletePos; FCOUNT **FreqsA; int **TraceBack; }; static struct DP_MEMORY DPM; static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) { // Max prefix length unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; if (uLength < DPM.uLength) return; // Add 256 to allow for future expansion and // round up to next multiple of 32. 
uLength += 256; uLength += 32 - uLength%32; const unsigned uOldLength = DPM.uLength; if (uOldLength > 0) { for (unsigned i = 0; i < uOldLength; ++i) { delete[] DPM.TraceBack[i]; delete[] DPM.FreqsA[i]; delete[] DPM.SortOrderA[i]; } for (unsigned n = 0; n < 20; ++n) delete[] DPM.ScoreMxB[n]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.uDeletePos; delete[] DPM.GapOpenA; delete[] DPM.GapOpenB; delete[] DPM.GapCloseA; delete[] DPM.GapCloseB; delete[] DPM.SortOrderA; delete[] DPM.FreqsA; delete[] DPM.ScoreMxB; delete[] DPM.TraceBack; #if OCC delete[] DPM.OccA; delete[] DPM.OccB; #endif } DPM.uLength = uLength; DPM.GapOpenA = new SCORE[uLength]; DPM.GapOpenB = new SCORE[uLength]; DPM.GapCloseA = new SCORE[uLength]; DPM.GapCloseB = new SCORE[uLength]; #if OCC DPM.OccA = new FCOUNT[uLength]; DPM.OccB = new FCOUNT[uLength]; #endif DPM.SortOrderA = new unsigned*[uLength]; DPM.FreqsA = new FCOUNT*[uLength]; DPM.ScoreMxB = new SCORE*[20]; DPM.MPrev = new SCORE[uLength]; DPM.MCurr = new SCORE[uLength]; DPM.MWork = new SCORE[uLength]; DPM.DPrev = new SCORE[uLength]; DPM.DCurr = new SCORE[uLength]; DPM.DWork = new SCORE[uLength]; DPM.uDeletePos = new unsigned[uLength]; DPM.TraceBack = new int*[uLength]; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) DPM.ScoreMxB[uLetter] = new SCORE[uLength]; for (unsigned i = 0; i < uLength; ++i) { DPM.SortOrderA[i] = new unsigned[20]; DPM.FreqsA[i] = new FCOUNT[20]; DPM.TraceBack[i] = new int[uLength]; } } SCORE GlobalAlignLE(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { SetTermGaps(PA, uLengthA); SetTermGaps(PB, uLengthB); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; AllocDPMem(uLengthA, uLengthB); SCORE *GapOpenA = DPM.GapOpenA; SCORE *GapOpenB = DPM.GapOpenB; SCORE *GapCloseA = DPM.GapCloseA; SCORE *GapCloseB = DPM.GapCloseB; unsigned **SortOrderA = 
DPM.SortOrderA; FCOUNT **FreqsA = DPM.FreqsA; SCORE **ScoreMxB = DPM.ScoreMxB; SCORE *MPrev = DPM.MPrev; SCORE *MCurr = DPM.MCurr; SCORE *MWork = DPM.MWork; SCORE *DPrev = DPM.DPrev; SCORE *DCurr = DPM.DCurr; SCORE *DWork = DPM.DWork; #if OCC FCOUNT *OccA = DPM.OccA; FCOUNT *OccB = DPM.OccB; #endif unsigned *uDeletePos = DPM.uDeletePos; int **TraceBack = DPM.TraceBack; for (unsigned i = 0; i < uLengthA; ++i) { GapOpenA[i] = PA[i].m_scoreGapOpen; GapCloseA[i] = PA[i].m_scoreGapClose; #if OCC OccA[i] = PA[i].m_fOcc; #endif for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; } } for (unsigned j = 0; j < uLengthB; ++j) { GapOpenB[j] = PB[j].m_scoreGapOpen; GapCloseB[j] = PB[j].m_scoreGapClose; #if OCC OccB[j] = PB[j].m_fOcc; #endif } for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { for (unsigned j = 0; j < uLengthB; ++j) ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; } for (unsigned i = 0; i < uPrefixCountA; ++i) memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); // Special case for i=0 unsigned **ptrSortOrderA = SortOrderA; FCOUNT **ptrFreqsA = FreqsA; assert(ptrSortOrderA == &(SortOrderA[0])); assert(ptrFreqsA == &(FreqsA[0])); TraceBack[0][0] = 0; SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][0]; } if (0 == scoreSum) MPrev[0] = -2.5; else { #if OCC MPrev[0] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[0]; #else MPrev[0] = (logf(scoreSum) - g_scoreCenter); #endif } // D(0,0) is -infinity (requires I->D). 
DPrev[0] = MINUS_INFINITY; for (unsigned j = 1; j < uLengthB; ++j) { // Only way to get M(0, j) looks like this: // A ----X // B XXXXX // 0 j // So gap-open at j=0, gap-close at j-1. SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][j]; } if (0 == scoreSum) MPrev[j] = -2.5; else { #if OCC MPrev[j] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[j] + GapOpenB[0] + GapCloseB[j-1]; #else MPrev[j] = (logf(scoreSum) - g_scoreCenter) + GapOpenB[0] + GapCloseB[j-1]; #endif } TraceBack[0][j] = -(int) j; // Assume no D->I transitions, then can't be a delete if only // one letter from A. DPrev[j] = MINUS_INFINITY; } SCORE IPrev_j_1; for (unsigned i = 1; i < uLengthA; ++i) { ++ptrSortOrderA; ++ptrFreqsA; assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); SCORE *ptrMCurr_j = MCurr; memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); const FCOUNT *FreqsAi = *ptrFreqsA; const unsigned *SortOrderAi = *ptrSortOrderA; const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20; const SCORE *ptrMCurrMax = MCurr + uLengthB; for (const unsigned *ptrSortOrderAi = SortOrderAi; ptrSortOrderAi != ptrSortOrderAiEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; SCORE *NSBR_Letter = ScoreMxB[uLetter]; const FCOUNT fcLetter = FreqsAi[uLetter]; if (0 == fcLetter) break; SCORE *ptrNSBR = NSBR_Letter; for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) *ptrMCurr += fcLetter*(*ptrNSBR++); } #if OCC const FCOUNT OccAi = OccA[i]; #endif for (unsigned j = 0; j < uLengthB; ++j) { if (MCurr[j] == 0) MCurr[j] = -2.5; else #if OCC MCurr[j] = (logf(MCurr[j]) - g_scoreCenter)*OccAi*OccB[j]; #else MCurr[j] = (logf(MCurr[j]) - g_scoreCenter); #endif } 
ptrMCurr_j = MCurr; unsigned *ptrDeletePos = uDeletePos; // Special case for j=0 // Only way to get M(i, 0) looks like this: // 0 i // A XXXXX // B ----X // So gap-open at i=0, gap-close at i-1. assert(ptrMCurr_j == &(MCurr[0])); *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; ++ptrMCurr_j; int *ptrTraceBack_ij = TraceBack[i]; *ptrTraceBack_ij++ = (int) i; SCORE *ptrMPrev_j = MPrev; SCORE *ptrDPrev = DPrev; SCORE d = *ptrDPrev; SCORE DNew = *ptrMPrev_j + GapOpenA[i]; if (DNew > d) { d = DNew; *ptrDeletePos = i; } SCORE *ptrDCurr = DCurr; assert(ptrDCurr == &(DCurr[0])); *ptrDCurr = d; // Can't have an insert if no letters from B IPrev_j_1 = MINUS_INFINITY; unsigned uInsertPos = 0; const SCORE scoreGapOpenAi = GapOpenA[i]; const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; for (unsigned j = 1; j < uLengthB; ++j) { // Here, MPrev_j is preserved from previous // iteration so with current i,j is M[i-1][j-1] SCORE MPrev_j = *ptrMPrev_j; SCORE INew = MPrev_j + GapOpenB[j]; if (INew > IPrev_j_1) { IPrev_j_1 = INew; uInsertPos = j; } SCORE scoreMax = MPrev_j; assert(ptrDPrev == &(DPrev[j-1])); SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; if (scoreD > scoreMax) { scoreMax = scoreD; assert(ptrDeletePos == &(uDeletePos[j-1])); *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; assert(*ptrTraceBack_ij > 0); } ++ptrDeletePos; SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; if (scoreI > scoreMax) { scoreMax = scoreI; *ptrTraceBack_ij = (int) uInsertPos - (int) j; assert(*ptrTraceBack_ij < 0); } assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); *ptrMCurr_j += scoreMax; assert(ptrMCurr_j == &(MCurr[j])); ++ptrMCurr_j; MPrev_j = *(++ptrMPrev_j); assert(ptrDPrev == &(DPrev[j])); SCORE d = *ptrDPrev; SCORE DNew = MPrev_j + scoreGapOpenAi; if (DNew > d) { d = DNew; assert(ptrDeletePos == &uDeletePos[j]); *ptrDeletePos = i; } assert(ptrDCurr + 1 == &(DCurr[j])); *(++ptrDCurr) = d; ++ptrTraceBack_ij; } Rotate(MPrev, MCurr, MWork); Rotate(DPrev, DCurr, DWork); } // 
Special case for i=uLengthA SCORE IPrev = MINUS_INFINITY; unsigned uInsertPos; for (unsigned j = 1; j < uLengthB; ++j) { SCORE INew = MPrev[j-1] + GapOpenB[j]; if (INew > IPrev) { uInsertPos = j; IPrev = INew; } } // Special case for i=uLengthA, j=uLengthB SCORE scoreMax = MPrev[uLengthB-1]; int iTraceBack = 0; SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; if (scoreD > scoreMax) { scoreMax = scoreD; iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; } SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; if (scoreI > scoreMax) { scoreMax = scoreI; iTraceBack = (int) uInsertPos - (int) uLengthB; } TraceBack[uLengthA][uLengthB] = iTraceBack; TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); return scoreMax; } muscle-3.8.31.orig/setgscweights.cpp0000644000175000017500000001366311352261600016767 0ustar kratzcharles/*** Gerstein/Sonnhammer/Chothia ad hoc sequence weighting. The algorithm was deduced by reverse-engineering the HMMer code. I used an alternative representation that I prefer over HMMer's. The HMMer code is full of tree manipulations that do something to the left child and then the equivalent thing to the right child. It was clear that there must be a re-formulation that does everything once for each node, which would reduce the number of operations expressed in the code by a factor of two. This gives a more elegant and less error-prone way to code it. These notes explain the correspondence between my design and Eddy's. HMMer stores a data structure phylo_s for each non-leaf node in the cluster tree. This structure contains the following fields: diff Weight of the node lblen Left branch length rblen Right branch length The lblen and rblen branch lengths are calculated as: this.lblen = this.diff - left.diff this.rblen = this.diff - right.diff My code stores one ClusterNode data structure per node in the cluster tree, including leaves. I store only the weight. 
I can recover the HMMer branch length fields
in a trivial O(1) calculation as follows:

    lblen = Node.GetWeight() - Node.GetLeft()->GetWeight()
    rblen = Node.GetWeight() - Node.GetRight()->GetWeight()

For the GSC weights calculation, HMMer constructs the
following vectors, which have entries for all nodes,
including leaves:

    lwt     Left weight
    rwt     Right weight

The "left weight" is calculated as the sum of the weights in
all the nodes reachable through the left branch, including
the node itself. (This is not immediately obvious from the
code, which does the calculation using branch lengths rather
than weights, but this is an equivalent, and to my mind clearer,
statement of what they are). Similarly, the "right weight" is
the sum of all weights reachable via the right branch. I define
the "cluster weight" to be the summed weight of all nodes in the
subtree under the node, including the node itself. I provide
a function Node.GetClusterWeight() which calculates the cluster
weight using a O(ln N) recursion through the tree. The lwt and
rwt values can be recovered as follows:

    lwt = Node.GetLeft()->GetClusterWeight() + Node.GetWeight()

    rwt = Node.GetRight()->GetClusterWeight() + Node.GetWeight()

HMMer calculates a further vector fwt as follows.

    this.fwt = parent.fwt * parent.lwt / (parent.lwt + parent.rwt)

This applies to nodes reached via a left branch; for nodes reached
via a right branch:

    this.fwt = parent.fwt * parent.rwt / (parent.lwt + parent.rwt)

The values of fwt at the leaf nodes are the final GSC weights.
We derive the various terms using our equivalents.
parent.lwt = Parent.GetLeft()->GetClusterWeight() + Parent.GetWeight() parent.rwt = Parent.GetRight()->GetClusterWeight() + Parent.GetWeight() parent.lwt + parent.rwt = { Parent.GetLeft()->GetClusterWeight() + Parent.GetRight()->GetClusterWeight() + Parent.GetWeight() } + Parent.GetWeight() We recognize the term {...} as the cluster weight of the parent, so parent.lwt + parent.rwt = Parent.GetClusterWeight() + Parent.GetWeight() As you would expect, repeating this exercise for parent.rwt gives exactly the same expression. The GSC weights (fwt) are stored in the Weight2 field of the cluster tree, the Weight field stores the original (BLOSUM) weights used as input to this algorithm. ***/ #include "muscle.h" #include "msa.h" #include "cluster.h" #include "distfunc.h" // Set weights of all sequences in the subtree under given node. void MSA::SetSubtreeWeight2(const ClusterNode *ptrNode) const { if (0 == ptrNode) return; const ClusterNode *ptrRight = ptrNode->GetRight(); const ClusterNode *ptrLeft = ptrNode->GetLeft(); // If leaf, set weight if (0 == ptrRight && 0 == ptrLeft) { unsigned uIndex = ptrNode->GetIndex(); double dWeight = ptrNode->GetWeight2(); WEIGHT w = DoubleToWeight(dWeight); m_Weights[uIndex] = w; return; } // Otherwise, recursively set subtrees SetSubtreeWeight2(ptrLeft); SetSubtreeWeight2(ptrRight); } void MSA::SetSubtreeGSCWeight(ClusterNode *ptrNode) const { if (0 == ptrNode) return; ClusterNode *ptrParent = ptrNode->GetParent(); double dParentWeight2 = ptrParent->GetWeight2(); double dParentClusterWeight = ptrParent->GetClusterWeight(); if (0.0 == dParentClusterWeight) { double dThisClusterSize = ptrNode->GetClusterSize(); double dParentClusterSize = ptrParent->GetClusterSize(); double dWeight2 = dParentWeight2*dThisClusterSize/dParentClusterSize; ptrNode->SetWeight2(dWeight2); } else { // Could cache cluster weights for better performance. // We calculate cluster weight of each node twice, so this // would give x2 improvement. 
// As weighting is not very expensive, we don't care. double dThisClusterWeight = ptrNode->GetClusterWeight(); double dParentWeight = ptrParent->GetWeight(); double dNum = dThisClusterWeight + dParentWeight; double dDenom = dParentClusterWeight + dParentWeight; double dWeight2 = dParentWeight2*(dNum/dDenom); ptrNode->SetWeight2(dWeight2); } SetSubtreeGSCWeight(ptrNode->GetLeft()); SetSubtreeGSCWeight(ptrNode->GetRight()); } void MSA::SetGSCWeights() const { ClusterTree CT; CalcBLOSUMWeights(CT); // Calculate weights and store in tree. ClusterNode *ptrRoot = CT.GetRoot(); ptrRoot->SetWeight2(1.0); SetSubtreeGSCWeight(ptrRoot->GetLeft()); SetSubtreeGSCWeight(ptrRoot->GetRight()); // Copy weights from tree to MSA. SetSubtreeWeight2(ptrRoot); } void MSA::ListWeights() const { const unsigned uSeqCount = GetSeqCount(); Log("Weights:\n"); WEIGHT wTotal = 0; for (unsigned n = 0; n < uSeqCount; ++n) { wTotal += GetSeqWeight(n); Log("%6.3f %s\n", GetSeqWeight(n), GetSeqName(n)); } Log("Total weights = %6.3f, should be 1.0\n", wTotal); } muscle-3.8.31.orig/gapscoredimer.h0000644000175000017500000000525311352261626016401 0ustar kratzcharles// source code generated by dimer.py static SCORE GapScoreMM(const ProfPos &PPA, const ProfPos &PPB) { return g_scoreGapOpen*(PPA.m_LL*PPB.m_LG + PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_LG) + g_scoreGapExtend*(PPA.m_LL*PPB.m_GG + PPA.m_GG*PPB.m_LL) + g_scoreGapAmbig*(PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_GL); } static SCORE GapScoreMD(const ProfPos &PPA, const ProfPos &PPB) { return g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) + g_scoreGapExtend*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG) + g_scoreGapAmbig*(PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG); } static SCORE GapScoreMI(const ProfPos &PPA, const ProfPos &PPB) { return g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) + g_scoreGapExtend*(PPA.m_LG*PPB.m_LL + PPA.m_GG*PPB.m_LL) + 
g_scoreGapAmbig*(PPA.m_LG*PPB.m_GL + PPA.m_GG*PPB.m_GL); } static SCORE GapScoreDM(const ProfPos &PPA, const ProfPos &PPB) { return g_scoreGapOpen*(PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL) + g_scoreGapExtend*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG) + g_scoreGapAmbig*(PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_LL + PPA.m_GG*PPB.m_GL); } static SCORE GapScoreDD(const ProfPos &PPA, const ProfPos &PPB) { return g_scoreGapExtend*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GL + PPA.m_LL*PPB.m_GG) + g_scoreGapAmbig*(PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GL + PPA.m_GL*PPB.m_GG); } static SCORE GapScoreDI(const ProfPos &PPA, const ProfPos &PPB) { return g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) + g_scoreGapAmbig*(PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL + PPA.m_GG*PPB.m_LL + PPA.m_GG*PPB.m_GL); } static SCORE GapScoreIM(const ProfPos &PPA, const ProfPos &PPB) { return g_scoreGapOpen*(PPA.m_LL*PPB.m_LG + PPA.m_GL*PPB.m_LG) + g_scoreGapExtend*(PPA.m_LG*PPB.m_LL + PPA.m_GG*PPB.m_LL) + g_scoreGapAmbig*(PPA.m_LL*PPB.m_GG + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_GL); } static SCORE GapScoreID(const ProfPos &PPA, const ProfPos &PPB) { return g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) + g_scoreGapAmbig*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG + PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG); } static SCORE GapScoreII(const ProfPos &PPA, const ProfPos &PPB) { return g_scoreGapExtend*(PPA.m_LL*PPB.m_LL + PPA.m_LG*PPB.m_LL + PPA.m_GL*PPB.m_LL + PPA.m_GG*PPB.m_LL) + g_scoreGapAmbig*(PPA.m_LL*PPB.m_GL + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_GL + PPA.m_GG*PPB.m_GL); } muscle-3.8.31.orig/make.out0000644000175000017500000000245411352261633015050 0ustar kratzcharlesg++ -O3 -march=pentiumpro -mcpu=pentiumpro -funroll-loops -Winline -DNDEBUG=1 -o muscle aligngivenpath.o aligngivenpathsw.o aligntwomsas.o aligntwoprofs.o alpha.o anchors.o blosumla.o 
clust.o cluster.o clwwt.o cons.o diaglist.o difftrees.o difftreese.o distcalc.o distfunc.o domuscle.o dosp.o dpreglist.o edgelist.o enumopts.o enumtostr.o estring.o fasta.o fastclust.o fastdist.o fastdistjones.o fastdistkbit.o fastdistkmer.o fastdistmafft.o fastscorepath2.o finddiags.o glbalign.o glbaligndiag.o glbalignle.o glbalignsimple.o glbalignsp.o globals.o globalslinux.o globalswin32.o gonnet.o gotowt.o henikoffweight.o henikoffweightpb.o hydro.o intmath.o local.o main.o makerootmsa.o mpam200.o msa.o msa2.o msadistkimura.o msf.o objscore.o objscore2.o onexception.o options.o pam200mafft.o params.o phy.o phy2.o phy3.o phy4.o phyfromclust.o phyfromfile.o phytofile.o posgap.o profile.o profilefrommsa.o progalign.o progress.o progressivealign.o pwpath.o realigndiffs.o realigndiffse.o refine.o refinehoriz.o refinesubfams.o refinetree.o refinetreee.o refinevert.o savebest.o scorehistory.o scoremx.o seq.o seqvect.o setblosumweights.o setgscweights.o setnewhandler.o sw.o textfile.o threewaywt.o traceback.o tracebackopt.o tracebacksw.o treefrommsa.o typetostr.o upgma2.o usage.o validateids.o vtml2.o -lm -static strip muscle muscle-3.8.31.orig/msadistmafft.h0000644000175000017500000000102011352261600016213 0ustar kratzcharles#ifndef MSADistMAFFT_h #define MSADistMAFFT_h #include "msadist.h" #include extern double PctIdToMAFFTDist(double dPctId); class MSADistMAFFT : public MSADist { public: virtual double ComputeDist(const MSA &msa, unsigned uSeqIndex1, unsigned uSeqIndex2) { double dPctId = msa.GetPctIdentityPair(uSeqIndex1, uSeqIndex2); //if (dPctId < 0.05) // dPctId = 0.05; //double dDist = -log(dPctId); //return dDist; return PctIdToMAFFTDist(dPctId); } }; #endif // MSADistMAFFT_h muscle-3.8.31.orig/fastdist.cpp0000644000175000017500000000214511352261673015730 0ustar kratzcharles#include "muscle.h" #include "distfunc.h" #include "seqvect.h" void DistPWScoreDist(const SeqVect &v, DistFunc &DF); void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, DistFunc &DF) 
{ const unsigned uSeqCount = v.Length(); switch (DistMethod) { case DISTANCE_Kmer6_6: DistKmer6_6(v, DF); break; case DISTANCE_Kmer20_3: DistKmer20_3(v, DF); break; case DISTANCE_Kmer20_4: FastDistKmer(v, DF); break; case DISTANCE_Kbit20_3: DistKbit20_3(v, DF); break; case DISTANCE_Kmer4_6: DistKmer4_6(v, DF); break; case DISTANCE_PWKimura: DistPWKimura(v, DF); break; case DISTANCE_PWScoreDist: DistPWScoreDist(v, DF); break; default: Quit("DistUnaligned, unsupported distance method %d", DistMethod); } // const char **SeqNames = (const char **) malloc(uSeqCount*sizeof(char *)); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const Seq &s = *(v[uSeqIndex]); const char *ptrName = s.GetName(); unsigned uId = s.GetId(); DF.SetName(uSeqIndex, ptrName); DF.SetId(uSeqIndex, uId); } } muscle-3.8.31.orig/scorepp.cpp0000644000175000017500000000504211352261666015563 0ustar kratzcharles#include "muscle.h" #include "profile.h" char ConsensusChar(const ProfPos &PP) { unsigned uMostCommonLetter = 0; FCOUNT fcMostCommon = PP.m_fcCounts[0]; bool bMoreThanOneLetter = false; bool bAnyLetter = false; for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) { const FCOUNT fc = PP.m_fcCounts[uLetter]; if (fc > 0) { if (bAnyLetter) bMoreThanOneLetter = true; bAnyLetter = true; } if (fc > fcMostCommon) { uMostCommonLetter = uLetter; fcMostCommon = fc; } } if (!bAnyLetter) return '-'; char c = LetterToChar(uMostCommonLetter); if (bMoreThanOneLetter) return UnalignChar(c); return c; } SCORE ScoreProfPos2LA(const ProfPos &PPA, const ProfPos &PPB) { SCORE Score = 0; for (unsigned n = 0; n < 20; ++n) { const unsigned uLetter = PPA.m_uSortOrder[n]; const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; if (0 == fcLetter) break; Score += fcLetter*PPB.m_AAScores[uLetter]; } if (0 == Score) return -2.5; SCORE logScore = logf(Score); return (SCORE) ((logScore - g_scoreCenter)*(PPA.m_fOcc * PPB.m_fOcc)); } SCORE ScoreProfPos2NS(const ProfPos &PPA, const ProfPos &PPB) { SCORE Score = 
0; for (unsigned n = 0; n < 20; ++n) { const unsigned uLetter = PPA.m_uSortOrder[n]; const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; if (0 == fcLetter) break; Score += fcLetter*PPB.m_AAScores[uLetter]; } return Score - g_scoreCenter; } SCORE ScoreProfPos2SP(const ProfPos &PPA, const ProfPos &PPB) { SCORE Score = 0; for (unsigned n = 0; n < 20; ++n) { const unsigned uLetter = PPA.m_uSortOrder[n]; const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; if (0 == fcLetter) break; Score += fcLetter*PPB.m_AAScores[uLetter]; } return Score - g_scoreCenter; } SCORE ScoreProfPos2SPN(const ProfPos &PPA, const ProfPos &PPB) { SCORE Score = 0; for (unsigned n = 0; n < 4; ++n) { const unsigned uLetter = PPA.m_uSortOrder[n]; const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; if (0 == fcLetter) break; Score += fcLetter*PPB.m_AAScores[uLetter]; } return Score - g_scoreCenter; } SCORE ScoreProfPos2(const ProfPos &PPA, const ProfPos &PPB) { if (PPSCORE_SP == g_PPScore) return ScoreProfPos2NS(PPA, PPB); else if (PPSCORE_LE == g_PPScore) return ScoreProfPos2LA(PPA, PPB); else if (PPSCORE_SV == g_PPScore) return ScoreProfPos2SP(PPA, PPB); else if (PPSCORE_SPN == g_PPScore) return ScoreProfPos2SPN(PPA, PPB); Quit("Invalid g_PPScore"); return 0; } muscle-3.8.31.orig/domuscle.cpp0000644000175000017500000001576311366141374015734 0ustar kratzcharles#include "muscle.h" #include "textfile.h" #include "seqvect.h" #include "distfunc.h" #include "msa.h" #include "tree.h" #include "profile.h" #include "timing.h" static char g_strUseTreeWarning[] = "\n******** WARNING ****************\n" "\nYou specified the -usetree option.\n" "Note that a good evolutionary tree may NOT be a good\n" "guide tree for multiple alignment. For more details,\n" "please refer to the user guide. 
To disable this\n" "warning, use -usetree_nowarn .\n\n"; void DoMuscle() { SetOutputFileName(g_pstrOutFileName); SetInputFileName(g_pstrInFileName); SetMaxIters(g_uMaxIters); SetSeqWeightMethod(g_SeqWeight1); TextFile fileIn(g_pstrInFileName); SeqVect v; v.FromFASTAFile(fileIn); const unsigned uSeqCount = v.Length(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = v.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); v.FixAlpha(); PTR_SCOREMATRIX UserMatrix = 0; if (0 != g_pstrMatrixFileName) { const char *FileName = g_pstrMatrixFileName; const char *Path = getenv("MUSCLE_MXPATH"); if (Path != 0) { size_t n = strlen(Path) + 1 + strlen(FileName) + 1; char *NewFileName = new char[n]; sprintf(NewFileName, "%s/%s", Path, FileName); FileName = NewFileName; } TextFile File(FileName); UserMatrix = ReadMx(File); g_Alpha = ALPHA_Amino; g_PPScore = PPSCORE_SP; } SetPPScore(); if (0 != UserMatrix) g_ptrScoreMatrix = UserMatrix; unsigned uMaxL = 0; unsigned uTotL = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned L = v.GetSeq(uSeqIndex).Length(); uTotL += L; if (L > uMaxL) uMaxL = L; } SetIter(1); g_bDiags = g_bDiags1; SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); SetMuscleSeqVect(v); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) v.SetSeqId(uSeqIndex, uSeqIndex); if (0 == uSeqCount) Quit("Input file '%s' has no sequences", g_pstrInFileName); if (1 == uSeqCount) { TextFile fileOut(g_pstrOutFileName, true); v.ToFile(fileOut); return; } if (uSeqCount > 1) MHackStart(v); // First iteration Tree GuideTree; if (0 != g_pstrUseTreeFileName) { // Discourage users... 
if (!g_bUseTreeNoWarn) fprintf(stderr, "%s", g_strUseTreeWarning); // Read tree from file TextFile TreeFile(g_pstrUseTreeFileName); GuideTree.FromFile(TreeFile); // Make sure tree is rooted if (!GuideTree.IsRooted()) Quit("User tree must be rooted"); if (GuideTree.GetLeafCount() != uSeqCount) Quit("User tree does not match input sequences"); const unsigned uNodeCount = GuideTree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!GuideTree.IsLeaf(uNodeIndex)) continue; const char *LeafName = GuideTree.GetLeafName(uNodeIndex); unsigned uSeqIndex; bool SeqFound = v.FindName(LeafName, &uSeqIndex); if (!SeqFound) Quit("Label %s in tree does not match sequences", LeafName); unsigned uId = v.GetSeqIdFromName(LeafName); GuideTree.SetLeafId(uNodeIndex, uId); } } else TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1, g_pstrDistMxFileName1); const char *Tree1 = ValueOpt("Tree1"); if (0 != Tree1) { TextFile f(Tree1, true); GuideTree.ToFile(f); if (g_bClusterOnly) return; } SetMuscleTree(GuideTree); ValidateMuscleIds(GuideTree); MSA msa; ProgNode *ProgNodes = 0; if (g_bLow) ProgNodes = ProgressiveAlignE(v, GuideTree, msa); else ProgressiveAlign(v, GuideTree, msa); SetCurrentAlignment(msa); if (0 != g_pstrComputeWeightsFileName) { extern void OutWeights(const char *FileName, const MSA &msa); SetMSAWeightsMuscle(msa); OutWeights(g_pstrComputeWeightsFileName, msa); return; } ValidateMuscleIds(msa); if (1 == g_uMaxIters || 2 == uSeqCount) { //TextFile fileOut(g_pstrOutFileName, true); //MHackEnd(msa); //msa.ToFile(fileOut); MuscleOutput(msa); return; } if (0 == g_pstrUseTreeFileName) { g_bDiags = g_bDiags2; SetIter(2); if (g_bLow) { if (0 != g_uMaxTreeRefineIters) RefineTreeE(msa, v, GuideTree, ProgNodes); } else RefineTree(msa, GuideTree); const char *Tree2 = ValueOpt("Tree2"); if (0 != Tree2) { TextFile f(Tree2, true); GuideTree.ToFile(f); } } SetSeqWeightMethod(g_SeqWeight2); SetMuscleTree(GuideTree); if (g_bAnchors) 
RefineVert(msa, GuideTree, g_uMaxIters - 2); else RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false); #if 0 // Refining by subfamilies is disabled as it didn't give better // results. I tried doing this before and after RefineHoriz. // Should get back to this as it seems like this should work. RefineSubfams(msa, GuideTree, g_uMaxIters - 2); #endif ValidateMuscleIds(msa); ValidateMuscleIds(GuideTree); //TextFile fileOut(g_pstrOutFileName, true); //MHackEnd(msa); //msa.ToFile(fileOut); MuscleOutput(msa); } void Run() { SetStartTime(); Log("Started %s\n", GetTimeAsStr()); for (int i = 0; i < g_argc; ++i) Log("%s ", g_argv[i]); Log("\n"); #if TIMING TICKS t1 = GetClockTicks(); #endif if (g_bRefine) Refine(); else if (g_bRefineW) { extern void DoRefineW(); DoRefineW(); } else if (g_bProfDB) ProfDB(); else if (g_bSW) Local(); else if (0 != g_pstrSPFileName) DoSP(); else if (g_bProfile) Profile(); else if (g_bPPScore) PPScore(); else if (g_bPAS) ProgAlignSubFams(); else if (g_bMakeTree) { extern void DoMakeTree(); DoMakeTree(); } else DoMuscle(); #if TIMING extern TICKS g_ticksDP; extern TICKS g_ticksObjScore; TICKS t2 = GetClockTicks(); TICKS TotalTicks = t2 - t1; TICKS ticksOther = TotalTicks - g_ticksDP - g_ticksObjScore; double dSecs = TicksToSecs(TotalTicks); double PctDP = (double) g_ticksDP*100.0/(double) TotalTicks; double PctOS = (double) g_ticksObjScore*100.0/(double) TotalTicks; double PctOther = (double) ticksOther*100.0/(double) TotalTicks; Log(" Ticks Secs Pct\n"); Log(" ============ ======= =====\n"); Log("DP %12ld %7.2f %5.1f%%\n", (long) g_ticksDP, TicksToSecs(g_ticksDP), PctDP); Log("OS %12ld %7.2f %5.1f%%\n", (long) g_ticksObjScore, TicksToSecs(g_ticksObjScore), PctOS); Log("Other %12ld %7.2f %5.1f%%\n", (long) ticksOther, TicksToSecs(ticksOther), PctOther); Log("Total %12ld %7.2f 100.0%%\n", (long) TotalTicks, dSecs); #endif ListDiagSavings(); Log("Finished %s\n", GetTimeAsStr()); } 
muscle-3.8.31.orig/refinehoriz.cpp0000644000175000017500000001700011352261667016432 0ustar kratzcharles#include "muscle.h" #include "tree.h" #include "msa.h" #include "pwpath.h" #include "profile.h" #include "scorehistory.h" #include "objscore.h" unsigned g_uRefineHeightSubtree; unsigned g_uRefineHeightSubtreeTotal; #define TRACE 0 #define DIFFOBJSCORE 0 static bool TryRealign(MSA &msaIn, const Tree &tree, const unsigned Leaves1[], unsigned uCount1, const unsigned Leaves2[], unsigned uCount2, SCORE *ptrscoreBefore, SCORE *ptrscoreAfter, bool bLockLeft, bool bLockRight) { #if TRACE Log("TryRealign, msaIn=\n"); msaIn.LogMe(); #endif const unsigned uSeqCount = msaIn.GetSeqCount(); unsigned *Ids1 = new unsigned[uSeqCount]; unsigned *Ids2 = new unsigned[uSeqCount]; LeafIndexesToIds(tree, Leaves1, uCount1, Ids1); LeafIndexesToIds(tree, Leaves2, uCount2, Ids2); MSA msa1; MSA msa2; MSASubsetByIds(msaIn, Ids1, uCount1, msa1); MSASubsetByIds(msaIn, Ids2, uCount2, msa2); #if DEBUG ValidateMuscleIds(msa1); ValidateMuscleIds(msa2); #endif // Computing the objective score may be expensive for // large numbers of sequences. As a speed optimization, // we check whether the alignment changes. If it does // not change, there is no need to compute the objective // score. We test for the alignment changing by comparing // the Viterbi paths before and after re-aligning. 
PWPath pathBefore; pathBefore.FromMSAPair(msa1, msa2); DeleteGappedCols(msa1); DeleteGappedCols(msa2); if (0 == msa1.GetColCount() || 0 == msa2.GetColCount()) return false; MSA msaRealigned; PWPath pathAfter; AlignTwoMSAs(msa1, msa2, msaRealigned, pathAfter, bLockLeft, bLockRight); bool bAnyChanges = !pathAfter.Equal(pathBefore); unsigned uDiffCount1; unsigned uDiffCount2; static unsigned Edges1[10000]; static unsigned Edges2[10000]; DiffPaths(pathBefore, pathAfter, Edges1, &uDiffCount1, Edges2, &uDiffCount2); #if TRACE Log("TryRealign, msa1=\n"); msa1.LogMe(); Log("\nmsa2=\n"); msa2.LogMe(); Log("\nRealigned (changes %s)=\n", bAnyChanges ? "TRUE" : "FALSE"); msaRealigned.LogMe(); #endif if (!bAnyChanges) { *ptrscoreBefore = 0; *ptrscoreAfter = 0; return false; } SetMSAWeightsMuscle(msaIn); SetMSAWeightsMuscle(msaRealigned); #if DIFFOBJSCORE const SCORE scoreDiff = DiffObjScore(msaIn, pathBefore, Edges1, uDiffCount1, msaRealigned, pathAfter, Edges2, uDiffCount2); bool bAccept = (scoreDiff > 0); *ptrscoreBefore = 0; *ptrscoreAfter = scoreDiff; //const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2); //const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2); //Log("Diff = %.3g %.3g\n", scoreDiff, scoreAfter - scoreBefore); #else const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2); const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2); bool bAccept = (scoreAfter > scoreBefore); #if TRACE Log("Score %g -> %g Accept %s\n", scoreBefore, scoreAfter, bAccept ? 
"TRUE" : "FALSE"); #endif *ptrscoreBefore = scoreBefore; *ptrscoreAfter = scoreAfter; #endif if (bAccept) msaIn.Copy(msaRealigned); delete[] Ids1; delete[] Ids2; return bAccept; } static void RefineHeightParts(MSA &msaIn, const Tree &tree, const unsigned InternalNodeIndexes[], bool bReversed, bool bRight, unsigned uIter, ScoreHistory &History, bool *ptrbAnyChanges, bool *ptrbOscillating, bool bLockLeft, bool bLockRight) { *ptrbOscillating = false; const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uInternalNodeCount = uSeqCount - 1; unsigned *Leaves1 = new unsigned[uSeqCount]; unsigned *Leaves2 = new unsigned[uSeqCount]; const unsigned uRootNodeIndex = tree.GetRootNodeIndex(); bool bAnyAccepted = false; for (unsigned i = 0; i < uInternalNodeCount; ++i) { const unsigned uInternalNodeIndex = InternalNodeIndexes[i]; unsigned uNeighborNodeIndex; if (tree.IsRoot(uInternalNodeIndex) && !bRight) continue; else if (bRight) uNeighborNodeIndex = tree.GetRight(uInternalNodeIndex); else uNeighborNodeIndex = tree.GetLeft(uInternalNodeIndex); g_uTreeSplitNode1 = uInternalNodeIndex; g_uTreeSplitNode2 = uNeighborNodeIndex; unsigned uCount1; unsigned uCount2; GetLeaves(tree, uNeighborNodeIndex, Leaves1, &uCount1); GetLeavesExcluding(tree, uRootNodeIndex, uNeighborNodeIndex, Leaves2, &uCount2); #if TRACE Log("\nRefineHeightParts node %u\n", uInternalNodeIndex); Log("Group1="); for (unsigned n = 0; n < uCount1; ++n) Log(" %u(%s)", Leaves1[n], tree.GetName(Leaves1[n])); Log("\n"); Log("Group2="); for (unsigned n = 0; n < uCount2; ++n) Log(" %u(%s)", Leaves2[n], tree.GetName(Leaves2[n])); Log("\n"); #endif SCORE scoreBefore; SCORE scoreAfter; bool bAccepted = TryRealign(msaIn, tree, Leaves1, uCount1, Leaves2, uCount2, &scoreBefore, &scoreAfter, bLockLeft, bLockRight); SetCurrentAlignment(msaIn); ++g_uRefineHeightSubtree; Progress(g_uRefineHeightSubtree, g_uRefineHeightSubtreeTotal); #if TRACE if (uIter > 0) Log("Before %g %g\n", scoreBefore, History.GetScore(uIter - 1, 
uInternalNodeIndex, bReversed, bRight)); #endif SCORE scoreMax = scoreAfter > scoreBefore? scoreAfter : scoreBefore; bool bRepeated = History.SetScore(uIter, uInternalNodeIndex, bRight, scoreMax); if (bRepeated) { *ptrbOscillating = true; break; } if (bAccepted) bAnyAccepted = true; } delete[] Leaves1; delete[] Leaves2; *ptrbAnyChanges = bAnyAccepted; } // Return true if any changes made bool RefineHoriz(MSA &msaIn, const Tree &tree, unsigned uIters, bool bLockLeft, bool bLockRight) { #if TRACE tree.LogMe(); #endif if (!tree.IsRooted()) Quit("RefineHeight: requires rooted tree"); const unsigned uSeqCount = msaIn.GetSeqCount(); if (uSeqCount < 3) return false; const unsigned uInternalNodeCount = uSeqCount - 1; unsigned *InternalNodeIndexes = new unsigned[uInternalNodeCount]; unsigned *InternalNodeIndexesR = new unsigned[uInternalNodeCount]; GetInternalNodesInHeightOrder(tree, InternalNodeIndexes); ScoreHistory History(uIters, 2*uSeqCount - 1); bool bAnyChangesAnyIter = false; for (unsigned n = 0; n < uInternalNodeCount; ++n) InternalNodeIndexesR[uInternalNodeCount - 1 - n] = InternalNodeIndexes[n]; for (unsigned uIter = 0; uIter < uIters; ++uIter) { bool bAnyChangesThisIter = false; IncIter(); SetProgressDesc("Refine biparts"); g_uRefineHeightSubtree = 0; g_uRefineHeightSubtreeTotal = uInternalNodeCount*2 - 1; bool bReverse = (uIter%2 != 0); unsigned *Internals; if (bReverse) Internals = InternalNodeIndexesR; else Internals = InternalNodeIndexes; bool bOscillating; for (unsigned i = 0; i < 2; ++i) { bool bAnyChanges = false; bool bRight; switch (i) { case 0: bRight = true; break; case 1: bRight = false; break; default: Quit("RefineHeight default case"); } RefineHeightParts(msaIn, tree, Internals, bReverse, bRight, uIter, History, &bAnyChanges, &bOscillating, bLockLeft, bLockRight); if (bOscillating) { ProgressStepsDone(); goto Osc; } if (bAnyChanges) { bAnyChangesThisIter = true; bAnyChangesAnyIter = true; } } ProgressStepsDone(); if (bOscillating) break; if 
(!bAnyChangesThisIter) break; } Osc: delete[] InternalNodeIndexes; delete[] InternalNodeIndexesR; return bAnyChangesAnyIter; } muscle-3.8.31.orig/mhack.cpp0000644000175000017500000000243011352261667015172 0ustar kratzcharles#include "muscle.h" #include "seqvect.h" #include "msa.h" /*** Methionine hack. Most proteins start with M. This results in odd-looking alignments with the terminal Ms aligned followed immediately by gaps. Hack this by treating terminal M like X. ***/ static bool *M; void MHackStart(SeqVect &v) { if (ALPHA_Amino != g_Alpha) return; const unsigned uSeqCount = v.Length(); M = new bool[uSeqCount]; memset(M, 0, uSeqCount*sizeof(bool)); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = v.GetSeq(uSeqIndex); if (0 == s.Length()) continue; unsigned uId = s.GetId(); if (s[0] == 'M' || s[0] == 'm') { M[uId] = true; s[0] = 'X'; } } } void MHackEnd(MSA &msa) { if (ALPHA_Amino != g_Alpha) return; if (0 == M) return; const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa.GetSeqId(uSeqIndex); if (M[uId]) { for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { if (!msa.IsGap(uSeqIndex, uColIndex)) { msa.SetChar(uSeqIndex, uColIndex, 'M'); break; } } } } delete[] M; M = 0; } muscle-3.8.31.orig/glbalign.cpp0000644000175000017500000000730111352261667015670 0ustar kratzcharles#include "muscle.h" #include "pwpath.h" #include "timing.h" #include "textfile.h" #include "msa.h" #include "profile.h" #if !VER_3_52 #define COMPARE_SIMPLE 0 #if TIMING TICKS g_ticksDP = 0; #endif #if 1 extern bool g_bKeepSimpleDP; SCORE NWSmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE NWDASmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE NWDASimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, 
PWPath &Path); SCORE NWDASimple2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { return GlobalAlign(PA, uLengthA, PB, uLengthB, Path); } #if COMPARE_SIMPLE SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { #if TIMING TICKS t1 = GetClockTicks(); #endif g_bKeepSimpleDP = true; PWPath SimplePath; GlobalAlignSimple(PA, uLengthA, PB, uLengthB, SimplePath); SCORE Score = NWSmall(PA, uLengthA, PB, uLengthB, Path); if (!Path.Equal(SimplePath)) { Log("Simple:\n"); SimplePath.LogMe(); Log("Small:\n"); Path.LogMe(); Quit("Paths differ"); } #if TIMING TICKS t2 = GetClockTicks(); g_ticksDP += (t2 - t1); #endif return Score; } #else // COMPARE_SIMPLE SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { #if TIMING TICKS t1 = GetClockTicks(); #endif SCORE Score = NWSmall(PA, uLengthA, PB, uLengthB, Path); #if TIMING TICKS t2 = GetClockTicks(); g_ticksDP += (t2 - t1); #endif return Score; } #endif #else // 1 static void AllInserts(PWPath &Path, unsigned uLengthB) { Path.Clear(); PWEdge Edge; Edge.cType = 'I'; Edge.uPrefixLengthA = 0; for (unsigned uPrefixLengthB = 1; uPrefixLengthB <= uLengthB; ++uPrefixLengthB) { Edge.uPrefixLengthB = uPrefixLengthB; Path.AppendEdge(Edge); } } static void AllDeletes(PWPath &Path, unsigned uLengthA) { Path.Clear(); PWEdge Edge; Edge.cType = 'D'; Edge.uPrefixLengthB = 0; for (unsigned uPrefixLengthA = 1; uPrefixLengthA <= uLengthA; ++uPrefixLengthA) { Edge.uPrefixLengthA = uPrefixLengthA; Path.AppendEdge(Edge); } } SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { #if TIMING TICKS t1 = GetClockTicks(); 
#endif if (0 == uLengthA) { AllInserts(Path, uLengthB); return 0; } else if (0 == uLengthB) { AllDeletes(Path, uLengthA); return 0; } SCORE Score = 0; if (g_bDiags) Score = GlobalAlignDiags(PA, uLengthA, PB, uLengthB, Path); else Score = GlobalAlignNoDiags(PA, uLengthA, PB, uLengthB, Path); #if TIMING TICKS t2 = GetClockTicks(); g_ticksDP += (t2 - t1); #endif return Score; } SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { if (g_bDimer) return GlobalAlignDimer(PA, uLengthA, PB, uLengthB, Path); switch (g_PPScore) { case PPSCORE_LE: return GlobalAlignLE(PA, uLengthA, PB, uLengthB, Path); case PPSCORE_SP: case PPSCORE_SV: return GlobalAlignSP(PA, uLengthA, PB, uLengthB, Path); case PPSCORE_SPN: return GlobalAlignSPN(PA, uLengthA, PB, uLengthB, Path); } Quit("Invalid PP score (GlobalAlignNoDiags)"); return 0; } #endif #endif // !VER_3_52 muscle-3.8.31.orig/estring.cpp0000644000175000017500000003206711352261667015573 0ustar kratzcharles#include "muscle.h" #include "pwpath.h" #include "estring.h" #include "seq.h" #include "msa.h" /*** An "estring" is an edit string that operates on a sequence. An estring is represented as a vector of integers. It is interpreted in order of increasing suffix. A positive value n means copy n letters. A negative value -n means insert n indels. Zero marks the end of the vector. Consecutive entries must have opposite sign, i.e. the shortest possible representation must be used. A "tpair" is a traceback path for a pairwise alignment represented as two estrings, one for each sequence. 
***/ #define c2(c,d) (((unsigned char) c) << 8 | (unsigned char) d) unsigned LengthEstring(const short es[]) { unsigned i = 0; while (*es++ != 0) ++i; return i; } short *EstringNewCopy(const short es[]) { unsigned n = LengthEstring(es) + 1; short *esNew = new short[n]; memcpy(esNew, es, n*sizeof(short)); return esNew; } void LogEstring(const short es[]) { Log("<"); for (unsigned i = 0; es[i] != 0; ++i) { if (i > 0) Log(" "); Log("%d", es[i]); } Log(">"); } static bool EstringsEq(const short es1[], const short es2[]) { for (;;) { if (*es1 != *es2) return false; if (0 == *es1) break; ++es1; ++es2; } return true; } static void EstringCounts(const short es[], unsigned *ptruSymbols, unsigned *ptruIndels) { unsigned uSymbols = 0; unsigned uIndels = 0; for (unsigned i = 0; es[i] != 0; ++i) { short n = es[i]; if (n > 0) uSymbols += n; else if (n < 0) uIndels += -n; } *ptruSymbols = uSymbols; *ptruIndels = uIndels; } static char *EstringOp(const short es[], const char s[]) { unsigned uSymbols; unsigned uIndels; EstringCounts(es, &uSymbols, &uIndels); assert((unsigned) strlen(s) == uSymbols); char *sout = new char[uSymbols + uIndels + 1]; char *psout = sout; for (;;) { int n = *es++; if (0 == n) break; if (n > 0) for (int i = 0; i < n; ++i) *psout++ = *s++; else for (int i = 0; i < -n; ++i) *psout++ = '-'; } assert(0 == *s); *psout = 0; return sout; } void EstringOp(const short es[], const Seq &sIn, Seq &sOut) { #if DEBUG unsigned uSymbols; unsigned uIndels; EstringCounts(es, &uSymbols, &uIndels); assert(sIn.Length() == uSymbols); #endif sOut.Clear(); sOut.SetName(sIn.GetName()); int p = 0; for (;;) { int n = *es++; if (0 == n) break; if (n > 0) for (int i = 0; i < n; ++i) { char c = sIn[p++]; sOut.push_back(c); } else for (int i = 0; i < -n; ++i) sOut.push_back('-'); } } unsigned EstringOp(const short es[], const Seq &sIn, MSA &a) { unsigned uSymbols; unsigned uIndels; EstringCounts(es, &uSymbols, &uIndels); assert(sIn.Length() == uSymbols); unsigned uColCount = uSymbols + 
uIndels; a.Clear(); a.SetSize(1, uColCount); a.SetSeqName(0, sIn.GetName()); a.SetSeqId(0, sIn.GetId()); unsigned p = 0; unsigned uColIndex = 0; for (;;) { int n = *es++; if (0 == n) break; if (n > 0) for (int i = 0; i < n; ++i) { char c = sIn[p++]; a.SetChar(0, uColIndex++, c); } else for (int i = 0; i < -n; ++i) a.SetChar(0, uColIndex++, '-'); } assert(uColIndex == uColCount); return uColCount; } void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB) { // First pass to determine size of estrings esA and esB const unsigned uEdgeCount = Path.GetEdgeCount(); if (0 == uEdgeCount) { short *esA = new short[1]; short *esB = new short[1]; esA[0] = 0; esB[0] = 0; *ptresA = esA; *ptresB = esB; return; } unsigned iLengthA = 1; unsigned iLengthB = 1; const char cFirstEdgeType = Path.GetEdge(0).cType; char cPrevEdgeType = cFirstEdgeType; for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); char cEdgeType = Edge.cType; switch (c2(cPrevEdgeType, cEdgeType)) { case c2('M', 'M'): case c2('D', 'D'): case c2('I', 'I'): break; case c2('D', 'M'): case c2('M', 'D'): ++iLengthB; break; case c2('I', 'M'): case c2('M', 'I'): ++iLengthA; break; case c2('I', 'D'): case c2('D', 'I'): ++iLengthB; ++iLengthA; break; default: assert(false); } cPrevEdgeType = cEdgeType; } // Pass2 for seq A { short *esA = new short[iLengthA+1]; unsigned iA = 0; switch (Path.GetEdge(0).cType) { case 'M': case 'D': esA[0] = 1; break; case 'I': esA[0] = -1; break; default: assert(false); } char cPrevEdgeType = cFirstEdgeType; for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); char cEdgeType = Edge.cType; switch (c2(cPrevEdgeType, cEdgeType)) { case c2('M', 'M'): case c2('D', 'D'): case c2('D', 'M'): case c2('M', 'D'): ++(esA[iA]); break; case c2('I', 'D'): case c2('I', 'M'): ++iA; esA[iA] = 1; break; case c2('M', 'I'): case c2('D', 'I'): ++iA; esA[iA] = -1; break; 
case c2('I', 'I'): --(esA[iA]); break; default: assert(false); } cPrevEdgeType = cEdgeType; } assert(iA == iLengthA - 1); esA[iLengthA] = 0; *ptresA = esA; } { // Pass2 for seq B short *esB = new short[iLengthB+1]; unsigned iB = 0; switch (Path.GetEdge(0).cType) { case 'M': case 'I': esB[0] = 1; break; case 'D': esB[0] = -1; break; default: assert(false); } char cPrevEdgeType = cFirstEdgeType; for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); char cEdgeType = Edge.cType; switch (c2(cPrevEdgeType, cEdgeType)) { case c2('M', 'M'): case c2('I', 'I'): case c2('I', 'M'): case c2('M', 'I'): ++(esB[iB]); break; case c2('D', 'I'): case c2('D', 'M'): ++iB; esB[iB] = 1; break; case c2('M', 'D'): case c2('I', 'D'): ++iB; esB[iB] = -1; break; case c2('D', 'D'): --(esB[iB]); break; default: assert(false); } cPrevEdgeType = cEdgeType; } assert(iB == iLengthB - 1); esB[iLengthB] = 0; *ptresB = esB; } #if DEBUG { const PWEdge &LastEdge = Path.GetEdge(uEdgeCount - 1); unsigned uSymbols; unsigned uIndels; EstringCounts(*ptresA, &uSymbols, &uIndels); assert(uSymbols == LastEdge.uPrefixLengthA); assert(uSymbols + uIndels == uEdgeCount); EstringCounts(*ptresB, &uSymbols, &uIndels); assert(uSymbols == LastEdge.uPrefixLengthB); assert(uSymbols + uIndels == uEdgeCount); PWPath TmpPath; EstringsToPath(*ptresA, *ptresB, TmpPath); TmpPath.AssertEqual(Path); } #endif } void EstringsToPath(const short esA[], const short esB[], PWPath &Path) { Path.Clear(); unsigned iA = 0; unsigned iB = 0; int nA = esA[iA++]; int nB = esB[iB++]; unsigned uPrefixLengthA = 0; unsigned uPrefixLengthB = 0; for (;;) { char cType; if (nA > 0) { if (nB > 0) { cType = 'M'; --nA; --nB; } else if (nB < 0) { cType = 'D'; --nA; ++nB; } else assert(false); } else if (nA < 0) { if (nB > 0) { cType = 'I'; ++nA; --nB; } else assert(false); } else assert(false); switch (cType) { case 'M': ++uPrefixLengthA; ++uPrefixLengthB; break; case 'D': ++uPrefixLengthA; 
break; case 'I': ++uPrefixLengthB; break; } PWEdge Edge; Edge.cType = cType; Edge.uPrefixLengthA = uPrefixLengthA; Edge.uPrefixLengthB = uPrefixLengthB; Path.AppendEdge(Edge); if (nA == 0) { if (0 == esA[iA]) { assert(0 == esB[iB]); break; } nA = esA[iA++]; } if (nB == 0) nB = esB[iB++]; } } /*** Multiply two estrings to make a third estring. The product of two estrings e1*e2 is defined to be the estring that produces the same result as applying e1 then e2. Multiplication is not commutative. In fact, the reversed order is undefined unless both estrings consist of a single, identical, positive entry. A primary motivation for using estrings is that multiplication is very fast, reducing the time needed to construct the root alignment. Example <-1,3>(XXX) = -XXX <2,-1,2>(-XXX) = -X-XX Therefore, <-1,3>*<2,-1,2> = <-1,1,-1,2> ***/ static bool CanMultiplyEstrings(const short es1[], const short es2[]) { unsigned uSymbols1; unsigned uSymbols2; unsigned uIndels1; unsigned uIndels2; EstringCounts(es1, &uSymbols1, &uIndels1); EstringCounts(es2, &uSymbols2, &uIndels2); return uSymbols1 + uIndels1 == uSymbols2; } static inline void AppendGaps(short esp[], int &ip, int n) { if (-1 == ip) esp[++ip] = n; else if (esp[ip] < 0) esp[ip] += n; else esp[++ip] = n; } static inline void AppendSymbols(short esp[], int &ip, int n) { if (-1 == ip) esp[++ip] = n; else if (esp[ip] > 0) esp[ip] += n; else esp[++ip] = n; } void MulEstrings(const short es1[], const short es2[], short esp[]) { assert(CanMultiplyEstrings(es1, es2)); unsigned i1 = 0; int ip = -1; int n1 = es1[i1++]; for (unsigned i2 = 0; ; ++i2) { int n2 = es2[i2]; if (0 == n2) break; if (n2 > 0) { for (;;) { if (n1 < 0) { if (n2 > -n1) { AppendGaps(esp, ip, n1); n2 += n1; n1 = es1[i1++]; } else if (n2 == -n1) { AppendGaps(esp, ip, n1); n1 = es1[i1++]; break; } else { assert(n2 < -n1); AppendGaps(esp, ip, -n2); n1 += n2; break; } } else { assert(n1 > 0); if (n2 > n1) { AppendSymbols(esp, ip, n1); n2 -= n1; n1 = es1[i1++]; } else if 
(n2 == n1) { AppendSymbols(esp, ip, n1); n1 = es1[i1++]; break; } else { assert(n2 < n1); AppendSymbols(esp, ip, n2); n1 -= n2; break; } } } } else { assert(n2 < 0); AppendGaps(esp, ip, n2); } } esp[++ip] = 0; #if DEBUG { int MaxLen = (int) (LengthEstring(es1) + LengthEstring(es2) + 1); assert(ip < MaxLen); if (ip >= 2) for (int i = 0; i < ip - 2; ++i) { if (!(esp[i] > 0 && esp[i+1] < 0 || esp[i] < 0 && esp[i+1] > 0)) { Log("Bad result of MulEstring: "); LogEstring(esp); Quit("Assert failed (alternating signs)"); } } unsigned uSymbols1; unsigned uSymbols2; unsigned uSymbolsp; unsigned uIndels1; unsigned uIndels2; unsigned uIndelsp; EstringCounts(es1, &uSymbols1, &uIndels1); EstringCounts(es2, &uSymbols2, &uIndels2); EstringCounts(esp, &uSymbolsp, &uIndelsp); if (uSymbols1 + uIndels1 != uSymbols2) { Log("Bad result of MulEstring: "); LogEstring(esp); Quit("Assert failed (counts1 %u %u %u)", uSymbols1, uIndels1, uSymbols2); } } #endif } static void test(const short es1[], const short es2[], const short esa[]) { unsigned uSymbols1; unsigned uSymbols2; unsigned uIndels1; unsigned uIndels2; EstringCounts(es1, &uSymbols1, &uIndels1); EstringCounts(es2, &uSymbols2, &uIndels2); char s[4096]; memset(s, 'X', sizeof(s)); s[uSymbols1] = 0; char *s1 = EstringOp(es1, s); char *s12 = EstringOp(es2, s1); memset(s, 'X', sizeof(s)); s[uSymbols2] = 0; char *s2 = EstringOp(es2, s); Log("%s * %s = %s\n", s1, s2, s12); LogEstring(es1); Log(" * "); LogEstring(es2); Log(" = "); LogEstring(esa); Log("\n"); short esp[4096]; MulEstrings(es1, es2, esp); LogEstring(esp); if (!EstringsEq(esp, esa)) Log(" *ERROR* "); Log("\n"); memset(s, 'X', sizeof(s)); s[uSymbols1] = 0; char *sp = EstringOp(esp, s); Log("%s\n", sp); Log("\n==========\n\n"); } void TestEstrings() { SetListFileName("c:\\tmp\\muscle.log", false); //{ //short es1[] = { -1, 1, -1, 0 }; //short es2[] = { 1, -1, 2, 0 }; //short esa[] = { -2, 1, -1, 0 }; //test(es1, es2, esa); //} //{ //short es1[] = { 2, -1, 2, 0 }; //short es2[] = { 
1, -1, 3, -1, 1, 0 }; //short esa[] = { 1, -1, 1, -1, 1, -1, 1, 0 }; //test(es1, es2, esa); //} //{ //short es1[] = { -1, 3, 0 }; //short es2[] = { 2, -1, 2, 0 }; //short esa[] = { -1, 1, -1, 2, 0 }; //test(es1, es2, esa); //} //{ //short es1[] = { -1, 1, -1, 1, 0}; //short es2[] = { 4, 0 }; //short esa[] = { -1, 1, -1, 1, 0}; //test(es1, es2, esa); //} //{ //short es1[] = { 1, -1, 1, -1, 0}; //short es2[] = { 4, 0 }; //short esa[] = { 1, -1, 1, -1, 0}; //test(es1, es2, esa); //} //{ //short es1[] = { 1, -1, 1, -1, 0}; //short es2[] = { -1, 4, -1, 0 }; //short esa[] = { -1, 1, -1, 1, -2, 0}; //test(es1, es2, esa); //} { short es1[] = { 106, -77, 56, -2, 155, -3, 123, -2, 0}; short es2[] = { 50, -36, 34, -3, 12, -6, 1, -6, 18, -17, 60, -5, 349, -56, 0 }; short esa[] = { 0 }; test(es1, es2, esa); } exit(0); } muscle-3.8.31.orig/msadistkimura.cpp0000644000175000017500000000675611352261667017003 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include // "Standard" NJ distance: the Kimura measure. // This is defined to be: // // log_e(1 - p - p*p/5) // // where p is the fraction of residues that differ, i.e.: // // p = (1 - fractional_conservation) // // This measure is infinite for p = 0.8541 and is considered // unreliable for p >= 0.75 (according to the ClustalW docs). // ClustalW uses a table lookup for values > 0.75. // The following table was copied from the ClustalW file dayhoff.h. 
static int dayhoff_pams[]={ 195, /* 75.0% observed d; 195 PAMs estimated = 195% estimated d */ 196, /* 75.1% observed d; 196 PAMs estimated */ 197, 198, 199, 200, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 226, 227, 228, 229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 240, 241, 243, 244, 245, 246, 248, 249, 250, /* 250 PAMs = 80.3% observed d */ 252, 253, 254, 255, 257, 258, 260, 261, 262, 264, 265, 267, 268, 270, 271, 273, 274, 276, 277, 279, 281, 282, 284, 285, 287, 289, 291, 292, 294, 296, 298, 299, 301, 303, 305, 307, 309, 311, 313, 315, 317, 319, 321, 323, 325, 328, 330, 332, 335, 337, 339, 342, 344, 347, 349, 352, 354, 357, 360, 362, 365, 368, 371, 374, 377, 380, 383, 386, 389, 393, 396, 399, 403, 407, 410, 414, 418, 422, 426, 430, 434, 438, 442, 447, 451, 456, 461, 466, 471, 476, 482, 487, 493, 498, 504, 511, 517, 524, 531, 538, 545, 553, 560, 569, 577, 586, 595, 605, 615, 626, 637, 649, 661, 675, 688, 703, 719, 736, 754, 775, 796, 819, 845, 874, 907, 945, /* 92.9% observed; 945 PAMs */ 988 /* 93.0% observed; 988 PAMs */ }; static int iTableEntries = sizeof(dayhoff_pams)/sizeof(dayhoff_pams[0]); double KimuraDist(double dPctId) { double p = 1 - dPctId; // Typical case: use Kimura's empirical formula if (p < 0.75) return -log(1 - p - (p*p)/5); // Per ClustalW, return 10.0 for anything over 93% if (p > 0.93) return 10.0; // If p >= 0.75, use table lookup assert(p <= 1 && p >= 0.75); // Thanks for Michael Hoel for pointing out a bug // in the table index calculation in versions <= 3.52. 
int iTableIndex = (int) ((p - 0.75)*1000 + 0.5); if (iTableIndex < 0 || iTableIndex >= iTableEntries) Quit("Internal error in MSADistKimura::ComputeDist"); return dayhoff_pams[iTableIndex] / 100.0; } //double MSADistKimura::ComputeDist(const MSA &msa, unsigned uSeqIndex1, // unsigned uSeqIndex2) // { // double dPctId = msa.GetPctIdentityPair(uSeqIndex1, uSeqIndex2); // return KimuraDist(dPctId); // } double KimuraDistToPctId(double dKimuraDist) { // Solve quadratic equation const double a = 0.2; const double b = 1; const double c = 1.0 - exp(-dKimuraDist); const double p = (-b + sqrt(b*b + 4*a*c))/(2*a); return 1 - p; } double PctIdToHeightKimura(double dPctId) { return KimuraDist(dPctId); } muscle-3.8.31.orig/fastscorepath2.cpp0000644000175000017500000000673411352261667017052 0ustar kratzcharles#include "muscle.h" #include "profile.h" #include "pwpath.h" SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const PWPath &Path) { const unsigned uEdgeCount = Path.GetEdgeCount(); Log("Edge SS PLA PLB Match Gap Total\n"); Log("---- -- --- --- ----- --- -----\n"); char cType = 'S'; SCORE scoreTotal = 0; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); const char cPrevType = cType; cType = Edge.cType; const unsigned uPrefixLengthA = Edge.uPrefixLengthA; const unsigned uPrefixLengthB = Edge.uPrefixLengthB; bool bGap = false; bool bMatch = false; SCORE scoreGap = 0; SCORE scoreMatch = 0; switch (cType) { case 'M': { if (0 == uPrefixLengthA || 0 == uPrefixLengthB) Quit("FastScorePath2, M zero length"); const ProfPos &PPA = PA[uPrefixLengthA - 1]; const ProfPos &PPB = PB[uPrefixLengthB - 1]; bMatch = true; scoreMatch = ScoreProfPos2(PPA, PPB); if ('D' == cPrevType) { bGap = true; assert(uPrefixLengthA > 1); scoreGap = PA[uPrefixLengthA-2].m_scoreGapClose; } else if ('I' == cPrevType) { bGap = true; assert(uPrefixLengthB > 1); scoreGap = 
PB[uPrefixLengthB-2].m_scoreGapClose; } break; } case 'D': { if (0 == uPrefixLengthA) Quit("FastScorePath2, D zero length"); const ProfPos &PPA = PA[uPrefixLengthA - 1]; bGap = true; switch (cPrevType) { case 'S': scoreGap = PPA.m_scoreGapOpen; break; case 'M': scoreGap = PPA.m_scoreGapOpen; break; case 'D': // scoreGap = g_scoreGapExtend; scoreGap = 0; break; case 'I': Quit("FastScorePath2 DI"); } break; } case 'I': { if (0 == uPrefixLengthB) Quit("FastScorePath2, I zero length"); const ProfPos &PPB = PB[uPrefixLengthB - 1]; bGap = true; switch (cPrevType) { case 'S': scoreGap = PPB.m_scoreGapOpen; break; case 'M': scoreGap = PPB.m_scoreGapOpen; break; case 'I': scoreGap = 0; // scoreGap = g_scoreGapExtend; break; case 'D': Quit("FastScorePath2 DI"); } break; } case 'U': { Quit("FastScorePath2 U"); } default: Quit("FastScorePath2: invalid type %c", cType); } Log("%4u %c%c %4u %4u ", uEdgeIndex, cPrevType, cType, uPrefixLengthA, uPrefixLengthB); if (bMatch) Log("%7.1f ", scoreMatch); else Log(" "); if (bGap) Log("%7.1f ", scoreGap); else Log(" "); SCORE scoreEdge = scoreMatch + scoreGap; scoreTotal += scoreEdge; Log("%7.1f %7.1f", scoreEdge, scoreTotal); Log("\n"); } SCORE scoreGap = 0; // if (!g_bTermGapsHalf) switch (cType) { case 'M': scoreGap = 0; break; case 'D': { const ProfPos &LastPPA = PA[uLengthA - 1]; scoreGap = LastPPA.m_scoreGapClose; break; } case 'I': { const ProfPos &LastPPB = PB[uLengthB - 1]; scoreGap = LastPPB.m_scoreGapClose; break; } case 'U': Quit("Unaligned regions not supported"); case 'S': break; default: Quit("Invalid type %c", cType); } Log(" %cE %4u %4u %7.1f\n", cType, uLengthA, uLengthB, scoreGap); scoreTotal += scoreGap; Log("Total = %g\n", scoreTotal); return scoreTotal; } muscle-3.8.31.orig/fasta.cpp0000644000175000017500000000235411352261636015206 0ustar kratzcharles#include "muscle.h" #include #include #include "msa.h" #include "textfile.h" const unsigned FASTA_BLOCK = 60; void MSA::FromFASTAFile(TextFile &File) { Clear(); FILE *f 
= File.GetStdioFile(); unsigned uSeqCount = 0; unsigned uColCount = uInsane; for (;;) { char *Label; unsigned uSeqLength; char *SeqData = GetFastaSeq(f, &uSeqLength, &Label, false); if (0 == SeqData) break; AppendSeq(SeqData, uSeqLength, Label); } } void MSA::ToFASTAFile(TextFile &File) const { const unsigned uColCount = GetColCount(); assert(uColCount > 0); const unsigned uLinesPerSeq = (GetColCount() - 1)/FASTA_BLOCK + 1; const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { File.PutString(">"); File.PutString(GetSeqName(uSeqIndex)); File.PutString("\n"); unsigned n = 0; for (unsigned uLine = 0; uLine < uLinesPerSeq; ++uLine) { unsigned uLetters = uColCount - uLine*FASTA_BLOCK; if (uLetters > FASTA_BLOCK) uLetters = FASTA_BLOCK; for (unsigned i = 0; i < uLetters; ++i) { char c = GetChar(uSeqIndex, n); File.PutChar(c); ++n; } File.PutChar('\n'); } } } muscle-3.8.31.orig/glbalign352.cpp0000644000175000017500000000226411352261666016124 0ustar kratzcharles#include "muscle.h" #include "pwpath.h" #include "timing.h" #include "textfile.h" #include "msa.h" #include "profile.h" #if VER_3_52 #if TIMING TICKS g_ticksDP = 0; #endif SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { #if TIMING TICKS t1 = GetClockTicks(); #endif SCORE Score = 0; if (g_bDiags) Score = GlobalAlignDiags(PA, uLengthA, PB, uLengthB, Path); else Score = GlobalAlignNoDiags(PA, uLengthA, PB, uLengthB, Path); #if TIMING TICKS t2 = GetClockTicks(); g_ticksDP += (t2 - t1); #endif return Score; } SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { if (g_bDimer) return GlobalAlignDimer(PA, uLengthA, PB, uLengthB, Path); switch (g_PPScore) { case PPSCORE_LE: return GlobalAlignLE(PA, uLengthA, PB, uLengthB, Path); case PPSCORE_SP: case PPSCORE_SV: return GlobalAlignSP(PA, uLengthA, PB, uLengthB, Path); case PPSCORE_SPN: return 
GlobalAlignSPN(PA, uLengthA, PB, uLengthB, Path); } Quit("Invalid PP score (GlobalAlignNoDiags)"); return 0; } #endif // VER_3_52 muscle-3.8.31.orig/sptest.cpp0000644000175000017500000000740511352261621015426 0ustar kratzcharles#include "muscle.h" #include "objscore.h" #include "msa.h" #include "textfile.h" #include "pwpath.h" const unsigned INDELS = 1; static void GetPos(const char Str[], unsigned L, int *pi1, int *pi2) { int i1; for (;;) { i1 = rand()%(L-2) + 1; if (Str[i1] == 'M') break; } int i2; for (;;) { i2 = rand()%(L-2) + 1; if (i1 != i2 && Str[i2] == 'M') break; } *pi1 = i1; *pi2 = i2; } static void MakePath(unsigned uSeqLength, unsigned uIndelCount, char Str[]) { unsigned uPathLength = uSeqLength + uIndelCount; for (unsigned i = 0; i < uPathLength; ++i) Str[i] = 'M'; for (unsigned i = 0; i < uIndelCount; ++i) { int i1, i2; GetPos(Str, uPathLength, &i1, &i2); Str[i1] = 'D'; Str[i2] = 'I'; } Str[uPathLength] = 0; Log("MakePath=%s\n", Str); } void SPTest() { SetPPScore(PPSCORE_SV); SetListFileName("c:\\tmp\\muscle.log", false); TextFile file1("c:\\tmp\\msa1.afa"); TextFile file2("c:\\tmp\\msa2.afa"); MSA msa1; MSA msa2; msa1.FromFile(file1); msa2.FromFile(file2); Log("msa1=\n"); msa1.LogMe(); Log("msa2=\n"); msa2.LogMe(); const unsigned uColCount = msa1.GetColCount(); if (msa2.GetColCount() != uColCount) Quit("Different lengths"); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); const unsigned uSeqCount = uSeqCount1 + uSeqCount2; MSA::SetIdCount(uSeqCount); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) { msa1.SetSeqWeight(uSeqIndex1, 1.0); msa1.SetSeqId(uSeqIndex1, uSeqIndex1); } for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) { msa2.SetSeqWeight(uSeqIndex2, 1.0); msa2.SetSeqId(uSeqIndex2, uSeqCount1 + uSeqIndex2); } MSA alnA; MSA alnB; char strPathA[1024]; char strPathB[1024]; MakePath(uColCount, INDELS, strPathA); MakePath(uColCount, INDELS, strPathB); PWPath PathA; 
PWPath PathB; PathA.FromStr(strPathA); PathB.FromStr(strPathB); Log("PathA=\n"); PathA.LogMe(); Log("PathB=\n"); PathB.LogMe(); AlignTwoMSAsGivenPath(PathA, msa1, msa2, alnA); AlignTwoMSAsGivenPath(PathB, msa1, msa2, alnB); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { alnA.SetSeqWeight(uSeqIndex, 1.0); alnB.SetSeqWeight(uSeqIndex, 1.0); } unsigned Seqs1[1024]; unsigned Seqs2[1024]; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) Seqs1[uSeqIndex1] = uSeqIndex1; for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) Seqs2[uSeqIndex2] = uSeqCount1 + uSeqIndex2; MSA msaA1; MSA msaA2; MSA msaB1; MSA msaB2; MSAFromSeqSubset(alnA, Seqs1, uSeqCount1, msaA1); MSAFromSeqSubset(alnB, Seqs1, uSeqCount1, msaB1); MSAFromSeqSubset(alnA, Seqs2, uSeqCount2, msaA2); MSAFromSeqSubset(alnB, Seqs2, uSeqCount2, msaB2); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) { msaA1.SetSeqWeight(uSeqIndex1, 1.0); msaB1.SetSeqWeight(uSeqIndex1, 1.0); } for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) { msaA2.SetSeqWeight(uSeqIndex2, 1.0); msaB2.SetSeqWeight(uSeqIndex2, 1.0); } Log("msaA1=\n"); msaA1.LogMe(); Log("msaB1=\n"); msaB1.LogMe(); Log("msaA2=\n"); msaA2.LogMe(); Log("msaB2=\n"); msaB2.LogMe(); Log("alnA=\n"); alnA.LogMe(); Log("AlnB=\n"); alnB.LogMe(); Log("\nSPA\n---\n"); SCORE SPA = ObjScoreSP(alnA); Log("\nSPB\n---\n"); SCORE SPB = ObjScoreSP(alnB); Log("\nXPA\n---\n"); SCORE XPA = ObjScoreXP(msaA1, msaA2); Log("\nXPB\n---\n"); SCORE XPB = ObjScoreXP(msaB1, msaB2); Log("SPA=%.4g SPB=%.4g Diff=%.4g\n", SPA, SPB, SPA - SPB); Log("XPA=%.4g XPB=%.4g Diff=%.4g\n", XPA, XPB, XPA - XPB); } muscle-3.8.31.orig/phy.cpp0000644000175000017500000007434311352261676014723 0ustar kratzcharles#include "muscle.h" #include "tree.h" #include #define TRACE 0 /*** Node has 0 to 3 neighbors: 0 neighbors: singleton root 1 neighbor: leaf, neighbor is parent 2 neigbors: non-singleton root 3 neighbors: internal 
node (other than root) Minimal rooted tree is single node. Minimal unrooted tree is single edge. Leaf node always has nulls in neighbors 2 and 3, neighbor 1 is parent. When tree is rooted, neighbor 1=parent, 2=left, 3=right. ***/ void Tree::AssertAreNeighbors(unsigned uNodeIndex1, unsigned uNodeIndex2) const { if (uNodeIndex1 >= m_uNodeCount || uNodeIndex2 >= m_uNodeCount) Quit("AssertAreNeighbors(%u,%u), are %u nodes", uNodeIndex1, uNodeIndex2, m_uNodeCount); if (m_uNeighbor1[uNodeIndex1] != uNodeIndex2 && m_uNeighbor2[uNodeIndex1] != uNodeIndex2 && m_uNeighbor3[uNodeIndex1] != uNodeIndex2) { LogMe(); Quit("AssertAreNeighbors(%u,%u) failed", uNodeIndex1, uNodeIndex2); } if (m_uNeighbor1[uNodeIndex2] != uNodeIndex1 && m_uNeighbor2[uNodeIndex2] != uNodeIndex1 && m_uNeighbor3[uNodeIndex2] != uNodeIndex1) { LogMe(); Quit("AssertAreNeighbors(%u,%u) failed", uNodeIndex1, uNodeIndex2); } bool Has12 = HasEdgeLength(uNodeIndex1, uNodeIndex2); bool Has21 = HasEdgeLength(uNodeIndex2, uNodeIndex1); if (Has12 != Has21) { HasEdgeLength(uNodeIndex1, uNodeIndex2); HasEdgeLength(uNodeIndex2, uNodeIndex1); LogMe(); Log("HasEdgeLength(%u, %u)=%c HasEdgeLength(%u, %u)=%c\n", uNodeIndex1, uNodeIndex2, Has12 ? 'T' : 'F', uNodeIndex2, uNodeIndex1, Has21 ? 
'T' : 'F'); Quit("Tree::AssertAreNeighbors, HasEdgeLength not symmetric"); } if (Has12) { double d12 = GetEdgeLength(uNodeIndex1, uNodeIndex2); double d21 = GetEdgeLength(uNodeIndex2, uNodeIndex1); if (d12 != d21) { LogMe(); Quit("Tree::AssertAreNeighbors, Edge length disagrees %u-%u=%.3g, %u-%u=%.3g", uNodeIndex1, uNodeIndex2, d12, uNodeIndex2, uNodeIndex1, d21); } } } void Tree::ValidateNode(unsigned uNodeIndex) const { if (uNodeIndex >= m_uNodeCount) Quit("ValidateNode(%u), %u nodes", uNodeIndex, m_uNodeCount); const unsigned uNeighborCount = GetNeighborCount(uNodeIndex); if (2 == uNeighborCount) { if (!m_bRooted) { LogMe(); Quit("Tree::ValidateNode: Node %u has two neighbors, tree is not rooted", uNodeIndex); } if (uNodeIndex != m_uRootNodeIndex) { LogMe(); Quit("Tree::ValidateNode: Node %u has two neighbors, but not root node=%u", uNodeIndex, m_uRootNodeIndex); } } const unsigned n1 = m_uNeighbor1[uNodeIndex]; const unsigned n2 = m_uNeighbor2[uNodeIndex]; const unsigned n3 = m_uNeighbor3[uNodeIndex]; if (NULL_NEIGHBOR == n2 && NULL_NEIGHBOR != n3) { LogMe(); Quit("Tree::ValidateNode, n2=null, n3!=null", uNodeIndex); } if (NULL_NEIGHBOR == n3 && NULL_NEIGHBOR != n2) { LogMe(); Quit("Tree::ValidateNode, n3=null, n2!=null", uNodeIndex); } if (n1 != NULL_NEIGHBOR) AssertAreNeighbors(uNodeIndex, n1); if (n2 != NULL_NEIGHBOR) AssertAreNeighbors(uNodeIndex, n2); if (n3 != NULL_NEIGHBOR) AssertAreNeighbors(uNodeIndex, n3); if (n1 != NULL_NEIGHBOR && (n1 == n2 || n1 == n3)) { LogMe(); Quit("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex); } if (n2 != NULL_NEIGHBOR && (n2 == n1 || n2 == n3)) { LogMe(); Quit("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex); } if (n3 != NULL_NEIGHBOR && (n3 == n1 || n3 == n2)) { LogMe(); Quit("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex); } if (IsRooted()) { if (NULL_NEIGHBOR == GetParent(uNodeIndex)) { if (uNodeIndex != m_uRootNodeIndex) { LogMe(); Quit("Tree::ValiateNode(%u), no 
parent", uNodeIndex); } } else if (GetLeft(GetParent(uNodeIndex)) != uNodeIndex && GetRight(GetParent(uNodeIndex)) != uNodeIndex) { LogMe(); Quit("Tree::ValidateNode(%u), parent / child mismatch", uNodeIndex); } } } void Tree::Validate() const { for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) ValidateNode(uNodeIndex); } bool Tree::IsEdge(unsigned uNodeIndex1, unsigned uNodeIndex2) const { assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); return m_uNeighbor1[uNodeIndex1] == uNodeIndex2 || m_uNeighbor2[uNodeIndex1] == uNodeIndex2 || m_uNeighbor3[uNodeIndex1] == uNodeIndex2; } double Tree::GetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const { assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); if (!HasEdgeLength(uNodeIndex1, uNodeIndex2)) { LogMe(); Quit("Missing edge length in tree %u-%u", uNodeIndex1, uNodeIndex2); } if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) return m_dEdgeLength1[uNodeIndex1]; else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) return m_dEdgeLength2[uNodeIndex1]; assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); return m_dEdgeLength3[uNodeIndex1]; } void Tree::ExpandCache() { const unsigned uNodeCount = 100; unsigned uNewCacheCount = m_uCacheCount + uNodeCount; unsigned *uNewNeighbor1 = new unsigned[uNewCacheCount]; unsigned *uNewNeighbor2 = new unsigned[uNewCacheCount]; unsigned *uNewNeighbor3 = new unsigned[uNewCacheCount]; unsigned *uNewIds = new unsigned[uNewCacheCount]; memset(uNewIds, 0xff, uNewCacheCount*sizeof(unsigned)); double *dNewEdgeLength1 = new double[uNewCacheCount]; double *dNewEdgeLength2 = new double[uNewCacheCount]; double *dNewEdgeLength3 = new double[uNewCacheCount]; double *dNewHeight = new double[uNewCacheCount]; bool *bNewHasEdgeLength1 = new bool[uNewCacheCount]; bool *bNewHasEdgeLength2 = new bool[uNewCacheCount]; bool *bNewHasEdgeLength3 = new bool[uNewCacheCount]; bool *bNewHasHeight = new bool[uNewCacheCount]; char **ptrNewName = new char 
*[uNewCacheCount]; memset(ptrNewName, 0, uNewCacheCount*sizeof(char *)); if (m_uCacheCount > 0) { const unsigned uUnsignedBytes = m_uCacheCount*sizeof(unsigned); memcpy(uNewNeighbor1, m_uNeighbor1, uUnsignedBytes); memcpy(uNewNeighbor2, m_uNeighbor2, uUnsignedBytes); memcpy(uNewNeighbor3, m_uNeighbor3, uUnsignedBytes); memcpy(uNewIds, m_Ids, uUnsignedBytes); const unsigned uEdgeBytes = m_uCacheCount*sizeof(double); memcpy(dNewEdgeLength1, m_dEdgeLength1, uEdgeBytes); memcpy(dNewEdgeLength2, m_dEdgeLength2, uEdgeBytes); memcpy(dNewEdgeLength3, m_dEdgeLength3, uEdgeBytes); memcpy(dNewHeight, m_dHeight, uEdgeBytes); const unsigned uBoolBytes = m_uCacheCount*sizeof(bool); memcpy(bNewHasEdgeLength1, m_bHasEdgeLength1, uBoolBytes); memcpy(bNewHasEdgeLength2, m_bHasEdgeLength2, uBoolBytes); memcpy(bNewHasEdgeLength3, m_bHasEdgeLength3, uBoolBytes); memcpy(bNewHasHeight, m_bHasHeight, uBoolBytes); const unsigned uNameBytes = m_uCacheCount*sizeof(char *); memcpy(ptrNewName, m_ptrName, uNameBytes); delete[] m_uNeighbor1; delete[] m_uNeighbor2; delete[] m_uNeighbor3; delete[] m_Ids; delete[] m_dEdgeLength1; delete[] m_dEdgeLength2; delete[] m_dEdgeLength3; delete[] m_bHasEdgeLength1; delete[] m_bHasEdgeLength2; delete[] m_bHasEdgeLength3; delete[] m_bHasHeight; delete[] m_ptrName; } m_uCacheCount = uNewCacheCount; m_uNeighbor1 = uNewNeighbor1; m_uNeighbor2 = uNewNeighbor2; m_uNeighbor3 = uNewNeighbor3; m_Ids = uNewIds; m_dEdgeLength1 = dNewEdgeLength1; m_dEdgeLength2 = dNewEdgeLength2; m_dEdgeLength3 = dNewEdgeLength3; m_dHeight = dNewHeight; m_bHasEdgeLength1 = bNewHasEdgeLength1; m_bHasEdgeLength2 = bNewHasEdgeLength2; m_bHasEdgeLength3 = bNewHasEdgeLength3; m_bHasHeight = bNewHasHeight; m_ptrName = ptrNewName; } // Creates tree with single node, no edges. // Root node always has index 0. 
void Tree::CreateRooted() { Clear(); ExpandCache(); m_uNodeCount = 1; m_uNeighbor1[0] = NULL_NEIGHBOR; m_uNeighbor2[0] = NULL_NEIGHBOR; m_uNeighbor3[0] = NULL_NEIGHBOR; m_bHasEdgeLength1[0] = false; m_bHasEdgeLength2[0] = false; m_bHasEdgeLength3[0] = false; m_bHasHeight[0] = false; m_uRootNodeIndex = 0; m_bRooted = true; #if DEBUG Validate(); #endif } // Creates unrooted tree with single edge. // Nodes for that edge are always 0 and 1. void Tree::CreateUnrooted(double dEdgeLength) { Clear(); ExpandCache(); m_uNeighbor1[0] = 1; m_uNeighbor2[0] = NULL_NEIGHBOR; m_uNeighbor3[0] = NULL_NEIGHBOR; m_uNeighbor1[1] = 0; m_uNeighbor2[1] = NULL_NEIGHBOR; m_uNeighbor3[1] = NULL_NEIGHBOR; m_dEdgeLength1[0] = dEdgeLength; m_dEdgeLength1[1] = dEdgeLength; m_bHasEdgeLength1[0] = true; m_bHasEdgeLength1[1] = true; m_bRooted = false; #if DEBUG Validate(); #endif } void Tree::SetLeafName(unsigned uNodeIndex, const char *ptrName) { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); free(m_ptrName[uNodeIndex]); m_ptrName[uNodeIndex] = strsave(ptrName); } void Tree::SetLeafId(unsigned uNodeIndex, unsigned uId) { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); m_Ids[uNodeIndex] = uId; } const char *Tree::GetLeafName(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); return m_ptrName[uNodeIndex]; } unsigned Tree::GetLeafId(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); return m_Ids[uNodeIndex]; } // Append a new branch. // This adds two new nodes and joins them to an existing leaf node. // Return value is k, new nodes have indexes k and k+1 respectively. 
unsigned Tree::AppendBranch(unsigned uExistingLeafIndex) { if (0 == m_uNodeCount) Quit("Tree::AppendBranch: tree has not been created"); #if DEBUG assert(uExistingLeafIndex < m_uNodeCount); if (!IsLeaf(uExistingLeafIndex)) { LogMe(); Quit("AppendBranch(%u): not leaf", uExistingLeafIndex); } #endif if (m_uNodeCount >= m_uCacheCount - 2) ExpandCache(); const unsigned uNewLeaf1 = m_uNodeCount; const unsigned uNewLeaf2 = m_uNodeCount + 1; m_uNodeCount += 2; assert(m_uNeighbor2[uExistingLeafIndex] == NULL_NEIGHBOR); assert(m_uNeighbor3[uExistingLeafIndex] == NULL_NEIGHBOR); m_uNeighbor2[uExistingLeafIndex] = uNewLeaf1; m_uNeighbor3[uExistingLeafIndex] = uNewLeaf2; m_uNeighbor1[uNewLeaf1] = uExistingLeafIndex; m_uNeighbor1[uNewLeaf2] = uExistingLeafIndex; m_uNeighbor2[uNewLeaf1] = NULL_NEIGHBOR; m_uNeighbor2[uNewLeaf2] = NULL_NEIGHBOR; m_uNeighbor3[uNewLeaf1] = NULL_NEIGHBOR; m_uNeighbor3[uNewLeaf2] = NULL_NEIGHBOR; m_dEdgeLength2[uExistingLeafIndex] = 0; m_dEdgeLength3[uExistingLeafIndex] = 0; m_dEdgeLength1[uNewLeaf1] = 0; m_dEdgeLength2[uNewLeaf1] = 0; m_dEdgeLength3[uNewLeaf1] = 0; m_dEdgeLength1[uNewLeaf2] = 0; m_dEdgeLength2[uNewLeaf2] = 0; m_dEdgeLength3[uNewLeaf2] = 0; m_bHasEdgeLength1[uNewLeaf1] = false; m_bHasEdgeLength2[uNewLeaf1] = false; m_bHasEdgeLength3[uNewLeaf1] = false; m_bHasEdgeLength1[uNewLeaf2] = false; m_bHasEdgeLength2[uNewLeaf2] = false; m_bHasEdgeLength3[uNewLeaf2] = false; m_bHasHeight[uNewLeaf1] = false; m_bHasHeight[uNewLeaf2] = false; m_Ids[uNewLeaf1] = uInsane; m_Ids[uNewLeaf2] = uInsane; return uNewLeaf1; } void Tree::LogMe() const { Log("Tree::LogMe %u nodes, ", m_uNodeCount); if (IsRooted()) { Log("rooted.\n"); Log("\n"); Log("Index Parnt LengthP Left LengthL Right LengthR Id Name\n"); Log("----- ----- ------- ---- ------- ----- ------- ----- ----\n"); } else { Log("unrooted.\n"); Log("\n"); Log("Index Nbr_1 Length1 Nbr_2 Length2 Nbr_3 Length3 Id Name\n"); Log("----- ----- ------- ----- ------- ----- ------- ----- ----\n"); } for 
(unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { Log("%5u ", uNodeIndex); const unsigned n1 = m_uNeighbor1[uNodeIndex]; const unsigned n2 = m_uNeighbor2[uNodeIndex]; const unsigned n3 = m_uNeighbor3[uNodeIndex]; if (NULL_NEIGHBOR != n1) { Log("%5u ", n1); if (m_bHasEdgeLength1[uNodeIndex]) Log("%7.4f ", m_dEdgeLength1[uNodeIndex]); else Log(" * "); } else Log(" "); if (NULL_NEIGHBOR != n2) { Log("%5u ", n2); if (m_bHasEdgeLength2[uNodeIndex]) Log("%7.4f ", m_dEdgeLength2[uNodeIndex]); else Log(" * "); } else Log(" "); if (NULL_NEIGHBOR != n3) { Log("%5u ", n3); if (m_bHasEdgeLength3[uNodeIndex]) Log("%7.4f ", m_dEdgeLength3[uNodeIndex]); else Log(" * "); } else Log(" "); if (m_Ids != 0 && IsLeaf(uNodeIndex)) { unsigned uId = m_Ids[uNodeIndex]; if (uId == uInsane) Log(" *"); else Log("%5u", uId); } else Log(" "); if (m_bRooted && uNodeIndex == m_uRootNodeIndex) Log(" [ROOT] "); const char *ptrName = m_ptrName[uNodeIndex]; if (ptrName != 0) Log(" %s", ptrName); Log("\n"); } } void Tree::SetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2, double dLength) { assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); assert(IsEdge(uNodeIndex1, uNodeIndex2)); if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) { m_dEdgeLength1[uNodeIndex1] = dLength; m_bHasEdgeLength1[uNodeIndex1] = true; } else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) { m_dEdgeLength2[uNodeIndex1] = dLength; m_bHasEdgeLength2[uNodeIndex1] = true; } else { assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); m_dEdgeLength3[uNodeIndex1] = dLength; m_bHasEdgeLength3[uNodeIndex1] = true; } if (m_uNeighbor1[uNodeIndex2] == uNodeIndex1) { m_dEdgeLength1[uNodeIndex2] = dLength; m_bHasEdgeLength1[uNodeIndex2] = true; } else if (m_uNeighbor2[uNodeIndex2] == uNodeIndex1) { m_dEdgeLength2[uNodeIndex2] = dLength; m_bHasEdgeLength2[uNodeIndex2] = true; } else { assert(m_uNeighbor3[uNodeIndex2] == uNodeIndex1); m_dEdgeLength3[uNodeIndex2] = dLength; m_bHasEdgeLength3[uNodeIndex2] = true; } 
} unsigned Tree::UnrootFromFile() { #if TRACE Log("Before unroot:\n"); LogMe(); #endif if (!m_bRooted) Quit("Tree::Unroot, not rooted"); // Convention: root node is always node zero assert(IsRoot(0)); assert(NULL_NEIGHBOR == m_uNeighbor1[0]); const unsigned uThirdNode = m_uNodeCount++; m_uNeighbor1[0] = uThirdNode; m_uNeighbor1[uThirdNode] = 0; m_uNeighbor2[uThirdNode] = NULL_NEIGHBOR; m_uNeighbor3[uThirdNode] = NULL_NEIGHBOR; m_dEdgeLength1[0] = 0; m_dEdgeLength1[uThirdNode] = 0; m_bHasEdgeLength1[uThirdNode] = true; m_bRooted = false; #if TRACE Log("After unroot:\n"); LogMe(); #endif return uThirdNode; } // In an unrooted tree, equivalent of GetLeft/Right is // GetFirst/SecondNeighbor. // uNeighborIndex must be a known neighbor of uNodeIndex. // This is the way to find the other two neighbor nodes of // an internal node. // The labeling as "First" and "Second" neighbor is arbitrary. // Calling these functions on a leaf returns NULL_NEIGHBOR, as // for GetLeft/Right. unsigned Tree::GetFirstNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const { assert(uNodeIndex < m_uNodeCount); assert(uNeighborIndex < m_uNodeCount); assert(IsEdge(uNodeIndex, uNeighborIndex)); for (unsigned n = 0; n < 3; ++n) { unsigned uNeighbor = GetNeighbor(uNodeIndex, n); if (NULL_NEIGHBOR != uNeighbor && uNeighborIndex != uNeighbor) return uNeighbor; } return NULL_NEIGHBOR; } unsigned Tree::GetSecondNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const { assert(uNodeIndex < m_uNodeCount); assert(uNeighborIndex < m_uNodeCount); assert(IsEdge(uNodeIndex, uNeighborIndex)); bool bFoundOne = false; for (unsigned n = 0; n < 3; ++n) { unsigned uNeighbor = GetNeighbor(uNodeIndex, n); if (NULL_NEIGHBOR != uNeighbor && uNeighborIndex != uNeighbor) { if (bFoundOne) return uNeighbor; else bFoundOne = true; } } return NULL_NEIGHBOR; } // Compute the number of leaves in the sub-tree defined by an edge // in an unrooted tree. 
Conceptually, the tree is cut at this edge, // and uNodeIndex2 considered the root of the sub-tree. unsigned Tree::GetLeafCountUnrooted(unsigned uNodeIndex1, unsigned uNodeIndex2, double *ptrdTotalDistance) const { assert(!IsRooted()); if (IsLeaf(uNodeIndex2)) { *ptrdTotalDistance = GetEdgeLength(uNodeIndex1, uNodeIndex2); return 1; } // Recurse down the rooted sub-tree defined by cutting the edge // and considering uNodeIndex2 as the root. const unsigned uLeft = GetFirstNeighbor(uNodeIndex2, uNodeIndex1); const unsigned uRight = GetSecondNeighbor(uNodeIndex2, uNodeIndex1); double dLeftDistance; double dRightDistance; const unsigned uLeftCount = GetLeafCountUnrooted(uNodeIndex2, uLeft, &dLeftDistance); const unsigned uRightCount = GetLeafCountUnrooted(uNodeIndex2, uRight, &dRightDistance); *ptrdTotalDistance = dLeftDistance + dRightDistance; return uLeftCount + uRightCount; } void Tree::RootUnrootedTree(ROOT Method) { assert(!IsRooted()); #if TRACE Log("Tree::RootUnrootedTree, before:"); LogMe(); #endif unsigned uNode1; unsigned uNode2; double dLength1; double dLength2; FindRoot(*this, &uNode1, &uNode2, &dLength1, &dLength2, Method); if (m_uNodeCount == m_uCacheCount) ExpandCache(); m_uRootNodeIndex = m_uNodeCount++; double dEdgeLength = GetEdgeLength(uNode1, uNode2); m_uNeighbor1[m_uRootNodeIndex] = NULL_NEIGHBOR; m_uNeighbor2[m_uRootNodeIndex] = uNode1; m_uNeighbor3[m_uRootNodeIndex] = uNode2; if (m_uNeighbor1[uNode1] == uNode2) m_uNeighbor1[uNode1] = m_uRootNodeIndex; else if (m_uNeighbor2[uNode1] == uNode2) m_uNeighbor2[uNode1] = m_uRootNodeIndex; else { assert(m_uNeighbor3[uNode1] == uNode2); m_uNeighbor3[uNode1] = m_uRootNodeIndex; } if (m_uNeighbor1[uNode2] == uNode1) m_uNeighbor1[uNode2] = m_uRootNodeIndex; else if (m_uNeighbor2[uNode2] == uNode1) m_uNeighbor2[uNode2] = m_uRootNodeIndex; else { assert(m_uNeighbor3[uNode2] == uNode1); m_uNeighbor3[uNode2] = m_uRootNodeIndex; } OrientParent(uNode1, m_uRootNodeIndex); OrientParent(uNode2, m_uRootNodeIndex); 
SetEdgeLength(m_uRootNodeIndex, uNode1, dLength1); SetEdgeLength(m_uRootNodeIndex, uNode2, dLength2); m_bHasHeight[m_uRootNodeIndex] = false; m_ptrName[m_uRootNodeIndex] = 0; m_bRooted = true; #if TRACE Log("\nPhy::RootUnrootedTree, after:"); LogMe(); #endif Validate(); } bool Tree::HasEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const { assert(uNodeIndex1 < m_uNodeCount); assert(uNodeIndex2 < m_uNodeCount); assert(IsEdge(uNodeIndex1, uNodeIndex2)); if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) return m_bHasEdgeLength1[uNodeIndex1]; else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) return m_bHasEdgeLength2[uNodeIndex1]; assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); return m_bHasEdgeLength3[uNodeIndex1]; } void Tree::OrientParent(unsigned uNodeIndex, unsigned uParentNodeIndex) { if (NULL_NEIGHBOR == uNodeIndex) return; if (m_uNeighbor1[uNodeIndex] == uParentNodeIndex) ; else if (m_uNeighbor2[uNodeIndex] == uParentNodeIndex) { double dEdgeLength2 = m_dEdgeLength2[uNodeIndex]; m_uNeighbor2[uNodeIndex] = m_uNeighbor1[uNodeIndex]; m_dEdgeLength2[uNodeIndex] = m_dEdgeLength1[uNodeIndex]; m_uNeighbor1[uNodeIndex] = uParentNodeIndex; m_dEdgeLength1[uNodeIndex] = dEdgeLength2; } else { assert(m_uNeighbor3[uNodeIndex] == uParentNodeIndex); double dEdgeLength3 = m_dEdgeLength3[uNodeIndex]; m_uNeighbor3[uNodeIndex] = m_uNeighbor1[uNodeIndex]; m_dEdgeLength3[uNodeIndex] = m_dEdgeLength1[uNodeIndex]; m_uNeighbor1[uNodeIndex] = uParentNodeIndex; m_dEdgeLength1[uNodeIndex] = dEdgeLength3; } OrientParent(m_uNeighbor2[uNodeIndex], uNodeIndex); OrientParent(m_uNeighbor3[uNodeIndex], uNodeIndex); } unsigned Tree::FirstDepthFirstNode() const { assert(IsRooted()); // Descend via left branches until we hit a leaf unsigned uNodeIndex = m_uRootNodeIndex; while (!IsLeaf(uNodeIndex)) uNodeIndex = GetLeft(uNodeIndex); return uNodeIndex; } unsigned Tree::FirstDepthFirstNodeR() const { assert(IsRooted()); // Descend via left branches until we hit a leaf unsigned uNodeIndex = 
m_uRootNodeIndex; while (!IsLeaf(uNodeIndex)) uNodeIndex = GetRight(uNodeIndex); return uNodeIndex; } unsigned Tree::NextDepthFirstNode(unsigned uNodeIndex) const { #if TRACE Log("NextDepthFirstNode(%3u) ", uNodeIndex); #endif assert(IsRooted()); assert(uNodeIndex < m_uNodeCount); if (IsRoot(uNodeIndex)) { #if TRACE Log(">> Node %u is root, end of traversal\n", uNodeIndex); #endif return NULL_NEIGHBOR; } unsigned uParent = GetParent(uNodeIndex); if (GetRight(uParent) == uNodeIndex) { #if TRACE Log(">> Is right branch, return parent=%u\n", uParent); #endif return uParent; } uNodeIndex = GetRight(uParent); #if TRACE Log(">> Descend left from right sibling=%u ... ", uNodeIndex); #endif while (!IsLeaf(uNodeIndex)) uNodeIndex = GetLeft(uNodeIndex); #if TRACE Log("bottom out at leaf=%u\n", uNodeIndex); #endif return uNodeIndex; } unsigned Tree::NextDepthFirstNodeR(unsigned uNodeIndex) const { #if TRACE Log("NextDepthFirstNode(%3u) ", uNodeIndex); #endif assert(IsRooted()); assert(uNodeIndex < m_uNodeCount); if (IsRoot(uNodeIndex)) { #if TRACE Log(">> Node %u is root, end of traversal\n", uNodeIndex); #endif return NULL_NEIGHBOR; } unsigned uParent = GetParent(uNodeIndex); if (GetLeft(uParent) == uNodeIndex) { #if TRACE Log(">> Is left branch, return parent=%u\n", uParent); #endif return uParent; } uNodeIndex = GetLeft(uParent); #if TRACE Log(">> Descend right from left sibling=%u ... 
", uNodeIndex); #endif while (!IsLeaf(uNodeIndex)) uNodeIndex = GetRight(uNodeIndex); #if TRACE Log("bottom out at leaf=%u\n", uNodeIndex); #endif return uNodeIndex; } void Tree::UnrootByDeletingRoot() { assert(IsRooted()); assert(m_uNodeCount >= 3); const unsigned uLeft = GetLeft(m_uRootNodeIndex); const unsigned uRight = GetRight(m_uRootNodeIndex); m_uNeighbor1[uLeft] = uRight; m_uNeighbor1[uRight] = uLeft; bool bHasEdgeLength = HasEdgeLength(m_uRootNodeIndex, uLeft) && HasEdgeLength(m_uRootNodeIndex, uRight); if (bHasEdgeLength) { double dEdgeLength = GetEdgeLength(m_uRootNodeIndex, uLeft) + GetEdgeLength(m_uRootNodeIndex, uRight); m_dEdgeLength1[uLeft] = dEdgeLength; m_dEdgeLength1[uRight] = dEdgeLength; } // Remove root node entry from arrays const unsigned uMoveCount = m_uNodeCount - m_uRootNodeIndex; const unsigned uUnsBytes = uMoveCount*sizeof(unsigned); memmove(m_uNeighbor1 + m_uRootNodeIndex, m_uNeighbor1 + m_uRootNodeIndex + 1, uUnsBytes); memmove(m_uNeighbor2 + m_uRootNodeIndex, m_uNeighbor2 + m_uRootNodeIndex + 1, uUnsBytes); memmove(m_uNeighbor3 + m_uRootNodeIndex, m_uNeighbor3 + m_uRootNodeIndex + 1, uUnsBytes); const unsigned uDoubleBytes = uMoveCount*sizeof(double); memmove(m_dEdgeLength1 + m_uRootNodeIndex, m_dEdgeLength1 + m_uRootNodeIndex + 1, uDoubleBytes); memmove(m_dEdgeLength2 + m_uRootNodeIndex, m_dEdgeLength2 + m_uRootNodeIndex + 1, uDoubleBytes); memmove(m_dEdgeLength3 + m_uRootNodeIndex, m_dEdgeLength3 + m_uRootNodeIndex + 1, uDoubleBytes); const unsigned uBoolBytes = uMoveCount*sizeof(bool); memmove(m_bHasEdgeLength1 + m_uRootNodeIndex, m_bHasEdgeLength1 + m_uRootNodeIndex + 1, uBoolBytes); memmove(m_bHasEdgeLength2 + m_uRootNodeIndex, m_bHasEdgeLength2 + m_uRootNodeIndex + 1, uBoolBytes); memmove(m_bHasEdgeLength3 + m_uRootNodeIndex, m_bHasEdgeLength3 + m_uRootNodeIndex + 1, uBoolBytes); const unsigned uPtrBytes = uMoveCount*sizeof(char *); memmove(m_ptrName + m_uRootNodeIndex, m_ptrName + m_uRootNodeIndex + 1, uPtrBytes); 
--m_uNodeCount; m_bRooted = false; // Fix up table entries for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { #define DEC(x) if (x != NULL_NEIGHBOR && x > m_uRootNodeIndex) --x; DEC(m_uNeighbor1[uNodeIndex]) DEC(m_uNeighbor2[uNodeIndex]) DEC(m_uNeighbor3[uNodeIndex]) #undef DEC } Validate(); } unsigned Tree::GetLeafParent(unsigned uNodeIndex) const { assert(IsLeaf(uNodeIndex)); if (IsRooted()) return GetParent(uNodeIndex); if (m_uNeighbor1[uNodeIndex] != NULL_NEIGHBOR) return m_uNeighbor1[uNodeIndex]; if (m_uNeighbor2[uNodeIndex] != NULL_NEIGHBOR) return m_uNeighbor2[uNodeIndex]; return m_uNeighbor3[uNodeIndex]; } // TODO: This is not efficient for large trees, should cache. double Tree::GetNodeHeight(unsigned uNodeIndex) const { if (!IsRooted()) Quit("Tree::GetNodeHeight: undefined unless rooted tree"); if (IsLeaf(uNodeIndex)) return 0.0; if (m_bHasHeight[uNodeIndex]) return m_dHeight[uNodeIndex]; const unsigned uLeft = GetLeft(uNodeIndex); const unsigned uRight = GetRight(uNodeIndex); double dLeftLength = GetEdgeLength(uNodeIndex, uLeft); double dRightLength = GetEdgeLength(uNodeIndex, uRight); if (dLeftLength < 0) dLeftLength = 0; if (dRightLength < 0) dRightLength = 0; const double dLeftHeight = dLeftLength + GetNodeHeight(uLeft); const double dRightHeight = dRightLength + GetNodeHeight(uRight); const double dHeight = (dLeftHeight + dRightHeight)/2; m_bHasHeight[uNodeIndex] = true; m_dHeight[uNodeIndex] = dHeight; return dHeight; } unsigned Tree::GetNeighborSubscript(unsigned uNodeIndex, unsigned uNeighborIndex) const { assert(uNodeIndex < m_uNodeCount); assert(uNeighborIndex < m_uNodeCount); if (uNeighborIndex == m_uNeighbor1[uNodeIndex]) return 0; if (uNeighborIndex == m_uNeighbor2[uNodeIndex]) return 1; if (uNeighborIndex == m_uNeighbor3[uNodeIndex]) return 2; return NULL_NEIGHBOR; } unsigned Tree::GetNeighbor(unsigned uNodeIndex, unsigned uNeighborSubscript) const { switch (uNeighborSubscript) { case 0: return m_uNeighbor1[uNodeIndex]; 
case 1: return m_uNeighbor2[uNodeIndex]; case 2: return m_uNeighbor3[uNodeIndex]; } Quit("Tree::GetNeighbor, sub=%u", uNeighborSubscript); return NULL_NEIGHBOR; } // TODO: check if this is a performance issue, could cache a lookup table unsigned Tree::LeafIndexToNodeIndex(unsigned uLeafIndex) const { const unsigned uNodeCount = GetNodeCount(); unsigned uLeafCount = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (IsLeaf(uNodeIndex)) { if (uLeafCount == uLeafIndex) return uNodeIndex; else ++uLeafCount; } } Quit("LeafIndexToNodeIndex: out of range"); return 0; } unsigned Tree::GetLeafNodeIndex(const char *ptrName) const { const unsigned uNodeCount = GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!IsLeaf(uNodeIndex)) continue; const char *ptrLeafName = GetLeafName(uNodeIndex); if (0 == strcmp(ptrName, ptrLeafName)) return uNodeIndex; } Quit("Tree::GetLeafNodeIndex, name not found"); return 0; } void Tree::Copy(const Tree &tree) { const unsigned uNodeCount = tree.GetNodeCount(); InitCache(uNodeCount); m_uNodeCount = uNodeCount; const size_t UnsignedBytes = uNodeCount*sizeof(unsigned); const size_t DoubleBytes = uNodeCount*sizeof(double); const size_t BoolBytes = uNodeCount*sizeof(bool); memcpy(m_uNeighbor1, tree.m_uNeighbor1, UnsignedBytes); memcpy(m_uNeighbor2, tree.m_uNeighbor2, UnsignedBytes); memcpy(m_uNeighbor3, tree.m_uNeighbor3, UnsignedBytes); memcpy(m_Ids, tree.m_Ids, UnsignedBytes); memcpy(m_dEdgeLength1, tree.m_dEdgeLength1, DoubleBytes); memcpy(m_dEdgeLength2, tree.m_dEdgeLength2, DoubleBytes); memcpy(m_dEdgeLength3, tree.m_dEdgeLength3, DoubleBytes); memcpy(m_dHeight, tree.m_dHeight, DoubleBytes); memcpy(m_bHasEdgeLength1, tree.m_bHasEdgeLength1, BoolBytes); memcpy(m_bHasEdgeLength2, tree.m_bHasEdgeLength2, BoolBytes); memcpy(m_bHasEdgeLength3, tree.m_bHasEdgeLength3, BoolBytes); memcpy(m_bHasHeight, tree.m_bHasHeight, BoolBytes); m_uRootNodeIndex = tree.m_uRootNodeIndex; m_bRooted = 
tree.m_bRooted; for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { if (tree.IsLeaf(uNodeIndex)) { const char *ptrName = tree.GetLeafName(uNodeIndex); m_ptrName[uNodeIndex] = strsave(ptrName); } else m_ptrName[uNodeIndex] = 0; } #if DEBUG Validate(); #endif } // Create rooted tree from a vector description. // Node indexes are 0..N-1 for leaves, N..2N-2 for // internal nodes. // Vector subscripts are i-N and have values for // internal nodes only, but those values are node // indexes 0..2N-2. So e.g. if N=6 and Left[2]=1, // this means that the third internal node (node index 8) // has the second leaf (node index 1) as its left child. // uRoot gives the vector subscript of the root, so add N // to get the node index. void Tree::Create(unsigned uLeafCount, unsigned uRoot, const unsigned Left[], const unsigned Right[], const float LeftLength[], const float RightLength[], const unsigned LeafIds[], char **LeafNames) { Clear(); m_uNodeCount = 2*uLeafCount - 1; InitCache(m_uNodeCount); for (unsigned uNodeIndex = 0; uNodeIndex < uLeafCount; ++uNodeIndex) { m_Ids[uNodeIndex] = LeafIds[uNodeIndex]; m_ptrName[uNodeIndex] = strsave(LeafNames[uNodeIndex]); } for (unsigned uNodeIndex = uLeafCount; uNodeIndex < m_uNodeCount; ++uNodeIndex) { unsigned v = uNodeIndex - uLeafCount; unsigned uLeft = Left[v]; unsigned uRight = Right[v]; float fLeft = LeftLength[v]; float fRight = RightLength[v]; m_uNeighbor2[uNodeIndex] = uLeft; m_uNeighbor3[uNodeIndex] = uRight; m_bHasEdgeLength2[uNodeIndex] = true; m_bHasEdgeLength3[uNodeIndex] = true; m_dEdgeLength2[uNodeIndex] = fLeft; m_dEdgeLength3[uNodeIndex] = fRight; m_uNeighbor1[uLeft] = uNodeIndex; m_uNeighbor1[uRight] = uNodeIndex; m_dEdgeLength1[uLeft] = fLeft; m_dEdgeLength1[uRight] = fRight; m_bHasEdgeLength1[uLeft] = true; m_bHasEdgeLength1[uRight] = true; } m_bRooted = true; m_uRootNodeIndex = uRoot + uLeafCount; Validate(); } muscle-3.8.31.orig/phy3.cpp0000644000175000017500000003264011352261600014763 0ustar 
kratzcharles#include "muscle.h" #include "tree.h" #include "edgelist.h" #define TRACE 0 struct EdgeInfo { EdgeInfo() { m_bSet = false; } // Is data in this structure valid (i.e, has been set)? bool m_bSet; // Node at start of this edge unsigned m_uNode1; // Node at end of this edge unsigned m_uNode2; // Maximum distance from Node2 to a leaf double m_dMaxDistToLeaf; // Sum of distances from Node2 to all leaves under Node2 double m_dTotalDistToLeaves; // Next node on path from Node2 to most distant leaf unsigned m_uMaxStep; // Most distant leaf from Node2 (used for debugging only) unsigned m_uMostDistantLeaf; // Number of leaves under Node2 unsigned m_uLeafCount; }; static void RootByMidLongestSpan(const Tree &tree, EdgeInfo **EIs, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2); static void RootByMinAvgLeafDist(const Tree &tree, EdgeInfo **EIs, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2); static void ListEIs(EdgeInfo **EIs, unsigned uNodeCount) { Log("Node1 Node2 MaxDist TotDist MostDist LeafCount Step\n"); Log("----- ----- ------- ------- -------- --------- ----\n"); // 12345 12345 1234567 1234567 12345678 123456789 for (unsigned uNode = 0; uNode < uNodeCount; ++uNode) for (unsigned uNeighbor = 0; uNeighbor < 3; ++uNeighbor) { const EdgeInfo &EI = EIs[uNode][uNeighbor]; if (!EI.m_bSet) continue; Log("%5u %5u %7.3g %7.3g %8u %9u", EI.m_uNode1, EI.m_uNode2, EI.m_dMaxDistToLeaf, EI.m_dTotalDistToLeaves, EI.m_uMostDistantLeaf, EI.m_uLeafCount); if (NULL_NEIGHBOR != EI.m_uMaxStep) Log(" %4u", EI.m_uMaxStep); Log("\n"); } } static void CalcInfo(const Tree &tree, unsigned uNode1, unsigned uNode2, EdgeInfo **EIs) { const unsigned uNeighborIndex = tree.GetNeighborSubscript(uNode1, uNode2); EdgeInfo &EI = EIs[uNode1][uNeighborIndex]; EI.m_uNode1 = uNode1; EI.m_uNode2 = uNode2; if (tree.IsLeaf(uNode2)) { EI.m_dMaxDistToLeaf = 0; EI.m_dTotalDistToLeaves = 0; EI.m_uMaxStep = NULL_NEIGHBOR; 
EI.m_uMostDistantLeaf = uNode2; EI.m_uLeafCount = 1; EI.m_bSet = true; return; } double dMaxDistToLeaf = -1e29; double dTotalDistToLeaves = 0.0; unsigned uLeafCount = 0; unsigned uMostDistantLeaf = NULL_NEIGHBOR; unsigned uMaxStep = NULL_NEIGHBOR; const unsigned uNeighborCount = tree.GetNeighborCount(uNode2); for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub) { const unsigned uNode3 = tree.GetNeighbor(uNode2, uSub); if (uNode3 == uNode1) continue; const EdgeInfo &EINext = EIs[uNode2][uSub]; if (!EINext.m_bSet) Quit("CalcInfo: internal error, dist %u->%u not known", uNode2, uNode3); uLeafCount += EINext.m_uLeafCount; const double dEdgeLength = tree.GetEdgeLength(uNode2, uNode3); const double dTotalDist = EINext.m_dTotalDistToLeaves + EINext.m_uLeafCount*dEdgeLength; dTotalDistToLeaves += dTotalDist; const double dDist = EINext.m_dMaxDistToLeaf + dEdgeLength; if (dDist > dMaxDistToLeaf) { dMaxDistToLeaf = dDist; uMostDistantLeaf = EINext.m_uMostDistantLeaf; uMaxStep = uNode3; } } if (NULL_NEIGHBOR == uMaxStep || NULL_NEIGHBOR == uMostDistantLeaf || 0 == uLeafCount) Quit("CalcInfo: internal error 2"); const double dThisDist = tree.GetEdgeLength(uNode1, uNode2); EI.m_dMaxDistToLeaf = dMaxDistToLeaf; EI.m_dTotalDistToLeaves = dTotalDistToLeaves; EI.m_uMaxStep = uMaxStep; EI.m_uMostDistantLeaf = uMostDistantLeaf; EI.m_uLeafCount = uLeafCount; EI.m_bSet = true; } static bool Known(const Tree &tree, EdgeInfo **EIs, unsigned uNodeFrom, unsigned uNodeTo) { const unsigned uSub = tree.GetNeighborSubscript(uNodeFrom, uNodeTo); return EIs[uNodeFrom][uSub].m_bSet; } static bool AllKnownOut(const Tree &tree, EdgeInfo **EIs, unsigned uNodeFrom, unsigned uNodeTo) { const unsigned uNeighborCount = tree.GetNeighborCount(uNodeTo); for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub) { unsigned uNeighborIndex = tree.GetNeighbor(uNodeTo, uSub); if (uNeighborIndex == uNodeFrom) continue; if (!EIs[uNodeTo][uSub].m_bSet) return false; } return true; } void FindRoot(const Tree &tree, 
unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2, ROOT RootMethod) { #if TRACE tree.LogMe(); #endif if (tree.IsRooted()) Quit("FindRoot: tree already rooted"); const unsigned uNodeCount = tree.GetNodeCount(); const unsigned uLeafCount = tree.GetLeafCount(); if (uNodeCount < 2) Quit("Root: don't support trees with < 2 edges"); EdgeInfo **EIs = new EdgeInfo *[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) EIs[uNodeIndex] = new EdgeInfo[3]; EdgeList Edges; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) if (tree.IsLeaf(uNodeIndex)) { unsigned uParent = tree.GetNeighbor1(uNodeIndex); Edges.Add(uParent, uNodeIndex); } #if TRACE Log("Edges: "); Edges.LogMe(); #endif // Main loop: iterate until all distances known double dAllMaxDist = -1e20; unsigned uMaxFrom = NULL_NEIGHBOR; unsigned uMaxTo = NULL_NEIGHBOR; for (;;) { EdgeList NextEdges; #if TRACE Log("\nTop of main loop\n"); Log("Edges: "); Edges.LogMe(); Log("MDs:\n"); ListEIs(EIs, uNodeCount); #endif // For all edges const unsigned uEdgeCount = Edges.GetCount(); if (0 == uEdgeCount) break; for (unsigned n = 0; n < uEdgeCount; ++n) { unsigned uNodeFrom; unsigned uNodeTo; Edges.GetEdge(n, &uNodeFrom, &uNodeTo); CalcInfo(tree, uNodeFrom, uNodeTo, EIs); #if TRACE Log("Edge %u -> %u\n", uNodeFrom, uNodeTo); #endif const unsigned uNeighborCount = tree.GetNeighborCount(uNodeFrom); for (unsigned i = 0; i < uNeighborCount; ++i) { const unsigned uNeighborIndex = tree.GetNeighbor(uNodeFrom, i); if (!Known(tree, EIs, uNeighborIndex, uNodeFrom) && AllKnownOut(tree, EIs, uNeighborIndex, uNodeFrom)) NextEdges.Add(uNeighborIndex, uNodeFrom); } } Edges.Copy(NextEdges); } #if TRACE ListEIs(EIs, uNodeCount); #endif switch (RootMethod) { case ROOT_MidLongestSpan: RootByMidLongestSpan(tree, EIs, ptruNode1, ptruNode2, ptrdLength1, ptrdLength2); break; case ROOT_MinAvgLeafDist: RootByMinAvgLeafDist(tree, EIs, ptruNode1, ptruNode2, ptrdLength1, 
ptrdLength2); break; default: Quit("Invalid RootMethod=%d", RootMethod); } for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) delete[] EIs[uNodeIndex]; delete[] EIs; } static void RootByMidLongestSpan(const Tree &tree, EdgeInfo **EIs, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2) { const unsigned uNodeCount = tree.GetNodeCount(); unsigned uLeaf1 = NULL_NEIGHBOR; unsigned uMostDistantLeaf = NULL_NEIGHBOR; double dMaxDist = -VERY_LARGE_DOUBLE; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!tree.IsLeaf(uNodeIndex)) continue; const unsigned uNode2 = tree.GetNeighbor1(uNodeIndex); if (NULL_NEIGHBOR == uNode2) Quit("RootByMidLongestSpan: internal error 0"); const double dEdgeLength = tree.GetEdgeLength(uNodeIndex, uNode2); const EdgeInfo &EI = EIs[uNodeIndex][0]; if (!EI.m_bSet) Quit("RootByMidLongestSpan: internal error 1"); if (EI.m_uNode1 != uNodeIndex || EI.m_uNode2 != uNode2) Quit("RootByMidLongestSpan: internal error 2"); const double dSpanLength = dEdgeLength + EI.m_dMaxDistToLeaf; if (dSpanLength > dMaxDist) { dMaxDist = dSpanLength; uLeaf1 = uNodeIndex; uMostDistantLeaf = EI.m_uMostDistantLeaf; } } if (NULL_NEIGHBOR == uLeaf1) Quit("RootByMidLongestSpan: internal error 3"); const double dTreeHeight = dMaxDist/2.0; unsigned uNode1 = uLeaf1; unsigned uNode2 = tree.GetNeighbor1(uLeaf1); double dAccumSpanLength = 0; #if TRACE Log("RootByMidLongestSpan: span=%u", uLeaf1); #endif for (;;) { const double dEdgeLength = tree.GetEdgeLength(uNode1, uNode2); #if TRACE Log("->%u(%g;%g)", uNode2, dEdgeLength, dAccumSpanLength); #endif if (dAccumSpanLength + dEdgeLength >= dTreeHeight) { *ptruNode1 = uNode1; *ptruNode2 = uNode2; *ptrdLength1 = dTreeHeight - dAccumSpanLength; *ptrdLength2 = dEdgeLength - *ptrdLength1; #if TRACE { const EdgeInfo &EI = EIs[uLeaf1][0]; Log("...\n"); Log("Midpoint: Leaf1=%u Leaf2=%u Node1=%u Node2=%u Length1=%g Length2=%g\n", uLeaf1, EI.m_uMostDistantLeaf, 
*ptruNode1, *ptruNode2, *ptrdLength1, *ptrdLength2); } #endif return; } if (tree.IsLeaf(uNode2)) Quit("RootByMidLongestSpan: internal error 4"); dAccumSpanLength += dEdgeLength; const unsigned uSub = tree.GetNeighborSubscript(uNode1, uNode2); const EdgeInfo &EI = EIs[uNode1][uSub]; if (!EI.m_bSet) Quit("RootByMidLongestSpan: internal error 5"); uNode1 = uNode2; uNode2 = EI.m_uMaxStep; } } /*** Root by balancing average distance to leaves. The root is a point p such that the average distance to leaves to the left of p is the same as the to the right. This is the method used by CLUSTALW, which was originally used in PROFILEWEIGHT: Thompson et al. (1994) CABIOS (10) 1, 19-29. ***/ static void RootByMinAvgLeafDist(const Tree &tree, EdgeInfo **EIs, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2) { const unsigned uNodeCount = tree.GetNodeCount(); const unsigned uLeafCount = tree.GetLeafCount(); unsigned uNode1 = NULL_NEIGHBOR; unsigned uNode2 = NULL_NEIGHBOR; double dMinHeight = VERY_LARGE_DOUBLE; double dBestLength1 = VERY_LARGE_DOUBLE; double dBestLength2 = VERY_LARGE_DOUBLE; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { const unsigned uNeighborCount = tree.GetNeighborCount(uNodeIndex); for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub) { const unsigned uNeighborIndex = tree.GetNeighbor(uNodeIndex, uSub); // Avoid visiting same edge a second time in reversed order. 
if (uNeighborIndex < uNodeIndex) continue; const unsigned uSubRev = tree.GetNeighborSubscript(uNeighborIndex, uNodeIndex); if (NULL_NEIGHBOR == uSubRev) Quit("RootByMinAvgLeafDist, internal error 1"); // Get info for edges Node1->Node2 and Node2->Node1 (reversed) const EdgeInfo &EI = EIs[uNodeIndex][uSub]; const EdgeInfo &EIRev = EIs[uNeighborIndex][uSubRev]; if (EI.m_uNode1 != uNodeIndex || EI.m_uNode2 != uNeighborIndex || EIRev.m_uNode1 != uNeighborIndex || EIRev.m_uNode2 != uNodeIndex) Quit("RootByMinAvgLeafDist, internal error 2"); if (!EI.m_bSet) Quit("RootByMinAvgLeafDist, internal error 3"); if (uLeafCount != EI.m_uLeafCount + EIRev.m_uLeafCount) Quit("RootByMinAvgLeafDist, internal error 4"); const double dEdgeLength = tree.GetEdgeLength(uNodeIndex, uNeighborIndex); if (dEdgeLength != tree.GetEdgeLength(uNeighborIndex, uNodeIndex)) Quit("RootByMinAvgLeafDist, internal error 5"); // Consider point p on edge 12 in tree (1=Node, 2=Neighbor). // // ----- ---- // | | // 1----p--2 // | | // ----- ---- // // Define: // ADLp = average distance to leaves to left of point p. // ADRp = average distance to leaves to right of point p. // L = edge length = distance 12 // x = distance 1p // So distance p2 = L - x. // Average distance from p to leaves on left of p is: // ADLp = ADL1 + x // Average distance from p to leaves on right of p is: // ADRp = ADR2 + (L - x) // To be a root, we require these two distances to be equal, // ADLp = ADRp // ADL1 + x = ADR2 + (L - x) // Solving for x, // x = (ADR2 - ADL1 + L)/2 // If 0 <= x <= L, we can place the root on edge 12. 
const double ADL1 = EI.m_dTotalDistToLeaves / EI.m_uLeafCount; const double ADR2 = EIRev.m_dTotalDistToLeaves / EIRev.m_uLeafCount; const double x = (ADR2 - ADL1 + dEdgeLength)/2.0; if (x >= 0 && x <= dEdgeLength) { const double dLength1 = x; const double dLength2 = dEdgeLength - x; const double dHeight1 = EI.m_dMaxDistToLeaf + dLength1; const double dHeight2 = EIRev.m_dMaxDistToLeaf + dLength2; const double dHeight = dHeight1 >= dHeight2 ? dHeight1 : dHeight2; #if TRACE Log("Candidate root Node1=%u Node2=%u Height=%g\n", uNodeIndex, uNeighborIndex, dHeight); #endif if (dHeight < dMinHeight) { uNode1 = uNodeIndex; uNode2 = uNeighborIndex; dBestLength1 = dLength1; dBestLength2 = dLength2; dMinHeight = dHeight; } } } } if (NULL_NEIGHBOR == uNode1 || NULL_NEIGHBOR == uNode2) Quit("RootByMinAvgLeafDist, internal error 6"); #if TRACE Log("Best root Node1=%u Node2=%u Length1=%g Length2=%g Height=%g\n", uNode1, uNode2, dBestLength1, dBestLength2, dMinHeight); #endif *ptruNode1 = uNode1; *ptruNode2 = uNode2; *ptrdLength1 = dBestLength1; *ptrdLength2 = dBestLength2; } void FixRoot(Tree &tree, ROOT Method) { if (!tree.IsRooted()) Quit("FixRoot: expecting rooted tree"); // Pseudo-root: keep root assigned by clustering if (ROOT_Pseudo == Method) return; tree.UnrootByDeletingRoot(); tree.RootUnrootedTree(Method); } muscle-3.8.31.orig/pam200mafft.cpp0000644000175000017500000001020311352261600016104 0ustar kratzcharles#include "muscle.h" // Adjusted PAM200 scoring matrix as used by default in MAFFT. // Katoh, Misawa, Kuma and Miyata (2002), NAR 30(14), 3059-3066. 
static float PAM200[23][23] = { // A C D E F G H I K L M N P Q R S T V W Y B Z X 408, 20, 54, 52, -182, 179, -68, 109, -35, -47, 39, 106, 206, -14, -12, 257, 293, 191, -306, -219, 0, 0, 0, // A 20, 1190, -228, -295, 94, 6, 63, -131, -184, -176, -112, -29, -122, -195, 49, 185, 13, -49, 199, 333, 0, 0, 0, // C 54, -228, 645, 516, -399, 168, 98, -225, 75, -341, -235, 352, -149, 142, -44, 65, 7, -147, -418, -128, 0, 0, 0, // D 52, -295, 516, 630, -460, 145, 45, -225, 195, -307, -222, 186, -121, 299, 54, -10, -36, -130, -366, -285, 0, 0, 0, // E -182, 94, -399, -460, 908, -387, 82, 100, -423, 340, 87, -216, -160, -274, -307, -31, -153, 51, 19, 604, 0, 0, 0, // F 179, 6, 168, 145, -387, 682, -94, -196, -14, -304, -226, 99, -57, -48, 117, 175, 41, -73, -38, -329, 0, 0, 0, // G -68, 63, 98, 45, 82, -94, 786, -185, 164, -72, -132, 258, 86, 388, 277, 55, -15, -197, -181, 488, 0, 0, 0, // H 109, -131, -225, -225, 100, -196, -185, 574, -204, 308, 411, -94, -95, -202, -188, 1, 182, 489, -254, -133, 0, 0, 0, // I -35, -184, 75, 195, -423, -14, 164, -204, 652, -229, -98, 206, -66, 335, 486, 22, 39, -207, -196, -244, 0, 0, 0, // K -47, -176, -341, -307, 340, -304, -72, 308, -229, 611, 389, -203, 73, -66, -150, -49, -21, 259, -46, -9, 0, 0, 0, // L 39, -112, -235, -222, 87, -226, -132, 411, -98, 389, 776, -111, -78, -104, -109, -29, 149, 351, -209, -162, 0, 0, 0, // M 106, -29, 352, 186, -216, 99, 258, -94, 206, -203, -111, 536, -1, 108, 93, 260, 188, -98, -359, 12, 0, 0, 0, // N 206, -122, -149, -121, -160, -57, 86, -95, -66, 73, -78, -1, 756, 142, 25, 241, 159, -55, -353, -206, 0, 0, 0, // P -14, -195, 142, 299, -274, -48, 388, -202, 335, -66, -104, 108, 142, 655, 321, 7, -15, -175, -223, -53, 0, 0, 0, // Q -12, 49, -44, 54, -307, 117, 277, -188, 486, -150, -109, 93, 25, 321, 626, 48, 16, -181, 124, -113, 0, 0, 0, // R 257, 185, 65, -10, -31, 175, 55, 1, 22, -49, -29, 260, 241, 7, 48, 373, 279, 28, -193, -35, 0, 0, 0, // S 293, 13, 7, -36, -153, 41, -15, 182, 39, -21, 149, 188, 
159, -15, 16, 279, 442, 163, -323, -170, 0, 0, 0, // T 191, -49, -147, -130, 51, -73, -197, 489, -207, 259, 351, -98, -55, -175, -181, 28, 163, 525, -225, -177, 0, 0, 0, // V -306, 199, -418, -366, 19, -38, -181, -254, -196, -46, -209, -359, -353, -223, 124, -193, -323, -225, 1495, 83, 0, 0, 0, // W -219, 333, -128, -285, 604, -329, 488, -133, -244, -9, -162, 12, -206, -53, -113, -35, -170, -177, 83, 999, 0, 0, 0, // Y 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Z 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // X }; muscle-3.8.31.orig/refinetree.cpp0000644000175000017500000000243511352261673016241 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "tree.h" #include "profile.h" #include void RefineTree(MSA &msa, Tree &tree) { const unsigned uSeqCount = msa.GetSeqCount(); if (tree.GetLeafCount() != uSeqCount) Quit("Refine tree, tree has different number of nodes"); if (uSeqCount < 3) return; #if DEBUG ValidateMuscleIds(msa); ValidateMuscleIds(tree); #endif unsigned *IdToDiffsLeafNodeIndex = new unsigned[uSeqCount]; unsigned uDiffsCount = uSeqCount; Tree Tree2; for (unsigned uIter = 0; uIter < g_uMaxTreeRefineIters; ++uIter) { TreeFromMSA(msa, Tree2, g_Cluster2, g_Distance2, g_Root2, g_pstrDistMxFileName2); #if DEBUG ValidateMuscleIds(Tree2); #endif Tree Diffs; DiffTrees(Tree2, tree, Diffs, IdToDiffsLeafNodeIndex); tree.Copy(Tree2); const unsigned uNewDiffsNodeCount = Diffs.GetNodeCount(); const unsigned uNewDiffsCount = (uNewDiffsNodeCount - 1)/2; if (0 == uNewDiffsCount || uNewDiffsCount >= uDiffsCount) { ProgressStepsDone(); break; } uDiffsCount = uNewDiffsCount; MSA msa2; RealignDiffs(msa, Diffs, IdToDiffsLeafNodeIndex, msa2); #if DEBUG ValidateMuscleIds(msa2); #endif msa.Copy(msa2); SetCurrentAlignment(msa); } delete[] IdToDiffsLeafNodeIndex; } muscle-3.8.31.orig/diaglist.h0000644000175000017500000000353011352261667015356 
0ustar kratzcharles#ifndef diaglist_h #define diaglist_h const unsigned EMPTY = (unsigned) ~0; const unsigned MAX_DIAGS = 1024; struct Diag { unsigned m_uStartPosA; unsigned m_uStartPosB; unsigned m_uLength; }; struct Rect { unsigned m_uStartPosA; unsigned m_uStartPosB; unsigned m_uLengthA; unsigned m_uLengthB; }; class DiagList { public: DiagList() { m_uCount = 0; } ~DiagList() { Free(); } public: // Creation void Clear() { Free(); } void FromPath(const PWPath &Path); void Add(const Diag &d); void Add(unsigned uStartPosA, unsigned uStartPosB, unsigned uLength); void DeleteIncompatible(); // Accessors unsigned GetCount() const { return m_uCount; } const Diag &Get(unsigned uIndex) const; // Operations void Sort(); void Copy(const DiagList &DL); // Query // returns true iff given diagonal is included in the list // in whole or in part. bool NonZeroIntersection(const Diag &d) const; bool IsSorted() const; // Diagnostics void LogMe() const; private: void Free() { m_uCount = 0; } private: unsigned m_uCount; Diag m_Diags[MAX_DIAGS]; }; unsigned DiagOverlap(const Diag &d1, const Diag &d2); unsigned DiagOverlapA(const Diag &d1, const Diag &d2); unsigned DiagOverlapB(const Diag &d1, const Diag &d2); unsigned DiagBreak(const Diag &d1, const Diag &d2); bool DiagCompatible(const Diag &d1, const Diag &d2); void CheckDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const MSA &msaA, const MSA &msaB, const PWPath &Path); void FindDiags(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY, unsigned uLengthY, DiagList &DL); void FindDiagsNuc(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY, unsigned uLengthY, DiagList &DL); void MergeDiags(DiagList &DL); #endif // diaglist_h muscle-3.8.31.orig/msf.cpp0000644000175000017500000000700511352261667014677 0ustar kratzcharles#include "muscle.h" #include #include #include "msa.h" #include "textfile.h" const int MAX_NAME = 63; const unsigned uCharsPerLine = 50; const unsigned uCharsPerBlock = 10; // 
Truncate at first white space or MAX_NAME, whichever comes // first, then pad with blanks up to PadLength. static const char *GetPaddedName(const char *Name, int PadLength) { static char PaddedName[MAX_NAME+1]; memset(PaddedName, ' ', MAX_NAME); size_t n = strcspn(Name, " \t"); memcpy(PaddedName, Name, n); PaddedName[PadLength] = 0; return PaddedName; } static const char *strfind(const char *s, const char *t) { size_t n = strcspn(s, t); if (0 == n) return 0; return s + n; } // GCG checksum code kindly provided by Eric Martel. unsigned MSA::GetGCGCheckSum(unsigned uSeqIndex) const { unsigned CheckSum = 0; const unsigned uColCount = GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { unsigned c = (unsigned) GetChar(uSeqIndex, uColIndex); CheckSum += c*(uColIndex%57 + 1); CheckSum %= 10000; } return CheckSum; } static void MSFFixGaps(MSA &a) { const int SeqCount = a.GetSeqCount(); const int ColCount = a.GetColCount(); for (int SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { for (int ColIndex = 0; ColIndex < ColCount; ++ColIndex) if (a.IsGap(SeqIndex, ColIndex)) a.SetChar(SeqIndex, ColIndex, '.'); } } void MSA::ToMSFFile(TextFile &File, const char *ptrComment) const { // Cast away const, yuck SetMSAWeightsMuscle((MSA &) *this); MSFFixGaps((MSA &) *this); File.PutString("PileUp\n"); if (0 != ptrComment) File.PutFormat("Comment: %s\n", ptrComment); else File.PutString("\n"); char seqtype = (g_Alpha == ALPHA_DNA || g_Alpha == ALPHA_RNA) ? 
'N' : 'A'; File.PutFormat(" MSF: %u Type: %c Check: 0000 ..\n\n", GetColCount(), seqtype); int iLongestNameLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char *Name = GetSeqName(uSeqIndex); const char *PaddedName = GetPaddedName(Name, MAX_NAME); int iLength = (int) strcspn(PaddedName, " \t"); if (iLength > iLongestNameLength) iLongestNameLength = iLength; } for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char *Name = GetSeqName(uSeqIndex); const char *PaddedName = GetPaddedName(Name, iLongestNameLength); File.PutFormat(" Name: %s", PaddedName); File.PutFormat(" Len: %u Check: %5u Weight: %g\n", GetColCount(), GetGCGCheckSum(uSeqIndex), GetSeqWeight(uSeqIndex)); } File.PutString("\n//\n"); if (0 == GetColCount()) return; unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1; for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex) { File.PutString("\n"); unsigned uStartColIndex = uLineIndex*uCharsPerLine; unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1; if (uEndColIndex >= GetColCount()) uEndColIndex = GetColCount() - 1; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char *Name = GetSeqName(uSeqIndex); const char *PaddedName = GetPaddedName(Name, iLongestNameLength); File.PutFormat("%s ", PaddedName); for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; ++uColIndex) { if (0 == uColIndex%uCharsPerBlock) File.PutString(" "); char c = GetChar(uSeqIndex, uColIndex); File.PutFormat("%c", c); } File.PutString("\n"); } } } muscle-3.8.31.orig/distfunc.h0000644000175000017500000000144711352261600015365 0ustar kratzcharles#ifndef DistFunc_h #define DistFunc_h class DistFunc { public: DistFunc(); virtual ~DistFunc(); public: virtual void SetCount(unsigned uCount); virtual void SetDist(unsigned uIndex1, unsigned uIndex2, float dDist); void SetName(unsigned uIndex, const char szName[]); void SetId(unsigned uIndex, unsigned uId); const 
char *GetName(unsigned uIndex) const; unsigned GetId(unsigned uIndex) const; virtual float GetDist(unsigned uIndex1, unsigned uIndex2) const; virtual unsigned GetCount() const; void LogMe() const; protected: unsigned VectorIndex(unsigned uIndex, unsigned uIndex2) const; unsigned VectorLength() const; private: unsigned m_uCount; unsigned m_uCacheCount; float *m_Dists; char **m_Names; unsigned *m_Ids; }; #endif // DistFunc_h muscle-3.8.31.orig/muscle21.vcproj0000644000175000017500000002065711352261611016263 0ustar kratzcharles muscle-3.8.31.orig/blosumla.cpp0000644000175000017500000001505711352261600015721 0ustar kratzcharles#include "muscle.h" #define GAPVAL 0.3 #define GAPGAPVAL 5.0 // Blosum62 log-average factor matrix static float Blosum62LA[20][20] = { #define v(x) ((float) x) #define S_ROW(n, c, A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ v(R), v(S), v(T), v(V), v(W), v(Y) }, // Blosum62 log average matrix // A C D E F // G H I K L // M N P Q R // S T V W Y S_ROW( 0, 'A', 3.9029401, 0.8679881, 0.5446049, 0.7412640, 0.4648942, 1.0568696, 0.5693654, 0.6324813, 0.7753898, 0.6019460, 0.7231498, 0.5883077, 0.7541214, 0.7568035, 0.6126988, 1.4721037, 0.9844022, 0.9364584, 0.4165484, 0.5426125) S_ROW( 1, 'C', 0.8679881, 19.5765802, 0.3014542, 0.2859347, 0.4389910, 0.4203886, 0.3550472, 0.6534589, 0.3491296, 0.6422760, 0.6113537, 0.3978026, 0.3795628, 0.3657796, 0.3089379, 0.7384148, 0.7405530, 0.7558448, 0.4499807, 0.4342013) S_ROW( 2, 'D', 0.5446049, 0.3014542, 7.3979253, 1.6878109, 0.2989696, 0.6343015, 0.6785593, 0.3390155, 0.7840905, 0.2866128, 0.3464547, 1.5538520, 0.5987177, 0.8970811, 0.5732000, 0.9135051, 0.6947898, 0.3365004, 0.2321050, 0.3456829) S_ROW( 3, 'E', 0.7412640, 0.2859347, 1.6878109, 5.4695276, 0.3307441, 0.4812675, 0.9600400, 0.3305223, 1.3082782, 0.3728734, 0.5003421, 0.9112983, 0.6792027, 1.9017376, 0.9607983, 0.9503570, 0.7414260, 
0.4289431, 0.3743021, 0.4964664) S_ROW( 4, 'F', 0.4648942, 0.4389910, 0.2989696, 0.3307441, 8.1287983, 0.3406407, 0.6519893, 0.9457698, 0.3440433, 1.1545978, 1.0043715, 0.3542882, 0.2874440, 0.3339729, 0.3807263, 0.4399736, 0.4816930, 0.7450894, 1.3743775, 2.7693817) S_ROW( 5, 'G', 1.0568696, 0.4203886, 0.6343015, 0.4812675, 0.3406407, 6.8763075, 0.4929663, 0.2750096, 0.5888716, 0.2845039, 0.3954865, 0.8637114, 0.4773858, 0.5386498, 0.4499840, 0.9035965, 0.5792712, 0.3369551, 0.4216898, 0.3487141) S_ROW( 6, 'H', 0.5693654, 0.3550472, 0.6785593, 0.9600400, 0.6519893, 0.4929663, 13.5060070, 0.3262878, 0.7788884, 0.3806759, 0.5841316, 1.2220028, 0.4728797, 1.1679835, 0.9170473, 0.7367319, 0.5575021, 0.3394474, 0.4440859, 1.7979036) S_ROW( 7, 'I', 0.6324813, 0.6534589, 0.3390155, 0.3305223, 0.9457698, 0.2750096, 0.3262878, 3.9979299, 0.3963730, 1.6944349, 1.4777449, 0.3279345, 0.3846629, 0.3829375, 0.3547509, 0.4431634, 0.7798163, 2.4175121, 0.4088732, 0.6303898) S_ROW( 8, 'K', 0.7753898, 0.3491296, 0.7840905, 1.3082782, 0.3440433, 0.5888716, 0.7788884, 0.3963730, 4.7643359, 0.4282702, 0.6253033, 0.9398419, 0.7037741, 1.5543233, 2.0768092, 0.9319192, 0.7929060, 0.4565429, 0.3589319, 0.5321784) S_ROW( 9, 'L', 0.6019460, 0.6422760, 0.2866128, 0.3728734, 1.1545978, 0.2845039, 0.3806759, 1.6944349, 0.4282702, 3.7966214, 1.9942957, 0.3100430, 0.3711219, 0.4773261, 0.4739194, 0.4288939, 0.6603292, 1.3142355, 0.5680359, 0.6920589) S_ROW(10, 'M', 0.7231498, 0.6113537, 0.3464547, 0.5003421, 1.0043715, 0.3954865, 0.5841316, 1.4777449, 0.6253033, 1.9942957, 6.4814549, 0.4745299, 0.4238960, 0.8642486, 0.6226249, 0.5985578, 0.7938018, 1.2689365, 0.6103022, 0.7083636) S_ROW(11, 'N', 0.5883077, 0.3978026, 1.5538520, 0.9112983, 0.3542882, 0.8637114, 1.2220028, 0.3279345, 0.9398419, 0.3100430, 0.4745299, 7.0940964, 0.4999337, 1.0005835, 0.8586298, 1.2315289, 0.9841525, 0.3690340, 0.2777841, 0.4860309) S_ROW(12, 'P', 0.7541214, 0.3795628, 0.5987177, 0.6792027, 0.2874440, 0.4773858, 
0.4728797, 0.3846629, 0.7037741, 0.3711219, 0.4238960, 0.4999337, 12.8375452, 0.6412803, 0.4815348, 0.7555033, 0.6888962, 0.4430825, 0.2818321, 0.3635216) S_ROW(13, 'Q', 0.7568035, 0.3657796, 0.8970811, 1.9017376, 0.3339729, 0.5386498, 1.1679835, 0.3829375, 1.5543233, 0.4773261, 0.8642486, 1.0005835, 0.6412803, 6.2444210, 1.4057958, 0.9655559, 0.7913219, 0.4667781, 0.5093584, 0.6110951) S_ROW(14, 'R', 0.6126988, 0.3089379, 0.5732000, 0.9607983, 0.3807263, 0.4499840, 0.9170473, 0.3547509, 2.0768092, 0.4739194, 0.6226249, 0.8586298, 0.4815348, 1.4057958, 6.6655769, 0.7671661, 0.6777544, 0.4200721, 0.3951049, 0.5559652) S_ROW(15, 'S', 1.4721037, 0.7384148, 0.9135051, 0.9503570, 0.4399736, 0.9035965, 0.7367319, 0.4431634, 0.9319192, 0.4288939, 0.5985578, 1.2315289, 0.7555033, 0.9655559, 0.7671661, 3.8428476, 1.6139205, 0.5652240, 0.3853031, 0.5575206) S_ROW(16, 'T', 0.9844022, 0.7405530, 0.6947898, 0.7414260, 0.4816930, 0.5792712, 0.5575021, 0.7798163, 0.7929060, 0.6603292, 0.7938018, 0.9841525, 0.6888962, 0.7913219, 0.6777544, 1.6139205, 4.8321048, 0.9809432, 0.4309317, 0.5731577) S_ROW(17, 'V', 0.9364584, 0.7558448, 0.3365004, 0.4289431, 0.7450894, 0.3369551, 0.3394474, 2.4175121, 0.4565429, 1.3142355, 1.2689365, 0.3690340, 0.4430825, 0.4667781, 0.4200721, 0.5652240, 0.9809432, 3.6921553, 0.3744576, 0.6580390) S_ROW(18, 'W', 0.4165484, 0.4499807, 0.2321050, 0.3743021, 1.3743775, 0.4216898, 0.4440859, 0.4088732, 0.3589319, 0.5680359, 0.6103022, 0.2777841, 0.2818321, 0.5093584, 0.3951049, 0.3853031, 0.4309317, 0.3744576, 38.1077830, 2.1098056) S_ROW(19, 'Y', 0.5426125, 0.4342013, 0.3456829, 0.4964664, 2.7693817, 0.3487141, 1.7979036, 0.6303898, 0.5321784, 0.6920589, 0.7083636, 0.4860309, 0.3635216, 0.6110951, 0.5559652, 0.5575206, 0.5731577, 0.6580390, 2.1098056, 9.8322054) }; muscle-3.8.31.orig/pwpath.h0000644000175000017500000000457611352261623015064 0ustar kratzcharles#ifndef PWPath_h #define PWPath_h /*** Each PWEdge in a PWPath specifies a column in a pair-wise 
(PW) alignment. "Path" is by analogy with the path through an HMM. Edge types are: 'M' LetterA + LetterB 'D' LetterA + GapB 'I' GapA + LetterB The mnemonic is Match, Delete, Insert (with respect to A). Here is a global alignment of sequences A and B. A: AMQT-F B: -M-TIF The path for this example is: Edge cType uPrefixLengthA uPrefixLengthB 0 D 1 0 1 M 2 1 2 D 3 1 3 M 4 2 4 I 4 3 5 M 5 4 Given the starting positions in each alignment (e.g., column zero for a global alignment), the prefix length fields are redundant; they are included only for convenience and as a sanity check, we are not trying to optimize for speed or space here. We use prefix lengths rather than column indexes because of the problem of representing the special case of a gap in the first position. ***/ class Seq; class MSA; class SatchmoParams; class PW; class TextFile; class PWScore; class PWEdge { public: char cType; unsigned uPrefixLengthA; unsigned uPrefixLengthB; bool Equal(const PWEdge &e) const { return uPrefixLengthA == e.uPrefixLengthA && uPrefixLengthB == e.uPrefixLengthB && cType == e.cType; } }; class PWPath { // Disable compiler defaults private: PWPath &operator=(const PWPath &rhs); PWPath(const PWPath &rhs); public: PWPath(); virtual ~PWPath(); public: void Clear(); void FromStr(const char Str[]); void Copy(const PWPath &Path); void AppendEdge(const PWEdge &Edge); void AppendEdge(char cType, unsigned uPrefixLengthA, unsigned uPrefixLengthB); void PrependEdge(const PWEdge &Edge); unsigned GetEdgeCount() const { return m_uEdgeCount; } const PWEdge &GetEdge(unsigned uEdgeIndex) const; void Validate(const PWScore &PWS) const; void Validate() const; void LogMe() const; void FromFile(TextFile &File); void ToFile(TextFile &File) const; void FromMSAPair(const MSA &msaA, const MSA &msaB); void AssertEqual(const PWPath &Path) const; bool Equal(const PWPath &Path) const; unsigned GetMatchCount() const; unsigned GetDeleteCount() const; unsigned GetInsertCount() const; private: void
ExpandPath(unsigned uAdditionalEdgeCount); private: unsigned m_uEdgeCount; unsigned m_uArraySize; PWEdge *m_Edges; }; #endif // PWPath_h muscle-3.8.31.orig/globalswin32.cpp0000644000175000017500000000453111352261667016421 0ustar kratzcharles#include "muscle.h" #if WIN32 #include #include #include #include #include void DebugPrintf(const char *szFormat, ...) { va_list ArgList; char szStr[4096]; va_start(ArgList, szFormat); vsprintf(szStr, szFormat, ArgList); OutputDebugString(szStr); } double GetNAN() { static unsigned long nan[2]={0xffffffff, 0x7fffffff}; double dNAN = *( double* )nan; assert(_isnan(dNAN)); return dNAN; } double g_dNAN = GetNAN(); void chkmem(const char szMsg[]) { if (!_CrtCheckMemory()) Quit("chkmem(%s)", szMsg); } void Break() { if (IsDebuggerPresent()) DebugBreak(); } const char *GetCmdLine() { return GetCommandLine(); } static unsigned uPeakMemUseBytes; double GetRAMSizeMB() { MEMORYSTATUS MS; GlobalMemoryStatus(&MS); return MS.dwAvailPhys/1e6; } double GetMemUseMB() { HANDLE hProc = GetCurrentProcess(); PROCESS_MEMORY_COUNTERS PMC; BOOL bOk = GetProcessMemoryInfo(hProc, &PMC, sizeof(PMC)); assert(bOk); //printf("GetMemUseMB()\n"); //printf("%12u PageFaultCount\n", (unsigned) PMC.PageFaultCount); //printf("%12u PagefileUsage\n", (unsigned) PMC.PagefileUsage); //printf("%12u PeakPagefileUsage\n", (unsigned) PMC.PeakPagefileUsage); //printf("%12u WorkingSetSize\n", (unsigned) PMC.WorkingSetSize); //printf("%12u PeakWorkingSetSize\n", (unsigned) PMC.PeakWorkingSetSize); //printf("%12u QuotaPagedPoolUsage\n", (unsigned) PMC.QuotaPagedPoolUsage); //printf("%12u QuotaPeakPagedPoolUsage\n", (unsigned) PMC.QuotaPeakPagedPoolUsage); //printf("%12u QuotaNonPagedPoolUsage\n", (unsigned) PMC.QuotaNonPagedPoolUsage); //printf("%12u QuotaPeakNonPagedPoolUsage\n", (unsigned) PMC.QuotaPeakNonPagedPoolUsage); unsigned uBytes = (unsigned) PMC.WorkingSetSize; if (uBytes > uPeakMemUseBytes) uPeakMemUseBytes = uBytes; return (uBytes + 500000.0)/1000000.0; } double 
GetPeakMemUseMB() { return (uPeakMemUseBytes + 500000.0)/1000000.0; } void CheckMemUse() { // Side-effect: sets peak usage in uPeakMemUseBytes GetMemUseMB(); } double GetCPUGHz() { double dGHz = 2.5; const char *e = getenv("CPUGHZ"); if (0 != e) dGHz = atof(e); if (dGHz < 0.1 || dGHz > 1000.0) Quit("Invalid value '%s' for environment variable CPUGHZ", e); return dGHz; } #endif // WIN32 muscle-3.8.31.orig/refine.cpp0000644000175000017500000000322711352261673015361 0ustar kratzcharles#include "muscle.h" #include "textfile.h" #include "seqvect.h" #include "distfunc.h" #include "msa.h" #include "tree.h" #include "clust.h" #include "profile.h" #include "clustsetmsa.h" void Refine() { SetOutputFileName(g_pstrOutFileName); SetInputFileName(g_pstrInFileName); SetStartTime(); SetMaxIters(g_uMaxIters); SetSeqWeightMethod(g_SeqWeight1); TextFile fileIn(g_pstrInFileName); MSA msa; msa.FromFile(fileIn); const unsigned uSeqCount = msa.GetSeqCount(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = msa.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); msa.FixAlpha(); SetPPScore(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) SetPPScore(PPSCORE_SPN); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. 
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); SetMuscleInputMSA(msa); Tree GuideTree; TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2); SetMuscleTree(GuideTree); if (g_bAnchors) RefineVert(msa, GuideTree, g_uMaxIters); else RefineHoriz(msa, GuideTree, g_uMaxIters, false, false); ValidateMuscleIds(msa); ValidateMuscleIds(GuideTree); // TextFile fileOut(g_pstrOutFileName, true); // msa.ToFile(fileOut); MuscleOutput(msa); } muscle-3.8.31.orig/local.cpp0000644000175000017500000000403111352261667015200 0ustar kratzcharles#include "muscle.h" #include "textfile.h" #include "msa.h" #include "profile.h" #include "pwpath.h" #include "tree.h" #define TRACE 0 static void MSAFromFileName(const char *FileName, MSA &a) { TextFile File(FileName); a.FromFile(File); } static ProfPos *ProfileFromMSALocal(MSA &msa, Tree &tree) { const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); TreeFromMSA(msa, tree, g_Cluster1, g_Distance1, g_Root1); SetMuscleTree(tree); return ProfileFromMSA(msa); } void Local() { if (0 == g_pstrFileName1 || 0 == g_pstrFileName2) Quit("Must specify both -in1 and -in2 for -sw"); SetSeqWeightMethod(g_SeqWeight1); MSA msa1; MSA msa2; MSAFromFileName(g_pstrFileName1, msa1); MSAFromFileName(g_pstrFileName2, msa2); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = msa1.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); msa1.FixAlpha(); msa2.FixAlpha(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) SetPPScore(PPSCORE_SPN); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); const unsigned uMaxSeqCount = (uSeqCount1 > uSeqCount2 ? 
uSeqCount1 : uSeqCount2); MSA::SetIdCount(uMaxSeqCount); unsigned uLength1 = msa1.GetColCount(); unsigned uLength2 = msa2.GetColCount(); Tree tree1; Tree tree2; ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1); ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2); PWPath Path; SW(Prof1, uLength1, Prof2, uLength2, Path); #if TRACE Path.LogMe(); #endif MSA msaOut; AlignTwoMSAsGivenPathSW(Path, msa1, msa2, msaOut); #if TRACE msaOut.LogMe(); #endif TextFile fileOut(g_pstrOutFileName, true); msaOut.ToFile(fileOut); } muscle-3.8.31.orig/henikoffweightpb.cpp0000644000175000017500000000714111352261600017421 0ustar kratzcharles#include "muscle.h" #include "msa.h" /*** Compute Henikoff weights. Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights. J. Mol. Biol., 243(4):574-578. Award each different residue an equal share of the weight, and then to divide up that weight equally among the sequences sharing the same residue. So if in a position of a multiple alignment, r different residues are represented, a residue represented in only one sequence contributes a score of 1/r to that sequence, whereas a residue represented in s sequences contributes a score of 1/rs to each of the s sequences. For each sequence, the contributions from each position are summed to give a sequence weight. Here we use the variant from PSI-BLAST, which (a) treats gaps as a 21st letter, and (b) ignores columns that are perfectly conserved. 
>>> WARNING -- I SUSPECT THIS DOESN'T WORK CORRECTLY <<< ***/ void MSA::CalcHenikoffWeightsColPB(unsigned uColIndex) const { const unsigned uSeqCount = GetSeqCount(); // Compute letter counts in this column unsigned uLetterCount[MAX_ALPHA+1]; memset(uLetterCount, 0, (MAX_ALPHA+1)*sizeof(unsigned)); unsigned uLetter; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) uLetter = MAX_ALPHA; else uLetter = GetLetter(uSeqIndex, uColIndex); ++(uLetterCount[uLetter]); } // Check for special case of perfect conservation for (unsigned uLetter = 0; uLetter < MAX_ALPHA+1; ++uLetter) { unsigned uCount = uLetterCount[uLetter]; if (uCount > 0) { // Perfectly conserved? if (uCount == uSeqCount) return; else // If count > 0 but less than nr. sequences, can't be conserved break; } } // Compute weight contributions for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uLetter; if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) uLetter = MAX_ALPHA; else uLetter = GetLetter(uSeqIndex, uColIndex); const unsigned uCount = uLetterCount[uLetter]; m_Weights[uSeqIndex] += (WEIGHT) (1.0/uCount); } } bool MSA::IsGapSeq(unsigned uSeqIndex) const { const unsigned uColCount = GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) if (!IsGap(uSeqIndex, uColIndex)) return false; return true; } void MSA::SetUniformWeights() const { const unsigned uSeqCount = GetSeqCount(); if (0 == uSeqCount) return; const WEIGHT w = (WEIGHT) (1.0 / uSeqCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) m_Weights[uSeqIndex] = w; } void MSA::SetHenikoffWeightsPB() const { const unsigned uColCount = GetColCount(); const unsigned uSeqCount = GetSeqCount(); if (0 == uSeqCount) return; else if (1 == uSeqCount) { m_Weights[0] = 1.0; return; } else if (2 == uSeqCount) { m_Weights[0] = (WEIGHT) 0.5; m_Weights[1] = (WEIGHT) 0.5; return; } for (unsigned 
uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) m_Weights[uSeqIndex] = 0.0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) CalcHenikoffWeightsColPB(uColIndex); // Set all-gap seqs weight to 0 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGapSeq(uSeqIndex)) m_Weights[uSeqIndex] = 0.0; // Check for special case of identical sequences, which will cause all // columns to be skipped becasue they're perfectly conserved. if (VectorIsZero(m_Weights, uSeqCount)) VectorSet(m_Weights, uSeqCount, 1.0); Normalize(m_Weights, uSeqCount); } muscle-3.8.31.orig/tracebacksw.cpp0000644000175000017500000001151411352261617016376 0ustar kratzcharles#include "muscle.h" #include "profile.h" #include "pwpath.h" #include #define TRACE 0 #define EQ(a, b) (fabs(a-b) < 0.1) void TraceBackSW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, unsigned uPrefixLengthAMax, unsigned uPrefixLengthBMax, PWPath &Path) { #if TRACE Log("\n"); Log("TraceBackSW LengthA=%u LengthB=%u PLAMax=%u PLBMax=%u\n", uLengthA, uLengthB, uPrefixLengthAMax, uPrefixLengthBMax); #endif assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; Path.Clear(); unsigned uPrefixLengthA = uPrefixLengthAMax; unsigned uPrefixLengthB = uPrefixLengthBMax; SCORE scoreMax = DPM(uPrefixLengthA, uPrefixLengthB); char cEdgeType = 'M'; for (;;) { if ('S' == cEdgeType) break; PWEdge Edge; Edge.cType = cEdgeType; Edge.uPrefixLengthA = uPrefixLengthA; Edge.uPrefixLengthB = uPrefixLengthB; Path.PrependEdge(Edge); char cPrevEdgeType; unsigned uPrevPrefixLengthA = uPrefixLengthA; unsigned uPrevPrefixLengthB = uPrefixLengthB; switch (cEdgeType) { case 'M': { assert(uPrefixLengthA > 0); assert(uPrefixLengthB > 0); const ProfPos &PPA = PA[uPrefixLengthA - 1]; const ProfPos &PPB = PB[uPrefixLengthB - 1]; const SCORE Score = DPM(uPrefixLengthA, 
uPrefixLengthB); const SCORE scoreMatch = ScoreProfPos2(PPA, PPB); SCORE scoreSM; if (1 == uPrefixLengthA && 1 == uPrefixLengthB) scoreSM = scoreMatch; else scoreSM = MINUS_INFINITY; SCORE scoreMM = MINUS_INFINITY; SCORE scoreDM = MINUS_INFINITY; SCORE scoreIM = MINUS_INFINITY; if (uPrefixLengthA > 1 && uPrefixLengthB > 1) { SCORE scoreTrans = DPM(uPrefixLengthA-1, uPrefixLengthB-1); scoreMM = scoreTrans + scoreMatch; } if (uPrefixLengthA > 1) { SCORE scoreTransDM = PA[uPrefixLengthA-2].m_scoreGapClose; scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransDM + scoreMatch; } if (uPrefixLengthB > 1) { SCORE scoreTransIM = PB[uPrefixLengthB-2].m_scoreGapClose; scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransIM + scoreMatch; } if (EQ(scoreMM, Score)) cPrevEdgeType = 'M'; else if (EQ(scoreDM, Score)) cPrevEdgeType = 'D'; else if (EQ(scoreIM, Score)) cPrevEdgeType = 'I'; else if (EQ(scoreSM, Score)) cPrevEdgeType = 'S'; else if (EQ(scoreMatch, Score)) cPrevEdgeType = 'S'; else Quit("TraceBack2: failed to match M score=%g M=%g D=%g I=%g S=%g", Score, scoreMM, scoreDM, scoreIM, scoreSM); --uPrevPrefixLengthA; --uPrevPrefixLengthB; break; } case 'D': { assert(uPrefixLengthA > 0); const SCORE Score = DPD(uPrefixLengthA, uPrefixLengthB); SCORE scoreMD = MINUS_INFINITY; SCORE scoreDD = MINUS_INFINITY; SCORE scoreSD = MINUS_INFINITY; if (uPrefixLengthB == 0) { if (uPrefixLengthA == 1) scoreSD = PA[0].m_scoreGapOpen; else scoreSD = DPD(uPrefixLengthA - 1, 0); } if (uPrefixLengthA > 1) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; SCORE scoreTransMD = PPA.m_scoreGapOpen; scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + scoreTransMD; scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB); } if (EQ(Score, scoreMD)) cPrevEdgeType = 'M'; else if (EQ(Score, scoreDD)) cPrevEdgeType = 'D'; else if (EQ(Score, scoreSD)) cPrevEdgeType = 'S'; else Quit("TraceBack2: failed to match D"); --uPrevPrefixLengthA; break; } case 'I': { assert(uPrefixLengthB > 0); const SCORE Score 
= DPI(uPrefixLengthA, uPrefixLengthB); SCORE scoreMI = MINUS_INFINITY; SCORE scoreII = MINUS_INFINITY; SCORE scoreSI = MINUS_INFINITY; if (uPrefixLengthA == 0) { if (uPrefixLengthB == 1) scoreSI = PB[0].m_scoreGapOpen; else scoreSI = DPI(0, uPrefixLengthB - 1); } if (uPrefixLengthB > 1) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreTransMI = PPB.m_scoreGapOpen; scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + scoreTransMI; scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1); } if (EQ(Score, scoreMI)) cPrevEdgeType = 'M'; else if (EQ(Score, scoreII)) cPrevEdgeType = 'I'; else if (EQ(Score, scoreSI)) cPrevEdgeType = 'S'; else Quit("TraceBack2: failed to match I"); --uPrevPrefixLengthB; break; } default: assert(false); } #if TRACE Log("Edge %c%c%u.%u", cPrevEdgeType, cEdgeType, uPrefixLengthA, uPrefixLengthB); Log("\n"); #endif cEdgeType = cPrevEdgeType; uPrefixLengthA = uPrevPrefixLengthA; uPrefixLengthB = uPrevPrefixLengthB; } } muscle-3.8.31.orig/gonnet.h0000644000175000017500000000037111352261600015033 0ustar kratzcharles#ifndef Gonnet_h #define Gonnet_h typedef double t_ROW[20]; const t_ROW *GetGonnetMatrix(unsigned N); SCORE GetGonnetGapOpen(unsigned N); SCORE GetGonnetGapExtend(unsigned N); extern double GonnetLookup[400][400]; #endif // Gonnet_h muscle-3.8.31.orig/timing.h0000644000175000017500000000055411352261600015033 0ustar kratzcharles#if WIN32 typedef unsigned __int64 TICKS; #pragma warning(disable:4035) inline TICKS GetClockTicks() { _asm { _emit 0x0f _emit 0x31 } } #define StartTimer() __int64 t1__ = GetClockTicks() #define GetElapsedTicks() (GetClockTicks() - t1__) static double TicksToSecs(TICKS t) { return (__int64) t/2.5e9; } #endif // WIN32 muscle-3.8.31.orig/main.cpp0000644000175000017500000000244211367131123015023 0ustar kratzcharles//@@TODO reconcile /muscle with /muscle3.6 #include "muscle.h" #include #ifdef WIN32 #include // for SetPriorityClass() #include // for isatty() #else #include // for isatty() #endif const char 
*MUSCLE_LONG_VERSION = "MUSCLE v" SHORT_VERSION "." #include "svnversion.h" " by Robert C. Edgar"; int g_argc; char **g_argv; int main(int argc, char **argv) { #if WIN32 // Multi-tasking does not work well in CPU-bound // console apps running under Win32. // Reducing the process priority allows GUI apps // to run responsively in parallel. SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS); #endif g_argc = argc; g_argv = argv; SetNewHandler(); SetStartTime(); ProcessArgVect(argc - 1, argv + 1); SetParams(); SetLogFile(); //extern void TestSubFams(const char *); //TestSubFams(g_pstrInFileName); //return 0; if (g_bVersion) { printf("%s\n", MUSCLE_LONG_VERSION); exit(EXIT_SUCCESS); } if (!g_bQuiet) Credits(); if (MissingCommand() && isatty(0)) { Usage(); exit(EXIT_SUCCESS); } if (g_bCatchExceptions) { try { Run(); } catch (...) { OnException(); exit(EXIT_Except); } } else Run(); exit(EXIT_Success); } muscle-3.8.31.orig/setnewhandler.cpp0000644000175000017500000000114411352261673016750 0ustar kratzcharles#include "muscle.h" #include #include const int ONE_MB = 1024*1024; const size_t RESERVE_BYTES = 8*ONE_MB; static void *EmergencyReserve = 0; void OnOutOfMemory() { free(EmergencyReserve); fprintf(stderr, "\n*** OUT OF MEMORY ***\n"); fprintf(stderr, "Memory allocated so far %g MB\n", GetMemUseMB()); extern MSA *ptrBestMSA; if (ptrBestMSA == 0) fprintf(stderr, "No alignment generated\n"); else SaveCurrentAlignment(); exit(EXIT_FatalError); } void SetNewHandler() { EmergencyReserve = malloc(RESERVE_BYTES); std::set_new_handler(OnOutOfMemory); } muscle-3.8.31.orig/nwrec.cpp0000644000175000017500000000460011352261673015223 0ustar kratzcharles/*** Needleman-Wunch recursions Notation: i,j are prefix lengths so are in ranges i = [0,|A|] and j = [0,|B|]. Profile positions are in ranges [0,|A|-1] and [0,|B|-1] so prefix length i corresponds to position (i-1) in the profile, and similarly for j. 
Terminal gap scoring -------------------- Terminal gaps are scored as with open [close] penalties only at the left [right] terminal, as follows: 0 i | | A XXXXX... B ---XX... i |A|-1 | | A ...XXXXX B ...XX--- In these examples, open / close penalty at position i is included, but close / open penalty at |A|-1 / 0 is not included. This is implemented by setting the open [close] penalty to zero in the first [last] position of each profile. Consider adding a column to a sub-alignment. After the column is added, there are i letters from A and j letters from B. The column starts a left-terminal gap if: Delete with i=1, j=0 or Insert with i=0, j=1. The column ends a left-terminal gap if: Match following Delete with j=1, or Match following Insert with i=1. The column starts a right-terminal gap if: Delete following a Match and i=|A|, or Insert following a Match and j=|B|. The column ends a right-terminal gap if: Match with i=|A|, j=|B| following Delete or Insert. RECURSION RELATIONS =================== i-1 | DD A ..X X B ..- - MD A ..X X B ..X - D(i,j) = max D(i-1,j) + e M(i-1,j) + goA(i-1) Valid for: i = [1,|A|-1] j = [1,|B|] I(i,j) By symmetry with D(i,j). i-2 | i-1 | | MM A ..X X B ..X X DM A ..X X B ..- X IM A ..- X B ..X X | | | j-1 j-2 M(i,j) = L(i-1,j-1) + max M(i-1,j-1) D(i-1,j-1) + gcA(i-2) I(i-1,j-1) + gcB(j-2) Valid for: i = [2,|A|] j = [2,|B|] Equivalently: M(i+1,j+1) = L(i,j) + max M(i,j) D(i,j) + gcA(i-1) I(i,j) + gcB(j-1) Valid for: i = [1,|A|-1] j = [1,|B|-1] Boundary conditions =================== A XXXX B ---- D(0,0) = -infinity D(i,0) = ie i = [1,|A|] D(0,j) = -infinity j = [0,|B|] I(0,0), I(0,j) and I(i,0) by symmetry with D. M(0,0) = 0 M(i,0) = -infinity, i > 0 M(0,j) = -infinity, j > 0 A X B - D(1,0) = e D(1,j) = -infinity, j = [1,|B|] (assuming no I-D allowed). D(0,1) = -infinity D(1,1) = -infinity D(i,1) = max. 
***/ muscle-3.8.31.orig/refinevert.cpp0000644000175000017500000000776511352261623016270 0ustar kratzcharles#include "muscle.h" #include "profile.h" #include "msa.h" #include "pwpath.h" #include "seqvect.h" #include "clust.h" #include "tree.h" #define TRACE 0 struct Range { unsigned m_uBestColLeft; unsigned m_uBestColRight; }; static void ListVertSavings(unsigned uColCount, unsigned uAnchorColCount, const Range *Ranges, unsigned uRangeCount) { if (!g_bVerbose || !g_bAnchors) return; double dTotalArea = uColCount*uColCount; double dArea = 0.0; for (unsigned i = 0; i < uRangeCount; ++i) { unsigned uLength = Ranges[i].m_uBestColRight - Ranges[i].m_uBestColLeft; dArea += uLength*uLength; } double dPct = (dTotalArea - dArea)*100.0/dTotalArea; Log("Anchor columns found %u\n", uAnchorColCount); Log("DP area saved by anchors %-4.1f%%\n", dPct); } static void ColsToRanges(const unsigned BestCols[], unsigned uBestColCount, unsigned uColCount, Range Ranges[]) { // N best columns produces N+1 vertical blocks. 
const unsigned uRangeCount = uBestColCount + 1; for (unsigned uIndex = 0; uIndex < uRangeCount ; ++uIndex) { unsigned uBestColLeft = 0; if (uIndex > 0) uBestColLeft = BestCols[uIndex-1]; unsigned uBestColRight = uColCount; if (uIndex < uBestColCount) uBestColRight = BestCols[uIndex]; Ranges[uIndex].m_uBestColLeft = uBestColLeft; Ranges[uIndex].m_uBestColRight = uBestColRight; } } // Return true if any changes made bool RefineVert(MSA &msaIn, const Tree &tree, unsigned uIters) { bool bAnyChanges = false; const unsigned uColCountIn = msaIn.GetColCount(); const unsigned uSeqCountIn = msaIn.GetSeqCount(); if (uColCountIn < 3 || uSeqCountIn < 3) return false; unsigned *AnchorCols = new unsigned[uColCountIn]; unsigned uAnchorColCount; SetMSAWeightsMuscle(msaIn); FindAnchorCols(msaIn, AnchorCols, &uAnchorColCount); const unsigned uRangeCount = uAnchorColCount + 1; Range *Ranges = new Range[uRangeCount]; #if TRACE Log("%u ranges\n", uRangeCount); #endif ColsToRanges(AnchorCols, uAnchorColCount, uColCountIn, Ranges); ListVertSavings(uColCountIn, uAnchorColCount, Ranges, uRangeCount); #if TRACE { Log("Anchor cols: "); for (unsigned i = 0; i < uAnchorColCount; ++i) Log(" %u", AnchorCols[i]); Log("\n"); Log("Ranges:\n"); for (unsigned i = 0; i < uRangeCount; ++i) Log("%4u - %4u\n", Ranges[i].m_uBestColLeft, Ranges[i].m_uBestColRight); } #endif delete[] AnchorCols; MSA msaOut; msaOut.SetSize(uSeqCountIn, 0); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCountIn; ++uSeqIndex) { const char *ptrName = msaIn.GetSeqName(uSeqIndex); unsigned uId = msaIn.GetSeqId(uSeqIndex); msaOut.SetSeqName(uSeqIndex, ptrName); msaOut.SetSeqId(uSeqIndex, uId); } for (unsigned uRangeIndex = 0; uRangeIndex < uRangeCount; ++uRangeIndex) { MSA msaRange; const Range &r = Ranges[uRangeIndex]; const unsigned uFromColIndex = r.m_uBestColLeft; const unsigned uRangeColCount = r.m_uBestColRight - uFromColIndex; if (0 == uRangeColCount) continue; else if (1 == uRangeColCount) { MSAFromColRange(msaIn, 
uFromColIndex, 1, msaRange); MSAAppend(msaOut, msaRange); continue; } MSAFromColRange(msaIn, uFromColIndex, uRangeColCount, msaRange); #if TRACE Log("\n-------------\n"); Log("Range %u - %u count=%u\n", r.m_uBestColLeft, r.m_uBestColRight, uRangeColCount); Log("Before:\n"); msaRange.LogMe(); #endif bool bLockLeft = (0 != uRangeIndex); bool bLockRight = (uRangeCount - 1 != uRangeIndex); bool bAnyChangesThisBlock = RefineHoriz(msaRange, tree, uIters, bLockLeft, bLockRight); bAnyChanges = (bAnyChanges || bAnyChangesThisBlock); #if TRACE Log("After:\n"); msaRange.LogMe(); #endif MSAAppend(msaOut, msaRange); #if TRACE Log("msaOut after Cat:\n"); msaOut.LogMe(); #endif } #if DEBUG // Sanity check AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut); #endif delete[] Ranges; if (bAnyChanges) msaIn.Copy(msaOut); return bAnyChanges; } muscle-3.8.31.orig/distpwkimura.cpp0000644000175000017500000000216611352261621016626 0ustar kratzcharles#include "muscle.h" #include "distfunc.h" #include "msa.h" #include "seqvect.h" #include "pwpath.h" void DistPWKimura(const SeqVect &v, DistFunc &DF) { SEQWEIGHT SeqWeightSave = GetSeqWeightMethod(); SetSeqWeightMethod(SEQWEIGHT_Henikoff); const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; unsigned uCount = 0; SetProgressDesc("PWKimura distance"); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const Seq &s1 = v.GetSeq(uSeqIndex1); MSA msa1; msa1.FromSeq(s1); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) { if (0 == uCount%20) Progress(uCount, uPairCount); ++uCount; const Seq &s2 = v.GetSeq(uSeqIndex2); MSA msa2; msa2.FromSeq(s2); PWPath Path; MSA msaOut; AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false); double dPctId = msaOut.GetPctIdentityPair(0, 1); float f = (float) KimuraDist(dPctId); DF.SetDist(uSeqIndex1, uSeqIndex2, f); } } ProgressStepsDone(); SetSeqWeightMethod(SeqWeightSave); } 
muscle-3.8.31.orig/textfile.cpp0000644000175000017500000001507511366141374015741 0ustar kratzcharles#include "muscle.h" #include "textfile.h" #include TextFile::TextFile(const char szFileName[], bool bWrite) { FILE *ptrFile = 0; if (bWrite) { if (0 == strcmp(szFileName, "-")) ptrFile = stdout; else ptrFile = fopen(szFileName, "wb"); } else { if (0 == strcmp(szFileName, "-")) ptrFile = stdin; else ptrFile = fopen(szFileName, "rb"); } if (0 == ptrFile) Quit("Cannot open '%s' errno=%d\n", szFileName, errno); Init(ptrFile, szFileName); } void TextFile::Init(FILE *ptrFile, const char *ptrFileName) { m_ptrFile = ptrFile; m_ptrName = strdup(ptrFileName); m_uLineNr = 1; m_uColNr = 0; m_bLastCharWasEOL = true; m_cPushedBack = -1; #if DEBUG setbuf(m_ptrFile, 0); #endif } TextFile::TextFile(FILE *ptrFile, const char *ptrFileName) { Init(ptrFile, "-"); } TextFile::~TextFile() { if (m_ptrFile && m_ptrFile != stdin && m_ptrFile != stdout && m_ptrFile != stderr) fclose(m_ptrFile); free(m_ptrName); } // Get line from file. // Return true if end-of-file, quit if line too long. bool TextFile::GetLine(char szLine[], unsigned uBytes) { if (0 == uBytes) Quit("TextFile::GetLine, buffer zero size"); int FillVal = 0; // suppress warning from gcc that I don't understand memset(szLine, FillVal, (size_t) uBytes); unsigned uBytesCopied = 0; // Loop until end of line or end of file. 
for (;;) { char c; bool bEof = GetChar(c); if (bEof) return true; if ('\r' == c) continue; if ('\n' == c) return false; if (uBytesCopied < uBytes - 1) szLine[uBytesCopied++] = (char) c; else Quit("TextFile::GetLine: input buffer too small, line %u", m_uLineNr); } } // As GetLine, but trim leading and trailing blanks; skip empty lines bool TextFile::GetTrimLine(char szLine[], unsigned uBytes) { if (uBytes == 0) Quit("GetTrimLine"); for (;;) { bool bEOF = GetLine(szLine, uBytes); if (bEOF) return true; TrimBlanks(szLine); if (0 != szLine[0]) break; } return false; } void TextFile::Rewind() { fseek(m_ptrFile, 0, SEEK_SET); m_uLineNr = 1; m_bLastCharWasEOL = true; } void TextFile::PutChar(char c) { int i = fputc(c, m_ptrFile); assert(i == c); if ('\n' == c) { ++m_uLineNr; m_uColNr = 1; } else ++m_uColNr; } void TextFile::PutString(const char szLine[]) { int iError = fputs(szLine, m_ptrFile); assert(iError >= 0); } void TextFile::PutFormat(const char szFormat[], ...) { char szStr[4096]; va_list ArgList; va_start(ArgList, szFormat); vsprintf(szStr, szFormat, ArgList); PutString(szStr); } void TextFile::GetLineX(char szLine[], unsigned uBytes) { if (uBytes == 0) Quit("GetLineX"); bool bEof = GetLine(szLine, uBytes); if (bEof) Quit("end-of-file in GetLineX"); } bool TextFile::GetToken(char szToken[], unsigned uBytes, const char szCharTokens[]) { // Skip leading white space char c; for (;;) { bool bEof = GetChar(c); if (bEof) return true; if (!isspace(c)) break; } // Check for special case single-character tokens if (0 != strchr(szCharTokens, c)) { assert(uBytes >= 2); szToken[0] = c; szToken[1] = 0; return false; } // Loop until token terminated by white space, EOF or special unsigned uBytesCopied = 0; for (;;) { if (uBytesCopied < uBytes - 1) szToken[uBytesCopied++] = c; else Quit("TextFile::GetToken: input buffer too small, line %u", m_uLineNr); bool bEof = GetChar(c); if (bEof) { szToken[uBytesCopied] = 0; return true; } // Check for special case single-character tokens 
if (0 != strchr(szCharTokens, c)) { PushBack(c); assert(uBytesCopied > 0 && uBytesCopied < uBytes); szToken[uBytesCopied] = 0; return false; } if (isspace(c)) { assert(uBytesCopied > 0 && uBytesCopied < uBytes); szToken[uBytesCopied] = 0; return false; } } } void TextFile::GetTokenX(char szToken[], unsigned uBytes, const char szCharTokens[]) { bool bEof = GetToken(szToken, uBytes, szCharTokens); if (bEof) Quit("End-of-file in GetTokenX"); } void TextFile::Skip() { for (;;) { char c; bool bEof = GetChar(c); if (bEof || '\n' == c) return; assert(isspace(c)); } } #ifdef _WIN32 TEXTFILEPOS TextFile::GetPos() { fpos_t p; int i = fgetpos(m_ptrFile, &p); assert(0 == i); assert(p >= 0); TEXTFILEPOS Pos; Pos.uOffset = (unsigned) p; Pos.uLineNr = m_uLineNr; Pos.uColNr = m_uColNr; return Pos; } void TextFile::SetPos(TEXTFILEPOS Pos) { fpos_t p = (fpos_t) Pos.uOffset; int i = fsetpos(m_ptrFile, &p); assert(0 == i); m_uLineNr = Pos.uLineNr; m_uColNr = Pos.uColNr; } #else TEXTFILEPOS TextFile::GetPos() { TEXTFILEPOS Pos; Pos.uOffset = ftell(m_ptrFile); Pos.uLineNr = m_uLineNr; Pos.uColNr = m_uColNr; return Pos; } void TextFile::SetPos(TEXTFILEPOS Pos) { fseek(m_ptrFile, Pos.uOffset, SEEK_SET); m_uLineNr = Pos.uLineNr; m_uColNr = Pos.uColNr; } #endif bool TextFile::GetChar(char &c) { if (-1 != m_cPushedBack) { c = (char) m_cPushedBack; m_cPushedBack = -1; return false; } int ic = fgetc(m_ptrFile); if (ic < 0) { if (feof(m_ptrFile)) { // Hack to fix up a non-empty text file that is missing // and end-of-line character in the last line. 
if (!m_bLastCharWasEOL && m_uLineNr > 0) { c = '\n'; m_bLastCharWasEOL = true; return false; } return true; } Quit("TextFile::GetChar, error %s", strerror(errno)); } c = (char) ic; if ('\n' == c) { m_bLastCharWasEOL = true; ++m_uLineNr; m_uColNr = 1; } else { m_bLastCharWasEOL = false; ++m_uColNr; } return false; } void TextFile::GetCharX(char &c) { bool bEof = GetChar(c); if (bEof) Quit("End-of-file in GetCharX"); } void TextFile::GetNonblankChar(char &c) { do { bool bEof = GetChar(c); if (bEof) Quit("End-of-file in GetCharX"); } while (isspace(c)); } void TextFile::SkipLine() { if (m_bLastCharWasEOL) return; for (;;) { char c; bool bEof = GetChar(c); if (bEof) Quit("End-of-file in SkipLine"); if ('\n' == c) break; } } void TextFile::SkipWhite() { bool bEof = SkipWhiteX(); if (bEof) Quit("End-of-file skipping white space"); } bool TextFile::SkipWhiteX() { for (;;) { char c; bool bEof = GetChar(c); if (bEof) return true; if (!isspace(c)) { PushBack(c); break; } } return false; } muscle-3.8.31.orig/setblosumweights.cpp0000644000175000017500000001147511352261600017513 0ustar kratzcharles/*** Code for implementing HMMer's "BLOSUM weighting" algorithm. The algorithm was deduced by reverse-engineering the HMMer code. The HMMer documentation refers to BLOSUM weighting as "Henikoff simple filter weighting" The name BLOSUM implied to me that HMMer would be using a substitution probability matrix to compute distances, but this turned out not to be the case. It is notable, not to say puzzling, that the HMMer BLOSUM weighting algorithm is guaranteed to produce an integral NIC (number-of-indepdent- counts, also known as effective sequence count). Presumably Eddy must have known this, though he doesn't comment on it and he computes & stores the value in a float. Here's the algorithm: Distances between two sequences are based on the average of a simple binary equal (one) / not equal (zero) at each position. 
The only thing that has anything to do with BLOSUM in this calculation is an obscure (to me) constant value of 0.62. The sequences are clustered using this distance. If the pairwise identity (fraction of identical positions) is less than 0.62, they get assigned to disjoint clusters, the final number of disjoint clusters is the NIC. This makes some intuitive sense: I would interpret this by saying that if a set of sequences are close enough they count as one sequence. The weight for each sequence within a disjoint cluster is then determined to be 1 / (clustersize), from which it follows that the sum of all weights is equal to the number of disjoint clusters and is thus guaranteed to be an integer value. It is exactly this sum that HMMer uses for the NIC, by default. The individual BLOSUM sequence weights are not used for anything else in HMMer, unless you specify that BLOSUM weighting should override the default GSC weighting. GSC weighting uses a different clustering algorithm to determine relative weights. The BLOSUM NIC is then distributed over the GSC tree according to those relative weights. ***/ #include "muscle.h" #include "msa.h" #include "cluster.h" #include "distfunc.h" // Set weights of all sequences in the subtree under given node. void MSA::SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const { if (0 == ptrNode) return; const ClusterNode *ptrRight = ptrNode->GetRight(); const ClusterNode *ptrLeft = ptrNode->GetLeft(); // If leaf, set weight if (0 == ptrRight && 0 == ptrLeft) { unsigned uIndex = ptrNode->GetIndex(); WEIGHT w = DoubleToWeight(dWeight); m_Weights[uIndex] = w; return; } // Otherwise, recursively set subtrees SetBLOSUMSubtreeWeight(ptrLeft, dWeight); SetBLOSUMSubtreeWeight(ptrRight, dWeight); } // Traverse a subtree looking for clusters where all // the leaves are sufficiently similar that they // should be weighted as a group, i.e. given a weight // of 1/N where N is the cluster size. 
The idea is // to avoid sample bias where we have closely related // sequences in the input alignment. // The weight at a node is the distance between // the two closest sequences in the left and right // subtrees under that node. "Sufficiently similar" // is defined as being where that minimum distance // is less than the dMinDist threshhold. I don't know // why the clustering is done using a minimum rather // than a maximum or average, either of which would // seem more natural to me. // Return value is number of groups under this node. // A "group" is the cluster found under a node with a // weight less than the minimum. unsigned MSA::SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const { if (0 == ptrNode) return 0; if (ptrNode->GetWeight() < dMinDist) { unsigned uClusterSize = ptrNode->GetClusterSize(); assert(uClusterSize > 0); double dWeight = 1.0 / uClusterSize; SetBLOSUMSubtreeWeight(ptrNode, dWeight); return 1; } const ClusterNode *ptrLeft = ptrNode->GetLeft(); const ClusterNode *ptrRight = ptrNode->GetRight(); unsigned uLeftGroupCount = SetBLOSUMNodeWeight(ptrLeft, dMinDist); unsigned uRightGroupCount = SetBLOSUMNodeWeight(ptrRight, dMinDist); return uLeftGroupCount + uRightGroupCount; } // Return value is the group count, i.e. the effective number // of distinctly different sequences. unsigned MSA::CalcBLOSUMWeights(ClusterTree &BlosumCluster) const { // Build distance matrix DistFunc DF; unsigned uSeqCount = GetSeqCount(); DF.SetCount(uSeqCount); for (unsigned i = 0; i < uSeqCount; ++i) for (unsigned j = i+1; j < uSeqCount; ++j) { double dDist = GetPctIdentityPair(i, j); assert(dDist >= 0.0 && dDist <= 1.0); DF.SetDist(i, j, (float) (1.0 - dDist)); } // Cluster based on the distance function BlosumCluster.Create(DF); // Return value is HMMer's "effective sequence count". 
return SetBLOSUMNodeWeight(BlosumCluster.GetRoot(), 1.0 - BLOSUM_DIST); } muscle-3.8.31.orig/usage.cpp0000644000175000017500000000331111367132113015177 0ustar kratzcharles#include "muscle.h" #include void Credits() { static bool Displayed = false; if (Displayed) return; fprintf(stderr, "\n%s\n\n", MUSCLE_LONG_VERSION); fprintf(stderr, "http://www.drive5.com/muscle\n"); fprintf(stderr, "This software is donated to the public domain.\n"); fprintf(stderr, "Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.\n\n"); Displayed = true; } void Usage() { Credits(); fprintf(stderr, "\n" "Basic usage\n" "\n" " muscle -in -out \n" "\n" "Common options (for a complete list please see the User Guide):\n" "\n" " -in Input file in FASTA format (default stdin)\n" " -out Output alignment in FASTA format (default stdout)\n" " -diags Find diagonals (faster for similar sequences)\n" " -maxiters Maximum number of iterations (integer, default 16)\n" " -maxhours Maximum time to iterate in hours (default no limit)\n" " -html Write output in HTML format (default FASTA)\n" " -msf Write output in GCG MSF format (default FASTA)\n" " -clw Write output in CLUSTALW format (default FASTA)\n" " -clwstrict As -clw, with 'CLUSTAL W (1.81)' header\n" " -log[a] Log to file (append if -loga, overwrite if -log)\n" " -quiet Do not write progress messages to stderr\n" " -version Display version information and exit\n" "\n" "Without refinement (very fast, avg accuracy similar to T-Coffee): -maxiters 2\n" "Fastest possible (amino acids): -maxiters 1 -diags -sv -distance1 kbit20_3\n" "Fastest possible (nucleotides): -maxiters 1 -diags\n"); } muscle-3.8.31.orig/msadistkimura.h0000644000175000017500000000037511352261600016422 0ustar kratzcharles#ifndef MSADistKimura_h #define MSADistKimura_h #include "msadist.h" class MSADistKimura : public MSADist { public: virtual double ComputeDist(const MSA &msa, unsigned uSeqIndex1, unsigned uSeqIndex2); }; #endif // MSADistKimura_h 
muscle-3.8.31.orig/onexception.cpp0000644000175000017500000000045611366141374016445 0ustar kratzcharles#include "muscle.h" #include static char szOnExceptionMessage[] = { "\nFatal error, exception caught.\n" }; void OnException() { fprintf(stderr, "%s", szOnExceptionMessage); Log("%s", szOnExceptionMessage); Log("Finished %s\n", GetTimeAsStr()); exit(EXIT_Except); } muscle-3.8.31.orig/realigndiffs.cpp0000644000175000017500000000543611352261615016546 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "tree.h" #include "profile.h" #include "pwpath.h" #define TRACE 0 // Progressive alignment according to a diffs tree. static void MakeNode(const MSA &msaIn, const Tree &Diffs, unsigned uDiffsNodeIndex, const unsigned IdToDiffsTreeNodeIndex[], ProgNode &Node) { const unsigned uSeqCount = msaIn.GetSeqCount(); unsigned *Ids = new unsigned[uSeqCount]; unsigned uSeqsInDiffCount = 0; for (unsigned uId = 0; uId < uSeqCount; ++uId) { if (IdToDiffsTreeNodeIndex[uId] == uDiffsNodeIndex) { Ids[uSeqsInDiffCount] = uId; ++uSeqsInDiffCount; } } if (0 == uSeqsInDiffCount) Quit("MakeNode: no seqs in diff"); MSASubsetByIds(msaIn, Ids, uSeqsInDiffCount, Node.m_MSA); #if DEBUG ValidateMuscleIds(Node.m_MSA); #endif DeleteGappedCols(Node.m_MSA); delete[] Ids; } void RealignDiffs(const MSA &msaIn, const Tree &Diffs, const unsigned IdToDiffsTreeNodeIndex[], MSA &msaOut) { assert(Diffs.IsRooted()); #if TRACE Log("RealignDiffs\n"); Log("Diff tree:\n"); Diffs.LogMe(); #endif const unsigned uNodeCount = Diffs.GetNodeCount(); if (uNodeCount%2 == 0) Quit("RealignDiffs: Expected odd number of nodes"); const unsigned uMergeCount = (uNodeCount - 1)/2; ProgNode *ProgNodes = new ProgNode[uNodeCount]; unsigned uJoin = 0; SetProgressDesc("Refine tree"); for (unsigned uDiffsNodeIndex = Diffs.FirstDepthFirstNode(); NULL_NEIGHBOR != uDiffsNodeIndex; uDiffsNodeIndex = Diffs.NextDepthFirstNode(uDiffsNodeIndex)) { if (Diffs.IsLeaf(uDiffsNodeIndex)) { assert(uDiffsNodeIndex < uNodeCount); if 
(uDiffsNodeIndex >= uNodeCount) Quit("TreeNodeIndex=%u NodeCount=%u\n", uDiffsNodeIndex, uNodeCount); ProgNode &Node = ProgNodes[uDiffsNodeIndex]; MakeNode(msaIn, Diffs, uDiffsNodeIndex, IdToDiffsTreeNodeIndex, Node); Node.m_uLength = Node.m_MSA.GetColCount(); } else { Progress(uJoin, uMergeCount); ++uJoin; const unsigned uMergeNodeIndex = uDiffsNodeIndex; ProgNode &Parent = ProgNodes[uMergeNodeIndex]; const unsigned uLeft = Diffs.GetLeft(uDiffsNodeIndex); const unsigned uRight = Diffs.GetRight(uDiffsNodeIndex); ProgNode &Node1 = ProgNodes[uLeft]; ProgNode &Node2 = ProgNodes[uRight]; PWPath Path; AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path); #if TRACE { Log("Combined:\n"); Parent.m_MSA.LogMe(); } #endif Node1.m_MSA.Clear(); Node2.m_MSA.Clear(); } } ProgressStepsDone(); unsigned uRootNodeIndex = Diffs.GetRootNodeIndex(); const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; msaOut.Copy(RootProgNode.m_MSA); #if DEBUG AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut); #endif delete[] ProgNodes; ProgNodes = 0; } muscle-3.8.31.orig/distcalc.h0000644000175000017500000000205611352261611015333 0ustar kratzcharles#ifndef DistCalc_h #define DistCalc_h typedef float dist_t; const dist_t BIG_DIST = (dist_t) 1e29; class DistFunc; class DistCalc { public: virtual void CalcDistRange(unsigned i, dist_t Dist[]) const = 0; virtual unsigned GetCount() const = 0; virtual unsigned GetId(unsigned i) const = 0; virtual const char *GetName(unsigned i) const = 0; }; class DistCalcDF : public DistCalc { public: void Init(const DistFunc &DF); virtual void CalcDistRange(unsigned i, dist_t Dist[]) const; virtual unsigned GetCount() const; virtual unsigned GetId(unsigned i) const; virtual const char *GetName(unsigned i) const; private: const DistFunc *m_ptrDF; }; class DistCalcMSA : public DistCalc { public: void Init(const MSA &msa, DISTANCE Distance); virtual void CalcDistRange(unsigned i, dist_t Dist[]) const; virtual unsigned GetCount() const; virtual unsigned GetId(unsigned i) 
const; virtual const char *GetName(unsigned i) const; private: const MSA *m_ptrMSA; DISTANCE m_Distance; }; #endif // DistCalc_h muscle-3.8.31.orig/glbalignsimple.cpp0000644000175000017500000002176011352261667017107 0ustar kratzcharles#include "muscle.h" #include #include "pwpath.h" #include "profile.h" #include #define TRACE 0 #if 1 // SINGLE_AFFINE extern bool g_bKeepSimpleDP; extern SCORE *g_DPM; extern SCORE *g_DPD; extern SCORE *g_DPI; extern char *g_TBM; extern char *g_TBD; extern char *g_TBI; static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -100000) return " *"; sprintf(str, "%6.1f", s); return str; } static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %6c", TBM(uPrefixLengthA, uPrefixLengthB)); Log("\n"); } } static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", 
LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); SetTermGaps(PA, uLengthA); SetTermGaps(PB, uLengthB); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; // Allocate DP matrices const size_t LM = uPrefixCountA*uPrefixCountB; SCORE *DPL_ = new SCORE[LM]; SCORE *DPM_ = new SCORE[LM]; SCORE *DPD_ = new SCORE[LM]; SCORE *DPI_ = new SCORE[LM]; char *TBM_ = new char[LM]; char *TBD_ = new char[LM]; char *TBI_ = new char[LM]; memset(TBM_, '?', LM); memset(TBD_, '?', LM); memset(TBI_, '?', LM); DPM(0, 0) = 0; DPD(0, 0) = MINUS_INFINITY; DPI(0, 0) = MINUS_INFINITY; DPM(1, 0) = MINUS_INFINITY; DPD(1, 0) = PA[0].m_scoreGapOpen; TBD(1, 0) = 'D'; DPI(1, 0) = MINUS_INFINITY; DPM(0, 1) = MINUS_INFINITY; DPD(0, 1) = MINUS_INFINITY; DPI(0, 1) = PB[0].m_scoreGapOpen; TBI(0, 1) = 'I'; // Empty prefix of B is special case for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { // M=LetterA+LetterB, impossible with empty prefix DPM(uPrefixLengthA, 0) = MINUS_INFINITY; // D=LetterA+GapB DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend; TBD(uPrefixLengthA, 0) = 'D'; // I=GapA+LetterB, impossible with empty prefix DPI(uPrefixLengthA, 0) = MINUS_INFINITY; } // Empty prefix of A is special case for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { // M=LetterA+LetterB, impossible with empty prefix DPM(0, uPrefixLengthB) = MINUS_INFINITY; // D=LetterA+GapB, impossible with empty prefix DPD(0, uPrefixLengthB) = MINUS_INFINITY; // I=GapA+LetterB DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend; TBI(0, uPrefixLengthB) = 'I'; } // Special case to agree with NWFast, no D-I transitions so... 
DPD(uLengthA, 0) = MINUS_INFINITY; // DPI(0, uLengthB) = MINUS_INFINITY; // ============ // Main DP loop // ============ SCORE scoreGapCloseB = MINUS_INFINITY; for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreGapCloseA = MINUS_INFINITY; for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; { // Match M=LetterA+LetterB SCORE scoreLL = ScoreProfPos2(PPA, PPB); DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL; SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1); SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA; SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB; SCORE scoreBest; if (scoreMM >= scoreDM && scoreMM >= scoreIM) { scoreBest = scoreMM; TBM(uPrefixLengthA, uPrefixLengthB) = 'M'; } else if (scoreDM >= scoreMM && scoreDM >= scoreIM) { scoreBest = scoreDM; TBM(uPrefixLengthA, uPrefixLengthB) = 'D'; } else { assert(scoreIM >= scoreMM && scoreIM >= scoreDM); scoreBest = scoreIM; TBM(uPrefixLengthA, uPrefixLengthB) = 'I'; } DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; } { // Delete D=LetterA+GapB SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen; SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend; SCORE scoreBest; if (scoreMD >= scoreDD) { scoreBest = scoreMD; TBD(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreDD >= scoreMD); scoreBest = scoreDD; TBD(uPrefixLengthA, uPrefixLengthB) = 'D'; } DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert I=GapA+LetterB { SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB - 1].m_scoreGapOpen; SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend; SCORE scoreBest; if (scoreMI >= scoreII) { scoreBest = scoreMI; TBI(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreII 
> scoreMI); scoreBest = scoreII; TBI(uPrefixLengthA, uPrefixLengthB) = 'I'; } DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; } scoreGapCloseA = PPA.m_scoreGapClose; } scoreGapCloseB = PPB.m_scoreGapClose; } #if TRACE Log("\n"); Log("Simple DPL:\n"); ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple DPM:\n"); ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple DPD:\n"); ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple DPI:\n"); ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple TBM:\n"); ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple TBD:\n"); ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple TBI:\n"); ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB); #endif // Trace-back // ========== Path.Clear(); // Find last edge SCORE M = DPM(uLengthA, uLengthB); SCORE D = DPD(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose; SCORE I = DPI(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose; char cEdgeType = '?'; SCORE BestScore = MINUS_INFINITY; if (M >= D && M >= I) { cEdgeType = 'M'; BestScore = M; } else if (D >= M && D >= I) { cEdgeType = 'D'; BestScore = D; } else { assert(I >= M && I >= D); cEdgeType = 'I'; BestScore = I; } #if TRACE Log("Simple: MAB=%.4g DAB=%.4g IAB=%.4g best=%c\n", M, D, I, cEdgeType); #endif unsigned PLA = uLengthA; unsigned PLB = uLengthB; for (;;) { PWEdge Edge; Edge.cType = cEdgeType; Edge.uPrefixLengthA = PLA; Edge.uPrefixLengthB = PLB; #if TRACE Log("Prepend %c%d.%d\n", Edge.cType, PLA, PLB); #endif Path.PrependEdge(Edge); switch (cEdgeType) { case 'M': assert(PLA > 0); assert(PLB > 0); cEdgeType = TBM(PLA, PLB); --PLA; --PLB; break; case 'D': assert(PLA > 0); cEdgeType = TBD(PLA, PLB); --PLA; break; case 'I': assert(PLB > 0); cEdgeType = TBI(PLA, PLB); --PLB; break; default: Quit("Invalid edge %c", cEdgeType); } if (0 == PLA && 0 == PLB) break; } Path.Validate(); // SCORE Score = 
TraceBack(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_, Path); #if TRACE SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path); Path.LogMe(); Log("Score = %s Path = %s\n", LocalScoreToStr(BestScore), LocalScoreToStr(scorePath)); #endif if (g_bKeepSimpleDP) { g_DPM = DPM_; g_DPD = DPD_; g_DPI = DPI_; g_TBM = TBM_; g_TBD = TBD_; g_TBI = TBI_; } else { delete[] DPM_; delete[] DPD_; delete[] DPI_; delete[] TBM_; delete[] TBD_; delete[] TBI_; } return BestScore; } #endif // SINLGLE_AFFINE muscle-3.8.31.orig/globalslinux.cpp0000644000175000017500000000553211352261673016615 0ustar kratzcharles#include "muscle.h" #if defined(__linux__) #include #include #include #include #include #include const int ONE_MB = 1000000; const int MEM_WARNING_THRESHOLD = 20*ONE_MB; double GetNAN() { static unsigned long nan[2]={0xffffffff, 0x7fffffff}; double dNAN = *( double* )nan; return dNAN; } double g_dNAN = GetNAN(); void chkmem(const char szMsg[]) { //assert(_CrtCheckMemory()); } void Break() { //DebugBreak(); } static char szCmdLine[4096]; void *ptrStartBreak = sbrk(0); const char *GetCmdLine() { return szCmdLine; } double GetMemUseMB() { static char statm[64]; static int PageSize; if (0 == statm[0]) { PageSize = sysconf(_SC_PAGESIZE); pid_t pid = getpid(); sprintf(statm, "/proc/%d/statm", (int) pid); } int fd = open(statm, O_RDONLY); if (-1 == fd) return -1; char Buffer[64]; int n = read(fd, Buffer, sizeof(Buffer) - 1); close(fd); fd = -1; if (n <= 0) { static bool Warned = false; if (!Warned) { Warned = true; Warning("*Warning* Cannot read %s errno=%d %s", statm, errno, strerror(errno)); } return 0; } Buffer[n] = 0; int Pages = atoi(Buffer); return ((double) Pages * (double) PageSize)/1e6; } void SaveCmdLine(int argc, char *argv[]) { for (int i = 0; i < argc; ++i) { if (i > 0) strcat(szCmdLine, " "); strcat(szCmdLine, argv[i]); } } double dPeakMemUseMB = 0; double GetPeakMemUseMB() { CheckMemUse(); return dPeakMemUseMB; } double GetCPUGHz() { double dGHz = 2.5; const char 
*e = getenv("CPUGHZ"); if (0 != e) dGHz = atof(e); return dGHz; } void CheckMemUse() { double dMB = GetMemUseMB(); if (dMB > dPeakMemUseMB) dPeakMemUseMB = dMB; } double GetRAMSizeMB() { const double DEFAULT_RAM = 500; static double RAMMB = 0; if (RAMMB != 0) return RAMMB; int fd = open("/proc/meminfo", O_RDONLY); if (-1 == fd) { static bool Warned = false; if (!Warned) { Warned = true; Warning("*Warning* Cannot open /proc/meminfo errno=%d %s", errno, strerror(errno)); } return DEFAULT_RAM; } char Buffer[1024]; int n = read(fd, Buffer, sizeof(Buffer) - 1); close(fd); fd = -1; if (n <= 0) { static bool Warned = false; if (!Warned) { Warned = true; Warning("*Warning* Cannot read /proc/meminfo errno=%d %s", errno, strerror(errno)); } return DEFAULT_RAM; } Buffer[n] = 0; char *pMem = strstr(Buffer, "MemTotal: "); if (0 == pMem) { static bool Warned = false; if (!Warned) { Warned = true; Warning("*Warning* 'MemTotal:' not found in /proc/meminfo"); } return DEFAULT_RAM; } int Bytes = atoi(pMem+9)*1000; return ((double) Bytes)/1e6; } #endif // !WIN32 muscle-3.8.31.orig/aligntwomsas.cpp0000644000175000017500000000166211352261667016625 0ustar kratzcharles#include "muscle.h" #include "msa.h" #include "profile.h" #include "pwpath.h" #include "textfile.h" #include "timing.h" SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path, bool bLockLeft, bool bLockRight) { const unsigned uLengthA = msa1.GetColCount(); const unsigned uLengthB = msa2.GetColCount(); ProfPos *PA = ProfileFromMSA(msa1); ProfPos *PB = ProfileFromMSA(msa2); if (bLockLeft) { PA[0].m_scoreGapOpen = MINUS_INFINITY; PB[0].m_scoreGapOpen = MINUS_INFINITY; } if (bLockRight) { PA[uLengthA-1].m_scoreGapClose = MINUS_INFINITY; PB[uLengthB-1].m_scoreGapClose = MINUS_INFINITY; } float r = (float) uLengthA/ (float) (uLengthB + 1); // +1 to prevent div 0 if (r < 1) r = 1/r; SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path); AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut); delete[] PA; 
delete[] PB; return Score; } muscle-3.8.31.orig/seq.cpp0000644000175000017500000001552411352261667014707 0ustar kratzcharles#include "muscle.h" #include "seq.h" #include "textfile.h" #include "msa.h" //#include const size_t MAX_FASTA_LINE = 16000; void Seq::SetName(const char *ptrName) { delete[] m_ptrName; size_t n = strlen(ptrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, ptrName); } void Seq::ToFASTAFile(TextFile &File) const { File.PutFormat(">%s\n", m_ptrName); unsigned uColCount = Length(); for (unsigned n = 0; n < uColCount; ++n) { if (n > 0 && n%60 == 0) File.PutString("\n"); File.PutChar(at(n)); } File.PutString("\n"); } // Return true on end-of-file bool Seq::FromFASTAFile(TextFile &File) { Clear(); char szLine[MAX_FASTA_LINE]; bool bEof = File.GetLine(szLine, sizeof(szLine)); if (bEof) return true; if ('>' != szLine[0]) Quit("Expecting '>' in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); size_t n = strlen(szLine); if (1 == n) Quit("Missing annotation following '>' in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); m_ptrName = new char[n]; strcpy(m_ptrName, szLine + 1); TEXTFILEPOS Pos = File.GetPos(); for (;;) { bEof = File.GetLine(szLine, sizeof(szLine)); if (bEof) { if (0 == size()) { Quit("Empty sequence in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); return true; } return false; } if ('>' == szLine[0]) { if (0 == size()) Quit("Empty sequence in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); // Rewind to beginning of this line, it's the start of the // next sequence. 
File.SetPos(Pos); return false; } const char *ptrChar = szLine; while (char c = *ptrChar++) { if (isspace(c)) continue; if (IsGapChar(c)) continue; if (!IsResidueChar(c)) { if (isprint(c)) { char w = GetWildcardChar(); Warning("Invalid residue '%c' in FASTA file %s line %d, replaced by '%c'", c, File.GetFileName(), File.GetLineNr(), w); c = w; } else Quit("Invalid byte hex %02x in FASTA file %s line %d", (unsigned char) c, File.GetFileName(), File.GetLineNr()); } c = toupper(c); push_back(c); } Pos = File.GetPos(); } } void Seq::ExtractUngapped(MSA &msa) const { msa.Clear(); unsigned uColCount = Length(); msa.SetSize(1, 1); unsigned uUngappedPos = 0; for (unsigned n = 0; n < uColCount; ++n) { char c = at(n); if (!IsGapChar(c)) msa.SetChar(0, uUngappedPos++, c); } msa.SetSeqName(0, m_ptrName); } void Seq::Copy(const Seq &rhs) { clear(); const unsigned uLength = rhs.Length(); for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) push_back(rhs.at(uColIndex)); const char *ptrName = rhs.GetName(); size_t n = strlen(ptrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, ptrName); SetId(rhs.GetId()); } void Seq::CopyReversed(const Seq &rhs) { clear(); const unsigned uLength = rhs.Length(); const unsigned uBase = rhs.Length() - 1; for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) push_back(rhs.at(uBase - uColIndex)); const char *ptrName = rhs.GetName(); size_t n = strlen(ptrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, ptrName); } void Seq::StripGaps() { for (CharVect::iterator p = begin(); p != end(); ) { char c = *p; if (IsGapChar(c)) erase(p); else ++p; } } void Seq::StripGapsAndWhitespace() { for (CharVect::iterator p = begin(); p != end(); ) { char c = *p; if (isspace(c) || IsGapChar(c)) erase(p); else ++p; } } void Seq::ToUpper() { for (CharVect::iterator p = begin(); p != end(); ++p) { char c = *p; if (islower(c)) *p = toupper(c); } } unsigned Seq::GetLetter(unsigned uIndex) const { assert(uIndex < Length()); char c = 
operator[](uIndex); return CharToLetter(c); } bool Seq::EqIgnoreCase(const Seq &s) const { const unsigned n = Length(); if (n != s.Length()) return false; for (unsigned i = 0; i < n; ++i) { const char c1 = at(i); const char c2 = s.at(i); if (IsGapChar(c1)) { if (!IsGapChar(c2)) return false; } else { if (toupper(c1) != toupper(c2)) return false; } } return true; } bool Seq::Eq(const Seq &s) const { const unsigned n = Length(); if (n != s.Length()) return false; for (unsigned i = 0; i < n; ++i) { const char c1 = at(i); const char c2 = s.at(i); if (c1 != c2) return false; } return true; } bool Seq::EqIgnoreCaseAndGaps(const Seq &s) const { const unsigned uThisLength = Length(); const unsigned uOtherLength = s.Length(); unsigned uThisPos = 0; unsigned uOtherPos = 0; int cThis; int cOther; for (;;) { if (uThisPos == uThisLength && uOtherPos == uOtherLength) break; // Set cThis to next non-gap character in this string // or -1 if end-of-string. for (;;) { if (uThisPos == uThisLength) { cThis = -1; break; } else { cThis = at(uThisPos); ++uThisPos; if (!IsGapChar(cThis)) { cThis = toupper(cThis); break; } } } // Set cOther to next non-gap character in s // or -1 if end-of-string. 
for (;;) { if (uOtherPos == uOtherLength) { cOther = -1; break; } else { cOther = s.at(uOtherPos); ++uOtherPos; if (!IsGapChar(cOther)) { cOther = toupper(cOther); break; } } } // Compare characters are corresponding ungapped position if (cThis != cOther) return false; } return true; } unsigned Seq::GetUngappedLength() const { unsigned uUngappedLength = 0; for (CharVect::const_iterator p = begin(); p != end(); ++p) { char c = *p; if (!IsGapChar(c)) ++uUngappedLength; } return uUngappedLength; } void Seq::LogMe() const { Log(">%s\n", m_ptrName); const unsigned n = Length(); for (unsigned i = 0; i < n; ++i) Log("%c", at(i)); Log("\n"); } void Seq::FromString(const char *pstrSeq, const char *pstrName) { clear(); const unsigned uLength = (unsigned) strlen(pstrSeq); for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) push_back(pstrSeq[uColIndex]); size_t n = strlen(pstrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, pstrName); } bool Seq::HasGap() const { for (CharVect::const_iterator p = begin(); p != end(); ++p) { char c = *p; if (IsGapChar(c)) return true; } return false; } void Seq::FixAlpha() { for (CharVect::iterator p = begin(); p != end(); ++p) { char c = *p; if (!IsResidueChar(c)) { char w = GetWildcardChar(); // Warning("Invalid residue '%c', replaced by '%c'", c, w); InvalidLetterWarning(c, w); *p = w; } } }