aligngivenpath.cpp0000664000175000017500000005254512360262614012667 0ustar bobbob#include "muscle.h" #include "msa.h" #include "pwpath.h" #include "profile.h" #define TRACE 0 static void LogPP(const ProfPos &PP) { Log("ResidueGroup %u\n", PP.m_uResidueGroup); Log("AllGaps %d\n", PP.m_bAllGaps); Log("Occ %.3g\n", PP.m_fOcc); Log("LL=%.3g LG=%.3g GL=%.3g GG=%.3g\n", PP.m_LL, PP.m_LG, PP.m_GL, PP.m_GG); Log("Freqs "); for (unsigned i = 0; i < 20; ++i) if (PP.m_fcCounts[i] > 0) Log("%c=%.3g ", LetterToChar(i), PP.m_fcCounts[i]); Log("\n"); } static void AssertProfPosEq(const ProfPos *PA, const ProfPos *PB, unsigned i) { const ProfPos &PPA = PA[i]; const ProfPos &PPB = PB[i]; #define eq(x) if (PPA.m_##x != PPB.m_##x) { LogPP(PPA); LogPP(PPB); Quit("AssertProfPosEq." #x); } #define be(x) if (!BTEq(PPA.m_##x, PPB.m_##x)) { LogPP(PPA); LogPP(PPB); Quit("AssertProfPosEq." #x); } eq(bAllGaps) eq(uResidueGroup) be(LL) be(LG) be(GL) be(GG) be(fOcc) be(scoreGapOpen) be(scoreGapClose) for (unsigned j = 0; j < 20; ++j) { #define eqj(x) if (PPA.m_##x != PPB.m_##x) Quit("AssertProfPosEq j=%u " #x, j); #define bej(x) if (!BTEq(PPA.m_##x, PPB.m_##x)) Quit("AssertProfPosEq j=%u " #x, j); bej(fcCounts[j]); // eqj(uSortOrder[j]) // may differ due to ties, don't check? 
bej(AAScores[j]) #undef eqj #undef bej } #undef eq #undef be } void AssertProfsEq(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB) { if (uLengthA != uLengthB) Quit("AssertProfsEq: lengths differ %u %u", uLengthA, uLengthB); for (unsigned i = 0; i < uLengthB; ++i) AssertProfPosEq(PA, PB, i); } #if DEBUG static void ValidateProf(const ProfPos *Prof, unsigned uLength) { for (unsigned i = 0; i < uLength; ++i) { const ProfPos &PP = Prof[i]; FCOUNT s1 = PP.m_LL + PP.m_LG + PP.m_GL + PP.m_GG; assert(BTEq(s1, 1.0)); if (i > 0) { const ProfPos &PPPrev = Prof[i-1]; FCOUNT s2 = PPPrev.m_LL + PPPrev.m_GL; FCOUNT s3 = PP.m_LL + PP.m_LG; assert(BTEq(s2, s3)); } if (i < uLength - 1) { const ProfPos &PPNext = Prof[i+1]; FCOUNT s4 = PP.m_LL + PP.m_GL; FCOUNT s5 = PPNext.m_LL + PPNext.m_LG; assert(BTEq(s4, s5)); } } } #else #define ValidateProf(Prof, Length) /* empty */ #endif static void ScoresFromFreqsPos(ProfPos *Prof, unsigned uLength, unsigned uPos) { ProfPos &PP = Prof[uPos]; SortCounts(PP.m_fcCounts, PP.m_uSortOrder); PP.m_uResidueGroup = ResidueGroupFromFCounts(PP.m_fcCounts); // "Occupancy" PP.m_fOcc = PP.m_LL + PP.m_GL; // Frequency of gap-opens in this position (i) // Gap open = letter in i-1 and gap in i // = iff LG in i FCOUNT fcOpen = PP.m_LG; // Frequency of gap-closes in this position // Gap close = gap in i and letter in i+1 // = iff GL in i+1 FCOUNT fcClose; if (uPos + 1 < uLength) fcClose = Prof[uPos + 1].m_GL; else fcClose = PP.m_GG + PP.m_LG; PP.m_scoreGapOpen = (SCORE) ((1.0 - fcOpen)*g_scoreGapOpen/2.0); PP.m_scoreGapClose = (SCORE) ((1.0 - fcClose)*g_scoreGapOpen/2.0); #if DOUBLE_AFFINE PP.m_scoreGapOpen2 = (SCORE) ((1.0 - fcOpen)*g_scoreGapOpen2/2.0); PP.m_scoreGapClose2 = (SCORE) ((1.0 - fcClose)*g_scoreGapOpen2/2.0); #endif for (unsigned i = 0; i < g_AlphaSize; ++i) { SCORE scoreSum = 0; for (unsigned j = 0; j < g_AlphaSize; ++j) scoreSum += PP.m_fcCounts[j]*(*g_ptrScoreMatrix)[i][j]; PP.m_AAScores[i] = scoreSum; } } void 
ProfScoresFromFreqs(ProfPos *Prof, unsigned uLength) { for (unsigned i = 0; i < uLength; ++i) ScoresFromFreqsPos(Prof, uLength, i); } static void AppendDelete(const MSA &msaA, unsigned &uColIndexA, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendDelete ColIxA=%u ColIxCmb=%u\n", uColIndexA, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA); msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, '-'); ++uColIndexCombined; ++uColIndexA; } static void AppendInsert(const MSA &msaB, unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendInsert ColIxB=%u ColIxCmb=%u\n", uColIndexB, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) msaCombined.SetChar(uSeqIndexA, uColIndexCombined, '-'); for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); } ++uColIndexCombined; ++uColIndexB; } static void AppendTplInserts(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA, const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendTplInserts ColIxA=%u ColIxB=%u ColIxCmb=%u\n", uColIndexA, uColIndexB, uColIndexCombined); #endif const unsigned uLengthA = msaA.GetColCount(); const unsigned uLengthB = msaB.GetColCount(); unsigned uNewColCount = uColCountA; if (uColCountB > uNewColCount) uNewColCount = uColCountB; for (unsigned n = 0; n < uColCountA; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c 
= msaA.GetChar(uSeqIndexA, uColIndexA + n); c = UnalignChar(c); msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c); } } for (unsigned n = uColCountA; n < uNewColCount; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.'); } for (unsigned n = 0; n < uColCountB; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB + n); c = UnalignChar(c); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c); } } for (unsigned n = uColCountB; n < uNewColCount; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.'); } uColIndexCombined += uNewColCount; uColIndexA += uColCountA; uColIndexB += uColCountB; } static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB, unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n", uColIndexA, uColIndexB, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA); msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); } ++uColIndexA; ++uColIndexB; ++uColIndexCombined; } void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB, MSA &msaCombined) { msaCombined.Clear(); #if TRACE Log("FastAlignProfiles\n"); Log("Template A:\n"); msaA.LogMe(); Log("Template B:\n"); msaB.LogMe(); #endif const unsigned uColCountA = msaA.GetColCount(); const unsigned uColCountB = msaB.GetColCount(); const unsigned uSeqCountA = msaA.GetSeqCount(); const unsigned uSeqCountB = 
msaB.GetSeqCount(); msaCombined.SetSeqCount(uSeqCountA + uSeqCountB); // Copy sequence names into combined MSA for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA)); msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA)); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB)); msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB)); } unsigned uColIndexA = 0; unsigned uColIndexB = 0; unsigned uColIndexCombined = 0; const unsigned uEdgeCount = Path.GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); #if TRACE Log("\nEdge %u %c%u.%u\n", uEdgeIndex, Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); #endif const char cType = Edge.cType; const unsigned uPrefixLengthA = Edge.uPrefixLengthA; unsigned uColCountA = 0; if (uPrefixLengthA > 0) { const unsigned uNodeIndexA = uPrefixLengthA - 1; const unsigned uTplColIndexA = uNodeIndexA; if (uTplColIndexA > uColIndexA) uColCountA = uTplColIndexA - uColIndexA; } const unsigned uPrefixLengthB = Edge.uPrefixLengthB; unsigned uColCountB = 0; if (uPrefixLengthB > 0) { const unsigned uNodeIndexB = uPrefixLengthB - 1; const unsigned uTplColIndexB = uNodeIndexB; if (uTplColIndexB > uColIndexB) uColCountB = uTplColIndexB - uColIndexB; } // TODO: This code looks like a hangover from HMM estimation -- can we delete it? 
assert(uColCountA == 0); assert(uColCountB == 0); AppendTplInserts(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); switch (cType) { case 'M': { assert(uPrefixLengthA > 0); assert(uPrefixLengthB > 0); const unsigned uColA = uPrefixLengthA - 1; const unsigned uColB = uPrefixLengthB - 1; assert(uColIndexA == uColA); assert(uColIndexB == uColB); AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } case 'D': { assert(uPrefixLengthA > 0); const unsigned uColA = uPrefixLengthA - 1; assert(uColIndexA == uColA); AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } case 'I': { assert(uPrefixLengthB > 0); const unsigned uColB = uPrefixLengthB - 1; assert(uColIndexB == uColB); AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } default: assert(false); } } unsigned uInsertColCountA = uColCountA - uColIndexA; unsigned uInsertColCountB = uColCountB - uColIndexB; // TODO: This code looks like a hangover from HMM estimation -- can we delete it? 
assert(uInsertColCountA == 0); assert(uInsertColCountB == 0); AppendTplInserts(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB, uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); assert(msaCombined.GetColCount() == uEdgeCount); } static const ProfPos PPStart = { false, //m_bAllGaps; { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_uSortOrder[21]; { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_fcCounts[20]; 1.0, // m_LL; 0.0, // m_LG; 0.0, // m_GL; 0.0, // m_GG; { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_ALScores 0, // m_uResidueGroup; 1.0, // m_fOcc; 0.0, // m_fcStartOcc; 0.0, // m_fcEndOcc; 0.0, // m_scoreGapOpen; 0.0, // m_scoreGapClose; }; // MM // Ai–1 Ai Out // X X LL LL // X - LG LG // - X GL GL // - - GG GG // // Bj–1 Bj // X X LL LL // X - LG LG // - X GL GL // - - GG GG static void SetGapsMM( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wA*PPA.m_LL + wB*PPB.m_LL; PPO.m_LG = wA*PPA.m_LG + wB*PPB.m_LG; PPO.m_GL = wA*PPA.m_GL + wB*PPB.m_GL; PPO.m_GG = wA*PPA.m_GG + wB*PPB.m_GG; } // MD // Ai–1 Ai Out // X X LL LL // X - LG LG // - X GL GL // - - GG GG // // Bj (-) // X - ?L LG // - - ?G GG static void SetGapsMD( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? 
PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wA*PPA.m_LL; PPO.m_LG = wA*PPA.m_LG + wB*(PPB.m_LL + PPB.m_GL); PPO.m_GL = wA*PPA.m_GL; PPO.m_GG = wA*PPA.m_GG + wB*(PPB.m_LG + PPB.m_GG); } // DD // Ai–1 Ai Out // X X LL LL // X - LG LG // - X GL GL // - - GG GG // // (-) (-) // - - ?? GG static void SetGapsDD( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wA*PPA.m_LL; PPO.m_LG = wA*PPA.m_LG; PPO.m_GL = wA*PPA.m_GL; PPO.m_GG = wA*PPA.m_GG + wB; } // MI // Ai (-) Out // X - ?L LG // - - ?G GG // Bj–1 Bj // X X LL LL // X - LG LG // - X GL GL // - - GG GG static void SetGapsMI( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wB*PPB.m_LL; PPO.m_LG = wB*PPB.m_LG + wA*(PPA.m_LL + PPA.m_GL); PPO.m_GL = wB*PPB.m_GL; PPO.m_GG = wB*PPB.m_GG + wA*(PPA.m_LG + PPA.m_GG); } // DM // Ai–1 Ai Out // X X LL LL // X - LG LG // - X GL GL // - - GG GG // // (-) Bj // - X ?L GL // - - ?G GG static void SetGapsDM( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? 
PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wA*PPA.m_LL; PPO.m_LG = wA*PPA.m_LG; PPO.m_GL = wA*PPA.m_GL + wB*(PPB.m_LL + PPB.m_GL); PPO.m_GG = wA*PPA.m_GG + wB*(PPB.m_LG + PPB.m_GG); } // IM // (-) Ai Out // - X ?L GL // - - ?G GG // Bj–1 Bj // X X LL LL // X - LG LG // - X GL GL // - - GG GG static void SetGapsIM( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wB*PPB.m_LL; PPO.m_LG = wB*PPB.m_LG; PPO.m_GL = wB*PPB.m_GL + wA*(PPA.m_LL + PPA.m_GL); PPO.m_GG = wB*PPB.m_GG + wA*(PPA.m_LG + PPA.m_GG); } // ID // (-) Ai Out // - X ?L GL // - - ?G GG // Bj (-) // X - ?L LG // - - ?G GG static void SetGapsID( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = 0; PPO.m_LG = wB*PPB.m_GL + wB*PPB.m_LL; PPO.m_GL = wA*PPA.m_GL + wA*PPA.m_LL; PPO.m_GG = wA*(PPA.m_LG + PPA.m_GG) + wB*(PPB.m_LG + PPB.m_GG); } // DI // Ai (-) Out // X - ?L LG // - - ?G GG // (-) Bj // - X ?L GL // - - ?G GG static void SetGapsDI( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? 
PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = 0; PPO.m_LG = wA*PPA.m_GL + wA*PPA.m_LL; PPO.m_GL = wB*PPB.m_GL + wB*PPB.m_LL; PPO.m_GG = wA*(PPA.m_LG + PPA.m_GG) + wB*(PPB.m_LG + PPB.m_GG); } // II // (-) (-) Out // - - ?? GG // Bj–1 Bj // X X LL LL // X - LG LG // - X GL GL // - - GG GG static void SetGapsII( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; PPO.m_LL = wB*PPB.m_LL; PPO.m_LG = wB*PPB.m_LG; PPO.m_GL = wB*PPB.m_GL; PPO.m_GG = wB*PPB.m_GG + wA; } static void SetFreqs( const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos *POut, unsigned uColIndexOut) { const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; ProfPos &PPO = POut[uColIndexOut]; if (g_bNormalizeCounts) { const FCOUNT fA = PPA.m_fOcc*wA/(wA + wB); const FCOUNT fB = PPB.m_fOcc*wB/(wA + wB); FCOUNT fTotal = 0; for (unsigned i = 0; i < 20; ++i) { const FCOUNT f = fA*PPA.m_fcCounts[i] + fB*PPB.m_fcCounts[i]; PPO.m_fcCounts[i] = f; fTotal += f; } if (fTotal > 0) for (unsigned i = 0; i < 20; ++i) PPO.m_fcCounts[i] /= fTotal; } else { for (unsigned i = 0; i < 20; ++i) PPO.m_fcCounts[i] = wA*PPA.m_fcCounts[i] + wB*PPB.m_fcCounts[i]; } } void AlignTwoProfsGivenPath(const PWPath &Path, const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, ProfPos **ptrPOut, unsigned *ptruLengthOut) { #if TRACE Log("AlignTwoProfsGivenPath wA=%.3g wB=%.3g Path=\n", wA, wB); Path.LogMe(); #endif assert(BTEq(wA + wB, 1.0)); unsigned uColIndexA = 0; unsigned uColIndexB = 0; unsigned uColIndexOut = 0; const unsigned uEdgeCount = Path.GetEdgeCount(); ProfPos *POut = new 
ProfPos[uEdgeCount]; char cPrevType = 'M'; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); const char cType = Edge.cType; const unsigned uPrefixLengthA = Edge.uPrefixLengthA; const unsigned uPrefixLengthB = Edge.uPrefixLengthB; #if TRACE Log("\nEdge %u %c%u.%u ColA=%u ColB=%u\n", uEdgeIndex, Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB, uColIndexA, uColIndexB); #endif POut[uColIndexOut].m_bAllGaps = false; switch (cType) { case 'M': { assert(uPrefixLengthA > 0); assert(uPrefixLengthB > 0); SetFreqs( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); switch (cPrevType) { case 'M': SetGapsMM( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'D': SetGapsDM( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'I': SetGapsIM( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; default: Quit("Bad cPrevType"); } ++uColIndexA; ++uColIndexB; ++uColIndexOut; break; } case 'D': { assert(uPrefixLengthA > 0); SetFreqs( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, 0, POut, uColIndexOut); switch (cPrevType) { case 'M': SetGapsMD( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'D': SetGapsDD( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'I': SetGapsID( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; default: Quit("Bad cPrevType"); } ++uColIndexA; ++uColIndexOut; break; } case 'I': { assert(uPrefixLengthB > 0); SetFreqs( PA, uPrefixLengthA, 0, PB, uPrefixLengthB, wB, POut, uColIndexOut); switch (cPrevType) { case 'M': SetGapsMI( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'D': SetGapsDI( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; case 'I': SetGapsII( PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut); break; default: 
Quit("Bad cPrevType"); } ++uColIndexB; ++uColIndexOut; break; } default: assert(false); } cPrevType = cType; } assert(uColIndexOut == uEdgeCount); ProfScoresFromFreqs(POut, uEdgeCount); ValidateProf(POut, uEdgeCount); *ptrPOut = POut; *ptruLengthOut = uEdgeCount; #if TRACE Log("AlignTwoProfsGivenPath:\n"); ListProfile(POut, uEdgeCount, 0); #endif } aligngivenpathsw.cpp0000664000175000017500000001617412360262614013237 0ustar bobbob#include "muscle.h" #include "msa.h" #include "pwpath.h" #include "profile.h" #define TRACE 0 static void AppendDelete(const MSA &msaA, unsigned &uColIndexA, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendDelete ColIxA=%u ColIxCmb=%u\n", uColIndexA, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA); msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, '-'); ++uColIndexCombined; ++uColIndexA; } static void AppendInsert(const MSA &msaB, unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendInsert ColIxB=%u ColIxCmb=%u\n", uColIndexB, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) msaCombined.SetChar(uSeqIndexA, uColIndexCombined, '-'); for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); } ++uColIndexCombined; ++uColIndexB; } static void AppendUnalignedTerminals(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA, const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE 
Log("AppendUnalignedTerminals ColIxA=%u ColIxB=%u ColIxCmb=%u\n", uColIndexA, uColIndexB, uColIndexCombined); #endif const unsigned uLengthA = msaA.GetColCount(); const unsigned uLengthB = msaB.GetColCount(); unsigned uNewColCount = uColCountA; if (uColCountB > uNewColCount) uNewColCount = uColCountB; for (unsigned n = 0; n < uColCountA; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA + n); c = UnalignChar(c); msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c); } } for (unsigned n = uColCountA; n < uNewColCount; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.'); } for (unsigned n = 0; n < uColCountB; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB + n); c = UnalignChar(c); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c); } } for (unsigned n = uColCountB; n < uNewColCount; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.'); } uColIndexCombined += uNewColCount; uColIndexA += uColCountA; uColIndexB += uColCountB; } static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB, unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n", uColIndexA, uColIndexB, uColIndexCombined); #endif for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA); msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); } ++uColIndexA; ++uColIndexB; 
++uColIndexCombined; } void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB, MSA &msaCombined) { msaCombined.Clear(); #if TRACE Log("AlignTwoMSAsGivenPathSW\n"); Log("Template A:\n"); msaA.LogMe(); Log("Template B:\n"); msaB.LogMe(); #endif const unsigned uColCountA = msaA.GetColCount(); const unsigned uColCountB = msaB.GetColCount(); const unsigned uSeqCountA = msaA.GetSeqCount(); const unsigned uSeqCountB = msaB.GetSeqCount(); msaCombined.SetSeqCount(uSeqCountA + uSeqCountB); // Copy sequence names into combined MSA for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA)); msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA)); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB)); msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB)); } unsigned uColIndexA = 0; unsigned uColIndexB = 0; unsigned uColIndexCombined = 0; const unsigned uEdgeCount = Path.GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); #if TRACE Log("\nEdge %u %c%u.%u\n", uEdgeIndex, Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); #endif const char cType = Edge.cType; const unsigned uPrefixLengthA = Edge.uPrefixLengthA; unsigned uColCountA = 0; if (uPrefixLengthA > 0) { const unsigned uNodeIndexA = uPrefixLengthA - 1; const unsigned uTplColIndexA = uNodeIndexA; if (uTplColIndexA > uColIndexA) uColCountA = uTplColIndexA - uColIndexA; } const unsigned uPrefixLengthB = Edge.uPrefixLengthB; unsigned uColCountB = 0; if (uPrefixLengthB > 0) { const unsigned uNodeIndexB = uPrefixLengthB - 1; const unsigned uTplColIndexB = uNodeIndexB; if (uTplColIndexB > uColIndexB) uColCountB = uTplColIndexB - uColIndexB; } AppendUnalignedTerminals(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB, 
uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); switch (cType) { case 'M': { assert(uPrefixLengthA > 0); assert(uPrefixLengthB > 0); const unsigned uColA = uPrefixLengthA - 1; const unsigned uColB = uPrefixLengthB - 1; assert(uColIndexA == uColA); assert(uColIndexB == uColB); AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } case 'D': { assert(uPrefixLengthA > 0); const unsigned uColA = uPrefixLengthA - 1; assert(uColIndexA == uColA); AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } case 'I': { assert(uPrefixLengthB > 0); const unsigned uColB = uPrefixLengthB - 1; assert(uColIndexB == uColB); AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } default: assert(false); } } unsigned uInsertColCountA = uColCountA - uColIndexA; unsigned uInsertColCountB = uColCountB - uColIndexB; AppendUnalignedTerminals(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB, uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); } aligntwomsas.cpp0000664000175000017500000000166212360262614012371 0ustar bobbob#include "muscle.h" #include "msa.h" #include "profile.h" #include "pwpath.h" #include "textfile.h" #include "timing.h" SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path, bool bLockLeft, bool bLockRight) { const unsigned uLengthA = msa1.GetColCount(); const unsigned uLengthB = msa2.GetColCount(); ProfPos *PA = ProfileFromMSA(msa1); ProfPos *PB = ProfileFromMSA(msa2); if (bLockLeft) { PA[0].m_scoreGapOpen = MINUS_INFINITY; PB[0].m_scoreGapOpen = MINUS_INFINITY; } if (bLockRight) { PA[uLengthA-1].m_scoreGapClose = MINUS_INFINITY; PB[uLengthB-1].m_scoreGapClose = MINUS_INFINITY; } float r = (float) uLengthA/ (float) (uLengthB + 1); // +1 to prevent div 0 if (r < 1) r = 1/r; SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path); AlignTwoMSAsGivenPath(Path, msa1, msa2, 
msaOut); delete[] PA; delete[] PB; return Score; } aligntwoprofs.cpp0000664000175000017500000000147712360262614012563 0ustar bobbob#include "muscle.h" #include "msa.h" #include "profile.h" #include "pwpath.h" SCORE GlobalAlign4(ProfPos *PA, unsigned uLengthA, ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE AlignTwoProfs( const ProfPos *PA, unsigned uLengthA, WEIGHT wA, const ProfPos *PB, unsigned uLengthB, WEIGHT wB, PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut) { assert(uLengthA < 100000); assert(uLengthB < 100000); float r = (float) uLengthA/ (float) (uLengthB + 1); // +1 to prevent div 0 if (r < 1) r = 1/r; SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path); AlignTwoProfsGivenPath(Path, PA, uLengthB, wA/(wA + wB), PB, uLengthB, wB/(wA + wB), ptrPout, ptruLengthOut); #if HYDRO if (ALPHA_Amino == g_Alpha) Hydro(*ptrPout, *ptruLengthOut); #endif return Score; } aln.cpp0000664000175000017500000001102312360262613010422 0ustar bobbob#include "muscle.h" #include #include #include "msa.h" #include "textfile.h" const unsigned uCharsPerLine = 60; const int MIN_NAME = 10; const int MAX_NAME = 32; static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex); void MSA::ToAlnFile(TextFile &File) const { if (g_bClwStrict) File.PutString("CLUSTAL W (1.81) multiple sequence alignment\n"); else { File.PutString("MUSCLE (" SHORT_VERSION ")" " multiple sequence alignment\n"); File.PutString("\n"); } int iLongestNameLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char *ptrName = GetSeqName(uSeqIndex); const char *ptrBlank = strchr(ptrName, ' '); int iLength; if (0 != ptrBlank) iLength = (int) (ptrBlank - ptrName); else iLength = (int) strlen(ptrName); if (iLength > iLongestNameLength) iLongestNameLength = iLength; } if (iLongestNameLength > MAX_NAME) iLongestNameLength = MAX_NAME; if (iLongestNameLength < MIN_NAME) iLongestNameLength = MIN_NAME; unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1; for 
(unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex) { File.PutString("\n"); unsigned uStartColIndex = uLineIndex*uCharsPerLine; unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1; if (uEndColIndex >= GetColCount()) uEndColIndex = GetColCount() - 1; char Name[MAX_NAME+1]; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char *ptrName = GetSeqName(uSeqIndex); const char *ptrBlank = strchr(ptrName, ' '); int iLength; if (0 != ptrBlank) iLength = (int) (ptrBlank - ptrName); else iLength = (int) strlen(ptrName); if (iLength > MAX_NAME) iLength = MAX_NAME; memset(Name, ' ', MAX_NAME); memcpy(Name, ptrName, iLength); Name[iLongestNameLength] = 0; File.PutFormat("%s ", Name); for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; ++uColIndex) { const char c = GetChar(uSeqIndex, uColIndex); File.PutFormat("%c", toupper(c)); } File.PutString("\n"); } memset(Name, ' ', MAX_NAME); Name[iLongestNameLength] = 0; File.PutFormat("%s ", Name); for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; ++uColIndex) { const char c = GetAlnConsensusChar(*this, uColIndex); File.PutChar(c); } File.PutString("\n"); } } static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex) { const unsigned uSeqCount = a.GetSeqCount(); unsigned BitMap = 0; unsigned Count = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uLetter = a.GetLetterEx(uSeqIndex, uColIndex); assert(uLetter < 32); unsigned Bit = (1 << uLetter); if (!(BitMap & Bit)) ++Count; BitMap |= Bit; } // '*' indicates positions which have a single, fully conserved residue if (1 == Count) return '*'; if (ALPHA_Amino != g_Alpha) return ' '; #define B(a) (1 << AX_##a) #define S2(a, b) S(B(a) | B(b)) #define S3(a, b, c) S(B(a) | B(b) | B(c)) #define S4(a, b, c, d) S(B(a) | B(b) | B(c) | B(d)) #define S(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return ':'; #define W3(a, b, c) W(B(a) | B(b) | B(c)) #define W4(a, b, c, d) 
W(B(a) | B(b) | B(c) | B(d)) #define W5(a, b, c, d, e) W(B(a) | B(b) | B(c) | B(d) | B(e)) #define W6(a, b, c, d, e, f) W(B(a) | B(b) | B(c) | B(d) | B(e) | B(f)) #define W(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return '.'; // ':' indicates that one of the following 'strong' // groups is fully conserved // STA // NEQK // NHQK // NDEQ // QHRK // MILV // MILF // HY // FYW // S3(S, T, A) S4(N, E, Q, K) S4(N, H, Q, K) S4(N, D, E, Q) S4(M, I, L, V) S4(M, I, L, F) S2(H, Y) S3(F, Y, W) // '.' indicates that one of the following 'weaker' // groups is fully conserved // CSA // ATV // SAG // STNK // STPA // SGND // SNDEQK // NDEQHK // NEQHRK // FVLIM // HFY W3(C, S, A) W3(A, T, V) W3(S, A, G) W4(S, T, N, K) W4(S, T, P, A) W4(S, G, N, D) W6(S, N, D, E, Q, K) W6(N, W, Q, H, R, K) W5(F, V, L, I, M) W3(H, F, Y) return ' '; } alpha.cpp0000664000175000017500000001304312360262614010742 0ustar bobbob#include "muscle.h" #include /*** From Bioperl docs: Extended DNA / RNA alphabet ------------------------------------------ Symbol Meaning Nucleic Acid ------------------------------------------ A A Adenine C C Cytosine G G Guanine T T Thymine U U Uracil M A or C R A or G W A or T S C or G Y C or T K G or T V A or C or G H A or C or T D A or G or T B C or G or T X G or A or T or C N G or A or T or C IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE: Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030. 
***/ unsigned g_CharToLetter[MAX_CHAR]; unsigned g_CharToLetterEx[MAX_CHAR]; char g_LetterToChar[MAX_ALPHA]; char g_LetterExToChar[MAX_ALPHA_EX]; char g_UnalignChar[MAX_CHAR]; char g_AlignChar[MAX_CHAR]; bool g_IsWildcardChar[MAX_CHAR]; bool g_IsResidueChar[MAX_CHAR]; ALPHA g_Alpha = ALPHA_Undefined; unsigned g_AlphaSize = 0; #define Res(c, Letter) \ { \ const unsigned char Upper = (unsigned char) toupper(c); \ const unsigned char Lower = (unsigned char) tolower(c); \ g_CharToLetter[Upper] = Letter; \ g_CharToLetter[Lower] = Letter; \ g_CharToLetterEx[Upper] = Letter; \ g_CharToLetterEx[Lower] = Letter; \ g_LetterToChar[Letter] = Upper; \ g_LetterExToChar[Letter] = Upper; \ g_IsResidueChar[Upper] = true; \ g_IsResidueChar[Lower] = true; \ g_AlignChar[Upper] = Upper; \ g_AlignChar[Lower] = Upper; \ g_UnalignChar[Upper] = Lower; \ g_UnalignChar[Lower] = Lower; \ } #define Wild(c, Letter) \ { \ const unsigned char Upper = (unsigned char) toupper(c); \ const unsigned char Lower = (unsigned char) tolower(c); \ g_CharToLetterEx[Upper] = Letter; \ g_CharToLetterEx[Lower] = Letter; \ g_LetterExToChar[Letter] = Upper; \ g_IsResidueChar[Upper] = true; \ g_IsResidueChar[Lower] = true; \ g_AlignChar[Upper] = Upper; \ g_AlignChar[Lower] = Upper; \ g_UnalignChar[Upper] = Lower; \ g_UnalignChar[Lower] = Lower; \ g_IsWildcardChar[Lower] = true; \ g_IsWildcardChar[Upper] = true; \ } static unsigned GetAlphaSize(ALPHA Alpha) { switch (Alpha) { case ALPHA_Amino: return 20; case ALPHA_RNA: case ALPHA_DNA: return 4; } Quit("Invalid Alpha=%d", Alpha); return 0; } static void InitArrays() { memset(g_CharToLetter, 0xff, sizeof(g_CharToLetter)); memset(g_CharToLetterEx, 0xff, sizeof(g_CharToLetterEx)); memset(g_LetterToChar, '?', sizeof(g_LetterToChar)); memset(g_LetterExToChar, '?', sizeof(g_LetterExToChar)); memset(g_AlignChar, '?', sizeof(g_UnalignChar)); memset(g_UnalignChar, '?', sizeof(g_UnalignChar)); memset(g_IsWildcardChar, 0, sizeof(g_IsWildcardChar)); } static void 
SetGapChar(char c) { unsigned char u = (unsigned char) c; g_CharToLetterEx[u] = AX_GAP; g_LetterExToChar[AX_GAP] = u; g_AlignChar[u] = u; g_UnalignChar[u] = u; } static void SetAlphaDNA() { Res('A', NX_A) Res('C', NX_C) Res('G', NX_G) Res('T', NX_T) Wild('M', NX_M) Wild('R', NX_R) Wild('W', NX_W) Wild('S', NX_S) Wild('Y', NX_Y) Wild('K', NX_K) Wild('V', NX_V) Wild('H', NX_H) Wild('D', NX_D) Wild('B', NX_B) Wild('X', NX_X) Wild('N', NX_N) } static void SetAlphaRNA() { Res('A', NX_A) Res('C', NX_C) Res('G', NX_G) Res('U', NX_U) Res('T', NX_T) Wild('M', NX_M) Wild('R', NX_R) Wild('W', NX_W) Wild('S', NX_S) Wild('Y', NX_Y) Wild('K', NX_K) Wild('V', NX_V) Wild('H', NX_H) Wild('D', NX_D) Wild('B', NX_B) Wild('X', NX_X) Wild('N', NX_N) } static void SetAlphaAmino() { Res('A', AX_A) Res('C', AX_C) Res('D', AX_D) Res('E', AX_E) Res('F', AX_F) Res('G', AX_G) Res('H', AX_H) Res('I', AX_I) Res('K', AX_K) Res('L', AX_L) Res('M', AX_M) Res('N', AX_N) Res('P', AX_P) Res('Q', AX_Q) Res('R', AX_R) Res('S', AX_S) Res('T', AX_T) Res('V', AX_V) Res('W', AX_W) Res('Y', AX_Y) Wild('B', AX_B) Wild('X', AX_X) Wild('Z', AX_Z) } void SetAlpha(ALPHA Alpha) { InitArrays(); SetGapChar('.'); SetGapChar('-'); switch (Alpha) { case ALPHA_Amino: SetAlphaAmino(); break; case ALPHA_DNA: SetAlphaDNA(); case ALPHA_RNA: SetAlphaRNA(); break; default: Quit("Invalid Alpha=%d", Alpha); } g_AlphaSize = GetAlphaSize(Alpha); g_Alpha = Alpha; if (g_bVerbose) Log("Alphabet %s\n", ALPHAToStr(g_Alpha)); } char GetWildcardChar() { switch (g_Alpha) { case ALPHA_Amino: return 'X'; case ALPHA_DNA: case ALPHA_RNA: return 'N'; default: Quit("Invalid Alpha=%d", g_Alpha); } return '?'; } bool IsNucleo(char c) { return strchr("ACGTURYNacgturyn", c) != 0; } bool IsDNA(char c) { return strchr("AGCTNagctn", c) != 0; } bool IsRNA(char c) { return strchr("AGCUNagcun", c) != 0; } static char InvalidLetters[256]; static int InvalidLetterCount = 0; void ClearInvalidLetterWarning() { memset(InvalidLetters, 0, 256); } void 
InvalidLetterWarning(char c, char w) { InvalidLetters[(unsigned char) c] = 1; ++InvalidLetterCount; } void ReportInvalidLetters() { if (0 == InvalidLetterCount) return; char Str[257]; memset(Str, 0, 257); int n = 0; for (int i = 0; i < 256; ++i) { if (InvalidLetters[i]) Str[n++] = (char) i; } Warning("Assuming %s (see -seqtype option), invalid letters found: %s", ALPHAToStr(g_Alpha), Str); } anchors.cpp0000664000175000017500000001407112360262614011314 0ustar bobbob#include "muscle.h" #include "msa.h" #include "objscore.h" #define TRACE 0 static void WindowSmooth(const SCORE Score[], unsigned uCount, unsigned uWindowLength, SCORE SmoothScore[], double dCeil) { #define Ceil(x) ((SCORE) ((x) > dCeil ? dCeil : (x))) if (1 != uWindowLength%2) Quit("WindowSmooth=%u must be odd", uWindowLength); if (uCount <= uWindowLength) { for (unsigned i = 0; i < uCount; ++i) SmoothScore[i] = 0; return; } const unsigned w2 = uWindowLength/2; for (unsigned i = 0; i < w2; ++i) { SmoothScore[i] = 0; SmoothScore[uCount - i - 1] = 0; } SCORE scoreWindowTotal = 0; for (unsigned i = 0; i < uWindowLength; ++i) { scoreWindowTotal += Ceil(Score[i]); } for (unsigned i = w2; ; ++i) { SmoothScore[i] = scoreWindowTotal/uWindowLength; if (i == uCount - w2 - 1) break; scoreWindowTotal -= Ceil(Score[i - w2]); scoreWindowTotal += Ceil(Score[i + w2 + 1]); } #undef Ceil } // Find columns that score above the given threshold. // A range of scores is defined between the average // and the maximum. The threshold is a fraction 0.0 .. 1.0 // within that range, where 0.0 is the average score // and 1.0 is the maximum score. // "Grade" is by analogy with grading on a curve. 
static void FindBestColsGrade(const SCORE Score[], unsigned uCount, double dThreshold, unsigned BestCols[], unsigned *ptruBestColCount) { SCORE scoreTotal = 0; for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) scoreTotal += Score[uIndex]; const SCORE scoreAvg = scoreTotal / uCount; SCORE scoreMax = MINUS_INFINITY; for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) if (Score[uIndex] > scoreMax) scoreMax = Score[uIndex]; unsigned uBestColCount = 0; for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) { const SCORE s = Score[uIndex]; const double dHeight = (s - scoreAvg)/(scoreMax - scoreAvg); if (dHeight >= dThreshold) { BestCols[uBestColCount] = uIndex; ++uBestColCount; } } *ptruBestColCount = uBestColCount; } // Best col only if all following criteria satisfied: // (1) Score >= min // (2) Smoothed score >= min // (3) No gaps. static void FindBestColsCombo(const MSA &msa, const SCORE Score[], const SCORE SmoothScore[], double dMinScore, double dMinSmoothScore, unsigned BestCols[], unsigned *ptruBestColCount) { const unsigned uColCount = msa.GetColCount(); unsigned uBestColCount = 0; for (unsigned uIndex = 0; uIndex < uColCount; ++uIndex) { if (Score[uIndex] < dMinScore) continue; if (SmoothScore[uIndex] < dMinSmoothScore) continue; if (msa.ColumnHasGap(uIndex)) continue; BestCols[uBestColCount] = uIndex; ++uBestColCount; } *ptruBestColCount = uBestColCount; } static void ListBestCols(const MSA &msa, const SCORE Score[], const SCORE SmoothScore[], unsigned BestCols[], unsigned uBestColCount) { const unsigned uColCount = msa.GetColCount(); const unsigned uSeqCount = msa.GetSeqCount(); Log("Col "); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%u", uSeqIndex%10); Log(" "); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { Log("%3u ", uColIndex); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%c", msa.GetChar(uSeqIndex, uColIndex)); Log(" %10.3f", Score[uColIndex]); Log(" %10.3f", 
SmoothScore[uColIndex]); for (unsigned i = 0; i < uBestColCount; ++i) if (BestCols[i] == uColIndex) Log(" <-- Best"); Log("\n"); } } // If two best columns are found within a window, choose // the highest-scoring. If more than two, choose the one // closest to the center of the window. static void MergeBestCols(const SCORE Scores[], const unsigned BestCols[], unsigned uBestColCount, unsigned uWindowLength, unsigned AnchorCols[], unsigned *ptruAnchorColCount) { unsigned uAnchorColCount = 0; for (unsigned n = 0; n < uBestColCount; /* update inside loop */) { unsigned uBestColIndex = BestCols[n]; unsigned uCountWithinWindow = 0; for (unsigned i = n + 1; i < uBestColCount; ++i) { unsigned uBestColIndex2 = BestCols[i]; if (uBestColIndex2 - uBestColIndex >= uWindowLength) break; ++uCountWithinWindow; } unsigned uAnchorCol = uBestColIndex; if (1 == uCountWithinWindow) { unsigned uBestColIndex2 = BestCols[n+1]; if (Scores[uBestColIndex] > Scores[uBestColIndex2]) uAnchorCol = uBestColIndex; else uAnchorCol = uBestColIndex2; } else if (uCountWithinWindow > 1) { unsigned uWindowCenter = uBestColIndex + uWindowLength/2; int iClosestDist = uWindowLength; unsigned uClosestCol = uBestColIndex; for (unsigned i = n + 1; i < n + uCountWithinWindow; ++i) { unsigned uColIndex = BestCols[i]; int iDist = uColIndex - uBestColIndex; if (iDist < 0) iDist = -iDist; if (iDist < iClosestDist) { uClosestCol = uColIndex; iClosestDist = iDist; } } uAnchorCol = uClosestCol; } AnchorCols[uAnchorColCount] = uAnchorCol; ++uAnchorColCount; n += uCountWithinWindow + 1; } *ptruAnchorColCount = uAnchorColCount; } void FindAnchorCols(const MSA &msa, unsigned AnchorCols[], unsigned *ptruAnchorColCount) { const unsigned uColCount = msa.GetColCount(); if (uColCount < 16) { *ptruAnchorColCount = 0; return; } SCORE *MatchScore = new SCORE[uColCount]; SCORE *SmoothScore = new SCORE[uColCount]; unsigned *BestCols = new unsigned[uColCount]; GetLetterScores(msa, MatchScore); WindowSmooth(MatchScore, uColCount, 
g_uSmoothWindowLength, SmoothScore, g_dSmoothScoreCeil); unsigned uBestColCount; FindBestColsCombo(msa, MatchScore, SmoothScore, g_dMinBestColScore, g_dMinSmoothScore, BestCols, &uBestColCount); #if TRACE ListBestCols(msa, MatchScore, SmoothScore, BestCols, uBestColCount); #endif MergeBestCols(MatchScore, BestCols, uBestColCount, g_uAnchorSpacing, AnchorCols, ptruAnchorColCount); delete[] MatchScore; delete[] SmoothScore; delete[] BestCols; } bittraceback.cpp0000664000175000017500000000640212360262614012274 0ustar bobbob#include "muscle.h" #include "pwpath.h" #define TRACE 0 static char XlatEdgeType(char c) { if ('E' == c) return 'D'; if ('J' == c) return 'I'; return c; } static const char *BitsToStr(char Bits) { static char Str[] = "xM xD xI"; switch (Bits & BIT_xM) { case BIT_MM: Str[0] = 'M'; break; case BIT_DM: Str[0] = 'D'; break; case BIT_IM: Str[0] = 'I'; break; } switch (Bits & BIT_xD) { case BIT_MD: Str[3] = 'M'; break; case BIT_DD: Str[3] = 'D'; break; } switch (Bits & BIT_xI) { case BIT_MI: Str[6] = 'M'; break; case BIT_II: Str[6] = 'I'; break; } return Str; } static inline char XChar(char Bits, char cType) { switch (cType) { case 'M': { switch (Bits & BIT_xM) { case BIT_MM: return 'M'; case BIT_DM: return 'D'; case BIT_IM: return 'I'; #if DOUBLE_AFFINE case BIT_EM: return 'E'; case BIT_JM: return 'J'; #endif } Quit("Huh!?"); return '?'; } case 'D': { switch (Bits & BIT_xD) { case BIT_MD: return 'M'; case BIT_DD: return 'D'; } Quit("Huh!?"); return '?'; } case 'I': { switch (Bits & BIT_xI) { case BIT_MI: return 'M'; case BIT_II: return 'I'; } Quit("Huh!?"); return '?'; } #if DOUBLE_AFFINE case 'E': { switch (Bits & BIT_xE) { case BIT_ME: return 'M'; case BIT_EE: return 'E'; } Quit("Huh!?"); return '?'; } case 'J': { switch (Bits & BIT_xJ) { case BIT_MJ: return 'M'; case BIT_JJ: return 'J'; } Quit("Huh!?"); return '?'; } #endif default: Quit("Huh?"); return '?'; } } void BitTraceBack(char **TraceBack, unsigned uLengthA, unsigned uLengthB, char LastEdge, 
PWPath &Path) { #if TRACE Log("BitTraceBack\n"); #endif Path.Clear(); PWEdge Edge; Edge.uPrefixLengthA = uLengthA; Edge.uPrefixLengthB = uLengthB; char Bits = TraceBack[uLengthA][uLengthB]; Edge.cType = LastEdge; for (;;) { #if TRACE Log("Prepend %c%d.%d\n", Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); #endif char cSave = Edge.cType; Edge.cType = XlatEdgeType(cSave); Path.PrependEdge(Edge); Edge.cType = cSave; unsigned PLA = Edge.uPrefixLengthA; unsigned PLB = Edge.uPrefixLengthB; char Bits = TraceBack[PLA][PLB]; char NextEdgeType = XChar(Bits, Edge.cType); #if TRACE Log("XChar(%s, %c) = %c\n", BitsToStr(Bits), Edge.cType, NextEdgeType); #endif switch (Edge.cType) { case 'M': { if (Edge.uPrefixLengthA == 0) Quit("BitTraceBack MA=0"); if (Edge.uPrefixLengthB == 0) Quit("BitTraceBack MA=0"); --(Edge.uPrefixLengthA); --(Edge.uPrefixLengthB); break; } case 'D': case 'E': { if (Edge.uPrefixLengthA == 0) Quit("BitTraceBack DA=0"); --(Edge.uPrefixLengthA); break; } case 'I': case 'J': { if (Edge.uPrefixLengthB == 0) Quit("BitTraceBack IB=0"); --(Edge.uPrefixLengthB); break; } default: Quit("BitTraceBack: Invalid edge %c", Edge); } if (0 == Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB) break; Edge.cType = NextEdgeType; } #if TRACE Path.LogMe(); #endif } blosum62.cpp0000664000175000017500000000377712360262613011342 0ustar bobbob#include "muscle.h" int BLOSUM62[20][20] = { // A C D E F G H I K L M N P Q R S T V W Y { 4, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, -2}, // A { 0, 9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2}, // C {-2, -3, 6, 2, -3, -1, -1, -3, -1, -4, -3, 1, -1, 0, -2, 0, -1, -3, -4, -3}, // D {-1, -4, 2, 5, -3, -2, 0, -3, 1, -3, -2, 0, -1, 2, 0, 0, -1, -2, -3, -2}, // E {-2, -2, -3, -3, 6, -3, -1, 0, -3, 0, 0, -3, -4, -3, -3, -2, -2, -1, 1, 3}, // F { 0, -3, -1, -2, -3, 6, -2, -4, -2, -4, -3, 0, -2, -2, -2, 0, -2, -3, -2, -3}, // G {-2, -3, -1, 0, -1, -2, 8, -3, -1, -3, -2, 1, -2, 0, 0, -1, -2, 
-3, -2, 2}, // H {-1, -1, -3, -3, 0, -4, -3, 4, -3, 2, 1, -3, -3, -3, -3, -2, -1, 3, -3, -1}, // I {-1, -3, -1, 1, -3, -2, -1, -3, 5, -2, -1, 0, -1, 1, 2, 0, -1, -2, -3, -2}, // K {-1, -1, -4, -3, 0, -4, -3, 2, -2, 4, 2, -3, -3, -2, -2, -2, -1, 1, -2, -1}, // L {-1, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5, -2, -2, 0, -1, -1, -1, 1, -1, -1}, // M {-2, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -2}, // N {-1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7, -1, -2, -1, -1, -2, -4, -3}, // P {-1, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5, 1, 0, -1, -2, -2, -1}, // Q {-1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5, -1, -1, -3, -3, -2}, // R { 1, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4, 1, -2, -3, -2}, // S { 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5, 0, -2, -2}, // T { 0, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4, -3, -1}, // V {-3, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, 2}, // W {-2, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, 7}, // Y }; double BLOSUM62_Expected = -0.5209; blosumla.cpp0000664000175000017500000001505712360262614011502 0ustar bobbob#include "muscle.h" #define GAPVAL 0.3 #define GAPGAPVAL 5.0 // Blosum62 log-average factor matrix static float Blosum62LA[20][20] = { #define v(x) ((float) x) #define S_ROW(n, c, A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ v(R), v(S), v(T), v(V), v(W), v(Y) }, // Blosum62 log average matrix // A C D E F // G H I K L // M N P Q R // S T V W Y S_ROW( 0, 'A', 3.9029401, 0.8679881, 0.5446049, 0.7412640, 0.4648942, 1.0568696, 0.5693654, 0.6324813, 0.7753898, 0.6019460, 0.7231498, 0.5883077, 0.7541214, 0.7568035, 0.6126988, 1.4721037, 0.9844022, 0.9364584, 0.4165484, 0.5426125) S_ROW( 1, 'C', 0.8679881, 19.5765802, 0.3014542, 0.2859347, 0.4389910, 0.4203886, 0.3550472, 0.6534589, 0.3491296, 
0.6422760, 0.6113537, 0.3978026, 0.3795628, 0.3657796, 0.3089379, 0.7384148, 0.7405530, 0.7558448, 0.4499807, 0.4342013) S_ROW( 2, 'D', 0.5446049, 0.3014542, 7.3979253, 1.6878109, 0.2989696, 0.6343015, 0.6785593, 0.3390155, 0.7840905, 0.2866128, 0.3464547, 1.5538520, 0.5987177, 0.8970811, 0.5732000, 0.9135051, 0.6947898, 0.3365004, 0.2321050, 0.3456829) S_ROW( 3, 'E', 0.7412640, 0.2859347, 1.6878109, 5.4695276, 0.3307441, 0.4812675, 0.9600400, 0.3305223, 1.3082782, 0.3728734, 0.5003421, 0.9112983, 0.6792027, 1.9017376, 0.9607983, 0.9503570, 0.7414260, 0.4289431, 0.3743021, 0.4964664) S_ROW( 4, 'F', 0.4648942, 0.4389910, 0.2989696, 0.3307441, 8.1287983, 0.3406407, 0.6519893, 0.9457698, 0.3440433, 1.1545978, 1.0043715, 0.3542882, 0.2874440, 0.3339729, 0.3807263, 0.4399736, 0.4816930, 0.7450894, 1.3743775, 2.7693817) S_ROW( 5, 'G', 1.0568696, 0.4203886, 0.6343015, 0.4812675, 0.3406407, 6.8763075, 0.4929663, 0.2750096, 0.5888716, 0.2845039, 0.3954865, 0.8637114, 0.4773858, 0.5386498, 0.4499840, 0.9035965, 0.5792712, 0.3369551, 0.4216898, 0.3487141) S_ROW( 6, 'H', 0.5693654, 0.3550472, 0.6785593, 0.9600400, 0.6519893, 0.4929663, 13.5060070, 0.3262878, 0.7788884, 0.3806759, 0.5841316, 1.2220028, 0.4728797, 1.1679835, 0.9170473, 0.7367319, 0.5575021, 0.3394474, 0.4440859, 1.7979036) S_ROW( 7, 'I', 0.6324813, 0.6534589, 0.3390155, 0.3305223, 0.9457698, 0.2750096, 0.3262878, 3.9979299, 0.3963730, 1.6944349, 1.4777449, 0.3279345, 0.3846629, 0.3829375, 0.3547509, 0.4431634, 0.7798163, 2.4175121, 0.4088732, 0.6303898) S_ROW( 8, 'K', 0.7753898, 0.3491296, 0.7840905, 1.3082782, 0.3440433, 0.5888716, 0.7788884, 0.3963730, 4.7643359, 0.4282702, 0.6253033, 0.9398419, 0.7037741, 1.5543233, 2.0768092, 0.9319192, 0.7929060, 0.4565429, 0.3589319, 0.5321784) S_ROW( 9, 'L', 0.6019460, 0.6422760, 0.2866128, 0.3728734, 1.1545978, 0.2845039, 0.3806759, 1.6944349, 0.4282702, 3.7966214, 1.9942957, 0.3100430, 0.3711219, 0.4773261, 0.4739194, 0.4288939, 0.6603292, 1.3142355, 0.5680359, 
0.6920589) S_ROW(10, 'M', 0.7231498, 0.6113537, 0.3464547, 0.5003421, 1.0043715, 0.3954865, 0.5841316, 1.4777449, 0.6253033, 1.9942957, 6.4814549, 0.4745299, 0.4238960, 0.8642486, 0.6226249, 0.5985578, 0.7938018, 1.2689365, 0.6103022, 0.7083636) S_ROW(11, 'N', 0.5883077, 0.3978026, 1.5538520, 0.9112983, 0.3542882, 0.8637114, 1.2220028, 0.3279345, 0.9398419, 0.3100430, 0.4745299, 7.0940964, 0.4999337, 1.0005835, 0.8586298, 1.2315289, 0.9841525, 0.3690340, 0.2777841, 0.4860309) S_ROW(12, 'P', 0.7541214, 0.3795628, 0.5987177, 0.6792027, 0.2874440, 0.4773858, 0.4728797, 0.3846629, 0.7037741, 0.3711219, 0.4238960, 0.4999337, 12.8375452, 0.6412803, 0.4815348, 0.7555033, 0.6888962, 0.4430825, 0.2818321, 0.3635216) S_ROW(13, 'Q', 0.7568035, 0.3657796, 0.8970811, 1.9017376, 0.3339729, 0.5386498, 1.1679835, 0.3829375, 1.5543233, 0.4773261, 0.8642486, 1.0005835, 0.6412803, 6.2444210, 1.4057958, 0.9655559, 0.7913219, 0.4667781, 0.5093584, 0.6110951) S_ROW(14, 'R', 0.6126988, 0.3089379, 0.5732000, 0.9607983, 0.3807263, 0.4499840, 0.9170473, 0.3547509, 2.0768092, 0.4739194, 0.6226249, 0.8586298, 0.4815348, 1.4057958, 6.6655769, 0.7671661, 0.6777544, 0.4200721, 0.3951049, 0.5559652) S_ROW(15, 'S', 1.4721037, 0.7384148, 0.9135051, 0.9503570, 0.4399736, 0.9035965, 0.7367319, 0.4431634, 0.9319192, 0.4288939, 0.5985578, 1.2315289, 0.7555033, 0.9655559, 0.7671661, 3.8428476, 1.6139205, 0.5652240, 0.3853031, 0.5575206) S_ROW(16, 'T', 0.9844022, 0.7405530, 0.6947898, 0.7414260, 0.4816930, 0.5792712, 0.5575021, 0.7798163, 0.7929060, 0.6603292, 0.7938018, 0.9841525, 0.6888962, 0.7913219, 0.6777544, 1.6139205, 4.8321048, 0.9809432, 0.4309317, 0.5731577) S_ROW(17, 'V', 0.9364584, 0.7558448, 0.3365004, 0.4289431, 0.7450894, 0.3369551, 0.3394474, 2.4175121, 0.4565429, 1.3142355, 1.2689365, 0.3690340, 0.4430825, 0.4667781, 0.4200721, 0.5652240, 0.9809432, 3.6921553, 0.3744576, 0.6580390) S_ROW(18, 'W', 0.4165484, 0.4499807, 0.2321050, 0.3743021, 1.3743775, 0.4216898, 0.4440859, 0.4088732, 
0.3589319, 0.5680359, 0.6103022, 0.2777841, 0.2818321, 0.5093584, 0.3951049, 0.3853031, 0.4309317, 0.3744576, 38.1077830, 2.1098056) S_ROW(19, 'Y', 0.5426125, 0.4342013, 0.3456829, 0.4964664, 2.7693817, 0.3487141, 1.7979036, 0.6303898, 0.5321784, 0.6920589, 0.7083636, 0.4860309, 0.3635216, 0.6110951, 0.5559652, 0.5575206, 0.5731577, 0.6580390, 2.1098056, 9.8322054) }; clust.cpp0000664000175000017500000004460612360262614011020 0ustar bobbob#include "muscle.h" #include "clust.h" #include "clustset.h" #include #define TRACE 0 Clust::Clust() { m_Nodes = 0; m_uNodeCount = 0; m_uLeafCount = 0; m_uClusterCount = 0; m_JoinStyle = JOIN_Undefined; m_dDist = 0; m_uLeafCount = 0; m_ptrSet = 0; } Clust::~Clust() { delete[] m_Nodes; delete[] m_dDist; delete[] m_ClusterIndexToNodeIndex; } void Clust::Create(ClustSet &Set, CLUSTER Method) { m_ptrSet = &Set; SetLeafCount(Set.GetLeafCount()); switch (Method) { case CLUSTER_UPGMA: m_JoinStyle = JOIN_NearestNeighbor; m_CentroidStyle = LINKAGE_Avg; break; case CLUSTER_UPGMAMax: m_JoinStyle = JOIN_NearestNeighbor; m_CentroidStyle = LINKAGE_Max; break; case CLUSTER_UPGMAMin: m_JoinStyle = JOIN_NearestNeighbor; m_CentroidStyle = LINKAGE_Min; break; case CLUSTER_UPGMB: m_JoinStyle = JOIN_NearestNeighbor; m_CentroidStyle = LINKAGE_Biased; break; case CLUSTER_NeighborJoining: m_JoinStyle = JOIN_NeighborJoining; m_CentroidStyle = LINKAGE_NeighborJoining; break; default: Quit("Clust::Create, invalid method %d", Method); } if (m_uLeafCount <= 1) Quit("Clust::Create: no leaves"); m_uNodeCount = 2*m_uLeafCount - 1; m_Nodes = new ClustNode[m_uNodeCount]; m_ClusterIndexToNodeIndex = new unsigned[m_uLeafCount]; m_ptrClusterList = 0; for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { ClustNode &Node = m_Nodes[uNodeIndex]; Node.m_uIndex = uNodeIndex; if (uNodeIndex < m_uLeafCount) { Node.m_uSize = 1; Node.m_uLeafIndexes = new unsigned[1]; Node.m_uLeafIndexes[0] = uNodeIndex; AddToClusterList(uNodeIndex); } else Node.m_uSize = 0; 
}

// Compute initial distance matrix between leaves
	SetProgressDesc("Build dist matrix");
	unsigned uPairIndex = 0;
	const unsigned uPairCount = (m_uLeafCount*(m_uLeafCount - 1))/2;
	for (unsigned i = 0; i < m_uLeafCount; ++i)
		for (unsigned j = 0; j < i; ++j)
			{
			const float dDist = (float) m_ptrSet->ComputeDist(*this, i, j);
			SetDist(i, j, dDist);
			// Report progress only every 10000 pairs.
			if (0 == uPairIndex%10000)
				Progress(uPairIndex, uPairCount);
			++uPairIndex;
			}
	ProgressStepsDone();

// Call CreateCluster once for each internal node in the tree
	SetProgressDesc("Build guide tree");
	m_uClusterCount = m_uLeafCount;
	const unsigned uInternalNodeCount = m_uNodeCount - m_uLeafCount;
	for (unsigned uNodeIndex = m_uLeafCount; uNodeIndex < m_uNodeCount; ++uNodeIndex)
		{
		// 1-based step number, used for the progress display only.
		unsigned i = uNodeIndex + 1 - m_uLeafCount;
		Progress(i, uInternalNodeCount);
		CreateCluster();
		}
	ProgressStepsDone();
	}

// Perform one join: choose the pair of clusters to merge, create the new
// parent node, and compute the distance from the new node to every other
// cluster still in the cluster list.
void Clust::CreateCluster()
	{
	unsigned uLeftNodeIndex;
	unsigned uRightNodeIndex;
	float dLeftLength;
	float dRightLength;
	ChooseJoin(&uLeftNodeIndex, &uRightNodeIndex, &dLeftLength, &dRightLength);

	// m_uClusterCount starts at the leaf count and drops by one per join, so
	// this yields m_uLeafCount on the first join, then m_uLeafCount + 1, ...
	const unsigned uNewNodeIndex = m_uNodeCount - m_uClusterCount + 1;

	JoinNodes(uLeftNodeIndex, uRightNodeIndex, dLeftLength, dRightLength,
	  uNewNodeIndex);

#if	TRACE
	Log("Merge New=%u L=%u R=%u Ld=%7.2g Rd=%7.2g\n",
	  uNewNodeIndex, uLeftNodeIndex, uRightNodeIndex, dLeftLength, dRightLength);
#endif

// Compute distances to other clusters
	--m_uClusterCount;
	for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane;
	  uNodeIndex = GetNextCluster(uNodeIndex))
		{
		if (uNodeIndex == uLeftNodeIndex || uNodeIndex == uRightNodeIndex)
			continue;

		if (uNewNodeIndex == uNodeIndex)
			continue;

		const float dDist = ComputeDist(uNewNodeIndex, uNodeIndex);
		SetDist(uNewNodeIndex, uNodeIndex, dDist);
		}

	// Second pass over the same clusters; its body only exists when the
	// REDLACK macro is non-zero.
	// NOTE(review): "REDLACK" may be a misspelling of another macro name;
	// confirm against the build configuration before changing it.
	for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane;
	  uNodeIndex = GetNextCluster(uNodeIndex))
		{
		if (uNodeIndex == uLeftNodeIndex || uNodeIndex == uRightNodeIndex)
			continue;

		if (uNewNodeIndex == uNodeIndex)
			continue;

#if	REDLACK
		const float dMetric =
ComputeMetric(uNewNodeIndex, uNodeIndex); InsertMetric(uNewNodeIndex, uNodeIndex, dMetric); #endif } } void Clust::ChooseJoin(unsigned *ptruLeftIndex, unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength) { switch (m_JoinStyle) { case JOIN_NearestNeighbor: ChooseJoinNearestNeighbor(ptruLeftIndex, ptruRightIndex, ptrdLeftLength, ptrdRightLength); return; case JOIN_NeighborJoining: ChooseJoinNeighborJoining(ptruLeftIndex, ptruRightIndex, ptrdLeftLength, ptrdRightLength); return; } Quit("Clust::ChooseJoin, Invalid join style %u", m_JoinStyle); } void Clust::ChooseJoinNearestNeighbor(unsigned *ptruLeftIndex, unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength) { const unsigned uClusterCount = GetClusterCount(); unsigned uMinLeftNodeIndex; unsigned uMinRightNodeIndex; GetMinMetric(&uMinLeftNodeIndex, &uMinRightNodeIndex); float dMinDist = GetDist(uMinLeftNodeIndex, uMinRightNodeIndex); const float dLeftHeight = GetHeight(uMinLeftNodeIndex); const float dRightHeight = GetHeight(uMinRightNodeIndex); *ptruLeftIndex = uMinLeftNodeIndex; *ptruRightIndex = uMinRightNodeIndex; *ptrdLeftLength = dMinDist/2 - dLeftHeight; *ptrdRightLength = dMinDist/2 - dRightHeight; } void Clust::ChooseJoinNeighborJoining(unsigned *ptruLeftIndex, unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength) { const unsigned uClusterCount = GetClusterCount(); //unsigned uMinLeftNodeIndex = uInsane; //unsigned uMinRightNodeIndex = uInsane; //float dMinD = PLUS_INFINITY; //for (unsigned i = GetFirstCluster(); i != uInsane; i = GetNextCluster(i)) // { // const float ri = Calc_r(i); // for (unsigned j = GetNextCluster(i); j != uInsane; j = GetNextCluster(j)) // { // const float rj = Calc_r(j); // const float dij = GetDist(i, j); // const float Dij = dij - (ri + rj); // if (Dij < dMinD) // { // dMinD = Dij; // uMinLeftNodeIndex = i; // uMinRightNodeIndex = j; // } // } // } unsigned uMinLeftNodeIndex; unsigned uMinRightNodeIndex; 
GetMinMetric(&uMinLeftNodeIndex, &uMinRightNodeIndex); const float dDistLR = GetDist(uMinLeftNodeIndex, uMinRightNodeIndex); const float rL = Calc_r(uMinLeftNodeIndex); const float rR = Calc_r(uMinRightNodeIndex); const float dLeftLength = (dDistLR + rL - rR)/2; const float dRightLength = (dDistLR - rL + rR)/2; *ptruLeftIndex = uMinLeftNodeIndex; *ptruRightIndex = uMinRightNodeIndex; *ptrdLeftLength = dLeftLength; *ptrdRightLength = dRightLength; } void Clust::JoinNodes(unsigned uLeftIndex, unsigned uRightIndex, float dLeftLength, float dRightLength, unsigned uNodeIndex) { ClustNode &Parent = m_Nodes[uNodeIndex]; ClustNode &Left = m_Nodes[uLeftIndex]; ClustNode &Right = m_Nodes[uRightIndex]; Left.m_dLength = dLeftLength; Right.m_dLength = dRightLength; Parent.m_ptrLeft = &Left; Parent.m_ptrRight = &Right; Left.m_ptrParent = &Parent; Right.m_ptrParent = &Parent; const unsigned uLeftSize = Left.m_uSize; const unsigned uRightSize = Right.m_uSize; const unsigned uParentSize = uLeftSize + uRightSize; Parent.m_uSize = uParentSize; assert(0 == Parent.m_uLeafIndexes); Parent.m_uLeafIndexes = new unsigned[uParentSize]; const unsigned uLeftBytes = uLeftSize*sizeof(unsigned); const unsigned uRightBytes = uRightSize*sizeof(unsigned); memcpy(Parent.m_uLeafIndexes, Left.m_uLeafIndexes, uLeftBytes); memcpy(Parent.m_uLeafIndexes + uLeftSize, Right.m_uLeafIndexes, uRightBytes); DeleteFromClusterList(uLeftIndex); DeleteFromClusterList(uRightIndex); AddToClusterList(uNodeIndex); } float Clust::Calc_r(unsigned uNodeIndex) const { const unsigned uClusterCount = GetClusterCount(); if (2 == uClusterCount) return 0; float dSum = 0; for (unsigned i = GetFirstCluster(); i != uInsane; i = GetNextCluster(i)) { if (i == uNodeIndex) continue; dSum += GetDist(uNodeIndex, i); } return dSum/(uClusterCount - 2); } float Clust::ComputeDist(unsigned uNewNodeIndex, unsigned uNodeIndex) { switch (m_CentroidStyle) { case LINKAGE_Avg: return ComputeDistAverageLinkage(uNewNodeIndex, uNodeIndex); case 
LINKAGE_Min:
		return ComputeDistMinLinkage(uNewNodeIndex, uNodeIndex);

	case LINKAGE_Max:
		return ComputeDistMaxLinkage(uNewNodeIndex, uNodeIndex);

	case LINKAGE_Biased:
		return ComputeDistMAFFT(uNewNodeIndex, uNodeIndex);

	case LINKAGE_NeighborJoining:
		return ComputeDistNeighborJoining(uNewNodeIndex, uNodeIndex);
		}
	// Reached only when m_CentroidStyle matches none of the cases above.
	Quit("Clust::ComputeDist, invalid centroid style %u", m_CentroidStyle);
	return (float) g_dNAN;
	}

// Minimum linkage: the distance from the new node to uNodeIndex is the
// smaller of the distances from its left and right children to uNodeIndex.
float Clust::ComputeDistMinLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex)
	{
	const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
	const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
	const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
	const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
	return (dDistL < dDistR ? dDistL : dDistR);
	}

// Maximum linkage: the distance from the new node to uNodeIndex is the
// larger of the distances from its left and right children to uNodeIndex.
float Clust::ComputeDistMaxLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex)
	{
	const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
	const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
	const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
	const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
	return (dDistL > dDistR ?
dDistL : dDistR); } float Clust::ComputeDistAverageLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex) { const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex); const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex); const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex); const float dDistR = GetDist(uRightNodeIndex, uNodeIndex); return (dDistL + dDistR)/2; } float Clust::ComputeDistNeighborJoining(unsigned uNewNodeIndex, unsigned uNodeIndex) { const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex); const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex); const float dDistLR = GetDist(uLeftNodeIndex, uRightNodeIndex); const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex); const float dDistR = GetDist(uRightNodeIndex, uNodeIndex); const float dDist = (dDistL + dDistR - dDistLR)/2; return dDist; } // This is a mysterious variant of UPGMA reverse-engineered from MAFFT source. float Clust::ComputeDistMAFFT(unsigned uNewNodeIndex, unsigned uNodeIndex) { const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex); const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex); const float dDistLR = GetDist(uLeftNodeIndex, uRightNodeIndex); const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex); const float dDistR = GetDist(uRightNodeIndex, uNodeIndex); const float dMinDistLR = (dDistL < dDistR ? 
dDistL : dDistR); const float dSumDistLR = dDistL + dDistR; const float dDist = dMinDistLR*(1 - g_dSUEFF) + dSumDistLR*g_dSUEFF/2; return dDist; } unsigned Clust::GetClusterCount() const { return m_uClusterCount; } void Clust::LogMe() const { Log("Clust %u leaves, %u nodes, %u clusters.\n", m_uLeafCount, m_uNodeCount, m_uClusterCount); Log("Distance matrix\n"); const unsigned uNodeCount = GetNodeCount(); Log(" "); for (unsigned i = 0; i < uNodeCount - 1; ++i) Log(" %7u", i); Log("\n"); Log(" "); for (unsigned i = 0; i < uNodeCount - 1; ++i) Log(" ------"); Log("\n"); for (unsigned i = 0; i < uNodeCount - 1; ++i) { Log("%4u: ", i); for (unsigned j = 0; j < i; ++j) Log(" %7.2g", GetDist(i, j)); Log("\n"); } Log("\n"); Log("Node Size Prnt Left Rght Length Name\n"); Log("---- ---- ---- ---- ---- ------ ----\n"); for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { const ClustNode &Node = m_Nodes[uNodeIndex]; Log("%4u %4u", uNodeIndex, Node.m_uSize); if (0 != Node.m_ptrParent) Log(" %4u", Node.m_ptrParent->m_uIndex); else Log(" "); if (0 != Node.m_ptrLeft) Log(" %4u", Node.m_ptrLeft->m_uIndex); else Log(" "); if (0 != Node.m_ptrRight) Log(" %4u", Node.m_ptrRight->m_uIndex); else Log(" "); if (uNodeIndex != m_uNodeCount - 1) Log(" %7.3g", Node.m_dLength); if (IsLeaf(uNodeIndex)) { const char *ptrName = GetNodeName(uNodeIndex); if (0 != ptrName) Log(" %s", ptrName); } if (GetRootNodeIndex() == uNodeIndex) Log(" [ROOT]"); Log("\n"); } } const ClustNode &Clust::GetNode(unsigned uNodeIndex) const { if (uNodeIndex >= m_uNodeCount) Quit("ClustNode::GetNode(%u) %u", uNodeIndex, m_uNodeCount); return m_Nodes[uNodeIndex]; } bool Clust::IsLeaf(unsigned uNodeIndex) const { return uNodeIndex < m_uLeafCount; } unsigned Clust::GetClusterSize(unsigned uNodeIndex) const { const ClustNode &Node = GetNode(uNodeIndex); return Node.m_uSize; } unsigned Clust::GetLeftIndex(unsigned uNodeIndex) const { const ClustNode &Node = GetNode(uNodeIndex); if (0 == Node.m_ptrLeft) 
Quit("Clust::GetLeftIndex: leaf"); return Node.m_ptrLeft->m_uIndex; } unsigned Clust::GetRightIndex(unsigned uNodeIndex) const { const ClustNode &Node = GetNode(uNodeIndex); if (0 == Node.m_ptrRight) Quit("Clust::GetRightIndex: leaf"); return Node.m_ptrRight->m_uIndex; } float Clust::GetLength(unsigned uNodeIndex) const { const ClustNode &Node = GetNode(uNodeIndex); return Node.m_dLength; } void Clust::SetLeafCount(unsigned uLeafCount) { if (uLeafCount <= 1) Quit("Clust::SetLeafCount(%u)", uLeafCount); m_uLeafCount = uLeafCount; const unsigned uNodeCount = GetNodeCount(); // Triangular matrix size excluding diagonal (all zeros in our case). m_uTriangularMatrixSize = (uNodeCount*(uNodeCount - 1))/2; m_dDist = new float[m_uTriangularMatrixSize]; } unsigned Clust::GetLeafCount() const { return m_uLeafCount; } unsigned Clust::VectorIndex(unsigned uIndex1, unsigned uIndex2) const { const unsigned uNodeCount = GetNodeCount(); if (uIndex1 >= uNodeCount || uIndex2 >= uNodeCount) Quit("DistVectorIndex(%u,%u) %u", uIndex1, uIndex2, uNodeCount); unsigned v; if (uIndex1 >= uIndex2) v = uIndex2 + (uIndex1*(uIndex1 - 1))/2; else v = uIndex1 + (uIndex2*(uIndex2 - 1))/2; assert(v < m_uTriangularMatrixSize); return v; } float Clust::GetDist(unsigned uIndex1, unsigned uIndex2) const { unsigned v = VectorIndex(uIndex1, uIndex2); return m_dDist[v]; } void Clust::SetDist(unsigned uIndex1, unsigned uIndex2, float dDist) { unsigned v = VectorIndex(uIndex1, uIndex2); m_dDist[v] = dDist; } float Clust::GetHeight(unsigned uNodeIndex) const { if (IsLeaf(uNodeIndex)) return 0; const unsigned uLeftIndex = GetLeftIndex(uNodeIndex); const unsigned uRightIndex = GetRightIndex(uNodeIndex); const float dLeftLength = GetLength(uLeftIndex); const float dRightLength = GetLength(uRightIndex); const float dLeftHeight = dLeftLength + GetHeight(uLeftIndex); const float dRightHeight = dRightLength + GetHeight(uRightIndex); return (dLeftHeight + dRightHeight)/2; } const char *Clust::GetNodeName(unsigned 
uNodeIndex) const { if (!IsLeaf(uNodeIndex)) Quit("Clust::GetNodeName, is not leaf"); return m_ptrSet->GetLeafName(uNodeIndex); } unsigned Clust::GetNodeId(unsigned uNodeIndex) const { if (uNodeIndex >= GetLeafCount()) return 0; return m_ptrSet->GetLeafId(uNodeIndex); } unsigned Clust::GetLeaf(unsigned uNodeIndex, unsigned uLeafIndex) const { const ClustNode &Node = GetNode(uNodeIndex); const unsigned uLeafCount = Node.m_uSize; if (uLeafIndex >= uLeafCount) Quit("Clust::GetLeaf, invalid index"); const unsigned uIndex = Node.m_uLeafIndexes[uLeafIndex]; if (uIndex >= m_uNodeCount) Quit("Clust::GetLeaf, index out of range"); return uIndex; } unsigned Clust::GetFirstCluster() const { if (0 == m_ptrClusterList) return uInsane; return m_ptrClusterList->m_uIndex; } unsigned Clust::GetNextCluster(unsigned uIndex) const { ClustNode *ptrNode = &m_Nodes[uIndex]; if (0 == ptrNode->m_ptrNextCluster) return uInsane; return ptrNode->m_ptrNextCluster->m_uIndex; } void Clust::DeleteFromClusterList(unsigned uNodeIndex) { assert(uNodeIndex < m_uNodeCount); ClustNode *ptrNode = &m_Nodes[uNodeIndex]; ClustNode *ptrPrev = ptrNode->m_ptrPrevCluster; ClustNode *ptrNext = ptrNode->m_ptrNextCluster; if (0 != ptrNext) ptrNext->m_ptrPrevCluster = ptrPrev; if (0 == ptrPrev) { assert(m_ptrClusterList == ptrNode); m_ptrClusterList = ptrNext; } else ptrPrev->m_ptrNextCluster = ptrNext; ptrNode->m_ptrNextCluster = 0; ptrNode->m_ptrPrevCluster = 0; } void Clust::AddToClusterList(unsigned uNodeIndex) { assert(uNodeIndex < m_uNodeCount); ClustNode *ptrNode = &m_Nodes[uNodeIndex]; if (0 != m_ptrClusterList) m_ptrClusterList->m_ptrPrevCluster = ptrNode; ptrNode->m_ptrNextCluster = m_ptrClusterList; ptrNode->m_ptrPrevCluster = 0; m_ptrClusterList = ptrNode; } float Clust::ComputeMetric(unsigned uIndex1, unsigned uIndex2) const { switch (m_JoinStyle) { case JOIN_NearestNeighbor: return ComputeMetricNearestNeighbor(uIndex1, uIndex2); case JOIN_NeighborJoining: return ComputeMetricNeighborJoining(uIndex1, 
uIndex2); } Quit("Clust::ComputeMetric"); return 0; } float Clust::ComputeMetricNeighborJoining(unsigned i, unsigned j) const { float ri = Calc_r(i); float rj = Calc_r(j); float dij = GetDist(i, j); float dMetric = dij - (ri + rj); return (float) dMetric; } float Clust::ComputeMetricNearestNeighbor(unsigned i, unsigned j) const { return (float) GetDist(i, j); } float Clust::GetMinMetricBruteForce(unsigned *ptruIndex1, unsigned *ptruIndex2) const { unsigned uMinLeftNodeIndex = uInsane; unsigned uMinRightNodeIndex = uInsane; float dMinMetric = PLUS_INFINITY; for (unsigned uLeftNodeIndex = GetFirstCluster(); uLeftNodeIndex != uInsane; uLeftNodeIndex = GetNextCluster(uLeftNodeIndex)) { for (unsigned uRightNodeIndex = GetNextCluster(uLeftNodeIndex); uRightNodeIndex != uInsane; uRightNodeIndex = GetNextCluster(uRightNodeIndex)) { float dMetric = ComputeMetric(uLeftNodeIndex, uRightNodeIndex); if (dMetric < dMinMetric) { dMinMetric = dMetric; uMinLeftNodeIndex = uLeftNodeIndex; uMinRightNodeIndex = uRightNodeIndex; } } } *ptruIndex1 = uMinLeftNodeIndex; *ptruIndex2 = uMinRightNodeIndex; return dMinMetric; } float Clust::GetMinMetric(unsigned *ptruIndex1, unsigned *ptruIndex2) const { return GetMinMetricBruteForce(ptruIndex1, ptruIndex2); } cluster.cpp0000664000175000017500000002052012360262614011334 0ustar bobbob#include "muscle.h" #include "cluster.h" #include "distfunc.h" static inline float Min(float d1, float d2) { return d1 < d2 ? d1 : d2; } static inline float Max(float d1, float d2) { return d1 > d2 ? 
d1 : d2; } static inline float Mean(float d1, float d2) { return (float) ((d1 + d2)/2.0); } #if _DEBUG void ClusterTree::Validate(unsigned uNodeCount) { unsigned n; ClusterNode *pNode; unsigned uDisjointListCount = 0; for (pNode = m_ptrDisjoints; pNode; pNode = pNode->GetNextDisjoint()) { ClusterNode *pPrev = pNode->GetPrevDisjoint(); ClusterNode *pNext = pNode->GetNextDisjoint(); if (0 != pPrev) { if (pPrev->GetNextDisjoint() != pNode) { Log("Prev->This mismatch, prev=\n"); pPrev->LogMe(); Log("This=\n"); pNode->LogMe(); Quit("ClusterTree::Validate()"); } } else { if (pNode != m_ptrDisjoints) { Log("[%u]->prev = 0 but != m_ptrDisjoints=%d\n", pNode->GetIndex(), m_ptrDisjoints ? m_ptrDisjoints->GetIndex() : 0xffffffff); pNode->LogMe(); Quit("ClusterTree::Validate()"); } } if (0 != pNext) { if (pNext->GetPrevDisjoint() != pNode) { Log("Next->This mismatch, next=\n"); pNext->LogMe(); Log("This=\n"); pNode->LogMe(); Quit("ClusterTree::Validate()"); } } ++uDisjointListCount; if (uDisjointListCount > m_uNodeCount) Quit("Loop in disjoint list"); } unsigned uParentlessNodeCount = 0; for (n = 0; n < uNodeCount; ++n) if (0 == m_Nodes[n].GetParent()) ++uParentlessNodeCount; if (uDisjointListCount != uParentlessNodeCount) Quit("Disjoints = %u Parentless = %u\n", uDisjointListCount, uParentlessNodeCount); } #else // !_DEBUG #define Validate(uNodeCount) // empty #endif void ClusterNode::LogMe() const { unsigned uClusterSize = GetClusterSize(); Log("[%02u] w=%5.3f CW=%5.3f LBW=%5.3f RBW=%5.3f LWT=%5.3f RWT=%5.3f L=%02d R=%02d P=%02d NxDj=%02d PvDj=%02d Sz=%02d {", m_uIndex, m_dWeight, GetClusterWeight(), GetLeftBranchWeight(), GetRightBranchWeight(), GetLeftWeight(), GetRightWeight(), m_ptrLeft ? m_ptrLeft->GetIndex() : 0xffffffff, m_ptrRight ? m_ptrRight->GetIndex() : 0xffffffff, m_ptrParent ? m_ptrParent->GetIndex() : 0xffffffff, m_ptrNextDisjoint ? m_ptrNextDisjoint->GetIndex() : 0xffffffff, m_ptrPrevDisjoint ? 
m_ptrPrevDisjoint->GetIndex() : 0xffffffff, uClusterSize); for (unsigned i = 0; i < uClusterSize; ++i) Log(" %u", GetClusterLeaf(i)->GetIndex()); Log(" }\n"); } // How many leaves in the sub-tree under this node? unsigned ClusterNode::GetClusterSize() const { unsigned uLeafCount = 0; if (0 == m_ptrLeft && 0 == m_ptrRight) return 1; if (0 != m_ptrLeft) uLeafCount += m_ptrLeft->GetClusterSize(); if (0 != m_ptrRight) uLeafCount += m_ptrRight->GetClusterSize(); assert(uLeafCount > 0); return uLeafCount; } double ClusterNode::GetClusterWeight() const { double dWeight = 0.0; if (0 != m_ptrLeft) dWeight += m_ptrLeft->GetClusterWeight(); if (0 != m_ptrRight) dWeight += m_ptrRight->GetClusterWeight(); return dWeight + GetWeight(); } double ClusterNode::GetLeftBranchWeight() const { const ClusterNode *ptrLeft = GetLeft(); if (0 == ptrLeft) return 0.0; return GetWeight() - ptrLeft->GetWeight(); } double ClusterNode::GetRightBranchWeight() const { const ClusterNode *ptrRight = GetRight(); if (0 == ptrRight) return 0.0; return GetWeight() - ptrRight->GetWeight(); } double ClusterNode::GetRightWeight() const { const ClusterNode *ptrRight = GetRight(); if (0 == ptrRight) return 0.0; return ptrRight->GetClusterWeight() + GetWeight(); } double ClusterNode::GetLeftWeight() const { const ClusterNode *ptrLeft = GetLeft(); if (0 == ptrLeft) return 0.0; return ptrLeft->GetClusterWeight() + GetWeight(); } // Return n'th leaf in the sub-tree under this node. 
const ClusterNode *ClusterNode::GetClusterLeaf(unsigned uLeafIndex) const { if (0 != m_ptrLeft) { if (0 == m_ptrRight) return this; unsigned uLeftLeafCount = m_ptrLeft->GetClusterSize(); if (uLeafIndex < uLeftLeafCount) return m_ptrLeft->GetClusterLeaf(uLeafIndex); assert(uLeafIndex >= uLeftLeafCount); return m_ptrRight->GetClusterLeaf(uLeafIndex - uLeftLeafCount); } if (0 == m_ptrRight) return this; return m_ptrRight->GetClusterLeaf(uLeafIndex); } void ClusterTree::DeleteFromDisjoints(ClusterNode *ptrNode) { ClusterNode *ptrPrev = ptrNode->GetPrevDisjoint(); ClusterNode *ptrNext = ptrNode->GetNextDisjoint(); if (0 != ptrPrev) ptrPrev->SetNextDisjoint(ptrNext); else m_ptrDisjoints = ptrNext; if (0 != ptrNext) ptrNext->SetPrevDisjoint(ptrPrev); #if _DEBUG // not algorithmically necessary, but improves clarity // and supports Validate(). ptrNode->SetPrevDisjoint(0); ptrNode->SetNextDisjoint(0); #endif } void ClusterTree::AddToDisjoints(ClusterNode *ptrNode) { ptrNode->SetNextDisjoint(m_ptrDisjoints); ptrNode->SetPrevDisjoint(0); if (0 != m_ptrDisjoints) m_ptrDisjoints->SetPrevDisjoint(ptrNode); m_ptrDisjoints = ptrNode; } ClusterTree::ClusterTree() { m_ptrDisjoints = 0; m_Nodes = 0; m_uNodeCount = 0; } ClusterTree::~ClusterTree() { delete[] m_Nodes; } void ClusterTree::LogMe() const { Log("Disjoints=%d\n", m_ptrDisjoints ? m_ptrDisjoints->GetIndex() : 0xffffffff); for (unsigned i = 0; i < m_uNodeCount; ++i) { m_Nodes[i].LogMe(); } } ClusterNode *ClusterTree::GetRoot() const { return &m_Nodes[m_uNodeCount - 1]; } // This is the UPGMA algorithm as described in Durbin et al. p166. 
void ClusterTree::Create(const DistFunc &Dist) { unsigned i; m_uLeafCount = Dist.GetCount(); m_uNodeCount = 2*m_uLeafCount - 1; delete[] m_Nodes; m_Nodes = new ClusterNode[m_uNodeCount]; for (i = 0; i < m_uNodeCount; ++i) m_Nodes[i].SetIndex(i); for (i = 0; i < m_uLeafCount - 1; ++i) m_Nodes[i].SetNextDisjoint(&m_Nodes[i+1]); for (i = 1; i < m_uLeafCount; ++i) m_Nodes[i].SetPrevDisjoint(&m_Nodes[i-1]); m_ptrDisjoints = &m_Nodes[0]; // Log("Initial state\n"); // LogMe(); // Log("\n"); DistFunc ClusterDist; ClusterDist.SetCount(m_uNodeCount); double dMaxDist = 0.0; for (i = 0; i < m_uLeafCount; ++i) for (unsigned j = 0; j < m_uLeafCount; ++j) { float dDist = Dist.GetDist(i, j); ClusterDist.SetDist(i, j, dDist); } Validate(m_uLeafCount); // Iteration. N-1 joins needed to create a binary tree from N leaves. for (unsigned uJoinIndex = m_uLeafCount; uJoinIndex < m_uNodeCount; ++uJoinIndex) { // Find closest pair of clusters unsigned uIndexClosest1; unsigned uIndexClosest2; bool bFound = false; double dDistClosest = 9e99; for (ClusterNode *ptrNode1 = m_ptrDisjoints; ptrNode1; ptrNode1 = ptrNode1->GetNextDisjoint()) { for (ClusterNode *ptrNode2 = ptrNode1->GetNextDisjoint(); ptrNode2; ptrNode2 = ptrNode2->GetNextDisjoint()) { unsigned i1 = ptrNode1->GetIndex(); unsigned i2 = ptrNode2->GetIndex(); double dDist = ClusterDist.GetDist(i1, i2); if (dDist < dDistClosest) { bFound = true; dDistClosest = dDist; uIndexClosest1 = i1; uIndexClosest2 = i2; } } } assert(bFound); ClusterNode &Join = m_Nodes[uJoinIndex]; ClusterNode &Child1 = m_Nodes[uIndexClosest1]; ClusterNode &Child2 = m_Nodes[uIndexClosest2]; Join.SetLeft(&Child1); Join.SetRight(&Child2); Join.SetWeight(dDistClosest); Child1.SetParent(&Join); Child2.SetParent(&Join); DeleteFromDisjoints(&Child1); DeleteFromDisjoints(&Child2); AddToDisjoints(&Join); // Log("After join %d %d\n", uIndexClosest1, uIndexClosest2); // LogMe(); // Calculate distance of every remaining disjoint cluster to the // new cluster created by the 
join for (ClusterNode *ptrNode = m_ptrDisjoints; ptrNode; ptrNode = ptrNode->GetNextDisjoint()) { unsigned uNodeIndex = ptrNode->GetIndex(); float dDist1 = ClusterDist.GetDist(uNodeIndex, uIndexClosest1); float dDist2 = ClusterDist.GetDist(uNodeIndex, uIndexClosest2); float dDist = Min(dDist1, dDist2); ClusterDist.SetDist(uJoinIndex, uNodeIndex, dDist); } Validate(uJoinIndex+1); } GetRoot()->GetClusterWeight(); // LogMe(); } clwwt.cpp0000664000175000017500000001150712360262613011017 0ustar bobbob#include "muscle.h" #include "tree.h" #include "msa.h" /*** Compute weights by the CLUSTALW method. Thompson, Higgins and Gibson (1994), CABIOS (10) 19-29; see also CLUSTALW paper. Weights are computed from the edge lengths of a rooted tree. Define the strength of an edge to be its length divided by the number of leaves under that edge. The weight of a sequence is then the sum of edge strengths on the path from the root to the leaf. Example. 0.2 -----A 0.1 -x ------- B 0.7 --------y ----------- C 0.3 ----------z 0.4 -------------- D 0.8 Edge Length Leaves Strength ---- ----- ------ -------- xy 0.3 3 0.1 xA 0.2 1 0.2 yz 0.4 2 0.2 yB 0.1 1 0.1 zC 0.7 1 0.7 zD 0.8 1 0.8 Leaf Path Strengths Weight ---- ---- --------- ------ A xA 0.2 0.2 B xy-yB 0.1 + 0.1 0.2 C xy-yz-zC 0.1 + 0.2 + 0.7 1.0 D xy-yz-zD 0.1 + 0.2 + 0.8 1.1 ***/ #define TRACE 0 static unsigned CountLeaves(const Tree &tree, unsigned uNodeIndex, unsigned LeavesUnderNode[]) { if (tree.IsLeaf(uNodeIndex)) { LeavesUnderNode[uNodeIndex] = 1; return 1; } const unsigned uLeft = tree.GetLeft(uNodeIndex); const unsigned uRight = tree.GetRight(uNodeIndex); const unsigned uRightCount = CountLeaves(tree, uRight, LeavesUnderNode); const unsigned uLeftCount = CountLeaves(tree, uLeft, LeavesUnderNode); const unsigned uCount = uRightCount + uLeftCount; LeavesUnderNode[uNodeIndex] = uCount; return uCount; } void CalcClustalWWeights(const Tree &tree, WEIGHT Weights[]) { #if TRACE Log("CalcClustalWWeights\n"); tree.LogMe(); #endif 
const unsigned uLeafCount = tree.GetLeafCount(); if (0 == uLeafCount) return; else if (1 == uLeafCount) { Weights[0] = (WEIGHT) 1.0; return; } else if (2 == uLeafCount) { Weights[0] = (WEIGHT) 0.5; Weights[1] = (WEIGHT) 0.5; return; } if (!tree.IsRooted()) Quit("CalcClustalWWeights requires rooted tree"); const unsigned uNodeCount = tree.GetNodeCount(); unsigned *LeavesUnderNode = new unsigned[uNodeCount]; memset(LeavesUnderNode, 0, uNodeCount*sizeof(unsigned)); const unsigned uRootNodeIndex = tree.GetRootNodeIndex(); unsigned uLeavesUnderRoot = CountLeaves(tree, uRootNodeIndex, LeavesUnderNode); if (uLeavesUnderRoot != uLeafCount) Quit("WeightsFromTreee: Internal error, root count %u %u", uLeavesUnderRoot, uLeafCount); #if TRACE Log("Node Leaves Length Strength\n"); Log("---- ------ -------- --------\n"); // 1234 123456 12345678 12345678 #endif double *Strengths = new double[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (tree.IsRoot(uNodeIndex)) { Strengths[uNodeIndex] = 0.0; continue; } const unsigned uParent = tree.GetParent(uNodeIndex); const double dLength = tree.GetEdgeLength(uNodeIndex, uParent); const unsigned uLeaves = LeavesUnderNode[uNodeIndex]; const double dStrength = dLength / (double) uLeaves; Strengths[uNodeIndex] = dStrength; #if TRACE Log("%4u %6u %8g %8g\n", uNodeIndex, uLeaves, dLength, dStrength); #endif } #if TRACE Log("\n"); Log(" Seq Path..Weight\n"); Log("-------------------- ------------\n"); #endif for (unsigned n = 0; n < uLeafCount; ++n) { const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n); #if TRACE Log("%20.20s %4u ", tree.GetLeafName(uLeafNodeIndex), uLeafNodeIndex); #endif if (!tree.IsLeaf(uLeafNodeIndex)) Quit("CalcClustalWWeights: leaf"); double dWeight = 0; unsigned uNode = uLeafNodeIndex; while (!tree.IsRoot(uNode)) { dWeight += Strengths[uNode]; uNode = tree.GetParent(uNode); #if TRACE Log("->%u(%g)", uNode, Strengths[uNode]); #endif } if (dWeight < 0.0001) { #if TRACE 
Log("zero->one"); #endif dWeight = 1.0; } Weights[n] = (WEIGHT) dWeight; #if TRACE Log(" = %g\n", dWeight); #endif } delete[] Strengths; delete[] LeavesUnderNode; Normalize(Weights, uLeafCount); } void MSA::SetClustalWWeights(const Tree &tree) { const unsigned uSeqCount = GetSeqCount(); const unsigned uLeafCount = tree.GetLeafCount(); WEIGHT *Weights = new WEIGHT[uSeqCount]; CalcClustalWWeights(tree, Weights); for (unsigned n = 0; n < uLeafCount; ++n) { const WEIGHT w = Weights[n]; const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n); const unsigned uId = tree.GetLeafId(uLeafNodeIndex); const unsigned uSeqIndex = GetSeqIndex(uId); #if DEBUG if (GetSeqName(uSeqIndex) != tree.GetLeafName(uLeafNodeIndex)) Quit("MSA::SetClustalWWeights: names don't match"); #endif SetSeqWeight(uSeqIndex, w); } NormalizeWeights((WEIGHT) 1.0); delete[] Weights; } color.cpp0000664000175000017500000001331212360262614010772 0ustar bobbob#include "muscle.h" #include "msa.h" static int Blosum62[23][23] = { // A B C D E F G H I K L M N P Q R S T V W X Y Z +4, -2, +0, -2, -1, -2, +0, -2, -1, -1, -1, -1, -2, -1, -1, -1, +1, +0, +0, -3, -1, -2, -1, // A -2, +6, -3, +6, +2, -3, -1, -1, -3, -1, -4, -3, +1, -1, +0, -2, +0, -1, -3, -4, -1, -3, +2, // B +0, -3, +9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -1, -2, -4, // C -2, +6, -3, +6, +2, -3, -1, -1, -3, -1, -4, -3, +1, -1, +0, -2, +0, -1, -3, -4, -1, -3, +2, // D -1, +2, -4, +2, +5, -3, -2, +0, -3, +1, -3, -2, +0, -1, +2, +0, +0, -1, -2, -3, -1, -2, +5, // E -2, -3, -2, -3, -3, +6, -3, -1, +0, -3, +0, +0, -3, -4, -3, -3, -2, -2, -1, +1, -1, +3, -3, // F +0, -1, -3, -1, -2, -3, +6, -2, -4, -2, -4, -3, +0, -2, -2, -2, +0, -2, -3, -2, -1, -3, -2, // G -2, -1, -3, -1, +0, -1, -2, +8, -3, -1, -3, -2, +1, -2, +0, +0, -1, -2, -3, -2, -1, +2, +0, // H -1, -3, -1, -3, -3, +0, -4, -3, +4, -3, +2, +1, -3, -3, -3, -3, -2, -1, +3, -3, -1, -1, -3, // I -1, -1, -3, -1, +1, -3, -2, -1, -3, +5, -2, -1, +0, -1, +1, +2, +0, -1, -2, 
-3, -1, -2, +1, // K -1, -4, -1, -4, -3, +0, -4, -3, +2, -2, +4, +2, -3, -3, -2, -2, -2, -1, +1, -2, -1, -1, -3, // L -1, -3, -1, -3, -2, +0, -3, -2, +1, -1, +2, +5, -2, -2, +0, -1, -1, -1, +1, -1, -1, -1, -2, // M -2, +1, -3, +1, +0, -3, +0, +1, -3, +0, -3, -2, +6, -2, +0, +0, +1, +0, -3, -4, -1, -2, +0, // N -1, -1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, +7, -1, -2, -1, -1, -2, -4, -1, -3, -1, // P -1, +0, -3, +0, +2, -3, -2, +0, -3, +1, -2, +0, +0, -1, +5, +1, +0, -1, -2, -2, -1, -1, +2, // Q -1, -2, -3, -2, +0, -3, -2, +0, -3, +2, -2, -1, +0, -2, +1, +5, -1, -1, -3, -3, -1, -2, +0, // R +1, +0, -1, +0, +0, -2, +0, -1, -2, +0, -2, -1, +1, -1, +0, -1, +4, +1, -2, -3, -1, -2, +0, // S +0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, +0, -1, -1, -1, +1, +5, +0, -2, -1, -2, -1, // T +0, -3, -1, -3, -2, -1, -3, -3, +3, -2, +1, +1, -3, -2, -2, -3, -2, +0, +4, -3, -1, -1, -2, // V -3, -4, -2, -4, -3, +1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3,+11, -1, +2, -3, // W -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // X -2, -3, -2, -3, -2, +3, -3, +2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, +2, -1, +7, -2, // Y -1, +2, -4, +2, +5, -3, -2, +0, -3, +1, -3, -2, +0, -1, +2, +0, +0, -1, -2, -3, -1, -2, +5, // Z }; static int toi_tab[26] = { 0, // A 1, // B 2, // C 3, // D 4, // E 5, // F 6, // G 7, // H 8, // I -1, // J 9, // K 10, // L 11, // M 12, // N -1, // O 13, // P 14, // Q 15, // R 16, // S 17, // T 17, // U 18, // V 19, // W 20, // X 21, // Y 22, // Z }; static int toi(char c) { c = toupper(c); return toi_tab[c - 'A']; } static int BlosumScore(char c1, char c2) { int i1 = toi(c1); int i2 = toi(c2); return Blosum62[i1][i2]; } /*** Consider a column with 5 As and 3 Bs. There are: 5x4 pairs of As. 3x2 pairs of Bs. 
5x3x2 AB pairs 8x7 = 5x4 + 3x2 + 5x3x2 pairs of letters ***/ static double BlosumScoreCol(const MSA &a, unsigned uColIndex) { int iCounts[23]; memset(iCounts, 0, sizeof(iCounts)); const unsigned uSeqCount = a.GetSeqCount(); unsigned uCharCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { char c = a.GetChar(uSeqIndex, uColIndex); if (IsGapChar(c)) continue; int iChar = toi(c); ++iCounts[iChar]; ++uCharCount; } if (uCharCount < 2) return -9; int iTotalScore = 0; for (int i1 = 0; i1 < 23; ++i1) { int iCounts1 = iCounts[i1]; iTotalScore += iCounts1*(iCounts1 - 1)*Blosum62[i1][i1]; for (int i2 = i1 + 1; i2 < 23; ++i2) iTotalScore += iCounts[i2]*iCounts1*2*Blosum62[i1][i2]; } int iPairCount = uCharCount*(uCharCount - 1); return (double) iTotalScore / (double) iPairCount; } /*** Consider a column with 5 As and 3 Bs. A residue of type Q scores: 5xAQ + 3xBQ ***/ static void AssignColorsCol(const MSA &a, unsigned uColIndex, int **Colors) { int iCounts[23]; memset(iCounts, 0, sizeof(iCounts)); const unsigned uSeqCount = a.GetSeqCount(); unsigned uCharCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { char c = a.GetChar(uSeqIndex, uColIndex); if (IsGapChar(c)) continue; int iChar = toi(c); ++iCounts[iChar]; ++uCharCount; } int iMostConservedType = -1; int iMostConservedCount = -1; for (unsigned i = 0; i < 23; ++i) { if (iCounts[i] > iMostConservedCount) { iMostConservedType = i; iMostConservedCount = iCounts[i]; } } double dColScore = BlosumScoreCol(a, uColIndex); int c; if (dColScore >= 3.0) c = 3; //else if (dColScore >= 1.0) // c = 2; else if (dColScore >= 0.2) c = 1; else c = 0; int Color[23]; for (unsigned uLetter = 0; uLetter < 23; ++uLetter) { double dScore = Blosum62[uLetter][iMostConservedType]; if (dScore >= dColScore) Color[uLetter] = c; else Color[uLetter] = 0; } for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { char c = a.GetChar(uSeqIndex, uColIndex); if (IsGapChar(c)) { 
Colors[uSeqIndex][uColIndex] = 0; continue; } int iLetter = toi(c); if (iLetter >= 0 && iLetter < 23) Colors[uSeqIndex][uColIndex] = Color[iLetter]; else Colors[uSeqIndex][uColIndex] = 0; } } void AssignColors(const MSA &a, int **Colors) { const unsigned uColCount = a.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) AssignColorsCol(a, uColIndex, Colors); } cons.cpp0000664000175000017500000000611212360262614010616 0ustar bobbob/*** Conservation value for a column in an MSA is defined as the number of times the most common letter appears divided by the number of sequences. ***/ #include "muscle.h" #include "msa.h" #include double MSA::GetAvgCons() const { assert(GetSeqCount() > 0); double dSum = 0; unsigned uNonGapColCount = 0; for (unsigned uColIndex = 0; uColIndex < GetColCount(); ++uColIndex) { if (!IsGapColumn(uColIndex)) { dSum += GetCons(uColIndex); ++uNonGapColCount; } } assert(uNonGapColCount > 0); double dAvg = dSum / uNonGapColCount; assert(dAvg > 0 && dAvg <= 1); return dAvg; } double MSA::GetCons(unsigned uColIndex) const { unsigned Counts[MAX_ALPHA]; for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) Counts[uLetter] = 0; unsigned uMaxCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { if (IsGap(uSeqIndex, uColIndex)) continue; char c = GetChar(uSeqIndex, uColIndex); c = toupper(c); if ('X' == c || 'B' == c || 'Z' == c) continue; unsigned uLetter = GetLetter(uSeqIndex, uColIndex); unsigned uCount = Counts[uLetter] + 1; if (uCount > uMaxCount) uMaxCount = uCount; Counts[uLetter] = uCount; } // Cons is undefined for all-gap column if (0 == uMaxCount) { // assert(false); return 1; } double dCons = (double) uMaxCount / (double) GetSeqCount(); assert(dCons > 0 && dCons <= 1); return dCons; } // Perecent identity of a pair of sequences. // Positions with one or both gapped are ignored. 
double MSA::GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const { const unsigned uColCount = GetColCount(); unsigned uPosCount = 0; unsigned uSameCount = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c1 = GetChar(uSeqIndex1, uColIndex); const char c2 = GetChar(uSeqIndex2, uColIndex); if (IsGapChar(c1) || IsGapChar(c2)) continue; if (c1 == c2) ++uSameCount; ++uPosCount; } if (0 == uPosCount) return 0; return (double) uSameCount / (double) uPosCount; } // Perecent group identity of a pair of sequences. // Positions with one or both gapped are ignored. double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const { extern unsigned ResidueGroup[]; const unsigned uColCount = GetColCount(); unsigned uPosCount = 0; unsigned uSameCount = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { if (IsGap(uSeqIndex1, uColIndex)) continue; if (IsGap(uSeqIndex2, uColIndex)) continue; if (IsWildcard(uSeqIndex1, uColIndex)) continue; if (IsWildcard(uSeqIndex2, uColIndex)) continue; const unsigned uLetter1 = GetLetter(uSeqIndex1, uColIndex); const unsigned uLetter2 = GetLetter(uSeqIndex2, uColIndex); const unsigned uGroup1 = ResidueGroup[uLetter1]; const unsigned uGroup2 = ResidueGroup[uLetter2]; if (uGroup1 == uGroup2) ++uSameCount; ++uPosCount; } if (0 == uPosCount) return 0; return (double) uSameCount / (double) uPosCount; } diaglist.cpp0000664000175000017500000002202112360262613011450 0ustar bobbob#include "muscle.h" #include "diaglist.h" #include "pwpath.h" #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? 
(x) : (y)) void DiagList::Add(const Diag &d) { if (m_uCount == MAX_DIAGS) Quit("DiagList::Add, overflow %u", m_uCount); m_Diags[m_uCount] = d; ++m_uCount; } void DiagList::Add(unsigned uStartPosA, unsigned uStartPosB, unsigned uLength) { Diag d; d.m_uStartPosA = uStartPosA; d.m_uStartPosB = uStartPosB; d.m_uLength = uLength; Add(d); } const Diag &DiagList::Get(unsigned uIndex) const { if (uIndex >= m_uCount) Quit("DiagList::Get(%u), count=%u", uIndex, m_uCount); return m_Diags[uIndex]; } void DiagList::LogMe() const { Log("DiagList::LogMe, count=%u\n", m_uCount); Log(" n StartA StartB Length\n"); Log("--- ------ ------ ------\n"); for (unsigned n = 0; n < m_uCount; ++n) { const Diag &d = m_Diags[n]; Log("%3u %6u %6u %6u\n", n, d.m_uStartPosA, d.m_uStartPosB, d.m_uLength); } } void DiagList::FromPath(const PWPath &Path) { Clear(); const unsigned uEdgeCount = Path.GetEdgeCount(); unsigned uLength = 0; unsigned uStartPosA; unsigned uStartPosB; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); // Typical cases if (Edge.cType == 'M') { if (0 == uLength) { uStartPosA = Edge.uPrefixLengthA - 1; uStartPosB = Edge.uPrefixLengthB - 1; } ++uLength; } else { if (uLength >= g_uMinDiagLength) Add(uStartPosA, uStartPosB, uLength); uLength = 0; } } // Special case for last edge if (uLength >= g_uMinDiagLength) Add(uStartPosA, uStartPosB, uLength); } bool DiagList::NonZeroIntersection(const Diag &d) const { for (unsigned n = 0; n < m_uCount; ++n) { const Diag &d2 = m_Diags[n]; if (DiagOverlap(d, d2) > 0) return true; } return false; } // DialogOverlap returns the length of the overlapping // section of the two diagonals along the diagonals // themselves; in other words, the length of // the intersection of the two sets of cells in // the matrix. unsigned DiagOverlap(const Diag &d1, const Diag &d2) { // Determine where the diagonals intersect the A // axis (extending them if required). 
If they // intersect at different points, they do not // overlap. Coordinates on a diagonal are // given by B = A + c where c is the value of // A at the intersection with the A axis. // Hence, c = B - A for any point on the diagonal. int c1 = (int) d1.m_uStartPosB - (int) d1.m_uStartPosA; int c2 = (int) d2.m_uStartPosB - (int) d2.m_uStartPosA; if (c1 != c2) return 0; assert(DiagOverlapA(d1, d2) == DiagOverlapB(d1, d2)); return DiagOverlapA(d1, d2); } // DialogOverlapA returns the length of the overlapping // section of the projection of the two diagonals onto // the A axis. unsigned DiagOverlapA(const Diag &d1, const Diag &d2) { unsigned uMaxStart = MAX(d1.m_uStartPosA, d2.m_uStartPosA); unsigned uMinEnd = MIN(d1.m_uStartPosA + d1.m_uLength - 1, d2.m_uStartPosA + d2.m_uLength - 1); int iLength = (int) uMinEnd - (int) uMaxStart + 1; if (iLength < 0) return 0; return (unsigned) iLength; } // DialogOverlapB returns the length of the overlapping // section of the projection of the two diagonals onto // the B axis. unsigned DiagOverlapB(const Diag &d1, const Diag &d2) { unsigned uMaxStart = MAX(d1.m_uStartPosB, d2.m_uStartPosB); unsigned uMinEnd = MIN(d1.m_uStartPosB + d1.m_uLength - 1, d2.m_uStartPosB + d2.m_uLength - 1); int iLength = (int) uMinEnd - (int) uMaxStart + 1; if (iLength < 0) return 0; return (unsigned) iLength; } // Returns true if the two diagonals can be on the // same path through the DP matrix. If DiagCompatible // returns false, they cannot be in the same path // and hence "contradict" each other. bool DiagCompatible(const Diag &d1, const Diag &d2) { if (DiagOverlap(d1, d2) > 0) return true; return 0 == DiagOverlapA(d1, d2) && 0 == DiagOverlapB(d1, d2); } // Returns the length of the "break" between two diagonals. 
unsigned DiagBreak(const Diag &d1, const Diag &d2) { int c1 = (int) d1.m_uStartPosB - (int) d1.m_uStartPosA; int c2 = (int) d2.m_uStartPosB - (int) d2.m_uStartPosA; if (c1 != c2) return 0; int iMaxStart = MAX(d1.m_uStartPosA, d2.m_uStartPosA); int iMinEnd = MIN(d1.m_uStartPosA + d1.m_uLength - 1, d2.m_uStartPosA + d1.m_uLength - 1); int iBreak = iMaxStart - iMinEnd - 1; if (iBreak < 0) return 0; return (unsigned) iBreak; } // Merge diagonals that are continuations of each other with // int breaks of up to length g_uMaxDiagBreak. // In a sorted list of diagonals, we only have to check // consecutive entries. void MergeDiags(DiagList &DL) { return; #if DEBUG if (!DL.IsSorted()) Quit("MergeDiags: !IsSorted"); #endif // TODO: Fix this! // Breaks must be with no offset (no gaps) const unsigned uCount = DL.GetCount(); if (uCount <= 1) return; DiagList NewList; Diag MergedDiag; const Diag *ptrPrev = &DL.Get(0); for (unsigned i = 1; i < uCount; ++i) { const Diag *ptrDiag = &DL.Get(i); unsigned uBreakLength = DiagBreak(*ptrPrev, *ptrDiag); if (uBreakLength <= g_uMaxDiagBreak) { MergedDiag.m_uStartPosA = ptrPrev->m_uStartPosA; MergedDiag.m_uStartPosB = ptrPrev->m_uStartPosB; MergedDiag.m_uLength = ptrPrev->m_uLength + ptrDiag->m_uLength + uBreakLength; ptrPrev = &MergedDiag; } else { NewList.Add(*ptrPrev); ptrPrev = ptrDiag; } } NewList.Add(*ptrPrev); DL.Copy(NewList); } void DiagList::DeleteIncompatible() { assert(IsSorted()); if (m_uCount < 2) return; bool *bFlagForDeletion = new bool[m_uCount]; for (unsigned i = 0; i < m_uCount; ++i) bFlagForDeletion[i] = false; for (unsigned i = 0; i < m_uCount; ++i) { const Diag &di = m_Diags[i]; for (unsigned j = i + 1; j < m_uCount; ++j) { const Diag &dj = m_Diags[j]; // Verify sorted correctly assert(di.m_uStartPosA <= dj.m_uStartPosA); // If two diagonals are incompatible and // one is is much longer than the other, // keep the longer one. 
if (!DiagCompatible(di, dj)) { if (di.m_uLength > dj.m_uLength*4) bFlagForDeletion[j] = true; else if (dj.m_uLength > di.m_uLength*4) bFlagForDeletion[i] = true; else { bFlagForDeletion[i] = true; bFlagForDeletion[j] = true; } } } } for (unsigned i = 0; i < m_uCount; ++i) { const Diag &di = m_Diags[i]; if (bFlagForDeletion[i]) continue; for (unsigned j = i + 1; j < m_uCount; ++j) { const Diag &dj = m_Diags[j]; if (bFlagForDeletion[j]) continue; // Verify sorted correctly assert(di.m_uStartPosA <= dj.m_uStartPosA); // If sort order in B different from sorted order in A, // either diags are incompatible or we detected a repeat // or permutation. if (di.m_uStartPosB >= dj.m_uStartPosB || !DiagCompatible(di, dj)) { bFlagForDeletion[i] = true; bFlagForDeletion[j] = true; } } } unsigned uNewCount = 0; Diag *NewDiags = new Diag[m_uCount]; for (unsigned i = 0; i < m_uCount; ++i) { if (bFlagForDeletion[i]) continue; const Diag &d = m_Diags[i]; NewDiags[uNewCount] = d; ++uNewCount; } memcpy(m_Diags, NewDiags, uNewCount*sizeof(Diag)); m_uCount = uNewCount; delete[] NewDiags; } void DiagList::Copy(const DiagList &DL) { Clear(); unsigned uCount = DL.GetCount(); for (unsigned i = 0; i < uCount; ++i) Add(DL.Get(i)); } // Check if sorted in increasing order of m_uStartPosA bool DiagList::IsSorted() const { return true; unsigned uCount = GetCount(); for (unsigned i = 1; i < uCount; ++i) if (m_Diags[i-1].m_uStartPosA > m_Diags[i].m_uStartPosA) return false; return true; } // Sort in increasing order of m_uStartPosA // Dumb bubble sort, but don't care about speed // because don't get long lists. 
void DiagList::Sort() { if (m_uCount < 2) return; bool bContinue = true; while (bContinue) { bContinue = false; for (unsigned i = 0; i < m_uCount - 1; ++i) { if (m_Diags[i].m_uStartPosA > m_Diags[i+1].m_uStartPosA) { Diag Tmp = m_Diags[i]; m_Diags[i] = m_Diags[i+1]; m_Diags[i+1] = Tmp; bContinue = true; } } } } //void TestDiag() // { // Diag d1; // Diag d2; // Diag d3; // // d1.m_uStartPosA = 0; // d1.m_uStartPosB = 1; // d1.m_uLength = 32; // // d2.m_uStartPosA = 55; // d2.m_uStartPosB = 70; // d2.m_uLength = 36; // // d3.m_uStartPosA = 102; // d3.m_uStartPosB = 122; // d3.m_uLength = 50; // // DiagList DL; // DL.Add(d1); // DL.Add(d2); // DL.Add(d3); // // Log("Before DeleteIncompatible:\n"); // DL.LogMe(); // DL.DeleteIncompatible(); // // Log("After DeleteIncompatible:\n"); // DL.LogMe(); // // MergeDiags(DL); // Log("After Merge:\n"); // DL.LogMe(); // // DPRegionList RL; // DiagListToDPRegionList(DL, RL, 200, 200); // RL.LogMe(); // } diffobjscore.cpp0000664000175000017500000001055512360262614012321 0ustar bobbob#include "muscle.h" #include "msa.h" #include "objscore.h" #include "profile.h" #define TRACE 0 #define COMPARE_3_52 0 #define BRUTE_LETTERS 0 static SCORE ScoreColLetters(const MSA &msa, unsigned uColIndex) { SCOREMATRIX &Mx = *g_ptrScoreMatrix; const unsigned uSeqCount = msa.GetSeqCount(); #if BRUTE_LETTERS SCORE BruteScore = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { unsigned uLetter1 = msa.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter1 >= g_AlphaSize) continue; WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { unsigned uLetter2 = msa.GetLetterEx(uSeqIndex2, uColIndex); if (uLetter2 >= g_AlphaSize) continue; WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); BruteScore += w1*w2*Mx[uLetter1][uLetter2]; } } #endif double N = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { WEIGHT w = msa.GetSeqWeight(uSeqIndex1); N += w; } if (N 
<= 0) return 0; FCOUNT Freqs[20]; memset(Freqs, 0, sizeof(Freqs)); SCORE Score = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { unsigned uLetter = msa.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter >= g_AlphaSize) continue; WEIGHT w = msa.GetSeqWeight(uSeqIndex1); Freqs[uLetter] += w; Score -= w*w*Mx[uLetter][uLetter]; } for (unsigned uLetter1 = 0; uLetter1 < g_AlphaSize; ++uLetter1) { const FCOUNT f1 = Freqs[uLetter1]; Score += f1*f1*Mx[uLetter1][uLetter1]; for (unsigned uLetter2 = uLetter1 + 1; uLetter2 < g_AlphaSize; ++uLetter2) { const FCOUNT f2 = Freqs[uLetter2]; Score += 2*f1*f2*Mx[uLetter1][uLetter2]; } } Score /= 2; #if BRUTE_LETTERS assert(BTEq(BruteScore, Score)); #endif return Score; } static SCORE ScoreLetters(const MSA &msa, const unsigned Edges[], unsigned uEdgeCount) { const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); // Letters SCORE Score = 0; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const unsigned uColIndex = Edges[uEdgeIndex]; assert(uColIndex < uColCount); Score += ScoreColLetters(msa, uColIndex); } return Score; } void GetLetterScores(const MSA &msa, SCORE Scores[]) { const unsigned uColCount = msa.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) Scores[uColIndex] = ScoreColLetters(msa, uColIndex); } SCORE DiffObjScore( const MSA &msa1, const PWPath &Path1, const unsigned Edges1[], unsigned uEdgeCount1, const MSA &msa2, const PWPath &Path2, const unsigned Edges2[], unsigned uEdgeCount2) { #if TRACE { Log("============DiffObjScore===========\n"); Log("msa1:\n"); msa1.LogMe(); Log("\n"); Log("Cols1: "); for (unsigned i = 0; i < uEdgeCount1; ++i) Log(" %u", Edges1[i]); Log("\n\n"); Log("msa2:\n"); msa2.LogMe(); Log("Cols2: "); for (unsigned i = 0; i < uEdgeCount2; ++i) Log(" %u", Edges2[i]); Log("\n\n"); } #endif #if COMPARE_3_52 extern SCORE g_SPScoreLetters; extern SCORE g_SPScoreGaps; SCORE SP1 = ObjScoreSP(msa1); 
SCORE SPLetters1 = g_SPScoreLetters; SCORE SPGaps1 = g_SPScoreGaps; SCORE SP2 = ObjScoreSP(msa2); SCORE SPLetters2 = g_SPScoreLetters; SCORE SPGaps2 = g_SPScoreGaps; SCORE SPDiffLetters = SPLetters2 - SPLetters1; SCORE SPDiffGaps = SPGaps2 - SPGaps1; SCORE SPDiff = SPDiffLetters + SPDiffGaps; #endif SCORE Letters1 = ScoreLetters(msa1, Edges1, uEdgeCount1); SCORE Letters2 = ScoreLetters(msa2, Edges2, uEdgeCount2); SCORE Gaps1 = ScoreGaps(msa1, Edges1, uEdgeCount1); SCORE Gaps2 = ScoreGaps(msa2, Edges2, uEdgeCount2); SCORE DiffLetters = Letters2 - Letters1; SCORE DiffGaps = Gaps2 - Gaps1; SCORE Diff = DiffLetters + DiffGaps; #if COMPARE_3_52 Log("ObjScoreSP Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n", SPLetters1, SPLetters2, SPDiffLetters); Log("DiffObjScore Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n", Letters1, Letters2, DiffLetters); Log("ObjScoreSP Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n", SPGaps1, SPGaps2, SPDiffGaps); Log("DiffObjScore Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n", Gaps1, Gaps2, DiffGaps); Log("SP diff=%.4g DiffObjScore Diff=%.4g\n", SPDiff, Diff); #endif return Diff; } diffpaths.cpp0000664000175000017500000000606312360262614011631 0ustar bobbob#include "muscle.h" #include "pwpath.h" #define TRACE 0 void DiffPaths(const PWPath &p1, const PWPath &p2, unsigned Edges1[], unsigned *ptruDiffCount1, unsigned Edges2[], unsigned *ptruDiffCount2) { #if TRACE Log("DiffPaths\n"); Log("p1="); p1.LogMe(); Log("p2="); p2.LogMe(); #endif const unsigned uEdgeCount1 = p1.GetEdgeCount(); const unsigned uEdgeCount2 = p2.GetEdgeCount(); unsigned uDiffCount1 = 0; unsigned uDiffCount2 = 0; unsigned uEdgeIndex1 = 0; unsigned uEdgeIndex2 = 0; const PWEdge *Edge1 = &p1.GetEdge(uEdgeIndex1); const PWEdge *Edge2 = &p2.GetEdge(uEdgeIndex2); for (;;) { unsigned uEdgeIndexTop1 = uEdgeIndex1; unsigned uEdgeIndexTop2 = uEdgeIndex2; Edge1 = &p1.GetEdge(uEdgeIndex1); Edge2 = &p2.GetEdge(uEdgeIndex2); #if TRACE Log("e1[%u] PLA%u PLB%u %c, e2[%u] PLA%u PLB %u %c DC1=%u DC2=%u\n", 
uEdgeIndex1, Edge1->uPrefixLengthA, Edge1->uPrefixLengthB, Edge1->cType, uEdgeIndex2, Edge2->uPrefixLengthA, Edge2->uPrefixLengthB, Edge2->cType, uDiffCount1, uDiffCount2); #endif if (Edge1->uPrefixLengthA == Edge2->uPrefixLengthA && Edge1->uPrefixLengthB == Edge2->uPrefixLengthB) { if (!Edge1->Equal(*Edge2)) { Edges1[uDiffCount1++] = uEdgeIndex1; Edges2[uDiffCount2++] = uEdgeIndex2; } ++uEdgeIndex1; ++uEdgeIndex2; } else if (Edge2->uPrefixLengthA < Edge1->uPrefixLengthA || Edge2->uPrefixLengthB < Edge1->uPrefixLengthB) Edges2[uDiffCount2++] = uEdgeIndex2++; else if (Edge1->uPrefixLengthA < Edge2->uPrefixLengthA || Edge1->uPrefixLengthB < Edge2->uPrefixLengthB) Edges1[uDiffCount1++] = uEdgeIndex1++; if (uEdgeCount1 == uEdgeIndex1) { while (uEdgeIndex2 < uEdgeCount2) Edges2[uDiffCount2++] = uEdgeIndex2++; goto Done; } if (uEdgeCount2 == uEdgeIndex2) { while (uEdgeIndex1 < uEdgeCount1) Edges1[uDiffCount1++] = uEdgeIndex1++; goto Done; } if (uEdgeIndex1 == uEdgeIndexTop1 && uEdgeIndex2 == uEdgeIndexTop2) Quit("DiffPaths stuck"); } Done:; #if TRACE Log("DiffCount1=%u (%u %u)\n", uDiffCount1, uEdgeCount1, uEdgeCount2); Log("Diffs1="); for (unsigned i = 0; i < uDiffCount1; ++i) { const PWEdge e = p1.GetEdge(Edges1[i]); Log(" %u=%c%u.%u", Edges1[i], e.cType, e.uPrefixLengthA, e.uPrefixLengthB); } Log("\n"); Log("DiffCount2=%u\n", uDiffCount2); Log("Diffs2="); for (unsigned i = 0; i < uDiffCount2; ++i) { const PWEdge e = p2.GetEdge(Edges2[i]); Log(" %u=%c%u.%u", Edges2[i], e.cType, e.uPrefixLengthA, e.uPrefixLengthB); } Log("\n"); #endif *ptruDiffCount1 = uDiffCount1; *ptruDiffCount2 = uDiffCount2; } void TestDiffPaths() { PWPath p1; PWPath p2; p1.AppendEdge('M', 1, 1); p1.AppendEdge('M', 2, 2); p1.AppendEdge('M', 3, 3); p2.AppendEdge('M', 1, 1); p2.AppendEdge('D', 2, 1); p2.AppendEdge('I', 2, 2); p2.AppendEdge('M', 3, 3); unsigned Edges1[64]; unsigned Edges2[64]; unsigned uDiffCount1; unsigned uDiffCount2; DiffPaths(p1, p2, Edges1, &uDiffCount1, Edges2, &uDiffCount2); } 
difftrees.cpp0000664000175000017500000002560612360262613011637 0ustar bobbob
#include "muscle.h"
#include "tree.h"

#define TRACE 0

/***
Algorithm to compare two trees, X and Y.

A node x in X and node y in Y are defined to be
similar iff the set of leaves in the subtree under
x is identical to the set of leaves under y.

A node is defined to be dissimilar iff it is not
similar to any node in the other tree.

Nodes x and y are defined to be married iff every
node in the subtree under x is similar to a node
in the subtree under y. Married nodes are considered
to be equal. The subtrees under two married nodes can
at most differ by exchanges of left and right branches,
which we do not consider to be significant here.

A node is defined to be a bachelor iff it is not
married. If a node is a bachelor, then it has a
dissimilar node in its subtree, and it follows
immediately from the definition of marriage that its
parent is also a bachelor. Hence all nodes on the path
from a bachelor node to the root are bachelors.

We assume the trees have the same set of leaves, so
every leaf is trivially both similar and married to
the same leaf in the opposite tree. Bachelor nodes
are therefore always internal (i.e., non-leaf) nodes.

A node is defined to be a diff iff (a) it is married
and (b) its parent is a bachelor. The subtree under a
diff is maximally similar to the other tree. (In other
words, you cannot extend the subtree without adding a
bachelor).

The set of diffs is the subset of the two trees that
we consider to be identical.

Example:

              -----A
        -----k
   ----j      -----B
--i     -----C
   ------D

              -----A
        -----p
   ----n      -----B
--m     -----D
   ------C

The following pairs of internal nodes are similar.

	Nodes	Set of leaves
	-----	-------------
	k,p		A,B
	i,m		A,B,C,D

Bachelors in the first tree are i and j, bachelors
in the second tree are m and n.

Node k and p are married, but i and m are not (because j
and n are bachelors). The diffs are C, D and k.

The set of bachelor nodes can be viewed as the internal
nodes of a tree, the leaves of which are diffs. (To see
that there can't be disjoint subtrees, note that the path
from a diff to a root is all bachelor nodes, so there is
always a path between two diffs that goes through the root).
We call this tree the "diffs tree".

There is a simple O(N) algorithm to build the diffs tree.
To achieve O(N) we avoid traversing a given subtree multiple
times and also avoid comparing lists of leaves.

We visit nodes in depth-first post-order (i.e., a node is
visited before its parent).

If either child of a node is a bachelor, we flag it as
a bachelor.

If both children of the node we are visiting are married,
we check whether the spouses of those children have the
same parent in the other tree. If the parents are different,
the current node is a bachelor. If they have the same parent,
then the node we are visiting is the spouse of that parent.
We assign this newly identified married couple a unique integer
id. The id of a node is in one-to-one correspondence with the
set of leaves in its subtree. Two nodes have the same set of
leaves iff they have the same id. Bachelor nodes do not get
an id.
***/

// Recursively copy the part of `tree` above its diff nodes into `Diffs`.
//   tree, uTreeNodeIndex     node of the source tree being visited
//   bIsDiff                  per-node flag, indexed by source node index
//   Diffs, uDiffsNodeIndex   node of the diffs tree that corresponds to
//                            uTreeNodeIndex
//   IdToDiffsLeafNodeIndex   output table: for every leaf id found under a
//                            diff node, the diffs-tree node for that diff
// A flagged node ends the recursion. Any other node must be internal: a
// branch is appended below uDiffsNodeIndex and both children are visited.
static void BuildDiffs(const Tree &tree, unsigned uTreeNodeIndex,
  const bool bIsDiff[], Tree &Diffs, unsigned uDiffsNodeIndex,
  unsigned IdToDiffsLeafNodeIndex[])
{
#if TRACE
	Log("BuildDiffs(TreeNode=%u IsDiff=%d IsLeaf=%d)\n",
	  uTreeNodeIndex, bIsDiff[uTreeNodeIndex], tree.IsLeaf(uTreeNodeIndex));
#endif
	if (bIsDiff[uTreeNodeIndex]) {
		// Buffer is sized for every leaf of the tree; GetLeaves then
		// updates uLeafCount through the pointer it is given.
		unsigned uLeafCount = tree.GetLeafCount();
		unsigned *Leaves = new unsigned[uLeafCount];
		GetLeaves(tree, uTreeNodeIndex, Leaves, &uLeafCount);
		for (unsigned n = 0; n < uLeafCount; ++n) {
			const unsigned uLeafNodeIndex = Leaves[n];
			const unsigned uId = tree.GetLeafId(uLeafNodeIndex);
			if (uId >= tree.GetLeafCount())
				Quit("BuildDiffs, id out of range");
			IdToDiffsLeafNodeIndex[uId] = uDiffsNodeIndex;
#if TRACE
			Log(" Leaf id=%u DiffsNode=%u\n", uId, uDiffsNodeIndex);
#endif
		}
		delete[] Leaves;
		return;
	}

	if (tree.IsLeaf(uTreeNodeIndex))
		Quit("BuildDiffs: should never reach leaf");

	const unsigned uTreeLeft = tree.GetLeft(uTreeNodeIndex);
	const unsigned uTreeRight = tree.GetRight(uTreeNodeIndex);

	// The right child of the new branch is taken to be the node index
	// immediately after the one AppendBranch returns.
	const unsigned uDiffsLeft = Diffs.AppendBranch(uDiffsNodeIndex);
	const unsigned uDiffsRight = uDiffsLeft + 1;

	BuildDiffs(tree, uTreeLeft, bIsDiff, Diffs, uDiffsLeft,
	  IdToDiffsLeafNodeIndex);

	BuildDiffs(tree, uTreeRight, bIsDiff, Diffs, uDiffsRight,
	  IdToDiffsLeafNodeIndex);
}

// Compare two rooted trees with equal node counts (see the algorithm
// description in the comment near the top of difftrees.cpp).
// NOTE(review): only the set-up part of this function is present in this
// text; see the truncation note further down.
void DiffTrees(const Tree &Tree1, const Tree &Tree2, Tree &Diffs,
  unsigned IdToDiffsLeafNodeIndex[])
{
#if TRACE
	Log("Tree1:\n");
	Tree1.LogMe();
	Log("\n");
	Log("Tree2:\n");
	Tree2.LogMe();
#endif

	if (!Tree1.IsRooted() || !Tree2.IsRooted())
		Quit("DiffTrees: requires rooted trees");

	const unsigned uNodeCount = Tree1.GetNodeCount();
	const unsigned uNodeCount2 = Tree2.GetNodeCount();

	const unsigned uLeafCount = Tree1.GetLeafCount();
	const unsigned uLeafCount2 = Tree2.GetLeafCount();
	assert(uLeafCount == uLeafCount2);

	if (uNodeCount != uNodeCount2)
		Quit("DiffTrees: different node counts");

// Allocate tables so we can convert tree node index to
// and from the unique id with a O(1) lookup.
	unsigned *NodeIndexToId1 = new unsigned[uNodeCount];
	unsigned *IdToNodeIndex2 = new unsigned[uNodeCount];

	bool *bIsBachelor1 = new bool[uNodeCount];
	bool *bIsDiff1 = new bool[uNodeCount];

	for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) {
		NodeIndexToId1[uNodeIndex] = uNodeCount;
		bIsBachelor1[uNodeIndex] = false;
		bIsDiff1[uNodeIndex] = false;

	// Use uNodeCount as value meaning "not set".
		IdToNodeIndex2[uNodeIndex] = uNodeCount;
	}

// Initialize node index <-> id lookup tables
	for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) {
		if (Tree1.IsLeaf(uNodeIndex)) {
			const unsigned uId = Tree1.GetLeafId(uNodeIndex);
			if (uId >= uNodeCount)
				Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");
			NodeIndexToId1[uNodeIndex] = uId;
		}

		if (Tree2.IsLeaf(uNodeIndex)) {
			const unsigned uId = Tree2.GetLeafId(uNodeIndex);
			if (uId >= uNodeCount)
				Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");
			IdToNodeIndex2[uId] = uNodeIndex;
		}
	}

// Validity check. This verifies that the ids
// pre-assigned to the leaves in Tree1 are unique
// (note that the id
// NOTE(review): the text is truncated at this point. Everything between a
// '<' in the comment above and a later '>' appears to have been dropped
// when this text was extracted, so the rest of DiffTrees is missing. The
// statements that follow refer to NewTree, OldTree and
// NewNodeIndexToOldNodeIndex, none of which is declared above, so they
// presumably belong to a different function. They are reproduced unchanged.
= uNodeCount)
				{
				Log("NewNode=%u uOld=%u > uNodeCount=%u\n",
				  uNewNodeIndex, uOld, uNodeCount);
				Quit("Diff check failed");
				}

			unsigned uIdNew = NewTree.GetLeafId(uNewNodeIndex);
			unsigned uIdOld = OldTree.GetLeafId(uOld);
			if (uIdNew != uIdOld)
				{
				Log("NewNode=%u uOld=%u IdNew=%u IdOld=%u\n",
				  uNewNodeIndex, uOld, uIdNew, uIdOld);
				Quit("Diff check failed");
				}
			continue;
			}

		if (NODE_CHANGED == uOld)
			continue;

		unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex);
		unsigned uNewRight = NewTree.GetRight(uNewNodeIndex);

		unsigned uOldLeft = OldTree.GetLeft(uOld);
		unsigned uOldRight = OldTree.GetRight(uOld);

		unsigned uNewLeftPartner = NewNodeIndexToOldNodeIndex[uNewLeft];
		unsigned uNewRightPartner = NewNodeIndexToOldNodeIndex[uNewRight];

		// A node matches its partner when its two children map onto the
		// partner's two children in either order.
		bool bSameNotRotated = (uNewLeftPartner == uOldLeft && uNewRightPartner == uOldRight);
		bool bSameRotated = (uNewLeftPartner == uOldRight && uNewRightPartner == uOldLeft);
		if (!bSameNotRotated && !bSameRotated)
			{
			Log("NewNode=%u NewL=%u NewR=%u\n", uNewNodeIndex, uNewLeft, uNewRight);
			Log("OldNode=%u OldL=%u OldR=%u\n", uOld, uOldLeft, uOldRight);
			Log("NewLPartner=%u NewRPartner=%u\n", uNewLeftPartner, uNewRightPartner);
			Quit("Diff check failed");
			}
		}
	}
#endif
	}
distcalc.cpp0000664000175000017500000000357112360262614011450 0ustar bobbob
#include "muscle.h"
#include "distfunc.h"
#include "distcalc.h"
#include "msa.h"

// Remember the distance function to read from. Only the address is stored,
// so DF must outlive this object.
void DistCalcDF::Init(const DistFunc &DF)
{
	m_ptrDF = &DF;
}

// Fill Dist[0..i-1] with the stored distances between item i and every
// lower-numbered item.
void DistCalcDF::CalcDistRange(unsigned i, dist_t Dist[]) const
{
	for (unsigned j = 0; j < i; ++j)
		Dist[j] = m_ptrDF->GetDist(i, j);
}

// Number of items, forwarded from the distance function.
unsigned DistCalcDF::GetCount() const
{
	return m_ptrDF->GetCount();
}

// Id of item i, forwarded from the distance function.
unsigned DistCalcDF::GetId(unsigned i) const
{
	return m_ptrDF->GetId(i);
}

// Name of item i, forwarded from the distance function.
const char *DistCalcDF::GetName(unsigned i) const
{
	return m_ptrDF->GetName(i);
}

// Remember the alignment and the distance measure to use. Only the address
// of msa is stored, so it must outlive this object.
void DistCalcMSA::Init(const MSA &msa, DISTANCE Distance)
{
	m_ptrMSA = &msa;
	m_Distance = Distance;
}

void
DistCalcMSA::CalcDistRange(unsigned i, dist_t Dist[]) const { for (unsigned j = 0; j < i; ++j) { switch (m_Distance) { case DISTANCE_PctIdKimura: { const float PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j); Dist[j] = (float) KimuraDist(PctId); break; } case DISTANCE_PctIdLog: { const float PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j); Dist[j] = (float) PctIdToMAFFTDist(PctId); break; } case DISTANCE_ScoreDist: { double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2); Dist[j] = (float) GetScoreDist(*m_ptrMSA, i, j); continue; } case DISTANCE_Edit: { const float PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j); if (PctId > 1.0) Quit("Internal error, DISTANCE_Edit, pct id=%.3g", PctId); Dist[j] = (float) 1.0 - PctId; break; } default: Quit("DistCalcMSA: Invalid DISTANCE_%u", m_Distance); } } } unsigned DistCalcMSA::GetCount() const { return m_ptrMSA->GetSeqCount(); } unsigned DistCalcMSA::GetId(unsigned i) const { return m_ptrMSA->GetSeqId(i); } const char *DistCalcMSA::GetName(unsigned i) const { return m_ptrMSA->GetSeqName(i); } distfunc.cpp0000664000175000017500000000457012360262614011501 0ustar bobbob#include "muscle.h" #include "distfunc.h" #include DistFunc::DistFunc() { m_Dists = 0; m_uCount = 0; m_uCacheCount = 0; m_Names = 0; m_Ids = 0; } DistFunc::~DistFunc() { if (0 != m_Names) { for (unsigned i = 0; i < m_uCount; ++i) free(m_Names[i]); } delete[] m_Dists; delete[] m_Names; delete[] m_Ids; } float DistFunc::GetDist(unsigned uIndex1, unsigned uIndex2) const { return m_Dists[VectorIndex(uIndex1, uIndex2)]; } unsigned DistFunc::GetCount() const { return m_uCount; } void DistFunc::SetCount(unsigned uCount) { m_uCount = uCount; if (uCount <= m_uCacheCount) return; delete[] m_Dists; m_Dists = new float[VectorLength()]; m_Names = new char *[m_uCount]; m_Ids = new unsigned[m_uCount]; m_uCacheCount = uCount; memset(m_Names, 0, m_uCount*sizeof(char *)); memset(m_Ids, 0xff, m_uCount*sizeof(unsigned)); memset(m_Dists, 0, 
VectorLength()*sizeof(float)); } void DistFunc::SetDist(unsigned uIndex1, unsigned uIndex2, float dDist) { m_Dists[VectorIndex(uIndex1, uIndex2)] = dDist; m_Dists[VectorIndex(uIndex2, uIndex1)] = dDist; } unsigned DistFunc::VectorIndex(unsigned uIndex1, unsigned uIndex2) const { assert(uIndex1 < m_uCount && uIndex2 < m_uCount); return uIndex1*m_uCount + uIndex2; } unsigned DistFunc::VectorLength() const { return m_uCount*m_uCount; } void DistFunc::SetName(unsigned uIndex, const char szName[]) { assert(uIndex < m_uCount); m_Names[uIndex] = strsave(szName); } void DistFunc::SetId(unsigned uIndex, unsigned uId) { assert(uIndex < m_uCount); m_Ids[uIndex] = uId; } const char *DistFunc::GetName(unsigned uIndex) const { assert(uIndex < m_uCount); return m_Names[uIndex]; } unsigned DistFunc::GetId(unsigned uIndex) const { assert(uIndex < m_uCount); return m_Ids[uIndex]; } void DistFunc::LogMe() const { Log("DistFunc::LogMe count=%u\n", m_uCount); Log(" "); for (unsigned i = 0; i < m_uCount; ++i) Log(" %7u", i); Log("\n"); Log(" "); for (unsigned i = 0; i < m_uCount; ++i) Log(" %7.7s", m_Names[i] ? m_Names[i] : ""); Log("\n"); for (unsigned i = 0; i < m_uCount; ++i) { Log("%4u %10.10s : ", i, m_Names[i] ? 
m_Names[i] : ""); for (unsigned j = 0; j <= i; ++j) Log(" %7.4g", GetDist(i, j)); Log("\n"); } } distpwkimura.cpp0000664000175000017500000000216612360262614012404 0ustar bobbob#include "muscle.h" #include "distfunc.h" #include "msa.h" #include "seqvect.h" #include "pwpath.h" void DistPWKimura(const SeqVect &v, DistFunc &DF) { SEQWEIGHT SeqWeightSave = GetSeqWeightMethod(); SetSeqWeightMethod(SEQWEIGHT_Henikoff); const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; unsigned uCount = 0; SetProgressDesc("PWKimura distance"); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const Seq &s1 = v.GetSeq(uSeqIndex1); MSA msa1; msa1.FromSeq(s1); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) { if (0 == uCount%20) Progress(uCount, uPairCount); ++uCount; const Seq &s2 = v.GetSeq(uSeqIndex2); MSA msa2; msa2.FromSeq(s2); PWPath Path; MSA msaOut; AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false); double dPctId = msaOut.GetPctIdentityPair(0, 1); float f = (float) KimuraDist(dPctId); DF.SetDist(uSeqIndex1, uSeqIndex2, f); } } ProgressStepsDone(); SetSeqWeightMethod(SeqWeightSave); } domuscle.cpp0000664000175000017500000001607712360262614011502 0ustar bobbob#include "muscle.h" #include "textfile.h" #include "seqvect.h" #include "distfunc.h" #include "msa.h" #include "tree.h" #include "profile.h" #include "timing.h" static char g_strUseTreeWarning[] = "\n******** WARNING ****************\n" "\nYou specified the -usetree option.\n" "Note that a good evolutionary tree may NOT be a good\n" "guide tree for multiple alignment. For more details,\n" "please refer to the user guide. 
To disable this\n" "warning, use -usetree_nowarn .\n\n"; void DoMuscle() { SetOutputFileName(g_pstrOutFileName); SetInputFileName(g_pstrInFileName); SetMaxIters(g_uMaxIters); SetSeqWeightMethod(g_SeqWeight1); TextFile fileIn(g_pstrInFileName); SeqVect v; v.FromFASTAFile(fileIn); const unsigned uSeqCount = v.Length(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = v.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); v.FixAlpha(); PTR_SCOREMATRIX UserMatrix = 0; if (0 != g_pstrMatrixFileName) { const char *FileName = g_pstrMatrixFileName; const char *Path = getenv("MUSCLE_MXPATH"); if (Path != 0) { size_t n = strlen(Path) + 1 + strlen(FileName) + 1; char *NewFileName = new char[n]; sprintf(NewFileName, "%s/%s", Path, FileName); FileName = NewFileName; } TextFile File(FileName); UserMatrix = ReadMx(File); g_Alpha = ALPHA_Amino; g_PPScore = PPSCORE_SP; } SetPPScore(); if (0 != UserMatrix) g_ptrScoreMatrix = UserMatrix; unsigned uMinL = 0; unsigned uMaxL = 0; unsigned uTotL = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned L = v.GetSeq(uSeqIndex).Length(); uTotL += L; if (uMinL == 0 || L < uMinL) uMinL = L; if (L > uMaxL) uMaxL = L; } SetIter(1); g_bDiags = g_bDiags1; SetSeqStats(uSeqCount, uMinL, uMaxL, uTotL/uSeqCount); SetMuscleSeqVect(v); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. 
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) v.SetSeqId(uSeqIndex, uSeqIndex); if (0 == uSeqCount) Quit("Input file '%s' has no sequences", g_pstrInFileName); if (1 == uSeqCount) { TextFile fileOut(g_pstrOutFileName, true); v.ToFile(fileOut); return; } if (uSeqCount > 1) MHackStart(v); // First iteration Tree GuideTree; if (0 != g_pstrUseTreeFileName) { // Discourage users... if (!g_bUseTreeNoWarn) fprintf(stderr, "%s", g_strUseTreeWarning); // Read tree from file TextFile TreeFile(g_pstrUseTreeFileName); GuideTree.FromFile(TreeFile); // Make sure tree is rooted if (!GuideTree.IsRooted()) Quit("User tree must be rooted"); if (GuideTree.GetLeafCount() != uSeqCount) Quit("User tree does not match input sequences"); const unsigned uNodeCount = GuideTree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!GuideTree.IsLeaf(uNodeIndex)) continue; const char *LeafName = GuideTree.GetLeafName(uNodeIndex); unsigned uSeqIndex; bool SeqFound = v.FindName(LeafName, &uSeqIndex); if (!SeqFound) Quit("Label %s in tree does not match sequences", LeafName); unsigned uId = v.GetSeqIdFromName(LeafName); GuideTree.SetLeafId(uNodeIndex, uId); } } else TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1, g_pstrDistMxFileName1); const char *Tree1 = ValueOpt("Tree1"); if (0 != Tree1) { TextFile f(Tree1, true); GuideTree.ToFile(f); if (g_bClusterOnly) return; } SetMuscleTree(GuideTree); ValidateMuscleIds(GuideTree); MSA msa; ProgNode *ProgNodes = 0; if (g_bLow) ProgNodes = ProgressiveAlignE(v, GuideTree, msa); else ProgressiveAlign(v, GuideTree, msa); SetCurrentAlignment(msa); if (0 != g_pstrComputeWeightsFileName) { extern void OutWeights(const char *FileName, const MSA &msa); SetMSAWeightsMuscle(msa); OutWeights(g_pstrComputeWeightsFileName, msa); return; } ValidateMuscleIds(msa); if (1 == g_uMaxIters || 2 == uSeqCount) { //TextFile fileOut(g_pstrOutFileName, true); //MHackEnd(msa); //msa.ToFile(fileOut); 
MuscleOutput(msa); return; } if (0 == g_pstrUseTreeFileName) { g_bDiags = g_bDiags2; SetIter(2); if (g_bLow) { if (0 != g_uMaxTreeRefineIters) RefineTreeE(msa, v, GuideTree, ProgNodes); } else RefineTree(msa, GuideTree); const char *Tree2 = ValueOpt("Tree2"); if (0 != Tree2) { TextFile f(Tree2, true); GuideTree.ToFile(f); } } SetSeqWeightMethod(g_SeqWeight2); SetMuscleTree(GuideTree); if (g_bAnchors) RefineVert(msa, GuideTree, g_uMaxIters - 2); else RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false); #if 0 // Refining by subfamilies is disabled as it didn't give better // results. I tried doing this before and after RefineHoriz. // Should get back to this as it seems like this should work. RefineSubfams(msa, GuideTree, g_uMaxIters - 2); #endif ValidateMuscleIds(msa); ValidateMuscleIds(GuideTree); //TextFile fileOut(g_pstrOutFileName, true); //MHackEnd(msa); //msa.ToFile(fileOut); MuscleOutput(msa); } void Run() { SetStartTime(); Log("Started %s\n", GetTimeAsStr()); for (int i = 0; i < g_argc; ++i) Log("%s ", g_argv[i]); Log("\n"); #if TIMING TICKS t1 = GetClockTicks(); #endif if (g_bRefine) Refine(); else if (g_bRefineW) { extern void DoRefineW(); DoRefineW(); } else if (g_bProfDB) ProfDB(); else if (g_bSW) Local(); else if (0 != g_pstrSPFileName) DoSP(); else if (g_bProfile) Profile(); else if (g_bPPScore) PPScore(); else if (g_bPAS) ProgAlignSubFams(); else if (g_bMakeTree) { extern void DoMakeTree(); DoMakeTree(); } else DoMuscle(); #if TIMING extern TICKS g_ticksDP; extern TICKS g_ticksObjScore; TICKS t2 = GetClockTicks(); TICKS TotalTicks = t2 - t1; TICKS ticksOther = TotalTicks - g_ticksDP - g_ticksObjScore; double dSecs = TicksToSecs(TotalTicks); double PctDP = (double) g_ticksDP*100.0/(double) TotalTicks; double PctOS = (double) g_ticksObjScore*100.0/(double) TotalTicks; double PctOther = (double) ticksOther*100.0/(double) TotalTicks; Log(" Ticks Secs Pct\n"); Log(" ============ ======= =====\n"); Log("DP %12ld %7.2f %5.1f%%\n", (long) g_ticksDP, 
TicksToSecs(g_ticksDP), PctDP); Log("OS %12ld %7.2f %5.1f%%\n", (long) g_ticksObjScore, TicksToSecs(g_ticksObjScore), PctOS); Log("Other %12ld %7.2f %5.1f%%\n", (long) ticksOther, TicksToSecs(ticksOther), PctOther); Log("Total %12ld %7.2f 100.0%%\n", (long) TotalTicks, dSecs); #endif ListDiagSavings(); Log("Finished %s\n", GetTimeAsStr()); } dosp.cpp0000664000175000017500000000222312360262613010617 0ustar bobbob#include "muscle.h" #include "textfile.h" #include "msa.h" #include "objscore.h" #include "tree.h" #include "profile.h" void DoSP() { TextFile f(g_pstrSPFileName); MSA a; a.FromFile(f); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = a.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); a.FixAlpha(); SetPPScore(); const unsigned uSeqCount = a.GetSeqCount(); if (0 == uSeqCount) Quit("No sequences in input file %s", g_pstrSPFileName); MSA::SetIdCount(uSeqCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) a.SetSeqId(uSeqIndex, uSeqIndex); SetSeqWeightMethod(g_SeqWeight1); Tree tree; TreeFromMSA(a, tree, g_Cluster2, g_Distance2, g_Root2); SetMuscleTree(tree); SetMSAWeightsMuscle((MSA &) a); SCORE SP = ObjScoreSP(a); Log("File=%s;SP=%.4g\n", g_pstrSPFileName, SP); fprintf(stderr, "File=%s;SP=%.4g\n", g_pstrSPFileName, SP); } dpreglist.cpp0000664000175000017500000000655312360262614011662 0ustar bobbob#include "muscle.h" #include "dpreglist.h" unsigned DPRegionList::GetDPArea() const { unsigned uArea = 0; for (unsigned i = 0; i < m_uCount; ++i) { const DPRegion &r = m_DPRegions[i]; if (DPREGIONTYPE_Rect == r.m_Type) uArea += r.m_Rect.m_uLengthA*r.m_Rect.m_uLengthB; } return uArea; } void DPRegionList::Add(const DPRegion &r) { if (m_uCount == MAX_DPREGIONS) Quit("DPRegionList::Add, overflow %d", m_uCount); m_DPRegions[m_uCount] = r; ++m_uCount; } void 
DPRegionList::LogMe() const { Log("DPRegionList::LogMe, count=%u\n", m_uCount); Log("Region Type StartA StartB EndA EndB\n"); Log("------ ---- ------ ------ ---- ----\n"); for (unsigned i = 0; i < m_uCount; ++i) { const DPRegion &r = m_DPRegions[i]; Log("%6u ", i); if (DPREGIONTYPE_Diag == r.m_Type) Log("Diag %6u %6u %6u %6u\n", r.m_Diag.m_uStartPosA, r.m_Diag.m_uStartPosB, r.m_Diag.m_uStartPosA + r.m_Diag.m_uLength - 1, r.m_Diag.m_uStartPosB + r.m_Diag.m_uLength - 1); else if (DPREGIONTYPE_Rect == r.m_Type) Log("Rect %6u %6u %6u %6u\n", r.m_Rect.m_uStartPosA, r.m_Rect.m_uStartPosB, r.m_Rect.m_uStartPosA + r.m_Rect.m_uLengthA - 1, r.m_Rect.m_uStartPosB + r.m_Rect.m_uLengthB - 1); else Log(" *** ERROR *** Type=%u\n", r.m_Type); } } void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL, unsigned uLengthA, unsigned uLengthB) { if (g_uDiagMargin > g_uMinDiagLength/2) Quit("Invalid parameters, diagmargin=%d must be <= 2*diaglength=%d", g_uDiagMargin, g_uMinDiagLength); unsigned uStartPosA = 0; unsigned uStartPosB = 0; const unsigned uDiagCount = DL.GetCount(); DPRegion r; for (unsigned uDiagIndex = 0; uDiagIndex < uDiagCount; ++uDiagIndex) { const Diag &d = DL.Get(uDiagIndex); assert(d.m_uLength >= g_uMinDiagLength); const unsigned uStartVertexA = d.m_uStartPosA + g_uDiagMargin - 1; const unsigned uStartVertexB = d.m_uStartPosB + g_uDiagMargin - 1; const unsigned uEndVertexA = d.m_uStartPosA + d.m_uLength - g_uDiagMargin; const unsigned uEndVertexB = d.m_uStartPosB + d.m_uLength - g_uDiagMargin; r.m_Type = DPREGIONTYPE_Rect; r.m_Rect.m_uStartPosA = uStartPosA; r.m_Rect.m_uStartPosB = uStartPosB; assert(uStartVertexA + 1 >= uStartPosA); assert(uStartVertexB + 1 >= uStartPosB); r.m_Rect.m_uLengthA = uStartVertexA + 1 - uStartPosA; r.m_Rect.m_uLengthB = uStartVertexB + 1 - uStartPosB; RL.Add(r); if (uEndVertexA > uStartVertexA + 1) { const unsigned uDiagLengthMinusCaps = uEndVertexA - uStartVertexA - 1; r.m_Type = DPREGIONTYPE_Diag; r.m_Diag.m_uStartPosA = 
uStartVertexA + 1; r.m_Diag.m_uStartPosB = uStartVertexB + 1; assert(uEndVertexA - uStartVertexA == uEndVertexB - uStartVertexB); r.m_Diag.m_uLength = uEndVertexA - uStartVertexA - 1; RL.Add(r); } uStartPosA = uEndVertexA; uStartPosB = uEndVertexB; } assert((int) uLengthA - (int) uStartPosA >= (int) g_uDiagMargin); assert((int) uLengthB - (int) uStartPosB >= (int) g_uDiagMargin); r.m_Type = DPREGIONTYPE_Rect; r.m_Rect.m_uStartPosA = uStartPosA; r.m_Rect.m_uStartPosB = uStartPosB; assert(uLengthA >= uStartPosA); assert(uLengthB >= uStartPosB); r.m_Rect.m_uLengthA = uLengthA - uStartPosA; r.m_Rect.m_uLengthB = uLengthB - uStartPosB; RL.Add(r); } drawtree.cpp0000664000175000017500000000162712360262614011477 0ustar bobbob#include "muscle.h" #include "tree.h" /*** Simple tree drawing algorithm. y coordinate of node is index in depth-first traversal. x coordinate is distance from root. ***/ static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex) { const unsigned uRoot = tree.GetRootNodeIndex(); unsigned uDist = 0; while (uNodeIndex != uRoot) { ++uDist; uNodeIndex = tree.GetParent(uNodeIndex); } return uDist; } static void DrawNode(const Tree &tree, unsigned uNodeIndex) { if (!tree.IsLeaf(uNodeIndex)) DrawNode(tree, tree.GetLeft(uNodeIndex)); unsigned uDist = DistFromRoot(tree, uNodeIndex); for (unsigned i = 0; i < 5*uDist; ++i) Log(" "); Log("%d\n", uNodeIndex); if (!tree.IsLeaf(uNodeIndex)) DrawNode(tree, tree.GetRight(uNodeIndex)); } void DrawTree(const Tree &tree) { unsigned uRoot = tree.GetRootNodeIndex(); DrawNode(tree, uRoot); } edgelist.cpp0000664000175000017500000000326712360262614011464 0ustar bobbob#include "muscle.h" #include "edgelist.h" EdgeList::EdgeList() { m_uNode1 = 0; m_uNode2 = 0; m_uCount = 0; m_uCacheSize = 0; } EdgeList::~EdgeList() { Clear(); } void EdgeList::Clear() { delete[] m_uNode1; delete[] m_uNode2; m_uNode1 = 0; m_uNode2 = 0; m_uCount = 0; m_uCacheSize = 0; } void EdgeList::Add(unsigned uNode1, unsigned uNode2) { if (m_uCount <= 
m_uCacheSize) Expand(); m_uNode1[m_uCount] = uNode1; m_uNode2[m_uCount] = uNode2; ++m_uCount; } unsigned EdgeList::GetCount() const { return m_uCount; } void EdgeList::GetEdge(unsigned uIndex, unsigned *ptruNode1, unsigned *ptruNode2) const { if (uIndex > m_uCount) Quit("EdgeList::GetEdge(%u) count=%u", uIndex, m_uCount); *ptruNode1 = m_uNode1[uIndex]; *ptruNode2 = m_uNode2[uIndex]; } void EdgeList::Copy(const EdgeList &rhs) { Clear(); const unsigned uCount = rhs.GetCount(); for (unsigned n = 0; n < uCount; ++n) { unsigned uNode1; unsigned uNode2; rhs.GetEdge(n, &uNode1, &uNode2); Add(uNode1, uNode2); } } void EdgeList::Expand() { unsigned uNewCacheSize = m_uCacheSize + 512; unsigned *NewNode1 = new unsigned[uNewCacheSize]; unsigned *NewNode2 = new unsigned[uNewCacheSize]; if (m_uCount > 0) { memcpy(NewNode1, m_uNode1, m_uCount*sizeof(unsigned)); memcpy(NewNode2, m_uNode2, m_uCount*sizeof(unsigned)); } delete[] m_uNode1; delete[] m_uNode2; m_uNode1 = NewNode1; m_uNode2 = NewNode2; m_uCacheSize = uNewCacheSize; } void EdgeList::LogMe() const { for (unsigned n = 0; n < m_uCount; ++n) { if (n > 0) Log(" "); Log("%u->%u", m_uNode1[n], m_uNode2[n]); } Log("\n"); } enumopts.cpp0000664000175000017500000000024012360262614011522 0ustar bobbob#include "muscle.h" #include "enumopts.h" #define s(t) EnumOpt t##_Opts[] = { #define c(t, x) #x, t##_##x, #define e(t) 0, 0 }; #include "enums.h" enumtostr.cpp0000664000175000017500000000120412360262613011710 0ustar bobbob#include "muscle.h" #include static char szMsg[64]; // Define XXXToStr(XXX x) functions for each enum type XXX. #define s(t) const char *t##ToStr(t x) { switch (x) { case t##_Undefined: return "Undefined"; #define c(t, x) case t##_##x: return #x; #define e(t) } sprintf(szMsg, #t "_%d", x); return szMsg; } #include "enums.h" // Define StrToXXX(const char *Str) functions for each enum type XXX. 
#define s(t) t StrTo##t(const char *Str) { if (0) ; #define c(t, x) else if (0 == stricmp(#x, Str)) return t##_##x; #define e(t) Quit("Invalid value %s for type %s", Str, #t); return t##_Undefined; } #include "enums.h" estring.cpp0000664000175000017500000003171012360262613011330 0ustar bobbob#include "muscle.h" #include "pwpath.h" #include "estring.h" #include "seq.h" #include "msa.h" /*** An "estring" is an edit string that operates on a sequence. An estring is represented as a vector of integers. It is interpreted in order of increasing suffix. A positive value n means copy n letters. A negative value -n means insert n indels. Zero marks the end of the vector. Consecutive entries must have opposite sign, i.e. the shortest possible representation must be used. A "tpair" is a traceback path for a pairwise alignment represented as two estrings, one for each sequence. ***/ #define c2(c,d) (((unsigned char) c) << 8 | (unsigned char) d) unsigned LengthEstring(const int es[]) { unsigned i = 0; while (*es++ != 0) ++i; return i; } int *EstringNewCopy(const int es[]) { unsigned n = LengthEstring(es) + 1; int *esNew = new int[n]; memcpy(esNew, es, n*sizeof(int)); return esNew; } void LogEstring(const int es[]) { Log("<"); for (unsigned i = 0; es[i] != 0; ++i) { if (i > 0) Log(" "); Log("%d", es[i]); } Log(">"); } static bool EstringsEq(const int es1[], const int es2[]) { for (;;) { if (*es1 != *es2) return false; if (0 == *es1) break; ++es1; ++es2; } return true; } static void EstringCounts(const int es[], unsigned *ptruSymbols, unsigned *ptruIndels) { unsigned uSymbols = 0; unsigned uIndels = 0; for (unsigned i = 0; es[i] != 0; ++i) { int n = es[i]; if (n > 0) uSymbols += n; else if (n < 0) uIndels += -n; } *ptruSymbols = uSymbols; *ptruIndels = uIndels; } static char *EstringOp(const int es[], const char s[]) { unsigned uSymbols; unsigned uIndels; EstringCounts(es, &uSymbols, &uIndels); assert((unsigned) strlen(s) == uSymbols); char *sout = new char[uSymbols + uIndels + 
1]; char *psout = sout; for (;;) { int n = *es++; if (0 == n) break; if (n > 0) for (int i = 0; i < n; ++i) *psout++ = *s++; else for (int i = 0; i < -n; ++i) *psout++ = '-'; } assert(0 == *s); *psout = 0; return sout; } void EstringOp(const int es[], const Seq &sIn, Seq &sOut) { #if DEBUG unsigned uSymbols; unsigned uIndels; EstringCounts(es, &uSymbols, &uIndels); assert(sIn.Length() == uSymbols); #endif sOut.Clear(); sOut.SetName(sIn.GetName()); int p = 0; for (;;) { int n = *es++; if (0 == n) break; if (n > 0) for (int i = 0; i < n; ++i) { char c = sIn[p++]; sOut.push_back(c); } else for (int i = 0; i < -n; ++i) sOut.push_back('-'); } } unsigned EstringOp(const int es[], const Seq &sIn, MSA &a) { unsigned uSymbols; unsigned uIndels; EstringCounts(es, &uSymbols, &uIndels); assert(sIn.Length() == uSymbols); unsigned uColCount = uSymbols + uIndels; a.Clear(); a.SetSize(1, uColCount); a.SetSeqName(0, sIn.GetName()); a.SetSeqId(0, sIn.GetId()); unsigned p = 0; unsigned uColIndex = 0; for (;;) { int n = *es++; if (0 == n) break; if (n > 0) for (int i = 0; i < n; ++i) { char c = sIn[p++]; a.SetChar(0, uColIndex++, c); } else for (int i = 0; i < -n; ++i) a.SetChar(0, uColIndex++, '-'); } assert(uColIndex == uColCount); return uColCount; } void PathToEstrings(const PWPath &Path, int **ptresA, int **ptresB) { // First pass to determine size of estrings esA and esB const unsigned uEdgeCount = Path.GetEdgeCount(); if (0 == uEdgeCount) { int *esA = new int[1]; int *esB = new int[1]; esA[0] = 0; esB[0] = 0; *ptresA = esA; *ptresB = esB; return; } unsigned iLengthA = 1; unsigned iLengthB = 1; const char cFirstEdgeType = Path.GetEdge(0).cType; char cPrevEdgeType = cFirstEdgeType; for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); char cEdgeType = Edge.cType; switch (c2(cPrevEdgeType, cEdgeType)) { case c2('M', 'M'): case c2('D', 'D'): case c2('I', 'I'): break; case c2('D', 'M'): case c2('M', 'D'): ++iLengthB; 
break; case c2('I', 'M'): case c2('M', 'I'): ++iLengthA; break; case c2('I', 'D'): case c2('D', 'I'): ++iLengthB; ++iLengthA; break; default: assert(false); } cPrevEdgeType = cEdgeType; } // Pass2 for seq A { int *esA = new int[iLengthA+1]; unsigned iA = 0; switch (Path.GetEdge(0).cType) { case 'M': case 'D': esA[0] = 1; break; case 'I': esA[0] = -1; break; default: assert(false); } char cPrevEdgeType = cFirstEdgeType; for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); char cEdgeType = Edge.cType; switch (c2(cPrevEdgeType, cEdgeType)) { case c2('M', 'M'): case c2('D', 'D'): case c2('D', 'M'): case c2('M', 'D'): ++(esA[iA]); break; case c2('I', 'D'): case c2('I', 'M'): ++iA; esA[iA] = 1; break; case c2('M', 'I'): case c2('D', 'I'): ++iA; esA[iA] = -1; break; case c2('I', 'I'): --(esA[iA]); break; default: assert(false); } cPrevEdgeType = cEdgeType; } assert(iA == iLengthA - 1); esA[iLengthA] = 0; *ptresA = esA; } { // Pass2 for seq B int *esB = new int[iLengthB+1]; unsigned iB = 0; switch (Path.GetEdge(0).cType) { case 'M': case 'I': esB[0] = 1; break; case 'D': esB[0] = -1; break; default: assert(false); } char cPrevEdgeType = cFirstEdgeType; for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); char cEdgeType = Edge.cType; switch (c2(cPrevEdgeType, cEdgeType)) { case c2('M', 'M'): case c2('I', 'I'): case c2('I', 'M'): case c2('M', 'I'): ++(esB[iB]); break; case c2('D', 'I'): case c2('D', 'M'): ++iB; esB[iB] = 1; break; case c2('M', 'D'): case c2('I', 'D'): ++iB; esB[iB] = -1; break; case c2('D', 'D'): --(esB[iB]); break; default: assert(false); } cPrevEdgeType = cEdgeType; } assert(iB == iLengthB - 1); esB[iLengthB] = 0; *ptresB = esB; } #if DEBUG { const PWEdge &LastEdge = Path.GetEdge(uEdgeCount - 1); unsigned uSymbols; unsigned uIndels; EstringCounts(*ptresA, &uSymbols, &uIndels); assert(uSymbols == LastEdge.uPrefixLengthA); 
assert(uSymbols + uIndels == uEdgeCount); EstringCounts(*ptresB, &uSymbols, &uIndels); assert(uSymbols == LastEdge.uPrefixLengthB); assert(uSymbols + uIndels == uEdgeCount); PWPath TmpPath; EstringsToPath(*ptresA, *ptresB, TmpPath); TmpPath.AssertEqual(Path); } #endif } void EstringsToPath(const int esA[], const int esB[], PWPath &Path) { Path.Clear(); unsigned iA = 0; unsigned iB = 0; int nA = esA[iA++]; int nB = esB[iB++]; unsigned uPrefixLengthA = 0; unsigned uPrefixLengthB = 0; for (;;) { char cType; if (nA > 0) { if (nB > 0) { cType = 'M'; --nA; --nB; } else if (nB < 0) { cType = 'D'; --nA; ++nB; } else assert(false); } else if (nA < 0) { if (nB > 0) { cType = 'I'; ++nA; --nB; } else assert(false); } else assert(false); switch (cType) { case 'M': ++uPrefixLengthA; ++uPrefixLengthB; break; case 'D': ++uPrefixLengthA; break; case 'I': ++uPrefixLengthB; break; } PWEdge Edge; Edge.cType = cType; Edge.uPrefixLengthA = uPrefixLengthA; Edge.uPrefixLengthB = uPrefixLengthB; Path.AppendEdge(Edge); if (nA == 0) { if (0 == esA[iA]) { assert(0 == esB[iB]); break; } nA = esA[iA++]; } if (nB == 0) nB = esB[iB++]; } } /*** Multiply two estrings to make a third estring. The product of two estrings e1*e2 is defined to be the estring that produces the same result as applying e1 then e2. Multiplication is not commutative. In fact, the reversed order is undefined unless both estrings consist of a single, identical, positive entry. A primary motivation for using estrings is that multiplication is very fast, reducing the time needed to construct the root alignment. 
Example <-1,3>(XXX) = -XXX <2,-1,2>(-XXX) = -X-XX Therefore, <-1,3>*<2,-1,2> = <-1,1,-1,2> ***/ static bool CanMultiplyEstrings(const int es1[], const int es2[]) { unsigned uSymbols1; unsigned uSymbols2; unsigned uIndels1; unsigned uIndels2; EstringCounts(es1, &uSymbols1, &uIndels1); EstringCounts(es2, &uSymbols2, &uIndels2); return uSymbols1 + uIndels1 == uSymbols2; } static inline void AppendGaps(int esp[], int &ip, int n) { assert(n < SHRT_MAX); if (-1 == ip) esp[++ip] = n; else if (esp[ip] < 0) esp[ip] += n; else esp[++ip] = n; } static inline void AppendSymbols(int esp[], int &ip, int n) { assert(n < SHRT_MAX); if (-1 == ip) esp[++ip] = n; else if (esp[ip] > 0) esp[ip] += n; else esp[++ip] = n; } void MulEstrings(const int es1[], const int es2[], int esp[]) { unsigned i1 = 0; int ip = -1; int n1 = es1[i1++]; for (unsigned i2 = 0; ; ++i2) { int n2 = es2[i2]; if (0 == n2) break; if (n2 > 0) { for (;;) { if (n1 < 0) { if (n2 > -n1) { AppendGaps(esp, ip, n1); n2 += n1; n1 = es1[i1++]; } else if (n2 == -n1) { AppendGaps(esp, ip, n1); n1 = es1[i1++]; break; } else { assert(n2 < -n1); AppendGaps(esp, ip, -n2); n1 += n2; break; } } else { assert(n1 > 0); if (n2 > n1) { AppendSymbols(esp, ip, n1); n2 -= n1; n1 = es1[i1++]; } else if (n2 == n1) { AppendSymbols(esp, ip, n1); n1 = es1[i1++]; break; } else { assert(n2 < n1); AppendSymbols(esp, ip, n2); n1 -= n2; break; } } } } else { assert(n2 < 0); AppendGaps(esp, ip, n2); } } esp[++ip] = 0; #if DEBUG { int MaxLen = (int) (LengthEstring(es1) + LengthEstring(es2) + 1); assert(ip < MaxLen); if (ip >= 2) for (int i = 0; i < ip - 2; ++i) { if (!(esp[i] > 0 && esp[i+1] < 0 || esp[i] < 0 && esp[i+1] > 0)) { Log("Bad result of MulEstring: "); LogEstring(esp); Quit("Assert failed (alternating signs)"); } } unsigned uSymbols1; unsigned uSymbols2; unsigned uSymbolsp; unsigned uIndels1; unsigned uIndels2; unsigned uIndelsp; EstringCounts(es1, &uSymbols1, &uIndels1); EstringCounts(es2, &uSymbols2, &uIndels2); EstringCounts(esp, 
&uSymbolsp, &uIndelsp); if (uSymbols1 + uIndels1 != uSymbols2) { Log("Bad result of MulEstring: "); LogEstring(esp); Quit("Assert failed (counts1 %u %u %u)", uSymbols1, uIndels1, uSymbols2); } } #endif } static void test(const int es1[], const int es2[], const int esa[]) { unsigned uSymbols1; unsigned uSymbols2; unsigned uIndels1; unsigned uIndels2; EstringCounts(es1, &uSymbols1, &uIndels1); EstringCounts(es2, &uSymbols2, &uIndels2); char s[4096]; memset(s, 'X', sizeof(s)); s[uSymbols1] = 0; char *s1 = EstringOp(es1, s); char *s12 = EstringOp(es2, s1); memset(s, 'X', sizeof(s)); s[uSymbols2] = 0; char *s2 = EstringOp(es2, s); Log("%s * %s = %s\n", s1, s2, s12); LogEstring(es1); Log(" * "); LogEstring(es2); Log(" = "); LogEstring(esa); Log("\n"); int esp[4096]; MulEstrings(es1, es2, esp); LogEstring(esp); if (!EstringsEq(esp, esa)) Log(" *ERROR* "); Log("\n"); memset(s, 'X', sizeof(s)); s[uSymbols1] = 0; char *sp = EstringOp(esp, s); Log("%s\n", sp); Log("\n==========\n\n"); } void TestEstrings() { SetListFileName("c:\\tmp\\muscle.log", false); //{ //int es1[] = { -1, 1, -1, 0 }; //int es2[] = { 1, -1, 2, 0 }; //int esa[] = { -2, 1, -1, 0 }; //test(es1, es2, esa); //} //{ //int es1[] = { 2, -1, 2, 0 }; //int es2[] = { 1, -1, 3, -1, 1, 0 }; //int esa[] = { 1, -1, 1, -1, 1, -1, 1, 0 }; //test(es1, es2, esa); //} //{ //int es1[] = { -1, 3, 0 }; //int es2[] = { 2, -1, 2, 0 }; //int esa[] = { -1, 1, -1, 2, 0 }; //test(es1, es2, esa); //} //{ //int es1[] = { -1, 1, -1, 1, 0}; //int es2[] = { 4, 0 }; //int esa[] = { -1, 1, -1, 1, 0}; //test(es1, es2, esa); //} //{ //int es1[] = { 1, -1, 1, -1, 0}; //int es2[] = { 4, 0 }; //int esa[] = { 1, -1, 1, -1, 0}; //test(es1, es2, esa); //} //{ //int es1[] = { 1, -1, 1, -1, 0}; //int es2[] = { -1, 4, -1, 0 }; //int esa[] = { -1, 1, -1, 1, -2, 0}; //test(es1, es2, esa); //} { int es1[] = { 106, -77, 56, -2, 155, -3, 123, -2, 0}; int es2[] = { 50, -36, 34, -3, 12, -6, 1, -6, 18, -17, 60, -5, 349, -56, 0 }; int esa[] = { 0 }; test(es1, 
es2, esa); } exit(0); } fasta2.cpp0000664000175000017500000000460612360262614011042 0ustar bobbob#include "muscle.h" #include #include //const int BUFFER_BYTES = 16*1024; const int BUFFER_BYTES = 128; const int CR = '\r'; const int NL = '\n'; #define ADD(c) \ { \ if (Pos >= BufferLength) \ { \ const int NewBufferLength = BufferLength + BUFFER_BYTES; \ char *NewBuffer = new char[NewBufferLength]; \ memcpy(NewBuffer, Buffer, BufferLength); \ delete[] Buffer; \ Buffer = NewBuffer; \ BufferLength = NewBufferLength; \ } \ Buffer[Pos++] = c; \ } // Get next sequence from file. char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, bool DeleteGaps) { unsigned BufferLength = 0; unsigned Pos = 0; char *Buffer = 0; int c = fgetc(f); if (EOF == c) return 0; if ('>' != c) Quit("Invalid file format, expected '>' to start FASTA label"); for (;;) { int c = fgetc(f); if (EOF == c) Quit("End-of-file or input error in FASTA label"); // NL or CR terminates label if (NL == c || CR == c) break; // All other characters added to label ADD(c) } // Nul-terminate label ADD(0) *ptrLabel = Buffer; BufferLength = 0; Pos = 0; Buffer = 0; int PreviousChar = NL; for (;;) { int c = fgetc(f); if (EOF == c) { if (feof(f)) break; else if (ferror(f)) Quit("Error reading FASTA file, ferror=TRUE feof=FALSE errno=%d %s", errno, strerror(errno)); else Quit("Error reading FASTA file, fgetc=EOF feof=FALSE ferror=FALSE errno=%d %s", errno, strerror(errno)); } if ('>' == c) { if (NL == PreviousChar || CR == PreviousChar) { ungetc(c, f); break; } else Quit("Unexpected '>' in FASTA sequence data"); } else if (isspace(c)) ; else if (IsGapChar(c)) { if (!DeleteGaps) ADD(c) } else if (isalpha(c)) { c = toupper(c); ADD(c) } else if (isprint(c)) { Warning("Invalid character '%c' in FASTA sequence data, ignored", c); continue; } else { Warning("Invalid byte hex %02x in FASTA sequence data, ignored", (unsigned char) c); continue; } PreviousChar = c; } if (0 == Pos) return GetFastaSeq(f, ptrSeqLength, 
ptrLabel, DeleteGaps); *ptrSeqLength = Pos; return Buffer; } fasta.cpp0000664000175000017500000000235412360262614010756 0ustar bobbob#include "muscle.h" #include #include #include "msa.h" #include "textfile.h" const unsigned FASTA_BLOCK = 60; void MSA::FromFASTAFile(TextFile &File) { Clear(); FILE *f = File.GetStdioFile(); unsigned uSeqCount = 0; unsigned uColCount = uInsane; for (;;) { char *Label; unsigned uSeqLength; char *SeqData = GetFastaSeq(f, &uSeqLength, &Label, false); if (0 == SeqData) break; AppendSeq(SeqData, uSeqLength, Label); } } void MSA::ToFASTAFile(TextFile &File) const { const unsigned uColCount = GetColCount(); assert(uColCount > 0); const unsigned uLinesPerSeq = (GetColCount() - 1)/FASTA_BLOCK + 1; const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { File.PutString(">"); File.PutString(GetSeqName(uSeqIndex)); File.PutString("\n"); unsigned n = 0; for (unsigned uLine = 0; uLine < uLinesPerSeq; ++uLine) { unsigned uLetters = uColCount - uLine*FASTA_BLOCK; if (uLetters > FASTA_BLOCK) uLetters = FASTA_BLOCK; for (unsigned i = 0; i < uLetters; ++i) { char c = GetChar(uSeqIndex, n); File.PutChar(c); ++n; } File.PutChar('\n'); } } } fastclust.cpp0000664000175000017500000000334012360262614011664 0ustar bobbob#include "muscle.h" #include "seqvect.h" #include "distfunc.h" #include "clust.h" #include "clustsetdf.h" #include "tree.h" #include "clust.h" #include "distcalc.h" #include static void TreeFromSeqVect_NJ(const DistFunc &DF, CLUSTER Cluster, Tree &tree) { ClustSetDF CSD(DF); Clust C; C.Create(CSD, Cluster); tree.FromClust(C); } static void TreeFromSeqVect_UPGMA(const DistFunc &DF, CLUSTER Cluster, Tree &tree) { LINKAGE Linkage = LINKAGE_Undefined; switch (Cluster) { case CLUSTER_UPGMA: Linkage = LINKAGE_Avg; break; case CLUSTER_UPGMAMin: Linkage = LINKAGE_Min; break; case CLUSTER_UPGMAMax: Linkage = LINKAGE_Max; break; case CLUSTER_UPGMB: Linkage = LINKAGE_Biased; break; default: 
Quit("TreeFromSeqVect_UPGMA, CLUSTER_%u not supported", Cluster); } DistCalcDF DC; DC.Init(DF); UPGMA2(DC, tree, Linkage); } static void SaveDF(const SeqVect &v, DistFunc &d, const char *FileName) { FILE *f = fopen(FileName, "w"); if (f == 0) Quit("Cannot create %s", FileName); unsigned n = v.GetSeqCount(); fprintf(f, "%u\n", n); for (unsigned i = 0; i < n; ++i) { fprintf(f, "%10.10s ", v.GetSeqName(i)); for (unsigned j = 0; j < i; ++j) fprintf(f, " %9g", d.GetDist(i, j)); fprintf(f, "\n"); } fclose(f); } void TreeFromSeqVect(const SeqVect &v, Tree &tree, CLUSTER Cluster, DISTANCE Distance, ROOT Root, const char *SaveFileName) { DistFunc DF; DistUnaligned(v, Distance, DF); if (SaveFileName != 0) SaveDF(v, DF, SaveFileName); if (CLUSTER_NeighborJoining == Cluster) TreeFromSeqVect_NJ(DF, Cluster, tree); else TreeFromSeqVect_UPGMA(DF, Cluster, tree); FixRoot(tree, Root); } fastdist.cpp0000664000175000017500000000214512360262613011476 0ustar bobbob#include "muscle.h" #include "distfunc.h" #include "seqvect.h" void DistPWScoreDist(const SeqVect &v, DistFunc &DF); void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, DistFunc &DF) { const unsigned uSeqCount = v.Length(); switch (DistMethod) { case DISTANCE_Kmer6_6: DistKmer6_6(v, DF); break; case DISTANCE_Kmer20_3: DistKmer20_3(v, DF); break; case DISTANCE_Kmer20_4: FastDistKmer(v, DF); break; case DISTANCE_Kbit20_3: DistKbit20_3(v, DF); break; case DISTANCE_Kmer4_6: DistKmer4_6(v, DF); break; case DISTANCE_PWKimura: DistPWKimura(v, DF); break; case DISTANCE_PWScoreDist: DistPWScoreDist(v, DF); break; default: Quit("DistUnaligned, unsupported distance method %d", DistMethod); } // const char **SeqNames = (const char **) malloc(uSeqCount*sizeof(char *)); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const Seq &s = *(v[uSeqIndex]); const char *ptrName = s.GetName(); unsigned uId = s.GetId(); DF.SetName(uSeqIndex, ptrName); DF.SetId(uSeqIndex, uId); } } 
fastdistjones.cpp0000664000175000017500000001356112360262614012542 0ustar bobbob#include "muscle.h" #include "distfunc.h" #include "seqvect.h" #include const unsigned TRIPLE_COUNT = 20*20*20; struct TripleCount { unsigned m_uSeqCount; // How many sequences have this triple? unsigned int *m_Counts; // m_Counts[s] = nr of times triple found in seq s }; static TripleCount *TripleCounts; // WARNING: Sequences MUST be stripped of gaps and upper case! void DistKmer20_3(const SeqVect &v, DistFunc &DF) { const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); if (0 == uSeqCount) return; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) DF.SetDist(uSeq1, uSeq2, 0); } const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount); TripleCounts = (TripleCount *) malloc(uTripleArrayBytes); if (0 == TripleCounts) Quit("Not enough memory (TripleCounts)"); memset(TripleCounts, 0, uTripleArrayBytes); for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) { TripleCount &tc = *(TripleCounts + uWord); const unsigned uBytes = uSeqCount*sizeof(int); tc.m_Counts = (unsigned int *) malloc(uBytes); memset(tc.m_Counts, 0, uBytes); } for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos) { const unsigned uLetter1 = CharToLetterEx(s[uPos]); if (uLetter1 >= 20) continue; const unsigned uLetter2 = CharToLetterEx(s[uPos+1]); if (uLetter2 >= 20) continue; const unsigned uLetter3 = CharToLetterEx(s[uPos+2]); if (uLetter3 >= 20) continue; const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20; assert(uWord < TRIPLE_COUNT); TripleCount &tc = *(TripleCounts + uWord); const unsigned uOldCount = tc.m_Counts[uSeqIndex]; if (0 == uOldCount) ++(tc.m_uSeqCount); ++(tc.m_Counts[uSeqIndex]); } } #if TRACE { Log("TripleCounts\n"); unsigned uGrandTotal = 0; for (unsigned uWord 
= 0; uWord < TRIPLE_COUNT; ++uWord) { const TripleCount &tc = *(TripleCounts + uWord); if (0 == tc.m_uSeqCount) continue; const unsigned uLetter3 = uWord/(20*20); const unsigned uLetter2 = (uWord - uLetter3*20*20)/20; const unsigned uLetter1 = uWord%20; Log("Word %6u %c%c%c %6u", uWord, LetterToCharAmino(uLetter1), LetterToCharAmino(uLetter2), LetterToCharAmino(uLetter3), tc.m_uSeqCount); unsigned uSeqCountWithThisWord = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const unsigned uCount = tc.m_Counts[uSeqIndex]; if (uCount > 0) { ++uSeqCountWithThisWord; Log(" %u=%u", uSeqIndex, uCount); uGrandTotal += uCount; } } if (uSeqCountWithThisWord != tc.m_uSeqCount) Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord); Log("\n"); } unsigned uTotalBySeqLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); uTotalBySeqLength += uSeqLength - 2; } if (uGrandTotal != uTotalBySeqLength) Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength); } #endif const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned); unsigned int *SeqList = (unsigned int *) malloc(uSeqListBytes); for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) { const TripleCount &tc = *(TripleCounts + uWord); if (0 == tc.m_uSeqCount) continue; unsigned uSeqCountFound = 0; memset(SeqList, 0, uSeqListBytes); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { if (tc.m_Counts[uSeqIndex] > 0) { SeqList[uSeqCountFound] = uSeqIndex; ++uSeqCountFound; if (uSeqCountFound == tc.m_uSeqCount) break; } } assert(uSeqCountFound == tc.m_uSeqCount); for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1) { const unsigned uSeqIndex1 = SeqList[uSeq1]; const unsigned uCount1 = tc.m_Counts[uSeqIndex1]; for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { const unsigned uSeqIndex2 = SeqList[uSeq2]; const unsigned uCount2 = tc.m_Counts[uSeqIndex2]; const unsigned uMinCount 
= uCount1 < uCount2 ? uCount1 : uCount2; const double d = DF.GetDist(uSeqIndex1, uSeqIndex2); DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount)); } } } delete[] SeqList; free(TripleCounts); unsigned uDone = 0; const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0.0); const Seq &s1 = *(v[uSeq1]); const unsigned uLength1 = s1.Length(); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { const Seq &s2 = *(v[uSeq2]); const unsigned uLength2 = s2.Length(); unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2; if (uMinLength < 3) { DF.SetDist(uSeq1, uSeq2, 1.0); continue; } const double dTripleCount = DF.GetDist(uSeq1, uSeq2); if (dTripleCount == 0) { DF.SetDist(uSeq1, uSeq2, 1.0); continue; } double dNormalizedTripletScore = dTripleCount/(uMinLength - 2); //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore)); //if (dEstimatedPairwiseIdentity > 1) // dEstimatedPairwiseIdentity = 1; // DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity)); DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore); #if TRACE { Log("%s - %s Triplet count = %g Lengths %u, %u Estimated pwid = %g\n", s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2, dEstimatedPairwiseIdentity); } #endif if (uDone%1000 == 0) Progress(uDone, uTotal); } } ProgressStepsDone(); } fastdistkbit.cpp0000664000175000017500000000540412360262614012352 0ustar bobbob#include "muscle.h" #include "distfunc.h" #include "seqvect.h" #include #define MIN(x, y) ((x) < (y) ? 
(x) : (y)) static void SetKmerBitVector(const Seq &s, byte Bits[]) { const unsigned uLength = s.Length(); const unsigned k = 3; // kmer length unsigned i = 0; unsigned c = 0; unsigned h = 0; for (unsigned j = 0; j < k - 1; ++j) { unsigned x = CharToLetterEx(s[i++]); if (x <= AX_Y) c = c*20 + x; else { c = 0; h = j + 1; } } for ( ; i < uLength; ++i) { unsigned x = CharToLetterEx(s[i++]); if (x <= AX_Y) c = (c*20 + x)%8000; else { c = 0; h = i + k; } if (i >= h) { unsigned ByteOffset = c/8; unsigned BitOffset = c%8; Bits[ByteOffset] |= (1 << BitOffset); } } } static unsigned CommonBitCount(const byte Bits1[], const byte Bits2[]) { const byte * const p1end = Bits1 + 1000; const byte *p2 = Bits2; unsigned uCount = 0; for (const byte *p1 = Bits1; p1 != p1end; ++p1) { // Here is a cute trick for efficiently counting the // bits common between two bytes by combining them into // a single word. unsigned b = *p1 | (*p2 << 8); while (b != 0) { if (b & 0x101) ++uCount; b >>= 1; } ++p2; } return uCount; } void DistKbit20_3(const SeqVect &v, DistFunc &DF) { const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); // There are 20^3 = 8,000 distinct kmers in the 20-letter alphabet. // For each sequence, we create a bit vector of length 8,000, i.e. // 1,000 bytes, having one bit per kmer. The bit is set to 1 if the // kmer is present in the sequence. 
const unsigned uBytes = uSeqCount*1000; byte *BitVector = new byte[uBytes]; memset(BitVector, 0, uBytes); SetProgressDesc("K-bit distance matrix"); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) SetKmerBitVector(*v[uSeqIndex], BitVector + uSeqIndex*1000); unsigned uDone = 0; const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const byte *Bits1 = BitVector + uSeqIndex1*1000; const unsigned uLength1 = v[uSeqIndex1]->Length(); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) { const byte *Bits2 = BitVector + uSeqIndex2*1000; const unsigned uLength2 = v[uSeqIndex2]->Length(); const float fCount = (float) CommonBitCount(Bits1, Bits2); // Distance measure = K / min(L1, L2) // K is number of distinct kmers that are found in both sequences const float fDist = fCount / MIN(uLength1, uLength2); DF.SetDist(uSeqIndex1, uSeqIndex2, fDist); if (uDone%10000 == 0) Progress(uDone, uTotal); ++uDone; } } ProgressStepsDone(); delete[] BitVector; } fastdistkmer.cpp0000664000175000017500000001455512360262614012366 0ustar bobbob#include "muscle.h" #include "msa.h" #include "seqvect.h" #include "seq.h" #include "distfunc.h" #include #define TRACE 0 /*** Some candidate alphabets considered because they have high correlations and small table sizes. Correlation coefficent is between k-mer distance and %id D measured from a CLUSTALW alignment. Table size is N^k where N is size of alphabet. A is standard (uncompressed) amino alphabet. Correlation Alpha N k Table Size all 25-50% ----- -- - ---------- ---- ------ A 20 3 8,000 0.943 0.575 A 20 4 160,000 0.962 0.685 << LiA 14 4 38,416 0.966 0.645 SEB 14 4 38,416 0.964 0.634 LiA 13 4 28,561 0.965 0.640 LiA 12 4 20,736 0.963 0.620 LiA 10 5 100,000 0.964 0.652 We select A with k=4 because it has the best correlations. 
The only drawback is a large table size, but space is readily available and the only additional time cost is in resetting the table to zero, which can be done quickly with memset or by keeping a list of the k-mers that were found (should test to see which is faster, and may vary by compiler and processor type). It also has the minor advantage that we don't need to convert the alphabet. Fractional identity d is estimated as follows. F = fractional k-mer count if F is 0: F = 0.01 Y = log(0.02 + F) d = -4.1 + 4.12*Y The constant 0.02 was chosen to make the relationship between Y and D linear. The constants -4.1 and 4.12 were chosen to fit a straight line to the scatterplot of Y vs D. ***/ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) const unsigned K = 4; const unsigned N = 20; const unsigned N_2 = 20*20; const unsigned N_3 = 20*20*20; const unsigned N_4 = 20*20*20*20; const unsigned TABLE_SIZE = N_4; // For debug output const char *KmerToStr(unsigned Kmer) { static char s[5]; unsigned c3 = (Kmer/N_3)%N; unsigned c2 = (Kmer/N_2)%N; unsigned c1 = (Kmer/N)%N; unsigned c0 = Kmer%N; s[0] = LetterToChar(c3); s[1] = LetterToChar(c2); s[2] = LetterToChar(c1); s[3] = LetterToChar(c0); return s; } void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[]) { #if TRACE Log("CountKmers\n"); #endif memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte)); const byte *ptrKmerStart = s; const byte *ptrKmerEnd = s + 4; const byte *ptrSeqEnd = s + uSeqLength; unsigned c3 = s[0]*N_3; unsigned c2 = s[1]*N_2; unsigned c1 = s[2]*N; unsigned c0 = s[3]; unsigned Kmer = c3 + c2 + c1 + c0; for (;;) { assert(Kmer < TABLE_SIZE); #if TRACE Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer)); #endif ++(KmerCounts[Kmer]); if (ptrKmerEnd == ptrSeqEnd) break; // Compute k-mer as function of previous k-mer: // 1. Subtract first letter from previous k-mer. // 2. Multiply by N. // 3. Add next letter. 
c3 = (*ptrKmerStart++) * N_3; Kmer = (Kmer - c3)*N; Kmer += *ptrKmerEnd++; } } unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength, const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2) { byte KmerCounts2[TABLE_SIZE]; CountKmers(Seq2, uSeqLength2, KmerCounts2); const byte *ptrKmerStart = Seq; const byte *ptrKmerEnd = Seq + 4; const byte *ptrSeqEnd = Seq + uSeqLength; unsigned c3 = Seq[0]*N_3; unsigned c2 = Seq[1]*N_2; unsigned c1 = Seq[2]*N; unsigned c0 = Seq[3]; unsigned Kmer = c3 + c2 + c1 + c0; unsigned uCommonCount = 0; for (;;) { assert(Kmer < TABLE_SIZE); const byte Count1 = KmerCounts1[Kmer]; const byte Count2 = KmerCounts2[Kmer]; uCommonCount += MIN(Count1, Count2); // Hack so we don't double-count KmerCounts2[Kmer] = 0; if (ptrKmerEnd == ptrSeqEnd) break; // Compute k-mer as function of previous k-mer: // 1. Subtract first letter from previous k-mer. // 2. Multiply by N. // 3. Add next letter. c3 = (*ptrKmerStart++) * N_3; Kmer = (Kmer - c3)*N; Kmer += *ptrKmerEnd++; } return uCommonCount; } static void SeqToLetters(const Seq &s, byte Letters[]) { const unsigned uSeqLength = s.Length(); for (unsigned uCol = 0; uCol < uSeqLength; ++uCol) { char c = s.GetChar(uCol); // Ugly hack. My k-mer counting code isn't wild-card // aware. Arbitrarily replace wildcards by a specific // amino acid. 
if (IsWildcardChar(c)) c = 'A'; *Letters++ = CharToLetter(c); } } void FastDistKmer(const SeqVect &v, DistFunc &DF) { byte KmerCounts[TABLE_SIZE]; const unsigned uSeqCount = v.GetSeqCount(); DF.SetCount(uSeqCount); if (0 == uSeqCount) return; // Initialize distance matrix to zero for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) DF.SetDist(uSeq1, uSeq2, 0); } unsigned uMaxLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const Seq &s = v.GetSeq(uSeqIndex); unsigned uSeqLength = s.Length(); if (uSeqLength > uMaxLength) uMaxLength = uSeqLength; } if (0 == uMaxLength) return; byte *Seq1Letters = new byte[uMaxLength]; byte *Seq2Letters = new byte[uMaxLength]; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1) { const Seq &s1 = v.GetSeq(uSeqIndex1); const unsigned uSeqLength1 = s1.Length(); SeqToLetters(s1, Seq1Letters); CountKmers(Seq1Letters, uSeqLength1, KmerCounts); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { const Seq &s2 = v.GetSeq(uSeqIndex2); const unsigned uSeqLength2 = s2.Length(); SeqToLetters(s2, Seq2Letters); unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1, KmerCounts, Seq2Letters, uSeqLength2); unsigned uMinLength = MIN(uSeqLength1, uSeqLength2); double F = (double) uCommonKmerCount / (uMinLength - K + 1); if (0.0 == F) F = 0.01; double Y = log(0.02 + F); double EstimatedPctId = Y/4.12 + 0.995; double KD = KimuraDist(EstimatedPctId); // DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD); DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F)); #if TRACE Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n", uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD); #endif } } delete[] Seq1Letters; delete[] Seq2Letters; } fastdistmafft.cpp0000664000175000017500000001651712360262614012525 0ustar bobbob#include "muscle.h" #include "distfunc.h" 
#include "seqvect.h"
#include <math.h>

#define TRACE 0

#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))

// Number of distinct 6-tuples over 6 residue groups.
const unsigned TUPLE_COUNT = 6*6*6*6*6*6;
static unsigned char Count1[TUPLE_COUNT];
static unsigned char Count2[TUPLE_COUNT];

// Amino acid groups according to MAFFT (sextet5)
// 0 = A G P S T
// 1 = I L M V
// 2 = N D Q E B Z
// 3 = R H K
// 4 = F W Y
// 5 = C
// 6 = X . - U
unsigned ResidueGroup[] =
{
    0,      // AX_A,
    5,      // AX_C,
    2,      // AX_D,
    2,      // AX_E,
    4,      // AX_F,
    0,      // AX_G,
    3,      // AX_H,
    1,      // AX_I,
    3,      // AX_K,
    1,      // AX_L,
    1,      // AX_M,
    2,      // AX_N,
    0,      // AX_P,
    2,      // AX_Q,
    3,      // AX_R,
    0,      // AX_S,
    0,      // AX_T,
    1,      // AX_V,
    4,      // AX_W,
    4,      // AX_Y,

    2,      // AX_B, // D or N
    2,      // AX_Z, // E or Q
    0,      // AX_X, // Unknown

    // ******** TODO *************
    // This isn't the correct way of avoiding group 6
    0       // AX_GAP,
    // ******** TODO ******************
};
unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);

// Render tuple code t as six base-6 digits, most significant first.
// Returns a pointer to a static buffer that the next call overwrites.
static char *TupleToStr(int t)
{
    // Seven bytes: six digits plus the terminator, which stays zero
    // because the buffer has static storage and s[6] is never written.
    static char s[7];
    int t1, t2, t3, t4, t5, t6;

    t1 = t%6;
    t2 = (t/6)%6;
    t3 = (t/(6*6))%6;
    t4 = (t/(6*6*6))%6;
    t5 = (t/(6*6*6*6))%6;
    t6 = (t/(6*6*6*6*6))%6;

    s[5] = '0' + t1;
    s[4] = '0' + t2;
    s[3] = '0' + t3;
    s[2] = '0' + t4;
    s[1] = '0' + t5;
    s[0] = '0' + t6;
    return s;
}

// Encode the six letters uLetters[n..n+5] as a base-6 number of their
// residue groups, with the first letter most significant.
static unsigned GetTuple(const unsigned uLetters[], unsigned n)
{
    assert(uLetters[n] < uResidueGroupCount);
    assert(uLetters[n+1] < uResidueGroupCount);
    assert(uLetters[n+2] < uResidueGroupCount);
    assert(uLetters[n+3] < uResidueGroupCount);
    assert(uLetters[n+4] < uResidueGroupCount);
    assert(uLetters[n+5] < uResidueGroupCount);

    unsigned u1 = ResidueGroup[uLetters[n]];
    unsigned u2 = ResidueGroup[uLetters[n+1]];
    unsigned u3 = ResidueGroup[uLetters[n+2]];
    unsigned u4 = ResidueGroup[uLetters[n+3]];
    unsigned u5 = ResidueGroup[uLetters[n+4]];
    unsigned u6 = ResidueGroup[uLetters[n+5]];

    return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6;
}

// Zero Count (TUPLE_COUNT entries) and tally the tuples that start at
// positions 0..uTupleCount-1 of L.
static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[])
{
    memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));
    for (unsigned n = 0; n < uTupleCount; ++n)
    {
        const unsigned uTuple = GetTuple(L, n);
        ++(Count[uTuple]);
    }
}

// Log every tuple with a non-zero count.
static void ListCount(const unsigned char Count[])
{
    for (unsigned n = 0; n < TUPLE_COUNT; ++n)
    {
        if (0 == Count[n])
            continue;
        Log("%s  %u\n", TupleToStr(n), Count[n]);
    }
}

// Fill DF with 6-tuple distances over the residue groups above.
//
// Pass 1 counts, for every pair (including each sequence with itself),
// the tuples the two sequences share. Pass 2 turns the counts into
// distances: 3*(self count - shared count)/self count is computed from
// each sequence's point of view and the smaller value is stored.
void DistKmer6_6(const SeqVect &v, DistFunc &DF)
{
    const unsigned uSeqCount = v.Length();

    DF.SetCount(uSeqCount);
    if (0 == uSeqCount)
        return;

    // Initialize distance matrix to zero
    for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
    {
        DF.SetDist(uSeq1, uSeq1, 0);
        for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
            DF.SetDist(uSeq1, uSeq2, 0);
    }

    // Convert to letters
    unsigned **Letters = new unsigned *[uSeqCount];
    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
    {
        Seq &s = *(v[uSeqIndex]);
        const unsigned uSeqLength = s.Length();
        unsigned *L = new unsigned[uSeqLength];
        Letters[uSeqIndex] = L;
        for (unsigned n = 0; n < uSeqLength; ++n)
        {
            char c = s[n];
            L[n] = CharToLetterEx(c);
            assert(L[n] < uResidueGroupCount);
        }
    }

    unsigned **uCommonTupleCount = new unsigned *[uSeqCount];
    for (unsigned n = 0; n < uSeqCount; ++n)
    {
        uCommonTupleCount[n] = new unsigned[uSeqCount];
        memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));
    }

    const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
    unsigned uCount = 0;
    for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
    {
        Seq &seq1 = *(v[uSeq1]);
        const unsigned uSeqLength1 = seq1.Length();
        if (uSeqLength1 < 5)
            continue;

        // A sequence of length n has n - 5 six-letter windows.
        const unsigned uTupleCount = uSeqLength1 - 5;
        const unsigned *L = Letters[uSeq1];
        CountTuples(L, uTupleCount, Count1);
#if TRACE
        {
        Log("Seq1=%d\n", uSeq1);
        Log("Groups:\n");
        for (unsigned n = 0; n < uSeqLength1; ++n)
            Log("%u", ResidueGroup[L[n]]);
        Log("\n");

        Log("Tuples:\n");
        ListCount(Count1);
        }
#endif

        SetProgressDesc("K-mer dist pass 1");
        for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)
        {
            if (0 == uCount%500)
                Progress(uCount, uPairCount);
            ++uCount;
            Seq &seq2 = *(v[uSeq2]);
            const unsigned uSeqLength2 = seq2.Length();
            if (uSeqLength2 < 5)
            {
                if (uSeq1 == uSeq2)
                    DF.SetDist(uSeq1, uSeq2, 0);
                else
                    DF.SetDist(uSeq1, uSeq2, 1);
                continue;
            }

            // First pass through seq 2 to count tuples
            const unsigned uTupleCount = uSeqLength2 - 5;
            const unsigned *L = Letters[uSeq2];
            CountTuples(L, uTupleCount, Count2);
#if TRACE
            Log("Seq2=%d Counts=\n", uSeq2);
            ListCount(Count2);
#endif

            // Second pass to accumulate sum of shared tuples
            // MAFFT defines this as the sum over unique tuples
            // in seq2 of the minimum of the number of tuples found
            // in the two sequences.
            unsigned uSum = 0;
            for (unsigned n = 0; n < uTupleCount; ++n)
            {
                const unsigned uTuple = GetTuple(L, n);
                uSum += MIN(Count1[uTuple], Count2[uTuple]);

                // This is a hack to make sure each unique tuple counted only once.
                Count2[uTuple] = 0;
            }
#if TRACE
            {
            Seq &s1 = *(v[uSeq1]);
            Seq &s2 = *(v[uSeq2]);
            const char *pName1 = s1.GetName();
            const char *pName2 = s2.GetName();
            Log("Common count %s(%d) - %s(%d) =%u\n",
              pName1, uSeq1, pName2, uSeq2, uSum);
            }
#endif
            uCommonTupleCount[uSeq1][uSeq2] = uSum;
            uCommonTupleCount[uSeq2][uSeq1] = uSum;
        }
    }
    ProgressStepsDone();

    uCount = 0;
    SetProgressDesc("K-mer dist pass 2");
    for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
    {
        Seq &s1 = *(v[uSeq1]);
        const char *pName1 = s1.GetName();

        // Self count is the normaliser; use 1 to avoid dividing by zero.
        double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];
        if (0 == dCommonTupleCount11)
            dCommonTupleCount11 = 1;

        DF.SetDist(uSeq1, uSeq1, 0);
        for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
        {
            if (0 == uCount%500)
                Progress(uCount, uPairCount);
            ++uCount;

            double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];
            if (0 == dCommonTupleCount22)
                dCommonTupleCount22 = 1;

            const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])
              /dCommonTupleCount11;
            const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])
              /dCommonTupleCount22;

            // dMinDist is the value used for tree-building in MAFFT
            const double dMinDist = MIN(dDist1, dDist2);
            DF.SetDist(uSeq1, uSeq2, (float) dMinDist);

            //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);
            //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);
            // **** TODO **** why does this make score slightly worse??
            //const double dKimuraDist = KimuraDist(dEstimatedPctId);
            //DF.SetDist(uSeq1, uSeq2, dKimuraDist);
        }
    }
    ProgressStepsDone();

    // Release the per-sequence rows as well as the row tables; freeing
    // only Letters itself would leak every letter array allocated above.
    for (unsigned n = 0; n < uSeqCount; ++n)
    {
        delete[] uCommonTupleCount[n];
        delete[] Letters[n];
    }
    delete[] uCommonTupleCount;
    delete[] Letters;
}

// Distance as -log(fractional identity); identities below 0.05 are
// clamped to 0.05 so the result is bounded.
double PctIdToMAFFTDist(double dPctId)
{
    if (dPctId < 0.05)
        dPctId = 0.05;
    double dDist = -log(dPctId);
    return dDist;
}

// Tree height for a given identity; same value as PctIdToMAFFTDist.
double PctIdToHeightMAFFT(double dPctId)
{
    return PctIdToMAFFTDist(dPctId);
}
fastdistnuc.cpp0000664000175000017500000001546712360262613012217 0ustar bobbob
#include "muscle.h"
#include "distfunc.h"
#include "seqvect.h"
#include <math.h>

#define TRACE 0

#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))

const unsigned TUPLE_COUNT = 6*6*6*6*6*6;
static unsigned char Count1[TUPLE_COUNT];
static unsigned char Count2[TUPLE_COUNT];

// Nucleotide groups according to MAFFT (sextet5)
// 0 = A
// 1 = C
// 2 = G
// 3 = T
// 4 = other
static unsigned ResidueGroup[] =
{
    0,      // NX_A,
    1,      // NX_C,
    2,      // NX_G,
    3,      // NX_T/U
    4,      // NX_N,
    4,      // NX_R,
    4,      // NX_Y,
    4,      // NX_GAP
};
static unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);

// Render tuple code t as six base-6 digits, most significant first.
// Returns a pointer to a static buffer that the next call overwrites.
static char *TupleToStr(int t)
{
    static char s[7];
    int t1, t2, t3, t4, t5, t6;

    t1 = t%6;
    t2 = (t/6)%6;
    t3 = (t/(6*6))%6;
    t4 = (t/(6*6*6))%6;
    t5 = (t/(6*6*6*6))%6;
    t6 = (t/(6*6*6*6*6))%6;

    s[5] = '0' + t1;
    s[4] = '0' + t2;
    s[3] = '0' + t3;
    s[2] = '0' + t4;
    s[1] = '0' + t5;
    s[0] = '0' + t6;
    return s;
}

static unsigned GetTuple(const unsigned uLetters[], unsigned n)
{
    assert(uLetters[n] < uResidueGroupCount);
    assert(uLetters[n+1] < uResidueGroupCount);
    assert(uLetters[n+2] < uResidueGroupCount);
    assert(uLetters[n+3] < uResidueGroupCount);
    assert(uLetters[n+4] < uResidueGroupCount);
    assert(uLetters[n+5] < uResidueGroupCount);
unsigned u1 = ResidueGroup[uLetters[n]]; unsigned u2 = ResidueGroup[uLetters[n+1]]; unsigned u3 = ResidueGroup[uLetters[n+2]]; unsigned u4 = ResidueGroup[uLetters[n+3]]; unsigned u5 = ResidueGroup[uLetters[n+4]]; unsigned u6 = ResidueGroup[uLetters[n+5]]; return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6; } static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[]) { memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char)); for (unsigned n = 0; n < uTupleCount; ++n) { const unsigned uTuple = GetTuple(L, n); ++(Count[uTuple]); } } static void ListCount(const unsigned char Count[]) { for (unsigned n = 0; n < TUPLE_COUNT; ++n) { if (0 == Count[n]) continue; Log("%s %u\n", TupleToStr(n), Count[n]); } } void DistKmer4_6(const SeqVect &v, DistFunc &DF) { if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha) Quit("DistKmer4_6 requires nucleo alphabet"); const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); if (0 == uSeqCount) return; // Initialize distance matrix to zero for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) DF.SetDist(uSeq1, uSeq2, 0); } // Convert to letters unsigned **Letters = new unsigned *[uSeqCount]; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); unsigned *L = new unsigned[uSeqLength]; Letters[uSeqIndex] = L; for (unsigned n = 0; n < uSeqLength; ++n) { char c = s[n]; L[n] = CharToLetterEx(c); if (L[n] >= 4) L[n] = 4; } } unsigned **uCommonTupleCount = new unsigned *[uSeqCount]; for (unsigned n = 0; n < uSeqCount; ++n) { uCommonTupleCount[n] = new unsigned[uSeqCount]; memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned)); } const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; unsigned uCount = 0; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { Seq &seq1 = *(v[uSeq1]); const unsigned uSeqLength1 = seq1.Length(); if 
(uSeqLength1 < 5) continue; const unsigned uTupleCount = uSeqLength1 - 5; const unsigned *L = Letters[uSeq1]; CountTuples(L, uTupleCount, Count1); #if TRACE { Log("Seq1=%d\n", uSeq1); Log("Groups:\n"); for (unsigned n = 0; n < uSeqLength1; ++n) Log("%u", ResidueGroup[L[n]]); Log("\n"); Log("Tuples:\n"); ListCount(Count1); } #endif SetProgressDesc("K-mer dist pass 1"); for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2) { if (0 == uCount%500) Progress(uCount, uPairCount); ++uCount; Seq &seq2 = *(v[uSeq2]); const unsigned uSeqLength2 = seq2.Length(); if (uSeqLength2 < 5) { if (uSeq1 == uSeq2) DF.SetDist(uSeq1, uSeq2, 0); else DF.SetDist(uSeq1, uSeq2, 1); continue; } // First pass through seq 2 to count tuples const unsigned uTupleCount = uSeqLength2 - 5; const unsigned *L = Letters[uSeq2]; CountTuples(L, uTupleCount, Count2); #if TRACE Log("Seq2=%d Counts=\n", uSeq2); ListCount(Count2); #endif // Second pass to accumulate sum of shared tuples // MAFFT defines this as the sum over unique tuples // in seq2 of the minimum of the number of tuples found // in the two sequences. unsigned uSum = 0; for (unsigned n = 0; n < uTupleCount; ++n) { const unsigned uTuple = GetTuple(L, n); uSum += MIN(Count1[uTuple], Count2[uTuple]); // This is a hack to make sure each unique tuple counted only once. 
Count2[uTuple] = 0; } #if TRACE { Seq &s1 = *(v[uSeq1]); Seq &s2 = *(v[uSeq2]); const char *pName1 = s1.GetName(); const char *pName2 = s2.GetName(); Log("Common count %s(%d) - %s(%d) =%u\n", pName1, uSeq1, pName2, uSeq2, uSum); } #endif uCommonTupleCount[uSeq1][uSeq2] = uSum; uCommonTupleCount[uSeq2][uSeq1] = uSum; } } ProgressStepsDone(); uCount = 0; SetProgressDesc("K-mer dist pass 2"); for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { Seq &s1 = *(v[uSeq1]); const char *pName1 = s1.GetName(); double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1]; if (0 == dCommonTupleCount11) dCommonTupleCount11 = 1; DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { if (0 == uCount%500) Progress(uCount, uPairCount); ++uCount; double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2]; if (0 == dCommonTupleCount22) dCommonTupleCount22 = 1; const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2]) /dCommonTupleCount11; const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2]) /dCommonTupleCount22; // dMinDist is the value used for tree-building in MAFFT const double dMinDist = MIN(dDist1, dDist2); DF.SetDist(uSeq1, uSeq2, (float) dMinDist); //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist); //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId); // **** TODO **** why does this make score slightly worse?? 
//const double dKimuraDist = KimuraDist(dEstimatedPctId); //DF.SetDist(uSeq1, uSeq2, dKimuraDist); } } ProgressStepsDone(); for (unsigned n = 0; n < uSeqCount; ++n) { delete[] uCommonTupleCount[n]; delete[] Letters[n]; } delete[] uCommonTupleCount; delete[] Letters; } fastscorepath2.cpp0000664000175000017500000000673412360262614012616 0ustar bobbob#include "muscle.h" #include "profile.h" #include "pwpath.h" SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const PWPath &Path) { const unsigned uEdgeCount = Path.GetEdgeCount(); Log("Edge SS PLA PLB Match Gap Total\n"); Log("---- -- --- --- ----- --- -----\n"); char cType = 'S'; SCORE scoreTotal = 0; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); const char cPrevType = cType; cType = Edge.cType; const unsigned uPrefixLengthA = Edge.uPrefixLengthA; const unsigned uPrefixLengthB = Edge.uPrefixLengthB; bool bGap = false; bool bMatch = false; SCORE scoreGap = 0; SCORE scoreMatch = 0; switch (cType) { case 'M': { if (0 == uPrefixLengthA || 0 == uPrefixLengthB) Quit("FastScorePath2, M zero length"); const ProfPos &PPA = PA[uPrefixLengthA - 1]; const ProfPos &PPB = PB[uPrefixLengthB - 1]; bMatch = true; scoreMatch = ScoreProfPos2(PPA, PPB); if ('D' == cPrevType) { bGap = true; assert(uPrefixLengthA > 1); scoreGap = PA[uPrefixLengthA-2].m_scoreGapClose; } else if ('I' == cPrevType) { bGap = true; assert(uPrefixLengthB > 1); scoreGap = PB[uPrefixLengthB-2].m_scoreGapClose; } break; } case 'D': { if (0 == uPrefixLengthA) Quit("FastScorePath2, D zero length"); const ProfPos &PPA = PA[uPrefixLengthA - 1]; bGap = true; switch (cPrevType) { case 'S': scoreGap = PPA.m_scoreGapOpen; break; case 'M': scoreGap = PPA.m_scoreGapOpen; break; case 'D': // scoreGap = g_scoreGapExtend; scoreGap = 0; break; case 'I': Quit("FastScorePath2 DI"); } break; } case 'I': { if (0 == uPrefixLengthB) Quit("FastScorePath2, I zero 
length"); const ProfPos &PPB = PB[uPrefixLengthB - 1]; bGap = true; switch (cPrevType) { case 'S': scoreGap = PPB.m_scoreGapOpen; break; case 'M': scoreGap = PPB.m_scoreGapOpen; break; case 'I': scoreGap = 0; // scoreGap = g_scoreGapExtend; break; case 'D': Quit("FastScorePath2 DI"); } break; } case 'U': { Quit("FastScorePath2 U"); } default: Quit("FastScorePath2: invalid type %c", cType); } Log("%4u %c%c %4u %4u ", uEdgeIndex, cPrevType, cType, uPrefixLengthA, uPrefixLengthB); if (bMatch) Log("%7.1f ", scoreMatch); else Log(" "); if (bGap) Log("%7.1f ", scoreGap); else Log(" "); SCORE scoreEdge = scoreMatch + scoreGap; scoreTotal += scoreEdge; Log("%7.1f %7.1f", scoreEdge, scoreTotal); Log("\n"); } SCORE scoreGap = 0; // if (!g_bTermGapsHalf) switch (cType) { case 'M': scoreGap = 0; break; case 'D': { const ProfPos &LastPPA = PA[uLengthA - 1]; scoreGap = LastPPA.m_scoreGapClose; break; } case 'I': { const ProfPos &LastPPB = PB[uLengthB - 1]; scoreGap = LastPPB.m_scoreGapClose; break; } case 'U': Quit("Unaligned regions not supported"); case 'S': break; default: Quit("Invalid type %c", cType); } Log(" %cE %4u %4u %7.1f\n", cType, uLengthA, uLengthB, scoreGap); scoreTotal += scoreGap; Log("Total = %g\n", scoreTotal); return scoreTotal; } finddiags.cpp0000664000175000017500000000711112360262613011603 0ustar bobbob#include "muscle.h" #include "profile.h" #include "diaglist.h" #define TRACE 0 const unsigned KTUP = 5; const unsigned KTUPS = 6*6*6*6*6; static unsigned TuplePos[KTUPS]; static char *TupleToStr(int t) { static char s[7]; int t1, t2, t3, t4, t5; t1 = t%6; t2 = (t/6)%6; t3 = (t/(6*6))%6; t4 = (t/(6*6*6))%6; t5 = (t/(6*6*6*6))%6; s[4] = '0' + t1; s[3] = '0' + t2; s[2] = '0' + t3; s[1] = '0' + t4; s[0] = '0' + t5; return s; } static unsigned GetTuple(const ProfPos *PP, unsigned uPos) { const unsigned t0 = PP[uPos].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == t0) return EMPTY; const unsigned t1 = PP[uPos+1].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == t1) 
return EMPTY; const unsigned t2 = PP[uPos+2].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == t2) return EMPTY; const unsigned t3 = PP[uPos+3].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == t3) return EMPTY; const unsigned t4 = PP[uPos+4].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == t4) return EMPTY; return t0 + t1*6 + t2*6*6 + t3*6*6*6 + t4*6*6*6*6; } void FindDiags(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY, unsigned uLengthY, DiagList &DL) { if (ALPHA_Amino != g_Alpha) Quit("FindDiags: requires amino acid alphabet"); DL.Clear(); if (uLengthX < 12 || uLengthY < 12) return; // Set A to shorter profile, B to longer const ProfPos *PA; const ProfPos *PB; unsigned uLengthA; unsigned uLengthB; bool bSwap; if (uLengthX < uLengthY) { bSwap = false; PA = PX; PB = PY; uLengthA = uLengthX; uLengthB = uLengthY; } else { bSwap = true; PA = PY; PB = PX; uLengthA = uLengthY; uLengthB = uLengthX; } // Build tuple map for the longer profile, B if (uLengthB < KTUP) Quit("FindDiags: profile too int"); memset(TuplePos, EMPTY, sizeof(TuplePos)); for (unsigned uPos = 0; uPos < uLengthB - KTUP; ++uPos) { const unsigned uTuple = GetTuple(PB, uPos); if (EMPTY == uTuple) continue; TuplePos[uTuple] = uPos; } // Find matches for (unsigned uPosA = 0; uPosA < uLengthA - KTUP; ++uPosA) { const unsigned uTuple = GetTuple(PA, uPosA); if (EMPTY == uTuple) continue; const unsigned uPosB = TuplePos[uTuple]; if (EMPTY == uPosB) continue; // This tuple is found in both profiles unsigned uStartPosA = uPosA; unsigned uStartPosB = uPosB; // Try to extend the match forwards unsigned uEndPosA = uPosA + KTUP - 1; unsigned uEndPosB = uPosB + KTUP - 1; for (;;) { if (uLengthA - 1 == uEndPosA || uLengthB - 1 == uEndPosB) break; const unsigned uAAGroupA = PA[uEndPosA+1].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == uAAGroupA) break; const unsigned uAAGroupB = PB[uEndPosB+1].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == uAAGroupB) break; if (uAAGroupA != uAAGroupB) break; ++uEndPosA; ++uEndPosB; 
} uPosA = uEndPosA; #if TRACE { Log("Match: A %4u-%4u ", uStartPosA, uEndPosA); for (unsigned n = uStartPosA; n <= uEndPosA; ++n) Log("%c", 'A' + PA[n].m_uResidueGroup); Log("\n"); Log(" B %4u-%4u ", uStartPosB, uEndPosB); for (unsigned n = uStartPosB; n <= uEndPosB; ++n) Log("%c", 'A' + PB[n].m_uResidueGroup); Log("\n"); } #endif const unsigned uLength = uEndPosA - uStartPosA + 1; assert(uEndPosB - uStartPosB + 1 == uLength); if (uLength >= g_uMinDiagLength) { if (bSwap) DL.Add(uStartPosB, uStartPosA, uLength); else DL.Add(uStartPosA, uStartPosB, uLength); } } } finddiagsn.cpp0000664000175000017500000000657712360262614012001 0ustar bobbob#include "muscle.h" #include "profile.h" #include "diaglist.h" #define TRACE 0 #define pow4(i) (1 << (2*i)) // 4^i = 2^(2*i) const unsigned K = 7; const unsigned KTUPS = pow4(K); static unsigned TuplePos[KTUPS]; static char *TupleToStr(int t) { static char s[K]; for (int i = 0; i < K; ++i) { unsigned Letter = (t/(pow4(i)))%4; assert(Letter >= 0 && Letter < 4); s[K-i-1] = LetterToChar(Letter); } return s; } static unsigned GetTuple(const ProfPos *PP, unsigned uPos) { unsigned t = 0; for (unsigned i = 0; i < K; ++i) { const unsigned uLetter = PP[uPos+i].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == uLetter) return EMPTY; t = t*4 + uLetter; } return t; } void FindDiagsNuc(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY, unsigned uLengthY, DiagList &DL) { if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha) Quit("FindDiagsNuc: requires nucleo alphabet"); DL.Clear(); // 16 is arbitrary slop, no principled reason for this. 
if (uLengthX < K + 16 || uLengthY < K + 16) return; // Set A to shorter profile, B to longer const ProfPos *PA; const ProfPos *PB; unsigned uLengthA; unsigned uLengthB; bool bSwap; if (uLengthX < uLengthY) { bSwap = false; PA = PX; PB = PY; uLengthA = uLengthX; uLengthB = uLengthY; } else { bSwap = true; PA = PY; PB = PX; uLengthA = uLengthY; uLengthB = uLengthX; } #if TRACE Log("FindDiagsNuc(LengthA=%d LengthB=%d\n", uLengthA, uLengthB); #endif // Build tuple map for the longer profile, B if (uLengthB < K) Quit("FindDiags: profile too int"); memset(TuplePos, EMPTY, sizeof(TuplePos)); for (unsigned uPos = 0; uPos < uLengthB - K; ++uPos) { const unsigned uTuple = GetTuple(PB, uPos); if (EMPTY == uTuple) continue; TuplePos[uTuple] = uPos; } // Find matches for (unsigned uPosA = 0; uPosA < uLengthA - K; ++uPosA) { const unsigned uTuple = GetTuple(PA, uPosA); if (EMPTY == uTuple) continue; const unsigned uPosB = TuplePos[uTuple]; if (EMPTY == uPosB) continue; // This tuple is found in both profiles unsigned uStartPosA = uPosA; unsigned uStartPosB = uPosB; // Try to extend the match forwards unsigned uEndPosA = uPosA + K - 1; unsigned uEndPosB = uPosB + K - 1; for (;;) { if (uLengthA - 1 == uEndPosA || uLengthB - 1 == uEndPosB) break; const unsigned uAAGroupA = PA[uEndPosA+1].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == uAAGroupA) break; const unsigned uAAGroupB = PB[uEndPosB+1].m_uResidueGroup; if (RESIDUE_GROUP_MULTIPLE == uAAGroupB) break; if (uAAGroupA != uAAGroupB) break; ++uEndPosA; ++uEndPosB; } uPosA = uEndPosA; #if TRACE { Log("Match: A %4u-%4u ", uStartPosA, uEndPosA); for (unsigned n = uStartPosA; n <= uEndPosA; ++n) Log("%c", LetterToChar(PA[n].m_uResidueGroup)); Log("\n"); Log(" B %4u-%4u ", uStartPosB, uEndPosB); for (unsigned n = uStartPosB; n <= uEndPosB; ++n) Log("%c", LetterToChar(PB[n].m_uResidueGroup)); Log("\n"); } #endif const unsigned uLength = uEndPosA - uStartPosA + 1; assert(uEndPosB - uStartPosB + 1 == uLength); if (uLength >= 
g_uMinDiagLength) { if (bSwap) DL.Add(uStartPosB, uStartPosA, uLength); else DL.Add(uStartPosA, uStartPosB, uLength); } } } glbalign352.cpp0000664000175000017500000000226412360262614011671 0ustar bobbob#include "muscle.h" #include "pwpath.h" #include "timing.h" #include "textfile.h" #include "msa.h" #include "profile.h" #if VER_3_52 #if TIMING TICKS g_ticksDP = 0; #endif SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { #if TIMING TICKS t1 = GetClockTicks(); #endif SCORE Score = 0; if (g_bDiags) Score = GlobalAlignDiags(PA, uLengthA, PB, uLengthB, Path); else Score = GlobalAlignNoDiags(PA, uLengthA, PB, uLengthB, Path); #if TIMING TICKS t2 = GetClockTicks(); g_ticksDP += (t2 - t1); #endif return Score; } SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { if (g_bDimer) return GlobalAlignDimer(PA, uLengthA, PB, uLengthB, Path); switch (g_PPScore) { case PPSCORE_LE: return GlobalAlignLE(PA, uLengthA, PB, uLengthB, Path); case PPSCORE_SP: case PPSCORE_SV: return GlobalAlignSP(PA, uLengthA, PB, uLengthB, Path); case PPSCORE_SPN: return GlobalAlignSPN(PA, uLengthA, PB, uLengthB, Path); } Quit("Invalid PP score (GlobalAlignNoDiags)"); return 0; } #endif // VER_3_52 glbalign.cpp0000664000175000017500000000730112360262614011434 0ustar bobbob#include "muscle.h" #include "pwpath.h" #include "timing.h" #include "textfile.h" #include "msa.h" #include "profile.h" #if !VER_3_52 #define COMPARE_SIMPLE 0 #if TIMING TICKS g_ticksDP = 0; #endif #if 1 extern bool g_bKeepSimpleDP; SCORE NWSmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE NWDASmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE NWDASimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE NWDASimple2(const ProfPos *PA, unsigned uLengthA, const 
ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { return GlobalAlign(PA, uLengthA, PB, uLengthB, Path); } #if COMPARE_SIMPLE SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { #if TIMING TICKS t1 = GetClockTicks(); #endif g_bKeepSimpleDP = true; PWPath SimplePath; GlobalAlignSimple(PA, uLengthA, PB, uLengthB, SimplePath); SCORE Score = NWSmall(PA, uLengthA, PB, uLengthB, Path); if (!Path.Equal(SimplePath)) { Log("Simple:\n"); SimplePath.LogMe(); Log("Small:\n"); Path.LogMe(); Quit("Paths differ"); } #if TIMING TICKS t2 = GetClockTicks(); g_ticksDP += (t2 - t1); #endif return Score; } #else // COMPARE_SIMPLE SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { #if TIMING TICKS t1 = GetClockTicks(); #endif SCORE Score = NWSmall(PA, uLengthA, PB, uLengthB, Path); #if TIMING TICKS t2 = GetClockTicks(); g_ticksDP += (t2 - t1); #endif return Score; } #endif #else // 1 static void AllInserts(PWPath &Path, unsigned uLengthB) { Path.Clear(); PWEdge Edge; Edge.cType = 'I'; Edge.uPrefixLengthA = 0; for (unsigned uPrefixLengthB = 1; uPrefixLengthB <= uLengthB; ++uPrefixLengthB) { Edge.uPrefixLengthB = uPrefixLengthB; Path.AppendEdge(Edge); } } static void AllDeletes(PWPath &Path, unsigned uLengthA) { Path.Clear(); PWEdge Edge; Edge.cType = 'D'; Edge.uPrefixLengthB = 0; for (unsigned uPrefixLengthA = 1; uPrefixLengthA <= uLengthA; ++uPrefixLengthA) { Edge.uPrefixLengthA = uPrefixLengthA; Path.AppendEdge(Edge); } } SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { #if TIMING TICKS t1 = GetClockTicks(); #endif if (0 == uLengthA) { AllInserts(Path, uLengthB); return 0; } else if (0 
== uLengthB) { AllDeletes(Path, uLengthA); return 0; } SCORE Score = 0; if (g_bDiags) Score = GlobalAlignDiags(PA, uLengthA, PB, uLengthB, Path); else Score = GlobalAlignNoDiags(PA, uLengthA, PB, uLengthB, Path); #if TIMING TICKS t2 = GetClockTicks(); g_ticksDP += (t2 - t1); #endif return Score; } SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { if (g_bDimer) return GlobalAlignDimer(PA, uLengthA, PB, uLengthB, Path); switch (g_PPScore) { case PPSCORE_LE: return GlobalAlignLE(PA, uLengthA, PB, uLengthB, Path); case PPSCORE_SP: case PPSCORE_SV: return GlobalAlignSP(PA, uLengthA, PB, uLengthB, Path); case PPSCORE_SPN: return GlobalAlignSPN(PA, uLengthA, PB, uLengthB, Path); } Quit("Invalid PP score (GlobalAlignNoDiags)"); return 0; } #endif #endif // !VER_3_52 glbaligndiag.cpp0000664000175000017500000001032312360262614012257 0ustar bobbob#include "muscle.h" #include "dpreglist.h" #include "diaglist.h" #include "pwpath.h" #include "profile.h" #include "timing.h" #define TRACE 0 #define TRACE_PATH 0 #define LIST_DIAGS 0 static double g_dDPAreaWithoutDiags = 0.0; static double g_dDPAreaWithDiags = 0.0; static void OffsetPath(PWPath &Path, unsigned uOffsetA, unsigned uOffsetB) { const unsigned uEdgeCount = Path.GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); // Nasty hack -- poke new values back into path, circumventing class PWEdge &NonConstEdge = (PWEdge &) Edge; NonConstEdge.uPrefixLengthA += uOffsetA; NonConstEdge.uPrefixLengthB += uOffsetB; } } static void DiagToPath(const Diag &d, PWPath &Path) { Path.Clear(); const unsigned uLength = d.m_uLength; for (unsigned i = 0; i < uLength; ++i) { PWEdge Edge; Edge.cType = 'M'; Edge.uPrefixLengthA = d.m_uStartPosA + i + 1; Edge.uPrefixLengthB = d.m_uStartPosB + i + 1; Path.AppendEdge(Edge); } } static void AppendRegPath(PWPath &Path, const PWPath &RegPath) { const unsigned 
uRegEdgeCount = RegPath.GetEdgeCount(); for (unsigned uRegEdgeIndex = 0; uRegEdgeIndex < uRegEdgeCount; ++uRegEdgeIndex) { const PWEdge &RegEdge = RegPath.GetEdge(uRegEdgeIndex); Path.AppendEdge(RegEdge); } } SCORE GlobalAlignDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { #if LIST_DIAGS TICKS t1 = GetClockTicks(); #endif DiagList DL; if (ALPHA_Amino == g_Alpha) FindDiags(PA, uLengthA, PB, uLengthB, DL); else if (ALPHA_DNA == g_Alpha || ALPHA_RNA == g_Alpha) FindDiagsNuc(PA, uLengthA, PB, uLengthB, DL); else Quit("GlobalAlignDiags: bad alpha"); #if TRACE Log("GlobalAlignDiags, diag list:\n"); DL.LogMe(); #endif DL.Sort(); DL.DeleteIncompatible(); #if TRACE Log("After DeleteIncompatible:\n"); DL.LogMe(); #endif MergeDiags(DL); #if TRACE Log("After MergeDiags:\n"); DL.LogMe(); #endif DPRegionList RL; DiagListToDPRegionList(DL, RL, uLengthA, uLengthB); #if TRACE Log("RegionList:\n"); RL.LogMe(); #endif #if LIST_DIAGS { TICKS t2 = GetClockTicks(); unsigned uArea = RL.GetDPArea(); Log("ticks=%ld\n", (long) (t2 - t1)); Log("area=%u\n", uArea); } #endif g_dDPAreaWithoutDiags += uLengthA*uLengthB; double dDPAreaWithDiags = 0.0; const unsigned uRegionCount = RL.GetCount(); for (unsigned uRegionIndex = 0; uRegionIndex < uRegionCount; ++uRegionIndex) { const DPRegion &r = RL.Get(uRegionIndex); PWPath RegPath; if (DPREGIONTYPE_Diag == r.m_Type) { DiagToPath(r.m_Diag, RegPath); #if TRACE_PATH Log("DiagToPath, path=\n"); RegPath.LogMe(); #endif } else if (DPREGIONTYPE_Rect == r.m_Type) { const unsigned uRegStartPosA = r.m_Rect.m_uStartPosA; const unsigned uRegStartPosB = r.m_Rect.m_uStartPosB; const unsigned uRegLengthA = r.m_Rect.m_uLengthA; const unsigned uRegLengthB = r.m_Rect.m_uLengthB; const ProfPos *RegPA = PA + uRegStartPosA; const ProfPos *RegPB = PB + uRegStartPosB; dDPAreaWithDiags += uRegLengthA*uRegLengthB; GlobalAlignNoDiags(RegPA, uRegLengthA, RegPB, uRegLengthB, RegPath); #if TRACE_PATH Log("GlobalAlignNoDiags 
RegPath=\n"); RegPath.LogMe(); #endif OffsetPath(RegPath, uRegStartPosA, uRegStartPosB); #if TRACE_PATH Log("After offset path, RegPath=\n"); RegPath.LogMe(); #endif } else Quit("GlobalAlignDiags, Invalid region type %u", r.m_Type); AppendRegPath(Path, RegPath); #if TRACE_PATH Log("After AppendPath, path="); Path.LogMe(); #endif } #if TRACE { double dDPAreaWithoutDiags = uLengthA*uLengthB; Log("DP area with diags %.3g without %.3g pct saved %.3g %%\n", dDPAreaWithDiags, dDPAreaWithoutDiags, (1.0 - dDPAreaWithDiags/dDPAreaWithoutDiags)*100.0); } #endif g_dDPAreaWithDiags += dDPAreaWithDiags; return 0; } void ListDiagSavings() { if (!g_bVerbose || !g_bDiags) return; double dAreaSaved = g_dDPAreaWithoutDiags - g_dDPAreaWithDiags; double dPct = dAreaSaved*100.0/g_dDPAreaWithoutDiags; Log("DP area saved by diagonals %-4.1f%%\n", dPct); } glbalignle.cpp0000664000175000017500000002435212360262614011762 0ustar bobbob#include "muscle.h" #include "profile.h" #include "pwpath.h" #define OCC 1 struct DP_MEMORY { unsigned uLength; SCORE *GapOpenA; SCORE *GapOpenB; SCORE *GapCloseA; SCORE *GapCloseB; SCORE *MPrev; SCORE *MCurr; SCORE *MWork; SCORE *DPrev; SCORE *DCurr; SCORE *DWork; SCORE **ScoreMxB; #if OCC FCOUNT *OccA; FCOUNT *OccB; #endif unsigned **SortOrderA; unsigned *uDeletePos; FCOUNT **FreqsA; int **TraceBack; }; static struct DP_MEMORY DPM; static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) { // Max prefix length unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; if (uLength < DPM.uLength) return; // Add 256 to allow for future expansion and // round up to next multiple of 32. 
uLength += 256; uLength += 32 - uLength%32; const unsigned uOldLength = DPM.uLength; if (uOldLength > 0) { for (unsigned i = 0; i < uOldLength; ++i) { delete[] DPM.TraceBack[i]; delete[] DPM.FreqsA[i]; delete[] DPM.SortOrderA[i]; } for (unsigned n = 0; n < 20; ++n) delete[] DPM.ScoreMxB[n]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.uDeletePos; delete[] DPM.GapOpenA; delete[] DPM.GapOpenB; delete[] DPM.GapCloseA; delete[] DPM.GapCloseB; delete[] DPM.SortOrderA; delete[] DPM.FreqsA; delete[] DPM.ScoreMxB; delete[] DPM.TraceBack; #if OCC delete[] DPM.OccA; delete[] DPM.OccB; #endif } DPM.uLength = uLength; DPM.GapOpenA = new SCORE[uLength]; DPM.GapOpenB = new SCORE[uLength]; DPM.GapCloseA = new SCORE[uLength]; DPM.GapCloseB = new SCORE[uLength]; #if OCC DPM.OccA = new FCOUNT[uLength]; DPM.OccB = new FCOUNT[uLength]; #endif DPM.SortOrderA = new unsigned*[uLength]; DPM.FreqsA = new FCOUNT*[uLength]; DPM.ScoreMxB = new SCORE*[20]; DPM.MPrev = new SCORE[uLength]; DPM.MCurr = new SCORE[uLength]; DPM.MWork = new SCORE[uLength]; DPM.DPrev = new SCORE[uLength]; DPM.DCurr = new SCORE[uLength]; DPM.DWork = new SCORE[uLength]; DPM.uDeletePos = new unsigned[uLength]; DPM.TraceBack = new int*[uLength]; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) DPM.ScoreMxB[uLetter] = new SCORE[uLength]; for (unsigned i = 0; i < uLength; ++i) { DPM.SortOrderA[i] = new unsigned[20]; DPM.FreqsA[i] = new FCOUNT[20]; DPM.TraceBack[i] = new int[uLength]; } } SCORE GlobalAlignLE(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { SetTermGaps(PA, uLengthA); SetTermGaps(PB, uLengthB); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; AllocDPMem(uLengthA, uLengthB); SCORE *GapOpenA = DPM.GapOpenA; SCORE *GapOpenB = DPM.GapOpenB; SCORE *GapCloseA = DPM.GapCloseA; SCORE *GapCloseB = DPM.GapCloseB; unsigned **SortOrderA = 
DPM.SortOrderA; FCOUNT **FreqsA = DPM.FreqsA; SCORE **ScoreMxB = DPM.ScoreMxB; SCORE *MPrev = DPM.MPrev; SCORE *MCurr = DPM.MCurr; SCORE *MWork = DPM.MWork; SCORE *DPrev = DPM.DPrev; SCORE *DCurr = DPM.DCurr; SCORE *DWork = DPM.DWork; #if OCC FCOUNT *OccA = DPM.OccA; FCOUNT *OccB = DPM.OccB; #endif unsigned *uDeletePos = DPM.uDeletePos; int **TraceBack = DPM.TraceBack; for (unsigned i = 0; i < uLengthA; ++i) { GapOpenA[i] = PA[i].m_scoreGapOpen; GapCloseA[i] = PA[i].m_scoreGapClose; #if OCC OccA[i] = PA[i].m_fOcc; #endif for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; } } for (unsigned j = 0; j < uLengthB; ++j) { GapOpenB[j] = PB[j].m_scoreGapOpen; GapCloseB[j] = PB[j].m_scoreGapClose; #if OCC OccB[j] = PB[j].m_fOcc; #endif } for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { for (unsigned j = 0; j < uLengthB; ++j) ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; } for (unsigned i = 0; i < uPrefixCountA; ++i) memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); // Special case for i=0 unsigned **ptrSortOrderA = SortOrderA; FCOUNT **ptrFreqsA = FreqsA; assert(ptrSortOrderA == &(SortOrderA[0])); assert(ptrFreqsA == &(FreqsA[0])); TraceBack[0][0] = 0; SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][0]; } if (0 == scoreSum) MPrev[0] = -2.5; else { #if OCC MPrev[0] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[0]; #else MPrev[0] = (logf(scoreSum) - g_scoreCenter); #endif } // D(0,0) is -infinity (requires I->D). 
DPrev[0] = MINUS_INFINITY; for (unsigned j = 1; j < uLengthB; ++j) { // Only way to get M(0, j) looks like this: // A ----X // B XXXXX // 0 j // So gap-open at j=0, gap-close at j-1. SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][j]; } if (0 == scoreSum) MPrev[j] = -2.5; else { #if OCC MPrev[j] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[j] + GapOpenB[0] + GapCloseB[j-1]; #else MPrev[j] = (logf(scoreSum) - g_scoreCenter) + GapOpenB[0] + GapCloseB[j-1]; #endif } TraceBack[0][j] = -(int) j; // Assume no D->I transitions, then can't be a delete if only // one letter from A. DPrev[j] = MINUS_INFINITY; } SCORE IPrev_j_1; for (unsigned i = 1; i < uLengthA; ++i) { ++ptrSortOrderA; ++ptrFreqsA; assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); SCORE *ptrMCurr_j = MCurr; memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); const FCOUNT *FreqsAi = *ptrFreqsA; const unsigned *SortOrderAi = *ptrSortOrderA; const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20; const SCORE *ptrMCurrMax = MCurr + uLengthB; for (const unsigned *ptrSortOrderAi = SortOrderAi; ptrSortOrderAi != ptrSortOrderAiEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; SCORE *NSBR_Letter = ScoreMxB[uLetter]; const FCOUNT fcLetter = FreqsAi[uLetter]; if (0 == fcLetter) break; SCORE *ptrNSBR = NSBR_Letter; for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) *ptrMCurr += fcLetter*(*ptrNSBR++); } #if OCC const FCOUNT OccAi = OccA[i]; #endif for (unsigned j = 0; j < uLengthB; ++j) { if (MCurr[j] == 0) MCurr[j] = -2.5; else #if OCC MCurr[j] = (logf(MCurr[j]) - g_scoreCenter)*OccAi*OccB[j]; #else MCurr[j] = (logf(MCurr[j]) - g_scoreCenter); #endif } 
ptrMCurr_j = MCurr; unsigned *ptrDeletePos = uDeletePos; // Special case for j=0 // Only way to get M(i, 0) looks like this: // 0 i // A XXXXX // B ----X // So gap-open at i=0, gap-close at i-1. assert(ptrMCurr_j == &(MCurr[0])); *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; ++ptrMCurr_j; int *ptrTraceBack_ij = TraceBack[i]; *ptrTraceBack_ij++ = (int) i; SCORE *ptrMPrev_j = MPrev; SCORE *ptrDPrev = DPrev; SCORE d = *ptrDPrev; SCORE DNew = *ptrMPrev_j + GapOpenA[i]; if (DNew > d) { d = DNew; *ptrDeletePos = i; } SCORE *ptrDCurr = DCurr; assert(ptrDCurr == &(DCurr[0])); *ptrDCurr = d; // Can't have an insert if no letters from B IPrev_j_1 = MINUS_INFINITY; unsigned uInsertPos = 0; const SCORE scoreGapOpenAi = GapOpenA[i]; const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; for (unsigned j = 1; j < uLengthB; ++j) { // Here, MPrev_j is preserved from previous // iteration so with current i,j is M[i-1][j-1] SCORE MPrev_j = *ptrMPrev_j; SCORE INew = MPrev_j + GapOpenB[j]; if (INew > IPrev_j_1) { IPrev_j_1 = INew; uInsertPos = j; } SCORE scoreMax = MPrev_j; assert(ptrDPrev == &(DPrev[j-1])); SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; if (scoreD > scoreMax) { scoreMax = scoreD; assert(ptrDeletePos == &(uDeletePos[j-1])); *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; assert(*ptrTraceBack_ij > 0); } ++ptrDeletePos; SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; if (scoreI > scoreMax) { scoreMax = scoreI; *ptrTraceBack_ij = (int) uInsertPos - (int) j; assert(*ptrTraceBack_ij < 0); } assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); *ptrMCurr_j += scoreMax; assert(ptrMCurr_j == &(MCurr[j])); ++ptrMCurr_j; MPrev_j = *(++ptrMPrev_j); assert(ptrDPrev == &(DPrev[j])); SCORE d = *ptrDPrev; SCORE DNew = MPrev_j + scoreGapOpenAi; if (DNew > d) { d = DNew; assert(ptrDeletePos == &uDeletePos[j]); *ptrDeletePos = i; } assert(ptrDCurr + 1 == &(DCurr[j])); *(++ptrDCurr) = d; ++ptrTraceBack_ij; } Rotate(MPrev, MCurr, MWork); Rotate(DPrev, DCurr, DWork); } // 
Special case for i=uLengthA SCORE IPrev = MINUS_INFINITY; unsigned uInsertPos; for (unsigned j = 1; j < uLengthB; ++j) { SCORE INew = MPrev[j-1] + GapOpenB[j]; if (INew > IPrev) { uInsertPos = j; IPrev = INew; } } // Special case for i=uLengthA, j=uLengthB SCORE scoreMax = MPrev[uLengthB-1]; int iTraceBack = 0; SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; if (scoreD > scoreMax) { scoreMax = scoreD; iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; } SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; if (scoreI > scoreMax) { scoreMax = scoreI; iTraceBack = (int) uInsertPos - (int) uLengthB; } TraceBack[uLengthA][uLengthB] = iTraceBack; TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); return scoreMax; } glbalignsimple.cpp0000664000175000017500000002176012360262614012653 0ustar bobbob#include "muscle.h" #include #include "pwpath.h" #include "profile.h" #include #define TRACE 0 #if 1 // SINGLE_AFFINE extern bool g_bKeepSimpleDP; extern SCORE *g_DPM; extern SCORE *g_DPD; extern SCORE *g_DPI; extern char *g_TBM; extern char *g_TBD; extern char *g_TBI; static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -100000) return " *"; sprintf(str, "%6.1f", s); return str; } static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %6c", TBM(uPrefixLengthA, uPrefixLengthB)); Log("\n"); } } static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, 
unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); SetTermGaps(PA, uLengthA); SetTermGaps(PB, uLengthB); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; // Allocate DP matrices const size_t LM = uPrefixCountA*uPrefixCountB; SCORE *DPL_ = new SCORE[LM]; SCORE *DPM_ = new SCORE[LM]; SCORE *DPD_ = new SCORE[LM]; SCORE *DPI_ = new SCORE[LM]; char *TBM_ = new char[LM]; char *TBD_ = new char[LM]; char *TBI_ = new char[LM]; memset(TBM_, '?', LM); memset(TBD_, '?', LM); memset(TBI_, '?', LM); DPM(0, 0) = 0; DPD(0, 0) = MINUS_INFINITY; DPI(0, 0) = MINUS_INFINITY; DPM(1, 0) = MINUS_INFINITY; DPD(1, 0) = PA[0].m_scoreGapOpen; TBD(1, 0) = 'D'; DPI(1, 0) = MINUS_INFINITY; DPM(0, 1) = MINUS_INFINITY; DPD(0, 1) = MINUS_INFINITY; DPI(0, 1) = PB[0].m_scoreGapOpen; TBI(0, 1) = 'I'; // Empty prefix of B is special case for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { // M=LetterA+LetterB, impossible with empty prefix DPM(uPrefixLengthA, 0) = MINUS_INFINITY; // D=LetterA+GapB DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend; TBD(uPrefixLengthA, 0) = 'D'; // I=GapA+LetterB, impossible with empty prefix DPI(uPrefixLengthA, 0) = MINUS_INFINITY; } // Empty 
prefix of A is special case for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { // M=LetterA+LetterB, impossible with empty prefix DPM(0, uPrefixLengthB) = MINUS_INFINITY; // D=LetterA+GapB, impossible with empty prefix DPD(0, uPrefixLengthB) = MINUS_INFINITY; // I=GapA+LetterB DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend; TBI(0, uPrefixLengthB) = 'I'; } // Special case to agree with NWFast, no D-I transitions so... DPD(uLengthA, 0) = MINUS_INFINITY; // DPI(0, uLengthB) = MINUS_INFINITY; // ============ // Main DP loop // ============ SCORE scoreGapCloseB = MINUS_INFINITY; for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreGapCloseA = MINUS_INFINITY; for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; { // Match M=LetterA+LetterB SCORE scoreLL = ScoreProfPos2(PPA, PPB); DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL; SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1); SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA; SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB; SCORE scoreBest; if (scoreMM >= scoreDM && scoreMM >= scoreIM) { scoreBest = scoreMM; TBM(uPrefixLengthA, uPrefixLengthB) = 'M'; } else if (scoreDM >= scoreMM && scoreDM >= scoreIM) { scoreBest = scoreDM; TBM(uPrefixLengthA, uPrefixLengthB) = 'D'; } else { assert(scoreIM >= scoreMM && scoreIM >= scoreDM); scoreBest = scoreIM; TBM(uPrefixLengthA, uPrefixLengthB) = 'I'; } DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; } { // Delete D=LetterA+GapB SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen; SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend; SCORE scoreBest; if (scoreMD >= scoreDD) { scoreBest = scoreMD; TBD(uPrefixLengthA, uPrefixLengthB) = 'M'; } 
else { assert(scoreDD >= scoreMD); scoreBest = scoreDD; TBD(uPrefixLengthA, uPrefixLengthB) = 'D'; } DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert I=GapA+LetterB { SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB - 1].m_scoreGapOpen; SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend; SCORE scoreBest; if (scoreMI >= scoreII) { scoreBest = scoreMI; TBI(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreII > scoreMI); scoreBest = scoreII; TBI(uPrefixLengthA, uPrefixLengthB) = 'I'; } DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; } scoreGapCloseA = PPA.m_scoreGapClose; } scoreGapCloseB = PPB.m_scoreGapClose; } #if TRACE Log("\n"); Log("Simple DPL:\n"); ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple DPM:\n"); ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple DPD:\n"); ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple DPI:\n"); ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple TBM:\n"); ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple TBD:\n"); ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("Simple TBI:\n"); ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB); #endif // Trace-back // ========== Path.Clear(); // Find last edge SCORE M = DPM(uLengthA, uLengthB); SCORE D = DPD(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose; SCORE I = DPI(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose; char cEdgeType = '?'; SCORE BestScore = MINUS_INFINITY; if (M >= D && M >= I) { cEdgeType = 'M'; BestScore = M; } else if (D >= M && D >= I) { cEdgeType = 'D'; BestScore = D; } else { assert(I >= M && I >= D); cEdgeType = 'I'; BestScore = I; } #if TRACE Log("Simple: MAB=%.4g DAB=%.4g IAB=%.4g best=%c\n", M, D, I, cEdgeType); #endif unsigned PLA = uLengthA; unsigned PLB = uLengthB; for (;;) { PWEdge Edge; Edge.cType = cEdgeType; Edge.uPrefixLengthA = PLA; 
Edge.uPrefixLengthB = PLB; #if TRACE Log("Prepend %c%d.%d\n", Edge.cType, PLA, PLB); #endif Path.PrependEdge(Edge); switch (cEdgeType) { case 'M': assert(PLA > 0); assert(PLB > 0); cEdgeType = TBM(PLA, PLB); --PLA; --PLB; break; case 'D': assert(PLA > 0); cEdgeType = TBD(PLA, PLB); --PLA; break; case 'I': assert(PLB > 0); cEdgeType = TBI(PLA, PLB); --PLB; break; default: Quit("Invalid edge %c", cEdgeType); } if (0 == PLA && 0 == PLB) break; } Path.Validate(); // SCORE Score = TraceBack(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_, Path); #if TRACE SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path); Path.LogMe(); Log("Score = %s Path = %s\n", LocalScoreToStr(BestScore), LocalScoreToStr(scorePath)); #endif if (g_bKeepSimpleDP) { g_DPM = DPM_; g_DPD = DPD_; g_DPI = DPI_; g_TBM = TBM_; g_TBD = TBD_; g_TBI = TBI_; } else { delete[] DPM_; delete[] DPD_; delete[] DPI_; delete[] TBM_; delete[] TBD_; delete[] TBI_; } return BestScore; } #endif // SINLGLE_AFFINE glbalignsp.cpp0000664000175000017500000002235512360262613012004 0ustar bobbob#include "muscle.h" #include "profile.h" #include "pwpath.h" struct DP_MEMORY { unsigned uLength; SCORE *GapOpenA; SCORE *GapOpenB; SCORE *GapCloseA; SCORE *GapCloseB; SCORE *MPrev; SCORE *MCurr; SCORE *MWork; SCORE *DPrev; SCORE *DCurr; SCORE *DWork; SCORE **ScoreMxB; unsigned **SortOrderA; unsigned *uDeletePos; FCOUNT **FreqsA; int **TraceBack; }; static struct DP_MEMORY DPM; static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) { // Max prefix length unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; if (uLength < DPM.uLength) return; // Add 256 to allow for future expansion and // round up to next multiple of 32. 
uLength += 256; uLength += 32 - uLength%32; const unsigned uOldLength = DPM.uLength; if (uOldLength > 0) { for (unsigned i = 0; i < uOldLength; ++i) { delete[] DPM.TraceBack[i]; delete[] DPM.FreqsA[i]; delete[] DPM.SortOrderA[i]; } for (unsigned n = 0; n < 20; ++n) delete[] DPM.ScoreMxB[n]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.uDeletePos; delete[] DPM.GapOpenA; delete[] DPM.GapOpenB; delete[] DPM.GapCloseA; delete[] DPM.GapCloseB; delete[] DPM.SortOrderA; delete[] DPM.FreqsA; delete[] DPM.ScoreMxB; delete[] DPM.TraceBack; } DPM.uLength = uLength; DPM.GapOpenA = new SCORE[uLength]; DPM.GapOpenB = new SCORE[uLength]; DPM.GapCloseA = new SCORE[uLength]; DPM.GapCloseB = new SCORE[uLength]; DPM.SortOrderA = new unsigned*[uLength]; DPM.FreqsA = new FCOUNT*[uLength]; DPM.ScoreMxB = new SCORE*[20]; DPM.MPrev = new SCORE[uLength]; DPM.MCurr = new SCORE[uLength]; DPM.MWork = new SCORE[uLength]; DPM.DPrev = new SCORE[uLength]; DPM.DCurr = new SCORE[uLength]; DPM.DWork = new SCORE[uLength]; DPM.uDeletePos = new unsigned[uLength]; DPM.TraceBack = new int*[uLength]; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) DPM.ScoreMxB[uLetter] = new SCORE[uLength]; for (unsigned i = 0; i < uLength; ++i) { DPM.SortOrderA[i] = new unsigned[20]; DPM.FreqsA[i] = new FCOUNT[20]; DPM.TraceBack[i] = new int[uLength]; } } SCORE GlobalAlignSP(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; AllocDPMem(uLengthA, uLengthB); SCORE *GapOpenA = DPM.GapOpenA; SCORE *GapOpenB = DPM.GapOpenB; SCORE *GapCloseA = DPM.GapCloseA; SCORE *GapCloseB = DPM.GapCloseB; unsigned **SortOrderA = DPM.SortOrderA; FCOUNT **FreqsA = DPM.FreqsA; SCORE **ScoreMxB = DPM.ScoreMxB; SCORE *MPrev = DPM.MPrev; SCORE *MCurr = DPM.MCurr; SCORE *MWork = DPM.MWork; SCORE *DPrev = DPM.DPrev; SCORE *DCurr 
= DPM.DCurr; SCORE *DWork = DPM.DWork; unsigned *uDeletePos = DPM.uDeletePos; int **TraceBack = DPM.TraceBack; for (unsigned i = 0; i < uLengthA; ++i) { GapOpenA[i] = PA[i].m_scoreGapOpen; GapCloseA[i] = PA[i].m_scoreGapClose; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; } } for (unsigned j = 0; j < uLengthB; ++j) { GapOpenB[j] = PB[j].m_scoreGapOpen; GapCloseB[j] = PB[j].m_scoreGapClose; } for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { for (unsigned j = 0; j < uLengthB; ++j) ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; } for (unsigned i = 0; i < uPrefixCountA; ++i) memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); // Special case for i=0 unsigned **ptrSortOrderA = SortOrderA; FCOUNT **ptrFreqsA = FreqsA; assert(ptrSortOrderA == &(SortOrderA[0])); assert(ptrFreqsA == &(FreqsA[0])); TraceBack[0][0] = 0; SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][0]; } MPrev[0] = scoreSum - g_scoreCenter; // D(0,0) is -infinity (requires I->D). DPrev[0] = MINUS_INFINITY; for (unsigned j = 1; j < uLengthB; ++j) { // Only way to get M(0, j) looks like this: // A ----X // B XXXXX // 0 j // So gap-open at j=0, gap-close at j-1. 
SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][j]; } MPrev[j] = scoreSum - g_scoreCenter + GapOpenB[0] + GapCloseB[j-1]; TraceBack[0][j] = -(int) j; // Assume no D->I transitions, then can't be a delete if only // one letter from A. DPrev[j] = MINUS_INFINITY; } SCORE IPrev_j_1; for (unsigned i = 1; i < uLengthA; ++i) { ++ptrSortOrderA; ++ptrFreqsA; assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); SCORE *ptrMCurr_j = MCurr; memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); const FCOUNT *FreqsAi = *ptrFreqsA; const unsigned *SortOrderAi = *ptrSortOrderA; const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20; const SCORE *ptrMCurrMax = MCurr + uLengthB; for (const unsigned *ptrSortOrderAi = SortOrderAi; ptrSortOrderAi != ptrSortOrderAiEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; SCORE *NSBR_Letter = ScoreMxB[uLetter]; const FCOUNT fcLetter = FreqsAi[uLetter]; if (0 == fcLetter) break; SCORE *ptrNSBR = NSBR_Letter; for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) *ptrMCurr += fcLetter*(*ptrNSBR++); } for (unsigned j = 0; j < uLengthB; ++j) MCurr[j] -= g_scoreCenter; ptrMCurr_j = MCurr; unsigned *ptrDeletePos = uDeletePos; // Special case for j=0 // Only way to get M(i, 0) looks like this: // 0 i // A XXXXX // B ----X // So gap-open at i=0, gap-close at i-1. 
assert(ptrMCurr_j == &(MCurr[0])); *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; ++ptrMCurr_j; int *ptrTraceBack_ij = TraceBack[i]; *ptrTraceBack_ij++ = (int) i; SCORE *ptrMPrev_j = MPrev; SCORE *ptrDPrev = DPrev; SCORE d = *ptrDPrev; SCORE DNew = *ptrMPrev_j + GapOpenA[i]; if (DNew > d) { d = DNew; *ptrDeletePos = i; } SCORE *ptrDCurr = DCurr; assert(ptrDCurr == &(DCurr[0])); *ptrDCurr = d; // Can't have an insert if no letters from B IPrev_j_1 = MINUS_INFINITY; unsigned uInsertPos; const SCORE scoreGapOpenAi = GapOpenA[i]; const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; for (unsigned j = 1; j < uLengthB; ++j) { // Here, MPrev_j is preserved from previous // iteration so with current i,j is M[i-1][j-1] SCORE MPrev_j = *ptrMPrev_j; SCORE INew = MPrev_j + GapOpenB[j]; if (INew > IPrev_j_1) { IPrev_j_1 = INew; uInsertPos = j; } SCORE scoreMax = MPrev_j; assert(ptrDPrev == &(DPrev[j-1])); SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; if (scoreD > scoreMax) { scoreMax = scoreD; assert(ptrDeletePos == &(uDeletePos[j-1])); *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; assert(*ptrTraceBack_ij > 0); } ++ptrDeletePos; SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; if (scoreI > scoreMax) { scoreMax = scoreI; *ptrTraceBack_ij = (int) uInsertPos - (int) j; assert(*ptrTraceBack_ij < 0); } assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); *ptrMCurr_j += scoreMax; assert(ptrMCurr_j == &(MCurr[j])); ++ptrMCurr_j; MPrev_j = *(++ptrMPrev_j); assert(ptrDPrev == &(DPrev[j])); SCORE d = *ptrDPrev; SCORE DNew = MPrev_j + scoreGapOpenAi; if (DNew > d) { d = DNew; assert(ptrDeletePos == &uDeletePos[j]); *ptrDeletePos = i; } assert(ptrDCurr + 1 == &(DCurr[j])); *(++ptrDCurr) = d; ++ptrTraceBack_ij; } Rotate(MPrev, MCurr, MWork); Rotate(DPrev, DCurr, DWork); } // Special case for i=uLengthA SCORE IPrev = MINUS_INFINITY; unsigned uInsertPos; for (unsigned j = 1; j < uLengthB; ++j) { SCORE INew = MPrev[j-1] + GapOpenB[j]; if (INew > IPrev) { uInsertPos = j; IPrev 
= INew; } } // Special case for i=uLengthA, j=uLengthB SCORE scoreMax = MPrev[uLengthB-1]; int iTraceBack = 0; SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; if (scoreD > scoreMax) { scoreMax = scoreD; iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; } SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; if (scoreI > scoreMax) { scoreMax = scoreI; iTraceBack = (int) uInsertPos - (int) uLengthB; } TraceBack[uLengthA][uLengthB] = iTraceBack; TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); return scoreMax; } glbalignspn.cpp0000664000175000017500000002375512360262613012167 0ustar bobbob#include "muscle.h" #include "profile.h" #include "pwpath.h" struct DP_MEMORY { unsigned uLength; SCORE *GapOpenA; SCORE *GapOpenB; SCORE *GapCloseA; SCORE *GapCloseB; SCORE *MPrev; SCORE *MCurr; SCORE *MWork; SCORE *DPrev; SCORE *DCurr; SCORE *DWork; SCORE **ScoreMxB; unsigned **SortOrderA; unsigned *uDeletePos; FCOUNT **FreqsA; int **TraceBack; }; static struct DP_MEMORY DPM; void FreeDPMemSPN() { const unsigned uOldLength = DPM.uLength; if (0 == uOldLength) return; for (unsigned i = 0; i < uOldLength; ++i) { delete[] DPM.TraceBack[i]; delete[] DPM.FreqsA[i]; delete[] DPM.SortOrderA[i]; } for (unsigned n = 0; n < 4; ++n) delete[] DPM.ScoreMxB[n]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.uDeletePos; delete[] DPM.GapOpenA; delete[] DPM.GapOpenB; delete[] DPM.GapCloseA; delete[] DPM.GapCloseB; delete[] DPM.SortOrderA; delete[] DPM.FreqsA; delete[] DPM.ScoreMxB; delete[] DPM.TraceBack; } static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) { // Max prefix length unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; if (uLength < DPM.uLength) return; // Add 256 to allow for future expansion and // round up to next multiple of 32. 
uLength += 256; uLength += 32 - uLength%32; const unsigned uOldLength = DPM.uLength; if (uOldLength > 0) { for (unsigned i = 0; i < uOldLength; ++i) { delete[] DPM.TraceBack[i]; delete[] DPM.FreqsA[i]; delete[] DPM.SortOrderA[i]; } for (unsigned n = 0; n < 4; ++n) delete[] DPM.ScoreMxB[n]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.uDeletePos; delete[] DPM.GapOpenA; delete[] DPM.GapOpenB; delete[] DPM.GapCloseA; delete[] DPM.GapCloseB; delete[] DPM.SortOrderA; delete[] DPM.FreqsA; delete[] DPM.ScoreMxB; delete[] DPM.TraceBack; } DPM.uLength = uLength; DPM.GapOpenA = new SCORE[uLength]; DPM.GapOpenB = new SCORE[uLength]; DPM.GapCloseA = new SCORE[uLength]; DPM.GapCloseB = new SCORE[uLength]; DPM.SortOrderA = new unsigned*[uLength]; DPM.FreqsA = new FCOUNT*[uLength]; DPM.ScoreMxB = new SCORE*[4]; DPM.MPrev = new SCORE[uLength]; DPM.MCurr = new SCORE[uLength]; DPM.MWork = new SCORE[uLength]; DPM.DPrev = new SCORE[uLength]; DPM.DCurr = new SCORE[uLength]; DPM.DWork = new SCORE[uLength]; DPM.uDeletePos = new unsigned[uLength]; DPM.TraceBack = new int*[uLength]; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) DPM.ScoreMxB[uLetter] = new SCORE[uLength]; for (unsigned i = 0; i < uLength; ++i) { DPM.SortOrderA[i] = new unsigned[4]; DPM.FreqsA[i] = new FCOUNT[4]; DPM.TraceBack[i] = new int[uLength]; } } SCORE GlobalAlignSPN(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { if (ALPHA_DNA != g_Alpha || ALPHA_RNA == g_Alpha) Quit("GlobalAlignSPN: must be nucleo"); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; AllocDPMem(uLengthA, uLengthB); SCORE *GapOpenA = DPM.GapOpenA; SCORE *GapOpenB = DPM.GapOpenB; SCORE *GapCloseA = DPM.GapCloseA; SCORE *GapCloseB = DPM.GapCloseB; unsigned **SortOrderA = DPM.SortOrderA; FCOUNT **FreqsA = DPM.FreqsA; SCORE **ScoreMxB = DPM.ScoreMxB; SCORE *MPrev = DPM.MPrev; SCORE 
*MCurr = DPM.MCurr; SCORE *MWork = DPM.MWork; SCORE *DPrev = DPM.DPrev; SCORE *DCurr = DPM.DCurr; SCORE *DWork = DPM.DWork; unsigned *uDeletePos = DPM.uDeletePos; int **TraceBack = DPM.TraceBack; for (unsigned i = 0; i < uLengthA; ++i) { GapOpenA[i] = PA[i].m_scoreGapOpen; GapCloseA[i] = PA[i].m_scoreGapClose; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) { SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; } } for (unsigned j = 0; j < uLengthB; ++j) { GapOpenB[j] = PB[j].m_scoreGapOpen; GapCloseB[j] = PB[j].m_scoreGapClose; } for (unsigned uLetter = 0; uLetter < 4; ++uLetter) { for (unsigned j = 0; j < uLengthB; ++j) ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; } for (unsigned i = 0; i < uPrefixCountA; ++i) memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); // Special case for i=0 unsigned **ptrSortOrderA = SortOrderA; FCOUNT **ptrFreqsA = FreqsA; assert(ptrSortOrderA == &(SortOrderA[0])); assert(ptrFreqsA == &(FreqsA[0])); TraceBack[0][0] = 0; SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 4; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][0]; } MPrev[0] = scoreSum - g_scoreCenter; // D(0,0) is -infinity (requires I->D). DPrev[0] = MINUS_INFINITY; for (unsigned j = 1; j < uLengthB; ++j) { // Only way to get M(0, j) looks like this: // A ----X // B XXXXX // 0 j // So gap-open at j=0, gap-close at j-1. 
SCORE scoreSum = 0; unsigned *ptrSortOrderAi = SortOrderA[0]; const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 4; FCOUNT *ptrFreqsAi = FreqsA[0]; for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; const FCOUNT fcLetter = ptrFreqsAi[uLetter]; if (0 == fcLetter) break; scoreSum += fcLetter*ScoreMxB[uLetter][j]; } MPrev[j] = scoreSum - g_scoreCenter + GapOpenB[0] + GapCloseB[j-1]; TraceBack[0][j] = -(int) j; // Assume no D->I transitions, then can't be a delete if only // one letter from A. DPrev[j] = MINUS_INFINITY; } SCORE IPrev_j_1; for (unsigned i = 1; i < uLengthA; ++i) { ++ptrSortOrderA; ++ptrFreqsA; assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); SCORE *ptrMCurr_j = MCurr; memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); const FCOUNT *FreqsAi = *ptrFreqsA; const unsigned *SortOrderAi = *ptrSortOrderA; const unsigned *ptrSortOrderAiEnd = SortOrderAi + 4; const SCORE *ptrMCurrMax = MCurr + uLengthB; for (const unsigned *ptrSortOrderAi = SortOrderAi; ptrSortOrderAi != ptrSortOrderAiEnd; ++ptrSortOrderAi) { const unsigned uLetter = *ptrSortOrderAi; SCORE *NSBR_Letter = ScoreMxB[uLetter]; const FCOUNT fcLetter = FreqsAi[uLetter]; if (0 == fcLetter) break; SCORE *ptrNSBR = NSBR_Letter; for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) *ptrMCurr += fcLetter*(*ptrNSBR++); } for (unsigned j = 0; j < uLengthB; ++j) MCurr[j] -= g_scoreCenter; ptrMCurr_j = MCurr; unsigned *ptrDeletePos = uDeletePos; // Special case for j=0 // Only way to get M(i, 0) looks like this: // 0 i // A XXXXX // B ----X // So gap-open at i=0, gap-close at i-1. 
assert(ptrMCurr_j == &(MCurr[0])); *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; ++ptrMCurr_j; int *ptrTraceBack_ij = TraceBack[i]; *ptrTraceBack_ij++ = (int) i; SCORE *ptrMPrev_j = MPrev; SCORE *ptrDPrev = DPrev; SCORE d = *ptrDPrev; SCORE DNew = *ptrMPrev_j + GapOpenA[i]; if (DNew > d) { d = DNew; *ptrDeletePos = i; } SCORE *ptrDCurr = DCurr; assert(ptrDCurr == &(DCurr[0])); *ptrDCurr = d; // Can't have an insert if no letters from B IPrev_j_1 = MINUS_INFINITY; unsigned uInsertPos; const SCORE scoreGapOpenAi = GapOpenA[i]; const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; for (unsigned j = 1; j < uLengthB; ++j) { // Here, MPrev_j is preserved from previous // iteration so with current i,j is M[i-1][j-1] SCORE MPrev_j = *ptrMPrev_j; SCORE INew = MPrev_j + GapOpenB[j]; if (INew > IPrev_j_1) { IPrev_j_1 = INew; uInsertPos = j; } SCORE scoreMax = MPrev_j; assert(ptrDPrev == &(DPrev[j-1])); SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; if (scoreD > scoreMax) { scoreMax = scoreD; assert(ptrDeletePos == &(uDeletePos[j-1])); *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; assert(*ptrTraceBack_ij > 0); } ++ptrDeletePos; SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; if (scoreI > scoreMax) { scoreMax = scoreI; *ptrTraceBack_ij = (int) uInsertPos - (int) j; assert(*ptrTraceBack_ij < 0); } assert(ptrSortOrderA == &(SortOrderA[i])); assert(ptrFreqsA == &(FreqsA[i])); *ptrMCurr_j += scoreMax; assert(ptrMCurr_j == &(MCurr[j])); ++ptrMCurr_j; MPrev_j = *(++ptrMPrev_j); assert(ptrDPrev == &(DPrev[j])); SCORE d = *ptrDPrev; SCORE DNew = MPrev_j + scoreGapOpenAi; if (DNew > d) { d = DNew; assert(ptrDeletePos == &uDeletePos[j]); *ptrDeletePos = i; } assert(ptrDCurr + 1 == &(DCurr[j])); *(++ptrDCurr) = d; ++ptrTraceBack_ij; } Rotate(MPrev, MCurr, MWork); Rotate(DPrev, DCurr, DWork); } // Special case for i=uLengthA SCORE IPrev = MINUS_INFINITY; unsigned uInsertPos; for (unsigned j = 1; j < uLengthB; ++j) { SCORE INew = MPrev[j-1] + GapOpenB[j]; if (INew > IPrev) { uInsertPos = j; IPrev 
= INew; } } // Special case for i=uLengthA, j=uLengthB SCORE scoreMax = MPrev[uLengthB-1]; int iTraceBack = 0; SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; if (scoreD > scoreMax) { scoreMax = scoreD; iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; } SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; if (scoreI > scoreMax) { scoreMax = scoreI; iTraceBack = (int) uInsertPos - (int) uLengthB; } TraceBack[uLengthA][uLengthB] = iTraceBack; TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); return scoreMax; } glbalignss.cpp0000664000175000017500000001631012360262614012002 0ustar bobbob#include "muscle.h" #include "profile.h" #include "pwpath.h" #include "seq.h" extern SCOREMATRIX VTML_SP; // #define SUBST(i, j) Subst(seqA, seqB, i, j) #define SUBST(i, j) MxRowA[i][seqB.GetLetter(j)] static SCORE Subst(const Seq &seqA, const Seq &seqB, unsigned i, unsigned j) { assert(i < seqA.Length()); assert(j < seqB.Length()); unsigned uLetterA = seqA.GetLetter(i); unsigned uLetterB = seqB.GetLetter(j); return VTML_SP[uLetterA][uLetterB] + g_scoreCenter; } struct DP_MEMORY { unsigned uLength; SCORE *MPrev; SCORE *MCurr; SCORE *MWork; SCORE *DPrev; SCORE *DCurr; SCORE *DWork; SCORE **MxRowA; unsigned *LettersB; unsigned *uDeletePos; int **TraceBack; }; static struct DP_MEMORY DPM; static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) { // Max prefix length unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; if (uLength < DPM.uLength) return; // Add 256 to allow for future expansion and // round up to next multiple of 32. 
uLength += 256; uLength += 32 - uLength%32; const unsigned uOldLength = DPM.uLength; if (uOldLength > 0) { for (unsigned i = 0; i < uOldLength; ++i) delete[] DPM.TraceBack[i]; delete[] DPM.MPrev; delete[] DPM.MCurr; delete[] DPM.MWork; delete[] DPM.DPrev; delete[] DPM.DCurr; delete[] DPM.DWork; delete[] DPM.MxRowA; delete[] DPM.LettersB; delete[] DPM.uDeletePos; delete[] DPM.TraceBack; } DPM.uLength = uLength; DPM.MPrev = new SCORE[uLength]; DPM.MCurr = new SCORE[uLength]; DPM.MWork = new SCORE[uLength]; DPM.DPrev = new SCORE[uLength]; DPM.DCurr = new SCORE[uLength]; DPM.DWork = new SCORE[uLength]; DPM.MxRowA = new SCORE *[uLength]; DPM.LettersB = new unsigned[uLength]; DPM.uDeletePos = new unsigned[uLength]; DPM.TraceBack = new int*[uLength]; for (unsigned i = 0; i < uLength; ++i) DPM.TraceBack[i] = new int[uLength]; } static void RowFromSeq(const Seq &s, SCORE *Row[]) { const unsigned uLength = s.Length(); for (unsigned i = 0; i < uLength; ++i) { char c = s.GetChar(i); unsigned uLetter = CharToLetter(c); if (uLetter < 20) Row[i] = VTML_SP[uLetter]; else Row[i] = VTML_SP[AX_X]; } } static void LettersFromSeq(const Seq &s, unsigned Letters[]) { const unsigned uLength = s.Length(); for (unsigned i = 0; i < uLength; ++i) { char c = s.GetChar(i); unsigned uLetter = CharToLetter(c); if (uLetter < 20) Letters[i] = uLetter; else Letters[i] = AX_X; } } SCORE GlobalAlignSS(const Seq &seqA, const Seq &seqB, PWPath &Path) { const unsigned uLengthA = seqA.Length(); const unsigned uLengthB = seqB.Length(); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; AllocDPMem(uLengthA, uLengthB); SCORE *MPrev = DPM.MPrev; SCORE *MCurr = DPM.MCurr; SCORE *MWork = DPM.MWork; SCORE *DPrev = DPM.DPrev; SCORE *DCurr = DPM.DCurr; SCORE *DWork = DPM.DWork; SCORE **MxRowA = DPM.MxRowA; unsigned *LettersB = DPM.LettersB; RowFromSeq(seqA, MxRowA); LettersFromSeq(seqB, LettersB); unsigned *uDeletePos = DPM.uDeletePos; int **TraceBack = DPM.TraceBack; #if 
DEBUG for (unsigned i = 0; i < uPrefixCountA; ++i) memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); #endif // Special case for i=0 TraceBack[0][0] = 0; MPrev[0] = MxRowA[0][LettersB[0]]; // D(0,0) is -infinity (requires I->D). DPrev[0] = MINUS_INFINITY; for (unsigned j = 1; j < uLengthB; ++j) { unsigned uLetterB = LettersB[j]; // Only way to get M(0, j) looks like this: // A ----X // B XXXXX // 0 j // So gap-open at j=0, gap-close at j-1. MPrev[j] = MxRowA[0][uLetterB] + g_scoreGapOpen/2; // term gaps half TraceBack[0][j] = -(int) j; // Assume no D->I transitions, then can't be a delete if only // one letter from A. DPrev[j] = MINUS_INFINITY; } SCORE IPrev_j_1; for (unsigned i = 1; i < uLengthA; ++i) { SCORE *ptrMCurr_j = MCurr; memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); const SCORE *RowA = MxRowA[i]; const SCORE *ptrRowA = MxRowA[i]; const SCORE *ptrMCurrEnd = ptrMCurr_j + uLengthB; unsigned *ptrLettersB = LettersB; for (; ptrMCurr_j != ptrMCurrEnd; ++ptrMCurr_j) { *ptrMCurr_j = RowA[*ptrLettersB]; ++ptrLettersB; } unsigned *ptrDeletePos = uDeletePos; // Special case for j=0 // Only way to get M(i, 0) looks like this: // 0 i // A XXXXX // B ----X // So gap-open at i=0, gap-close at i-1. 
ptrMCurr_j = MCurr; assert(ptrMCurr_j == &(MCurr[0])); *ptrMCurr_j += g_scoreGapOpen/2; // term gaps half ++ptrMCurr_j; int *ptrTraceBack_ij = TraceBack[i]; *ptrTraceBack_ij++ = (int) i; SCORE *ptrMPrev_j = MPrev; SCORE *ptrDPrev = DPrev; SCORE d = *ptrDPrev; SCORE DNew = *ptrMPrev_j + g_scoreGapOpen; if (DNew > d) { d = DNew; *ptrDeletePos = i; } SCORE *ptrDCurr = DCurr; assert(ptrDCurr == &(DCurr[0])); *ptrDCurr = d; // Can't have an insert if no letters from B IPrev_j_1 = MINUS_INFINITY; unsigned uInsertPos; for (unsigned j = 1; j < uLengthB; ++j) { // Here, MPrev_j is preserved from previous // iteration so with current i,j is M[i-1][j-1] SCORE MPrev_j = *ptrMPrev_j; SCORE INew = MPrev_j + g_scoreGapOpen; if (INew > IPrev_j_1) { IPrev_j_1 = INew; uInsertPos = j; } SCORE scoreMax = MPrev_j; assert(ptrDPrev == &(DPrev[j-1])); SCORE scoreD = *ptrDPrev++; if (scoreD > scoreMax) { scoreMax = scoreD; assert(ptrDeletePos == &(uDeletePos[j-1])); *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; assert(*ptrTraceBack_ij > 0); } ++ptrDeletePos; SCORE scoreI = IPrev_j_1; if (scoreI > scoreMax) { scoreMax = scoreI; *ptrTraceBack_ij = (int) uInsertPos - (int) j; assert(*ptrTraceBack_ij < 0); } *ptrMCurr_j += scoreMax; assert(ptrMCurr_j == &(MCurr[j])); ++ptrMCurr_j; MPrev_j = *(++ptrMPrev_j); assert(ptrDPrev == &(DPrev[j])); SCORE d = *ptrDPrev; SCORE DNew = MPrev_j + g_scoreGapOpen; if (DNew > d) { d = DNew; assert(ptrDeletePos == &uDeletePos[j]); *ptrDeletePos = i; } assert(ptrDCurr + 1 == &(DCurr[j])); *(++ptrDCurr) = d; ++ptrTraceBack_ij; } Rotate(MPrev, MCurr, MWork); Rotate(DPrev, DCurr, DWork); } // Special case for i=uLengthA SCORE IPrev = MINUS_INFINITY; unsigned uInsertPos; for (unsigned j = 1; j < uLengthB; ++j) { SCORE INew = MPrev[j-1]; if (INew > IPrev) { uInsertPos = j; IPrev = INew; } } // Special case for i=uLengthA, j=uLengthB SCORE scoreMax = MPrev[uLengthB-1]; int iTraceBack = 0; SCORE scoreD = DPrev[uLengthB-1] - g_scoreGapOpen/2; // term gaps half if 
(scoreD > scoreMax) { scoreMax = scoreD; iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; } SCORE scoreI = IPrev - g_scoreGapOpen/2; if (scoreI > scoreMax) { scoreMax = scoreI; iTraceBack = (int) uInsertPos - (int) uLengthB; } TraceBack[uLengthA][uLengthB] = iTraceBack; TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); return scoreMax; } glbalndimer.cpp0000664000175000017500000002415512360262614012143 0ustar bobbob#include "muscle.h" #include #include // for sprintf #include "pwpath.h" #include "profile.h" #include "gapscoredimer.h" #define TRACE 0 static SCORE TraceBackDimer( const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, const char *TBM_, const char *TBD_, const char *TBI_, unsigned uLengthA, unsigned uLengthB, PWPath &Path); static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (MINUS_INFINITY == s) return " *"; sprintf(str, "%6.3g", s); return str; } #if TRACE static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log("%2d", uPrefixLengthB); Log("\n"); Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = 
' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %c", c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %c", TBM(uPrefixLengthA, uPrefixLengthB)); Log("\n"); } } #endif // TRACE static ProfPos PPTerm; static bool InitializePPTerm() { PPTerm.m_bAllGaps = false; PPTerm.m_LL = 1; PPTerm.m_LG = 0; PPTerm.m_GL = 0; PPTerm.m_GG = 0; PPTerm.m_fOcc = 1; return true; } static bool PPTermInitialized = InitializePPTerm(); static SCORE ScoreProfPosDimerLE(const ProfPos &PPA, const ProfPos &PPB) { SCORE Score = 0; for (unsigned n = 0; n < 20; ++n) { const unsigned uLetter = PPA.m_uSortOrder[n]; const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; if (0 == fcLetter) break; Score += fcLetter*PPB.m_AAScores[uLetter]; } if (0 == Score) return -2.5; SCORE logScore = logf(Score); return (SCORE) (logScore*(PPA.m_fOcc * PPB.m_fOcc)); } static SCORE ScoreProfPosDimerPSP(const ProfPos &PPA, const ProfPos &PPB) { SCORE Score = 0; for (unsigned n = 0; n < 20; ++n) { const unsigned uLetter = PPA.m_uSortOrder[n]; const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; if (0 == fcLetter) break; Score += fcLetter*PPB.m_AAScores[uLetter]; } return Score; } static SCORE ScoreProfPosDimer(const ProfPos &PPA, const ProfPos &PPB) { switch (g_PPScore) { case PPSCORE_LE: return ScoreProfPosDimerLE(PPA, PPB); case PPSCORE_SP: case PPSCORE_SV: return ScoreProfPosDimerPSP(PPA, PPB); } Quit("Invalid g_PPScore"); return 0; } // Global alignment dynamic programming // This variant optimizes the profile-profile SP score under the // dimer approximation. 
SCORE GlobalAlignDimer(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; // Allocate DP matrices const size_t LM = uPrefixCountA*uPrefixCountB; SCORE *DPM_ = new SCORE[LM]; SCORE *DPD_ = new SCORE[LM]; SCORE *DPI_ = new SCORE[LM]; char *TBM_ = new char[LM]; char *TBD_ = new char[LM]; char *TBI_ = new char[LM]; DPM(0, 0) = 0; DPD(0, 0) = MINUS_INFINITY; DPI(0, 0) = MINUS_INFINITY; TBM(0, 0) = 'S'; TBD(0, 0) = '?'; TBI(0, 0) = '?'; DPM(1, 0) = MINUS_INFINITY; DPD(1, 0) = GapScoreMD(PA[0], PPTerm); DPI(1, 0) = MINUS_INFINITY; TBM(1, 0) = '?'; TBD(1, 0) = 'S'; TBI(1, 0) = '?'; DPM(0, 1) = MINUS_INFINITY; DPD(0, 1) = MINUS_INFINITY; DPI(0, 1) = GapScoreMI(PPTerm, PB[0]); TBM(0, 1) = '?'; TBD(0, 1) = '?'; TBI(0, 1) = 'S'; // Empty prefix of B is special case for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { // M=LetterA+LetterB, impossible with empty prefix DPM(uPrefixLengthA, 0) = MINUS_INFINITY; TBM(uPrefixLengthA, 0) = '?'; // D=LetterA+GapB DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + GapScoreDD(PA[uPrefixLengthA - 1], PPTerm); TBD(uPrefixLengthA, 0) = 'D'; // I=GapA+LetterB, impossible with empty prefix DPI(uPrefixLengthA, 0) = MINUS_INFINITY; TBI(uPrefixLengthA, 0) = '?'; } // Empty prefix of A is special case for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { // M=LetterA+LetterB, impossible with empty prefix DPM(0, uPrefixLengthB) = MINUS_INFINITY; TBM(0, uPrefixLengthB) = '?'; // D=LetterA+GapB, impossible with empty prefix DPD(0, uPrefixLengthB) = MINUS_INFINITY; TBD(0, uPrefixLengthB) = '?'; // I=GapA+LetterB DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + GapScoreII(PPTerm, PB[uPrefixLengthB - 1]); TBI(0, uPrefixLengthB) = 'I'; } // ============ // Main DP loop // ============ for (unsigned uPrefixLengthB = 
1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; { // Match M=LetterA+LetterB SCORE scoreLL = ScoreProfPosDimer(PPA, PPB); SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreMM(PPA, PPB); SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreDM(PPA, PPB); SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreIM(PPA, PPB); SCORE scoreBest = scoreMM; char c = 'M'; if (scoreDM > scoreBest) { scoreBest = scoreDM; c = 'D'; } if (scoreIM > scoreBest) { scoreBest = scoreIM; c = 'I'; } DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; TBM(uPrefixLengthA, uPrefixLengthB) = c; } { // Delete D=LetterA+GapB SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + GapScoreMD(PPA, PPB); SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + GapScoreDD(PPA, PPB); SCORE scoreID = DPI(uPrefixLengthA-1, uPrefixLengthB) + GapScoreID(PPA, PPB); SCORE scoreBest = scoreMD; char c = 'M'; if (scoreDD > scoreBest) { scoreBest = scoreDD; c = 'D'; } if (scoreID > scoreBest) { scoreBest = scoreID; c = 'I'; } DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; TBD(uPrefixLengthA, uPrefixLengthB) = c; } { // Insert I=GapA+LetterB SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + GapScoreMI(PPA, PPB); SCORE scoreDI = DPD(uPrefixLengthA, uPrefixLengthB-1) + GapScoreDI(PPA, PPB); SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + GapScoreII(PPA, PPB); SCORE scoreBest = scoreMI; char c = 'M'; if (scoreDI > scoreBest) { scoreBest = scoreDI; c = 'D'; } if (scoreII > scoreBest) { scoreBest = scoreII; c = 'I'; } DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; TBI(uPrefixLengthA, uPrefixLengthB) = c; } } } #if TRACE Log("DPM:\n"); ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPD:\n"); ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPI:\n"); 
ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBM:\n"); ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBD:\n"); ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBI:\n"); ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB); #endif SCORE Score = TraceBackDimer(DPM_, DPD_, DPI_, TBM_, TBD_, TBI_, uLengthA, uLengthB, Path); #if TRACE Log("GlobalAlignDimer score = %.3g\n", Score); #endif delete[] DPM_; delete[] DPD_; delete[] DPI_; delete[] TBM_; delete[] TBD_; delete[] TBI_; return Score; } static SCORE TraceBackDimer( const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, const char *TBM_, const char *TBD_, const char *TBI_, unsigned uLengthA, unsigned uLengthB, PWPath &Path) { const unsigned uPrefixCountA = uLengthA + 1; unsigned uPrefixLengthA = uLengthA; unsigned uPrefixLengthB = uLengthB; char cEdge = 'M'; SCORE scoreMax = DPM(uLengthA, uLengthB); if (DPD(uLengthA, uLengthB) > scoreMax) { scoreMax = DPD(uLengthA, uLengthB); cEdge = 'D'; } if (DPI(uLengthA, uLengthB) > scoreMax) { scoreMax = DPI(uLengthA, uLengthB); cEdge = 'I'; } for (;;) { if (0 == uPrefixLengthA && 0 == uPrefixLengthB) break; PWEdge Edge; Edge.cType = cEdge; Edge.uPrefixLengthA = uPrefixLengthA; Edge.uPrefixLengthB = uPrefixLengthB; Path.PrependEdge(Edge); #if TRACE Log("PLA=%u PLB=%u Edge=%c\n", uPrefixLengthA, uPrefixLengthB, cEdge); #endif switch (cEdge) { case 'M': assert(uPrefixLengthA > 0 && uPrefixLengthB > 0); cEdge = TBM(uPrefixLengthA, uPrefixLengthB); --uPrefixLengthA; --uPrefixLengthB; break; case 'D': assert(uPrefixLengthA > 0); cEdge = TBD(uPrefixLengthA, uPrefixLengthB); --uPrefixLengthA; break; case 'I': assert(uPrefixLengthB > 0); cEdge = TBI(uPrefixLengthA, uPrefixLengthB); --uPrefixLengthB; break; default: Quit("Invalid edge PLA=%u PLB=%u %c", uPrefixLengthA, uPrefixLengthB, cEdge); } } #if TRACE Path.LogMe(); #endif return scoreMax; } globals.cpp0000664000175000017500000001361712360262614011307 0ustar bobbob#if WIN32 #include #include #endif 
#include "muscle.h"
// NOTE(review): the header names of the eight directives below are missing
// in this copy of the file; restore them from the upstream source.
#include
#include
#include
#include
#include
#include
#include
#include

#ifndef MAX_PATH
#define MAX_PATH 260
#endif

// Name of the log file; an empty string disables logging (see Log).
static char g_strListFileName[MAX_PATH];
// True to open the log file in append mode, false to truncate it.
static bool g_bListFileAppend = false;

static SEQWEIGHT g_SeqWeight = SEQWEIGHT_Undefined;

// Stores the sequence weighting method returned by GetSeqWeightMethod.
void SetSeqWeightMethod(SEQWEIGHT Method)
{
    g_SeqWeight = Method;
}

// Returns the value last passed to SetSeqWeightMethod
// (SEQWEIGHT_Undefined if it was never called).
SEQWEIGHT GetSeqWeightMethod()
{
    return g_SeqWeight;
}

// Sets the log file name and open mode used by Log.
// Terminates via Quit if the name does not fit in the MAX_PATH buffer;
// previously this was only an assert, so release builds overflowed the
// buffer.
void SetListFileName(const char *ptrListFileName, bool bAppend)
{
    if (strlen(ptrListFileName) >= MAX_PATH)
        Quit("Log file name too long (limit %d characters)", MAX_PATH - 1);
    strcpy(g_strListFileName, ptrListFileName);
    g_bListFileAppend = bAppend;
}

// printf-style write to the log file. Does nothing when no log file name is
// set. The file is opened on first use and kept open; if it cannot be
// opened the process exits with EXIT_NotStarted. Messages longer than the
// 4095-character buffer are truncated.
void Log(const char szFormat[], ...)
{
    if (0 == g_strListFileName[0])
        return;

    static FILE *f = NULL;
    if (NULL == f)
    {
        const char *mode = g_bListFileAppend ? "a" : "w";
        f = _fsopen(g_strListFileName, mode, _SH_DENYNO);
    }
    if (NULL == f)
    {
        perror(g_strListFileName);
        exit(EXIT_NotStarted);
    }

    char szStr[4096];
    va_list ArgList;
    va_start(ArgList, szFormat);
    // Bounded: callers such as Warning pass strings that may already be
    // close to 4096 characters.
    vsnprintf(szStr, sizeof(szStr), szFormat, ArgList);
    va_end(ArgList);

    fprintf(f, "%s", szStr);
    fflush(f);
}

// Returns the current local time as text without the trailing newline.
// The result lives in a static buffer that is overwritten by the next call.
const char *GetTimeAsStr()
{
    static char szStr[32];
    time_t t;
    time(&t);
    struct tm *ptmCurrentTime = localtime(&t);
    strcpy(szStr, asctime(ptmCurrentTime));
    // asctime output ends in '\n' at index 24; cut it off.
    assert('\n' == szStr[24]);
    szStr[24] = 0;
    return szStr;
}

// Exit immediately with error message, printf-style.
// The message goes to stderr and to the log; the process then exits with
// EXIT_FatalError.
void Quit(const char szFormat[], ...)
{
    va_list ArgList;
    char szStr[4096];

    va_start(ArgList, szFormat);
    vsnprintf(szStr, sizeof(szStr), szFormat, ArgList);
    va_end(ArgList);

    fprintf(stderr, "\n*** ERROR ***  %s\n", szStr);

    Log("\n*** FATAL ERROR ***  ");
    Log("%s\n", szStr);
    Log("Stopped %s\n", GetTimeAsStr());

#ifdef WIN32
    if (IsDebuggerPresent())
    {
        int iBtn = MessageBox(NULL, szStr, "muscle", MB_ICONERROR | MB_OKCANCEL);
        if (IDCANCEL == iBtn)
            Break();
    }
#endif
    exit(EXIT_FatalError);
}

// printf-style warning written to stderr and to the log; execution continues.
void Warning(const char szFormat[], ...)
{
    va_list ArgList;
    char szStr[4096];

    va_start(ArgList, szFormat);
    vsnprintf(szStr, sizeof(szStr), szFormat, ArgList);
    va_end(ArgList);

    fprintf(stderr, "\n*** WARNING *** %s\n", szStr);
    Log("\n*** WARNING ***  %s\n", szStr);
}

// Remove leading and trailing blanks from string
void TrimBlanks(char szStr[])
{
    TrimLeadingBlanks(szStr);
    TrimTrailingBlanks(szStr);
}

// Removes leading ' ' characters in place (tabs are not treated as blanks).
void TrimLeadingBlanks(char szStr[])
{
    size_t uBlanks = 0;
    while (' ' == szStr[uBlanks])
        ++uBlanks;
    // One move of the remainder (including the terminating NUL) instead of
    // one move per blank.
    if (uBlanks > 0)
        memmove(szStr, szStr + uBlanks, strlen(szStr + uBlanks) + 1);
}

// Removes trailing ' ' characters in place.
void TrimTrailingBlanks(char szStr[])
{
    size_t n = strlen(szStr);
    while (n > 0 && szStr[n-1] == ' ')
        szStr[--n] = 0;
}

bool Verbose()
{
    return true;
}

// Converts text to SCORE using atof (no error reporting).
SCORE StrToScore(const char *pszStr)
{
    return (SCORE) atof(pszStr);
}

// Removes every space, tab, newline and carriage return from szStr in place.
void StripWhitespace(char szStr[])
{
    unsigned uOutPos = 0;
    unsigned uInPos = 0;
    while (char c = szStr[uInPos++])
        if (' ' != c && '\t' != c && '\n' != c && '\r' != c)
            szStr[uOutPos++] = c;
    szStr[uOutPos] = 0;
}

// Removes every '-' character from szStr in place.
void StripGaps(char szStr[])
{
    unsigned uOutPos = 0;
    unsigned uInPos = 0;
    while (char c = szStr[uInPos++])
        if ('-' != c)
            szStr[uOutPos++] = c;
    szStr[uOutPos] = 0;
}

// True if Str is an optional '+' or '-' followed by one or more decimal
// digits. A sign with no digits is rejected (it was previously accepted).
bool IsValidSignedInteger(const char *Str)
{
    if (0 == strlen(Str))
        return false;
    if ('+' == *Str || '-' == *Str)
        ++Str;
    if (0 == *Str)
        return false;
    while (char c = *Str++)
        // <ctype.h> functions require a value representable as unsigned char.
        if (!isdigit((unsigned char) c))
            return false;
    return true;
}

// True if Str is one or more decimal digits.
bool IsValidInteger(const char *Str)
{
    if (0 == strlen(Str))
        return false;
    while (char c = *Str++)
        if (!isdigit((unsigned char) c))
            return false;
    return true;
}

// Is c valid as first character in an identifier?
bool isidentf(char c)
{
    return isalpha((unsigned char) c) || '_' == c;
}

// Is c valid character in an identifier?
bool isident(char c) { return isalpha(c) || isdigit(c) || '_' == c; } bool IsValidIdentifier(const char *Str) { if (!isidentf(Str[0])) return false; while (char c = *Str++) if (!isident(c)) return false; return true; } void SetLogFile() { const char *strFileName = ValueOpt("loga"); if (0 != strFileName) g_bListFileAppend = true; else strFileName = ValueOpt("log"); if (0 == strFileName) return; strcpy(g_strListFileName, strFileName); } // Get filename, stripping any extension and directory parts. void NameFromPath(const char szPath[], char szName[], unsigned uBytes) { if (0 == uBytes) return; const char *pstrLastSlash = strrchr(szPath, '/'); const char *pstrLastBackslash = strrchr(szPath, '\\'); const char *pstrLastDot = strrchr(szPath, '.'); const char *pstrLastSep = pstrLastSlash > pstrLastBackslash ? pstrLastSlash : pstrLastBackslash; const char *pstrBegin = pstrLastSep ? pstrLastSep + 1 : szPath; const char *pstrEnd = pstrLastDot ? pstrLastDot - 1 : szPath + strlen(szPath); unsigned uNameLength = (unsigned) (pstrEnd - pstrBegin + 1); if (uNameLength > uBytes - 1) uNameLength = uBytes - 1; memcpy(szName, pstrBegin, uNameLength); szName[uNameLength] = 0; } char *strsave(const char *s) { char *ptrCopy = strdup(s); if (0 == ptrCopy) Quit("Out of memory"); return ptrCopy; } bool IsValidFloatChar(char c) { return isdigit(c) || '.' == c || 'e' == c || 'E' == c || 'd' == c || 'D' == c || '.' 
== c || '+' == c || '-' == c; } void Call_MY_ASSERT(const char *file, int line, bool b, const char *msg) { if (b) return; Quit("%s(%d): MY_ASSERT(%s)", file, line, msg); } static size_t g_MemTotal; void MemPlus(size_t Bytes, char *Where) { g_MemTotal += Bytes; Log("+%10u %6u %6u %s\n", (unsigned) Bytes, (unsigned) GetMemUseMB(), (unsigned) (g_MemTotal/1000000), Where); } void MemMinus(size_t Bytes, char *Where) { g_MemTotal -= Bytes; Log("-%10u %6u %6u %s\n", (unsigned) Bytes, (unsigned) GetMemUseMB(), (unsigned) (g_MemTotal/1000000), Where); } globalslinux.cpp0000664000175000017500000000553212360262614012364 0ustar bobbob#include "muscle.h" #if defined(__linux__) #include #include #include #include #include #include const int ONE_MB = 1000000; const int MEM_WARNING_THRESHOLD = 20*ONE_MB; double GetNAN() { static unsigned long nan[2]={0xffffffff, 0x7fffffff}; double dNAN = *( double* )nan; return dNAN; } double g_dNAN = GetNAN(); void chkmem(const char szMsg[]) { //assert(_CrtCheckMemory()); } void Break() { //DebugBreak(); } static char szCmdLine[4096]; void *ptrStartBreak = sbrk(0); const char *GetCmdLine() { return szCmdLine; } double GetMemUseMB() { static char statm[64]; static int PageSize; if (0 == statm[0]) { PageSize = sysconf(_SC_PAGESIZE); pid_t pid = getpid(); sprintf(statm, "/proc/%d/statm", (int) pid); } int fd = open(statm, O_RDONLY); if (-1 == fd) return -1; char Buffer[64]; int n = read(fd, Buffer, sizeof(Buffer) - 1); close(fd); fd = -1; if (n <= 0) { static bool Warned = false; if (!Warned) { Warned = true; Warning("*Warning* Cannot read %s errno=%d %s", statm, errno, strerror(errno)); } return 0; } Buffer[n] = 0; int Pages = atoi(Buffer); return ((double) Pages * (double) PageSize)/1e6; } void SaveCmdLine(int argc, char *argv[]) { for (int i = 0; i < argc; ++i) { if (i > 0) strcat(szCmdLine, " "); strcat(szCmdLine, argv[i]); } } double dPeakMemUseMB = 0; double GetPeakMemUseMB() { CheckMemUse(); return dPeakMemUseMB; } double GetCPUGHz() { 
double dGHz = 2.5; const char *e = getenv("CPUGHZ"); if (0 != e) dGHz = atof(e); return dGHz; } void CheckMemUse() { double dMB = GetMemUseMB(); if (dMB > dPeakMemUseMB) dPeakMemUseMB = dMB; } double GetRAMSizeMB() { const double DEFAULT_RAM = 500; static double RAMMB = 0; if (RAMMB != 0) return RAMMB; int fd = open("/proc/meminfo", O_RDONLY); if (-1 == fd) { static bool Warned = false; if (!Warned) { Warned = true; Warning("*Warning* Cannot open /proc/meminfo errno=%d %s", errno, strerror(errno)); } return DEFAULT_RAM; } char Buffer[1024]; int n = read(fd, Buffer, sizeof(Buffer) - 1); close(fd); fd = -1; if (n <= 0) { static bool Warned = false; if (!Warned) { Warned = true; Warning("*Warning* Cannot read /proc/meminfo errno=%d %s", errno, strerror(errno)); } return DEFAULT_RAM; } Buffer[n] = 0; char *pMem = strstr(Buffer, "MemTotal: "); if (0 == pMem) { static bool Warned = false; if (!Warned) { Warned = true; Warning("*Warning* 'MemTotal:' not found in /proc/meminfo"); } return DEFAULT_RAM; } int Bytes = atoi(pMem+9)*1000; return ((double) Bytes)/1e6; } #endif // !WIN32 globalsosx.cpp0000664000175000017500000000345012360262614012033 0ustar bobbob#ifdef __MACH__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include const double DEFAULT_RAM = 1e9; const double DEFAULT_MEM_USE = 1e6; double GetNAN() { static unsigned long nan[2]={0xffffffff, 0x7fffffff}; double dNAN = *( double* )nan; return dNAN; } double g_dNAN = GetNAN(); double GetRAMSize() { static double CACHED_RAM = 0; if (CACHED_RAM != 0) return CACHED_RAM; uint64_t MemPages = 0; size_t Len = sizeof(MemPages); if (sysctlbyname("hw.memsize", &MemPages, &Len, NULL, 0) < 0) return DEFAULT_RAM; return (double) MemPages; } double GetRAMSizeMB() { return GetRAMSize()/1e6; } static double g_uPeakMemUseBytes; double GetMaxMemUseBytes() { return g_uPeakMemUseBytes; } double GetPeakMemUseBytes() { return 
GetMaxMemUseBytes(); } double GetMemUseBytes() { task_t mytask = mach_task_self(); struct task_basic_info ti; memset((void *) &ti, 0, sizeof(ti)); mach_msg_type_number_t count = TASK_BASIC_INFO_COUNT; kern_return_t ok = task_info(mytask, TASK_BASIC_INFO, (task_info_t) &ti, &count); if (ok == KERN_INVALID_ARGUMENT) return DEFAULT_MEM_USE; if (ok != KERN_SUCCESS) return DEFAULT_MEM_USE; double uBytes = (double ) ti.resident_size; if (uBytes > g_uPeakMemUseBytes) g_uPeakMemUseBytes = uBytes; return uBytes; } double GetMemUseMB() { return GetMemUseBytes()/1e6; } void OSInit() { } #endif // __MACH__ globalsother.cpp0000664000175000017500000000131112360262614012335 0ustar bobbob#include "muscle.h" #if !defined(__linux__) && !defined(_MSC_VER) && !defined(__MACH__) double GetNAN() { return 0.0; } double g_dNAN = GetNAN(); void chkmem(const char szMsg[]) { } void Break() { } char szCmdLine[4096]; const char *GetCmdLine() { return "muscle"; } double GetMemUseMB() { return 100.0; } void SaveCmdLine(int argc, char *argv[]) { for (int i = 0; i < argc; ++i) { if (i > 0) strcat(szCmdLine, " "); strcat(szCmdLine, argv[i]); } } double GetPeakMemUseMB() { return 100.0; } double GetCPUGHz() { return 2.0; } void CheckMemUse() { } double GetRAMSizeMB() { return 500.0; } #endif globalswin32.cpp0000664000175000017500000000453112360262614012165 0ustar bobbob#include "muscle.h" #if WIN32 #include #include #include #include #include void DebugPrintf(const char *szFormat, ...) 
{ va_list ArgList; char szStr[4096]; va_start(ArgList, szFormat); vsprintf(szStr, szFormat, ArgList); OutputDebugString(szStr); } double GetNAN() { static unsigned long nan[2]={0xffffffff, 0x7fffffff}; double dNAN = *( double* )nan; assert(_isnan(dNAN)); return dNAN; } double g_dNAN = GetNAN(); void chkmem(const char szMsg[]) { if (!_CrtCheckMemory()) Quit("chkmem(%s)", szMsg); } void Break() { if (IsDebuggerPresent()) DebugBreak(); } const char *GetCmdLine() { return GetCommandLine(); } static unsigned uPeakMemUseBytes; double GetRAMSizeMB() { MEMORYSTATUS MS; GlobalMemoryStatus(&MS); return MS.dwAvailPhys/1e6; } double GetMemUseMB() { HANDLE hProc = GetCurrentProcess(); PROCESS_MEMORY_COUNTERS PMC; BOOL bOk = GetProcessMemoryInfo(hProc, &PMC, sizeof(PMC)); assert(bOk); //printf("GetMemUseMB()\n"); //printf("%12u PageFaultCount\n", (unsigned) PMC.PageFaultCount); //printf("%12u PagefileUsage\n", (unsigned) PMC.PagefileUsage); //printf("%12u PeakPagefileUsage\n", (unsigned) PMC.PeakPagefileUsage); //printf("%12u WorkingSetSize\n", (unsigned) PMC.WorkingSetSize); //printf("%12u PeakWorkingSetSize\n", (unsigned) PMC.PeakWorkingSetSize); //printf("%12u QuotaPagedPoolUsage\n", (unsigned) PMC.QuotaPagedPoolUsage); //printf("%12u QuotaPeakPagedPoolUsage\n", (unsigned) PMC.QuotaPeakPagedPoolUsage); //printf("%12u QuotaNonPagedPoolUsage\n", (unsigned) PMC.QuotaNonPagedPoolUsage); //printf("%12u QuotaPeakNonPagedPoolUsage\n", (unsigned) PMC.QuotaPeakNonPagedPoolUsage); unsigned uBytes = (unsigned) PMC.WorkingSetSize; if (uBytes > uPeakMemUseBytes) uPeakMemUseBytes = uBytes; return (uBytes + 500000.0)/1000000.0; } double GetPeakMemUseMB() { return (uPeakMemUseBytes + 500000.0)/1000000.0; } void CheckMemUse() { // Side-effect: sets peak usage in uPeakMemUseBytes GetMemUseMB(); } double GetCPUGHz() { double dGHz = 2.5; const char *e = getenv("CPUGHZ"); if (0 != e) dGHz = atof(e); if (dGHz < 0.1 || dGHz > 1000.0) Quit("Invalid value '%s' for environment variable CPUGHZ", e); 
return dGHz; } #endif // WIN32 gonnet.cpp0000664000175000017500000005745512360262614011166 0ustar bobbob#include "muscle.h" #include "gonnet.h" #define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ { A/4.0, C/4.0, D/4.0, E/4.0, F/4.0, G/4.0, H/4.0, I/4.0, K/4.0, L/4.0, M/4.0, N/4.0, P/4.0, Q/4.0, R/4.0, S/4.0, T/4.0, V/4.0, W/4.0, Y/4.0 }, static double Gonnet80[20][20] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 1990, 1140, 930, 1070, 600, 1130, 850, 810, 940, 810, 980, 900, 1080, 1020, 880, 1380, 1190, 1180, 370, 590) // A ROW( 1140, 2780, 310, 300, 850, 630, 810, 700, 360, 690, 850, 690, 310, 480, 640, 1090, 900, 1030, 810, 920) // C ROW( 930, 310, 2200, 1550, 130, 980, 1070, 180, 1030, 150, 360, 1450, 820, 1150, 800, 1100, 1000, 350, 0, 550) // D ROW( 1070, 300, 1550, 2120, 220, 770, 1070, 510, 1280, 490, 710, 1110, 890, 1470, 1010, 1050, 970, 730, 260, 500) // E ROW( 600, 850, 130, 220, 2380, 90, 980, 1090, 350, 1310, 1270, 490, 310, 540, 340, 470, 620, 930, 1400, 1730) // F ROW( 1130, 630, 980, 770, 90, 2210, 710, 100, 740, 200, 410, 1060, 660, 800, 810, 1080, 720, 380, 430, 300) // G ROW( 850, 810, 1070, 1070, 980, 710, 2510, 600, 1120, 670, 860, 1330, 790, 1380, 1140, 990, 1000, 590, 810, 1450) // H ROW( 810, 700, 180, 510, 1090, 100, 600, 2100, 650, 1460, 1490, 530, 490, 640, 530, 620, 960, 1650, 610, 770) // I ROW( 940, 360, 1030, 1280, 350, 740, 1120, 650, 2090, 660, 870, 1220, 870, 1410, 1570, 1040, 1090, 700, 350, 640) // K ROW( 810, 690, 150, 490, 1310, 200, 670, 1460, 660, 2010, 1550, 450, 660, 850, 660, 600, 750, 1270, 800, 890) // L ROW( 980, 850, 360, 710, 1270, 410, 860, 1490, 870, 1550, 2410, 620, 460, 1050, 710, 830, 990, 1250, 790, 870) // M ROW( 900, 690, 1450, 1110, 490, 1060, 1330, 530, 1220, 450, 620, 2210, 760, 1180, 1020, 1290, 1170, 550, 380, 850) // N ROW( 1080, 310, 820, 890, 310, 660, 790, 490, 870, 660, 460, 760, 2380, 1000, 790, 1100, 1040, 670, 120, 480) // P ROW( 1020, 480, 1150, 1470, 540, 800, 
1380, 640, 1410, 850, 1050, 1180, 1000, 2190, 1350, 1090, 1060, 730, 620, 710) // Q ROW( 880, 640, 800, 1010, 340, 810, 1140, 530, 1570, 660, 710, 1020, 790, 1350, 2210, 970, 970, 640, 830, 740) // R ROW( 1380, 1090, 1100, 1050, 470, 1080, 990, 620, 1040, 600, 830, 1290, 1100, 1090, 970, 2020, 1490, 810, 520, 780) // S ROW( 1190, 900, 1000, 970, 620, 720, 1000, 960, 1090, 750, 990, 1170, 1040, 1060, 970, 1490, 2050, 1150, 370, 660) // T ROW( 1180, 1030, 350, 730, 930, 380, 590, 1650, 700, 1270, 1250, 550, 670, 730, 640, 810, 1150, 2040, 440, 770) // V ROW( 370, 810, 0, 260, 1400, 430, 810, 610, 350, 800, 790, 380, 120, 620, 830, 520, 370, 440, 2970, 1470) // W ROW( 590, 920, 550, 500, 1730, 300, 1450, 770, 640, 890, 870, 850, 480, 710, 740, 780, 660, 770, 1470, 2470) // Y }; static double Gonnet120[20][20] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 1550, 950, 780, 870, 480, 930, 700, 690, 770, 660, 790, 760, 900, 840, 730, 1120, 980, 960, 280, 480) // A ROW( 950, 2400, 270, 280, 700, 510, 650, 600, 320, 570, 700, 550, 280, 400, 510, 890, 750, 850, 670, 760) // C ROW( 780, 270, 1780, 1310, 90, 820, 890, 160, 880, 140, 320, 1220, 680, 970, 690, 910, 830, 310, 0, 430) // D ROW( 870, 280, 1310, 1680, 180, 650, 900, 410, 1070, 390, 560, 950, 740, 1210, 860, 870, 810, 580, 180, 400) // E ROW( 480, 700, 90, 180, 1980, 40, 820, 930, 290, 1110, 1070, 380, 240, 430, 280, 380, 490, 790, 1230, 1510) // F ROW( 930, 510, 820, 650, 40, 1860, 590, 90, 620, 140, 310, 890, 550, 660, 660, 900, 610, 310, 300, 220) // G ROW( 700, 650, 890, 900, 820, 590, 2060, 480, 940, 540, 680, 1100, 650, 1130, 950, 820, 820, 490, 680, 1220) // H ROW( 690, 600, 160, 410, 930, 90, 480, 1680, 520, 1240, 1250, 410, 400, 530, 430, 520, 790, 1380, 500, 650) // I ROW( 770, 320, 880, 1070, 290, 620, 940, 520, 1650, 520, 690, 1010, 720, 1160, 1320, 860, 900, 570, 280, 520) // K ROW( 660, 570, 140, 390, 1110, 140, 540, 1240, 520, 1620, 1300, 350, 520, 660, 520, 490, 620, 1090, 670, 760) // L ROW( 
790, 700, 320, 560, 1070, 310, 680, 1250, 690, 1300, 1910, 500, 400, 820, 580, 670, 800, 1060, 650, 740) // M ROW( 760, 550, 1220, 950, 380, 890, 1100, 410, 1010, 350, 500, 1760, 640, 970, 860, 1060, 960, 460, 280, 680) // N ROW( 900, 280, 680, 740, 240, 550, 650, 400, 720, 520, 400, 640, 2010, 820, 660, 910, 860, 540, 70, 370) // P ROW( 840, 400, 970, 1210, 430, 660, 1130, 530, 1160, 660, 820, 970, 820, 1700, 1120, 890, 870, 600, 470, 580) // Q ROW( 730, 510, 690, 860, 280, 660, 950, 430, 1320, 520, 580, 860, 660, 1120, 1790, 810, 800, 520, 660, 590) // R ROW( 1120, 890, 910, 870, 380, 900, 820, 520, 860, 490, 670, 1060, 910, 890, 810, 1560, 1220, 680, 390, 610) // S ROW( 980, 750, 830, 810, 490, 610, 820, 790, 900, 620, 800, 960, 860, 870, 800, 1220, 1600, 930, 290, 540) // T ROW( 960, 850, 310, 580, 790, 310, 490, 1380, 570, 1090, 1060, 460, 540, 600, 520, 680, 930, 1610, 370, 630) // V ROW( 280, 670, 0, 180, 1230, 300, 680, 500, 280, 670, 650, 280, 70, 470, 660, 390, 290, 370, 2620, 1290) // W ROW( 480, 760, 430, 400, 1510, 220, 1220, 650, 520, 760, 740, 680, 370, 580, 590, 610, 540, 630, 1290, 2070) // Y }; static SCORE Gonnet160[20][20] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 1240, 810, 670, 740, 400, 800, 600, 600, 660, 560, 660, 660, 770, 710, 620, 940, 830, 790, 230, 410) // A ROW( 810, 2130, 250, 260, 600, 440, 550, 530, 300, 490, 590, 470, 260, 360, 430, 760, 640, 720, 570, 650) // C ROW( 670, 250, 1480, 1120, 80, 710, 770, 160, 770, 130, 280, 1040, 590, 840, 620, 780, 720, 290, 0, 360) // D ROW( 740, 260, 1120, 1370, 160, 570, 770, 350, 910, 330, 470, 830, 640, 1010, 750, 750, 700, 480, 140, 340) // E ROW( 400, 600, 80, 160, 1690, 20, 710, 810, 250, 970, 920, 310, 200, 370, 250, 330, 420, 700, 1100, 1340) // F ROW( 800, 440, 710, 570, 20, 1600, 510, 80, 540, 110, 260, 760, 480, 570, 570, 770, 540, 260, 230, 180) // G ROW( 600, 550, 770, 770, 710, 510, 1710, 410, 800, 460, 570, 930, 560, 950, 810, 700, 700, 430, 590, 1050) // H ROW( 600, 
530, 160, 350, 810, 80, 410, 1370, 430, 1080, 1070, 340, 350, 460, 370, 450, 660, 1180, 440, 580) // I ROW( 660, 300, 770, 910, 250, 540, 800, 430, 1330, 440, 570, 860, 620, 980, 1130, 740, 760, 480, 240, 430) // K ROW( 560, 490, 130, 330, 970, 110, 460, 1080, 440, 1350, 1120, 300, 430, 540, 430, 420, 540, 950, 580, 670) // L ROW( 660, 590, 280, 470, 920, 260, 570, 1070, 570, 1120, 1540, 420, 360, 660, 490, 550, 670, 920, 560, 650) // M ROW( 660, 470, 1040, 830, 310, 760, 930, 340, 860, 300, 420, 1430, 560, 830, 740, 890, 810, 400, 230, 560) // N ROW( 770, 260, 590, 640, 200, 480, 560, 350, 620, 430, 360, 560, 1740, 700, 570, 780, 740, 460, 40, 300) // P ROW( 710, 360, 840, 1010, 370, 570, 950, 460, 980, 540, 660, 830, 700, 1340, 950, 760, 740, 510, 380, 490) // Q ROW( 620, 430, 620, 750, 250, 570, 810, 370, 1130, 430, 490, 740, 570, 950, 1490, 690, 690, 440, 540, 490) // R ROW( 940, 760, 780, 750, 330, 770, 700, 450, 740, 420, 550, 890, 780, 760, 690, 1220, 1010, 580, 310, 500) // S ROW( 830, 640, 720, 700, 420, 540, 700, 660, 760, 540, 670, 810, 740, 740, 690, 1010, 1280, 780, 240, 460) // T ROW( 790, 720, 290, 480, 700, 260, 430, 1180, 480, 950, 920, 400, 460, 510, 440, 580, 780, 1310, 330, 540) // V ROW( 230, 570, 0, 140, 1100, 230, 590, 440, 240, 580, 560, 230, 40, 380, 540, 310, 240, 330, 2360, 1160) // W ROW( 410, 650, 360, 340, 1340, 180, 1050, 580, 430, 670, 650, 560, 300, 490, 490, 500, 460, 540, 1160, 1780) // Y }; double Gonnet16[21][21] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 124, 81, 67, 74, 40, 80, 60, 60, 66, 56, 66, 66, 77, 71, 62, 94, 83, 79, 23, 41) // A ROW( 81, 213, 25, 26, 60, 44, 55, 53, 30, 49, 59, 47, 26, 36, 43, 76, 64, 72, 57, 65) // C ROW( 67, 25, 148, 112, 8, 71, 77, 16, 77, 13, 28, 104, 59, 84, 62, 78, 72, 29, 0, 36) // D ROW( 74, 26, 112, 137, 16, 57, 77, 35, 91, 33, 47, 83, 64, 101, 75, 75, 70, 48, 14, 34) // E ROW( 40, 60, 8, 16, 169, 2, 71, 81, 25, 97, 92, 31, 20, 37, 25, 33, 42, 70, 110, 134) // F ROW( 80, 44, 71, 
57, 2, 160, 51, 8, 54, 11, 26, 76, 48, 57, 57, 77, 54, 26, 23, 18) // G ROW( 60, 55, 77, 77, 71, 51, 171, 41, 80, 46, 57, 93, 56, 95, 81, 70, 70, 43, 59, 105) // H ROW( 60, 53, 16, 35, 81, 8, 41, 137, 43, 108, 107, 34, 35, 46, 37, 45, 66, 118, 44, 58) // I ROW( 66, 30, 77, 91, 25, 54, 80, 43, 133, 44, 57, 86, 62, 98, 113, 74, 76, 48, 24, 43) // K ROW( 56, 49, 13, 33, 97, 11, 46, 108, 44, 135, 112, 30, 43, 54, 43, 42, 54, 95, 58, 67) // L ROW( 66, 59, 28, 47, 92, 26, 57, 107, 57, 112, 154, 42, 36, 66, 49, 55, 67, 92, 56, 65) // M ROW( 66, 47, 104, 83, 31, 76, 93, 34, 86, 30, 42, 143, 56, 83, 74, 89, 81, 40, 23, 56) // N ROW( 77, 26, 59, 64, 20, 48, 56, 35, 62, 43, 36, 56, 174, 70, 57, 78, 74, 46, 4, 30) // P ROW( 71, 36, 84, 101, 37, 57, 95, 46, 98, 54, 66, 83, 70, 134, 95, 76, 74, 51, 38, 49) // Q ROW( 62, 43, 62, 75, 25, 57, 81, 37, 113, 43, 49, 74, 57, 95, 149, 69, 69, 44, 54, 49) // R ROW( 94, 76, 78, 75, 33, 77, 70, 45, 74, 42, 55, 89, 78, 76, 69, 122, 101, 58, 31, 50) // S ROW( 83, 64, 72, 70, 42, 54, 70, 66, 76, 54, 67, 81, 74, 74, 69, 101, 128, 78, 24, 46) // T ROW( 79, 72, 29, 48, 70, 26, 43, 118, 48, 95, 92, 40, 46, 51, 44, 58, 78, 131, 33, 54) // V ROW( 23, 57, 0, 14, 110, 23, 59, 44, 24, 58, 56, 23, 4, 38, 54, 31, 24, 33, 236, 116) // W ROW( 41, 65, 36, 34, 134, 18, 105, 58, 43, 67, 65, 56, 30, 49, 49, 50, 46, 54, 116, 178) // Y }; static double Gonnet250[20][20] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 760, 570, 490, 520, 290, 570, 440, 440, 480, 400, 450, 490, 550, 500, 460, 630, 580, 530, 160, 300) // A ROW( 570, 1670, 200, 220, 440, 320, 390, 410, 240, 370, 430, 340, 210, 280, 300, 530, 470, 520, 420, 470) // C ROW( 490, 200, 990, 790, 70, 530, 560, 140, 570, 120, 220, 740, 450, 610, 490, 570, 520, 230, 0, 240) // D ROW( 520, 220, 790, 880, 130, 440, 560, 250, 640, 240, 320, 610, 470, 690, 560, 540, 510, 330, 90, 250) // E ROW( 290, 440, 70, 130, 1220, 0, 510, 620, 190, 720, 680, 210, 140, 260, 200, 240, 300, 530, 880, 1030) // F ROW( 
570, 320, 530, 440, 0, 1180, 380, 70, 410, 80, 170, 560, 360, 420, 420, 560, 410, 190, 120, 120) // G ROW( 440, 390, 560, 560, 510, 380, 1120, 300, 580, 330, 390, 640, 410, 640, 580, 500, 490, 320, 440, 740) // H ROW( 440, 410, 140, 250, 620, 70, 300, 920, 310, 800, 770, 240, 260, 330, 280, 340, 460, 830, 340, 450) // I ROW( 480, 240, 570, 640, 190, 410, 580, 310, 840, 310, 380, 600, 460, 670, 790, 530, 530, 350, 170, 310) // K ROW( 400, 370, 120, 240, 720, 80, 330, 800, 310, 920, 800, 220, 290, 360, 300, 310, 390, 700, 450, 520) // L ROW( 450, 430, 220, 320, 680, 170, 390, 770, 380, 800, 950, 300, 280, 420, 350, 380, 460, 680, 420, 500) // M ROW( 490, 340, 740, 610, 210, 560, 640, 240, 600, 220, 300, 900, 430, 590, 550, 610, 570, 300, 160, 380) // N ROW( 550, 210, 450, 470, 140, 360, 410, 260, 460, 290, 280, 430, 1280, 500, 430, 560, 530, 340, 20, 210) // P ROW( 500, 280, 610, 690, 260, 420, 640, 330, 670, 360, 420, 590, 500, 790, 670, 540, 520, 370, 250, 350) // Q ROW( 460, 300, 490, 560, 200, 420, 580, 280, 790, 300, 350, 550, 430, 670, 990, 500, 500, 320, 360, 340) // R ROW( 630, 530, 570, 540, 240, 560, 500, 340, 530, 310, 380, 610, 560, 540, 500, 740, 670, 420, 190, 330) // S ROW( 580, 470, 520, 510, 300, 410, 490, 460, 530, 390, 460, 570, 530, 520, 500, 670, 770, 520, 170, 330) // T ROW( 530, 520, 230, 330, 530, 190, 320, 830, 350, 700, 680, 300, 340, 370, 320, 420, 520, 860, 260, 410) // V ROW( 160, 420, 0, 90, 880, 120, 440, 340, 170, 450, 420, 160, 20, 250, 360, 190, 170, 260, 1940, 930) // W ROW( 300, 470, 240, 250, 1030, 120, 740, 450, 310, 520, 500, 380, 210, 350, 340, 330, 330, 410, 930, 1300) // Y }; static double Gonnet350[20][20] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 450, 390, 350, 360, 210, 400, 310, 310, 340, 280, 310, 350, 380, 350, 330, 410, 390, 350, 110, 210) // A ROW( 390, 1280, 160, 180, 320, 230, 270, 300, 190, 280, 310, 240, 170, 210, 220, 360, 330, 370, 310, 340) // C ROW( 350, 160, 640, 540, 50, 390, 400, 110, 410, 100, 
160, 500, 330, 430, 370, 400, 370, 170, 0, 170) // D ROW( 360, 180, 540, 550, 100, 330, 390, 180, 440, 170, 220, 440, 350, 460, 410, 380, 360, 230, 60, 180) // E ROW( 210, 320, 50, 100, 860, 0, 360, 460, 140, 530, 490, 150, 100, 190, 150, 170, 220, 400, 700, 770) // F ROW( 400, 230, 390, 330, 0, 860, 280, 60, 310, 50, 120, 400, 280, 310, 310, 400, 300, 140, 50, 80) // G ROW( 310, 270, 400, 390, 360, 280, 680, 220, 400, 240, 270, 430, 300, 420, 410, 350, 340, 240, 320, 500) // H ROW( 310, 300, 110, 180, 460, 60, 220, 620, 220, 570, 540, 170, 190, 240, 200, 240, 320, 570, 260, 340) // I ROW( 340, 190, 410, 440, 140, 310, 400, 220, 530, 210, 260, 420, 330, 450, 530, 370, 370, 250, 120, 210) // K ROW( 280, 280, 100, 170, 530, 50, 240, 570, 210, 630, 560, 160, 200, 240, 210, 220, 280, 510, 340, 400) // L ROW( 310, 310, 160, 220, 490, 120, 270, 540, 260, 560, 580, 210, 210, 280, 240, 260, 310, 490, 320, 370) // M ROW( 350, 240, 500, 440, 150, 400, 430, 170, 420, 160, 210, 550, 320, 410, 390, 410, 390, 220, 110, 250) // N ROW( 380, 170, 330, 350, 100, 280, 300, 190, 330, 200, 210, 320, 910, 350, 310, 390, 370, 240, 10, 150) // P ROW( 350, 210, 430, 460, 190, 310, 420, 240, 450, 240, 280, 410, 350, 470, 450, 370, 360, 260, 160, 240) // Q ROW( 330, 220, 370, 410, 150, 310, 410, 200, 530, 210, 240, 390, 310, 450, 630, 360, 350, 230, 230, 230) // R ROW( 410, 360, 400, 380, 170, 400, 350, 240, 370, 220, 260, 410, 390, 370, 360, 450, 430, 290, 130, 230) // S ROW( 390, 330, 370, 360, 220, 300, 340, 320, 370, 280, 310, 390, 370, 360, 350, 430, 460, 350, 120, 230) // T ROW( 350, 370, 170, 230, 400, 140, 240, 570, 250, 510, 490, 220, 240, 260, 230, 290, 350, 560, 210, 310) // V ROW( 110, 310, 0, 60, 700, 50, 320, 260, 120, 340, 320, 110, 10, 160, 230, 130, 120, 210, 1590, 740) // W ROW( 210, 340, 170, 180, 770, 80, 500, 340, 210, 400, 370, 250, 150, 240, 230, 230, 230, 310, 740, 920) // Y }; const t_ROW *GetGonnetMatrix(unsigned N) { switch (N) { case 80: return Gonnet80; case 120: 
return Gonnet120; //case 16: // return Gonnet16; //case 160: // return Gonnet160; case 250: return Gonnet250; case 350: return Gonnet350; } Quit("Invalid Gonnet%u", N); return 0; } //SCORE GetGonnetGapOpen(unsigned N) // { // switch (N) // { // case 80: // return -639; // case 120: // return -863; // case 160: // return -611; // case 250: // return -308; // case 350: // return -158; // } // Quit("Invalid Gonnet%u", N); // return 0; // } SCORE GetGonnetGapOpen(unsigned N) { switch (N) { case 80: return -1000; case 120: return -800; case 160: return -700; case 250: return -200; case 350: return -175; } Quit("Invalid Gonnet%u", N); return 0; } SCORE GetGonnetGapExtend(unsigned N) { switch (N) { case 80: return 350; case 120: return 200; case 160: return 175; case 250: return 20; case 350: return 20; } Quit("Invalid Gonnet%u", N); return 0; } //double GonnetLookup[400][400]; // //static bool InitGonnetLookup() // { // for (unsigned i = 0; i < 400; ++i) // { // const unsigned A1 = i/20; // const unsigned A2 = i%20; // for (unsigned j = 0; j <= i; ++j) // { // const unsigned B1 = j/20; // const unsigned B2 = j%20; // // const double s00 = Gonnet16[A1][B1]; // const double s01 = Gonnet16[A1][B2]; // const double s10 = Gonnet16[A2][B1]; // const double s11 = Gonnet16[A2][B2]; // // GonnetLookup[i][j] = GonnetLookup[j][i] = (s00 + s01 + s10 + s11)/4; // } // } // return true; // } // //static bool bGonnetLookupInitialized = InitGonnetLookup(); henikoffweight.cpp0000664000175000017500000000467612360262614012672 0ustar bobbob#include "muscle.h" #include "msa.h" /*** Compute Henikoff weights. Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights. J. Mol. Biol., 243(4):574-578. Award each different residue an equal share of the weight, and then to divide up that weight equally among the sequences sharing the same residue. 
So if in a position of a multiple alignment, r different residues are represented, a residue represented in only one sequence contributes a score of 1/r to that sequence, whereas a residue represented in s sequences contributes a score of 1/rs to each of the s sequences. For each sequence, the contributions from each position are summed to give a sequence weight. See also HenikoffWeightPB. ***/ void MSA::CalcHenikoffWeightsCol(unsigned uColIndex) const { const unsigned uSeqCount = GetSeqCount(); // Compute letter counts in this column unsigned uLetterCount[MAX_ALPHA]; memset(uLetterCount, 0, sizeof(uLetterCount)); unsigned uDifferentLetterCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); if (uLetter >= 20) continue; unsigned uNewCount = uLetterCount[uLetter] + 1; uLetterCount[uLetter] = uNewCount; if (1 == uNewCount) ++uDifferentLetterCount; } // Compute weight contributions for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); if (uLetter >= 20) continue; const unsigned uCount = uLetterCount[uLetter]; unsigned uDenom = uCount*uDifferentLetterCount; if (uDenom == 0) continue; m_Weights[uSeqIndex] += (WEIGHT) (1.0/uDenom); } } void MSA::SetHenikoffWeights() const { const unsigned uColCount = GetColCount(); const unsigned uSeqCount = GetSeqCount(); if (0 == uSeqCount) return; else if (1 == uSeqCount) { m_Weights[0] = (WEIGHT) 1.0; return; } else if (2 == uSeqCount) { m_Weights[0] = (WEIGHT) 0.5; m_Weights[1] = (WEIGHT) 0.5; return; } for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) m_Weights[uSeqIndex] = 0.0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) CalcHenikoffWeightsCol(uColIndex); // Set all-gap seqs weight to 0 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGapSeq(uSeqIndex)) m_Weights[uSeqIndex] = 0.0; Normalize(m_Weights, uSeqCount); } 
henikoffweightpb.cpp0000664000175000017500000000714112360262614013202 0ustar bobbob#include "muscle.h" #include "msa.h" /*** Compute Henikoff weights. Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights. J. Mol. Biol., 243(4):574-578. Award each different residue an equal share of the weight, and then to divide up that weight equally among the sequences sharing the same residue. So if in a position of a multiple alignment, r different residues are represented, a residue represented in only one sequence contributes a score of 1/r to that sequence, whereas a residue represented in s sequences contributes a score of 1/rs to each of the s sequences. For each sequence, the contributions from each position are summed to give a sequence weight. Here we use the variant from PSI-BLAST, which (a) treats gaps as a 21st letter, and (b) ignores columns that are perfectly conserved. >>> WARNING -- I SUSPECT THIS DOESN'T WORK CORRECTLY <<< ***/ void MSA::CalcHenikoffWeightsColPB(unsigned uColIndex) const { const unsigned uSeqCount = GetSeqCount(); // Compute letter counts in this column unsigned uLetterCount[MAX_ALPHA+1]; memset(uLetterCount, 0, (MAX_ALPHA+1)*sizeof(unsigned)); unsigned uLetter; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) uLetter = MAX_ALPHA; else uLetter = GetLetter(uSeqIndex, uColIndex); ++(uLetterCount[uLetter]); } // Check for special case of perfect conservation for (unsigned uLetter = 0; uLetter < MAX_ALPHA+1; ++uLetter) { unsigned uCount = uLetterCount[uLetter]; if (uCount > 0) { // Perfectly conserved? if (uCount == uSeqCount) return; else // If count > 0 but less than nr. 
sequences, can't be conserved break; } } // Compute weight contributions for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uLetter; if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) uLetter = MAX_ALPHA; else uLetter = GetLetter(uSeqIndex, uColIndex); const unsigned uCount = uLetterCount[uLetter]; m_Weights[uSeqIndex] += (WEIGHT) (1.0/uCount); } } bool MSA::IsGapSeq(unsigned uSeqIndex) const { const unsigned uColCount = GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) if (!IsGap(uSeqIndex, uColIndex)) return false; return true; } void MSA::SetUniformWeights() const { const unsigned uSeqCount = GetSeqCount(); if (0 == uSeqCount) return; const WEIGHT w = (WEIGHT) (1.0 / uSeqCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) m_Weights[uSeqIndex] = w; } void MSA::SetHenikoffWeightsPB() const { const unsigned uColCount = GetColCount(); const unsigned uSeqCount = GetSeqCount(); if (0 == uSeqCount) return; else if (1 == uSeqCount) { m_Weights[0] = 1.0; return; } else if (2 == uSeqCount) { m_Weights[0] = (WEIGHT) 0.5; m_Weights[1] = (WEIGHT) 0.5; return; } for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) m_Weights[uSeqIndex] = 0.0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) CalcHenikoffWeightsColPB(uColIndex); // Set all-gap seqs weight to 0 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGapSeq(uSeqIndex)) m_Weights[uSeqIndex] = 0.0; // Check for special case of identical sequences, which will cause all // columns to be skipped becasue they're perfectly conserved. 
if (VectorIsZero(m_Weights, uSeqCount)) VectorSet(m_Weights, uSeqCount, 1.0); Normalize(m_Weights, uSeqCount); } html.cpp0000664000175000017500000000711012360262614010617 0ustar bobbob#include "muscle.h" #include #include #include "msa.h" #include "textfile.h" const unsigned uCharsPerLine = 60; const int MIN_NAME = 10; const int MAX_NAME = 32; extern void AssignColors(const MSA &a, int **Colors); static int **MakeColors(const MSA &a) { const unsigned uSeqCount = a.GetSeqCount(); const unsigned uColCount = a.GetColCount(); int **Colors = new int *[uSeqCount]; for (unsigned i = 0; i < uSeqCount; ++i) { Colors[i] = new int[uColCount]; memset(Colors[i], 0, uColCount*sizeof(int)); } AssignColors(a, Colors); return Colors; } static void ChangeColor(TextFile &File, int From, int To) { if (From == To) return; #define COLOR_WHITE "FFFFFF" #define COLOR_GRAY "C0C0C0" #define COLOR_BLACK "000000" #define COLOR_RED "FF0000" #define COLOR_GREEN "00FF00" #define COLOR_BLUE "5590FF" #define COLOR_LIGHTBLUE "77FFFF" #define X(c) File.PutString(""); switch (To) { case 0: X(COLOR_WHITE) break; case 1: X(COLOR_GRAY) break; case 2: X(COLOR_BLUE) break; case 3: X(COLOR_LIGHTBLUE) break; } } #define COLOR_WINDOW "FFEEE0" void MSA::ToHTMLFile(TextFile &File) const { File.PutString("\n"); File.PutString("\n"); File.PutString("
");

	int **Colors = MakeColors(*this);

	int iLongestNameLength = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
		{
		const char *ptrName = GetSeqName(uSeqIndex);
		const char *ptrBlank = strchr(ptrName, ' ');
		int iLength;
		if (0 != ptrBlank)
			iLength = (int) (ptrBlank - ptrName);
		else
			iLength = (int) strlen(ptrName);
		if (iLength > iLongestNameLength)
			iLongestNameLength = iLength;
		}
	if (iLongestNameLength > MAX_NAME)
		iLongestNameLength = MAX_NAME;
	if (iLongestNameLength < MIN_NAME)
		iLongestNameLength = MIN_NAME;

	unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1;
	int CurrentColor = -1;
	for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex)
		{
		File.PutString("\n");
		unsigned uStartColIndex = uLineIndex*uCharsPerLine;
		unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1;
		if (uEndColIndex >= GetColCount())
			uEndColIndex = GetColCount() - 1;
		char Name[MAX_NAME+1];
		for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
			{
			const char *ptrName = GetSeqName(uSeqIndex);
			const char *ptrBlank = strchr(ptrName, ' ');
			int iLength;
			if (0 != ptrBlank)
				iLength = (int) (ptrBlank - ptrName);
			else
				iLength = (int) strlen(ptrName);
			if (iLength > MAX_NAME)
				iLength = MAX_NAME;
			memset(Name, ' ', MAX_NAME);
			memcpy(Name, ptrName, iLength);
			Name[iLongestNameLength] = 0;

//			File.PutString("");
			CurrentColor = -1;
			File.PutString("");
			File.PutFormat("%s      ", Name);
			File.PutString("");
			for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
			  ++uColIndex)
				{
				const int Color = Colors[uSeqIndex][uColIndex];
				ChangeColor(File, CurrentColor, Color);
				CurrentColor = Color;
				const char c = GetChar(uSeqIndex, uColIndex);
				if (Color == 0)
					File.PutFormat("%c", tolower(c));
				else
					File.PutFormat("%c", toupper(c));
				}
			File.PutString("\n");
			}
		}
	File.PutString("\n");
	File.PutString("
\n"); File.PutString("\n"); File.PutString("\n"); } hydro.cpp0000664000175000017500000000162712360262614011007 0ustar bobbob#include "muscle.h" #include "profile.h" extern void TomHydro(ProfPos *Prof, unsigned Length); // Apply hydrophobicity heuristic to a profile void Hydro(ProfPos *Prof, unsigned uLength) { if (ALPHA_Amino != g_Alpha) return; if (g_bTomHydro) { TomHydro(Prof, uLength); return; } if (0 == g_uHydrophobicRunLength) return; if (uLength <= g_uHydrophobicRunLength) return; unsigned uRunLength = 0; unsigned L2 = g_uHydrophobicRunLength/2; for (unsigned uColIndex = L2; uColIndex < uLength - L2; ++uColIndex) { ProfPos &PP = Prof[uColIndex]; bool bHydro = IsHydrophobic(PP.m_fcCounts); if (bHydro) { ++uRunLength; if (uRunLength >= g_uHydrophobicRunLength) { Prof[uColIndex-L2].m_scoreGapOpen *= (SCORE) g_dHydroFactor; Prof[uColIndex-L2].m_scoreGapClose *= (SCORE) g_dHydroFactor; } } else uRunLength = 0; } } intmath.cpp0000664000175000017500000001674412360262614011334 0ustar bobbob#include "muscle.h" #include PROB ScoreToProb(SCORE Score) { if (MINUS_INFINITY >= Score) return 0.0; return (PROB) pow(2.0, (double) Score/INTSCALE); } //#if 0 //static const double log2e = log2(exp(1.0)); // //double lnTolog2(double ln) // { // return ln*log2e; // } // //double log2(double x) // { // if (0 == x) // return MINUS_INFINITY; // // static const double dInvLn2 = 1.0/log(2.0); //// Multiply by inverse of log(2) just in case multiplication //// is faster than division. 
// return log(x)*dInvLn2; // } //#endif //SCORE ProbToScore(PROB Prob) // { // if (0.0 == Prob) // return MINUS_INFINITY; //// return (SCORE) floor(INTSCALE*log2(Prob)); // return (SCORE) log2(Prob); // } WEIGHT DoubleToWeight(double d) { assert(d >= 0); return (WEIGHT) (INTSCALE*d); } double WeightToDouble(WEIGHT w) { return (double) w / (double) INTSCALE; } SCORE DoubleToScore(double d) { return (SCORE)(d*(double) INTSCALE); } bool ScoreEq(SCORE s1, SCORE s2) { return BTEq(s1, s2); } static bool BTEq2(BASETYPE b1, BASETYPE b2) { double diff = fabs(b1 - b2); if (diff < 0.0001) return true; double sum = fabs(b1) + fabs(b2); return diff/sum < 0.005; } bool BTEq(double b1, double b2) { return BTEq2((BASETYPE) b1, (BASETYPE) b2); } //const double dLn2 = log(2.0); //// pow2(x)=2^x //double pow2(double x) // { // if (MINUS_INFINITY == x) // return 0; // return exp(x*dLn2); // } //// lp2(x) = log2(1 + 2^-x), x >= 0 //double lp2(double x) // { // return log2(1 + pow2(-x)); // } // SumLog(x, y) = log2(2^x + 2^y) //SCORE SumLog(SCORE x, SCORE y) // { // return (SCORE) log2(pow2(x) + pow2(y)); // } // //// SumLog(x, y, z) = log2(2^x + 2^y + 2^z) //SCORE SumLog(SCORE x, SCORE y, SCORE z) // { // return (SCORE) log2(pow2(x) + pow2(y) + pow2(z)); // } // //// SumLog(w, x, y, z) = log2(2^w + 2^x + 2^y + 2^z) //SCORE SumLog(SCORE w, SCORE x, SCORE y, SCORE z) // { // return (SCORE) log2(pow2(w) + pow2(x) + pow2(y) + pow2(z)); // } //SCORE lp2Fast(SCORE x) // { // assert(x >= 0); // const int iTableSize = 1000; // const double dRange = 20.0; // const double dScale = dRange/iTableSize; // static SCORE dValue[iTableSize]; // static bool bInit = false; // if (!bInit) // { // for (int i = 0; i < iTableSize; ++i) // dValue[i] = (SCORE) lp2(i*dScale); // bInit = true; // } // if (x >= dRange) // return 0.0; // int i = (int) (x/dScale); // assert(i >= 0 && i < iTableSize); // SCORE dResult = dValue[i]; // assert(BTEq(dResult, lp2(x))); // return dResult; // } // //// SumLog(x, y) = 
log2(2^x + 2^y) //SCORE SumLogFast(SCORE x, SCORE y) // { // if (MINUS_INFINITY == x) // { // if (MINUS_INFINITY == y) // return MINUS_INFINITY; // return y; // } // else if (MINUS_INFINITY == y) // return x; // // SCORE dResult; // if (x > y) // dResult = x + lp2Fast(x-y); // else // dResult = y + lp2Fast(y-x); // assert(SumLog(x, y) == dResult); // return dResult; // } // //SCORE SumLogFast(SCORE x, SCORE y, SCORE z) // { // SCORE dResult = SumLogFast(x, SumLogFast(y, z)); // assert(SumLog(x, y, z) == dResult); // return dResult; // } //SCORE SumLogFast(SCORE w, SCORE x, SCORE y, SCORE z) // { // SCORE dResult = SumLogFast(SumLogFast(w, x), SumLogFast(y, z)); // assert(SumLog(w, x, y, z) == dResult); // return dResult; // } double VecSum(const double v[], unsigned n) { double dSum = 0.0; for (unsigned i = 0; i < n; ++i) dSum += v[i]; return dSum; } void Normalize(PROB p[], unsigned n) { unsigned i; PROB dSum = 0.0; for (i = 0; i < n; ++i) dSum += p[i]; if (0.0 == dSum) Quit("Normalize, sum=0"); for (i = 0; i < n; ++i) p[i] /= dSum; } void NormalizeUnlessZero(PROB p[], unsigned n) { unsigned i; PROB dSum = 0.0; for (i = 0; i < n; ++i) dSum += p[i]; if (0.0 == dSum) return; for (i = 0; i < n; ++i) p[i] /= dSum; } void Normalize(PROB p[], unsigned n, double dRequiredTotal) { unsigned i; double dSum = 0.0; for (i = 0; i < n; ++i) dSum += p[i]; if (0.0 == dSum) Quit("Normalize, sum=0"); double dFactor = dRequiredTotal / dSum; for (i = 0; i < n; ++i) p[i] *= (PROB) dFactor; } bool VectorIsZero(const double dValues[], unsigned n) { for (unsigned i = 0; i < n; ++i) if (dValues[i] != 0.0) return false; return true; } void VectorSet(double dValues[], unsigned n, double d) { for (unsigned i = 0; i < n; ++i) dValues[i] = d; } bool VectorIsZero(const float dValues[], unsigned n) { for (unsigned i = 0; i < n; ++i) if (dValues[i] != 0.0) return false; return true; } void VectorSet(float dValues[], unsigned n, float d) { for (unsigned i = 0; i < n; ++i) dValues[i] = d; } double 
Correl(const double P[], const double Q[], unsigned uCount) { double dSumP = 0.0; double dSumQ = 0.0; for (unsigned n = 0; n < uCount; ++n) { dSumP += P[n]; dSumQ += Q[n]; } const double dMeanP = dSumP/uCount; const double dMeanQ = dSumQ/uCount; double dSum1 = 0.0; double dSum2 = 0.0; double dSum3 = 0.0; for (unsigned n = 0; n < uCount; ++n) { const double dDiffP = P[n] - dMeanP; const double dDiffQ = Q[n] - dMeanQ; dSum1 += dDiffP*dDiffQ; dSum2 += dDiffP*dDiffP; dSum3 += dDiffQ*dDiffQ; } if (0 == dSum1) return 0; const double dCorrel = dSum1 / sqrt(dSum2*dSum3); return dCorrel; } float Correl(const float P[], const float Q[], unsigned uCount) { float dSumP = 0.0; float dSumQ = 0.0; for (unsigned n = 0; n < uCount; ++n) { dSumP += P[n]; dSumQ += Q[n]; } const float dMeanP = dSumP/uCount; const float dMeanQ = dSumQ/uCount; float dSum1 = 0.0; float dSum2 = 0.0; float dSum3 = 0.0; for (unsigned n = 0; n < uCount; ++n) { const float dDiffP = P[n] - dMeanP; const float dDiffQ = Q[n] - dMeanQ; dSum1 += dDiffP*dDiffQ; dSum2 += dDiffP*dDiffP; dSum3 += dDiffQ*dDiffQ; } if (0 == dSum1) return 0; const float dCorrel = dSum1 / (float) sqrt(dSum2*dSum3); return dCorrel; } // Simple (but slow) function to compute Pearson ranks // that allows for ties. Correctness and simplicity // are priorities over speed here. 
void Rank(const float P[], float Ranks[], unsigned uCount) { for (unsigned n = 0; n < uCount; ++n) { unsigned uNumberGreater = 0; unsigned uNumberEqual = 0; unsigned uNumberLess = 0; double dValue = P[n]; for (unsigned i = 0; i < uCount; ++i) { double v = P[i]; if (v == dValue) ++uNumberEqual; else if (v < dValue) ++uNumberLess; else ++uNumberGreater; } assert(uNumberEqual >= 1); assert(uNumberEqual + uNumberLess + uNumberGreater == uCount); Ranks[n] = (float) (1 + uNumberLess + (uNumberEqual - 1)/2.0); } } void Rank(const double P[], double Ranks[], unsigned uCount) { for (unsigned n = 0; n < uCount; ++n) { unsigned uNumberGreater = 0; unsigned uNumberEqual = 0; unsigned uNumberLess = 0; double dValue = P[n]; for (unsigned i = 0; i < uCount; ++i) { double v = P[i]; if (v == dValue) ++uNumberEqual; else if (v < dValue) ++uNumberLess; else ++uNumberGreater; } assert(uNumberEqual >= 1); assert(uNumberEqual + uNumberLess + uNumberGreater == uCount); Ranks[n] = (double) (1 + uNumberLess + (uNumberEqual - 1)/2.0); } } FCOUNT SumCounts(const FCOUNT Counts[]) { FCOUNT Sum = 0; for (int i = 0; i < 20; ++i) Sum += Counts[i]; return Sum; } local.cpp0000664000175000017500000000403112360262614010744 0ustar bobbob#include "muscle.h" #include "textfile.h" #include "msa.h" #include "profile.h" #include "pwpath.h" #include "tree.h" #define TRACE 0 static void MSAFromFileName(const char *FileName, MSA &a) { TextFile File(FileName); a.FromFile(File); } static ProfPos *ProfileFromMSALocal(MSA &msa, Tree &tree) { const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); TreeFromMSA(msa, tree, g_Cluster1, g_Distance1, g_Root1); SetMuscleTree(tree); return ProfileFromMSA(msa); } void Local() { if (0 == g_pstrFileName1 || 0 == g_pstrFileName2) Quit("Must specify both -in1 and -in2 for -sw"); SetSeqWeightMethod(g_SeqWeight1); MSA msa1; MSA msa2; MSAFromFileName(g_pstrFileName1, msa1); 
MSAFromFileName(g_pstrFileName2, msa2); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = msa1.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); msa1.FixAlpha(); msa2.FixAlpha(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) SetPPScore(PPSCORE_SPN); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); const unsigned uMaxSeqCount = (uSeqCount1 > uSeqCount2 ? uSeqCount1 : uSeqCount2); MSA::SetIdCount(uMaxSeqCount); unsigned uLength1 = msa1.GetColCount(); unsigned uLength2 = msa2.GetColCount(); Tree tree1; Tree tree2; ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1); ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2); PWPath Path; SW(Prof1, uLength1, Prof2, uLength2, Path); #if TRACE Path.LogMe(); #endif MSA msaOut; AlignTwoMSAsGivenPathSW(Path, msa1, msa2, msaOut); #if TRACE msaOut.LogMe(); #endif TextFile fileOut(g_pstrOutFileName, true); msaOut.ToFile(fileOut); } main.cpp0000664000175000017500000000244212360262613010601 0ustar bobbob//@@TODO reconcile /muscle with /muscle3.6 #include "muscle.h" #include #ifdef WIN32 #include // for SetPriorityClass() #include // for isatty() #else #include // for isatty() #endif const char *MUSCLE_LONG_VERSION = "MUSCLE v" SHORT_VERSION "." #include "svnversion.h" " by Robert C. Edgar"; int g_argc; char **g_argv; int main(int argc, char **argv) { #if WIN32 // Multi-tasking does not work well in CPU-bound // console apps running under Win32. // Reducing the process priority allows GUI apps // to run responsively in parallel. 
SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS); #endif g_argc = argc; g_argv = argv; SetNewHandler(); SetStartTime(); ProcessArgVect(argc - 1, argv + 1); SetParams(); SetLogFile(); //extern void TestSubFams(const char *); //TestSubFams(g_pstrInFileName); //return 0; if (g_bVersion) { printf("%s\n", MUSCLE_LONG_VERSION); exit(EXIT_SUCCESS); } if (!g_bQuiet) Credits(); if (MissingCommand() && isatty(0)) { Usage(); exit(EXIT_SUCCESS); } if (g_bCatchExceptions) { try { Run(); } catch (...) { OnException(); exit(EXIT_Except); } } else Run(); exit(EXIT_Success); } makerootmsab.cpp0000664000175000017500000000400512360262614012337 0ustar bobbob#include "muscle.h" #include "tree.h" #include "profile.h" #include "msa.h" #include "seqvect.h" #include "pwpath.h" static void DoSeq(Seq &s, unsigned uSeqIndex, const ProfPos *RootProf, unsigned uRootProfLength, MSA &msaOut) { MSA msaSeq; msaSeq.FromSeq(s); const unsigned uSeqLength = s.Length(); MSA msaDummy; msaDummy.SetSize(1, uRootProfLength); msaDummy.SetSeqId(0, 0); msaDummy.SetSeqName(0, "Dummy0"); for (unsigned uColIndex = 0; uColIndex < uRootProfLength; ++uColIndex) msaDummy.SetChar(0, uColIndex, '?'); ProfPos *SeqProf = ProfileFromMSA(msaSeq); for (unsigned uColIndex = 0; uColIndex < uSeqLength; ++uColIndex) { ProfPos &PP = SeqProf[uColIndex]; PP.m_scoreGapOpen = MINUS_INFINITY; PP.m_scoreGapClose = MINUS_INFINITY; } ProfPos *ProfOut; unsigned uLengthOut; PWPath Path; AlignTwoProfs(SeqProf, uSeqLength, 1.0, RootProf, uRootProfLength, 1.0, Path, &ProfOut, &uLengthOut); assert(uLengthOut = uRootProfLength); delete[] ProfOut; MSA msaCombined; AlignTwoMSAsGivenPath(Path, msaSeq, msaDummy, msaCombined); msaCombined.LogMe(); msaOut.SetSeqName(uSeqIndex, s.GetName()); msaOut.SetSeqId(uSeqIndex, s.GetId()); for (unsigned uColIndex = 0; uColIndex < uRootProfLength; ++uColIndex) msaOut.SetChar(uSeqIndex, uColIndex, msaCombined.GetChar(0, uColIndex)); } // Steven Brenner's O(NL^2) proposal for creating a root 
alignment // Align each sequence to the profile at the root. // Compare the e-string solution, which is O(NL log N). void MakeRootMSABrenner(SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a) { const unsigned uSeqCount = v.Length(); const unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const ProfPos *RootProfile = Nodes[uRootNodeIndex].m_Prof; const unsigned uRootColCount = Nodes[uRootNodeIndex].m_uLength; a.SetSize(uSeqCount, uRootColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) DoSeq(*v[uSeqIndex], uSeqIndex, RootProfile, uRootColCount, a); } makerootmsa.cpp0000664000175000017500000001262212360262613012200 0ustar bobbob#include "muscle.h" #include "tree.h" #include "seqvect.h" #include "profile.h" #include "msa.h" #include "pwpath.h" #include "estring.h" #define TRACE 0 #define VALIDATE 0 static void PathSeq(const Seq &s, const PWPath &Path, bool bRight, Seq &sOut) { int *esA; int *esB; PathToEstrings(Path, &esA, &esB); const unsigned uSeqLength = s.Length(); const unsigned uEdgeCount = Path.GetEdgeCount(); sOut.Clear(); sOut.SetName(s.GetName()); unsigned uPos = 0; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); char cType = Edge.cType; if (bRight) { if (cType == 'I') cType = 'D'; else if (cType == 'D') cType = 'I'; } switch (cType) { case 'M': sOut.AppendChar(s[uPos++]); break; case 'D': sOut.AppendChar('-'); break; case 'I': sOut.AppendChar(s[uPos++]); break; default: Quit("PathSeq, invalid edge type %c", cType); } } } #if VALIDATE static void MakeRootSeq(const Seq &s, const Tree &GuideTree, unsigned uLeafNodeIndex, const ProgNode Nodes[], Seq &sRoot) { sRoot.Copy(s); unsigned uNodeIndex = uLeafNodeIndex; for (;;) { unsigned uParent = GuideTree.GetParent(uNodeIndex); if (NULL_NEIGHBOR == uParent) break; bool bRight = (GuideTree.GetLeft(uParent) == uNodeIndex); uNodeIndex = uParent; const PWPath &Path = Nodes[uNodeIndex].m_Path; Seq sTmp; 
PathSeq(sRoot, Path, bRight, sTmp); sTmp.SetId(0); sRoot.Copy(sTmp); } } #endif // VALIDATE static int *MakeRootSeqE(const Seq &s, const Tree &GuideTree, unsigned uLeafNodeIndex, const ProgNode Nodes[], Seq &sRoot, int *Estring1, int *Estring2) { int *EstringCurr = Estring1; int *EstringNext = Estring2; const unsigned uSeqLength = s.Length(); EstringCurr[0] = uSeqLength; EstringCurr[1] = 0; unsigned uNodeIndex = uLeafNodeIndex; for (;;) { unsigned uParent = GuideTree.GetParent(uNodeIndex); if (NULL_NEIGHBOR == uParent) break; bool bRight = (GuideTree.GetLeft(uParent) == uNodeIndex); uNodeIndex = uParent; const PWPath &Path = Nodes[uNodeIndex].m_Path; const int *EstringNode = bRight ? Nodes[uNodeIndex].m_EstringL : Nodes[uNodeIndex].m_EstringR; MulEstrings(EstringCurr, EstringNode, EstringNext); #if TRACE Log("\n"); Log("Curr="); LogEstring(EstringCurr); Log("\n"); Log("Node="); LogEstring(EstringNode); Log("\n"); Log("Prod="); LogEstring(EstringNext); Log("\n"); #endif int *EstringTmp = EstringNext; EstringNext = EstringCurr; EstringCurr = EstringTmp; } EstringOp(EstringCurr, s, sRoot); #if TRACE Log("Root estring="); LogEstring(EstringCurr); Log("\n"); Log("Root seq="); sRoot.LogMe(); #endif return EstringCurr; } static unsigned GetFirstNodeIndex(const Tree &tree) { if (g_bStable) return 0; return tree.FirstDepthFirstNode(); } static unsigned GetNextNodeIndex(const Tree &tree, unsigned uPrevNodeIndex) { if (g_bStable) { const unsigned uNodeCount = tree.GetNodeCount(); unsigned uNodeIndex = uPrevNodeIndex; for (;;) { ++uNodeIndex; if (uNodeIndex >= uNodeCount) return NULL_NEIGHBOR; if (tree.IsLeaf(uNodeIndex)) return uNodeIndex; } } unsigned uNodeIndex = uPrevNodeIndex; for (;;) { uNodeIndex = tree.NextDepthFirstNode(uNodeIndex); if (NULL_NEIGHBOR == uNodeIndex || tree.IsLeaf(uNodeIndex)) return uNodeIndex; } } void MakeRootMSA(const SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a) { #if TRACE Log("MakeRootMSA Tree="); GuideTree.LogMe(); #endif const 
unsigned uSeqCount = v.GetSeqCount(); unsigned uColCount = uInsane; unsigned uSeqIndex = 0; const unsigned uTreeNodeCount = GuideTree.GetNodeCount(); const unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const PWPath &RootPath = Nodes[uRootNodeIndex].m_Path; const unsigned uRootColCount = RootPath.GetEdgeCount(); const unsigned uEstringSize = uRootColCount + 1; int *Estring1 = new int[uEstringSize]; int *Estring2 = new int[uEstringSize]; SetProgressDesc("Root alignment"); unsigned uTreeNodeIndex = GetFirstNodeIndex(GuideTree); do { Progress(uSeqIndex, uSeqCount); unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); const Seq &s = *(v[uId]); Seq sRootE; int *es = MakeRootSeqE(s, GuideTree, uTreeNodeIndex, Nodes, sRootE, Estring1, Estring2); Nodes[uTreeNodeIndex].m_EstringL = EstringNewCopy(es); #if VALIDATE Seq sRoot; MakeRootSeq(s, GuideTree, uTreeNodeIndex, Nodes, sRoot); if (!sRoot.Eq(sRootE)) { Log("sRoot="); sRoot.LogMe(); Log("sRootE="); sRootE.LogMe(); Quit("Root seqs differ"); } #if TRACE Log("MakeRootSeq=\n"); sRoot.LogMe(); #endif #endif if (uInsane == uColCount) { uColCount = sRootE.Length(); a.SetSize(uSeqCount, uColCount); } else { assert(uColCount == sRootE.Length()); } a.SetSeqName(uSeqIndex, s.GetName()); a.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) a.SetChar(uSeqIndex, uColIndex, sRootE[uColIndex]); ++uSeqIndex; uTreeNodeIndex = GetNextNodeIndex(GuideTree, uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); delete[] Estring1; delete[] Estring2; ProgressStepsDone(); assert(uSeqIndex == uSeqCount); } maketree.cpp0000664000175000017500000000156012360262613011452 0ustar bobbob#include "muscle.h" #include "msa.h" #include "textfile.h" #include "tree.h" void DoMakeTree() { if (g_pstrInFileName == 0 || g_pstrOutFileName == 0) Quit("-maketree requires -in and -out "); SetStartTime(); SetSeqWeightMethod(g_SeqWeight1); TextFile MSAFile(g_pstrInFileName); MSA msa; msa.FromFile(MSAFile); unsigned 
uSeqCount = msa.GetSeqCount(); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); SetMuscleInputMSA(msa); Progress("%u sequences", uSeqCount); Tree tree; TreeFromMSA(msa, tree, g_Cluster2, g_Distance2, g_Root2); TextFile TreeFile(g_pstrOutFileName, true); tree.ToFile(TreeFile); Progress("Tree created"); } mhack.cpp0000664000175000017500000000243012360262614010736 0ustar bobbob#include "muscle.h" #include "seqvect.h" #include "msa.h" /*** Methionine hack. Most proteins start with M. This results in odd-looking alignments with the terminal Ms aligned followed immediately by gaps. Hack this by treating terminal M like X. ***/ static bool *M; void MHackStart(SeqVect &v) { if (ALPHA_Amino != g_Alpha) return; const unsigned uSeqCount = v.Length(); M = new bool[uSeqCount]; memset(M, 0, uSeqCount*sizeof(bool)); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = v.GetSeq(uSeqIndex); if (0 == s.Length()) continue; unsigned uId = s.GetId(); if (s[0] == 'M' || s[0] == 'm') { M[uId] = true; s[0] = 'X'; } } } void MHackEnd(MSA &msa) { if (ALPHA_Amino != g_Alpha) return; if (0 == M) return; const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa.GetSeqId(uSeqIndex); if (M[uId]) { for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { if (!msa.IsGap(uSeqIndex, uColIndex)) { msa.SetChar(uSeqIndex, uColIndex, 'M'); break; } } } } delete[] M; M = 0; } mpam200.cpp0000664000175000017500000001757412360262613011045 0ustar bobbob#include "muscle.h" const float PAM_200_CENTER = (float) 20.0; #define v(x) ((float) x + PAM_200_CENTER) #define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), \ 
v(M), v(N), v(P), v(Q), v(R), v(S), v(T), v(V), v(W), v(Y) }, float PAM200[32][32] = { // A C D E F G H I K L // M N P Q R S T V W Y ROW( 388, -0, 34, 32, -202, 159, -88, 89, -55, -67, 19, 86, 186, -34, -32, 237, 273, 171, -326, -239) // A ROW( -0, 1170, -248, -315, 74, -14, 43, -151, -204, -196, -132, -49, -142, -215, 29, 165, -7, -69, 179, 313) // C ROW( 34, -248, 625, 496, -419, 148, 78, -245, 55, -361, -255, 332, -169, 122, -64, 45, -13, -167, -438, -148) // D ROW( 32, -315, 496, 610, -480, 125, 25, -245, 175, -327, -242, 166, -141, 279, 34, -30, -56, -150, -386, -305) // E ROW( -202, 74, -419, -480, 888, -407, 62, 80, -443, 320, 67, -236, -180, -294, -327, -51, -173, 31, -1, 584) // F ROW( 159, -14, 148, 125, -407, 662, -114, -216, -34, -324, -246, 79, -77, -68, 97, 155, 21, -93, -58, -349) // G ROW( -88, 43, 78, 25, 62, -114, 766, -205, 144, -92, -152, 238, 66, 368, 257, 35, -35, -217, -201, 468) // H ROW( 89, -151, -245, -245, 80, -216, -205, 554, -224, 288, 391, -114, -115, -222, -208, -19, 162, 469, -274, -153) // I ROW( -55, -204, 55, 175, -443, -34, 144, -224, 632, -249, -118, 186, -86, 315, 466, 2, 19, -227, -216, -264) // K ROW( -67, -196, -361, -327, 320, -324, -92, 288, -249, 591, 369, -223, 53, -86, -170, -69, -41, 239, -66, -29) // L ROW( 19, -132, -255, -242, 67, -246, -152, 391, -118, 369, 756, -131, -98, -124, -129, -49, 129, 331, -229, -182) // M ROW( 86, -49, 332, 166, -236, 79, 238, -114, 186, -223, -131, 516, -21, 88, 73, 240, 168, -118, -379, -8) // N ROW( 186, -142, -169, -141, -180, -77, 66, -115, -86, 53, -98, -21, 736, 122, 5, 221, 139, -75, -373, -226) // P ROW( -34, -215, 122, 279, -294, -68, 368, -222, 315, -86, -124, 88, 122, 635, 301, -13, -35, -195, -243, -73) // Q ROW( -32, 29, -64, 34, -327, 97, 257, -208, 466, -170, -129, 73, 5, 301, 606, 28, -4, -201, 104, -133) // R ROW( 237, 165, 45, -30, -51, 155, 35, -19, 2, -69, -49, 240, 221, -13, 28, 353, 259, 8, -213, -55) // S ROW( 273, -7, -13, -56, -173, 21, -35, 162, 19, -41, 129, 
168, 139, -35, -4, 259, 422, 143, -343, -190) // T ROW( 171, -69, -167, -150, 31, -93, -217, 469, -227, 239, 331, -118, -75, -195, -201, 8, 143, 505, -245, -197) // V ROW( -326, 179, -438, -386, -1, -58, -201, -274, -216, -66, -229, -379, -373, -243, 104, -213, -343, -245, 1475, 63) // W ROW( -239, 313, -148, -305, 584, -349, 468, -153, -264, -29, -182, -8, -226, -73, -133, -55, -190, -197, 63, 979) // Y }; #undef v #define v(x) ((float) x) #define RNC(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), \ v(M), v(N), v(P), v(Q), v(R), v(S), v(T), v(V), v(W), v(Y) }, float PAM200NoCenter[32][32] = { // A C D E F G H I K L // M N P Q R S T V W Y RNC( 388, -0, 34, 32, -202, 159, -88, 89, -55, -67, 19, 86, 186, -34, -32, 237, 273, 171, -326, -239) // A RNC( -0, 1170, -248, -315, 74, -14, 43, -151, -204, -196, -132, -49, -142, -215, 29, 165, -7, -69, 179, 313) // C RNC( 34, -248, 625, 496, -419, 148, 78, -245, 55, -361, -255, 332, -169, 122, -64, 45, -13, -167, -438, -148) // D RNC( 32, -315, 496, 610, -480, 125, 25, -245, 175, -327, -242, 166, -141, 279, 34, -30, -56, -150, -386, -305) // E RNC( -202, 74, -419, -480, 888, -407, 62, 80, -443, 320, 67, -236, -180, -294, -327, -51, -173, 31, -1, 584) // F RNC( 159, -14, 148, 125, -407, 662, -114, -216, -34, -324, -246, 79, -77, -68, 97, 155, 21, -93, -58, -349) // G RNC( -88, 43, 78, 25, 62, -114, 766, -205, 144, -92, -152, 238, 66, 368, 257, 35, -35, -217, -201, 468) // H RNC( 89, -151, -245, -245, 80, -216, -205, 554, -224, 288, 391, -114, -115, -222, -208, -19, 162, 469, -274, -153) // I RNC( -55, -204, 55, 175, -443, -34, 144, -224, 632, -249, -118, 186, -86, 315, 466, 2, 19, -227, -216, -264) // K RNC( -67, -196, -361, -327, 320, -324, -92, 288, -249, 591, 369, -223, 53, -86, -170, -69, -41, 239, -66, -29) // L RNC( 19, -132, -255, -242, 67, -246, -152, 391, -118, 369, 756, -131, -98, -124, -129, -49, 129, 331, -229, -182) // M RNC( 86, -49, 332, 
166, -236, 79, 238, -114, 186, -223, -131, 516, -21, 88, 73, 240, 168, -118, -379, -8) // N RNC( 186, -142, -169, -141, -180, -77, 66, -115, -86, 53, -98, -21, 736, 122, 5, 221, 139, -75, -373, -226) // P RNC( -34, -215, 122, 279, -294, -68, 368, -222, 315, -86, -124, 88, 122, 635, 301, -13, -35, -195, -243, -73) // Q RNC( -32, 29, -64, 34, -327, 97, 257, -208, 466, -170, -129, 73, 5, 301, 606, 28, -4, -201, 104, -133) // R RNC( 237, 165, 45, -30, -51, 155, 35, -19, 2, -69, -49, 240, 221, -13, 28, 353, 259, 8, -213, -55) // S RNC( 273, -7, -13, -56, -173, 21, -35, 162, 19, -41, 129, 168, 139, -35, -4, 259, 422, 143, -343, -190) // T RNC( 171, -69, -167, -150, 31, -93, -217, 469, -227, 239, 331, -118, -75, -195, -201, 8, 143, 505, -245, -197) // V RNC( -326, 179, -438, -386, -1, -58, -201, -274, -216, -66, -229, -379, -373, -243, 104, -213, -343, -245, 1475, 63) // W RNC( -239, 313, -148, -305, 584, -349, 468, -153, -264, -29, -182, -8, -226, -73, -133, -55, -190, -197, 63, 979) // Y }; msa2.cpp0000664000175000017500000003336012360262614010523 0ustar bobbob#include "muscle.h" #include "msa.h" #include "seqvect.h" #include "profile.h" #include "tree.h" // These global variables are a hack to allow the tree // dependent iteration code to communicate the edge // used to divide the tree. The three-way weighting // scheme needs to know this edge in order to compute // sequence weights. 
static const Tree *g_ptrMuscleTree = 0; unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR; unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR; void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize, FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd, FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc, FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const { const unsigned uSeqCount = GetSeqCount(); const unsigned uColCount = GetColCount(); memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT)); WEIGHT wTotal = 0; FCOUNT fGap = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const WEIGHT w = GetSeqWeight(uSeqIndex); if (IsGap(uSeqIndex, uColIndex)) { fGap += w; continue; } else if (IsWildcard(uSeqIndex, uColIndex)) { const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); switch (g_Alpha) { case ALPHA_Amino: switch (uLetter) { case AX_B: // D or N fcCounts[AX_D] += w/2; fcCounts[AX_N] += w/2; break; case AX_Z: // E or Q fcCounts[AX_E] += w/2; fcCounts[AX_Q] += w/2; break; default: // any { const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) fcCounts[uLetter] += f; break; } } break; case ALPHA_DNA: case ALPHA_RNA: switch (uLetter) { case AX_R: // G or A fcCounts[NX_G] += w/2; fcCounts[NX_A] += w/2; break; case AX_Y: // C or T/U fcCounts[NX_C] += w/2; fcCounts[NX_T] += w/2; break; default: // any const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) fcCounts[uLetter] += f; break; } break; default: Quit("Alphabet %d not supported", g_Alpha); } continue; } unsigned uLetter = GetLetter(uSeqIndex, uColIndex); fcCounts[uLetter] += w; wTotal += w; } *ptrfOcc = (float) (1.0 - fGap); if (bNormalize && wTotal > 0) { if (wTotal > 1.001) Quit("wTotal=%g\n", wTotal); for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) fcCounts[uLetter] /= wTotal; // AssertNormalized(fcCounts); } FCOUNT fcStartCount = 0; if (uColIndex == 0) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; 
++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcStartCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1)) fcStartCount += GetSeqWeight(uSeqIndex); } FCOUNT fcEndCount = 0; if (uColCount - 1 == uColIndex) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcEndCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1)) fcEndCount += GetSeqWeight(uSeqIndex); } FCOUNT LL = 0; FCOUNT LG = 0; FCOUNT GL = 0; FCOUNT GG = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { WEIGHT w = GetSeqWeight(uSeqIndex); bool bLetterHere = !IsGap(uSeqIndex, uColIndex); bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1)); if (bLetterHere) { if (bLetterPrev) LL += w; else GL += w; } else { if (bLetterPrev) LG += w; else GG += w; } } FCOUNT fcExtendCount = 0; if (uColIndex > 0 && uColIndex < GetColCount() - 1) for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) && IsGap(uSeqIndex, uColIndex + 1)) fcExtendCount += GetSeqWeight(uSeqIndex); *ptrfcLL = LL; *ptrfcLG = LG; *ptrfcGL = GL; *ptrfcGG = GG; *ptrfcGapStart = fcStartCount; *ptrfcGapEnd = fcEndCount; *ptrfcGapExtend = fcExtendCount; } // Return true if the given column has no gaps and all // its residues are in the same biochemical group. 
bool MSAColIsConservative(const MSA &msa, unsigned uColIndex) { extern unsigned ResidueGroup[]; const unsigned uSeqCount = msa.GetColCount(); if (0 == uSeqCount) Quit("MSAColIsConservative: empty alignment"); if (msa.IsGap(0, uColIndex)) return false; unsigned uLetter = msa.GetLetterEx(0, uColIndex); const unsigned uGroup = ResidueGroup[uLetter]; for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex) { if (msa.IsGap(uSeqIndex, uColIndex)) return false; uLetter = msa.GetLetter(uSeqIndex, uColIndex); if (ResidueGroup[uLetter] != uGroup) return false; } return true; } void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount, MSA &msaOut) { const unsigned uColCount = msaIn.GetColCount(); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex); msaOut.SetSeqName(uSeqIndex, ptrName); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex); msaOut.SetChar(uSeqIndex, uColIndex, c); } } } void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount, MSA &msaOut) { const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uInColCount = msaIn.GetColCount(); if (uFromColIndex + uColCount - 1 > uInColCount) Quit("MSAFromColRange, out of bounds"); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const char *ptrName = msaIn.GetSeqName(uSeqIndex); unsigned uId = msaIn.GetSeqId(uSeqIndex); msaOut.SetSeqName(uSeqIndex, ptrName); msaOut.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex); msaOut.SetChar(uSeqIndex, uColIndex, c); } } } void SeqVectFromMSA(const MSA &msa, SeqVect &v) { v.Clear(); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; 
uSeqIndex < uSeqCount; ++uSeqIndex) { Seq s; msa.GetSeq(uSeqIndex, s); s.StripGaps(); //if (0 == s.Length()) // continue; const char *ptrName = msa.GetSeqName(uSeqIndex); s.SetName(ptrName); v.AppendSeq(s); } } void DeleteGappedCols(MSA &msa) { unsigned uColIndex = 0; for (;;) { if (uColIndex >= msa.GetColCount()) break; if (msa.IsGapColumn(uColIndex)) msa.DeleteCol(uColIndex); else ++uColIndex; } } void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount, MSA &msaOut) { const unsigned uColCount = msaIn.GetColCount(); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut) { unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut]; const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); unsigned uId = msaIn.GetSeqId(uSeqIndexIn); msaOut.SetSeqName(uSeqIndexOut, ptrName); msaOut.SetSeqId(uSeqIndexOut, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); msaOut.SetChar(uSeqIndexOut, uColIndex, c); } } } void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2) { const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); if (uSeqCount1 != uSeqCount2) Quit("Seq count differs"); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex) { Seq seq1; msa1.GetSeq(uSeqIndex, seq1); unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); Seq seq2; msa2.GetSeq(uSeqIndex2, seq2); if (!seq1.EqIgnoreCaseAndGaps(seq2)) { Log("Input:\n"); seq1.LogMe(); Log("Output:\n"); seq2.LogMe(); Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex)); } } } void AssertMSAEq(const MSA &msa1, const MSA &msa2) { const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); if (uSeqCount1 != uSeqCount2) Quit("Seq count differs"); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex) { Seq seq1; msa1.GetSeq(uSeqIndex, seq1); 
unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); Seq seq2; msa2.GetSeq(uSeqIndex2, seq2); if (!seq1.Eq(seq2)) { Log("Input:\n"); seq1.LogMe(); Log("Output:\n"); seq2.LogMe(); Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex)); } } } void SetMSAWeightsMuscle(MSA &msa) { SEQWEIGHT Method = GetSeqWeightMethod(); switch (Method) { case SEQWEIGHT_None: msa.SetUniformWeights(); return; case SEQWEIGHT_Henikoff: msa.SetHenikoffWeights(); return; case SEQWEIGHT_HenikoffPB: msa.SetHenikoffWeightsPB(); return; case SEQWEIGHT_GSC: msa.SetGSCWeights(); return; case SEQWEIGHT_ClustalW: SetClustalWWeightsMuscle(msa); return; case SEQWEIGHT_ThreeWay: SetThreeWayWeightsMuscle(msa); return; } Quit("SetMSAWeightsMuscle, Invalid method=%d", Method); } static WEIGHT *g_MuscleWeights; static unsigned g_uMuscleIdCount; WEIGHT GetMuscleSeqWeightById(unsigned uId) { if (0 == g_MuscleWeights) Quit("g_MuscleWeights = 0"); if (uId >= g_uMuscleIdCount) Quit("GetMuscleSeqWeightById(%u): count=%u", uId, g_uMuscleIdCount); return g_MuscleWeights[uId]; } void SetMuscleTree(const Tree &tree) { g_ptrMuscleTree = &tree; if (SEQWEIGHT_ClustalW != GetSeqWeightMethod()) return; delete[] g_MuscleWeights; const unsigned uLeafCount = tree.GetLeafCount(); g_uMuscleIdCount = uLeafCount; g_MuscleWeights = new WEIGHT[uLeafCount]; CalcClustalWWeights(tree, g_MuscleWeights); } void SetClustalWWeightsMuscle(MSA &msa) { if (0 == g_MuscleWeights) Quit("g_MuscleWeights = 0"); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const unsigned uId = msa.GetSeqId(uSeqIndex); if (uId >= g_uMuscleIdCount) Quit("SetClustalWWeightsMuscle: id out of range"); msa.SetSeqWeight(uSeqIndex, g_MuscleWeights[uId]); } msa.NormalizeWeights((WEIGHT) 1.0); } #define LOCAL_VERBOSE 0 void SetThreeWayWeightsMuscle(MSA &msa) { if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2) { msa.SetHenikoffWeightsPB(); return; } 
const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount(); WEIGHT *Weights = new WEIGHT[uMuscleSeqCount]; CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2, Weights); const unsigned uMSASeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex) { const unsigned uId = msa.GetSeqId(uSeqIndex); if (uId >= uMuscleSeqCount) Quit("SetThreeWayWeightsMuscle: id out of range"); msa.SetSeqWeight(uSeqIndex, Weights[uId]); } #if LOCAL_VERBOSE { Log("SetThreeWayWeightsMuscle\n"); for (unsigned n = 0; n < uMSASeqCount; ++n) { const unsigned uId = msa.GetSeqId(n); Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]); } } #endif msa.NormalizeWeights((WEIGHT) 1.0); delete[] Weights; } // Append msa2 at the end of msa1 void MSAAppend(MSA &msa1, const MSA &msa2) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } } // "Catenate" two MSAs (by bad analogy with UNIX cat command). // msa1 and msa2 must have same sequence names, but possibly // in a different order. // msaCat is the combined alignment produce by appending // sequences in msa2 to sequences in msa1. 
void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; msaCat.SetSize(uSeqCount, uColCountCat); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex) { const char c = msa1.GetChar(uSeqIndex, uColIndex); msaCat.SetChar(uSeqIndex, uColIndex, c); } const char *ptrSeqName = msa1.GetSeqName(uSeqIndex); unsigned uSeqIndex2; msaCat.SetSeqName(uSeqIndex, ptrSeqName); bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2); if (bFound) { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } else { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } } msa.cpp0000664000175000017500000005054412360262613010443 0ustar bobbob#include "muscle.h" #include "msa.h" #include "textfile.h" #include "seq.h" #include const unsigned DEFAULT_SEQ_LENGTH = 500; unsigned MSA::m_uIdCount = 0; MSA::MSA() { m_uSeqCount = 0; m_uColCount = 0; m_szSeqs = 0; m_szNames = 0; m_Weights = 0; m_IdToSeqIndex = 0; m_SeqIndexToId = 0; m_uCacheSeqCount = 0; m_uCacheSeqLength = 0; } MSA::~MSA() { Free(); } void MSA::Free() { for (unsigned n = 0; n < m_uSeqCount; ++n) { delete[] m_szSeqs[n]; delete[] m_szNames[n]; } delete[] m_szSeqs; delete[] m_szNames; delete[] m_Weights; delete[] m_IdToSeqIndex; delete[] m_SeqIndexToId; m_uSeqCount = 0; m_uColCount = 0; m_szSeqs = 0; m_szNames = 0; m_Weights = 0; m_IdToSeqIndex = 0; m_SeqIndexToId = 0; } void MSA::SetSize(unsigned uSeqCount, unsigned uColCount) { Free(); m_uSeqCount = uSeqCount; m_uCacheSeqLength = uColCount; m_uColCount = 0; if (0 == uSeqCount && 0 == uColCount) return; m_szSeqs = new 
char *[uSeqCount]; m_szNames = new char *[uSeqCount]; m_Weights = new WEIGHT[uSeqCount]; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { m_szSeqs[uSeqIndex] = new char[uColCount+1]; m_szNames[uSeqIndex] = 0; #if DEBUG m_Weights[uSeqIndex] = BTInsane; memset(m_szSeqs[uSeqIndex], '?', uColCount); #endif m_szSeqs[uSeqIndex][uColCount] = 0; } if (m_uIdCount > 0) { m_IdToSeqIndex = new unsigned[m_uIdCount]; m_SeqIndexToId = new unsigned[m_uSeqCount]; #if DEBUG memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); #endif } } void MSA::LogMe() const { if (0 == GetColCount()) { Log("MSA empty\n"); return; } const unsigned uColsPerLine = 50; unsigned uLinesPerSeq = (GetColCount() - 1)/uColsPerLine + 1; for (unsigned n = 0; n < uLinesPerSeq; ++n) { unsigned i; unsigned iStart = n*uColsPerLine; unsigned iEnd = GetColCount(); if (iEnd - iStart + 1 > uColsPerLine) iEnd = iStart + uColsPerLine; Log(" "); for (i = iStart; i < iEnd; ++i) Log("%u", i%10); Log("\n"); Log(" "); for (i = iStart; i + 9 < iEnd; i += 10) Log("%-10u", i); if (n == uLinesPerSeq - 1) Log(" %-10u", GetColCount()); Log("\n"); for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { Log("%12.12s", m_szNames[uSeqIndex]); if (m_Weights[uSeqIndex] != BTInsane) Log(" (%5.3f)", m_Weights[uSeqIndex]); else Log(" "); Log(" "); for (i = iStart; i < iEnd; ++i) Log("%c", GetChar(uSeqIndex, i)); if (0 != m_SeqIndexToId) Log(" [%5u]", m_SeqIndexToId[uSeqIndex]); Log("\n"); } Log("\n\n"); } } char MSA::GetChar(unsigned uSeqIndex, unsigned uIndex) const { // TODO: Performance cost? if (uSeqIndex >= m_uSeqCount || uIndex >= m_uColCount) Quit("MSA::GetChar(%u/%u,%u/%u)", uSeqIndex, m_uSeqCount, uIndex, m_uColCount); char c = m_szSeqs[uSeqIndex][uIndex]; // assert(IsLegalChar(c)); return c; } unsigned MSA::GetLetter(unsigned uSeqIndex, unsigned uIndex) const { // TODO: Performance cost? 
char c = GetChar(uSeqIndex, uIndex); unsigned uLetter = CharToLetter(c); if (uLetter >= 20) { char c = ' '; if (uSeqIndex < m_uSeqCount && uIndex < m_uColCount) c = m_szSeqs[uSeqIndex][uIndex]; Quit("MSA::GetLetter(%u/%u, %u/%u)='%c'/%u", uSeqIndex, m_uSeqCount, uIndex, m_uColCount, c, uLetter); } return uLetter; } unsigned MSA::GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const { // TODO: Performance cost? char c = GetChar(uSeqIndex, uIndex); unsigned uLetter = CharToLetterEx(c); return uLetter; } void MSA::SetSeqName(unsigned uSeqIndex, const char szName[]) { if (uSeqIndex >= m_uSeqCount) Quit("MSA::SetSeqName(%u, %s), count=%u", uSeqIndex, m_uSeqCount); delete[] m_szNames[uSeqIndex]; int n = (int) strlen(szName) + 1; m_szNames[uSeqIndex] = new char[n]; memcpy(m_szNames[uSeqIndex], szName, n); } const char *MSA::GetSeqName(unsigned uSeqIndex) const { if (uSeqIndex >= m_uSeqCount) Quit("MSA::GetSeqName(%u), count=%u", uSeqIndex, m_uSeqCount); return m_szNames[uSeqIndex]; } bool MSA::IsGap(unsigned uSeqIndex, unsigned uIndex) const { char c = GetChar(uSeqIndex, uIndex); return IsGapChar(c); } bool MSA::IsWildcard(unsigned uSeqIndex, unsigned uIndex) const { char c = GetChar(uSeqIndex, uIndex); return IsWildcardChar(c); } void MSA::SetChar(unsigned uSeqIndex, unsigned uIndex, char c) { if (uSeqIndex >= m_uSeqCount || uIndex > m_uCacheSeqLength) Quit("MSA::SetChar(%u,%u)", uSeqIndex, uIndex); if (uIndex == m_uCacheSeqLength) { const unsigned uNewCacheSeqLength = m_uCacheSeqLength + DEFAULT_SEQ_LENGTH; for (unsigned n = 0; n < m_uSeqCount; ++n) { char *ptrNewSeq = new char[uNewCacheSeqLength+1]; memcpy(ptrNewSeq, m_szSeqs[n], m_uCacheSeqLength); memset(ptrNewSeq + m_uCacheSeqLength, '?', DEFAULT_SEQ_LENGTH); ptrNewSeq[uNewCacheSeqLength] = 0; delete[] m_szSeqs[n]; m_szSeqs[n] = ptrNewSeq; } m_uColCount = uIndex; m_uCacheSeqLength = uNewCacheSeqLength; } if (uIndex >= m_uColCount) m_uColCount = uIndex + 1; m_szSeqs[uSeqIndex][uIndex] = c; } void 
MSA::GetSeq(unsigned uSeqIndex, Seq &seq) const { assert(uSeqIndex < m_uSeqCount); seq.Clear(); for (unsigned n = 0; n < m_uColCount; ++n) if (!IsGap(uSeqIndex, n)) { char c = GetChar(uSeqIndex, n); if (!isalpha(c)) Quit("Invalid character '%c' in sequence", c); c = toupper(c); seq.push_back(c); } const char *ptrName = GetSeqName(uSeqIndex); seq.SetName(ptrName); } bool MSA::HasGap() const { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) for (unsigned n = 0; n < GetColCount(); ++n) if (IsGap(uSeqIndex, n)) return true; return false; } bool MSA::IsLegalLetter(unsigned uLetter) const { return uLetter < 20; } void MSA::SetSeqCount(unsigned uSeqCount) { Free(); SetSize(uSeqCount, DEFAULT_SEQ_LENGTH); } void MSA::CopyCol(unsigned uFromCol, unsigned uToCol) { assert(uFromCol < GetColCount()); assert(uToCol < GetColCount()); if (uFromCol == uToCol) return; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char c = GetChar(uSeqIndex, uFromCol); SetChar(uSeqIndex, uToCol, c); } } void MSA::Copy(const MSA &msa) { Free(); const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); SetSize(uSeqCount, uColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { SetSeqName(uSeqIndex, msa.GetSeqName(uSeqIndex)); const unsigned uId = msa.GetSeqId(uSeqIndex); SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msa.GetChar(uSeqIndex, uColIndex); SetChar(uSeqIndex, uColIndex, c); } } } bool MSA::IsGapColumn(unsigned uColIndex) const { assert(GetSeqCount() > 0); for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) if (!IsGap(uSeqIndex, uColIndex)) return false; return true; } bool MSA::GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) if (0 == stricmp(ptrSeqName, GetSeqName(uSeqIndex))) { *ptruSeqIndex = uSeqIndex; return true; } 
return false; } void MSA::DeleteCol(unsigned uColIndex) { assert(uColIndex < m_uColCount); size_t n = m_uColCount - uColIndex; if (n > 0) { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { char *ptrSeq = m_szSeqs[uSeqIndex]; memmove(ptrSeq + uColIndex, ptrSeq + uColIndex + 1, n); } } --m_uColCount; } void MSA::DeleteColumns(unsigned uColIndex, unsigned uColCount) { for (unsigned n = 0; n < uColCount; ++n) DeleteCol(uColIndex); } void MSA::FromFile(TextFile &File) { FromFASTAFile(File); } // Weights sum to 1, WCounts sum to NIC WEIGHT MSA::GetSeqWeight(unsigned uSeqIndex) const { assert(uSeqIndex < m_uSeqCount); WEIGHT w = m_Weights[uSeqIndex]; if (w == wInsane) Quit("Seq weight not set"); return w; } void MSA::SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const { assert(uSeqIndex < m_uSeqCount); m_Weights[uSeqIndex] = w; } void MSA::NormalizeWeights(WEIGHT wDesiredTotal) const { WEIGHT wTotal = 0; for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) wTotal += m_Weights[uSeqIndex]; if (0 == wTotal) return; const WEIGHT f = wDesiredTotal/wTotal; for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) m_Weights[uSeqIndex] *= f; } void MSA::CalcWeights() const { Quit("Calc weights not implemented"); } static void FmtChar(char c, unsigned uWidth) { Log("%c", c); for (unsigned n = 0; n < uWidth - 1; ++n) Log(" "); } static void FmtInt(unsigned u, unsigned uWidth) { static char szStr[1024]; assert(uWidth < sizeof(szStr)); if (u > 0) sprintf(szStr, "%u", u); else strcpy(szStr, "."); Log(szStr); unsigned n = (unsigned) strlen(szStr); if (n < uWidth) for (unsigned i = 0; i < uWidth - n; ++i) Log(" "); } static void FmtInt0(unsigned u, unsigned uWidth) { static char szStr[1024]; assert(uWidth < sizeof(szStr)); sprintf(szStr, "%u", u); Log(szStr); unsigned n = (unsigned) strlen(szStr); if (n < uWidth) for (unsigned i = 0; i < uWidth - n; ++i) Log(" "); } static void FmtPad(unsigned n) { for (unsigned i = 0; i < n; ++i) Log(" "); } 
void MSA::FromSeq(const Seq &s) { unsigned uSeqLength = s.Length(); SetSize(1, uSeqLength); SetSeqName(0, s.GetName()); if (0 != m_SeqIndexToId) SetSeqId(0, s.GetId()); for (unsigned n = 0; n < uSeqLength; ++n) SetChar(0, n, s[n]); } unsigned MSA::GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const { assert(uSeqIndex < GetSeqCount()); assert(uColIndex < GetColCount()); unsigned uCol = 0; for (unsigned n = 0; n <= uColIndex; ++n) if (!IsGap(uSeqIndex, n)) ++uCol; return uCol; } void MSA::CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex) { assert(uToSeqIndex < m_uSeqCount); const unsigned uColCount = msaFrom.GetColCount(); assert(m_uColCount == uColCount || (0 == m_uColCount && uColCount <= m_uCacheSeqLength)); memcpy(m_szSeqs[uToSeqIndex], msaFrom.GetSeqBuffer(uFromSeqIndex), uColCount); SetSeqName(uToSeqIndex, msaFrom.GetSeqName(uFromSeqIndex)); if (0 == m_uColCount) m_uColCount = uColCount; } const char *MSA::GetSeqBuffer(unsigned uSeqIndex) const { assert(uSeqIndex < m_uSeqCount); return m_szSeqs[uSeqIndex]; } void MSA::DeleteSeq(unsigned uSeqIndex) { assert(uSeqIndex < m_uSeqCount); delete m_szSeqs[uSeqIndex]; delete m_szNames[uSeqIndex]; const unsigned uBytesToMove = (m_uSeqCount - uSeqIndex)*sizeof(char *); if (uBytesToMove > 0) { memmove(m_szSeqs + uSeqIndex, m_szSeqs + uSeqIndex + 1, uBytesToMove); memmove(m_szNames + uSeqIndex, m_szNames + uSeqIndex + 1, uBytesToMove); } --m_uSeqCount; delete[] m_Weights; m_Weights = 0; } bool MSA::IsEmptyCol(unsigned uColIndex) const { const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (!IsGap(uSeqIndex, uColIndex)) return false; return true; } //void MSA::DeleteEmptyCols(bool bProgress) // { // unsigned uColCount = GetColCount(); // for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) // { // if (IsEmptyCol(uColIndex)) // { // if (bProgress) // { // Log("Deleting col %u of %u\n", uColIndex, uColCount); // printf("Deleting 
col %u of %u\n", uColIndex, uColCount); // } // DeleteCol(uColIndex); // --uColCount; // } // } // } unsigned MSA::AlignedColIndexToColIndex(unsigned uAlignedColIndex) const { Quit("MSA::AlignedColIndexToColIndex not implemented"); return 0; } WEIGHT MSA::GetTotalSeqWeight() const { WEIGHT wTotal = 0; const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) wTotal += m_Weights[uSeqIndex]; return wTotal; } bool MSA::SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2, unsigned uSeqIndex2) { Seq s1; Seq s2; a1.GetSeq(uSeqIndex1, s1); a2.GetSeq(uSeqIndex2, s2); s1.StripGaps(); s2.StripGaps(); return s1.EqIgnoreCase(s2); } unsigned MSA::GetSeqLength(unsigned uSeqIndex) const { assert(uSeqIndex < GetSeqCount()); const unsigned uColCount = GetColCount(); unsigned uLength = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) if (!IsGap(uSeqIndex, uColIndex)) ++uLength; return uLength; } void MSA::GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrPWID, unsigned *ptruPosCount) const { assert(uSeqIndex1 < GetSeqCount()); assert(uSeqIndex2 < GetSeqCount()); unsigned uSameCount = 0; unsigned uPosCount = 0; const unsigned uColCount = GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { char c1 = GetChar(uSeqIndex1, uColIndex); if (IsGapChar(c1)) continue; char c2 = GetChar(uSeqIndex2, uColIndex); if (IsGapChar(c2)) continue; ++uPosCount; if (c1 == c2) ++uSameCount; } *ptruPosCount = uPosCount; if (uPosCount > 0) *ptrPWID = 100.0 * (double) uSameCount / (double) uPosCount; else *ptrPWID = 0; } void MSA::UnWeight() { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) m_Weights[uSeqIndex] = BTInsane; } unsigned MSA::UniqueResidueTypes(unsigned uColIndex) const { assert(uColIndex < GetColCount()); unsigned Counts[MAX_ALPHA]; memset(Counts, 0, sizeof(Counts)); const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; 
++uSeqIndex) { if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) continue; const unsigned uLetter = GetLetter(uSeqIndex, uColIndex); ++(Counts[uLetter]); } unsigned uUniqueCount = 0; for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) if (Counts[uLetter] > 0) ++uUniqueCount; return uUniqueCount; } double MSA::GetOcc(unsigned uColIndex) const { unsigned uGapCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) ++uGapCount; unsigned uSeqCount = GetSeqCount(); return (double) (uSeqCount - uGapCount) / (double) uSeqCount; } void MSA::ToFile(TextFile &File) const { if (g_bMSF) ToMSFFile(File); else if (g_bAln) ToAlnFile(File); else if (g_bHTML) ToHTMLFile(File); else if (g_bPHYS) ToPhySequentialFile(File); else if (g_bPHYI) ToPhyInterleavedFile(File); else ToFASTAFile(File); if (0 != g_pstrScoreFileName) WriteScoreFile(*this); } bool MSA::ColumnHasGap(unsigned uColIndex) const { const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) return true; return false; } void MSA::SetIdCount(unsigned uIdCount) { //if (m_uIdCount != 0) // Quit("MSA::SetIdCount: may only be called once"); if (m_uIdCount > 0) { if (uIdCount > m_uIdCount) Quit("MSA::SetIdCount: cannot increase count"); return; } m_uIdCount = uIdCount; } void MSA::SetSeqId(unsigned uSeqIndex, unsigned uId) { assert(uSeqIndex < m_uSeqCount); assert(uId < m_uIdCount); if (0 == m_SeqIndexToId) { if (0 == m_uIdCount) Quit("MSA::SetSeqId, SetIdCount has not been called"); m_IdToSeqIndex = new unsigned[m_uIdCount]; m_SeqIndexToId = new unsigned[m_uSeqCount]; memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); } m_SeqIndexToId[uSeqIndex] = uId; m_IdToSeqIndex[uId] = uSeqIndex; } unsigned MSA::GetSeqIndex(unsigned uId) const { assert(uId < m_uIdCount); assert(0 != m_IdToSeqIndex); unsigned 
uSeqIndex = m_IdToSeqIndex[uId]; assert(uSeqIndex < m_uSeqCount); return uSeqIndex; } bool MSA::GetSeqIndex(unsigned uId, unsigned *ptruIndex) const { for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { if (uId == m_SeqIndexToId[uSeqIndex]) { *ptruIndex = uSeqIndex; return true; } } return false; } unsigned MSA::GetSeqId(unsigned uSeqIndex) const { assert(uSeqIndex < m_uSeqCount); unsigned uId = m_SeqIndexToId[uSeqIndex]; assert(uId < m_uIdCount); return uId; } bool MSA::WeightsSet() const { return BTInsane != m_Weights[0]; } void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount, MSA &msaOut) { const unsigned uColCount = msaIn.GetColCount(); msaOut.SetSize(uIdCount, uColCount); for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uIdCount; ++uSeqIndexOut) { const unsigned uId = Ids[uSeqIndexOut]; const unsigned uSeqIndexIn = msaIn.GetSeqIndex(uId); const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); msaOut.SetSeqId(uSeqIndexOut, uId); msaOut.SetSeqName(uSeqIndexOut, ptrName); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); msaOut.SetChar(uSeqIndexOut, uColIndex, c); } } } // Caller must allocate ptrSeq and ptrLabel as new char[n]. 
void MSA::AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel) { if (m_uSeqCount > m_uCacheSeqCount) Quit("Internal error MSA::AppendSeq"); if (m_uSeqCount == m_uCacheSeqCount) ExpandCache(m_uSeqCount + 4, uSeqLength); m_szSeqs[m_uSeqCount] = ptrSeq; m_szNames[m_uSeqCount] = ptrLabel; ++m_uSeqCount; } void MSA::ExpandCache(unsigned uSeqCount, unsigned uColCount) { if (m_IdToSeqIndex != 0 || m_SeqIndexToId != 0 || uSeqCount < m_uSeqCount) Quit("Internal error MSA::ExpandCache"); if (m_uSeqCount > 0 && uColCount != m_uColCount) Quit("Internal error MSA::ExpandCache, ColCount changed"); char **NewSeqs = new char *[uSeqCount]; char **NewNames = new char *[uSeqCount]; WEIGHT *NewWeights = new WEIGHT[uSeqCount]; for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { NewSeqs[uSeqIndex] = m_szSeqs[uSeqIndex]; NewNames[uSeqIndex] = m_szNames[uSeqIndex]; NewWeights[uSeqIndex] = m_Weights[uSeqIndex]; } for (unsigned uSeqIndex = m_uSeqCount; uSeqIndex < uSeqCount; ++uSeqIndex) { char *Seq = new char[uColCount]; NewSeqs[uSeqIndex] = Seq; #if DEBUG memset(Seq, '?', uColCount); #endif } delete[] m_szSeqs; delete[] m_szNames; delete[] m_Weights; m_szSeqs = NewSeqs; m_szNames = NewNames; m_Weights = NewWeights; m_uCacheSeqCount = uSeqCount; m_uCacheSeqLength = uColCount; m_uColCount = uColCount; } void MSA::FixAlpha() { ClearInvalidLetterWarning(); for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { for (unsigned uColIndex = 0; uColIndex < m_uColCount; ++uColIndex) { char c = GetChar(uSeqIndex, uColIndex); if (!IsResidueChar(c) && !IsGapChar(c)) { char w = GetWildcardChar(); // Warning("Invalid letter '%c', replaced by '%c'", c, w); InvalidLetterWarning(c, w); SetChar(uSeqIndex, uColIndex, w); } } } ReportInvalidLetters(); } ALPHA MSA::GuessAlpha() const { // If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap // letters belong to the nucleotide alphabet, guess nucleo. // Otherwise amino. 
const unsigned CHAR_COUNT = 100; const unsigned MIN_NUCLEO_PCT = 95; const unsigned uSeqCount = GetSeqCount(); const unsigned uColCount = GetColCount(); if (0 == uSeqCount) return ALPHA_Amino; unsigned uDNACount = 0; unsigned uRNACount = 0; unsigned uTotal = 0; unsigned i = 0; for (;;) { unsigned uSeqIndex = i/uColCount; if (uSeqIndex >= uSeqCount) break; unsigned uColIndex = i%uColCount; ++i; char c = GetChar(uSeqIndex, uColIndex); if (IsGapChar(c)) continue; if (IsDNA(c)) ++uDNACount; if (IsRNA(c)) ++uRNACount; ++uTotal; if (uTotal >= CHAR_COUNT) break; } if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_RNA; if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_DNA; return ALPHA_Amino; } msadistkimura.cpp0000664000175000017500000000675612360262614012547 0ustar bobbob#include "muscle.h" #include "msa.h" #include // "Standard" NJ distance: the Kimura measure. // This is defined to be: // // log_e(1 - p - p*p/5) // // where p is the fraction of residues that differ, i.e.: // // p = (1 - fractional_conservation) // // This measure is infinite for p = 0.8541 and is considered // unreliable for p >= 0.75 (according to the ClustalW docs). // ClustalW uses a table lookup for values > 0.75. // The following table was copied from the ClustalW file dayhoff.h. 
static int dayhoff_pams[]={ 195, /* 75.0% observed d; 195 PAMs estimated = 195% estimated d */ 196, /* 75.1% observed d; 196 PAMs estimated */ 197, 198, 199, 200, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 226, 227, 228, 229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 240, 241, 243, 244, 245, 246, 248, 249, 250, /* 250 PAMs = 80.3% observed d */ 252, 253, 254, 255, 257, 258, 260, 261, 262, 264, 265, 267, 268, 270, 271, 273, 274, 276, 277, 279, 281, 282, 284, 285, 287, 289, 291, 292, 294, 296, 298, 299, 301, 303, 305, 307, 309, 311, 313, 315, 317, 319, 321, 323, 325, 328, 330, 332, 335, 337, 339, 342, 344, 347, 349, 352, 354, 357, 360, 362, 365, 368, 371, 374, 377, 380, 383, 386, 389, 393, 396, 399, 403, 407, 410, 414, 418, 422, 426, 430, 434, 438, 442, 447, 451, 456, 461, 466, 471, 476, 482, 487, 493, 498, 504, 511, 517, 524, 531, 538, 545, 553, 560, 569, 577, 586, 595, 605, 615, 626, 637, 649, 661, 675, 688, 703, 719, 736, 754, 775, 796, 819, 845, 874, 907, 945, /* 92.9% observed; 945 PAMs */ 988 /* 93.0% observed; 988 PAMs */ }; static int iTableEntries = sizeof(dayhoff_pams)/sizeof(dayhoff_pams[0]); double KimuraDist(double dPctId) { double p = 1 - dPctId; // Typical case: use Kimura's empirical formula if (p < 0.75) return -log(1 - p - (p*p)/5); // Per ClustalW, return 10.0 for anything over 93% if (p > 0.93) return 10.0; // If p >= 0.75, use table lookup assert(p <= 1 && p >= 0.75); // Thanks for Michael Hoel for pointing out a bug // in the table index calculation in versions <= 3.52. 
int iTableIndex = (int) ((p - 0.75)*1000 + 0.5); if (iTableIndex < 0 || iTableIndex >= iTableEntries) Quit("Internal error in MSADistKimura::ComputeDist"); return dayhoff_pams[iTableIndex] / 100.0; } //double MSADistKimura::ComputeDist(const MSA &msa, unsigned uSeqIndex1, // unsigned uSeqIndex2) // { // double dPctId = msa.GetPctIdentityPair(uSeqIndex1, uSeqIndex2); // return KimuraDist(dPctId); // } double KimuraDistToPctId(double dKimuraDist) { // Solve quadratic equation const double a = 0.2; const double b = 1; const double c = 1.0 - exp(-dKimuraDist); const double p = (-b + sqrt(b*b + 4*a*c))/(2*a); return 1 - p; } double PctIdToHeightKimura(double dPctId) { return KimuraDist(dPctId); } msf.cpp0000664000175000017500000000700512360262614010443 0ustar bobbob#include "muscle.h" #include #include #include "msa.h" #include "textfile.h" const int MAX_NAME = 63; const unsigned uCharsPerLine = 50; const unsigned uCharsPerBlock = 10; // Truncate at first white space or MAX_NAME, whichever comes // first, then pad with blanks up to PadLength. static const char *GetPaddedName(const char *Name, int PadLength) { static char PaddedName[MAX_NAME+1]; memset(PaddedName, ' ', MAX_NAME); size_t n = strcspn(Name, " \t"); memcpy(PaddedName, Name, n); PaddedName[PadLength] = 0; return PaddedName; } static const char *strfind(const char *s, const char *t) { size_t n = strcspn(s, t); if (0 == n) return 0; return s + n; } // GCG checksum code kindly provided by Eric Martel. 
unsigned MSA::GetGCGCheckSum(unsigned uSeqIndex) const { unsigned CheckSum = 0; const unsigned uColCount = GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { unsigned c = (unsigned) GetChar(uSeqIndex, uColIndex); CheckSum += c*(uColIndex%57 + 1); CheckSum %= 10000; } return CheckSum; } static void MSFFixGaps(MSA &a) { const int SeqCount = a.GetSeqCount(); const int ColCount = a.GetColCount(); for (int SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { for (int ColIndex = 0; ColIndex < ColCount; ++ColIndex) if (a.IsGap(SeqIndex, ColIndex)) a.SetChar(SeqIndex, ColIndex, '.'); } } void MSA::ToMSFFile(TextFile &File, const char *ptrComment) const { // Cast away const, yuck SetMSAWeightsMuscle((MSA &) *this); MSFFixGaps((MSA &) *this); File.PutString("PileUp\n"); if (0 != ptrComment) File.PutFormat("Comment: %s\n", ptrComment); else File.PutString("\n"); char seqtype = (g_Alpha == ALPHA_DNA || g_Alpha == ALPHA_RNA) ? 'N' : 'P'; File.PutFormat(" MSF: %u Type: %c Check: 0000 ..\n\n", GetColCount(), seqtype); int iLongestNameLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char *Name = GetSeqName(uSeqIndex); const char *PaddedName = GetPaddedName(Name, MAX_NAME); int iLength = (int) strcspn(PaddedName, " \t"); if (iLength > iLongestNameLength) iLongestNameLength = iLength; } for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char *Name = GetSeqName(uSeqIndex); const char *PaddedName = GetPaddedName(Name, iLongestNameLength); File.PutFormat(" Name: %s", PaddedName); File.PutFormat(" Len: %u Check: %5u Weight: %g\n", GetColCount(), GetGCGCheckSum(uSeqIndex), GetSeqWeight(uSeqIndex)); } File.PutString("\n//\n"); if (0 == GetColCount()) return; unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1; for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex) { File.PutString("\n"); unsigned uStartColIndex = uLineIndex*uCharsPerLine; unsigned uEndColIndex = 
uStartColIndex + uCharsPerLine - 1; if (uEndColIndex >= GetColCount()) uEndColIndex = GetColCount() - 1; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char *Name = GetSeqName(uSeqIndex); const char *PaddedName = GetPaddedName(Name, iLongestNameLength); File.PutFormat("%s ", PaddedName); for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; ++uColIndex) { if (0 == uColIndex%uCharsPerBlock) File.PutString(" "); char c = GetChar(uSeqIndex, uColIndex); File.PutFormat("%c", c); } File.PutString("\n"); } } } muscle.cpp0000664000175000017500000000521012360262614011142 0ustar bobbob#include "muscle.h" #include "msa.h" #include "seqvect.h" #include "msa.h" #include "tree.h" #include "profile.h" void MUSCLE(SeqVect &v, MSA &msaOut) { const unsigned uSeqCount = v.Length(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = v.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); v.FixAlpha(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) { SetPPScore(PPSCORE_SPN); g_Distance1 = DISTANCE_Kmer4_6; } unsigned uMinL = 0; unsigned uMaxL = 0; unsigned uTotL = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned L = v.GetSeq(uSeqIndex).Length(); uTotL += L; if (uMinL == 0 || L < uMinL) uMinL = L; if (L > uMaxL) uMaxL = L; } SetIter(1); g_bDiags = g_bDiags1; SetSeqStats(uSeqCount, uMinL, uMaxL, uTotL/uSeqCount); MSA::SetIdCount(uSeqCount); //// Initialize sequence ids. //// From this point on, ids must somehow propogate from here. 
// for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) // v.SetSeqId(uSeqIndex, uSeqIndex); if (uSeqCount > 1) MHackStart(v); if (0 == uSeqCount) { msaOut.Clear(); return; } if (1 == uSeqCount && ALPHA_Amino == Alpha) { const Seq &s = v.GetSeq(0); msaOut.FromSeq(s); return; } // First iteration Tree GuideTree; TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1); SetMuscleTree(GuideTree); ProgNode *ProgNodes = 0; if (g_bLow) ProgNodes = ProgressiveAlignE(v, GuideTree, msaOut); else ProgressiveAlign(v, GuideTree, msaOut); SetCurrentAlignment(msaOut); if (1 == g_uMaxIters || 2 == uSeqCount) { MHackEnd(msaOut); return; } g_bDiags = g_bDiags2; SetIter(2); if (g_bLow) { if (0 != g_uMaxTreeRefineIters) RefineTreeE(msaOut, v, GuideTree, ProgNodes); } else RefineTree(msaOut, GuideTree); extern void DeleteProgNode(ProgNode &Node); const unsigned uNodeCount = GuideTree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) DeleteProgNode(ProgNodes[uNodeIndex]); delete[] ProgNodes; ProgNodes = 0; SetSeqWeightMethod(g_SeqWeight2); SetMuscleTree(GuideTree); if (g_bAnchors) RefineVert(msaOut, GuideTree, g_uMaxIters - 2); else RefineHoriz(msaOut, GuideTree, g_uMaxIters - 2, false, false); MHackEnd(msaOut); } muscleout.cpp0000664000175000017500000000415412360262614011700 0ustar bobbob#include "muscle.h" #include "msa.h" #include "params.h" #include "textfile.h" static void DoOutput(MSA &msa) { bool AnyOutput = false; // Value options if (g_pstrFASTAOutFileName) { TextFile File(g_pstrFASTAOutFileName, true); msa.ToFASTAFile(File); AnyOutput = true; } if (g_pstrMSFOutFileName) { TextFile File(g_pstrMSFOutFileName, true); msa.ToMSFFile(File); AnyOutput = true; } if (g_pstrClwOutFileName) { TextFile File(g_pstrClwOutFileName, true); msa.ToAlnFile(File); AnyOutput = true; } if (g_pstrClwStrictOutFileName) { g_bClwStrict = true; TextFile File(g_pstrClwStrictOutFileName, true); msa.ToAlnFile(File); AnyOutput = true; } if 
(g_pstrHTMLOutFileName) { TextFile File(g_pstrHTMLOutFileName, true); msa.ToHTMLFile(File); AnyOutput = true; } if (g_pstrPHYIOutFileName) { TextFile File(g_pstrPHYIOutFileName, true); msa.ToPhyInterleavedFile(File); AnyOutput = true; } if (g_pstrPHYSOutFileName) { TextFile File(g_pstrPHYSOutFileName, true); msa.ToPhySequentialFile(File); AnyOutput = true; } // Flag options, at most one used (because only one -out filename) TextFile fileOut(g_pstrOutFileName, true); if (g_bFASTA) { msa.ToFASTAFile(fileOut); AnyOutput = true; } else if (g_bMSF) { msa.ToMSFFile(fileOut); AnyOutput = true; } else if (g_bAln) { msa.ToAlnFile(fileOut); AnyOutput = true; } else if (g_bHTML) { msa.ToHTMLFile(fileOut); AnyOutput = true; } else if (g_bPHYI) { msa.ToPhyInterleavedFile(fileOut); AnyOutput = true; } else if (g_bPHYS) { msa.ToPhySequentialFile(fileOut); AnyOutput = true; } // If -out option was given but no flags, output as FASTA if (!AnyOutput) msa.ToFASTAFile(fileOut); fileOut.Close(); if (0 != g_pstrScoreFileName) WriteScoreFile(msa); } void MuscleOutput(MSA &msa) { MHackEnd(msa); if (g_bStable) { MSA msaStable; Stabilize(msa, msaStable); msa.Clear(); // save memory DoOutput(msaStable); } else DoOutput(msa); } nucmx.cpp0000664000175000017500000000103412360262614011004 0ustar bobbob#include "muscle.h" // BLASTZ default parameters // open 400, extend 30, matrix as below const float NUC_EXTEND = 30; const float NUC_SP_CENTER = 2*NUC_EXTEND; #define v(x) ((float) x + NUC_SP_CENTER) #define ROW(A, C, G, T) \ { v(A), v(C), v(G), v(T) }, float NUC_SP[32][32] = { // A C G T ROW( 91, -114, -31, -123) // A ROW( -114, 100, -125, -31) // C ROW( -31, -125, 100, -114) // G ROW( -123, -31, -114, 91) // T }; nwdasimple2.cpp0000664000175000017500000003403512360262614012106 0ustar bobbob#include "muscle.h" #include "pwpath.h" #include "profile.h" #if DOUBLE_AFFINE #define TRACE 0 extern bool g_bKeepSimpleDP; extern SCORE *g_DPM; extern SCORE *g_DPD; extern SCORE *g_DPE; extern SCORE *g_DPI; 
extern SCORE *g_DPJ; extern char *g_TBM; extern char *g_TBD; extern char *g_TBE; extern char *g_TBI; extern char *g_TBJ; static char XlatEdgeType(char c) { if ('E' == c) return 'D'; if ('J' == c) return 'I'; return c; } static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -100000) return " *"; sprintf(str, "%6.1f", s); return str; } static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %6c", TBM(uPrefixLengthA, uPrefixLengthB)); Log("\n"); } } static void ListDPM(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { 
char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { SCORE x = (uPrefixLengthA + uPrefixLengthB)*g_scoreGapExtend; SCORE s = DPM(uPrefixLengthA, uPrefixLengthB) - x; Log(" %s", LocalScoreToStr(s)); } Log("\n"); } } extern SCORE ScoreProfPos2(const ProfPos &PP, const ProfPos &PPB); SCORE NWDASimple2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; // Allocate DP matrices const size_t LM = uPrefixCountA*uPrefixCountB; SCORE *DPM_ = new SCORE[LM]; SCORE *DPD_ = new SCORE[LM]; SCORE *DPE_ = new SCORE[LM]; SCORE *DPI_ = new SCORE[LM]; SCORE *DPJ_ = new SCORE[LM]; SCORE *DPL_ = new SCORE[LM]; char *TBM_ = new char[LM]; char *TBD_ = new char[LM]; char *TBE_ = new char[LM]; char *TBI_ = new char[LM]; char *TBJ_ = new char[LM]; memset(DPM_, 0, LM*sizeof(SCORE)); memset(DPD_, 0, LM*sizeof(SCORE)); memset(DPE_, 0, LM*sizeof(SCORE)); memset(DPI_, 0, LM*sizeof(SCORE)); memset(DPJ_, 0, LM*sizeof(SCORE)); // memset(DPL_, 0, LM*sizeof(SCORE)); memset(TBM_, '?', LM); memset(TBD_, '?', LM); memset(TBE_, '?', LM); memset(TBI_, '?', LM); memset(TBJ_, '?', LM); DPM(0, 0) = 0; DPD(0, 0) = MINUS_INFINITY; DPE(0, 0) = MINUS_INFINITY; DPI(0, 0) = MINUS_INFINITY; DPJ(0, 0) = MINUS_INFINITY; DPM(1, 0) = MINUS_INFINITY; DPD(1, 0) = PA[0].m_scoreGapOpen; DPE(1, 0) = PA[0].m_scoreGapOpen2; DPI(1, 0) = MINUS_INFINITY; DPJ(1, 0) = MINUS_INFINITY; DPM(0, 1) = MINUS_INFINITY; DPD(0, 1) = MINUS_INFINITY; DPE(0, 1) = MINUS_INFINITY; DPI(0, 1) = PB[0].m_scoreGapOpen; DPJ(0, 1) 
= PB[0].m_scoreGapOpen2; // Empty prefix of B is special case for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { // M=LetterA+LetterB, impossible with empty prefix DPM(uPrefixLengthA, 0) = MINUS_INFINITY; // D=LetterA+GapB DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend; TBD(uPrefixLengthA, 0) = 'D'; DPE(uPrefixLengthA, 0) = DPE(uPrefixLengthA - 1, 0) + g_scoreGapExtend2; TBE(uPrefixLengthA, 0) = 'E'; // I=GapA+LetterB, impossible with empty prefix DPI(uPrefixLengthA, 0) = MINUS_INFINITY; DPJ(uPrefixLengthA, 0) = MINUS_INFINITY; } // Empty prefix of A is special case for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { // M=LetterA+LetterB, impossible with empty prefix DPM(0, uPrefixLengthB) = MINUS_INFINITY; // D=LetterA+GapB, impossible with empty prefix DPD(0, uPrefixLengthB) = MINUS_INFINITY; DPE(0, uPrefixLengthB) = MINUS_INFINITY; // I=GapA+LetterB DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend; TBI(0, uPrefixLengthB) = 'I'; DPJ(0, uPrefixLengthB) = DPJ(0, uPrefixLengthB - 1) + g_scoreGapExtend2; TBJ(0, uPrefixLengthB) = 'J'; } // ============ // Main DP loop // ============ for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreGapCloseB; if (uPrefixLengthB == 1) scoreGapCloseB = MINUS_INFINITY; else scoreGapCloseB = PB[uPrefixLengthB-2].m_scoreGapClose; SCORE scoreGapClose2B; if (uPrefixLengthB == 1) scoreGapClose2B = MINUS_INFINITY; else scoreGapClose2B = PB[uPrefixLengthB-2].m_scoreGapClose2; for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; { // Match M=LetterA+LetterB SCORE scoreLL = ScoreProfPos2(PPA, PPB); DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL; SCORE scoreGapCloseA; if (uPrefixLengthA == 1) scoreGapCloseA = MINUS_INFINITY; else scoreGapCloseA = 
PA[uPrefixLengthA-2].m_scoreGapClose; SCORE scoreGapClose2A; if (uPrefixLengthA == 1) scoreGapClose2A = MINUS_INFINITY; else scoreGapClose2A = PA[uPrefixLengthA-2].m_scoreGapClose2; SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1); SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA; SCORE scoreEM = DPE(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2A; SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB; SCORE scoreJM = DPJ(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2B; SCORE scoreBest; if (scoreMM >= scoreDM && scoreMM >= scoreIM && scoreMM >= scoreEM && scoreMM >= scoreJM) { scoreBest = scoreMM; TBM(uPrefixLengthA, uPrefixLengthB) = 'M'; } else if (scoreDM >= scoreMM && scoreDM >= scoreIM && scoreDM >= scoreEM && scoreDM >= scoreJM) { scoreBest = scoreDM; TBM(uPrefixLengthA, uPrefixLengthB) = 'D'; } else if (scoreEM >= scoreMM && scoreEM >= scoreIM && scoreEM >= scoreDM && scoreEM >= scoreJM) { scoreBest = scoreEM; TBM(uPrefixLengthA, uPrefixLengthB) = 'E'; } else if (scoreIM >= scoreMM && scoreIM >= scoreDM && scoreIM >= scoreEM && scoreIM >= scoreJM) { scoreBest = scoreIM; TBM(uPrefixLengthA, uPrefixLengthB) = 'I'; } else if (scoreJM >= scoreMM && scoreJM >= scoreDM && scoreJM >= scoreEM && scoreJM >= scoreIM) { scoreBest = scoreJM; TBM(uPrefixLengthA, uPrefixLengthB) = 'J'; } else Quit("Max failed (M)"); DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; } { // Delete D=LetterA+GapB SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen; SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend; SCORE scoreBest; if (scoreMD >= scoreDD) { scoreBest = scoreMD; TBD(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreDD >= scoreMD); scoreBest = scoreDD; TBD(uPrefixLengthA, uPrefixLengthB) = 'D'; } DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; } { // Delete E=LetterA+GapB SCORE scoreME = DPM(uPrefixLengthA-1, uPrefixLengthB) + 
PA[uPrefixLengthA-1].m_scoreGapOpen2; SCORE scoreEE = DPE(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend2; SCORE scoreBest; if (scoreME >= scoreEE) { scoreBest = scoreME; TBE(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreEE >= scoreME); scoreBest = scoreEE; TBE(uPrefixLengthA, uPrefixLengthB) = 'E'; } DPE(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert I=GapA+LetterB { SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB-1].m_scoreGapOpen; SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend; SCORE scoreBest; if (scoreMI >= scoreII) { scoreBest = scoreMI; TBI(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreII > scoreMI); scoreBest = scoreII; TBI(uPrefixLengthA, uPrefixLengthB) = 'I'; } DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert J=GapA+LetterB { SCORE scoreMJ = DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB-1].m_scoreGapOpen2; SCORE scoreJJ = DPJ(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend2; SCORE scoreBest; if (scoreMJ > scoreJJ) { scoreBest = scoreMJ; TBJ(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreJJ >= scoreMJ); scoreBest = scoreJJ; TBJ(uPrefixLengthA, uPrefixLengthB) = 'J'; } DPJ(uPrefixLengthA, uPrefixLengthB) = scoreBest; } } } // Special case: close gaps at end of alignment DPD(uLengthA, uLengthB) += PA[uLengthA-1].m_scoreGapClose; DPE(uLengthA, uLengthB) += PA[uLengthA-1].m_scoreGapClose2; DPI(uLengthA, uLengthB) += PB[uLengthB-1].m_scoreGapClose; DPJ(uLengthA, uLengthB) += PB[uLengthB-1].m_scoreGapClose2; #if TRACE Log("DPL:\n"); ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPM:\n"); ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPD:\n"); ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPE:\n"); ListDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPI:\n"); ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("DPJ:\n"); ListDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB); 
Log("TBM:\n"); ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBD:\n"); ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBE:\n"); ListTB(TBE_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBI:\n"); ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("TBJ:\n"); ListTB(TBJ_, PA, PB, uPrefixCountA, uPrefixCountB); #endif // ========== // Trace-back // ========== Path.Clear(); // Find last edge char cEdgeType = '?'; SCORE BestScore = MINUS_INFINITY; SCORE M = DPM(uLengthA, uLengthB); SCORE D = DPD(uLengthA, uLengthB); SCORE E = DPE(uLengthA, uLengthB); SCORE I = DPI(uLengthA, uLengthB); SCORE J = DPJ(uLengthA, uLengthB); if (M >= D && M >= E && M >= I && M >= J) { cEdgeType = 'M'; BestScore = M; } else if (D >= M && D >= E && D >= I && D >= J) { cEdgeType = 'D'; BestScore = D; } else if (E >= M && E >= D && E >= I && E >= J) { cEdgeType = 'E'; BestScore = E; } else if (I >= M && I >= D && I >= E && I >= J) { cEdgeType = 'I'; BestScore = I; } else if (J >= M && J >= D && J >= E && J >= I) { cEdgeType = 'J'; BestScore = J; } else Quit("Bad max"); unsigned PLA = uLengthA; unsigned PLB = uLengthB; unsigned ECount = 0; unsigned JCount = 0; for (;;) { #if TRACE Log("TraceBack: %c%u.%u\n", cEdgeType, PLA, PLB); #endif PWEdge Edge; Edge.cType = XlatEdgeType(cEdgeType); Edge.uPrefixLengthA = PLA; Edge.uPrefixLengthB = PLB; Path.PrependEdge(Edge); switch (cEdgeType) { case 'M': assert(PLA > 0); assert(PLB > 0); cEdgeType = TBM(PLA, PLB); --PLA; --PLB; break; case 'D': assert(PLA > 0); cEdgeType = TBD(PLA, PLB); --PLA; break; case 'E': ++ECount; assert(PLA > 0); cEdgeType = TBE(PLA, PLB); --PLA; break; case 'I': assert(PLB > 0); cEdgeType = TBI(PLA, PLB); --PLB; break; case 'J': ++JCount; assert(PLB > 0); cEdgeType = TBJ(PLA, PLB); --PLB; break; default: Quit("Invalid edge %c", cEdgeType); } if (0 == PLA && 0 == PLB) break; } //if (ECount > 0 || JCount > 0) // fprintf(stderr, "E=%d J=%d\n", ECount, JCount); Path.Validate(); if (Path.GetMatchCount() + 
Path.GetDeleteCount() != uLengthA) Quit("Path count A"); if (Path.GetMatchCount() + Path.GetInsertCount() != uLengthB) Quit("Path count B"); if (g_bKeepSimpleDP) { g_DPM = DPM_; g_DPD = DPD_; g_DPE = DPE_; g_DPI = DPI_; g_DPJ = DPJ_; g_TBM = TBM_; g_TBD = TBD_; g_TBE = TBE_; g_TBI = TBI_; g_TBJ = TBJ_; } else { delete[] DPM_; delete[] DPD_; delete[] DPE_; delete[] DPI_; delete[] DPJ_; delete[] TBM_; delete[] TBD_; delete[] TBE_; delete[] TBI_; delete[] TBJ_; } #if TRACE Log("BestScore=%.6g\n", BestScore); #endif return BestScore; } #endif // DOUBLE_AFFINE nwdasimple.cpp0000664000175000017500000003060512360262614012023 0ustar bobbob#include "muscle.h" #include #include "pwpath.h" #include "profile.h" #include #define TRACE 0 bool g_bKeepSimpleDP; SCORE *g_DPM; SCORE *g_DPD; SCORE *g_DPE; SCORE *g_DPI; SCORE *g_DPJ; char *g_TBM; char *g_TBD; char *g_TBE; char *g_TBI; char *g_TBJ; #if DOUBLE_AFFINE static char XlatEdgeType(char c) { if ('E' == c) return 'D'; if ('J' == c) return 'I'; return c; } static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -100000) return " *"; sprintf(str, "%6.1f", s); return str; } static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %6c", TBM(uPrefixLengthA, uPrefixLengthB)); Log("\n"); } } static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned 
uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } SCORE NWDASimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; // Allocate DP matrices const size_t LM = uPrefixCountA*uPrefixCountB; SCORE *DPL_ = new SCORE[LM]; SCORE *DPM_ = new SCORE[LM]; SCORE *DPD_ = new SCORE[LM]; SCORE *DPE_ = new SCORE[LM]; SCORE *DPI_ = new SCORE[LM]; SCORE *DPJ_ = new SCORE[LM]; char *TBM_ = new char[LM]; char *TBD_ = new char[LM]; char *TBE_ = new char[LM]; char *TBI_ = new char[LM]; char *TBJ_ = new char[LM]; memset(TBM_, '?', LM); memset(TBD_, '?', LM); memset(TBE_, '?', LM); memset(TBI_, '?', LM); memset(TBJ_, '?', LM); DPM(0, 0) = 0; DPD(0, 0) = MINUS_INFINITY; DPE(0, 0) = MINUS_INFINITY; DPI(0, 0) = MINUS_INFINITY; DPJ(0, 0) = MINUS_INFINITY; DPM(1, 0) = MINUS_INFINITY; DPD(1, 0) = PA[0].m_scoreGapOpen; DPE(1, 0) = PA[0].m_scoreGapOpen2; TBD(1, 0) = 'D'; TBE(1, 0) = 'E'; DPI(1, 0) = MINUS_INFINITY; DPJ(1, 0) = MINUS_INFINITY; DPM(0, 1) = MINUS_INFINITY; DPD(0, 1) = MINUS_INFINITY; DPE(0, 1) = MINUS_INFINITY; DPI(0, 1) = PB[0].m_scoreGapOpen; DPJ(0, 1) = PB[0].m_scoreGapOpen2; TBI(0, 1) = 'I'; TBJ(0, 1) = 'J'; // Empty prefix of B is special case for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { DPM(uPrefixLengthA, 0) = MINUS_INFINITY; DPD(uPrefixLengthA, 0) = 
DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend; DPE(uPrefixLengthA, 0) = DPE(uPrefixLengthA - 1, 0) + g_scoreGapExtend2; TBD(uPrefixLengthA, 0) = 'D'; TBE(uPrefixLengthA, 0) = 'E'; DPI(uPrefixLengthA, 0) = MINUS_INFINITY; DPJ(uPrefixLengthA, 0) = MINUS_INFINITY; } // Empty prefix of A is special case for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { DPM(0, uPrefixLengthB) = MINUS_INFINITY; DPD(0, uPrefixLengthB) = MINUS_INFINITY; DPE(0, uPrefixLengthB) = MINUS_INFINITY; DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend; DPJ(0, uPrefixLengthB) = DPJ(0, uPrefixLengthB - 1) + g_scoreGapExtend2; TBI(0, uPrefixLengthB) = 'I'; TBJ(0, uPrefixLengthB) = 'J'; } // Special case to agree with NWFast, no D-I transitions so... DPD(uLengthA, 0) = MINUS_INFINITY; DPE(uLengthA, 0) = MINUS_INFINITY; // DPI(0, uLengthB) = MINUS_INFINITY; // DPJ(0, uLengthB) = MINUS_INFINITY; // ============ // Main DP loop // ============ SCORE scoreGapCloseB = MINUS_INFINITY; SCORE scoreGapClose2B = MINUS_INFINITY; for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreGapCloseA = MINUS_INFINITY; SCORE scoreGapClose2A = MINUS_INFINITY; for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; { // Match M=LetterA+LetterB SCORE scoreLL = ScoreProfPos2(PPA, PPB); DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL; SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1); SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA; SCORE scoreEM = DPE(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2A; SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB; SCORE scoreJM = DPJ(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2B; SCORE scoreBest; if (scoreMM >= scoreDM && scoreMM >= scoreEM && scoreMM >= scoreIM && scoreMM >= scoreJM) { scoreBest = scoreMM; 
TBM(uPrefixLengthA, uPrefixLengthB) = 'M'; } else if (scoreDM >= scoreMM && scoreDM >= scoreEM && scoreDM >= scoreIM && scoreDM >= scoreJM) { scoreBest = scoreDM; TBM(uPrefixLengthA, uPrefixLengthB) = 'D'; } else if (scoreEM >= scoreMM && scoreEM >= scoreDM && scoreEM >= scoreIM && scoreEM >= scoreJM) { scoreBest = scoreEM; TBM(uPrefixLengthA, uPrefixLengthB) = 'E'; } else if (scoreIM >= scoreMM && scoreIM >= scoreDM && scoreIM >= scoreEM && scoreIM >= scoreJM) { scoreBest = scoreIM; TBM(uPrefixLengthA, uPrefixLengthB) = 'I'; } else { assert(scoreJM >= scoreMM && scoreJM >= scoreDM && scoreJM >= scoreEM && scoreJM >= scoreIM); scoreBest = scoreJM; TBM(uPrefixLengthA, uPrefixLengthB) = 'J'; } DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; } { // Delete D=LetterA+GapB SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen; SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend; SCORE scoreBest; if (scoreMD >= scoreDD) { scoreBest = scoreMD; TBD(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreDD >= scoreMD); scoreBest = scoreDD; TBD(uPrefixLengthA, uPrefixLengthB) = 'D'; } DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; } { // Delete E=LetterA+GapB SCORE scoreME = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen2; SCORE scoreEE = DPE(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend2; SCORE scoreBest; if (scoreME >= scoreEE) { scoreBest = scoreME; TBE(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreEE >= scoreME); scoreBest = scoreEE; TBE(uPrefixLengthA, uPrefixLengthB) = 'E'; } DPE(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert I=GapA+LetterB { SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB - 1].m_scoreGapOpen; SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend; SCORE scoreBest; if (scoreMI >= scoreII) { scoreBest = scoreMI; TBI(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { 
assert(scoreII > scoreMI); scoreBest = scoreII; TBI(uPrefixLengthA, uPrefixLengthB) = 'I'; } DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert J=GapA+LetterB { SCORE scoreMJ = DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB - 1].m_scoreGapOpen2; SCORE scoreJJ = DPJ(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend2; SCORE scoreBest; if (scoreMJ >= scoreJJ) { scoreBest = scoreMJ; TBJ(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreJJ > scoreMJ); scoreBest = scoreJJ; TBJ(uPrefixLengthA, uPrefixLengthB) = 'J'; } DPJ(uPrefixLengthA, uPrefixLengthB) = scoreBest; } scoreGapCloseA = PPA.m_scoreGapClose; scoreGapClose2A = PPA.m_scoreGapClose2; } scoreGapCloseB = PPB.m_scoreGapClose; scoreGapClose2B = PPB.m_scoreGapClose2; } #if TRACE Log("\n"); Log("DA Simple DPL:\n"); ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPM:\n"); ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPD:\n"); ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPE:\n"); ListDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPI:\n"); ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPJ:\n"); ListDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBM:\n"); ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBD:\n"); ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBE:\n"); ListTB(TBE_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBI:\n"); ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBJ:\n"); ListTB(TBJ_, PA, PB, uPrefixCountA, uPrefixCountB); #endif // Trace-back // ========== Path.Clear(); // Find last edge SCORE M = DPM(uLengthA, uLengthB); SCORE D = DPD(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose; SCORE E = DPE(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose2; SCORE I = 
DPI(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose; SCORE J = DPJ(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose2; char cEdgeType = '?'; SCORE BestScore = M; cEdgeType = 'M'; if (D > BestScore) { cEdgeType = 'D'; BestScore = D; } if (E > BestScore) { cEdgeType = 'E'; BestScore = E; } if (I > BestScore) { cEdgeType = 'I'; BestScore = I; } if (J > BestScore) { cEdgeType = 'J'; BestScore = J; } #if TRACE Log("DA Simple: MAB=%.4g DAB=%.4g EAB=%.4g IAB=%.4g JAB=%.4g best=%c\n", M, D, E, I, J, cEdgeType); #endif unsigned PLA = uLengthA; unsigned PLB = uLengthB; for (;;) { PWEdge Edge; Edge.cType = XlatEdgeType(cEdgeType); Edge.uPrefixLengthA = PLA; Edge.uPrefixLengthB = PLB; #if TRACE Log("Prepend %c%d.%d\n", Edge.cType, PLA, PLB); #endif Path.PrependEdge(Edge); switch (cEdgeType) { case 'M': assert(PLA > 0); assert(PLB > 0); cEdgeType = TBM(PLA, PLB); --PLA; --PLB; break; case 'D': assert(PLA > 0); cEdgeType = TBD(PLA, PLB); --PLA; break; case 'E': assert(PLA > 0); cEdgeType = TBE(PLA, PLB); --PLA; break; case 'I': assert(PLB > 0); cEdgeType = TBI(PLA, PLB); --PLB; break; case 'J': assert(PLB > 0); cEdgeType = TBJ(PLA, PLB); --PLB; break; default: Quit("Invalid edge %c", cEdgeType); } if (0 == PLA && 0 == PLB) break; } Path.Validate(); // SCORE Score = TraceBack(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_, Path); #if TRACE SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path); Path.LogMe(); Log("Score = %s Path = %s\n", LocalScoreToStr(BestScore), LocalScoreToStr(scorePath)); #endif if (g_bKeepSimpleDP) { g_DPM = DPM_; g_DPD = DPD_; g_DPE = DPE_; g_DPI = DPI_; g_DPJ = DPJ_; g_TBM = TBM_; g_TBD = TBD_; g_TBE = TBE_; g_TBI = TBI_; g_TBJ = TBJ_; } else { delete[] DPM_; delete[] DPD_; delete[] DPE_; delete[] DPI_; delete[] DPJ_; delete[] TBM_; delete[] TBD_; delete[] TBE_; delete[] TBI_; delete[] TBJ_; } return BestScore; } #endif // DOUBLE_AFFINE nwdasmall.cpp0000664000175000017500000005367112360262614011652 0ustar bobbob#include "muscle.h" 
#include #include "pwpath.h" #include "profile.h" #include #if DOUBLE_AFFINE // NW double affine small memory, term gaps fully penalized // (so up to caller to adjust in profile if desired). #define TRACE 0 #define MIN(x, y) ((x) < (y) ? (x) : (y)) #if TRACE extern bool g_bKeepSimpleDP; extern SCORE *g_DPM; extern SCORE *g_DPD; extern SCORE *g_DPE; extern SCORE *g_DPI; extern SCORE *g_DPJ; extern char *g_TBM; extern char *g_TBD; extern char *g_TBE; extern char *g_TBI; extern char *g_TBJ; #endif #if TRACE #define ALLOC_TRACE() \ const SCORE UNINIT = MINUS_INFINITY; \ const size_t LM = uPrefixCountA*uPrefixCountB; \ \ SCORE *DPM_ = new SCORE[LM]; \ SCORE *DPD_ = new SCORE[LM]; \ SCORE *DPE_ = new SCORE[LM]; \ SCORE *DPI_ = new SCORE[LM]; \ SCORE *DPJ_ = new SCORE[LM]; \ \ char *TBM_ = new char[LM]; \ char *TBD_ = new char[LM]; \ char *TBE_ = new char[LM]; \ char *TBI_ = new char[LM]; \ char *TBJ_ = new char[LM]; \ \ memset(TBM_, '?', LM); \ memset(TBD_, '?', LM); \ memset(TBE_, '?', LM); \ memset(TBI_, '?', LM); \ memset(TBJ_, '?', LM); \ \ for (unsigned i = 0; i <= uLengthA; ++i) \ for (unsigned j = 0; j <= uLengthB; ++j) \ { \ DPM(i, j) = UNINIT; \ DPD(i, j) = UNINIT; \ DPE(i, j) = UNINIT; \ DPI(i, j) = UNINIT; \ DPJ(i, j) = UNINIT; \ } #else #define ALLOC_TRACE() #endif #if TRACE #define SetDPM(i, j, x) DPM(i, j) = x #define SetDPD(i, j, x) DPD(i, j) = x #define SetDPE(i, j, x) DPE(i, j) = x #define SetDPI(i, j, x) DPI(i, j) = x #define SetDPJ(i, j, x) DPJ(i, j) = x #define SetTBM(i, j, x) TBM(i, j) = x #define SetTBD(i, j, x) TBD(i, j) = x #define SetTBE(i, j, x) TBE(i, j) = x #define SetTBI(i, j, x) TBI(i, j) = x #define SetTBJ(i, j, x) TBJ(i, j) = x #else #define SetDPM(i, j, x) /* empty */ #define SetDPD(i, j, x) /* empty */ #define SetDPE(i, j, x) /* empty */ #define SetDPI(i, j, x) /* empty */ #define SetDPJ(i, j, x) /* empty */ #define SetTBM(i, j, x) /* empty */ #define SetTBD(i, j, x) /* empty */ #define SetTBE(i, j, x) /* empty */ #define SetTBI(i, j, x) 
/* empty */ #define SetTBJ(i, j, x) /* empty */ #endif #define RECURSE_D(i, j) \ { \ SCORE DD = DRow[j] + e; \ SCORE MD = MPrev[j] + PA[i-1].m_scoreGapOpen;\ if (DD > MD) \ { \ DRow[j] = DD; \ SetTBD(i, j, 'D'); \ } \ else \ { \ DRow[j] = MD; \ SetBitTBD(TB, i, j, 'M'); \ SetTBD(i, j, 'M'); \ } \ SetDPD(i, j, DRow[j]); \ } #define RECURSE_E(i, j) \ { \ SCORE EE = ERow[j] + e2; \ SCORE ME = MPrev[j] + PA[i-1].m_scoreGapOpen2;\ if (EE > ME) \ { \ ERow[j] = EE; \ SetTBE(i, j, 'E'); \ } \ else \ { \ ERow[j] = ME; \ SetBitTBE(TB, i, j, 'M'); \ SetTBE(i, j, 'M'); \ } \ SetDPE(i, j, ERow[j]); \ } #define RECURSE_D_ATerm(j) RECURSE_D(uLengthA, j) #define RECURSE_E_ATerm(j) RECURSE_E(uLengthA, j) #define RECURSE_D_BTerm(j) RECURSE_D(i, uLengthB) #define RECURSE_E_BTerm(j) RECURSE_E(i, uLengthB) #define RECURSE_I(i, j) \ { \ Iij += e; \ SCORE MI = MCurr[j-1] + PB[j-1].m_scoreGapOpen;\ if (MI >= Iij) \ { \ Iij = MI; \ SetBitTBI(TB, i, j, 'M'); \ SetTBI(i, j, 'M'); \ } \ else \ SetTBI(i, j, 'I'); \ SetDPI(i, j, Iij); \ } #define RECURSE_J(i, j) \ { \ Jij += e2; \ SCORE MJ = MCurr[j-1] + PB[j-1].m_scoreGapOpen2;\ if (MJ >= Jij) \ { \ Jij = MJ; \ SetBitTBJ(TB, i, j, 'M'); \ SetTBJ(i, j, 'M'); \ } \ else \ SetTBJ(i, j, 'I'); \ SetDPJ(i, j, Jij); \ } #define RECURSE_I_ATerm(j) RECURSE_I(uLengthA, j) #define RECURSE_J_ATerm(j) RECURSE_J(uLengthA, j) #define RECURSE_I_BTerm(j) RECURSE_I(i, uLengthB) #define RECURSE_J_BTerm(j) RECURSE_J(i, uLengthB) #define RECURSE_M(i, j) \ { \ SCORE Best = MCurr[j]; /* MM */ \ SetTBM(i+1, j+1, 'M'); \ SetBitTBM(TB, i+1, j+1, 'M'); \ \ SCORE DM = DRow[j] + PA[i-1].m_scoreGapClose; \ if (DM > Best) \ { \ Best = DM; \ SetTBM(i+1, j+1, 'D'); \ SetBitTBM(TB, i+1, j+1, 'D'); \ } \ \ SCORE EM = ERow[j] + PA[i-1].m_scoreGapClose2; \ if (EM > Best) \ { \ Best = EM; \ SetTBM(i+1, j+1, 'E'); \ SetBitTBM(TB, i+1, j+1, 'E'); \ } \ \ SCORE IM = Iij + PB[j-1].m_scoreGapClose; \ if (IM > Best) \ { \ Best = IM; \ SetTBM(i+1, j+1, 'I'); \ SetBitTBM(TB, i+1, j+1, 
'I'); \ } \ \ SCORE JM = Jij + PB[j-1].m_scoreGapClose2; \ if (JM > Best) \ { \ Best = JM; \ SetTBM(i+1, j+1, 'J'); \ SetBitTBM(TB, i+1, j+1, 'J'); \ } \ MNext[j+1] += Best; \ SetDPM(i+1, j+1, MNext[j+1]); \ } #if TRACE static bool LocalEq(BASETYPE b1, BASETYPE b2) { if (b1 < -100000 && b2 < -100000) return true; double diff = fabs(b1 - b2); if (diff < 0.0001) return true; double sum = fabs(b1) + fabs(b2); return diff/sum < 0.005; } static char Get_M_Char(char Bits) { switch (Bits & BIT_xM) { case BIT_MM: return 'M'; case BIT_DM: return 'D'; case BIT_EM: return 'E'; case BIT_IM: return 'I'; case BIT_JM: return 'J'; } Quit("Huh?"); return '?'; } static char Get_D_Char(char Bits) { return (Bits & BIT_xD) ? 'M' : 'D'; } static char Get_E_Char(char Bits) { return (Bits & BIT_xE) ? 'M' : 'E'; } static char Get_I_Char(char Bits) { return (Bits & BIT_xI) ? 'M' : 'I'; } static char Get_J_Char(char Bits) { return (Bits & BIT_xJ) ? 'M' : 'J'; } static bool DPEq(char c, SCORE *g_DP, SCORE *DPD_, unsigned uPrefixCountA, unsigned uPrefixCountB) { if (0 == g_DP) { Log("***DPDIFF*** DP%c=NULL\n", c); return true; } SCORE *DPM_ = g_DP; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) if (!LocalEq(DPM(i, j), DPD(i, j))) { Log("***DPDIFF*** DP%c(%d, %d) Simple = %.2g, Small = %.2g\n", c, i, j, DPM(i, j), DPD(i, j)); return false; } return true; } static bool CompareTB(char **TB, char *TBM_, char *TBD_, char *TBE_, char *TBI_, char *TBJ_, unsigned uPrefixCountA, unsigned uPrefixCountB) { if (!g_bKeepSimpleDP) return true; SCORE *DPM_ = g_DPM; bool Eq = true; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBM(i, j); char c2 = Get_M_Char(TB[i][j]); if (c1 != '?' 
&& c1 != c2 && DPM(i, j) > -100000) { Log("TBM(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto D; } } D: SCORE *DPD_ = g_DPD; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBD(i, j); char c2 = Get_D_Char(TB[i][j]); if (c1 != '?' && c1 != c2 && DPD(i, j) > -100000) { Log("TBD(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto E; } } E: SCORE *DPE_ = g_DPE; if (0 == TBE_) goto I; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBE(i, j); char c2 = Get_E_Char(TB[i][j]); if (c1 != '?' && c1 != c2 && DPE(i, j) > -100000) { Log("TBE(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto I; } } I: SCORE *DPI_ = g_DPI; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBI(i, j); char c2 = Get_I_Char(TB[i][j]); if (c1 != '?' && c1 != c2 && DPI(i, j) > -100000) { Log("TBI(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto J; } } J: SCORE *DPJ_ = g_DPJ; if (0 == DPJ_) goto Done; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBJ(i, j); char c2 = Get_J_Char(TB[i][j]); if (c1 != '?' 
&& c1 != c2 && DPJ(i, j) > -100000) { Log("TBJ(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto Done; } } Done: if (Eq) Log("TB success\n"); return Eq; } static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -100000) return " *"; sprintf(str, "%6.1f", s); return str; } static void LogDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } static void LogBitTB(char **TB, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); Log("Bit TBM:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_M_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBD:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for 
(unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_D_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBE:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_E_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBI:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_I_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBJ:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_J_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } } static void ListTB(char *TBM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < 
uPrefixCountB; ++uPrefixLengthB) { char c = TBM(uPrefixLengthA, uPrefixLengthB); Log(" %6c", c); } Log("\n"); } } static const char *BitsToStr(char Bits) { static char Str[32]; sprintf(Str, "%cM %cD %cE %cI %cJ", Get_M_Char(Bits), Get_D_Char(Bits), Get_E_Char(Bits), Get_I_Char(Bits), Get_J_Char(Bits)); } #endif // TRACE static inline void SetBitTBM(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MM; break; case 'D': Bit = BIT_DM; break; #if DOUBLE_AFFINE case 'E': Bit = BIT_EM; break; case 'I': Bit = BIT_IM; break; case 'J': Bit = BIT_JM; break; #endif default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xM; TB[i][j] |= Bit; } static inline void SetBitTBD(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MD; break; case 'D': Bit = BIT_DD; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xD; TB[i][j] |= Bit; } static inline void SetBitTBI(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MI; break; case 'I': Bit = BIT_II; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xI; TB[i][j] |= Bit; } #if DOUBLE_AFFINE static inline void SetBitTBE(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_ME; break; case 'E': Bit = BIT_EE; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xE; TB[i][j] |= Bit; } static inline void SetBitTBJ(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MJ; break; case 'J': Bit = BIT_JJ; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xJ; TB[i][j] |= Bit; } #endif #if TRACE #define LogMatrices() \ { \ Log("Bit DPM:\n"); \ LogDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPD:\n"); \ LogDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPE:\n"); \ LogDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPI:\n"); \ LogDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPJ:\n"); \ LogDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB); \ 
Log("Bit TB:\n"); \ LogBitTB(TB, PA, PB, uPrefixCountA, uPrefixCountB); \ bool Same; \ Same = DPEq('M', g_DPM, DPM_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPM success\n"); \ Same = DPEq('D', g_DPD, DPD_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPD success\n"); \ Same = DPEq('E', g_DPE, DPE_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPE success\n"); \ Same = DPEq('I', g_DPI, DPI_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPI success\n"); \ Same = DPEq('J', g_DPJ, DPJ_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPJ success\n"); \ CompareTB(TB, g_TBM, g_TBD, g_TBE, g_TBI, g_TBJ, uPrefixCountA, uPrefixCountB);\ } #else #define LogMatrices() /* empty */ #endif SCORE NWDASmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); ProfPos *pa0 = (ProfPos *) PA; ProfPos *pb0 = (ProfPos *) PB; ProfPos *paa = (ProfPos *) (PA + uLengthA - 1); ProfPos *pbb = (ProfPos *) (PB + uLengthB - 1); pa0->m_scoreGapOpen *= -1; pb0->m_scoreGapOpen *= -1; paa->m_scoreGapClose *= -1; pbb->m_scoreGapClose *= -1; pa0->m_scoreGapOpen2 *= -1; pb0->m_scoreGapOpen2 *= -1; paa->m_scoreGapClose2 *= -1; pbb->m_scoreGapClose2 *= -1; const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; const SCORE e = g_scoreGapExtend; const SCORE e2 = g_scoreGapExtend2; const SCORE min_e = MIN(g_scoreGapExtend, g_scoreGapExtend2); ALLOC_TRACE() SCORE *MCurr = new SCORE[uPrefixCountB]; SCORE *MNext = new SCORE[uPrefixCountB]; SCORE *MPrev = new SCORE[uPrefixCountB]; SCORE *DRow = new SCORE[uPrefixCountB]; SCORE *ERow = new SCORE[uPrefixCountB]; char **TB = new char *[uPrefixCountA]; for (unsigned i = 0; i < uPrefixCountA; ++i) { TB[i] = new char [uPrefixCountB]; memset(TB[i], 0, uPrefixCountB); } SCORE Iij = MINUS_INFINITY; SetDPI(0, 0, Iij); SCORE Jij = MINUS_INFINITY; SetDPJ(0, 0, Jij); Iij = PB[0].m_scoreGapOpen; SetDPI(0, 1, Iij); Jij = PB[0].m_scoreGapOpen2; 
SetDPJ(0, 1, Jij); for (unsigned j = 2; j <= uLengthB; ++j) { Iij += e; Jij += e2; SetDPI(0, j, Iij); SetDPJ(0, j, Jij); SetTBI(0, j, 'I'); SetTBJ(0, j, 'J'); } for (unsigned j = 0; j <= uLengthB; ++j) { DRow[j] = MINUS_INFINITY; ERow[j] = MINUS_INFINITY; SetDPD(0, j, DRow[j]); SetDPE(0, j, ERow[j]); SetTBD(0, j, 'D'); SetTBE(0, j, 'E'); } MPrev[0] = 0; SetDPM(0, 0, MPrev[0]); for (unsigned j = 1; j <= uLengthB; ++j) { MPrev[j] = MINUS_INFINITY; SetDPM(0, j, MPrev[j]); } MCurr[0] = MINUS_INFINITY; SetDPM(1, 0, MCurr[0]); MCurr[1] = ScoreProfPos2(PA[0], PB[0]); SetDPM(1, 1, MCurr[1]); SetBitTBM(TB, 1, 1, 'M'); SetTBM(1, 1, 'M'); for (unsigned j = 2; j <= uLengthB; ++j) { SCORE M = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen + (j - 2)*e + PB[j-2].m_scoreGapClose; SCORE M2 = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen2 + (j - 2)*e2 + PB[j-2].m_scoreGapClose2; if (M >= M2) { MCurr[j] = M; SetBitTBM(TB, 1, j, 'I'); SetTBM(1, j, 'I'); } else { MCurr[j] = M2; SetBitTBM(TB, 1, j, 'J'); SetTBM(1, j, 'J'); } SetDPM(1, j, MCurr[j]); } // Main DP loop for (unsigned i = 1; i < uLengthA; ++i) { Iij = MINUS_INFINITY; Jij = MINUS_INFINITY; SetDPI(i, 0, Iij); SetDPJ(i, 0, Jij); DRow[0] = PA[0].m_scoreGapOpen + (i - 1)*e; ERow[0] = PA[0].m_scoreGapOpen2 + (i - 1)*e2; SetDPD(i, 0, DRow[0]); SetDPE(i, 0, ERow[0]); MCurr[0] = MINUS_INFINITY; if (i == 1) { MCurr[1] = ScoreProfPos2(PA[0], PB[0]); SetBitTBM(TB, i, 1, 'M'); SetTBM(i, 1, 'M'); } else { SCORE M = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen + (i - 2)*e + PA[i-2].m_scoreGapClose; SCORE M2 = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen2 + (i - 2)*e2 + PA[i-2].m_scoreGapClose2; if (M >= M2) { MCurr[1] = M; SetBitTBM(TB, i, 1, 'D'); SetTBM(i, 1, 'D'); } else { MCurr[1] = M2; SetBitTBM(TB, i, 1, 'E'); SetTBM(i, 1, 'E'); } } SetDPM(i, 0, MCurr[0]); SetDPM(i, 1, MCurr[1]); for (unsigned j = 1; j < uLengthB; ++j) MNext[j+1] = ScoreProfPos2(PA[i], PB[j]); for (unsigned j = 1; j < uLengthB; ++j) { 
RECURSE_D(i, j) RECURSE_E(i, j) RECURSE_I(i, j) RECURSE_J(i, j) RECURSE_M(i, j) } // Special case for j=uLengthB RECURSE_D_BTerm(i) RECURSE_E_BTerm(i) RECURSE_I_BTerm(i) RECURSE_J_BTerm(i) // Prev := Curr, Curr := Next, Next := Prev Rotate(MPrev, MCurr, MNext); } // Special case for i=uLengthA MCurr[0] = MINUS_INFINITY; SCORE M = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e + PA[0].m_scoreGapOpen + PA[uLengthA-2].m_scoreGapClose; SCORE M2 = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e + PA[0].m_scoreGapOpen + PA[uLengthA-2].m_scoreGapClose; if (M >= M2) { MCurr[1] = M; SetBitTBM(TB, uLengthA, 1, 'D'); SetTBM(uLengthA, 1, 'D'); } else { MCurr[1] = M2; SetBitTBM(TB, uLengthA, 1, 'E'); SetTBM(uLengthA, 1, 'D'); } SetDPM(uLengthA, 0, MCurr[0]); SetDPM(uLengthA, 1, MCurr[1]); DRow[0] = MINUS_INFINITY; ERow[0] = MINUS_INFINITY; SetDPD(uLengthA, 0, DRow[0]); SetDPE(uLengthA, 0, ERow[0]); for (unsigned j = 1; j <= uLengthB; ++j) { RECURSE_D_ATerm(j); RECURSE_E_ATerm(j); } Iij = MINUS_INFINITY; Jij = MINUS_INFINITY; for (unsigned j = 1; j <= uLengthB; ++j) { RECURSE_I_ATerm(j) RECURSE_J_ATerm(j) } LogMatrices(); SCORE MAB = MCurr[uLengthB]; SCORE DAB = DRow[uLengthB] + PA[uLengthA-1].m_scoreGapClose; SCORE EAB = ERow[uLengthB] + PA[uLengthA-1].m_scoreGapClose2; SCORE IAB = Iij + PB[uLengthB-1].m_scoreGapClose; SCORE JAB = Jij + PB[uLengthB-1].m_scoreGapClose2; SCORE Score = MAB; char cEdgeType = 'M'; if (DAB > Score) { Score = DAB; cEdgeType = 'D'; } if (EAB > Score) { Score = EAB; cEdgeType = 'E'; } if (IAB > Score) { Score = IAB; cEdgeType = 'I'; } if (JAB > Score) { Score = JAB; cEdgeType = 'J'; } #if TRACE Log(" Small: MAB=%.4g DAB=%.4g EAB=%.4g IAB=%.4g JAB=%.4g best=%c\n", MAB, DAB, EAB, IAB, JAB, cEdgeType); #endif BitTraceBack(TB, uLengthA, uLengthB, cEdgeType, Path); #if DBEUG Path.Validate(); #endif delete[] MCurr; delete[] MNext; delete[] MPrev; delete[] DRow; delete[] ERow; for (unsigned i = 0; i < uPrefixCountA; ++i) delete[] TB[i]; 
delete[] TB; return 0; } #endif // DOUBLE_AFFINE nwrec.cpp0000664000175000017500000000460012360262613010771 0ustar bobbob/*** Needleman-Wunch recursions Notation: i,j are prefix lengths so are in ranges i = [0,|A|] and j = [0,|B|]. Profile positions are in ranges [0,|A|-1] and [0,|B|-1] so prefix length i corresponds to position (i-1) in the profile, and similarly for j. Terminal gap scoring -------------------- Terminal gaps are scored as with open [close] penalties only at the left [right] terminal, as follows: 0 i | | A XXXXX... B ---XX... i |A|-1 | | A ...XXXXX B ...XX--- In these examples, open / close penalty at position i is included, but close / open penalty at |A|-1 / 0 is not included. This is implemented by setting the open [close] penalty to zero in the first [last] position of each profile. Consider adding a column to a sub-alignment. After the column is added, there are i letters from A and j letters from B. The column starts a left-terminal gap if: Delete with i=1, j=0 or Insert with i=0, j=1. The column ends a left-terminal gap if: Match following Delete with j=1, or Match following Insert with i=1. The column starts a right-terminal gap if: Delete following a Match and i=|A|, or Insert following a Match and j=|B|. The column ends a right-terminal gap if: Match with i=|A|, j=|B| following Delete or Insert. RECURSION RELATIONS =================== i-1 | DD A ..X X B ..- - MD A ..X X B ..X - D(i,j) = max D(i-1,j) + e M(i-1,j) + goA(i-1) Valid for: i = [1,|A|-1] j = [1,|B|] I(i,j) By symmetry with D(i,j). 
i-2 | i-1 | | MM A ..X X B ..X X DM A ..X X B ..- X IM A ..- X B ..X X | | | j-1 j-2 M(i,j) = L(i-1,j-1) + max M(i-1,j-1) D(i-1,j-1) + gcA(i-2) I(i-1,j-1) + gcB(j-2) Valid for: i = [2,|A|] j = [2,|B|] Equivalently: M(i+1,j+1) = L(i,j) + max M(i,j) D(i,j) + gcA(i-1) I(i,j) + gcB(j-1) Valid for: i = [1,|A|-1] j = [1,|B|-1] Boundary conditions =================== A XXXX B ---- D(0,0) = -infinity D(i,0) = ie i = [1,|A|] D(0,j) = -infinity j = [0,|B|] I(0,0), I(0,j) and I(i,0) by symmetry with D. M(0,0) = 0 M(i,0) = -infinity, i > 0 M(0,j) = -infinity, j > 0 A X B - D(1,0) = e D(1,j) = -infinity, j = [1,|B|] (assuming no I-D allowed). D(0,1) = -infinity D(1,1) = -infinity D(i,1) = max. ***/ nwsmall.cpp0000664000175000017500000003743412360262614011344 0ustar bobbob#include "muscle.h" #include #include "pwpath.h" #include "profile.h" #include // NW small memory #define TRACE 0 #if TRACE extern bool g_bKeepSimpleDP; extern SCORE *g_DPM; extern SCORE *g_DPD; extern SCORE *g_DPI; extern char *g_TBM; extern char *g_TBD; extern char *g_TBI; #endif #if TRACE #define ALLOC_TRACE() \ const SCORE UNINIT = MINUS_INFINITY; \ const size_t LM = uPrefixCountA*uPrefixCountB; \ \ SCORE *DPM_ = new SCORE[LM]; \ SCORE *DPD_ = new SCORE[LM]; \ SCORE *DPI_ = new SCORE[LM]; \ \ char *TBM_ = new char[LM]; \ char *TBD_ = new char[LM]; \ char *TBI_ = new char[LM]; \ \ memset(TBM_, '?', LM); \ memset(TBD_, '?', LM); \ memset(TBI_, '?', LM); \ \ for (unsigned i = 0; i <= uLengthA; ++i) \ for (unsigned j = 0; j <= uLengthB; ++j) \ { \ DPM(i, j) = UNINIT; \ DPD(i, j) = UNINIT; \ DPI(i, j) = UNINIT; \ } #else #define ALLOC_TRACE() #endif #if TRACE #define SetDPM(i, j, x) DPM(i, j) = x #define SetDPD(i, j, x) DPD(i, j) = x #define SetDPI(i, j, x) DPI(i, j) = x #define SetTBM(i, j, x) TBM(i, j) = x #define SetTBD(i, j, x) TBD(i, j) = x #define SetTBI(i, j, x) TBI(i, j) = x #else #define SetDPM(i, j, x) /* empty */ #define SetDPD(i, j, x) /* empty */ #define SetDPI(i, j, x) /* empty */ #define SetTBM(i, 
j, x) /* empty */ #define SetTBD(i, j, x) /* empty */ #define SetTBI(i, j, x) /* empty */ #endif #define RECURSE_D(i, j) \ { \ SCORE DD = DRow[j] + e; \ SCORE MD = MPrev[j] + PA[i-1].m_scoreGapOpen;\ if (DD > MD) \ { \ DRow[j] = DD; \ SetTBD(i, j, 'D'); \ } \ else \ { \ DRow[j] = MD; \ /* SetBitTBD(TB, i, j, 'M'); */ \ TBRow[j] &= ~BIT_xD; \ TBRow[j] |= BIT_MD; \ SetTBD(i, j, 'M'); \ } \ SetDPD(i, j, DRow[j]); \ } #define RECURSE_D_ATerm(j) RECURSE_D(uLengthA, j) #define RECURSE_D_BTerm(j) RECURSE_D(i, uLengthB) #define RECURSE_I(i, j) \ { \ Iij += e; \ SCORE MI = MCurr[j-1] + PB[j-1].m_scoreGapOpen;\ if (MI >= Iij) \ { \ Iij = MI; \ /* SetBitTBI(TB, i, j, 'M'); */ \ TBRow[j] &= ~BIT_xI; \ TBRow[j] |= BIT_MI; \ SetTBI(i, j, 'M'); \ } \ else \ SetTBI(i, j, 'I'); \ SetDPI(i, j, Iij); \ } #define RECURSE_I_ATerm(j) RECURSE_I(uLengthA, j) #define RECURSE_I_BTerm(j) RECURSE_I(i, uLengthB) #define RECURSE_M(i, j) \ { \ SCORE DM = DRow[j] + PA[i-1].m_scoreGapClose; \ SCORE IM = Iij + PB[j-1].m_scoreGapClose; \ SCORE MM = MCurr[j]; \ TB[i+1][j+1] &= ~BIT_xM; \ if (MM >= DM && MM >= IM) \ { \ MNext[j+1] += MM; \ SetDPM(i+1, j+1, MNext[j+1]); \ SetTBM(i+1, j+1, 'M'); \ /* SetBitTBM(TB, i+1, j+1, 'M'); */ \ TB[i+1][j+1] |= BIT_MM; \ } \ else if (DM >= MM && DM >= IM) \ { \ MNext[j+1] += DM; \ SetDPM(i+1, j+1, MNext[j+1]); \ SetTBM(i+1, j+1, 'D'); \ /* SetBitTBM(TB, i+1, j+1, 'D'); */ \ TB[i+1][j+1] |= BIT_DM; \ } \ else \ { \ assert(IM >= MM && IM >= DM); \ MNext[j+1] += IM; \ SetDPM(i+1, j+1, MNext[j+1]); \ SetTBM(i+1, j+1, 'I'); \ /* SetBitTBM(TB, i+1, j+1, 'I'); */ \ TB[i+1][j+1] |= BIT_IM; \ } \ } #if TRACE static bool LocalEq(BASETYPE b1, BASETYPE b2) { if (b1 < -100000 && b2 < -100000) return true; double diff = fabs(b1 - b2); if (diff < 0.0001) return true; double sum = fabs(b1) + fabs(b2); return diff/sum < 0.005; } static char Get_M_Char(char Bits) { switch (Bits & BIT_xM) { case BIT_MM: return 'M'; case BIT_DM: return 'D'; case BIT_IM: return 'I'; } Quit("Huh?"); 
return '?'; } static char Get_D_Char(char Bits) { return (Bits & BIT_xD) ? 'M' : 'D'; } static char Get_I_Char(char Bits) { return (Bits & BIT_xI) ? 'M' : 'I'; } static bool DPEq(char c, SCORE *g_DP, SCORE *DPD_, unsigned uPrefixCountA, unsigned uPrefixCountB) { SCORE *DPM_ = g_DP; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) if (!LocalEq(DPM(i, j), DPD(i, j))) { Log("***DPDIFF*** DP%c(%d, %d) Simple = %.2g, Fast = %.2g\n", c, i, j, DPM(i, j), DPD(i, j)); return false; } return true; } static bool CompareTB(char **TB, char *TBM_, char *TBD_, char *TBI_, unsigned uPrefixCountA, unsigned uPrefixCountB) { SCORE *DPM_ = g_DPM; bool Eq = true; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBM(i, j); char c2 = Get_M_Char(TB[i][j]); if (c1 != '?' && c1 != c2 && DPM(i, j) > -100000) { Log("TBM(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto D; } } D: SCORE *DPD_ = g_DPD; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBD(i, j); char c2 = Get_D_Char(TB[i][j]); if (c1 != '?' && c1 != c2 && DPD(i, j) > -100000) { Log("TBD(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto I; } } I: SCORE *DPI_ = g_DPI; for (unsigned i = 0; i < uPrefixCountA; ++i) for (unsigned j = 0; j < uPrefixCountB; ++j) { char c1 = TBI(i, j); char c2 = Get_I_Char(TB[i][j]); if (c1 != '?' 
&& c1 != c2 && DPI(i, j) > -100000) { Log("TBI(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); Eq = false; goto Done; } } Done: if (Eq) Log("TB success\n"); return Eq; } static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -100000) return " *"; sprintf(str, "%6.1f", s); return str; } static void LogDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } static void LogBitTB(char **TB, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); Log("Bit TBM:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_M_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBD:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for 
(unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_D_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } Log("\n"); Log("Bit TBI:\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = Get_I_Char(TB[uPrefixLengthA][uPrefixLengthB]); Log(" %6c", c); } Log("\n"); } } static void ListTB(char *TBM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = TBM(uPrefixLengthA, uPrefixLengthB); Log(" %6c", c); } Log("\n"); } } static const char *BitsToStr(char Bits) { static char Str[9]; sprintf(Str, "%cM %cD %cI", Get_M_Char(Bits), Get_D_Char(Bits), Get_I_Char(Bits)); } #endif // TRACE static inline void SetBitTBM(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MM; break; case 'D': Bit = BIT_DM; break; case 'I': Bit = BIT_IM; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xM; TB[i][j] |= Bit; } static inline void SetBitTBD(char **TB, unsigned i, unsigned j, char c) { char Bit; switch (c) { case 'M': Bit = BIT_MD; break; case 'D': Bit = BIT_DD; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xD; TB[i][j] |= Bit; } static inline void SetBitTBI(char **TB, unsigned i, unsigned j, 
char c) { char Bit; switch (c) { case 'M': Bit = BIT_MI; break; case 'I': Bit = BIT_II; break; default: Quit("Huh?!"); } TB[i][j] &= ~BIT_xI; TB[i][j] |= Bit; } #if TRACE #define LogMatrices() \ { \ Log("Bit DPM:\n"); \ LogDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPD:\n"); \ LogDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit DPI:\n"); \ LogDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); \ Log("Bit TB:\n"); \ LogBitTB(TB, PA, PB, uPrefixCountA, uPrefixCountB); \ bool Same; \ Same = DPEq('M', g_DPM, DPM_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPM success\n"); \ Same = DPEq('D', g_DPD, DPD_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPD success\n"); \ Same = DPEq('I', g_DPI, DPI_, uPrefixCountA, uPrefixCountB);\ if (Same) \ Log("DPI success\n"); \ CompareTB(TB, g_TBM, g_TBD, g_TBI, uPrefixCountA, uPrefixCountB);\ } #else #define LogMatrices() /* empty */ #endif static unsigned uCachePrefixCountB; static unsigned uCachePrefixCountA; static SCORE *CacheMCurr; static SCORE *CacheMNext; static SCORE *CacheMPrev; static SCORE *CacheDRow; static char **CacheTB; static void AllocCache(unsigned uPrefixCountA, unsigned uPrefixCountB) { if (uPrefixCountA <= uCachePrefixCountA && uPrefixCountB <= uCachePrefixCountB) return; delete[] CacheMCurr; delete[] CacheMNext; delete[] CacheMPrev; delete[] CacheDRow; for (unsigned i = 0; i < uCachePrefixCountA; ++i) delete[] CacheTB[i]; delete[] CacheTB; uCachePrefixCountA = uPrefixCountA + 1024; uCachePrefixCountB = uPrefixCountB + 1024; CacheMCurr = new SCORE[uCachePrefixCountB]; CacheMNext = new SCORE[uCachePrefixCountB]; CacheMPrev = new SCORE[uCachePrefixCountB]; CacheDRow = new SCORE[uCachePrefixCountB]; CacheTB = new char *[uCachePrefixCountA]; for (unsigned i = 0; i < uCachePrefixCountA; ++i) CacheTB[i] = new char [uCachePrefixCountB]; } SCORE NWSmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { if (0 == uLengthB || 0 == uLengthA ) 
Quit("Internal error, NWSmall: length=0"); SetTermGaps(PA, uLengthA); SetTermGaps(PB, uLengthB); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; const SCORE e = g_scoreGapExtend; ALLOC_TRACE() AllocCache(uPrefixCountA, uPrefixCountB); SCORE *MCurr = CacheMCurr; SCORE *MNext = CacheMNext; SCORE *MPrev = CacheMPrev; SCORE *DRow = CacheDRow; char **TB = CacheTB; for (unsigned i = 0; i < uPrefixCountA; ++i) memset(TB[i], 0, uPrefixCountB); SCORE Iij = MINUS_INFINITY; SetDPI(0, 0, Iij); Iij = PB[0].m_scoreGapOpen; SetDPI(0, 1, Iij); for (unsigned j = 2; j <= uLengthB; ++j) { Iij += e; SetDPI(0, j, Iij); SetTBI(0, j, 'I'); } for (unsigned j = 0; j <= uLengthB; ++j) { DRow[j] = MINUS_INFINITY; SetDPD(0, j, DRow[j]); SetTBD(0, j, 'D'); } MPrev[0] = 0; SetDPM(0, 0, MPrev[0]); for (unsigned j = 1; j <= uLengthB; ++j) { MPrev[j] = MINUS_INFINITY; SetDPM(0, j, MPrev[j]); } MCurr[0] = MINUS_INFINITY; SetDPM(1, 0, MCurr[0]); MCurr[1] = ScoreProfPos2(PA[0], PB[0]); SetDPM(1, 1, MCurr[1]); SetBitTBM(TB, 1, 1, 'M'); SetTBM(1, 1, 'M'); for (unsigned j = 2; j <= uLengthB; ++j) { MCurr[j] = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen + (j - 2)*e + PB[j-2].m_scoreGapClose; SetDPM(1, j, MCurr[j]); SetBitTBM(TB, 1, j, 'I'); SetTBM(1, j, 'I'); } // Main DP loop for (unsigned i = 1; i < uLengthA; ++i) { char *TBRow = TB[i]; Iij = MINUS_INFINITY; SetDPI(i, 0, Iij); DRow[0] = PA[0].m_scoreGapOpen + (i - 1)*e; SetDPD(i, 0, DRow[0]); MCurr[0] = MINUS_INFINITY; if (i == 1) { MCurr[1] = ScoreProfPos2(PA[0], PB[0]); SetBitTBM(TB, i, 1, 'M'); SetTBM(i, 1, 'M'); } else { MCurr[1] = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen + (i - 2)*e + PA[i-2].m_scoreGapClose; SetBitTBM(TB, i, 1, 'D'); SetTBM(i, 1, 'D'); } SetDPM(i, 0, MCurr[0]); SetDPM(i, 1, MCurr[1]); for (unsigned j = 1; j < uLengthB; ++j) MNext[j+1] = ScoreProfPos2(PA[i], PB[j]); for (unsigned j = 1; j < uLengthB; ++j) { RECURSE_D(i, j) RECURSE_I(i, j) RECURSE_M(i, j) } // Special 
case for j=uLengthB RECURSE_D_BTerm(i) RECURSE_I_BTerm(i) // Prev := Curr, Curr := Next, Next := Prev Rotate(MPrev, MCurr, MNext); } // Special case for i=uLengthA char *TBRow = TB[uLengthA]; MCurr[0] = MINUS_INFINITY; if (uLengthA > 1) MCurr[1] = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e + PA[0].m_scoreGapOpen + PA[uLengthA-2].m_scoreGapClose; else MCurr[1] = ScoreProfPos2(PA[uLengthA-1], PB[0]) + PA[0].m_scoreGapOpen + PA[0].m_scoreGapClose; SetBitTBM(TB, uLengthA, 1, 'D'); SetTBM(uLengthA, 1, 'D'); SetDPM(uLengthA, 0, MCurr[0]); SetDPM(uLengthA, 1, MCurr[1]); DRow[0] = MINUS_INFINITY; SetDPD(uLengthA, 0, DRow[0]); for (unsigned j = 1; j <= uLengthB; ++j) RECURSE_D_ATerm(j); Iij = MINUS_INFINITY; for (unsigned j = 1; j <= uLengthB; ++j) RECURSE_I_ATerm(j) LogMatrices(); SCORE MAB = MCurr[uLengthB]; SCORE DAB = DRow[uLengthB]; SCORE IAB = Iij; SCORE Score = MAB; char cEdgeType = 'M'; if (DAB > Score) { Score = DAB; cEdgeType = 'D'; } if (IAB > Score) { Score = IAB; cEdgeType = 'I'; } #if TRACE Log(" Fast: MAB=%.4g DAB=%.4g IAB=%.4g best=%c\n", MAB, DAB, IAB, cEdgeType); #endif BitTraceBack(TB, uLengthA, uLengthB, cEdgeType, Path); #if DBEUG Path.Validate(); #endif return 0; } objscore2.cpp0000664000175000017500000003304512360262613011550 0ustar bobbob#include "muscle.h" #include "msa.h" #include "profile.h" #include "objscore.h" #define TRACE 0 #define TRACE_SEQPAIR 0 #define TEST_SPFAST 0 extern SCOREMATRIX VTML_LA; extern SCOREMATRIX PAM200; extern SCOREMATRIX PAM200NoCenter; extern SCOREMATRIX VTML_SP; extern SCOREMATRIX VTML_SPNoCenter; extern SCOREMATRIX NUC_SP; SCORE g_SPScoreLetters; SCORE g_SPScoreGaps; static SCORE TermGapScore(bool Gap) { switch (g_TermGaps) { case TERMGAPS_Full: return 0; case TERMGAPS_Half: if (Gap) return g_scoreGapOpen/2; return 0; case TERMGAPS_Ext: if (Gap) return g_scoreGapExtend; return 0; } Quit("TermGapScore?!"); return 0; } SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned 
uSeqIndex2) { const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPairLetters, different lengths"); #if TRACE_SEQPAIR { Log("\n"); Log("ScoreSeqPairLetters\n"); MSA msaTmp; msaTmp.SetSize(2, uColCount); msaTmp.CopySeq(0, msa1, uSeqIndex1); msaTmp.CopySeq(1, msa2, uSeqIndex2); msaTmp.LogMe(); } #endif SCORE scoreLetters = 0; SCORE scoreGaps = 0; bool bGapping1 = false; bool bGapping2 = false; unsigned uColStart = 0; bool bLeftTermGap = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bLeftTermGap = true; uColStart = uColIndex; break; } } unsigned uColEnd = uColCount - 1; bool bRightTermGap = false; for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bRightTermGap = true; uColEnd = (unsigned) iColIndex; break; } } #if TRACE_SEQPAIR Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); #endif for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) { unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter1 >= g_AlphaSize) continue; unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex); if (uLetter2 >= g_AlphaSize) continue; SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2]; scoreLetters += scoreMatch; } return scoreLetters; } SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2) { const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPairGaps, different lengths"); #if TRACE_SEQPAIR { Log("\n"); Log("ScoreSeqPairGaps\n"); MSA msaTmp; msaTmp.SetSize(2, uColCount); 
msaTmp.CopySeq(0, msa1, uSeqIndex1); msaTmp.CopySeq(1, msa2, uSeqIndex2); msaTmp.LogMe(); } #endif SCORE scoreGaps = 0; bool bGapping1 = false; bool bGapping2 = false; unsigned uColStart = 0; bool bLeftTermGap = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bLeftTermGap = true; uColStart = uColIndex; break; } } unsigned uColEnd = uColCount - 1; bool bRightTermGap = false; for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bRightTermGap = true; uColEnd = (unsigned) iColIndex; break; } } #if TRACE_SEQPAIR Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); #endif for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (bGap1 && bGap2) continue; if (bGap1) { if (!bGapping1) { #if TRACE_SEQPAIR Log("Gap open seq 1 col %d\n", uColIndex); #endif if (uColIndex == uColStart) scoreGaps += TermGapScore(true); else scoreGaps += g_scoreGapOpen; bGapping1 = true; } else scoreGaps += g_scoreGapExtend; continue; } else if (bGap2) { if (!bGapping2) { #if TRACE_SEQPAIR Log("Gap open seq 2 col %d\n", uColIndex); #endif if (uColIndex == uColStart) scoreGaps += TermGapScore(true); else scoreGaps += g_scoreGapOpen; bGapping2 = true; } else scoreGaps += g_scoreGapExtend; continue; } bGapping1 = false; bGapping2 = false; } if (bGapping1 || bGapping2) { scoreGaps -= g_scoreGapOpen; scoreGaps += TermGapScore(true); } return scoreGaps; } // The usual sum-of-pairs objective score: sum the score // of the alignment of each pair of sequences. 
SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[]) { #if TRACE Log("==================ObjScoreSP==============\n"); Log("msa=\n"); msa.LogMe(); #endif g_SPScoreLetters = 0; g_SPScoreGaps = 0; if (0 != MatchScore) { const unsigned uColCount = msa.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) MatchScore[uColIndex] = 0; } const unsigned uSeqCount = msa.GetSeqCount(); SCORE scoreTotal = 0; unsigned uPairCount = 0; #if TRACE Log("Seq1 Seq2 wt1 wt2 Letters Gaps Unwt.Score Wt.Score Total\n"); Log("---- ---- ------ ------ ---------- ---------- ---------- ---------- ----------\n"); #endif for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scorePair = scoreLetters + scoreGaps; ++uPairCount; scoreTotal += w*scorePair; g_SPScoreLetters += w*scoreLetters; g_SPScoreGaps += w*scoreGaps; #if TRACE Log("%4d %4d %6.3f %6.3f %10.2f %10.2f %10.2f %10.2f %10.2f >%s >%s\n", uSeqIndex1, uSeqIndex2, w1, w2, scoreLetters, scoreGaps, scorePair, scorePair*w1*w2, scoreTotal, msa.GetSeqName(uSeqIndex1), msa.GetSeqName(uSeqIndex2)); #endif } } #if TEST_SPFAST { SCORE f = ObjScoreSPFast(msa); Log("Fast = %.6g\n", f); Log("Brute = %.6g\n", scoreTotal); if (BTEq(f, scoreTotal)) Log("Agree\n"); else Log("** DISAGREE **\n"); } #endif // return scoreTotal / uPairCount; return scoreTotal; } // Objective score defined as the dynamic programming score. // Input is two alignments, which must be of the same length. // Result is the same profile-profile score that is optimized // by dynamic programming. 
SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[]) { const unsigned uColCount = msa1.GetColCount(); if (msa2.GetColCount() != uColCount) Quit("ObjScoreDP, must be same length"); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const ProfPos *PA = ProfileFromMSA(msa1); const ProfPos *PB = ProfileFromMSA(msa2); return ObjScoreDP_Profs(PA, PB, uColCount1, MatchScore); } SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount, SCORE MatchScore[]) { //#if TRACE // Log("Profile 1:\n"); // ListProfile(PA, uColCount, &msa1); // // Log("Profile 2:\n"); // ListProfile(PB, uColCount, &msa2); //#endif SCORE scoreTotal = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const ProfPos &PPA = PA[uColIndex]; const ProfPos &PPB = PB[uColIndex]; SCORE scoreGap = 0; SCORE scoreMatch = 0; // If gapped column... if (PPA.m_bAllGaps && PPB.m_bAllGaps) scoreGap = 0; else if (PPA.m_bAllGaps) { if (uColCount - 1 == uColIndex || !PA[uColIndex+1].m_bAllGaps) scoreGap = PPB.m_scoreGapClose; if (0 == uColIndex || !PA[uColIndex-1].m_bAllGaps) scoreGap += PPB.m_scoreGapOpen; //if (0 == scoreGap) // scoreGap = PPB.m_scoreGapExtend; } else if (PPB.m_bAllGaps) { if (uColCount - 1 == uColIndex || !PB[uColIndex+1].m_bAllGaps) scoreGap = PPA.m_scoreGapClose; if (0 == uColIndex || !PB[uColIndex-1].m_bAllGaps) scoreGap += PPA.m_scoreGapOpen; //if (0 == scoreGap) // scoreGap = PPA.m_scoreGapExtend; } else scoreMatch = ScoreProfPos2(PPA, PPB); if (0 != MatchScore) MatchScore[uColIndex] = scoreMatch; scoreTotal += scoreMatch + scoreGap; extern bool g_bTracePPScore; extern MSA *g_ptrPPScoreMSA1; extern MSA *g_ptrPPScoreMSA2; if (g_bTracePPScore) { const MSA &msa1 = *g_ptrPPScoreMSA1; const MSA &msa2 = *g_ptrPPScoreMSA2; const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); for (unsigned n = 0; n < uSeqCount1; ++n) Log("%c", msa1.GetChar(n, uColIndex)); Log(" 
"); for (unsigned n = 0; n < uSeqCount2; ++n) Log("%c", msa2.GetChar(n, uColIndex)); Log(" %10.3f", scoreMatch); if (scoreGap != 0) Log(" %10.3f", scoreGap); Log("\n"); } } delete[] PA; delete[] PB; return scoreTotal; } // Objective score defined as the sum of profile-sequence // scores for each sequence in the alignment. The profile // is computed from the entire alignment, so this includes // the score of each sequence against itself. This is to // avoid recomputing the profile each time, so we reduce // complexity but introduce a questionable approximation. // The goal is to see if we can exploit the apparent // improvement in performance of log-expectation score // over the usual sum-of-pairs by optimizing this // objective score in the iterative refinement stage. SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[]) { if (g_PPScore != PPSCORE_LE) Quit("FastScoreMSA_LASimple: LA"); const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); const ProfPos *Prof = ProfileFromMSA(msa); if (0 != MatchScore) for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) MatchScore[uColIndex] = 0; SCORE scoreTotal = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const WEIGHT weightSeq = msa.GetSeqWeight(uSeqIndex); SCORE scoreSeq = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const ProfPos &PP = Prof[uColIndex]; if (msa.IsGap(uSeqIndex, uColIndex)) { bool bOpen = (0 == uColIndex || !msa.IsGap(uSeqIndex, uColIndex - 1)); bool bClose = (uColCount - 1 == uColIndex || !msa.IsGap(uSeqIndex, uColIndex + 1)); if (bOpen) scoreSeq += PP.m_scoreGapOpen; if (bClose) scoreSeq += PP.m_scoreGapClose; //if (!bOpen && !bClose) // scoreSeq += PP.m_scoreGapExtend; } else if (msa.IsWildcard(uSeqIndex, uColIndex)) continue; else { unsigned uLetter = msa.GetLetter(uSeqIndex, uColIndex); const SCORE scoreMatch = PP.m_AAScores[uLetter]; if (0 != MatchScore) MatchScore[uColIndex] += weightSeq*scoreMatch; 
scoreSeq += scoreMatch; } } scoreTotal += weightSeq*scoreSeq; } delete[] Prof; return scoreTotal; } // The XP score is the sum of the score of each pair of // sequences between two profiles which are aligned to // each other. Notice that for two given profiles aligned // in different ways, the difference in XP score must be // the same as the difference in SP score because the // score of a pair of sequences in one profile doesn't // depend on the alignment. SCORE ObjScoreXP(const MSA &msa1, const MSA &msa2) { const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount1 != uColCount2) Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); #if TRACE Log(" Score Weight Weight Total\n"); Log("---------- ------ ------ ----------\n"); #endif SCORE scoreTotal = 0; unsigned uPairCount = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) { const WEIGHT w1 = msa1.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) { const WEIGHT w2 = msa2.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqIndex1, msa2, uSeqIndex2); SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqIndex1, msa2, uSeqIndex2); SCORE scorePair = scoreLetters + scoreGaps; scoreTotal += w1*w2*scorePair; ++uPairCount; #if TRACE Log("%10.2f %6.3f %6.3f %10.2f >%s >%s\n", scorePair, w1, w2, scorePair*w1*w2, msa1.GetSeqName(uSeqIndex1), msa2.GetSeqName(uSeqIndex2)); #endif } } if (0 == uPairCount) Quit("0 == uPairCount"); #if TRACE Log("msa1=\n"); msa1.LogMe(); Log("msa2=\n"); msa2.LogMe(); Log("XP=%g\n", scoreTotal); #endif // return scoreTotal / uPairCount; return scoreTotal; } objscore.cpp0000664000175000017500000000452712360262613011471 0ustar bobbob#include "muscle.h" #include "msa.h" #include "objscore.h" #include "profile.h" #include 
"timing.h" #if TIMING TICKS g_ticksObjScore = 0; #endif SCORE ObjScore(const MSA &msa, const unsigned SeqIndexes1[], unsigned uSeqCount1, const unsigned SeqIndexes2[], unsigned uSeqCount2) { #if TIMING TICKS t1 = GetClockTicks(); #endif const unsigned uSeqCount = msa.GetSeqCount(); OBJSCORE OS = g_ObjScore; if (g_ObjScore == OBJSCORE_SPM) { if (uSeqCount <= 100) OS = OBJSCORE_XP; else OS = OBJSCORE_SPF; } MSA msa1; MSA msa2; switch (OS) { case OBJSCORE_DP: case OBJSCORE_XP: MSAFromSeqSubset(msa, SeqIndexes1, uSeqCount1, msa1); MSAFromSeqSubset(msa, SeqIndexes2, uSeqCount2, msa2); SetMSAWeightsMuscle(msa1); SetMSAWeightsMuscle(msa2); break; case OBJSCORE_SP: case OBJSCORE_SPF: case OBJSCORE_PS: // Yuck -- casting away const (design flaw) SetMSAWeightsMuscle((MSA &) msa); break; } SCORE Score = 0; switch (OS) { case OBJSCORE_SP: Score = ObjScoreSP(msa); break; case OBJSCORE_DP: Score = ObjScoreDP(msa1, msa2); break; case OBJSCORE_XP: Score = ObjScoreXP(msa1, msa2); break; case OBJSCORE_PS: Score = ObjScorePS(msa); break; case OBJSCORE_SPF: Score = ObjScoreSPDimer(msa); break; default: Quit("Invalid g_ObjScore=%d", g_ObjScore); } #if TIMING TICKS t2 = GetClockTicks(); g_ticksObjScore += (t2 - t1); #endif return Score; } SCORE ObjScoreIds(const MSA &msa, const unsigned Ids1[], unsigned uCount1, const unsigned Ids2[], unsigned uCount2) { #if TIMING TICKS t1 = GetClockTicks(); #endif unsigned *SeqIndexes1 = new unsigned[uCount1]; unsigned *SeqIndexes2 = new unsigned[uCount2]; for (unsigned n = 0; n < uCount1; ++n) SeqIndexes1[n] = msa.GetSeqIndex(Ids1[n]); for (unsigned n = 0; n < uCount2; ++n) SeqIndexes2[n] = msa.GetSeqIndex(Ids2[n]); #if DOUBLE_AFFINE extern SCORE ObjScoreDA(const MSA &msa, SCORE *ptrLetters, SCORE *ptrGaps); SCORE Letters, Gaps; SCORE dObjScore = ObjScoreDA(msa, &Letters, &Gaps); delete[] SeqIndexes1; delete[] SeqIndexes2; #else SCORE dObjScore = ObjScore(msa, SeqIndexes1, uCount1, SeqIndexes2, uCount2); #endif #if TIMING TICKS t2 = GetClockTicks(); 
g_ticksObjScore += (t2 - t1); #endif return dObjScore; } objscoreda.cpp0000664000175000017500000001555712360262614012004 0ustar bobbob#include "muscle.h" #include "msa.h" #include "profile.h" #include "objscore.h" #if DOUBLE_AFFINE #define TRACE 0 #define TEST_SPFAST 0 static SCORE GapPenalty(unsigned uLength, bool Term, SCORE g, SCORE e) { //if (Term) // { // switch (g_TermGap) // { // case TERMGAP_Full: // return g + (uLength - 1)*e; // case TERMGAP_Half: // return g/2 + (uLength - 1)*e; // case TERMGAP_Ext: // return uLength*e; // } // Quit("Bad termgap"); // } //else // return g + (uLength - 1)*e; //return MINUS_INFINITY; return g + (uLength - 1)*e; } static SCORE GapPenalty(unsigned uLength, bool Term) { SCORE s1 = GapPenalty(uLength, Term, g_scoreGapOpen, g_scoreGapExtend); #if DOUBLE_AFFINE SCORE s2 = GapPenalty(uLength, Term, g_scoreGapOpen2, g_scoreGapExtend2); if (s1 > s2) return s1; return s2; #else return s1; #endif } static const MSA *g_ptrMSA1; static const MSA *g_ptrMSA2; static unsigned g_uSeqIndex1; static unsigned g_uSeqIndex2; static void LogGap(unsigned uStart, unsigned uEnd, unsigned uGapLength, bool bNTerm, bool bCTerm) { Log("%16.16s ", ""); for (unsigned i = 0; i < uStart; ++i) Log(" "); unsigned uMyLength = 0; for (unsigned i = uStart; i <= uEnd; ++i) { bool bGap1 = g_ptrMSA1->IsGap(g_uSeqIndex1, i); bool bGap2 = g_ptrMSA2->IsGap(g_uSeqIndex2, i); if (!bGap1 && !bGap2) Quit("Error -- neither gapping"); if (bGap1 && bGap2) Log("."); else { ++uMyLength; Log("-"); } } SCORE s = GapPenalty(uGapLength, bNTerm || bCTerm); Log(" L=%d N%d C%d s=%.3g", uGapLength, bNTerm, bCTerm, s); Log("\n"); if (uMyLength != uGapLength) Quit("Lengths differ"); } static SCORE ScoreSeqPair(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2, SCORE *ptrLetters, SCORE *ptrGaps) { g_ptrMSA1 = &msa1; g_ptrMSA2 = &msa2; g_uSeqIndex1 = uSeqIndex1; g_uSeqIndex2 = uSeqIndex2; const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = 
msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPair, different lengths"); #if TRACE Log("ScoreSeqPair\n"); Log("%16.16s ", msa1.GetSeqName(uSeqIndex1)); for (unsigned i = 0; i < uColCount; ++i) Log("%c", msa1.GetChar(uSeqIndex1, i)); Log("\n"); Log("%16.16s ", msa2.GetSeqName(uSeqIndex2)); for (unsigned i = 0; i < uColCount; ++i) Log("%c", msa1.GetChar(uSeqIndex2, i)); Log("\n"); #endif SCORE scoreTotal = 0; // Substitution scores unsigned uFirstLetter1 = uInsane; unsigned uFirstLetter2 = uInsane; unsigned uLastLetter1 = uInsane; unsigned uLastLetter2 = uInsane; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); bool bWildcard1 = msa1.IsWildcard(uSeqIndex1, uColIndex); bool bWildcard2 = msa2.IsWildcard(uSeqIndex2, uColIndex); if (!bGap1) { if (uInsane == uFirstLetter1) uFirstLetter1 = uColIndex; uLastLetter1 = uColIndex; } if (!bGap2) { if (uInsane == uFirstLetter2) uFirstLetter2 = uColIndex; uLastLetter2 = uColIndex; } if (bGap1 || bGap2 || bWildcard1 || bWildcard2) continue; unsigned uLetter1 = msa1.GetLetter(uSeqIndex1, uColIndex); unsigned uLetter2 = msa2.GetLetter(uSeqIndex2, uColIndex); SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2]; scoreTotal += scoreMatch; #if TRACE Log("%c <-> %c = %7.1f %10.1f\n", msa1.GetChar(uSeqIndex1, uColIndex), msa2.GetChar(uSeqIndex2, uColIndex), scoreMatch, scoreTotal); #endif } *ptrLetters = scoreTotal; // Gap penalties unsigned uGapLength = uInsane; unsigned uGapStartCol = uInsane; bool bGapping1 = false; bool bGapping2 = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (bGap1 && bGap2) continue; if (bGapping1) { if (bGap1) ++uGapLength; else { bGapping1 = false; bool bNTerm = (uFirstLetter2 == uGapStartCol); bool bCTerm = (uLastLetter2 + 1 == uColIndex); SCORE 
scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm); scoreTotal += scoreGap; #if TRACE LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm); Log("GAP %7.1f %10.1f\n", scoreGap, scoreTotal); #endif } continue; } else { if (bGap1) { uGapStartCol = uColIndex; bGapping1 = true; uGapLength = 1; continue; } } if (bGapping2) { if (bGap2) ++uGapLength; else { bGapping2 = false; bool bNTerm = (uFirstLetter1 == uGapStartCol); bool bCTerm = (uLastLetter1 + 1 == uColIndex); SCORE scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm); scoreTotal += scoreGap; #if TRACE LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm); Log("GAP %7.1f %10.1f\n", scoreGap, scoreTotal); #endif } } else { if (bGap2) { uGapStartCol = uColIndex; bGapping2 = true; uGapLength = 1; } } } if (bGapping1 || bGapping2) { SCORE scoreGap = GapPenalty(uGapLength, true); scoreTotal += scoreGap; #if TRACE LogGap(uGapStartCol, uColCount - 1, uGapLength, false, true); Log("GAP %7.1f %10.1f\n", scoreGap, scoreTotal); #endif } *ptrGaps = scoreTotal - *ptrLetters; return scoreTotal; } // The usual sum-of-pairs objective score: sum the score // of the alignment of each pair of sequences. 
SCORE ObjScoreDA(const MSA &msa, SCORE *ptrLetters, SCORE *ptrGaps) { const unsigned uSeqCount = msa.GetSeqCount(); SCORE scoreTotal = 0; unsigned uPairCount = 0; #if TRACE msa.LogMe(); Log(" Score Weight Weight Total\n"); Log("---------- ------ ------ ----------\n"); #endif SCORE TotalLetters = 0; SCORE TotalGaps = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE Letters; SCORE Gaps; SCORE scorePair = ScoreSeqPair(msa, uSeqIndex1, msa, uSeqIndex2, &Letters, &Gaps); scoreTotal += w1*w2*scorePair; TotalLetters += w1*w2*Letters; TotalGaps += w1*w2*Gaps; ++uPairCount; #if TRACE Log("%10.2f %6.3f %6.3f %10.2f %d=%s %d=%s\n", scorePair, w1, w2, scorePair*w1*w2, uSeqIndex1, msa.GetSeqName(uSeqIndex1), uSeqIndex2, msa.GetSeqName(uSeqIndex2)); #endif } } *ptrLetters = TotalLetters; *ptrGaps = TotalGaps; return scoreTotal; } #endif // DOUBLE_AFFINE onexception.cpp0000664000175000017500000000045612360262614012214 0ustar bobbob#include "muscle.h" #include static char szOnExceptionMessage[] = { "\nFatal error, exception caught.\n" }; void OnException() { fprintf(stderr, "%s", szOnExceptionMessage); Log("%s", szOnExceptionMessage); Log("Finished %s\n", GetTimeAsStr()); exit(EXIT_Except); } options.cpp0000664000175000017500000001130512360262614011347 0ustar bobbob#include "muscle.h" #include struct VALUE_OPT { const char *m_pstrName; const char *m_pstrValue; }; struct FLAG_OPT { const char *m_pstrName; bool m_bSet; }; static VALUE_OPT ValueOpts[] = { "in", 0, "in1", 0, "in2", 0, "out", 0, "MaxIters", 0, "MaxHours", 0, "GapOpen", 0, "GapOpen2", 0, "GapExtend", 0, "GapExtend2", 0, "GapAmbig", 0, "Center", 0, "SmoothScoreCeil", 0, "MinBestColScore", 0, "MinSmoothScore", 0, "ObjScore", 0, "SmoothWindow", 0, "RefineWindow", 0, "FromWindow", 0, "ToWindow", 
0, "SaveWindow", 0, "WindowOffset", 0, "FirstWindow", 0, "AnchorSpacing", 0, "Log", 0, "LogA", 0, "MaxTrees", 0, "SUEFF", 0, "Distance", 0, "Distance1", 0, "Distance2", 0, "Weight", 0, "Weight1", 0, "Weight2", 0, "Cluster", 0, "Cluster1", 0, "Cluster2", 0, "Root1", 0, "Root2", 0, "Tree1", 0, "Tree2", 0, "UseTree", 0, "UseTree_NoWarn", 0, "DiagLength", 0, "DiagMargin", 0, "DiagBreak", 0, "Hydro", 0, "HydroFactor", 0, "SPScore", 0, "SeqType", 0, "MaxMB", 0, "ComputeWeights", 0, "MaxSubFam", 0, "ScoreFile", 0, "TermGaps", 0, "FASTAOut", 0, "CLWOut", 0, "CLWStrictOut", 0, "HTMLOut", 0, "MSFOut", 0, "PHYIOut", 0, "PHYSOut", 0, "Matrix", 0, "DistMx1", 0, "DistMx2", 0, "Weight", 0, }; static int ValueOptCount = sizeof(ValueOpts)/sizeof(ValueOpts[0]); static FLAG_OPT FlagOpts[] = { "LE", false, "SP", false, "SV", false, "SPN", false, "Core", false, "NoCore", false, "Diags1", false, "Diags2", false, "Diags", false, "Quiet", false, "MSF", false, "Verbose", false, "Anchors", false, "NoAnchors", false, "Refine", false, "RefineW", false, "SW", false, "Profile", false, "PPScore", false, "ClusterOnly", false, "Brenner", false, "Dimer", false, "clw", false, "clwstrict", false, "HTML", false, "Version", false, "Stable", false, "Group", false, "FASTA", false, "ProfDB", false, "PAS", false, "PHYI", false, "PHYS", false, "TomHydro", false, "MakeTree", false, }; static int FlagOptCount = sizeof(FlagOpts)/sizeof(FlagOpts[0]); static bool TestSetFlagOpt(const char *Arg) { for (int i = 0; i < FlagOptCount; ++i) if (!stricmp(Arg, FlagOpts[i].m_pstrName)) { FlagOpts[i].m_bSet = true; return true; } return false; } static bool TestSetValueOpt(const char *Arg, const char *Value) { for (int i = 0; i < ValueOptCount; ++i) if (!stricmp(Arg, ValueOpts[i].m_pstrName)) { if (0 == Value) { fprintf(stderr, "Option -%s must have value\n", Arg); exit(EXIT_NotStarted); } ValueOpts[i].m_pstrValue = strsave(Value); return true; } return false; } bool FlagOpt(const char *Name) { for (int i = 0; i < 
FlagOptCount; ++i) if (!stricmp(Name, FlagOpts[i].m_pstrName)) return FlagOpts[i].m_bSet; Quit("FlagOpt(%s) invalid", Name); return false; } const char *ValueOpt(const char *Name) { for (int i = 0; i < ValueOptCount; ++i) if (!stricmp(Name, ValueOpts[i].m_pstrName)) return ValueOpts[i].m_pstrValue; Quit("ValueOpt(%s) invalid", Name); return 0; } void ProcessArgVect(int argc, char *argv[]) { for (int iArgIndex = 0; iArgIndex < argc; ) { const char *Arg = argv[iArgIndex]; if (Arg[0] != '-') { fprintf(stderr, "Command-line option \"%s\" must start with '-'\n", Arg); exit(EXIT_NotStarted); } const char *ArgName = Arg + 1; if (TestSetFlagOpt(ArgName)) { ++iArgIndex; continue; } char *Value = 0; if (iArgIndex < argc - 1) Value = argv[iArgIndex+1]; if (TestSetValueOpt(ArgName, Value)) { iArgIndex += 2; continue; } fprintf(stderr, "Invalid command line option \"%s\"\n", ArgName); Usage(); exit(EXIT_NotStarted); } } void ProcessArgStr(const char *ArgStr) { const int MAX_ARGS = 64; char *argv[MAX_ARGS]; if (0 == ArgStr) return; // Modifiable copy char *StrCopy = strsave(ArgStr); int argc = 0; bool bInArg = false; char *Str = StrCopy; while (char c = *Str) { if (isspace(c)) { *Str = 0; bInArg = false; } else if (!bInArg) { bInArg = true; if (argc >= MAX_ARGS) Quit("Too many args in MUSCLE_CMDLINE"); argv[argc++] = Str; } Str++; } ProcessArgVect(argc, argv); free(StrCopy); } void ListFlagOpts() { for (int i = 0; i < FlagOptCount; ++i) Log("%s %d\n", FlagOpts[i].m_pstrName, FlagOpts[i].m_bSet); } outweights.cpp0000664000175000017500000000071512360262614012061 0ustar bobbob#include "muscle.h" #include "msa.h" void OutWeights(const char *FileName, const MSA &msa) { FILE *f = fopen(FileName, "w"); if (0 == f) Quit("Cannot open '%s'", FileName); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const char *Id = msa.GetSeqName(uSeqIndex); const WEIGHT w = msa.GetSeqWeight(uSeqIndex); fprintf(f, "%s\t%.3g\n", Id, w); } 
fclose(f); } pam200mafft.cpp0000664000175000017500000001020312360262614011665 0ustar bobbob#include "muscle.h" // Adjusted PAM200 scoring matrix as used by default in MAFFT. // Katoh, Misawa, Kuma and Miyata (2002), NAR 30(14), 3059-3066. static float PAM200[23][23] = { // A C D E F G H I K L M N P Q R S T V W Y B Z X 408, 20, 54, 52, -182, 179, -68, 109, -35, -47, 39, 106, 206, -14, -12, 257, 293, 191, -306, -219, 0, 0, 0, // A 20, 1190, -228, -295, 94, 6, 63, -131, -184, -176, -112, -29, -122, -195, 49, 185, 13, -49, 199, 333, 0, 0, 0, // C 54, -228, 645, 516, -399, 168, 98, -225, 75, -341, -235, 352, -149, 142, -44, 65, 7, -147, -418, -128, 0, 0, 0, // D 52, -295, 516, 630, -460, 145, 45, -225, 195, -307, -222, 186, -121, 299, 54, -10, -36, -130, -366, -285, 0, 0, 0, // E -182, 94, -399, -460, 908, -387, 82, 100, -423, 340, 87, -216, -160, -274, -307, -31, -153, 51, 19, 604, 0, 0, 0, // F 179, 6, 168, 145, -387, 682, -94, -196, -14, -304, -226, 99, -57, -48, 117, 175, 41, -73, -38, -329, 0, 0, 0, // G -68, 63, 98, 45, 82, -94, 786, -185, 164, -72, -132, 258, 86, 388, 277, 55, -15, -197, -181, 488, 0, 0, 0, // H 109, -131, -225, -225, 100, -196, -185, 574, -204, 308, 411, -94, -95, -202, -188, 1, 182, 489, -254, -133, 0, 0, 0, // I -35, -184, 75, 195, -423, -14, 164, -204, 652, -229, -98, 206, -66, 335, 486, 22, 39, -207, -196, -244, 0, 0, 0, // K -47, -176, -341, -307, 340, -304, -72, 308, -229, 611, 389, -203, 73, -66, -150, -49, -21, 259, -46, -9, 0, 0, 0, // L 39, -112, -235, -222, 87, -226, -132, 411, -98, 389, 776, -111, -78, -104, -109, -29, 149, 351, -209, -162, 0, 0, 0, // M 106, -29, 352, 186, -216, 99, 258, -94, 206, -203, -111, 536, -1, 108, 93, 260, 188, -98, -359, 12, 0, 0, 0, // N 206, -122, -149, -121, -160, -57, 86, -95, -66, 73, -78, -1, 756, 142, 25, 241, 159, -55, -353, -206, 0, 0, 0, // P -14, -195, 142, 299, -274, -48, 388, -202, 335, -66, -104, 108, 142, 655, 321, 7, -15, -175, -223, -53, 0, 0, 0, // Q -12, 49, -44, 54, -307, 117, 277, 
-188, 486, -150, -109, 93, 25, 321, 626, 48, 16, -181, 124, -113, 0, 0, 0, // R 257, 185, 65, -10, -31, 175, 55, 1, 22, -49, -29, 260, 241, 7, 48, 373, 279, 28, -193, -35, 0, 0, 0, // S 293, 13, 7, -36, -153, 41, -15, 182, 39, -21, 149, 188, 159, -15, 16, 279, 442, 163, -323, -170, 0, 0, 0, // T 191, -49, -147, -130, 51, -73, -197, 489, -207, 259, 351, -98, -55, -175, -181, 28, 163, 525, -225, -177, 0, 0, 0, // V -306, 199, -418, -366, 19, -38, -181, -254, -196, -46, -209, -359, -353, -223, 124, -193, -323, -225, 1495, 83, 0, 0, 0, // W -219, 333, -128, -285, 604, -329, 488, -133, -244, -9, -162, 12, -206, -53, -113, -35, -170, -177, 83, 999, 0, 0, 0, // Y 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Z 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // X }; params.cpp0000664000175000017500000004132212360262614011141 0ustar bobbob#include "muscle.h" #include "objscore.h" #include "profile.h" #include "enumopts.h" const double DEFAULT_MAX_MB_FRACT = 0.8; SCORE g_scoreCenter = 0; SCORE g_scoreGapExtend = 0; SCORE g_scoreGapOpen2 = MINUS_INFINITY; SCORE g_scoreGapExtend2 = MINUS_INFINITY; SCORE g_scoreGapAmbig = 0; SCORE g_scoreAmbigFactor = 0; extern SCOREMATRIX VTML_LA; extern SCOREMATRIX PAM200; extern SCOREMATRIX PAM200NoCenter; extern SCOREMATRIX VTML_SP; extern SCOREMATRIX VTML_SPNoCenter; extern SCOREMATRIX NUC_SP; PTR_SCOREMATRIX g_ptrScoreMatrix; const char *g_pstrInFileName = "-"; const char *g_pstrOutFileName = "-"; const char *g_pstrFASTAOutFileName = 0; const char *g_pstrMSFOutFileName = 0; const char *g_pstrClwOutFileName = 0; const char *g_pstrClwStrictOutFileName = 0; const char *g_pstrHTMLOutFileName = 0; const char *g_pstrPHYIOutFileName = 0; const char *g_pstrPHYSOutFileName = 0; const char *g_pstrDistMxFileName1 = 0; const char *g_pstrDistMxFileName2 = 0; const char *g_pstrFileName1 = 0; const char *g_pstrFileName2 = 0; const 
char *g_pstrSPFileName = 0; const char *g_pstrMatrixFileName = 0; const char *g_pstrUseTreeFileName = 0; bool g_bUseTreeNoWarn = false; const char *g_pstrComputeWeightsFileName; const char *g_pstrScoreFileName; const char *g_pstrProf1FileName = 0; const char *g_pstrProf2FileName = 0; unsigned g_uSmoothWindowLength = 7; unsigned g_uAnchorSpacing = 32; unsigned g_uMaxTreeRefineIters = 1; unsigned g_uRefineWindow = 200; unsigned g_uWindowFrom = 0; unsigned g_uWindowTo = 0; unsigned g_uSaveWindow = uInsane; unsigned g_uWindowOffset = 0; unsigned g_uMaxSubFamCount = 5; unsigned g_uHydrophobicRunLength = 5; float g_dHydroFactor = (float) 1.2; unsigned g_uMinDiagLength = 24; // TODO alpha -- should depend on alphabet? unsigned g_uMaxDiagBreak = 1; unsigned g_uDiagMargin = 5; float g_dSUEFF = (float) 0.1; bool g_bPrecompiledCenter = true; bool g_bNormalizeCounts = false; bool g_bDiags1 = false; bool g_bDiags2 = false; bool g_bAnchors = true; bool g_bQuiet = false; bool g_bVerbose = false; bool g_bRefine = false; bool g_bRefineW = false; bool g_bProfDB = false; bool g_bLow = false; bool g_bSW = false; bool g_bClusterOnly = false; bool g_bProfile = false; bool g_bPPScore = false; bool g_bBrenner = false; bool g_bDimer = false; bool g_bVersion = false; bool g_bStable = false; bool g_bFASTA = false; bool g_bPAS = false; bool g_bTomHydro = false; bool g_bMakeTree = false; #if DEBUG bool g_bCatchExceptions = false; #else bool g_bCatchExceptions = true; #endif bool g_bMSF = false; bool g_bAln = false; bool g_bClwStrict = false; bool g_bHTML = false; bool g_bPHYI = false; bool g_bPHYS = false; unsigned g_uMaxIters = 8; unsigned long g_ulMaxSecs = 0; unsigned g_uMaxMB = 500; PPSCORE g_PPScore = PPSCORE_LE; OBJSCORE g_ObjScore = OBJSCORE_SPM; SEQWEIGHT g_SeqWeight1 = SEQWEIGHT_ClustalW; SEQWEIGHT g_SeqWeight2 = SEQWEIGHT_ClustalW; DISTANCE g_Distance1 = DISTANCE_Kmer6_6; DISTANCE g_Distance2 = DISTANCE_PctIdKimura; CLUSTER g_Cluster1 = CLUSTER_UPGMB; CLUSTER g_Cluster2 = 
// Parse a decimal string as an unsigned value.
//
// Leading white space and an optional sign are accepted and parsing stops
// at the first non-digit; a string with no digits yields 0. Uses strtoul
// rather than atoi so that values above INT_MAX (which are valid for an
// unsigned result) are converted with defined behavior. A negative input
// wraps modulo 2^N, matching what the previous (unsigned) atoi(s) produced.
static unsigned atou(const char *s)
	{
	return (unsigned) strtoul(s, 0, 10);
	}
BoolToStr(g_bNormalizeCounts)); Log("Diagonals (1) %s\n", BoolToStr(g_bDiags1)); Log("Diagonals (2) %s\n", BoolToStr(g_bDiags2)); Log("Anchors %s\n", BoolToStr(g_bAnchors)); Log("MSF output format %s\n", BoolToStr(g_bMSF)); Log("Phylip interleaved %s\n", BoolToStr(g_bPHYI)); Log("Phylip sequential %s\n", BoolToStr(g_bPHYS)); Log("ClustalW output format %s\n", BoolToStr(g_bAln)); Log("Catch exceptions %s\n", BoolToStr(g_bCatchExceptions)); Log("Quiet %s\n", BoolToStr(g_bQuiet)); Log("Refine %s\n", BoolToStr(g_bRefine)); Log("ProdfDB %s\n", BoolToStr(g_bProfDB)); Log("Low complexity profiles %s\n", BoolToStr(g_bLow)); Log("Objective score %s\n", OBJSCOREToStr(g_ObjScore)); Log("Distance method (1) %s\n", DISTANCEToStr(g_Distance1)); Log("Clustering method (1) %s\n", CLUSTERToStr(g_Cluster1)); Log("Root method (1) %s\n", ROOTToStr(g_Root1)); Log("Sequence weighting (1) %s\n", SEQWEIGHTToStr(g_SeqWeight1)); Log("Distance method (2) %s\n", DISTANCEToStr(g_Distance2)); Log("Clustering method (2) %s\n", CLUSTERToStr(g_Cluster2)); Log("Root method (2) %s\n", ROOTToStr(g_Root2)); Log("Sequence weighting (2) %s\n", SEQWEIGHTToStr(g_SeqWeight2)); Log("\n"); } static void SetDefaultsLE() { g_ptrScoreMatrix = &VTML_LA; //g_scoreGapOpen = (SCORE) -3.00; //g_scoreCenter = (SCORE) -0.55; g_scoreGapOpen = (SCORE) -2.9; g_scoreCenter = (SCORE) -0.52; g_bNormalizeCounts = true; //g_dSmoothScoreCeil = 5.0; //g_dMinBestColScore = 4.0; //g_dMinSmoothScore = 2.0; g_dSmoothScoreCeil = 3.0; g_dMinBestColScore = 2.0; g_dMinSmoothScore = 1.0; g_Distance1 = DISTANCE_Kmer6_6; g_Distance2 = DISTANCE_PctIdKimura; } static void SetDefaultsSP() { g_ptrScoreMatrix = &PAM200; g_scoreGapOpen = -1439; g_scoreCenter = 0.0; // center pre-added into score mx g_bNormalizeCounts = false; g_dSmoothScoreCeil = 200.0; g_dMinBestColScore = 300.0; g_dMinSmoothScore = 125.0; g_Distance1 = DISTANCE_Kmer6_6; g_Distance2 = DISTANCE_PctIdKimura; } static void SetDefaultsSV() { g_ptrScoreMatrix = &VTML_SP; 
g_scoreGapOpen = -300; g_scoreCenter = 0.0; // center pre-added into score mx g_bNormalizeCounts = false; g_dSmoothScoreCeil = 90.0; g_dMinBestColScore = 130.0; g_dMinSmoothScore = 40.0; g_Distance1 = DISTANCE_Kmer6_6; g_Distance2 = DISTANCE_PctIdKimura; } //static void SetDefaultsSPN() // { // g_ptrScoreMatrix = &NUC_SP; // // g_scoreGapOpen = -400; // g_scoreCenter = 0.0; // center pre-added into score mx // // g_bNormalizeCounts = false; // // g_dSmoothScoreCeil = 999.0; // disable // g_dMinBestColScore = 90; // g_dMinSmoothScore = 90; // // g_Distance1 = DISTANCE_Kmer4_6; // g_Distance2 = DISTANCE_PctIdKimura; // } static void SetDefaultsSPN_DNA() { g_ptrScoreMatrix = &NUC_SP; g_scoreGapOpen = -400; g_scoreCenter = 0.0; // center pre-added into score mx g_scoreGapExtend = 0.0; g_bNormalizeCounts = false; g_dSmoothScoreCeil = 999.0; // disable g_dMinBestColScore = 90; g_dMinSmoothScore = 90; g_Distance1 = DISTANCE_Kmer4_6; g_Distance2 = DISTANCE_PctIdKimura; } static void SetDefaultsSPN_RNA() { g_ptrScoreMatrix = &NUC_SP; g_scoreGapOpen = -420; g_scoreCenter = -300; // total center = NUC_EXTEND - 300 g_scoreGapExtend = 0.0; g_bNormalizeCounts = false; g_dSmoothScoreCeil = 999.0; // disable g_dMinBestColScore = 90; g_dMinSmoothScore = 90; g_Distance1 = DISTANCE_Kmer4_6; g_Distance2 = DISTANCE_PctIdKimura; } static void FlagParam(const char *OptName, bool *ptrParam, bool bValueIfFlagSet) { bool bIsSet = FlagOpt(OptName); if (bIsSet) *ptrParam = bValueIfFlagSet; } static void StrParam(const char *OptName, const char **ptrptrParam) { const char *opt = ValueOpt(OptName); if (0 != opt) *ptrptrParam = opt; } static void FloatParam(const char *OptName, float *ptrParam) { const char *opt = ValueOpt(OptName); if (0 != opt) *ptrParam = (float) atof(opt); } static void UintParam(const char *OptName, unsigned *ptrParam) { const char *opt = ValueOpt(OptName); if (0 != opt) *ptrParam = atou(opt); } static void EnumParam(const char *OptName, EnumOpt *Opts, int *Param) { const 
char *Value = ValueOpt(OptName); if (0 == Value) return; for (;;) { if (0 == Opts->pstrOpt) Quit("Invalid parameter -%s %s", OptName, Value); if (0 == stricmp(Value, Opts->pstrOpt)) { *Param = Opts->iValue; return; } ++Opts; } } static void SetPPDefaultParams() { switch (g_PPScore) { case PPSCORE_SP: SetDefaultsSP(); break; case PPSCORE_LE: SetDefaultsLE(); break; case PPSCORE_SV: SetDefaultsSV(); break; case PPSCORE_SPN: switch (g_Alpha) { case ALPHA_DNA: SetDefaultsSPN_DNA(); break; case ALPHA_RNA: SetDefaultsSPN_RNA(); break; default: Quit("Invalid alpha %d", g_Alpha); } break; default: Quit("Invalid g_PPScore"); } } static void SetPPCommandLineParams() { FloatParam("GapOpen", &g_scoreGapOpen); FloatParam("GapOpen2", &g_scoreGapOpen2); FloatParam("GapExtend", &g_scoreGapExtend); FloatParam("GapExtend2", &g_scoreGapExtend2); FloatParam("GapAmbig", &g_scoreAmbigFactor); FloatParam("Center", &g_scoreCenter); FloatParam("SmoothScoreCeil", &g_dSmoothScoreCeil); FloatParam("MinBestColScore", &g_dMinBestColScore); FloatParam("MinSmoothScore", &g_dMinSmoothScore); EnumParam("Distance", DISTANCE_Opts, (int *) &g_Distance1); EnumParam("Distance", DISTANCE_Opts, (int *) &g_Distance2); EnumParam("Distance1", DISTANCE_Opts, (int *) &g_Distance1); EnumParam("Distance2", DISTANCE_Opts, (int *) &g_Distance2); } void SetPPScore(bool bRespectFlagOpts) { if (bRespectFlagOpts) { if (FlagOpt("SP")) g_PPScore = PPSCORE_SP; else if (FlagOpt("LE")) g_PPScore = PPSCORE_LE; else if (FlagOpt("SV")) g_PPScore = PPSCORE_SV; else if (FlagOpt("SPN")) g_PPScore = PPSCORE_SPN; } switch (g_PPScore) { case PPSCORE_LE: case PPSCORE_SP: case PPSCORE_SV: if (ALPHA_RNA == g_Alpha || ALPHA_DNA == g_Alpha) g_PPScore = PPSCORE_SPN; break; case PPSCORE_SPN: if (ALPHA_Amino == g_Alpha) g_PPScore = PPSCORE_LE; break; } SetPPDefaultParams(); SetPPCommandLineParams(); if (g_bVerbose) ListParams(); } void SetPPScore(PPSCORE p) { g_PPScore = p; SetPPScore(true); } static void SetMaxSecs() { float fMaxHours = 
0.0; FloatParam("MaxHours", &fMaxHours); if (0.0 == fMaxHours) return; g_ulMaxSecs = (unsigned long) (fMaxHours*60*60); } static bool CanDoLowComplexity() { if (g_SeqWeight1 != SEQWEIGHT_ClustalW) return false; if (1 == g_uMaxIters) return true; return g_SeqWeight2 == SEQWEIGHT_ClustalW; } bool MissingCommand() { if (strcmp(g_pstrInFileName, "-")) return false; if (0 != g_pstrFileName1) return false; if (0 != g_pstrSPFileName) return false; return true; } void SetParams() { SetMaxSecs(); StrParam("in", &g_pstrInFileName); StrParam("out", &g_pstrOutFileName); StrParam("FASTAOut", &g_pstrFASTAOutFileName); StrParam("ClwOut", &g_pstrClwOutFileName); StrParam("ClwStrictOut", &g_pstrClwStrictOutFileName); StrParam("HTMLOut", &g_pstrHTMLOutFileName); StrParam("PHYIOut", &g_pstrPHYIOutFileName); StrParam("PHYSOut", &g_pstrPHYSOutFileName); StrParam("MSFOut", &g_pstrMSFOutFileName); StrParam("in1", &g_pstrFileName1); StrParam("in2", &g_pstrFileName2); StrParam("Matrix", &g_pstrMatrixFileName); StrParam("SPScore", &g_pstrSPFileName); StrParam("UseTree_NoWarn", &g_pstrUseTreeFileName); if (0 != g_pstrUseTreeFileName) g_bUseTreeNoWarn = true; StrParam("UseTree", &g_pstrUseTreeFileName); StrParam("ComputeWeights", &g_pstrComputeWeightsFileName); StrParam("ScoreFile", &g_pstrScoreFileName); StrParam("DistMx1", &g_pstrDistMxFileName1); StrParam("DistMx2", &g_pstrDistMxFileName2); FlagParam("Core", &g_bCatchExceptions, false); FlagParam("NoCore", &g_bCatchExceptions, true); FlagParam("Diags1", &g_bDiags1, true); FlagParam("Diags2", &g_bDiags2, true); bool Diags = false; FlagParam("Diags", &Diags, true); if (Diags) { g_bDiags1 = true; g_bDiags2 = true; } FlagParam("Anchors", &g_bAnchors, true); FlagParam("NoAnchors", &g_bAnchors, false); FlagParam("Quiet", &g_bQuiet, true); FlagParam("Verbose", &g_bVerbose, true); FlagParam("Version", &g_bVersion, true); FlagParam("Stable", &g_bStable, true); FlagParam("Group", &g_bStable, false); FlagParam("Refine", &g_bRefine, true); 
FlagParam("RefineW", &g_bRefineW, true); FlagParam("ProfDB", &g_bProfDB, true); FlagParam("SW", &g_bSW, true); FlagParam("ClusterOnly", &g_bClusterOnly, true); FlagParam("Profile", &g_bProfile, true); FlagParam("PPScore", &g_bPPScore, true); FlagParam("Brenner", &g_bBrenner, true); FlagParam("Dimer", &g_bDimer, true); FlagParam("MSF", &g_bMSF, true); FlagParam("PHYI", &g_bPHYI, true); FlagParam("PHYS", &g_bPHYS, true); FlagParam("clw", &g_bAln, true); FlagParam("HTML", &g_bHTML, true); FlagParam("FASTA", &g_bFASTA, true); FlagParam("PAS", &g_bPAS, true); FlagParam("MakeTree", &g_bMakeTree, true); if (g_bStable) Quit("-stable not supported in this version of muscle"); bool b = false; FlagParam("clwstrict", &b, true); if (b) { g_bAln = true; g_bClwStrict = true; } UintParam("MaxIters", &g_uMaxIters); UintParam("MaxTrees", &g_uMaxTreeRefineIters); UintParam("SmoothWindow", &g_uSmoothWindowLength); UintParam("RefineWindow", &g_uRefineWindow); UintParam("FromWindow", &g_uWindowFrom); UintParam("ToWindow", &g_uWindowTo); UintParam("SaveWindow", &g_uSaveWindow); UintParam("WindowOffset", &g_uWindowOffset); UintParam("AnchorSpacing", &g_uAnchorSpacing); UintParam("DiagLength", &g_uMinDiagLength); UintParam("DiagMargin", &g_uDiagMargin); UintParam("DiagBreak", &g_uMaxDiagBreak); UintParam("MaxSubFam", &g_uMaxSubFamCount); UintParam("Hydro", &g_uHydrophobicRunLength); FlagParam("TomHydro", &g_bTomHydro, true); if (g_bTomHydro) g_uHydrophobicRunLength = 0; FloatParam("SUEFF", &g_dSUEFF); FloatParam("HydroFactor", &g_dHydroFactor); EnumParam("ObjScore", OBJSCORE_Opts, (int *) &g_ObjScore); EnumParam("TermGaps", TERMGAPS_Opts, (int *) &g_TermGaps); EnumParam("Weight", SEQWEIGHT_Opts, (int *) &g_SeqWeight1); EnumParam("Weight", SEQWEIGHT_Opts, (int *) &g_SeqWeight2); EnumParam("Weight1", SEQWEIGHT_Opts, (int *) &g_SeqWeight1); EnumParam("Weight2", SEQWEIGHT_Opts, (int *) &g_SeqWeight2); EnumParam("Cluster", CLUSTER_Opts, (int *) &g_Cluster1); EnumParam("Cluster", CLUSTER_Opts, 
(int *) &g_Cluster2); EnumParam("Cluster1", CLUSTER_Opts, (int *) &g_Cluster1); EnumParam("Cluster2", CLUSTER_Opts, (int *) &g_Cluster2); EnumParam("Root1", ROOT_Opts, (int *) &g_Root1); EnumParam("Root2", ROOT_Opts, (int *) &g_Root2); EnumParam("SeqType", SEQTYPE_Opts, (int *) &g_SeqType); g_scoreGapAmbig = g_scoreGapOpen*g_scoreAmbigFactor; g_bLow = CanDoLowComplexity(); if (g_bDimer) g_bPrecompiledCenter = false; UintParam("MaxMB", &g_uMaxMB); if (0 == ValueOpt("MaxMB")) g_uMaxMB = (unsigned) (GetRAMSizeMB()*DEFAULT_MAX_MB_FRACT); } phy2.cpp0000664000175000017500000001657712360262614010556 0ustar bobbob#include "muscle.h" #include "tree.h" #define TRACE 0 // Return false when done bool PhyEnumEdges(const Tree &tree, PhyEnumEdgeState &ES) { unsigned uNode1 = uInsane; if (!ES.m_bInit) { if (tree.GetNodeCount() <= 1) { ES.m_uNodeIndex1 = NULL_NEIGHBOR; ES.m_uNodeIndex2 = NULL_NEIGHBOR; return false; } uNode1 = tree.FirstDepthFirstNode(); ES.m_bInit = true; } else { uNode1 = tree.NextDepthFirstNode(ES.m_uNodeIndex1); if (NULL_NEIGHBOR == uNode1) return false; if (tree.IsRooted() && tree.IsRoot(uNode1)) { uNode1 = tree.NextDepthFirstNode(uNode1); if (NULL_NEIGHBOR == uNode1) return false; } } unsigned uNode2 = tree.GetParent(uNode1); ES.m_uNodeIndex1 = uNode1; ES.m_uNodeIndex2 = uNode2; return true; } bool PhyEnumEdgesR(const Tree &tree, PhyEnumEdgeState &ES) { unsigned uNode1 = uInsane; if (!ES.m_bInit) { if (tree.GetNodeCount() <= 1) { ES.m_uNodeIndex1 = NULL_NEIGHBOR; ES.m_uNodeIndex2 = NULL_NEIGHBOR; return false; } uNode1 = tree.FirstDepthFirstNodeR(); ES.m_bInit = true; } else { uNode1 = tree.NextDepthFirstNodeR(ES.m_uNodeIndex1); if (NULL_NEIGHBOR == uNode1) return false; if (tree.IsRooted() && tree.IsRoot(uNode1)) { uNode1 = tree.NextDepthFirstNode(uNode1); if (NULL_NEIGHBOR == uNode1) return false; } } unsigned uNode2 = tree.GetParent(uNode1); ES.m_uNodeIndex1 = uNode1; ES.m_uNodeIndex2 = uNode2; return true; } static void GetLeavesSubtree(const Tree &tree, 
unsigned uNodeIndex1, const unsigned uNodeIndex2, unsigned Leaves[], unsigned *ptruCount) { if (tree.IsLeaf(uNodeIndex1)) { Leaves[*ptruCount] = uNodeIndex1; ++(*ptruCount); return; } const unsigned uLeft = tree.GetFirstNeighbor(uNodeIndex1, uNodeIndex2); const unsigned uRight = tree.GetSecondNeighbor(uNodeIndex1, uNodeIndex2); if (NULL_NEIGHBOR != uLeft) GetLeavesSubtree(tree, uLeft, uNodeIndex1, Leaves, ptruCount); if (NULL_NEIGHBOR != uRight) GetLeavesSubtree(tree, uRight, uNodeIndex1, Leaves, ptruCount); } static void PhyGetLeaves(const Tree &tree, unsigned uNodeIndex1, unsigned uNodeIndex2, unsigned Leaves[], unsigned *ptruCount) { *ptruCount = 0; GetLeavesSubtree(tree, uNodeIndex1, uNodeIndex2, Leaves, ptruCount); } bool PhyEnumBiParts(const Tree &tree, PhyEnumEdgeState &ES, unsigned Leaves1[], unsigned *ptruCount1, unsigned Leaves2[], unsigned *ptruCount2) { bool bOk = PhyEnumEdges(tree, ES); if (!bOk) { *ptruCount1 = 0; *ptruCount2 = 0; return false; } // Special case: in a rooted tree, both edges from the root // give the same bipartition, so skip one of them. 
if (tree.IsRooted() && tree.IsRoot(ES.m_uNodeIndex2) && tree.GetRight(ES.m_uNodeIndex2) == ES.m_uNodeIndex1) { bOk = PhyEnumEdges(tree, ES); if (!bOk) return false; } PhyGetLeaves(tree, ES.m_uNodeIndex1, ES.m_uNodeIndex2, Leaves1, ptruCount1); PhyGetLeaves(tree, ES.m_uNodeIndex2, ES.m_uNodeIndex1, Leaves2, ptruCount2); if (*ptruCount1 + *ptruCount2 != tree.GetLeafCount()) Quit("PhyEnumBiParts %u + %u != %u", *ptruCount1, *ptruCount2, tree.GetLeafCount()); #if DEBUG { for (unsigned i = 0; i < *ptruCount1; ++i) { if (!tree.IsLeaf(Leaves1[i])) Quit("PhyEnumByParts: not leaf"); for (unsigned j = 0; j < *ptruCount2; ++j) { if (!tree.IsLeaf(Leaves2[j])) Quit("PhyEnumByParts: not leaf"); if (Leaves1[i] == Leaves2[j]) Quit("PhyEnumByParts: dupe"); } } } #endif return true; } #if 0 void TestBiPart() { SetListFileName("c:\\tmp\\lobster.log", false); Tree tree; TextFile fileIn("c:\\tmp\\test.phy"); tree.FromFile(fileIn); tree.LogMe(); const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Leaves1 = new unsigned[uNodeCount]; unsigned *Leaves2 = new unsigned[uNodeCount]; PhyEnumEdgeState ES; bool bDone = false; for (;;) { unsigned uCount1 = uInsane; unsigned uCount2 = uInsane; bool bOk = PhyEnumBiParts(tree, ES, Leaves1, &uCount1, Leaves2, &uCount2); Log("PEBP=%d ES.Init=%d ES.ni1=%d ES.ni2=%d\n", bOk, ES.m_bInit, ES.m_uNodeIndex1, ES.m_uNodeIndex2); if (!bOk) break; Log("\n"); Log("Part1: "); for (unsigned n = 0; n < uCount1; ++n) Log(" %d(%s)", Leaves1[n], tree.GetLeafName(Leaves1[n])); Log("\n"); Log("Part2: "); for (unsigned n = 0; n < uCount2; ++n) Log(" %d(%s)", Leaves2[n], tree.GetLeafName(Leaves2[n])); Log("\n"); } } #endif static void GetLeavesSubtreeExcluding(const Tree &tree, unsigned uNodeIndex, unsigned uExclude, unsigned Leaves[], unsigned *ptruCount) { if (uNodeIndex == uExclude) return; if (tree.IsLeaf(uNodeIndex)) { Leaves[*ptruCount] = uNodeIndex; ++(*ptruCount); return; } const unsigned uLeft = tree.GetLeft(uNodeIndex); const unsigned uRight = 
tree.GetRight(uNodeIndex); if (NULL_NEIGHBOR != uLeft) GetLeavesSubtreeExcluding(tree, uLeft, uExclude, Leaves, ptruCount); if (NULL_NEIGHBOR != uRight) GetLeavesSubtreeExcluding(tree, uRight, uExclude, Leaves, ptruCount); } void GetLeavesExcluding(const Tree &tree, unsigned uNodeIndex, unsigned uExclude, unsigned Leaves[], unsigned *ptruCount) { *ptruCount = 0; GetLeavesSubtreeExcluding(tree, uNodeIndex, uExclude, Leaves, ptruCount); } void GetInternalNodesInHeightOrder(const Tree &tree, unsigned NodeIndexes[]) { const unsigned uNodeCount = tree.GetNodeCount(); if (uNodeCount < 3) Quit("GetInternalNodesInHeightOrder: %u nodes, none are internal", uNodeCount); const unsigned uInternalNodeCount = (uNodeCount - 1)/2; double *Heights = new double[uInternalNodeCount]; unsigned uIndex = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (tree.IsLeaf(uNodeIndex)) continue; NodeIndexes[uIndex] = uNodeIndex; Heights[uIndex] = tree.GetNodeHeight(uNodeIndex); ++uIndex; } if (uIndex != uInternalNodeCount) Quit("Internal error: GetInternalNodesInHeightOrder"); // Simple but slow bubble sort (probably don't care about speed here) bool bDone = false; while (!bDone) { bDone = true; for (unsigned i = 0; i < uInternalNodeCount - 1; ++i) { if (Heights[i] > Heights[i+1]) { double dTmp = Heights[i]; Heights[i] = Heights[i+1]; Heights[i+1] = dTmp; unsigned uTmp = NodeIndexes[i]; NodeIndexes[i] = NodeIndexes[i+1]; NodeIndexes[i+1] = uTmp; bDone = false; } } } #if TRACE Log("Internal node index Height\n"); Log("------------------- --------\n"); // 1234567890123456789 123456789 for (unsigned n = 0; n < uInternalNodeCount; ++n) Log("%19u %9.3f\n", NodeIndexes[n], Heights[n]); #endif delete[] Heights; } void ApplyMinEdgeLength(Tree &tree, double dMinEdgeLength) { const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { const unsigned uNeighborCount = tree.GetNeighborCount(uNodeIndex); for (unsigned n = 
0; n < uNeighborCount; ++n) { const unsigned uNeighborNodeIndex = tree.GetNeighbor(uNodeIndex, n); if (!tree.HasEdgeLength(uNodeIndex, uNeighborNodeIndex)) continue; if (tree.GetEdgeLength(uNodeIndex, uNeighborNodeIndex) < dMinEdgeLength) tree.SetEdgeLength(uNodeIndex, uNeighborNodeIndex, dMinEdgeLength); } } } phy3.cpp0000664000175000017500000003264012360262614010544 0ustar bobbob#include "muscle.h" #include "tree.h" #include "edgelist.h" #define TRACE 0 struct EdgeInfo { EdgeInfo() { m_bSet = false; } // Is data in this structure valid (i.e, has been set)? bool m_bSet; // Node at start of this edge unsigned m_uNode1; // Node at end of this edge unsigned m_uNode2; // Maximum distance from Node2 to a leaf double m_dMaxDistToLeaf; // Sum of distances from Node2 to all leaves under Node2 double m_dTotalDistToLeaves; // Next node on path from Node2 to most distant leaf unsigned m_uMaxStep; // Most distant leaf from Node2 (used for debugging only) unsigned m_uMostDistantLeaf; // Number of leaves under Node2 unsigned m_uLeafCount; }; static void RootByMidLongestSpan(const Tree &tree, EdgeInfo **EIs, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2); static void RootByMinAvgLeafDist(const Tree &tree, EdgeInfo **EIs, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2); static void ListEIs(EdgeInfo **EIs, unsigned uNodeCount) { Log("Node1 Node2 MaxDist TotDist MostDist LeafCount Step\n"); Log("----- ----- ------- ------- -------- --------- ----\n"); // 12345 12345 1234567 1234567 12345678 123456789 for (unsigned uNode = 0; uNode < uNodeCount; ++uNode) for (unsigned uNeighbor = 0; uNeighbor < 3; ++uNeighbor) { const EdgeInfo &EI = EIs[uNode][uNeighbor]; if (!EI.m_bSet) continue; Log("%5u %5u %7.3g %7.3g %8u %9u", EI.m_uNode1, EI.m_uNode2, EI.m_dMaxDistToLeaf, EI.m_dTotalDistToLeaves, EI.m_uMostDistantLeaf, EI.m_uLeafCount); if (NULL_NEIGHBOR != EI.m_uMaxStep) Log(" %4u", EI.m_uMaxStep); Log("\n"); } } static 
void CalcInfo(const Tree &tree, unsigned uNode1, unsigned uNode2, EdgeInfo **EIs) { const unsigned uNeighborIndex = tree.GetNeighborSubscript(uNode1, uNode2); EdgeInfo &EI = EIs[uNode1][uNeighborIndex]; EI.m_uNode1 = uNode1; EI.m_uNode2 = uNode2; if (tree.IsLeaf(uNode2)) { EI.m_dMaxDistToLeaf = 0; EI.m_dTotalDistToLeaves = 0; EI.m_uMaxStep = NULL_NEIGHBOR; EI.m_uMostDistantLeaf = uNode2; EI.m_uLeafCount = 1; EI.m_bSet = true; return; } double dMaxDistToLeaf = -1e29; double dTotalDistToLeaves = 0.0; unsigned uLeafCount = 0; unsigned uMostDistantLeaf = NULL_NEIGHBOR; unsigned uMaxStep = NULL_NEIGHBOR; const unsigned uNeighborCount = tree.GetNeighborCount(uNode2); for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub) { const unsigned uNode3 = tree.GetNeighbor(uNode2, uSub); if (uNode3 == uNode1) continue; const EdgeInfo &EINext = EIs[uNode2][uSub]; if (!EINext.m_bSet) Quit("CalcInfo: internal error, dist %u->%u not known", uNode2, uNode3); uLeafCount += EINext.m_uLeafCount; const double dEdgeLength = tree.GetEdgeLength(uNode2, uNode3); const double dTotalDist = EINext.m_dTotalDistToLeaves + EINext.m_uLeafCount*dEdgeLength; dTotalDistToLeaves += dTotalDist; const double dDist = EINext.m_dMaxDistToLeaf + dEdgeLength; if (dDist > dMaxDistToLeaf) { dMaxDistToLeaf = dDist; uMostDistantLeaf = EINext.m_uMostDistantLeaf; uMaxStep = uNode3; } } if (NULL_NEIGHBOR == uMaxStep || NULL_NEIGHBOR == uMostDistantLeaf || 0 == uLeafCount) Quit("CalcInfo: internal error 2"); const double dThisDist = tree.GetEdgeLength(uNode1, uNode2); EI.m_dMaxDistToLeaf = dMaxDistToLeaf; EI.m_dTotalDistToLeaves = dTotalDistToLeaves; EI.m_uMaxStep = uMaxStep; EI.m_uMostDistantLeaf = uMostDistantLeaf; EI.m_uLeafCount = uLeafCount; EI.m_bSet = true; } static bool Known(const Tree &tree, EdgeInfo **EIs, unsigned uNodeFrom, unsigned uNodeTo) { const unsigned uSub = tree.GetNeighborSubscript(uNodeFrom, uNodeTo); return EIs[uNodeFrom][uSub].m_bSet; } static bool AllKnownOut(const Tree &tree, EdgeInfo **EIs, 
unsigned uNodeFrom, unsigned uNodeTo) { const unsigned uNeighborCount = tree.GetNeighborCount(uNodeTo); for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub) { unsigned uNeighborIndex = tree.GetNeighbor(uNodeTo, uSub); if (uNeighborIndex == uNodeFrom) continue; if (!EIs[uNodeTo][uSub].m_bSet) return false; } return true; } void FindRoot(const Tree &tree, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2, ROOT RootMethod) { #if TRACE tree.LogMe(); #endif if (tree.IsRooted()) Quit("FindRoot: tree already rooted"); const unsigned uNodeCount = tree.GetNodeCount(); const unsigned uLeafCount = tree.GetLeafCount(); if (uNodeCount < 2) Quit("Root: don't support trees with < 2 edges"); EdgeInfo **EIs = new EdgeInfo *[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) EIs[uNodeIndex] = new EdgeInfo[3]; EdgeList Edges; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) if (tree.IsLeaf(uNodeIndex)) { unsigned uParent = tree.GetNeighbor1(uNodeIndex); Edges.Add(uParent, uNodeIndex); } #if TRACE Log("Edges: "); Edges.LogMe(); #endif // Main loop: iterate until all distances known double dAllMaxDist = -1e20; unsigned uMaxFrom = NULL_NEIGHBOR; unsigned uMaxTo = NULL_NEIGHBOR; for (;;) { EdgeList NextEdges; #if TRACE Log("\nTop of main loop\n"); Log("Edges: "); Edges.LogMe(); Log("MDs:\n"); ListEIs(EIs, uNodeCount); #endif // For all edges const unsigned uEdgeCount = Edges.GetCount(); if (0 == uEdgeCount) break; for (unsigned n = 0; n < uEdgeCount; ++n) { unsigned uNodeFrom; unsigned uNodeTo; Edges.GetEdge(n, &uNodeFrom, &uNodeTo); CalcInfo(tree, uNodeFrom, uNodeTo, EIs); #if TRACE Log("Edge %u -> %u\n", uNodeFrom, uNodeTo); #endif const unsigned uNeighborCount = tree.GetNeighborCount(uNodeFrom); for (unsigned i = 0; i < uNeighborCount; ++i) { const unsigned uNeighborIndex = tree.GetNeighbor(uNodeFrom, i); if (!Known(tree, EIs, uNeighborIndex, uNodeFrom) && AllKnownOut(tree, EIs, uNeighborIndex, 
uNodeFrom)) NextEdges.Add(uNeighborIndex, uNodeFrom); } } Edges.Copy(NextEdges); } #if TRACE ListEIs(EIs, uNodeCount); #endif switch (RootMethod) { case ROOT_MidLongestSpan: RootByMidLongestSpan(tree, EIs, ptruNode1, ptruNode2, ptrdLength1, ptrdLength2); break; case ROOT_MinAvgLeafDist: RootByMinAvgLeafDist(tree, EIs, ptruNode1, ptruNode2, ptrdLength1, ptrdLength2); break; default: Quit("Invalid RootMethod=%d", RootMethod); } for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) delete[] EIs[uNodeIndex]; delete[] EIs; } static void RootByMidLongestSpan(const Tree &tree, EdgeInfo **EIs, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2) { const unsigned uNodeCount = tree.GetNodeCount(); unsigned uLeaf1 = NULL_NEIGHBOR; unsigned uMostDistantLeaf = NULL_NEIGHBOR; double dMaxDist = -VERY_LARGE_DOUBLE; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!tree.IsLeaf(uNodeIndex)) continue; const unsigned uNode2 = tree.GetNeighbor1(uNodeIndex); if (NULL_NEIGHBOR == uNode2) Quit("RootByMidLongestSpan: internal error 0"); const double dEdgeLength = tree.GetEdgeLength(uNodeIndex, uNode2); const EdgeInfo &EI = EIs[uNodeIndex][0]; if (!EI.m_bSet) Quit("RootByMidLongestSpan: internal error 1"); if (EI.m_uNode1 != uNodeIndex || EI.m_uNode2 != uNode2) Quit("RootByMidLongestSpan: internal error 2"); const double dSpanLength = dEdgeLength + EI.m_dMaxDistToLeaf; if (dSpanLength > dMaxDist) { dMaxDist = dSpanLength; uLeaf1 = uNodeIndex; uMostDistantLeaf = EI.m_uMostDistantLeaf; } } if (NULL_NEIGHBOR == uLeaf1) Quit("RootByMidLongestSpan: internal error 3"); const double dTreeHeight = dMaxDist/2.0; unsigned uNode1 = uLeaf1; unsigned uNode2 = tree.GetNeighbor1(uLeaf1); double dAccumSpanLength = 0; #if TRACE Log("RootByMidLongestSpan: span=%u", uLeaf1); #endif for (;;) { const double dEdgeLength = tree.GetEdgeLength(uNode1, uNode2); #if TRACE Log("->%u(%g;%g)", uNode2, dEdgeLength, dAccumSpanLength); #endif if 
(dAccumSpanLength + dEdgeLength >= dTreeHeight) { *ptruNode1 = uNode1; *ptruNode2 = uNode2; *ptrdLength1 = dTreeHeight - dAccumSpanLength; *ptrdLength2 = dEdgeLength - *ptrdLength1; #if TRACE { const EdgeInfo &EI = EIs[uLeaf1][0]; Log("...\n"); Log("Midpoint: Leaf1=%u Leaf2=%u Node1=%u Node2=%u Length1=%g Length2=%g\n", uLeaf1, EI.m_uMostDistantLeaf, *ptruNode1, *ptruNode2, *ptrdLength1, *ptrdLength2); } #endif return; } if (tree.IsLeaf(uNode2)) Quit("RootByMidLongestSpan: internal error 4"); dAccumSpanLength += dEdgeLength; const unsigned uSub = tree.GetNeighborSubscript(uNode1, uNode2); const EdgeInfo &EI = EIs[uNode1][uSub]; if (!EI.m_bSet) Quit("RootByMidLongestSpan: internal error 5"); uNode1 = uNode2; uNode2 = EI.m_uMaxStep; } } /*** Root by balancing average distance to leaves. The root is a point p such that the average distance to leaves to the left of p is the same as the to the right. This is the method used by CLUSTALW, which was originally used in PROFILEWEIGHT: Thompson et al. (1994) CABIOS (10) 1, 19-29. ***/ static void RootByMinAvgLeafDist(const Tree &tree, EdgeInfo **EIs, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2) { const unsigned uNodeCount = tree.GetNodeCount(); const unsigned uLeafCount = tree.GetLeafCount(); unsigned uNode1 = NULL_NEIGHBOR; unsigned uNode2 = NULL_NEIGHBOR; double dMinHeight = VERY_LARGE_DOUBLE; double dBestLength1 = VERY_LARGE_DOUBLE; double dBestLength2 = VERY_LARGE_DOUBLE; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { const unsigned uNeighborCount = tree.GetNeighborCount(uNodeIndex); for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub) { const unsigned uNeighborIndex = tree.GetNeighbor(uNodeIndex, uSub); // Avoid visiting same edge a second time in reversed order. 
if (uNeighborIndex < uNodeIndex) continue; const unsigned uSubRev = tree.GetNeighborSubscript(uNeighborIndex, uNodeIndex); if (NULL_NEIGHBOR == uSubRev) Quit("RootByMinAvgLeafDist, internal error 1"); // Get info for edges Node1->Node2 and Node2->Node1 (reversed) const EdgeInfo &EI = EIs[uNodeIndex][uSub]; const EdgeInfo &EIRev = EIs[uNeighborIndex][uSubRev]; if (EI.m_uNode1 != uNodeIndex || EI.m_uNode2 != uNeighborIndex || EIRev.m_uNode1 != uNeighborIndex || EIRev.m_uNode2 != uNodeIndex) Quit("RootByMinAvgLeafDist, internal error 2"); if (!EI.m_bSet) Quit("RootByMinAvgLeafDist, internal error 3"); if (uLeafCount != EI.m_uLeafCount + EIRev.m_uLeafCount) Quit("RootByMinAvgLeafDist, internal error 4"); const double dEdgeLength = tree.GetEdgeLength(uNodeIndex, uNeighborIndex); if (dEdgeLength != tree.GetEdgeLength(uNeighborIndex, uNodeIndex)) Quit("RootByMinAvgLeafDist, internal error 5"); // Consider point p on edge 12 in tree (1=Node, 2=Neighbor). // // ----- ---- // | | // 1----p--2 // | | // ----- ---- // // Define: // ADLp = average distance to leaves to left of point p. // ADRp = average distance to leaves to right of point p. // L = edge length = distance 12 // x = distance 1p // So distance p2 = L - x. // Average distance from p to leaves on left of p is: // ADLp = ADL1 + x // Average distance from p to leaves on right of p is: // ADRp = ADR2 + (L - x) // To be a root, we require these two distances to be equal, // ADLp = ADRp // ADL1 + x = ADR2 + (L - x) // Solving for x, // x = (ADR2 - ADL1 + L)/2 // If 0 <= x <= L, we can place the root on edge 12. 
const double ADL1 = EI.m_dTotalDistToLeaves / EI.m_uLeafCount; const double ADR2 = EIRev.m_dTotalDistToLeaves / EIRev.m_uLeafCount; const double x = (ADR2 - ADL1 + dEdgeLength)/2.0; if (x >= 0 && x <= dEdgeLength) { const double dLength1 = x; const double dLength2 = dEdgeLength - x; const double dHeight1 = EI.m_dMaxDistToLeaf + dLength1; const double dHeight2 = EIRev.m_dMaxDistToLeaf + dLength2; const double dHeight = dHeight1 >= dHeight2 ? dHeight1 : dHeight2; #if TRACE Log("Candidate root Node1=%u Node2=%u Height=%g\n", uNodeIndex, uNeighborIndex, dHeight); #endif if (dHeight < dMinHeight) { uNode1 = uNodeIndex; uNode2 = uNeighborIndex; dBestLength1 = dLength1; dBestLength2 = dLength2; dMinHeight = dHeight; } } } } if (NULL_NEIGHBOR == uNode1 || NULL_NEIGHBOR == uNode2) Quit("RootByMinAvgLeafDist, internal error 6"); #if TRACE Log("Best root Node1=%u Node2=%u Length1=%g Length2=%g Height=%g\n", uNode1, uNode2, dBestLength1, dBestLength2, dMinHeight); #endif *ptruNode1 = uNode1; *ptruNode2 = uNode2; *ptrdLength1 = dBestLength1; *ptrdLength2 = dBestLength2; } void FixRoot(Tree &tree, ROOT Method) { if (!tree.IsRooted()) Quit("FixRoot: expecting rooted tree"); // Pseudo-root: keep root assigned by clustering if (ROOT_Pseudo == Method) return; tree.UnrootByDeletingRoot(); tree.RootUnrootedTree(Method); } phy4.cpp0000664000175000017500000002074712360262614010552 0ustar bobbob#include "muscle.h" #include "tree.h" #include #define TRACE 0 void ClusterByHeight(const Tree &tree, double dMaxHeight, unsigned Subtrees[], unsigned *ptruSubtreeCount) { if (!tree.IsRooted()) Quit("ClusterByHeight: requires rooted tree"); #if TRACE Log("ClusterByHeight, max height=%g\n", dMaxHeight); #endif unsigned uSubtreeCount = 0; const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (tree.IsRoot(uNodeIndex)) continue; unsigned uParent = tree.GetParent(uNodeIndex); double dHeight = tree.GetNodeHeight(uNodeIndex); double 
dParentHeight = tree.GetNodeHeight(uParent); #if TRACE Log("Node %3u Height %5.2f ParentHeight %5.2f\n", uNodeIndex, dHeight, dParentHeight); #endif if (dParentHeight > dMaxHeight && dHeight <= dMaxHeight) { Subtrees[uSubtreeCount] = uNodeIndex; #if TRACE Log("Subtree[%u]=%u\n", uSubtreeCount, uNodeIndex); #endif ++uSubtreeCount; } } *ptruSubtreeCount = uSubtreeCount; } static void ClusterBySubfamCount_Iteration(const Tree &tree, unsigned Subfams[], unsigned uCount) { // Find highest child node of current set of subfamilies. double dHighestHeight = -1e20; int iParentSubscript = -1; for (int n = 0; n < (int) uCount; ++n) { const unsigned uNodeIndex = Subfams[n]; if (tree.IsLeaf(uNodeIndex)) continue; const unsigned uLeft = tree.GetLeft(uNodeIndex); const double dHeightLeft = tree.GetNodeHeight(uLeft); if (dHeightLeft > dHighestHeight) { dHighestHeight = dHeightLeft; iParentSubscript = n; } const unsigned uRight = tree.GetRight(uNodeIndex); const double dHeightRight = tree.GetNodeHeight(uRight); if (dHeightRight > dHighestHeight) { dHighestHeight = dHeightRight; iParentSubscript = n; } } if (-1 == iParentSubscript) Quit("CBSFCIter: failed to find highest child"); const unsigned uNodeIndex = Subfams[iParentSubscript]; const unsigned uLeft = tree.GetLeft(uNodeIndex); const unsigned uRight = tree.GetRight(uNodeIndex); // Delete parent by replacing with left child Subfams[iParentSubscript] = uLeft; // Append right child to list Subfams[uCount] = uRight; #if TRACE { Log("Iter %3u:", uCount); for (unsigned n = 0; n < uCount; ++n) Log(" %u", Subfams[n]); Log("\n"); } #endif } // Divide a tree containing N leaves into k families by // cutting the tree at a horizontal line at some height. // Each internal node defines a height for the cut, // considering all internal nodes enumerates all distinct // cuts. Visit internal nodes in decreasing order of height. // Visiting the node corresponds to moving the horizontal // line down to cut the tree at the height of that node. 
// We consider the cut to be "infinitestimally below" // the node, so the effect is to remove the current node // from the list of subfamilies and add its two children. // We must visit a parent before its children (so care may // be needed to handle zero edge lengths properly). // We assume that N is small, and write dumb O(N^2) code. // More efficient strategies are possible for large N // by maintaining a list of nodes sorted by height. void ClusterBySubfamCount(const Tree &tree, unsigned uSubfamCount, unsigned Subfams[], unsigned *ptruSubfamCount) { const unsigned uNodeCount = tree.GetNodeCount(); const unsigned uLeafCount = (uNodeCount + 1)/2; // Special case: empty tree if (0 == uNodeCount) { *ptruSubfamCount = 0; return; } // Special case: more subfamilies than leaves if (uSubfamCount >= uLeafCount) { for (unsigned n = 0; n < uLeafCount; ++n) Subfams[n] = n; *ptruSubfamCount = uLeafCount; return; } // Initialize list of subfamilies to be root Subfams[0] = tree.GetRootNodeIndex(); // Iterate for (unsigned i = 1; i < uSubfamCount; ++i) ClusterBySubfamCount_Iteration(tree, Subfams, i); *ptruSubfamCount = uSubfamCount; } static void GetLeavesRecurse(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[], unsigned &uLeafCount /* in-out */) { if (tree.IsLeaf(uNodeIndex)) { Leaves[uLeafCount] = uNodeIndex; ++uLeafCount; return; } const unsigned uLeft = tree.GetLeft(uNodeIndex); const unsigned uRight = tree.GetRight(uNodeIndex); GetLeavesRecurse(tree, uLeft, Leaves, uLeafCount); GetLeavesRecurse(tree, uRight, Leaves, uLeafCount); } void GetLeaves(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[], unsigned *ptruLeafCount) { unsigned uLeafCount = 0; GetLeavesRecurse(tree, uNodeIndex, Leaves, uLeafCount); *ptruLeafCount = uLeafCount; } void Tree::PruneTree(const Tree &tree, unsigned Subfams[], unsigned uSubfamCount) { if (!tree.IsRooted()) Quit("Tree::PruneTree: requires rooted tree"); Clear(); m_uNodeCount = 2*uSubfamCount - 1; InitCache(m_uNodeCount); const 
unsigned uUnprunedNodeCount = tree.GetNodeCount(); unsigned *uUnprunedToPrunedIndex = new unsigned[uUnprunedNodeCount]; unsigned *uPrunedToUnprunedIndex = new unsigned[m_uNodeCount]; for (unsigned n = 0; n < uUnprunedNodeCount; ++n) uUnprunedToPrunedIndex[n] = NULL_NEIGHBOR; for (unsigned n = 0; n < m_uNodeCount; ++n) uPrunedToUnprunedIndex[n] = NULL_NEIGHBOR; // Create mapping between unpruned and pruned node indexes unsigned uInternalNodeIndex = uSubfamCount; for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uUnprunedNodeIndex = Subfams[uSubfamIndex]; uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uSubfamIndex; uPrunedToUnprunedIndex[uSubfamIndex] = uUnprunedNodeIndex; for (;;) { uUnprunedNodeIndex = tree.GetParent(uUnprunedNodeIndex); if (tree.IsRoot(uUnprunedNodeIndex)) break; // Already visited this node? if (NULL_NEIGHBOR != uUnprunedToPrunedIndex[uUnprunedNodeIndex]) break; uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uInternalNodeIndex; uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedNodeIndex; ++uInternalNodeIndex; } } const unsigned uUnprunedRootIndex = tree.GetRootNodeIndex(); uUnprunedToPrunedIndex[uUnprunedRootIndex] = uInternalNodeIndex; uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedRootIndex; #if TRACE { Log("Pruned to unpruned:\n"); for (unsigned i = 0; i < m_uNodeCount; ++i) Log(" [%u]=%u", i, uPrunedToUnprunedIndex[i]); Log("\n"); Log("Unpruned to pruned:\n"); for (unsigned i = 0; i < uUnprunedNodeCount; ++i) { unsigned n = uUnprunedToPrunedIndex[i]; if (n != NULL_NEIGHBOR) Log(" [%u]=%u", i, n); } Log("\n"); } #endif if (uInternalNodeIndex != m_uNodeCount - 1) Quit("Tree::PruneTree, Internal error"); // Nodes 0, 1 ... 
are the leaves for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { char szName[32]; sprintf(szName, "Subfam_%u", uSubfamIndex + 1); m_ptrName[uSubfamIndex] = strsave(szName); } for (unsigned uPrunedNodeIndex = uSubfamCount; uPrunedNodeIndex < m_uNodeCount; ++uPrunedNodeIndex) { unsigned uUnprunedNodeIndex = uPrunedToUnprunedIndex[uPrunedNodeIndex]; const unsigned uUnprunedLeft = tree.GetLeft(uUnprunedNodeIndex); const unsigned uUnprunedRight = tree.GetRight(uUnprunedNodeIndex); const unsigned uPrunedLeft = uUnprunedToPrunedIndex[uUnprunedLeft]; const unsigned uPrunedRight = uUnprunedToPrunedIndex[uUnprunedRight]; const double dLeftLength = tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedLeft); const double dRightLength = tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedRight); m_uNeighbor2[uPrunedNodeIndex] = uPrunedLeft; m_uNeighbor3[uPrunedNodeIndex] = uPrunedRight; m_dEdgeLength1[uPrunedLeft] = dLeftLength; m_dEdgeLength1[uPrunedRight] = dRightLength; m_uNeighbor1[uPrunedLeft] = uPrunedNodeIndex; m_uNeighbor1[uPrunedRight] = uPrunedNodeIndex; m_bHasEdgeLength1[uPrunedLeft] = true; m_bHasEdgeLength1[uPrunedRight] = true; m_dEdgeLength2[uPrunedNodeIndex] = dLeftLength; m_dEdgeLength3[uPrunedNodeIndex] = dRightLength; m_bHasEdgeLength2[uPrunedNodeIndex] = true; m_bHasEdgeLength3[uPrunedNodeIndex] = true; } m_uRootNodeIndex = uUnprunedToPrunedIndex[uUnprunedRootIndex]; m_bRooted = true; Validate(); delete[] uUnprunedToPrunedIndex; } void LeafIndexesToIds(const Tree &tree, const unsigned Leaves[], unsigned uCount, unsigned Ids[]) { for (unsigned n = 0; n < uCount; ++n) Ids[n] = tree.GetLeafId(Leaves[n]); } phy.cpp0000664000175000017500000007434312360262614010467 0ustar bobbob#include "muscle.h" #include "tree.h" #include #define TRACE 0 /*** Node has 0 to 3 neighbors: 0 neighbors: singleton root 1 neighbor: leaf, neighbor is parent 2 neigbors: non-singleton root 3 neighbors: internal node (other than root) Minimal rooted tree is single node. 
Minimal unrooted tree is single edge. Leaf node always has nulls in neighbors 2 and 3, neighbor 1 is parent. When tree is rooted, neighbor 1=parent, 2=left, 3=right. ***/ void Tree::AssertAreNeighbors(unsigned uNodeIndex1, unsigned uNodeIndex2) const { if (uNodeIndex1 >= m_uNodeCount || uNodeIndex2 >= m_uNodeCount) Quit("AssertAreNeighbors(%u,%u), are %u nodes", uNodeIndex1, uNodeIndex2, m_uNodeCount); if (m_uNeighbor1[uNodeIndex1] != uNodeIndex2 && m_uNeighbor2[uNodeIndex1] != uNodeIndex2 && m_uNeighbor3[uNodeIndex1] != uNodeIndex2) { LogMe(); Quit("AssertAreNeighbors(%u,%u) failed", uNodeIndex1, uNodeIndex2); } if (m_uNeighbor1[uNodeIndex2] != uNodeIndex1 && m_uNeighbor2[uNodeIndex2] != uNodeIndex1 && m_uNeighbor3[uNodeIndex2] != uNodeIndex1) { LogMe(); Quit("AssertAreNeighbors(%u,%u) failed", uNodeIndex1, uNodeIndex2); } bool Has12 = HasEdgeLength(uNodeIndex1, uNodeIndex2); bool Has21 = HasEdgeLength(uNodeIndex2, uNodeIndex1); if (Has12 != Has21) { HasEdgeLength(uNodeIndex1, uNodeIndex2); HasEdgeLength(uNodeIndex2, uNodeIndex1); LogMe(); Log("HasEdgeLength(%u, %u)=%c HasEdgeLength(%u, %u)=%c\n", uNodeIndex1, uNodeIndex2, Has12 ? 'T' : 'F', uNodeIndex2, uNodeIndex1, Has21 ? 
'T' : 'F'); Quit("Tree::AssertAreNeighbors, HasEdgeLength not symmetric"); } if (Has12) { double d12 = GetEdgeLength(uNodeIndex1, uNodeIndex2); double d21 = GetEdgeLength(uNodeIndex2, uNodeIndex1); if (d12 != d21) { LogMe(); Quit("Tree::AssertAreNeighbors, Edge length disagrees %u-%u=%.3g, %u-%u=%.3g", uNodeIndex1, uNodeIndex2, d12, uNodeIndex2, uNodeIndex1, d21); } } } void Tree::ValidateNode(unsigned uNodeIndex) const { if (uNodeIndex >= m_uNodeCount) Quit("ValidateNode(%u), %u nodes", uNodeIndex, m_uNodeCount); const unsigned uNeighborCount = GetNeighborCount(uNodeIndex); if (2 == uNeighborCount) { if (!m_bRooted) { LogMe(); Quit("Tree::ValidateNode: Node %u has two neighbors, tree is not rooted", uNodeIndex); } if (uNodeIndex != m_uRootNodeIndex) { LogMe(); Quit("Tree::ValidateNode: Node %u has two neighbors, but not root node=%u", uNodeIndex, m_uRootNodeIndex); } } const unsigned n1 = m_uNeighbor1[uNodeIndex]; const unsigned n2 = m_uNeighbor2[uNodeIndex]; const unsigned n3 = m_uNeighbor3[uNodeIndex]; if (NULL_NEIGHBOR == n2 && NULL_NEIGHBOR != n3) { LogMe(); Quit("Tree::ValidateNode, n2=null, n3!=null", uNodeIndex); } if (NULL_NEIGHBOR == n3 && NULL_NEIGHBOR != n2) { LogMe(); Quit("Tree::ValidateNode, n3=null, n2!=null", uNodeIndex); } if (n1 != NULL_NEIGHBOR) AssertAreNeighbors(uNodeIndex, n1); if (n2 != NULL_NEIGHBOR) AssertAreNeighbors(uNodeIndex, n2); if (n3 != NULL_NEIGHBOR) AssertAreNeighbors(uNodeIndex, n3); if (n1 != NULL_NEIGHBOR && (n1 == n2 || n1 == n3)) { LogMe(); Quit("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex); } if (n2 != NULL_NEIGHBOR && (n2 == n1 || n2 == n3)) { LogMe(); Quit("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex); } if (n3 != NULL_NEIGHBOR && (n3 == n1 || n3 == n2)) { LogMe(); Quit("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex); } if (IsRooted()) { if (NULL_NEIGHBOR == GetParent(uNodeIndex)) { if (uNodeIndex != m_uRootNodeIndex) { LogMe(); Quit("Tree::ValiateNode(%u), no 
parent", uNodeIndex); } } else if (GetLeft(GetParent(uNodeIndex)) != uNodeIndex && GetRight(GetParent(uNodeIndex)) != uNodeIndex) { LogMe(); Quit("Tree::ValidateNode(%u), parent / child mismatch", uNodeIndex); } } } void Tree::Validate() const { for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) ValidateNode(uNodeIndex); } bool Tree::IsEdge(unsigned uNodeIndex1, unsigned uNodeIndex2) const { assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); return m_uNeighbor1[uNodeIndex1] == uNodeIndex2 || m_uNeighbor2[uNodeIndex1] == uNodeIndex2 || m_uNeighbor3[uNodeIndex1] == uNodeIndex2; } double Tree::GetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const { assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); if (!HasEdgeLength(uNodeIndex1, uNodeIndex2)) { LogMe(); Quit("Missing edge length in tree %u-%u", uNodeIndex1, uNodeIndex2); } if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) return m_dEdgeLength1[uNodeIndex1]; else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) return m_dEdgeLength2[uNodeIndex1]; assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); return m_dEdgeLength3[uNodeIndex1]; } void Tree::ExpandCache() { const unsigned uNodeCount = 100; unsigned uNewCacheCount = m_uCacheCount + uNodeCount; unsigned *uNewNeighbor1 = new unsigned[uNewCacheCount]; unsigned *uNewNeighbor2 = new unsigned[uNewCacheCount]; unsigned *uNewNeighbor3 = new unsigned[uNewCacheCount]; unsigned *uNewIds = new unsigned[uNewCacheCount]; memset(uNewIds, 0xff, uNewCacheCount*sizeof(unsigned)); double *dNewEdgeLength1 = new double[uNewCacheCount]; double *dNewEdgeLength2 = new double[uNewCacheCount]; double *dNewEdgeLength3 = new double[uNewCacheCount]; double *dNewHeight = new double[uNewCacheCount]; bool *bNewHasEdgeLength1 = new bool[uNewCacheCount]; bool *bNewHasEdgeLength2 = new bool[uNewCacheCount]; bool *bNewHasEdgeLength3 = new bool[uNewCacheCount]; bool *bNewHasHeight = new bool[uNewCacheCount]; char **ptrNewName = new char 
*[uNewCacheCount]; memset(ptrNewName, 0, uNewCacheCount*sizeof(char *)); if (m_uCacheCount > 0) { const unsigned uUnsignedBytes = m_uCacheCount*sizeof(unsigned); memcpy(uNewNeighbor1, m_uNeighbor1, uUnsignedBytes); memcpy(uNewNeighbor2, m_uNeighbor2, uUnsignedBytes); memcpy(uNewNeighbor3, m_uNeighbor3, uUnsignedBytes); memcpy(uNewIds, m_Ids, uUnsignedBytes); const unsigned uEdgeBytes = m_uCacheCount*sizeof(double); memcpy(dNewEdgeLength1, m_dEdgeLength1, uEdgeBytes); memcpy(dNewEdgeLength2, m_dEdgeLength2, uEdgeBytes); memcpy(dNewEdgeLength3, m_dEdgeLength3, uEdgeBytes); memcpy(dNewHeight, m_dHeight, uEdgeBytes); const unsigned uBoolBytes = m_uCacheCount*sizeof(bool); memcpy(bNewHasEdgeLength1, m_bHasEdgeLength1, uBoolBytes); memcpy(bNewHasEdgeLength2, m_bHasEdgeLength2, uBoolBytes); memcpy(bNewHasEdgeLength3, m_bHasEdgeLength3, uBoolBytes); memcpy(bNewHasHeight, m_bHasHeight, uBoolBytes); const unsigned uNameBytes = m_uCacheCount*sizeof(char *); memcpy(ptrNewName, m_ptrName, uNameBytes); delete[] m_uNeighbor1; delete[] m_uNeighbor2; delete[] m_uNeighbor3; delete[] m_Ids; delete[] m_dEdgeLength1; delete[] m_dEdgeLength2; delete[] m_dEdgeLength3; delete[] m_bHasEdgeLength1; delete[] m_bHasEdgeLength2; delete[] m_bHasEdgeLength3; delete[] m_bHasHeight; delete[] m_ptrName; } m_uCacheCount = uNewCacheCount; m_uNeighbor1 = uNewNeighbor1; m_uNeighbor2 = uNewNeighbor2; m_uNeighbor3 = uNewNeighbor3; m_Ids = uNewIds; m_dEdgeLength1 = dNewEdgeLength1; m_dEdgeLength2 = dNewEdgeLength2; m_dEdgeLength3 = dNewEdgeLength3; m_dHeight = dNewHeight; m_bHasEdgeLength1 = bNewHasEdgeLength1; m_bHasEdgeLength2 = bNewHasEdgeLength2; m_bHasEdgeLength3 = bNewHasEdgeLength3; m_bHasHeight = bNewHasHeight; m_ptrName = ptrNewName; } // Creates tree with single node, no edges. // Root node always has index 0. 
void Tree::CreateRooted() { Clear(); ExpandCache(); m_uNodeCount = 1; m_uNeighbor1[0] = NULL_NEIGHBOR; m_uNeighbor2[0] = NULL_NEIGHBOR; m_uNeighbor3[0] = NULL_NEIGHBOR; m_bHasEdgeLength1[0] = false; m_bHasEdgeLength2[0] = false; m_bHasEdgeLength3[0] = false; m_bHasHeight[0] = false; m_uRootNodeIndex = 0; m_bRooted = true; #if DEBUG Validate(); #endif } // Creates unrooted tree with single edge. // Nodes for that edge are always 0 and 1. void Tree::CreateUnrooted(double dEdgeLength) { Clear(); ExpandCache(); m_uNeighbor1[0] = 1; m_uNeighbor2[0] = NULL_NEIGHBOR; m_uNeighbor3[0] = NULL_NEIGHBOR; m_uNeighbor1[1] = 0; m_uNeighbor2[1] = NULL_NEIGHBOR; m_uNeighbor3[1] = NULL_NEIGHBOR; m_dEdgeLength1[0] = dEdgeLength; m_dEdgeLength1[1] = dEdgeLength; m_bHasEdgeLength1[0] = true; m_bHasEdgeLength1[1] = true; m_bRooted = false; #if DEBUG Validate(); #endif } void Tree::SetLeafName(unsigned uNodeIndex, const char *ptrName) { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); free(m_ptrName[uNodeIndex]); m_ptrName[uNodeIndex] = strsave(ptrName); } void Tree::SetLeafId(unsigned uNodeIndex, unsigned uId) { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); m_Ids[uNodeIndex] = uId; } const char *Tree::GetLeafName(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); return m_ptrName[uNodeIndex]; } unsigned Tree::GetLeafId(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); return m_Ids[uNodeIndex]; } // Append a new branch. // This adds two new nodes and joins them to an existing leaf node. // Return value is k, new nodes have indexes k and k+1 respectively. 
unsigned Tree::AppendBranch(unsigned uExistingLeafIndex) { if (0 == m_uNodeCount) Quit("Tree::AppendBranch: tree has not been created"); #if DEBUG assert(uExistingLeafIndex < m_uNodeCount); if (!IsLeaf(uExistingLeafIndex)) { LogMe(); Quit("AppendBranch(%u): not leaf", uExistingLeafIndex); } #endif if (m_uNodeCount >= m_uCacheCount - 2) ExpandCache(); const unsigned uNewLeaf1 = m_uNodeCount; const unsigned uNewLeaf2 = m_uNodeCount + 1; m_uNodeCount += 2; assert(m_uNeighbor2[uExistingLeafIndex] == NULL_NEIGHBOR); assert(m_uNeighbor3[uExistingLeafIndex] == NULL_NEIGHBOR); m_uNeighbor2[uExistingLeafIndex] = uNewLeaf1; m_uNeighbor3[uExistingLeafIndex] = uNewLeaf2; m_uNeighbor1[uNewLeaf1] = uExistingLeafIndex; m_uNeighbor1[uNewLeaf2] = uExistingLeafIndex; m_uNeighbor2[uNewLeaf1] = NULL_NEIGHBOR; m_uNeighbor2[uNewLeaf2] = NULL_NEIGHBOR; m_uNeighbor3[uNewLeaf1] = NULL_NEIGHBOR; m_uNeighbor3[uNewLeaf2] = NULL_NEIGHBOR; m_dEdgeLength2[uExistingLeafIndex] = 0; m_dEdgeLength3[uExistingLeafIndex] = 0; m_dEdgeLength1[uNewLeaf1] = 0; m_dEdgeLength2[uNewLeaf1] = 0; m_dEdgeLength3[uNewLeaf1] = 0; m_dEdgeLength1[uNewLeaf2] = 0; m_dEdgeLength2[uNewLeaf2] = 0; m_dEdgeLength3[uNewLeaf2] = 0; m_bHasEdgeLength1[uNewLeaf1] = false; m_bHasEdgeLength2[uNewLeaf1] = false; m_bHasEdgeLength3[uNewLeaf1] = false; m_bHasEdgeLength1[uNewLeaf2] = false; m_bHasEdgeLength2[uNewLeaf2] = false; m_bHasEdgeLength3[uNewLeaf2] = false; m_bHasHeight[uNewLeaf1] = false; m_bHasHeight[uNewLeaf2] = false; m_Ids[uNewLeaf1] = uInsane; m_Ids[uNewLeaf2] = uInsane; return uNewLeaf1; } void Tree::LogMe() const { Log("Tree::LogMe %u nodes, ", m_uNodeCount); if (IsRooted()) { Log("rooted.\n"); Log("\n"); Log("Index Parnt LengthP Left LengthL Right LengthR Id Name\n"); Log("----- ----- ------- ---- ------- ----- ------- ----- ----\n"); } else { Log("unrooted.\n"); Log("\n"); Log("Index Nbr_1 Length1 Nbr_2 Length2 Nbr_3 Length3 Id Name\n"); Log("----- ----- ------- ----- ------- ----- ------- ----- ----\n"); } for 
(unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { Log("%5u ", uNodeIndex); const unsigned n1 = m_uNeighbor1[uNodeIndex]; const unsigned n2 = m_uNeighbor2[uNodeIndex]; const unsigned n3 = m_uNeighbor3[uNodeIndex]; if (NULL_NEIGHBOR != n1) { Log("%5u ", n1); if (m_bHasEdgeLength1[uNodeIndex]) Log("%7.4f ", m_dEdgeLength1[uNodeIndex]); else Log(" * "); } else Log(" "); if (NULL_NEIGHBOR != n2) { Log("%5u ", n2); if (m_bHasEdgeLength2[uNodeIndex]) Log("%7.4f ", m_dEdgeLength2[uNodeIndex]); else Log(" * "); } else Log(" "); if (NULL_NEIGHBOR != n3) { Log("%5u ", n3); if (m_bHasEdgeLength3[uNodeIndex]) Log("%7.4f ", m_dEdgeLength3[uNodeIndex]); else Log(" * "); } else Log(" "); if (m_Ids != 0 && IsLeaf(uNodeIndex)) { unsigned uId = m_Ids[uNodeIndex]; if (uId == uInsane) Log(" *"); else Log("%5u", uId); } else Log(" "); if (m_bRooted && uNodeIndex == m_uRootNodeIndex) Log(" [ROOT] "); const char *ptrName = m_ptrName[uNodeIndex]; if (ptrName != 0) Log(" %s", ptrName); Log("\n"); } } void Tree::SetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2, double dLength) { assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); assert(IsEdge(uNodeIndex1, uNodeIndex2)); if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) { m_dEdgeLength1[uNodeIndex1] = dLength; m_bHasEdgeLength1[uNodeIndex1] = true; } else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) { m_dEdgeLength2[uNodeIndex1] = dLength; m_bHasEdgeLength2[uNodeIndex1] = true; } else { assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); m_dEdgeLength3[uNodeIndex1] = dLength; m_bHasEdgeLength3[uNodeIndex1] = true; } if (m_uNeighbor1[uNodeIndex2] == uNodeIndex1) { m_dEdgeLength1[uNodeIndex2] = dLength; m_bHasEdgeLength1[uNodeIndex2] = true; } else if (m_uNeighbor2[uNodeIndex2] == uNodeIndex1) { m_dEdgeLength2[uNodeIndex2] = dLength; m_bHasEdgeLength2[uNodeIndex2] = true; } else { assert(m_uNeighbor3[uNodeIndex2] == uNodeIndex1); m_dEdgeLength3[uNodeIndex2] = dLength; m_bHasEdgeLength3[uNodeIndex2] = true; } 
} unsigned Tree::UnrootFromFile() { #if TRACE Log("Before unroot:\n"); LogMe(); #endif if (!m_bRooted) Quit("Tree::Unroot, not rooted"); // Convention: root node is always node zero assert(IsRoot(0)); assert(NULL_NEIGHBOR == m_uNeighbor1[0]); const unsigned uThirdNode = m_uNodeCount++; m_uNeighbor1[0] = uThirdNode; m_uNeighbor1[uThirdNode] = 0; m_uNeighbor2[uThirdNode] = NULL_NEIGHBOR; m_uNeighbor3[uThirdNode] = NULL_NEIGHBOR; m_dEdgeLength1[0] = 0; m_dEdgeLength1[uThirdNode] = 0; m_bHasEdgeLength1[uThirdNode] = true; m_bRooted = false; #if TRACE Log("After unroot:\n"); LogMe(); #endif return uThirdNode; } // In an unrooted tree, equivalent of GetLeft/Right is // GetFirst/SecondNeighbor. // uNeighborIndex must be a known neighbor of uNodeIndex. // This is the way to find the other two neighbor nodes of // an internal node. // The labeling as "First" and "Second" neighbor is arbitrary. // Calling these functions on a leaf returns NULL_NEIGHBOR, as // for GetLeft/Right. unsigned Tree::GetFirstNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const { assert(uNodeIndex < m_uNodeCount); assert(uNeighborIndex < m_uNodeCount); assert(IsEdge(uNodeIndex, uNeighborIndex)); for (unsigned n = 0; n < 3; ++n) { unsigned uNeighbor = GetNeighbor(uNodeIndex, n); if (NULL_NEIGHBOR != uNeighbor && uNeighborIndex != uNeighbor) return uNeighbor; } return NULL_NEIGHBOR; } unsigned Tree::GetSecondNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const { assert(uNodeIndex < m_uNodeCount); assert(uNeighborIndex < m_uNodeCount); assert(IsEdge(uNodeIndex, uNeighborIndex)); bool bFoundOne = false; for (unsigned n = 0; n < 3; ++n) { unsigned uNeighbor = GetNeighbor(uNodeIndex, n); if (NULL_NEIGHBOR != uNeighbor && uNeighborIndex != uNeighbor) { if (bFoundOne) return uNeighbor; else bFoundOne = true; } } return NULL_NEIGHBOR; } // Compute the number of leaves in the sub-tree defined by an edge // in an unrooted tree. 
Conceptually, the tree is cut at this edge, // and uNodeIndex2 considered the root of the sub-tree. unsigned Tree::GetLeafCountUnrooted(unsigned uNodeIndex1, unsigned uNodeIndex2, double *ptrdTotalDistance) const { assert(!IsRooted()); if (IsLeaf(uNodeIndex2)) { *ptrdTotalDistance = GetEdgeLength(uNodeIndex1, uNodeIndex2); return 1; } // Recurse down the rooted sub-tree defined by cutting the edge // and considering uNodeIndex2 as the root. const unsigned uLeft = GetFirstNeighbor(uNodeIndex2, uNodeIndex1); const unsigned uRight = GetSecondNeighbor(uNodeIndex2, uNodeIndex1); double dLeftDistance; double dRightDistance; const unsigned uLeftCount = GetLeafCountUnrooted(uNodeIndex2, uLeft, &dLeftDistance); const unsigned uRightCount = GetLeafCountUnrooted(uNodeIndex2, uRight, &dRightDistance); *ptrdTotalDistance = dLeftDistance + dRightDistance; return uLeftCount + uRightCount; } void Tree::RootUnrootedTree(ROOT Method) { assert(!IsRooted()); #if TRACE Log("Tree::RootUnrootedTree, before:"); LogMe(); #endif unsigned uNode1; unsigned uNode2; double dLength1; double dLength2; FindRoot(*this, &uNode1, &uNode2, &dLength1, &dLength2, Method); if (m_uNodeCount == m_uCacheCount) ExpandCache(); m_uRootNodeIndex = m_uNodeCount++; double dEdgeLength = GetEdgeLength(uNode1, uNode2); m_uNeighbor1[m_uRootNodeIndex] = NULL_NEIGHBOR; m_uNeighbor2[m_uRootNodeIndex] = uNode1; m_uNeighbor3[m_uRootNodeIndex] = uNode2; if (m_uNeighbor1[uNode1] == uNode2) m_uNeighbor1[uNode1] = m_uRootNodeIndex; else if (m_uNeighbor2[uNode1] == uNode2) m_uNeighbor2[uNode1] = m_uRootNodeIndex; else { assert(m_uNeighbor3[uNode1] == uNode2); m_uNeighbor3[uNode1] = m_uRootNodeIndex; } if (m_uNeighbor1[uNode2] == uNode1) m_uNeighbor1[uNode2] = m_uRootNodeIndex; else if (m_uNeighbor2[uNode2] == uNode1) m_uNeighbor2[uNode2] = m_uRootNodeIndex; else { assert(m_uNeighbor3[uNode2] == uNode1); m_uNeighbor3[uNode2] = m_uRootNodeIndex; } OrientParent(uNode1, m_uRootNodeIndex); OrientParent(uNode2, m_uRootNodeIndex); 
SetEdgeLength(m_uRootNodeIndex, uNode1, dLength1); SetEdgeLength(m_uRootNodeIndex, uNode2, dLength2); m_bHasHeight[m_uRootNodeIndex] = false; m_ptrName[m_uRootNodeIndex] = 0; m_bRooted = true; #if TRACE Log("\nPhy::RootUnrootedTree, after:"); LogMe(); #endif Validate(); } bool Tree::HasEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const { assert(uNodeIndex1 < m_uNodeCount); assert(uNodeIndex2 < m_uNodeCount); assert(IsEdge(uNodeIndex1, uNodeIndex2)); if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) return m_bHasEdgeLength1[uNodeIndex1]; else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) return m_bHasEdgeLength2[uNodeIndex1]; assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); return m_bHasEdgeLength3[uNodeIndex1]; } void Tree::OrientParent(unsigned uNodeIndex, unsigned uParentNodeIndex) { if (NULL_NEIGHBOR == uNodeIndex) return; if (m_uNeighbor1[uNodeIndex] == uParentNodeIndex) ; else if (m_uNeighbor2[uNodeIndex] == uParentNodeIndex) { double dEdgeLength2 = m_dEdgeLength2[uNodeIndex]; m_uNeighbor2[uNodeIndex] = m_uNeighbor1[uNodeIndex]; m_dEdgeLength2[uNodeIndex] = m_dEdgeLength1[uNodeIndex]; m_uNeighbor1[uNodeIndex] = uParentNodeIndex; m_dEdgeLength1[uNodeIndex] = dEdgeLength2; } else { assert(m_uNeighbor3[uNodeIndex] == uParentNodeIndex); double dEdgeLength3 = m_dEdgeLength3[uNodeIndex]; m_uNeighbor3[uNodeIndex] = m_uNeighbor1[uNodeIndex]; m_dEdgeLength3[uNodeIndex] = m_dEdgeLength1[uNodeIndex]; m_uNeighbor1[uNodeIndex] = uParentNodeIndex; m_dEdgeLength1[uNodeIndex] = dEdgeLength3; } OrientParent(m_uNeighbor2[uNodeIndex], uNodeIndex); OrientParent(m_uNeighbor3[uNodeIndex], uNodeIndex); } unsigned Tree::FirstDepthFirstNode() const { assert(IsRooted()); // Descend via left branches until we hit a leaf unsigned uNodeIndex = m_uRootNodeIndex; while (!IsLeaf(uNodeIndex)) uNodeIndex = GetLeft(uNodeIndex); return uNodeIndex; } unsigned Tree::FirstDepthFirstNodeR() const { assert(IsRooted()); // Descend via left branches until we hit a leaf unsigned uNodeIndex = 
m_uRootNodeIndex; while (!IsLeaf(uNodeIndex)) uNodeIndex = GetRight(uNodeIndex); return uNodeIndex; } unsigned Tree::NextDepthFirstNode(unsigned uNodeIndex) const { #if TRACE Log("NextDepthFirstNode(%3u) ", uNodeIndex); #endif assert(IsRooted()); assert(uNodeIndex < m_uNodeCount); if (IsRoot(uNodeIndex)) { #if TRACE Log(">> Node %u is root, end of traversal\n", uNodeIndex); #endif return NULL_NEIGHBOR; } unsigned uParent = GetParent(uNodeIndex); if (GetRight(uParent) == uNodeIndex) { #if TRACE Log(">> Is right branch, return parent=%u\n", uParent); #endif return uParent; } uNodeIndex = GetRight(uParent); #if TRACE Log(">> Descend left from right sibling=%u ... ", uNodeIndex); #endif while (!IsLeaf(uNodeIndex)) uNodeIndex = GetLeft(uNodeIndex); #if TRACE Log("bottom out at leaf=%u\n", uNodeIndex); #endif return uNodeIndex; } unsigned Tree::NextDepthFirstNodeR(unsigned uNodeIndex) const { #if TRACE Log("NextDepthFirstNode(%3u) ", uNodeIndex); #endif assert(IsRooted()); assert(uNodeIndex < m_uNodeCount); if (IsRoot(uNodeIndex)) { #if TRACE Log(">> Node %u is root, end of traversal\n", uNodeIndex); #endif return NULL_NEIGHBOR; } unsigned uParent = GetParent(uNodeIndex); if (GetLeft(uParent) == uNodeIndex) { #if TRACE Log(">> Is left branch, return parent=%u\n", uParent); #endif return uParent; } uNodeIndex = GetLeft(uParent); #if TRACE Log(">> Descend right from left sibling=%u ... 
", uNodeIndex); #endif while (!IsLeaf(uNodeIndex)) uNodeIndex = GetRight(uNodeIndex); #if TRACE Log("bottom out at leaf=%u\n", uNodeIndex); #endif return uNodeIndex; } void Tree::UnrootByDeletingRoot() { assert(IsRooted()); assert(m_uNodeCount >= 3); const unsigned uLeft = GetLeft(m_uRootNodeIndex); const unsigned uRight = GetRight(m_uRootNodeIndex); m_uNeighbor1[uLeft] = uRight; m_uNeighbor1[uRight] = uLeft; bool bHasEdgeLength = HasEdgeLength(m_uRootNodeIndex, uLeft) && HasEdgeLength(m_uRootNodeIndex, uRight); if (bHasEdgeLength) { double dEdgeLength = GetEdgeLength(m_uRootNodeIndex, uLeft) + GetEdgeLength(m_uRootNodeIndex, uRight); m_dEdgeLength1[uLeft] = dEdgeLength; m_dEdgeLength1[uRight] = dEdgeLength; } // Remove root node entry from arrays const unsigned uMoveCount = m_uNodeCount - m_uRootNodeIndex; const unsigned uUnsBytes = uMoveCount*sizeof(unsigned); memmove(m_uNeighbor1 + m_uRootNodeIndex, m_uNeighbor1 + m_uRootNodeIndex + 1, uUnsBytes); memmove(m_uNeighbor2 + m_uRootNodeIndex, m_uNeighbor2 + m_uRootNodeIndex + 1, uUnsBytes); memmove(m_uNeighbor3 + m_uRootNodeIndex, m_uNeighbor3 + m_uRootNodeIndex + 1, uUnsBytes); const unsigned uDoubleBytes = uMoveCount*sizeof(double); memmove(m_dEdgeLength1 + m_uRootNodeIndex, m_dEdgeLength1 + m_uRootNodeIndex + 1, uDoubleBytes); memmove(m_dEdgeLength2 + m_uRootNodeIndex, m_dEdgeLength2 + m_uRootNodeIndex + 1, uDoubleBytes); memmove(m_dEdgeLength3 + m_uRootNodeIndex, m_dEdgeLength3 + m_uRootNodeIndex + 1, uDoubleBytes); const unsigned uBoolBytes = uMoveCount*sizeof(bool); memmove(m_bHasEdgeLength1 + m_uRootNodeIndex, m_bHasEdgeLength1 + m_uRootNodeIndex + 1, uBoolBytes); memmove(m_bHasEdgeLength2 + m_uRootNodeIndex, m_bHasEdgeLength2 + m_uRootNodeIndex + 1, uBoolBytes); memmove(m_bHasEdgeLength3 + m_uRootNodeIndex, m_bHasEdgeLength3 + m_uRootNodeIndex + 1, uBoolBytes); const unsigned uPtrBytes = uMoveCount*sizeof(char *); memmove(m_ptrName + m_uRootNodeIndex, m_ptrName + m_uRootNodeIndex + 1, uPtrBytes); 
--m_uNodeCount; m_bRooted = false; // Fix up table entries for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { #define DEC(x) if (x != NULL_NEIGHBOR && x > m_uRootNodeIndex) --x; DEC(m_uNeighbor1[uNodeIndex]) DEC(m_uNeighbor2[uNodeIndex]) DEC(m_uNeighbor3[uNodeIndex]) #undef DEC } Validate(); } unsigned Tree::GetLeafParent(unsigned uNodeIndex) const { assert(IsLeaf(uNodeIndex)); if (IsRooted()) return GetParent(uNodeIndex); if (m_uNeighbor1[uNodeIndex] != NULL_NEIGHBOR) return m_uNeighbor1[uNodeIndex]; if (m_uNeighbor2[uNodeIndex] != NULL_NEIGHBOR) return m_uNeighbor2[uNodeIndex]; return m_uNeighbor3[uNodeIndex]; } // TODO: This is not efficient for large trees, should cache. double Tree::GetNodeHeight(unsigned uNodeIndex) const { if (!IsRooted()) Quit("Tree::GetNodeHeight: undefined unless rooted tree"); if (IsLeaf(uNodeIndex)) return 0.0; if (m_bHasHeight[uNodeIndex]) return m_dHeight[uNodeIndex]; const unsigned uLeft = GetLeft(uNodeIndex); const unsigned uRight = GetRight(uNodeIndex); double dLeftLength = GetEdgeLength(uNodeIndex, uLeft); double dRightLength = GetEdgeLength(uNodeIndex, uRight); if (dLeftLength < 0) dLeftLength = 0; if (dRightLength < 0) dRightLength = 0; const double dLeftHeight = dLeftLength + GetNodeHeight(uLeft); const double dRightHeight = dRightLength + GetNodeHeight(uRight); const double dHeight = (dLeftHeight + dRightHeight)/2; m_bHasHeight[uNodeIndex] = true; m_dHeight[uNodeIndex] = dHeight; return dHeight; } unsigned Tree::GetNeighborSubscript(unsigned uNodeIndex, unsigned uNeighborIndex) const { assert(uNodeIndex < m_uNodeCount); assert(uNeighborIndex < m_uNodeCount); if (uNeighborIndex == m_uNeighbor1[uNodeIndex]) return 0; if (uNeighborIndex == m_uNeighbor2[uNodeIndex]) return 1; if (uNeighborIndex == m_uNeighbor3[uNodeIndex]) return 2; return NULL_NEIGHBOR; } unsigned Tree::GetNeighbor(unsigned uNodeIndex, unsigned uNeighborSubscript) const { switch (uNeighborSubscript) { case 0: return m_uNeighbor1[uNodeIndex]; 
case 1: return m_uNeighbor2[uNodeIndex]; case 2: return m_uNeighbor3[uNodeIndex]; } Quit("Tree::GetNeighbor, sub=%u", uNeighborSubscript); return NULL_NEIGHBOR; } // TODO: check if this is a performance issue, could cache a lookup table unsigned Tree::LeafIndexToNodeIndex(unsigned uLeafIndex) const { const unsigned uNodeCount = GetNodeCount(); unsigned uLeafCount = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (IsLeaf(uNodeIndex)) { if (uLeafCount == uLeafIndex) return uNodeIndex; else ++uLeafCount; } } Quit("LeafIndexToNodeIndex: out of range"); return 0; } unsigned Tree::GetLeafNodeIndex(const char *ptrName) const { const unsigned uNodeCount = GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!IsLeaf(uNodeIndex)) continue; const char *ptrLeafName = GetLeafName(uNodeIndex); if (0 == strcmp(ptrName, ptrLeafName)) return uNodeIndex; } Quit("Tree::GetLeafNodeIndex, name not found"); return 0; } void Tree::Copy(const Tree &tree) { const unsigned uNodeCount = tree.GetNodeCount(); InitCache(uNodeCount); m_uNodeCount = uNodeCount; const size_t UnsignedBytes = uNodeCount*sizeof(unsigned); const size_t DoubleBytes = uNodeCount*sizeof(double); const size_t BoolBytes = uNodeCount*sizeof(bool); memcpy(m_uNeighbor1, tree.m_uNeighbor1, UnsignedBytes); memcpy(m_uNeighbor2, tree.m_uNeighbor2, UnsignedBytes); memcpy(m_uNeighbor3, tree.m_uNeighbor3, UnsignedBytes); memcpy(m_Ids, tree.m_Ids, UnsignedBytes); memcpy(m_dEdgeLength1, tree.m_dEdgeLength1, DoubleBytes); memcpy(m_dEdgeLength2, tree.m_dEdgeLength2, DoubleBytes); memcpy(m_dEdgeLength3, tree.m_dEdgeLength3, DoubleBytes); memcpy(m_dHeight, tree.m_dHeight, DoubleBytes); memcpy(m_bHasEdgeLength1, tree.m_bHasEdgeLength1, BoolBytes); memcpy(m_bHasEdgeLength2, tree.m_bHasEdgeLength2, BoolBytes); memcpy(m_bHasEdgeLength3, tree.m_bHasEdgeLength3, BoolBytes); memcpy(m_bHasHeight, tree.m_bHasHeight, BoolBytes); m_uRootNodeIndex = tree.m_uRootNodeIndex; m_bRooted = 
tree.m_bRooted; for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { if (tree.IsLeaf(uNodeIndex)) { const char *ptrName = tree.GetLeafName(uNodeIndex); m_ptrName[uNodeIndex] = strsave(ptrName); } else m_ptrName[uNodeIndex] = 0; } #if DEBUG Validate(); #endif } // Create rooted tree from a vector description. // Node indexes are 0..N-1 for leaves, N..2N-2 for // internal nodes. // Vector subscripts are i-N and have values for // internal nodes only, but those values are node // indexes 0..2N-2. So e.g. if N=6 and Left[2]=1, // this means that the third internal node (node index 8) // has the second leaf (node index 1) as its left child. // uRoot gives the vector subscript of the root, so add N // to get the node index. void Tree::Create(unsigned uLeafCount, unsigned uRoot, const unsigned Left[], const unsigned Right[], const float LeftLength[], const float RightLength[], const unsigned LeafIds[], char **LeafNames) { Clear(); m_uNodeCount = 2*uLeafCount - 1; InitCache(m_uNodeCount); for (unsigned uNodeIndex = 0; uNodeIndex < uLeafCount; ++uNodeIndex) { m_Ids[uNodeIndex] = LeafIds[uNodeIndex]; m_ptrName[uNodeIndex] = strsave(LeafNames[uNodeIndex]); } for (unsigned uNodeIndex = uLeafCount; uNodeIndex < m_uNodeCount; ++uNodeIndex) { unsigned v = uNodeIndex - uLeafCount; unsigned uLeft = Left[v]; unsigned uRight = Right[v]; float fLeft = LeftLength[v]; float fRight = RightLength[v]; m_uNeighbor2[uNodeIndex] = uLeft; m_uNeighbor3[uNodeIndex] = uRight; m_bHasEdgeLength2[uNodeIndex] = true; m_bHasEdgeLength3[uNodeIndex] = true; m_dEdgeLength2[uNodeIndex] = fLeft; m_dEdgeLength3[uNodeIndex] = fRight; m_uNeighbor1[uLeft] = uNodeIndex; m_uNeighbor1[uRight] = uNodeIndex; m_dEdgeLength1[uLeft] = fLeft; m_dEdgeLength1[uRight] = fRight; m_bHasEdgeLength1[uLeft] = true; m_bHasEdgeLength1[uRight] = true; } m_bRooted = true; m_uRootNodeIndex = uRoot + uLeafCount; Validate(); } phyfromclust.cpp0000664000175000017500000000526712360262614012425 0ustar bobbob#include 
"muscle.h" #include "tree.h" #include "clust.h" void Tree::InitCache(unsigned uCacheCount) { m_uCacheCount = uCacheCount; m_uNeighbor1 = new unsigned[m_uCacheCount]; m_uNeighbor2 = new unsigned[m_uCacheCount]; m_uNeighbor3 = new unsigned[m_uCacheCount]; m_Ids = new unsigned[m_uCacheCount]; m_dEdgeLength1 = new double[m_uCacheCount]; m_dEdgeLength2 = new double[m_uCacheCount]; m_dEdgeLength3 = new double[m_uCacheCount]; m_dHeight = new double[m_uCacheCount]; m_bHasEdgeLength1 = new bool[m_uCacheCount]; m_bHasEdgeLength2 = new bool[m_uCacheCount]; m_bHasEdgeLength3 = new bool[m_uCacheCount]; m_bHasHeight = new bool[m_uCacheCount]; m_ptrName = new char *[m_uCacheCount]; for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { m_uNeighbor1[uNodeIndex] = NULL_NEIGHBOR; m_uNeighbor2[uNodeIndex] = NULL_NEIGHBOR; m_uNeighbor3[uNodeIndex] = NULL_NEIGHBOR; m_bHasEdgeLength1[uNodeIndex] = false; m_bHasEdgeLength2[uNodeIndex] = false; m_bHasEdgeLength3[uNodeIndex] = false; m_bHasHeight[uNodeIndex] = false; m_dEdgeLength1[uNodeIndex] = dInsane; m_dEdgeLength2[uNodeIndex] = dInsane; m_dEdgeLength3[uNodeIndex] = dInsane; m_dHeight[uNodeIndex] = dInsane; m_ptrName[uNodeIndex] = 0; m_Ids[uNodeIndex] = uInsane; } } void Tree::FromClust(Clust &C) { Clear(); m_uNodeCount = C.GetNodeCount(); InitCache(m_uNodeCount); // Cluster is always rooted. An unrooted cluster // is represented by a pseudo-root, which we fix later. 
m_bRooted = true; const unsigned uRoot = C.GetRootNodeIndex(); m_uRootNodeIndex = uRoot; m_uNeighbor1[uRoot] = NULL_NEIGHBOR; m_bHasEdgeLength1[uRoot] = false; for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { if (C.IsLeaf(uNodeIndex)) { const char *ptrName = C.GetNodeName(uNodeIndex); m_ptrName[uNodeIndex] = strsave(ptrName); m_Ids[uNodeIndex] = C.GetNodeId(uNodeIndex); continue; } const unsigned uLeft = C.GetLeftIndex(uNodeIndex); const unsigned uRight = C.GetRightIndex(uNodeIndex); const double dLeftLength = C.GetLength(uLeft); const double dRightLength = C.GetLength(uRight); m_uNeighbor2[uNodeIndex] = uLeft; m_uNeighbor3[uNodeIndex] = uRight; m_dEdgeLength1[uLeft] = dLeftLength; m_dEdgeLength1[uRight] = dRightLength; m_uNeighbor1[uLeft] = uNodeIndex; m_uNeighbor1[uRight] = uNodeIndex; m_bHasEdgeLength1[uLeft] = true; m_bHasEdgeLength1[uRight] = true; m_dEdgeLength2[uNodeIndex] = dLeftLength; m_dEdgeLength3[uNodeIndex] = dRightLength; m_bHasEdgeLength2[uNodeIndex] = true; m_bHasEdgeLength3[uNodeIndex] = true; } Validate(); } phyfromfile.cpp0000664000175000017500000001342612360262614012206 0ustar bobbob#include "muscle.h" #include "tree.h" #include "textfile.h" #define TRACE 0 // Tokens in Newick files are: // ( ) : , ; // string // 'string' // "string" // [ comment ] // // We can't safely distinguish between identifiers and floating point // numbers at the lexical level (because identifiers may be numeric, // or start with digits), so both edge lengths and identifiers are // returned as strings. 
const char *Tree::NTTStr(NEWICK_TOKEN_TYPE NTT) const { switch (NTT) { #define c(x) case NTT_##x: return #x; c(Unknown) c(Lparen) c(Rparen) c(Colon) c(Comma) c(Semicolon) c(String) c(SingleQuotedString) c(DoubleQuotedString) c(Comment) #undef c } return "??"; } NEWICK_TOKEN_TYPE Tree::GetToken(TextFile &File, char szToken[], unsigned uBytes) const { // Skip leading white space File.SkipWhite(); char c; File.GetCharX(c); // In case a single-character token szToken[0] = c; szToken[1] = 0; unsigned uBytesCopied = 0; NEWICK_TOKEN_TYPE TT; switch (c) { case '(': return NTT_Lparen; case ')': return NTT_Rparen; case ':': return NTT_Colon; case ';': return NTT_Semicolon; case ',': return NTT_Comma; case '\'': TT = NTT_SingleQuotedString; File.GetCharX(c); break; case '"': TT = NTT_DoubleQuotedString; File.GetCharX(c); break; case '[': TT = NTT_Comment; break; default: TT = NTT_String; break; } for (;;) { if (TT != NTT_Comment) { if (uBytesCopied < uBytes - 2) { szToken[uBytesCopied++] = c; szToken[uBytesCopied] = 0; } else Quit("Tree::GetToken: input buffer too small, token so far='%s'", szToken); } bool bEof = File.GetChar(c); if (bEof) return TT; switch (TT) { case NTT_String: if (0 != strchr("():;,", c)) { File.PushBack(c); return NTT_String; } if (isspace(c)) return NTT_String; break; case NTT_SingleQuotedString: if ('\'' == c) return NTT_String; break; case NTT_DoubleQuotedString: if ('"' == c) return NTT_String; break; case NTT_Comment: if (']' == c) return GetToken(File, szToken, uBytes); break; default: Quit("Tree::GetToken, invalid TT=%u", TT); } } } // NOTE: this hack must come after definition of Tree::GetToken. #if TRACE #define GetToken GetTokenVerbose #endif void Tree::FromFile(TextFile &File) { // Assume rooted. // If we discover that it is unrooted, will convert on the fly. CreateRooted(); double dEdgeLength; bool bEdgeLength = GetGroupFromFile(File, 0, &dEdgeLength); // Next token should be either ';' for rooted tree or ',' for unrooted. 
char szToken[16]; NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken)); // If rooted, all done. if (NTT_Semicolon == NTT) { if (bEdgeLength) Log(" *** Warning *** edge length on root group in Newick file %s\n", File.GetFileName()); Validate(); return; } if (NTT_Comma != NTT) Quit("Tree::FromFile, expected ';' or ',', got '%s'", szToken); const unsigned uThirdNode = UnrootFromFile(); bEdgeLength = GetGroupFromFile(File, uThirdNode, &dEdgeLength); if (bEdgeLength) SetEdgeLength(0, uThirdNode, dEdgeLength); Validate(); } // Return true if edge length for this group. bool Tree::GetGroupFromFile(TextFile &File, unsigned uNodeIndex, double *ptrdEdgeLength) { char szToken[1024]; NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken)); // Group is either leaf name or (left, right). if (NTT_String == NTT) { SetLeafName(uNodeIndex, szToken); #if TRACE Log("Group is leaf '%s'\n", szToken); #endif } else if (NTT_Lparen == NTT) { const unsigned uLeft = AppendBranch(uNodeIndex); const unsigned uRight = uLeft + 1; // Left sub-group... #if TRACE Log("Got '(', group is compound, expect left sub-group\n"); #endif double dEdgeLength; bool bLeftLength = GetGroupFromFile(File, uLeft, &dEdgeLength); #if TRACE if (bLeftLength) Log("Edge length for left sub-group: %.3g\n", dEdgeLength); else Log("No edge length for left sub-group\n"); #endif if (bLeftLength) SetEdgeLength(uNodeIndex, uLeft, dEdgeLength); // ... then comma ... #if TRACE Log("Expect comma\n"); #endif NTT = GetToken(File, szToken, sizeof(szToken)); if (NTT_Comma != NTT) Quit("Tree::GetGroupFromFile, expected ',', got '%s'", szToken); // ...then right sub-group... #if TRACE Log("Expect right sub-group\n"); #endif bool bRightLength = GetGroupFromFile(File, uRight, &dEdgeLength); if (bRightLength) SetEdgeLength(uNodeIndex, uRight, dEdgeLength); #if TRACE if (bRightLength) Log("Edge length for right sub-group: %.3g\n", dEdgeLength); else Log("No edge length for right sub-group\n"); #endif // ... 
then closing parenthesis. #if TRACE Log("Expect closing parenthesis (or comma if > 2-ary)\n"); #endif NTT = GetToken(File, szToken, sizeof(szToken)); if (NTT_Rparen == NTT) ; else if (NTT_Comma == NTT) { File.PushBack(','); return false; } else Quit("Tree::GetGroupFromFile, expected ')' or ',', got '%s'", szToken); } else Quit("Tree::GetGroupFromFile, expected '(' or leaf name, got '%s'", szToken); // Group may optionally be followed by edge length. bool bEof = File.SkipWhiteX(); if (bEof) return false; char c; File.GetCharX(c); #if TRACE Log("Character following group, could be colon, is '%c'\n", c); #endif if (':' == c) { NTT = GetToken(File, szToken, sizeof(szToken)); if (NTT_String != NTT) Quit("Tree::GetGroupFromFile, expected edge length, got '%s'", szToken); *ptrdEdgeLength = atof(szToken); return true; } File.PushBack(c); return false; } physeq.cpp0000664000175000017500000000513712360262613011172 0ustar bobbob#include "muscle.h" #include "msa.h" #include "textfile.h" const int BLOCKSIZE = 60; static char FixChar(char c) { switch (c) { case '(': case ')': case '[': case ']': case ':': case ';': case ',': return '_'; } if (!isprint(c)) return '_'; return c; } static void FixName(char Name[]) { while (char c = *Name) *Name++ = FixChar(c); } void MSA::ToPhySequentialFile(TextFile &File) const { const unsigned SeqCount = GetSeqCount(); const unsigned ColCount = GetColCount(); File.PutFormat("%d %d\n", SeqCount, ColCount); if (0 == ColCount) return; for (unsigned Seq = 0; Seq < SeqCount; ++Seq) { char Name[11]; const char *ptrName = GetSeqName(Seq); size_t n = strlen(ptrName); if (n > 10) n = 10; memcpy(Name, ptrName, n); Name[n] = 0; FixName(Name); File.PutFormat("%-10.10s", Name); int BlockIndex = 0; int Col = 0; for (;;) { const unsigned MaxCols = (BlockIndex == 0) ? 
(BLOCKSIZE - 10) : BLOCKSIZE; for (unsigned ColsThisBlock = 0; ColsThisBlock < MaxCols; ++ColsThisBlock) { if (Col == ColCount) break; if (ColsThisBlock%10 == 0 && (BlockIndex == 0 || ColsThisBlock > 0)) File.PutChar(' '); char c = GetChar(Seq, Col); if (isalpha(c)) c = toupper(c); File.PutChar(c); ++Col; } File.PutChar('\n'); if (Col == ColCount) break; ++BlockIndex; } } } void MSA::ToPhyInterleavedFile(TextFile &File) const { const unsigned SeqCount = GetSeqCount(); const unsigned ColCount = GetColCount(); File.PutFormat("%d %d\n", SeqCount, ColCount); if (0 == ColCount) return; int Col = 0; for (;;) { const unsigned ColBlockStart = Col; const unsigned MaxCols = (ColBlockStart == 0) ? (BLOCKSIZE - 10) : BLOCKSIZE; for (unsigned Seq = 0; Seq < SeqCount; ++Seq) { if (0 == ColBlockStart) { char Name[11]; const char *ptrName = GetSeqName(Seq); size_t n = strlen(ptrName); if (n > 10) n = 10; memcpy(Name, ptrName, n); Name[n] = 0; FixName(Name); File.PutFormat("%-10.10s", Name); } Col = ColBlockStart; for (unsigned ColsThisBlock = 0; ColsThisBlock < MaxCols; ++ColsThisBlock) { if (Col == ColCount) break; if (ColsThisBlock%10 == 0 && (0 == ColBlockStart || ColsThisBlock > 0)) File.PutChar(' '); char c = GetChar(Seq, Col); if (isalpha(c)) c = toupper(c); File.PutChar(c); ++Col; } File.PutChar('\n'); } if (Col == ColCount) break; File.PutChar('\n'); } } phytofile.cpp0000664000175000017500000000406312360262614011662 0ustar bobbob#include "muscle.h" #include "tree.h" #include "textfile.h" unsigned Tree::GetAnyNonLeafNode() const { for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) if (!IsLeaf(uNodeIndex)) return uNodeIndex; return NULL_NEIGHBOR; } void Tree::ToFile(TextFile &File) const { if (IsRooted()) { ToFileNodeRooted(File, m_uRootNodeIndex); File.PutString(";\n"); return; } // Unrooted. 
unsigned uNodeIndex = GetAnyNonLeafNode(); File.PutString("(\n"); ToFileNodeUnrooted(File, m_uNeighbor1[uNodeIndex], uNodeIndex); File.PutString(",\n"); ToFileNodeUnrooted(File, m_uNeighbor2[uNodeIndex], uNodeIndex); File.PutString(",\n"); ToFileNodeUnrooted(File, m_uNeighbor3[uNodeIndex], uNodeIndex); File.PutString(");\n"); } void Tree::ToFileNodeUnrooted(TextFile &File, unsigned uNodeIndex, unsigned uParent) const { assert(!IsRooted()); bool bGroup = !IsLeaf(uNodeIndex); if (bGroup) File.PutString("(\n"); if (IsLeaf(uNodeIndex)) File.PutString(GetName(uNodeIndex)); else { ToFileNodeUnrooted(File, GetFirstNeighbor(uNodeIndex, uParent), uNodeIndex); File.PutString(",\n"); ToFileNodeUnrooted(File, GetSecondNeighbor(uNodeIndex, uParent), uNodeIndex); } if (bGroup) File.PutString(")"); if (HasEdgeLength(uNodeIndex, uParent)) File.PutFormat(":%g", GetEdgeLength(uNodeIndex, uParent)); File.PutString("\n"); } void Tree::ToFileNodeRooted(TextFile &File, unsigned uNodeIndex) const { assert(IsRooted()); bool bGroup = !IsLeaf(uNodeIndex) || IsRoot(uNodeIndex); if (bGroup) File.PutString("(\n"); if (IsLeaf(uNodeIndex)) File.PutString(GetName(uNodeIndex)); else { ToFileNodeRooted(File, GetLeft(uNodeIndex)); File.PutString(",\n"); ToFileNodeRooted(File, GetRight(uNodeIndex)); } if (bGroup) File.PutString(")"); if (!IsRoot(uNodeIndex)) { unsigned uParent = GetParent(uNodeIndex); if (HasEdgeLength(uNodeIndex, uParent)) File.PutFormat(":%g", GetEdgeLength(uNodeIndex, uParent)); } File.PutString("\n"); } posgap.cpp0000664000175000017500000000541712360262614011154 0ustar bobbob#include "muscle.h" //// Pascaralle and Argos gap factors //// after Table 1 in Thompson et. al. ClustalW NAR paper. 
//static double PAFFacs[20] = // { // 1.13, // A // 1.13, // C // 0.96, // D // 1.31, // E // 1.20, // F // 0.61, // G // 1.00, // H // 1.32, // I // 0.96, // K // 1.21, // L // 1.29, // M // 0.62, // N // 0.74, // P // 1.07, // Q // 0.72, // R // 0.76, // S // 0.89, // T // 1.25, // V // 1.00, // Y // 1.23, // W // }; // //// (Not used: does not appear to work well). //SCORE PAFactor(const FCOUNT fcCounts[]) // { // if (ALPHA_Amino != g_Alpha) // Quit("PAFFactor: requires amino acid sequence"); // // FCOUNT fLetterCount = 0; // double dSum = 0; // for (unsigned uLetter = 0; uLetter < 20; ++uLetter) // { // const FCOUNT fCount = fcCounts[uLetter]; // dSum += fCount*PAFFacs[uLetter]; // fLetterCount += fCount; // } // if (0 == fLetterCount) // return 0.5; // return (SCORE) (dSum/fLetterCount); // } //static bool Hydrophilic[20] = // { // false, // A // false, // C // true, // D // true, // E // false, // F // true, // G // false, // H // false, // I // true, // K // false, // L // false, // M // true, // N // true, // P // true, // Q // true, // R // true, // S // false, // T // false, // V // false, // Y // false, // W // }; // //bool IsHydrophilic(const FCOUNT fcCounts[]) // { // if (ALPHA_Amino != g_Alpha) // Quit("IsHydrophilic: requires amino acid sequence"); // // for (unsigned uLetter = 0; uLetter < 20; ++uLetter) // if (fcCounts[uLetter] > 0 && !Hydrophilic[uLetter]) // return false; // return true; // } // //bool IsHydrophilic(const unsigned uCounts[]) // { // if (ALPHA_Amino != g_Alpha) // Quit("IsHydrophilic: requires amino acid sequence"); // // for (unsigned uLetter = 0; uLetter < 20; ++uLetter) // if (uCounts[uLetter] > 0 && !Hydrophilic[uLetter]) // return false; // return true; // } // LIVCATMFYWHK // Venn Pascaralla B&T Me // L y y y // I y y y // V y y y // C y n // A y y y // T N n // M y y y // F y y y // Y n n // W y n // H n n // K n n static bool Hydrophobic[20] = { true, // A true, // C false, // D false, // E true, // F false, // G true, // 
H true, // I false, // K true, // L true, // M false, // N false, // P false, // Q false, // R false, // S true, // T true, // V true, // Y true, // W }; bool IsHydrophobic(const FCOUNT fcCounts[]) { if (ALPHA_Amino != g_Alpha) Quit("IsHydrophobic: requires amino acid sequence"); for (unsigned uLetter = 0; uLetter < 20; ++uLetter) if (fcCounts[uLetter] > 0.0 && !Hydrophobic[uLetter]) return false; return true; } ppscore.cpp0000664000175000017500000000402712360262614011332 0ustar bobbob#include "muscle.h" #include "textfile.h" #include "msa.h" #include "tree.h" #include "profile.h" #include "objscore.h" bool g_bTracePPScore = false; MSA *g_ptrPPScoreMSA1 = 0; MSA *g_ptrPPScoreMSA2 = 0; static ProfPos *ProfileFromMSALocal(MSA &msa, Tree &tree) { const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); TreeFromMSA(msa, tree, g_Cluster2, g_Distance2, g_Root1); SetMuscleTree(tree); return ProfileFromMSA(msa); } void PPScore() { if (0 == g_pstrFileName1 || 0 == g_pstrFileName2) Quit("-ppscore needs -in1 and -in2"); SetSeqWeightMethod(g_SeqWeight1); TextFile file1(g_pstrFileName1); TextFile file2(g_pstrFileName2); MSA msa1; MSA msa2; msa1.FromFile(file1); msa2.FromFile(file2); const unsigned uLength1 = msa1.GetColCount(); const unsigned uLength2 = msa2.GetColCount(); if (uLength1 != uLength2) Quit("Profiles must have the same length"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = msa1.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); msa1.FixAlpha(); msa2.FixAlpha(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) SetPPScore(PPSCORE_SPN); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); const unsigned uMaxSeqCount = (uSeqCount1 > uSeqCount2 ? 
uSeqCount1 : uSeqCount2); MSA::SetIdCount(uMaxSeqCount); Tree tree1; Tree tree2; ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1); ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2); g_bTracePPScore = true; g_ptrPPScoreMSA1 = &msa1; g_ptrPPScoreMSA2 = &msa2; SCORE Score = ObjScoreDP_Profs(Prof1, Prof2, uLength1); Log("Score=%.4g\n", Score); printf("Score=%.4g\n", Score); } profdb.cpp0000664000175000017500000000236212360262614011133 0ustar bobbob#include "muscle.h" #include "textfile.h" #include "seqvect.h" #include "distfunc.h" #include "msa.h" #include "tree.h" #include "clust.h" #include "profile.h" #include "clustsetmsa.h" void ProfDB() { SetOutputFileName(g_pstrOutFileName); SetInputFileName(g_pstrFileName2); SetStartTime(); TextFile file1(g_pstrFileName1); TextFile file2(g_pstrFileName2); SetMaxIters(g_uMaxIters); SetSeqWeightMethod(g_SeqWeight1); TextFile fileIn(g_pstrFileName1); MSA msa1; msa1.FromFile(fileIn); const unsigned uSeqCount1 = msa1.GetSeqCount(); if (0 == uSeqCount1) Quit("No sequences in input alignment"); SeqVect v; v.FromFASTAFile(file2); const unsigned uSeqCount2 = v.Length(); if (0 == uSeqCount2) Quit("No sequences in input alignment"); MSA::SetIdCount(uSeqCount1 + uSeqCount2); SetProgressDesc("Align sequence database to profile"); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount2; ++uSeqIndex) { Progress(uSeqIndex, uSeqCount2); Seq &s = *(v[uSeqIndex]); s.SetId(0); MSA msaTmp; msaTmp.FromSeq(s); MSA msaOut; ProfileProfile(msa1, msaTmp, msaOut); msa1.Copy(msaOut); } ProgressStepsDone(); TextFile fileOut(g_pstrOutFileName, true); msa1.ToFile(fileOut); } profile.cpp0000664000175000017500000000577512360262614011332 0ustar bobbob#include "muscle.h" #include "textfile.h" #include "msa.h" #include "tree.h" #include "profile.h" #include "objscore.h" bool TreeNeededForWeighting(SEQWEIGHT s) { switch (s) { case SEQWEIGHT_ClustalW: case SEQWEIGHT_ThreeWay: return true; default: return false; } } static ProfPos *ProfileFromMSALocal(MSA &msa, Tree 
&tree) { const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); if (TreeNeededForWeighting(g_SeqWeight2)) { TreeFromMSA(msa, tree, g_Cluster2, g_Distance2, g_Root1); SetMuscleTree(tree); } return ProfileFromMSA(msa); } void ProfileProfile(MSA &msa1, MSA &msa2, MSA &msaOut) { //ALPHA Alpha = ALPHA_Undefined; //switch (g_SeqType) // { //case SEQTYPE_Auto: // Alpha = msa1.GuessAlpha(); // break; //case SEQTYPE_Protein: // Alpha = ALPHA_Amino; // break; //case SEQTYPE_DNA: // Alpha = ALPHA_DNA; // break; //case SEQTYPE_RNA: // Alpha = ALPHA_RNA; // break; //default: // Quit("Invalid SeqType"); // } //SetAlpha(Alpha); //msa1.FixAlpha(); //msa2.FixAlpha(); unsigned uLength1; unsigned uLength2; uLength1 = msa1.GetColCount(); uLength2 = msa2.GetColCount(); Tree tree1; Tree tree2; ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1); ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2); PWPath Path; ProfPos *ProfOut; unsigned uLengthOut; Progress("Aligning profiles"); AlignTwoProfs(Prof1, uLength1, 1.0, Prof2, uLength2, 1.0, Path, &ProfOut, &uLengthOut); Progress("Building output"); AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut); } // Do profile-profile alignment void Profile() { if (0 == g_pstrFileName1 || 0 == g_pstrFileName2) Quit("-profile needs -in1 and -in2"); SetSeqWeightMethod(g_SeqWeight1); TextFile file1(g_pstrFileName1); TextFile file2(g_pstrFileName2); MSA msa1; MSA msa2; MSA msaOut; Progress("Reading %s", g_pstrFileName1); msa1.FromFile(file1); Progress("%u seqs %u cols", msa1.GetSeqCount(), msa1.GetColCount()); Progress("Reading %s", g_pstrFileName2); msa2.FromFile(file2); Progress("%u seqs %u cols", msa2.GetSeqCount(), msa2.GetColCount()); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = msa1.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; 
break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); msa1.FixAlpha(); msa2.FixAlpha(); SetPPScore(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) SetPPScore(PPSCORE_SPN); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); const unsigned uSumSeqCount = uSeqCount1 + uSeqCount2; MSA::SetIdCount(uSumSeqCount); ProfileProfile(msa1, msa2, msaOut); Progress("Writing output"); MuscleOutput(msaOut); } profilefrommsa.cpp0000664000175000017500000001702412360262614012705 0ustar bobbob#include "muscle.h" #include "msa.h" #include "profile.h" #define TRACE 0 static void LogF(FCOUNT f) { if (f > -0.00001 && f < 0.00001) Log(" "); else Log(" %5.3f", f); } static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (s < -1e10 || s > 1e10) return " *"; sprintf(str, "%5.1f", s); return str; } #if DOUBLE_AFFINE void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA) { Log(" Pos Occ LL LG GL GG Open Close Open2 Clos2\n"); Log(" --- --- -- -- -- -- ---- ----- ----- -----\n"); for (unsigned n = 0; n < uLength; ++n) { const ProfPos &PP = Prof[n]; Log("%5u", n); LogF(PP.m_fOcc); LogF(PP.m_LL); LogF(PP.m_LG); LogF(PP.m_GL); LogF(PP.m_GG); Log(" %s", LocalScoreToStr(-PP.m_scoreGapOpen)); Log(" %s", LocalScoreToStr(-PP.m_scoreGapClose)); Log(" %s", LocalScoreToStr(-PP.m_scoreGapOpen2)); Log(" %s", LocalScoreToStr(-PP.m_scoreGapClose2)); if (0 != ptrMSA) { const unsigned uSeqCount = ptrMSA->GetSeqCount(); Log(" "); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%c", ptrMSA->GetChar(uSeqIndex, n)); } Log("\n"); } Log("\n"); Log(" Pos G"); for (unsigned n = 0; n < g_AlphaSize; ++n) Log(" %c", LetterExToChar(n)); Log("\n"); Log(" --- -"); for (unsigned n = 0; n < g_AlphaSize; ++n) Log(" -----"); Log("\n"); for (unsigned n = 0; n < uLength; ++n) { const ProfPos &PP = Prof[n]; Log("%5u", n); if (-1 == PP.m_uResidueGroup) Log(" -", PP.m_uResidueGroup); else Log(" %d", PP.m_uResidueGroup); for 
(unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) { FCOUNT f = PP.m_fcCounts[uLetter]; if (f == 0.0) Log(" "); else Log(" %5.3f", f); } if (0 != ptrMSA) { const unsigned uSeqCount = ptrMSA->GetSeqCount(); Log(" "); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%c", ptrMSA->GetChar(uSeqIndex, n)); } Log("\n"); } } #endif // DOUBLE_AFFINE #if SINGLE_AFFINE void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA) { Log(" Pos Occ LL LG GL GG Open Close\n"); Log(" --- --- -- -- -- -- ---- -----\n"); for (unsigned n = 0; n < uLength; ++n) { const ProfPos &PP = Prof[n]; Log("%5u", n); LogF(PP.m_fOcc); LogF(PP.m_LL); LogF(PP.m_LG); LogF(PP.m_GL); LogF(PP.m_GG); Log(" %5.1f", -PP.m_scoreGapOpen); Log(" %5.1f", -PP.m_scoreGapClose); if (0 != ptrMSA) { const unsigned uSeqCount = ptrMSA->GetSeqCount(); Log(" "); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%c", ptrMSA->GetChar(uSeqIndex, n)); } Log("\n"); } Log("\n"); Log(" Pos G"); for (unsigned n = 0; n < g_AlphaSize; ++n) Log(" %c", LetterExToChar(n)); Log("\n"); Log(" --- -"); for (unsigned n = 0; n < g_AlphaSize; ++n) Log(" -----"); Log("\n"); for (unsigned n = 0; n < uLength; ++n) { const ProfPos &PP = Prof[n]; Log("%5u", n); if (-1 == PP.m_uResidueGroup) Log(" -", PP.m_uResidueGroup); else Log(" %d", PP.m_uResidueGroup); for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) { FCOUNT f = PP.m_fcCounts[uLetter]; if (f == 0.0) Log(" "); else Log(" %5.3f", f); } if (0 != ptrMSA) { const unsigned uSeqCount = ptrMSA->GetSeqCount(); Log(" "); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log("%c", ptrMSA->GetChar(uSeqIndex, n)); } Log("\n"); } } #endif void SortCounts(const FCOUNT fcCounts[], unsigned SortOrder[]) { static unsigned InitialSortOrder[MAX_ALPHA] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 }; memcpy(SortOrder, InitialSortOrder, g_AlphaSize*sizeof(unsigned)); bool bAny = true; while 
(bAny) { bAny = false; for (unsigned n = 0; n < g_AlphaSize - 1; ++n) { unsigned i1 = SortOrder[n]; unsigned i2 = SortOrder[n+1]; if (fcCounts[i1] < fcCounts[i2]) { SortOrder[n+1] = i1; SortOrder[n] = i2; bAny = true; } } } } static unsigned AminoGroupFromFCounts(const FCOUNT fcCounts[]) { bool bAny = false; unsigned uConsensusResidueGroup = RESIDUE_GROUP_MULTIPLE; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { if (0 == fcCounts[uLetter]) continue; const unsigned uResidueGroup = ResidueGroup[uLetter]; if (bAny) { if (uResidueGroup != uConsensusResidueGroup) return RESIDUE_GROUP_MULTIPLE; } else { bAny = true; uConsensusResidueGroup = uResidueGroup; } } return uConsensusResidueGroup; } static unsigned NucleoGroupFromFCounts(const FCOUNT fcCounts[]) { bool bAny = false; unsigned uConsensusResidueGroup = RESIDUE_GROUP_MULTIPLE; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) { if (0 == fcCounts[uLetter]) continue; const unsigned uResidueGroup = uLetter; if (bAny) { if (uResidueGroup != uConsensusResidueGroup) return RESIDUE_GROUP_MULTIPLE; } else { bAny = true; uConsensusResidueGroup = uResidueGroup; } } return uConsensusResidueGroup; } unsigned ResidueGroupFromFCounts(const FCOUNT fcCounts[]) { switch (g_Alpha) { case ALPHA_Amino: return AminoGroupFromFCounts(fcCounts); case ALPHA_DNA: case ALPHA_RNA: return NucleoGroupFromFCounts(fcCounts); } Quit("ResidueGroupFromFCounts: bad alpha"); return 0; } ProfPos *ProfileFromMSA(const MSA &a) { const unsigned uSeqCount = a.GetSeqCount(); const unsigned uColCount = a.GetColCount(); // Yuck -- cast away const (inconsistent design here). 
SetMSAWeightsMuscle((MSA &) a); ProfPos *Pos = new ProfPos[uColCount]; unsigned uHydrophobicRunLength = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { ProfPos &PP = Pos[uColIndex]; PP.m_bAllGaps = a.IsGapColumn(uColIndex); FCOUNT fcGapStart; FCOUNT fcGapEnd; FCOUNT fcGapExtend; FCOUNT fOcc; a.GetFractionalWeightedCounts(uColIndex, g_bNormalizeCounts, PP.m_fcCounts, &fcGapStart, &fcGapEnd, &fcGapExtend, &fOcc, &PP.m_LL, &PP.m_LG, &PP.m_GL, &PP.m_GG); PP.m_fOcc = fOcc; SortCounts(PP.m_fcCounts, PP.m_uSortOrder); PP.m_uResidueGroup = ResidueGroupFromFCounts(PP.m_fcCounts); for (unsigned i = 0; i < g_AlphaSize; ++i) { SCORE scoreSum = 0; for (unsigned j = 0; j < g_AlphaSize; ++j) scoreSum += PP.m_fcCounts[j]*(*g_ptrScoreMatrix)[i][j]; PP.m_AAScores[i] = scoreSum; } SCORE sStartOcc = (SCORE) (1.0 - fcGapStart); SCORE sEndOcc = (SCORE) (1.0 - fcGapEnd); PP.m_fcStartOcc = sStartOcc; PP.m_fcEndOcc = sEndOcc; PP.m_scoreGapOpen = sStartOcc*g_scoreGapOpen/2; PP.m_scoreGapClose = sEndOcc*g_scoreGapOpen/2; #if DOUBLE_AFFINE PP.m_scoreGapOpen2 = sStartOcc*g_scoreGapOpen2/2; PP.m_scoreGapClose2 = sEndOcc*g_scoreGapOpen2/2; #endif // PP.m_scoreGapExtend = (SCORE) ((1.0 - fcGapExtend)*scoreGapExtend); #if PAF if (ALHPA_Amino == g_Alpha && sStartOcc > 0.5) { extern SCORE PAFactor(const FCOUNT fcCounts[]); SCORE paf = PAFactor(PP.m_fcCounts); PP.m_scoreGapOpen *= paf; PP.m_scoreGapClose *= paf; } #endif } #if HYDRO if (ALPHA_Amino == g_Alpha) Hydro(Pos, uColCount); #endif #if TRACE { Log("ProfileFromMSA\n"); ListProfile(Pos, uColCount, &a); } #endif return Pos; } progalign.cpp0000664000175000017500000001260612360262614011643 0ustar bobbob#include "muscle.h" #include "tree.h" #include "seqvect.h" #include "profile.h" #include "msa.h" #include "pwpath.h" #include "distfunc.h" #include "textfile.h" #include "estring.h" #define TRACE 0 #define VALIDATE 0 #define TRACE_LENGTH_DELTA 0 static void LogLeafNames(const Tree &tree, unsigned uNodeIndex) { const unsigned 
uNodeCount = tree.GetNodeCount();
	// Sized for the worst case: no subtree has more leaves than the tree has nodes.
	unsigned *Leaves = new unsigned[uNodeCount];
	unsigned uLeafCount;
	GetLeaves(tree, uNodeIndex, Leaves, &uLeafCount);
	// Comma-separated list of leaf names.
	for (unsigned i = 0; i < uLeafCount; ++i) {
		if (i > 0)
			Log(",");
		Log("%s", tree.GetLeafName(Leaves[i]));
	}
	delete[] Leaves;
}

// Progressive alignment that keeps profiles, paths and edit strings per tree
// node instead of per-node alignments. Walks the rooted guide tree depth
// first: leaves get a one-sequence MSA and its profile; internal nodes align
// the profiles of their two children. Returns the new[]-allocated ProgNode
// array (one entry per tree node).
ProgNode *ProgressiveAlignE(const SeqVect &v, const Tree &GuideTree, MSA &a)
{
	assert(GuideTree.IsRooted());

#if	TRACE
	Log("GuideTree:\n");
	GuideTree.LogMe();
#endif

	const unsigned uSeqCount = v.Length();
	// N leaves plus N-1 internal nodes.
	const unsigned uNodeCount = 2*uSeqCount - 1;
	const unsigned uIterCount = uSeqCount - 1;

	WEIGHT *Weights = new WEIGHT[uSeqCount];
	CalcClustalWWeights(GuideTree, Weights);

	ProgNode *ProgNodes = new ProgNode[uNodeCount];

	unsigned uJoin = 0;
	unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
	SetProgressDesc("Align node");
	do {
		if (GuideTree.IsLeaf(uTreeNodeIndex)) {
			if (uTreeNodeIndex >= uNodeCount)
				Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount);
			ProgNode &Node = ProgNodes[uTreeNodeIndex];
			unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex);
			if (uId >= uSeqCount)
				Quit("Seq index out of range");
			const Seq &s = *(v[uId]);
			Node.m_MSA.FromSeq(s);
			Node.m_MSA.SetSeqId(0, uId);
			Node.m_uLength = Node.m_MSA.GetColCount();
			Node.m_Weight = Weights[uId];
			// TODO: Term gaps settable
			Node.m_Prof = ProfileFromMSA(Node.m_MSA);
			Node.m_EstringL = 0;
			Node.m_EstringR = 0;
#if	TRACE
			Log("Leaf id=%u\n", uId);
			Log("MSA=\n");
			Node.m_MSA.LogMe();
			Log("Profile (from MSA)=\n");
			ListProfile(Node.m_Prof, Node.m_uLength, &Node.m_MSA);
#endif
		}
		else {
			Progress(uJoin, uSeqCount - 1);
			++uJoin;

			const unsigned uMergeNodeIndex = uTreeNodeIndex;
			ProgNode &Parent = ProgNodes[uMergeNodeIndex];

			const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
			const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);

			if (g_bVerbose) {
				Log("Align: (");
				LogLeafNames(GuideTree, uLeft);
				Log(") (");
				LogLeafNames(GuideTree, uRight);
				Log(")\n");
			}

			ProgNode &Node1 = ProgNodes[uLeft];
			ProgNode &Node2 = ProgNodes[uRight];

#if	TRACE
			Log("AlignTwoMSAs:\n");
#endif
			// Align the children's profiles; the parent receives the path,
			// the merged profile and its length.
			AlignTwoProfs(
			  Node1.m_Prof, Node1.m_uLength, Node1.m_Weight,
			  Node2.m_Prof, Node2.m_uLength, Node2.m_Weight,
			  Parent.m_Path,
			  &Parent.m_Prof, &Parent.m_uLength);
#if	TRACE_LENGTH_DELTA
			{
			unsigned L = Node1.m_uLength;
			unsigned R = Node2.m_uLength;
			unsigned P = Parent.m_Path.GetEdgeCount();
			unsigned Max = L > R ? L : R;
			unsigned d = P - Max;
			Log("LD%u;%u;%u;%u\n", L, R, P, d);
			}
#endif
			PathToEstrings(Parent.m_Path, &Parent.m_EstringL, &Parent.m_EstringR);

			Parent.m_Weight = Node1.m_Weight + Node2.m_Weight;

#if	VALIDATE
			// Cross-check against profiles rebuilt from full alignments.
			// NOTE(review): the two-argument ProfileFromMSA calls below do
			// not match the one-argument definition in profilefrommsa.cpp,
			// so this section presumably no longer compiles with VALIDATE
			// enabled -- confirm.
			{
#if	TRACE
			Log("AlignTwoMSAs:\n");
#endif
			PWPath TmpPath;
			AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, TmpPath);
			ProfPos *P1 = ProfileFromMSA(Node1.m_MSA, true);
			ProfPos *P2 = ProfileFromMSA(Node2.m_MSA, true);
			unsigned uLength = Parent.m_MSA.GetColCount();
			ProfPos *TmpProf = ProfileFromMSA(Parent.m_MSA, true);

#if	TRACE
			Log("Node1 MSA=\n");
			Node1.m_MSA.LogMe();

			Log("Node1 prof=\n");
			ListProfile(Node1.m_Prof, Node1.m_MSA.GetColCount(), &Node1.m_MSA);
			Log("Node1 prof (from MSA)=\n");
			ListProfile(P1, Node1.m_MSA.GetColCount(), &Node1.m_MSA);

			AssertProfsEq(Node1.m_Prof, Node1.m_uLength, P1, Node1.m_MSA.GetColCount());

			Log("Node2 prof=\n");
			ListProfile(Node2.m_Prof, Node2.m_MSA.GetColCount(), &Node2.m_MSA);

			Log("Node2 MSA=\n");
			Node2.m_MSA.LogMe();

			Log("Node2 prof (from MSA)=\n");
			ListProfile(P2, Node2.m_MSA.GetColCount(), &Node2.m_MSA);

			AssertProfsEq(Node2.m_Prof, Node2.m_uLength, P2, Node2.m_MSA.GetColCount());

			TmpPath.AssertEqual(Parent.m_Path);

			Log("Parent MSA=\n");
			Parent.m_MSA.LogMe();

			Log("Parent prof=\n");
			ListProfile(Parent.m_Prof, Parent.m_uLength, &Parent.m_MSA);

			Log("Parent prof (from MSA)=\n");
			ListProfile(TmpProf, Parent.m_MSA.GetColCount(), &Parent.m_MSA);

#endif	// TRACE
			AssertProfsEq(Parent.m_Prof, Parent.m_uLength,
			  TmpProf, Parent.m_MSA.GetColCount());
			delete[] P1;
			delete[] P2;
			delete[] TmpProf;
			}
#endif	// VALIDATE

			// The children's alignments are no longer needed once merged.
			Node1.m_MSA.Clear();
			Node2.m_MSA.Clear();

			// Don't delete profiles, may need them
for tree refinement. //delete[] Node1.m_Prof; //delete[] Node2.m_Prof; //Node1.m_Prof = 0; //Node2.m_Prof = 0; } uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); ProgressStepsDone(); if (g_bBrenner) MakeRootMSABrenner((SeqVect &) v, GuideTree, ProgNodes, a); else MakeRootMSA(v, GuideTree, ProgNodes, a); #if VALIDATE { unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; AssertMSAEq(a, RootProgNode.m_MSA); } #endif delete[] Weights; return ProgNodes; } progress.cpp0000664000175000017500000000661212360262614011525 0ustar bobbob#include "muscle.h" #include #include // Functions that provide visible feedback to the user // that progress is being made. static unsigned g_uIter = 0; // Main MUSCLE iteration 1, 2.. static unsigned g_uLocalMaxIters = 0; // Max iters static FILE *g_fProgress = stderr; // Default to standard error static char g_strFileName[32]; // File name static time_t g_tLocalStart; // Start time static char g_strDesc[32]; // Description static bool g_bWipeDesc = false; static int g_nPrevDescLength; static unsigned g_uTotalSteps; const char *ElapsedTimeAsStr() { time_t Now = time(0); unsigned long ElapsedSecs = (unsigned long) (Now - g_tLocalStart); return SecsToStr(ElapsedSecs); } const char *MemToStr(double MB) { if (MB < 0) return ""; static char Str[16]; static double MaxMB = 0; static double RAMMB = 0; if (RAMMB == 0) RAMMB = GetRAMSizeMB(); if (MB > MaxMB) MaxMB = MB; double Pct = (MaxMB*100.0)/RAMMB; if (Pct > 100) Pct = 100; sprintf(Str, "%.0f MB(%.0f%%)", MaxMB, Pct); return Str; } void SetInputFileName(const char *pstrFileName) { NameFromPath(pstrFileName, g_strFileName, sizeof(g_strFileName)); } void SetSeqStats(unsigned uSeqCount, unsigned uMinL, unsigned uMaxL, unsigned uAvgL) { if (g_bQuiet) return; fprintf(g_fProgress, "%s %u seqs, lengths min %u, max %u, avg %u\n", g_strFileName, uSeqCount, uMinL, uMaxL, uAvgL); if 
(g_bVerbose) Log("%u seqs, max length %u, avg length %u\n", uSeqCount, uMaxL, uAvgL); } void SetStartTime() { time(&g_tLocalStart); } unsigned long GetStartTime() { return (unsigned long) g_tLocalStart; } void SetIter(unsigned uIter) { g_uIter = uIter; } void IncIter() { ++g_uIter; } void SetMaxIters(unsigned uMaxIters) { g_uLocalMaxIters = uMaxIters; } void SetProgressDesc(const char szDesc[]) { strncpy(g_strDesc, szDesc, sizeof(g_strDesc)); g_strDesc[sizeof(g_strDesc) - 1] = 0; } static void Wipe(int n) { for (int i = 0; i < n; ++i) fprintf(g_fProgress, " "); } void Progress(const char *szFormat, ...) { CheckMaxTime(); if (g_bQuiet) return; double MB = GetMemUseMB(); char szStr[4096]; va_list ArgList; va_start(ArgList, szFormat); vsprintf(szStr, szFormat, ArgList); fprintf(g_fProgress, "%8.8s %12s %s", ElapsedTimeAsStr(), MemToStr(MB), szStr); fprintf(g_fProgress, "\n"); fflush(g_fProgress); } void Progress(unsigned uStep, unsigned uTotalSteps) { CheckMaxTime(); if (g_bQuiet) return; double dPct = ((uStep + 1)*100.0)/uTotalSteps; double MB = GetMemUseMB(); fprintf(g_fProgress, "%8.8s %12s Iter %3u %6.2f%% %s", ElapsedTimeAsStr(), MemToStr(MB), g_uIter, dPct, g_strDesc); if (g_bWipeDesc) { int n = g_nPrevDescLength - (int) strlen(g_strDesc); Wipe(n); g_bWipeDesc = false; } fprintf(g_fProgress, "\r"); g_uTotalSteps = uTotalSteps; } void ProgressStepsDone() { CheckMaxTime(); if (g_bVerbose) { double MB = GetMemUseMB(); Log("Elapsed time %8.8s Peak memory use %12s Iteration %3u %s\n", ElapsedTimeAsStr(), MemToStr(MB), g_uIter, g_strDesc); } if (g_bQuiet) return; Progress(g_uTotalSteps - 1, g_uTotalSteps); fprintf(g_fProgress, "\n"); g_bWipeDesc = true; g_nPrevDescLength = (int) strlen(g_strDesc); } progressivealign.cpp0000664000175000017500000000372212360262614013243 0ustar bobbob#include "muscle.h" #include #include "tree.h" #include "seqvect.h" #include "profile.h" #include "msa.h" #include "pwpath.h" #include "distfunc.h" #define TRACE 0 void 
ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a) { assert(GuideTree.IsRooted()); #if TRACE Log("GuideTree:\n"); GuideTree.LogMe(); #endif const unsigned uSeqCount = v.Length(); const unsigned uNodeCount = 2*uSeqCount - 1; ProgNode *ProgNodes = new ProgNode[uNodeCount]; unsigned uJoin = 0; unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); SetProgressDesc("Align node"); do { if (GuideTree.IsLeaf(uTreeNodeIndex)) { if (uTreeNodeIndex >= uNodeCount) Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount); ProgNode &Node = ProgNodes[uTreeNodeIndex]; unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); if (uId >= uSeqCount) Quit("Seq index out of range"); const Seq &s = *(v[uId]); Node.m_MSA.FromSeq(s); Node.m_MSA.SetSeqId(0, uId); Node.m_uLength = Node.m_MSA.GetColCount(); } else { Progress(uJoin, uSeqCount - 1); ++uJoin; const unsigned uMergeNodeIndex = uTreeNodeIndex; ProgNode &Parent = ProgNodes[uMergeNodeIndex]; const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex); const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); ProgNode &Node1 = ProgNodes[uLeft]; ProgNode &Node2 = ProgNodes[uRight]; PWPath Path; AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path); Parent.m_uLength = Parent.m_MSA.GetColCount(); Node1.m_MSA.Clear(); Node2.m_MSA.Clear(); } uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); ProgressStepsDone(); unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; a.Copy(RootProgNode.m_MSA); delete[] ProgNodes; ProgNodes = 0; } pwpath.cpp0000664000175000017500000002316512360262614011166 0ustar bobbob#include "muscle.h" #include "pwpath.h" #include "seq.h" #include "textfile.h" #include "msa.h" PWPath::PWPath() { m_uArraySize = 0; m_uEdgeCount = 0; m_Edges = 0; } PWPath::~PWPath() { Clear(); } void PWPath::Clear() { delete[] m_Edges; m_Edges = 0; m_uArraySize = 0; m_uEdgeCount = 0; } void 
PWPath::ExpandPath(unsigned uAdditionalEdgeCount) { PWEdge *OldPath = m_Edges; unsigned uEdgeCount = m_uArraySize + uAdditionalEdgeCount; m_Edges = new PWEdge[uEdgeCount]; m_uArraySize = uEdgeCount; if (m_uEdgeCount > 0) memcpy(m_Edges, OldPath, m_uEdgeCount*sizeof(PWEdge)); delete[] OldPath; } void PWPath::AppendEdge(const PWEdge &Edge) { if (0 == m_uArraySize || m_uEdgeCount + 1 == m_uArraySize) ExpandPath(200); m_Edges[m_uEdgeCount] = Edge; ++m_uEdgeCount; } void PWPath::AppendEdge(char cType, unsigned uPrefixLengthA, unsigned uPrefixLengthB) { PWEdge e; e.uPrefixLengthA = uPrefixLengthA; e.uPrefixLengthB = uPrefixLengthB; e.cType = cType; AppendEdge(e); } void PWPath::PrependEdge(const PWEdge &Edge) { if (0 == m_uArraySize || m_uEdgeCount + 1 == m_uArraySize) ExpandPath(1000); if (m_uEdgeCount > 0) memmove(m_Edges + 1, m_Edges, sizeof(PWEdge)*m_uEdgeCount); m_Edges[0] = Edge; ++m_uEdgeCount; } const PWEdge &PWPath::GetEdge(unsigned uEdgeIndex) const { assert(uEdgeIndex < m_uEdgeCount); return m_Edges[uEdgeIndex]; } void PWPath::Validate() const { const unsigned uEdgeCount = GetEdgeCount(); if (0 == uEdgeCount) return; const PWEdge &FirstEdge = GetEdge(0); const PWEdge &LastEdge = GetEdge(uEdgeCount - 1); unsigned uStartA = FirstEdge.uPrefixLengthA; unsigned uStartB = FirstEdge.uPrefixLengthB; if (FirstEdge.cType != 'I') --uStartA; if (FirstEdge.cType != 'D') --uStartB; unsigned uPrefixLengthA = FirstEdge.uPrefixLengthA; unsigned uPrefixLengthB = FirstEdge.uPrefixLengthB; for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = GetEdge(uEdgeIndex); switch (Edge.cType) { case 'M': if (uPrefixLengthA + 1 != Edge.uPrefixLengthA) Quit("PWPath::Validate MA %u", uPrefixLengthA); if (uPrefixLengthB + 1 != Edge.uPrefixLengthB) Quit("PWPath::Validate MB %u", uPrefixLengthB); ++uPrefixLengthA; ++uPrefixLengthB; break; case 'D': if (uPrefixLengthA + 1 != Edge.uPrefixLengthA) Quit("PWPath::Validate DA %u", uPrefixLengthA); if 
(uPrefixLengthB != Edge.uPrefixLengthB) Quit("PWPath::Validate DB %u", uPrefixLengthB); ++uPrefixLengthA; break; case 'I': if (uPrefixLengthA != Edge.uPrefixLengthA) Quit("PWPath::Validate IA %u", uPrefixLengthA); if (uPrefixLengthB + 1 != Edge.uPrefixLengthB) Quit("PWPath::Validate IB %u", uPrefixLengthB); ++uPrefixLengthB; break; } } } void PWPath::LogMe() const { for (unsigned uEdgeIndex = 0; uEdgeIndex < GetEdgeCount(); ++uEdgeIndex) { const PWEdge &Edge = GetEdge(uEdgeIndex); if (uEdgeIndex > 0) Log(" "); Log("%c%d.%d", Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); if ((uEdgeIndex > 0 && uEdgeIndex%10 == 0) || uEdgeIndex == GetEdgeCount() - 1) Log("\n"); } } void PWPath::Copy(const PWPath &Path) { Clear(); const unsigned uEdgeCount = Path.GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); AppendEdge(Edge); } } void PWPath::FromMSAPair(const MSA &msaA, const MSA &msaB) { const unsigned uColCount = msaA.GetColCount(); if (uColCount != msaB.GetColCount()) Quit("PWPath::FromMSAPair, lengths differ"); Clear(); unsigned uPrefixLengthA = 0; unsigned uPrefixLengthB = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bIsGapA = msaA.IsGapColumn(uColIndex); bool bIsGapB = msaB.IsGapColumn(uColIndex); PWEdge Edge; char cType; if (!bIsGapA && !bIsGapB) { cType = 'M'; ++uPrefixLengthA; ++uPrefixLengthB; } else if (bIsGapA && !bIsGapB) { cType = 'I'; ++uPrefixLengthB; } else if (!bIsGapA && bIsGapB) { cType = 'D'; ++uPrefixLengthA; } else { assert(bIsGapB && bIsGapA); continue; } Edge.cType = cType; Edge.uPrefixLengthA = uPrefixLengthA; Edge.uPrefixLengthB = uPrefixLengthB; AppendEdge(Edge); } } // Very similar to HMMPath::FromFile, should consolidate. 
void PWPath::FromFile(TextFile &File) { Clear(); char szToken[1024]; File.GetTokenX(szToken, sizeof(szToken)); if (0 != strcmp(szToken, "Path")) Quit("Invalid path file (Path)"); File.GetTokenX(szToken, sizeof(szToken)); if (0 != strcmp(szToken, "edges")) Quit("Invalid path file (edges)"); File.GetTokenX(szToken, sizeof(szToken)); if (!IsValidInteger(szToken)) Quit("Invalid path file (edges value)"); const unsigned uEdgeCount = (unsigned) atoi(szToken); unsigned uEdgeIndex = 0; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { // index File.GetTokenX(szToken, sizeof(szToken)); if (!IsValidInteger(szToken)) Quit("Invalid path file, invalid index '%s'", szToken); unsigned n = (unsigned) atoi(szToken); if (n != uEdgeIndex) Quit("Invalid path file, expecting edge %u got %u", uEdgeIndex, n); // type File.GetTokenX(szToken, sizeof(szToken)); if (1 != strlen(szToken)) Quit("Invalid path file, expecting state, got '%s'", szToken); const char cType = szToken[0]; if ('M' != cType && 'D' != cType && cType != 'I' && 'S' != cType) Quit("Invalid path file, expecting state, got '%c'", cType); // prefix length A File.GetTokenX(szToken, sizeof(szToken)); if (!IsValidInteger(szToken)) Quit("Invalid path file, bad prefix length A '%s'", szToken); const unsigned uPrefixLengthA = (unsigned) atoi(szToken); // prefix length B File.GetTokenX(szToken, sizeof(szToken)); if (!IsValidInteger(szToken)) Quit("Invalid path file, bad prefix length B '%s'", szToken); const unsigned uPrefixLengthB = (unsigned) atoi(szToken); PWEdge Edge; Edge.cType = cType; Edge.uPrefixLengthA = uPrefixLengthA; Edge.uPrefixLengthB = uPrefixLengthB; AppendEdge(Edge); } File.GetTokenX(szToken, sizeof(szToken)); if (0 != strcmp(szToken, "//")) Quit("Invalid path file (//)"); } void PWPath::ToFile(TextFile &File) const { const unsigned uEdgeCount = GetEdgeCount(); File.PutString("Path\n"); File.PutFormat("edges %u\n", uEdgeCount); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) 
{ const PWEdge &Edge = GetEdge(uEdgeIndex); File.PutFormat("%u %c %u %u\n", uEdgeIndex, Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); } File.PutString("//\n"); } void PWPath::AssertEqual(const PWPath &Path) const { const unsigned uEdgeCount = GetEdgeCount(); if (uEdgeCount != Path.GetEdgeCount()) { Log("PWPath::AssertEqual, this=\n"); LogMe(); Log("\nOther path=\n"); Path.LogMe(); Log("\n"); Quit("PWPath::AssertEqual, Edge count different %u %u\n", uEdgeCount, Path.GetEdgeCount()); } for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &e1 = GetEdge(uEdgeIndex); const PWEdge &e2 = Path.GetEdge(uEdgeIndex); if (e1.cType != e2.cType || e1.uPrefixLengthA != e2.uPrefixLengthA || e1.uPrefixLengthB != e2.uPrefixLengthB) { Log("PWPath::AssertEqual, this=\n"); LogMe(); Log("\nOther path=\n"); Path.LogMe(); Log("\n"); Log("This edge %c%u.%u, other edge %c%u.%u\n", e1.cType, e1.uPrefixLengthA, e1.uPrefixLengthB, e2.cType, e2.uPrefixLengthA, e2.uPrefixLengthB); Quit("PWPath::AssertEqual, edge %u different\n", uEdgeIndex); } } } bool PWPath::Equal(const PWPath &Path) const { const unsigned uEdgeCount = GetEdgeCount(); if (uEdgeCount != Path.GetEdgeCount()) return false; for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &e1 = GetEdge(uEdgeIndex); const PWEdge &e2 = Path.GetEdge(uEdgeIndex); if (e1.cType != e2.cType || e1.uPrefixLengthA != e2.uPrefixLengthA || e1.uPrefixLengthB != e2.uPrefixLengthB) return false; } return true; } unsigned PWPath::GetMatchCount() const { unsigned uMatchCount = 0; const unsigned uEdgeCount = GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &e = GetEdge(uEdgeIndex); if ('M' == e.cType) ++uMatchCount; } return uMatchCount; } unsigned PWPath::GetInsertCount() const { unsigned uInsertCount = 0; const unsigned uEdgeCount = GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &e = 
GetEdge(uEdgeIndex); if ('I' == e.cType) ++uInsertCount; } return uInsertCount; } unsigned PWPath::GetDeleteCount() const { unsigned uDeleteCount = 0; const unsigned uEdgeCount = GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &e = GetEdge(uEdgeIndex); if ('D' == e.cType) ++uDeleteCount; } return uDeleteCount; } void PWPath::FromStr(const char Str[]) { Clear(); unsigned uPrefixLengthA = 0; unsigned uPrefixLengthB = 0; while (char c = *Str++) { switch (c) { case 'M': ++uPrefixLengthA; ++uPrefixLengthB; break; case 'D': ++uPrefixLengthA; break; case 'I': ++uPrefixLengthB; break; default: Quit("PWPath::FromStr, invalid state %c", c); } AppendEdge(c, uPrefixLengthA, uPrefixLengthB); } } readmx.cpp0000664000175000017500000000630112360262614011134 0ustar bobbob#include "muscle.h" #include "textfile.h" #define TRACE 0 const int MAX_LINE = 4096; const int MAX_HEADINGS = 32; static char Heading[MAX_HEADINGS]; static unsigned HeadingCount = 0; static float Mx[32][32]; static void LogMx() { Log("Matrix\n"); Log(" "); for (int i = 0; i < 20; ++i) Log(" %c", LetterToChar(i)); Log("\n"); for (int i = 0; i < 20; ++i) { Log("%c ", LetterToChar(i)); for (int j = 0; j < 20; ++j) Log("%5.1f", Mx[i][j]); Log("\n"); } Log("\n"); } static unsigned MxCharToLetter(char c) { for (unsigned Letter = 0; Letter < HeadingCount; ++Letter) if (Heading[Letter] == c) return Letter; Quit("Letter '%c' has no heading", c); return 0; } PTR_SCOREMATRIX ReadMx(TextFile &File) { // Find column headers char Line[MAX_LINE]; for (;;) { bool EndOfFile = File.GetLine(Line, sizeof(Line)); if (EndOfFile) Quit("Premature EOF in matrix file"); if (Line[0] == '#') continue; else if (Line[0] == ' ') break; else Quit("Invalid line in matrix file: '%s'", Line); } // Read column headers HeadingCount = 0; for (char *p = Line; *p; ++p) { char c = *p; if (!isspace(c)) Heading[HeadingCount++] = c; } if (HeadingCount > 0 && Heading[HeadingCount-1] == '*') --HeadingCount; 
if (HeadingCount < 20) Quit("Error in matrix file: < 20 headers, line='%s'", Line); #if TRACE { Log("ReadMx\n"); Log("%d headings: ", HeadingCount); for (unsigned i = 0; i < HeadingCount; ++i) Log("%c", Heading[i]); Log("\n"); } #endif // Zero out matrix for (int i = 0; i < MAX_ALPHA; ++i) for (int j = 0; j < MAX_ALPHA; ++j) Mx[i][j] = 0.0; // Read data lines for (unsigned RowIndex = 0; RowIndex < HeadingCount; ++RowIndex) { bool EndOfFile = File.GetTrimLine(Line, sizeof(Line)); if (EndOfFile) Quit("Premature EOF in matrix file"); #if TRACE Log("Line=%s\n", Line); #endif if (Line[0] == '#') continue; char c = Line[0]; #if TRACE Log("Row char=%c\n", c); #endif if (!IsResidueChar(c)) continue; unsigned RowLetter = CharToLetter(c); if (RowLetter >= 20) continue; #if TRACE Log("Row letter = %u\n", RowLetter); #endif char *p = Line + 1; char *maxp = p + strlen(Line); for (unsigned Col = 0; Col < HeadingCount - 1; ++Col) { if (p >= maxp) Quit("Too few fields in line of matrix file: '%s'", Line); while (isspace(*p)) ++p; char *Value = p; while (!isspace(*p)) ++p; float v = (float) atof(Value); char HeaderChar = Heading[Col]; if (IsResidueChar(HeaderChar)) { unsigned ColLetter = CharToLetter(HeaderChar); if (ColLetter >= 20) continue; Mx[RowLetter][ColLetter] = v; } p += 1; } } // Sanity check for symmetry for (int i = 0; i < 20; ++i) for (int j = 0; j < i; ++j) { if (Mx[i][j] != Mx[j][i]) { Warning("Matrix is not symmetrical, %c->%c=%g, %c->%c=%g", CharToLetter(i), CharToLetter(j), Mx[i][j], CharToLetter(j), CharToLetter(i), Mx[j][i]); goto ExitLoop; } } ExitLoop:; if (g_bVerbose) LogMx(); return &Mx; } realigndiffs.cpp0000664000175000017500000000543612360262614012321 0ustar bobbob#include "muscle.h" #include "msa.h" #include "tree.h" #include "profile.h" #include "pwpath.h" #define TRACE 0 // Progressive alignment according to a diffs tree. 
static void MakeNode(const MSA &msaIn, const Tree &Diffs, unsigned uDiffsNodeIndex, const unsigned IdToDiffsTreeNodeIndex[], ProgNode &Node) { const unsigned uSeqCount = msaIn.GetSeqCount(); unsigned *Ids = new unsigned[uSeqCount]; unsigned uSeqsInDiffCount = 0; for (unsigned uId = 0; uId < uSeqCount; ++uId) { if (IdToDiffsTreeNodeIndex[uId] == uDiffsNodeIndex) { Ids[uSeqsInDiffCount] = uId; ++uSeqsInDiffCount; } } if (0 == uSeqsInDiffCount) Quit("MakeNode: no seqs in diff"); MSASubsetByIds(msaIn, Ids, uSeqsInDiffCount, Node.m_MSA); #if DEBUG ValidateMuscleIds(Node.m_MSA); #endif DeleteGappedCols(Node.m_MSA); delete[] Ids; } void RealignDiffs(const MSA &msaIn, const Tree &Diffs, const unsigned IdToDiffsTreeNodeIndex[], MSA &msaOut) { assert(Diffs.IsRooted()); #if TRACE Log("RealignDiffs\n"); Log("Diff tree:\n"); Diffs.LogMe(); #endif const unsigned uNodeCount = Diffs.GetNodeCount(); if (uNodeCount%2 == 0) Quit("RealignDiffs: Expected odd number of nodes"); const unsigned uMergeCount = (uNodeCount - 1)/2; ProgNode *ProgNodes = new ProgNode[uNodeCount]; unsigned uJoin = 0; SetProgressDesc("Refine tree"); for (unsigned uDiffsNodeIndex = Diffs.FirstDepthFirstNode(); NULL_NEIGHBOR != uDiffsNodeIndex; uDiffsNodeIndex = Diffs.NextDepthFirstNode(uDiffsNodeIndex)) { if (Diffs.IsLeaf(uDiffsNodeIndex)) { assert(uDiffsNodeIndex < uNodeCount); if (uDiffsNodeIndex >= uNodeCount) Quit("TreeNodeIndex=%u NodeCount=%u\n", uDiffsNodeIndex, uNodeCount); ProgNode &Node = ProgNodes[uDiffsNodeIndex]; MakeNode(msaIn, Diffs, uDiffsNodeIndex, IdToDiffsTreeNodeIndex, Node); Node.m_uLength = Node.m_MSA.GetColCount(); } else { Progress(uJoin, uMergeCount); ++uJoin; const unsigned uMergeNodeIndex = uDiffsNodeIndex; ProgNode &Parent = ProgNodes[uMergeNodeIndex]; const unsigned uLeft = Diffs.GetLeft(uDiffsNodeIndex); const unsigned uRight = Diffs.GetRight(uDiffsNodeIndex); ProgNode &Node1 = ProgNodes[uLeft]; ProgNode &Node2 = ProgNodes[uRight]; PWPath Path; AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, 
Parent.m_MSA, Path); #if TRACE { Log("Combined:\n"); Parent.m_MSA.LogMe(); } #endif Node1.m_MSA.Clear(); Node2.m_MSA.Clear(); } } ProgressStepsDone(); unsigned uRootNodeIndex = Diffs.GetRootNodeIndex(); const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; msaOut.Copy(RootProgNode.m_MSA); #if DEBUG AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut); #endif delete[] ProgNodes; ProgNodes = 0; } realigndiffse.cpp0000664000175000017500000000763712360262614012473 0ustar bobbob#include "muscle.h" #include "msa.h" #include "tree.h" #include "profile.h" #include "pwpath.h" #include "seqvect.h" #include "estring.h" #define TRACE 0 void DeleteProgNode(ProgNode &Node) { delete[] Node.m_Prof; delete[] Node.m_EstringL; delete[] Node.m_EstringR; Node.m_Prof = 0; Node.m_EstringL = 0; Node.m_EstringR = 0; } static void MakeNode(ProgNode &OldNode, ProgNode &NewNode, bool bSwapLR) { if (bSwapLR) { NewNode.m_EstringL = OldNode.m_EstringR; NewNode.m_EstringR = OldNode.m_EstringL; } else { NewNode.m_EstringL = OldNode.m_EstringL; NewNode.m_EstringR = OldNode.m_EstringR; } NewNode.m_Prof = OldNode.m_Prof; NewNode.m_uLength = OldNode.m_uLength; NewNode.m_Weight = OldNode.m_Weight; OldNode.m_Prof = 0; OldNode.m_EstringL = 0; OldNode.m_EstringR = 0; } void RealignDiffsE(const MSA &msaIn, const SeqVect &v, const Tree &NewTree, const Tree &OldTree, const unsigned uNewNodeIndexToOldNodeIndex[], MSA &msaOut, ProgNode *OldProgNodes) { assert(OldProgNodes != 0); const unsigned uNodeCount = NewTree.GetNodeCount(); if (uNodeCount%2 == 0) Quit("RealignDiffs: Expected odd number of nodes"); const unsigned uMergeCount = (uNodeCount - 1)/2; ProgNode *NewProgNodes = new ProgNode[uNodeCount]; for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex) { if (NODE_CHANGED == uNewNodeIndexToOldNodeIndex[uNewNodeIndex]) continue; unsigned uOldNodeIndex = uNewNodeIndexToOldNodeIndex[uNewNodeIndex]; assert(uNewNodeIndex < uNodeCount); assert(uOldNodeIndex < uNodeCount); ProgNode &NewNode = 
NewProgNodes[uNewNodeIndex]; ProgNode &OldNode = OldProgNodes[uOldNodeIndex]; bool bSwapLR = false; if (!NewTree.IsLeaf(uNewNodeIndex)) { unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex); unsigned uNewRight = NewTree.GetRight(uNewNodeIndex); unsigned uOld = uNewNodeIndexToOldNodeIndex[uNewNodeIndex]; unsigned uOldLeft = OldTree.GetLeft(uOld); unsigned uOldRight = OldTree.GetRight(uOld); assert(uOldLeft < uNodeCount && uOldRight < uNodeCount); if (uOldLeft != uNewNodeIndexToOldNodeIndex[uNewLeft]) { assert(uOldLeft == uNewNodeIndexToOldNodeIndex[uNewRight]); bSwapLR = true; } } MakeNode(OldNode, NewNode, bSwapLR); #if TRACE Log("MakeNode old=%u new=%u swap=%d length=%u weight=%.3g\n", uOldNodeIndex, uNewNodeIndex, bSwapLR, NewNode.m_uLength, NewNode.m_Weight); #endif } unsigned uJoin = 0; SetProgressDesc("Refine tree"); for (unsigned uNewNodeIndex = NewTree.FirstDepthFirstNode(); NULL_NEIGHBOR != uNewNodeIndex; uNewNodeIndex = NewTree.NextDepthFirstNode(uNewNodeIndex)) { if (NODE_CHANGED != uNewNodeIndexToOldNodeIndex[uNewNodeIndex]) continue; Progress(uJoin, uMergeCount - 1); ++uJoin; const unsigned uMergeNodeIndex = uNewNodeIndex; ProgNode &Parent = NewProgNodes[uMergeNodeIndex]; const unsigned uLeft = NewTree.GetLeft(uNewNodeIndex); const unsigned uRight = NewTree.GetRight(uNewNodeIndex); ProgNode &Node1 = NewProgNodes[uLeft]; ProgNode &Node2 = NewProgNodes[uRight]; AlignTwoProfs( Node1.m_Prof, Node1.m_uLength, Node1.m_Weight, Node2.m_Prof, Node2.m_uLength, Node2.m_Weight, Parent.m_Path, &Parent.m_Prof, &Parent.m_uLength); PathToEstrings(Parent.m_Path, &Parent.m_EstringL, &Parent.m_EstringR); Parent.m_Weight = Node1.m_Weight + Node2.m_Weight; delete[] Node1.m_Prof; delete[] Node2.m_Prof; Node1.m_Prof = 0; Node2.m_Prof = 0; } ProgressStepsDone(); if (g_bBrenner) MakeRootMSABrenner((SeqVect &) v, NewTree, NewProgNodes, msaOut); else MakeRootMSA(v, NewTree, NewProgNodes, msaOut); #if DEBUG AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut); #endif for (unsigned 
uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) DeleteProgNode(NewProgNodes[uNodeIndex]); delete[] NewProgNodes; } refine.cpp0000664000175000017500000000322712360262614011130 0ustar bobbob#include "muscle.h" #include "textfile.h" #include "seqvect.h" #include "distfunc.h" #include "msa.h" #include "tree.h" #include "clust.h" #include "profile.h" #include "clustsetmsa.h" void Refine() { SetOutputFileName(g_pstrOutFileName); SetInputFileName(g_pstrInFileName); SetStartTime(); SetMaxIters(g_uMaxIters); SetSeqWeightMethod(g_SeqWeight1); TextFile fileIn(g_pstrInFileName); MSA msa; msa.FromFile(fileIn); const unsigned uSeqCount = msa.GetSeqCount(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = msa.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); msa.FixAlpha(); SetPPScore(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) SetPPScore(PPSCORE_SPN); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. 
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); SetMuscleInputMSA(msa); Tree GuideTree; TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2); SetMuscleTree(GuideTree); if (g_bAnchors) RefineVert(msa, GuideTree, g_uMaxIters); else RefineHoriz(msa, GuideTree, g_uMaxIters, false, false); ValidateMuscleIds(msa); ValidateMuscleIds(GuideTree); // TextFile fileOut(g_pstrOutFileName, true); // msa.ToFile(fileOut); MuscleOutput(msa); } refinehoriz.cpp0000664000175000017500000001700012360262614012176 0ustar bobbob#include "muscle.h" #include "tree.h" #include "msa.h" #include "pwpath.h" #include "profile.h" #include "scorehistory.h" #include "objscore.h" unsigned g_uRefineHeightSubtree; unsigned g_uRefineHeightSubtreeTotal; #define TRACE 0 #define DIFFOBJSCORE 0 static bool TryRealign(MSA &msaIn, const Tree &tree, const unsigned Leaves1[], unsigned uCount1, const unsigned Leaves2[], unsigned uCount2, SCORE *ptrscoreBefore, SCORE *ptrscoreAfter, bool bLockLeft, bool bLockRight) { #if TRACE Log("TryRealign, msaIn=\n"); msaIn.LogMe(); #endif const unsigned uSeqCount = msaIn.GetSeqCount(); unsigned *Ids1 = new unsigned[uSeqCount]; unsigned *Ids2 = new unsigned[uSeqCount]; LeafIndexesToIds(tree, Leaves1, uCount1, Ids1); LeafIndexesToIds(tree, Leaves2, uCount2, Ids2); MSA msa1; MSA msa2; MSASubsetByIds(msaIn, Ids1, uCount1, msa1); MSASubsetByIds(msaIn, Ids2, uCount2, msa2); #if DEBUG ValidateMuscleIds(msa1); ValidateMuscleIds(msa2); #endif // Computing the objective score may be expensive for // large numbers of sequences. As a speed optimization, // we check whether the alignment changes. If it does // not change, there is no need to compute the objective // score. We test for the alignment changing by comparing // the Viterbi paths before and after re-aligning. 
PWPath pathBefore; pathBefore.FromMSAPair(msa1, msa2); DeleteGappedCols(msa1); DeleteGappedCols(msa2); if (0 == msa1.GetColCount() || 0 == msa2.GetColCount()) return false; MSA msaRealigned; PWPath pathAfter; AlignTwoMSAs(msa1, msa2, msaRealigned, pathAfter, bLockLeft, bLockRight); bool bAnyChanges = !pathAfter.Equal(pathBefore); unsigned uDiffCount1; unsigned uDiffCount2; static unsigned Edges1[10000]; static unsigned Edges2[10000]; DiffPaths(pathBefore, pathAfter, Edges1, &uDiffCount1, Edges2, &uDiffCount2); #if TRACE Log("TryRealign, msa1=\n"); msa1.LogMe(); Log("\nmsa2=\n"); msa2.LogMe(); Log("\nRealigned (changes %s)=\n", bAnyChanges ? "TRUE" : "FALSE"); msaRealigned.LogMe(); #endif if (!bAnyChanges) { *ptrscoreBefore = 0; *ptrscoreAfter = 0; return false; } SetMSAWeightsMuscle(msaIn); SetMSAWeightsMuscle(msaRealigned); #if DIFFOBJSCORE const SCORE scoreDiff = DiffObjScore(msaIn, pathBefore, Edges1, uDiffCount1, msaRealigned, pathAfter, Edges2, uDiffCount2); bool bAccept = (scoreDiff > 0); *ptrscoreBefore = 0; *ptrscoreAfter = scoreDiff; //const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2); //const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2); //Log("Diff = %.3g %.3g\n", scoreDiff, scoreAfter - scoreBefore); #else const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2); const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2); bool bAccept = (scoreAfter > scoreBefore); #if TRACE Log("Score %g -> %g Accept %s\n", scoreBefore, scoreAfter, bAccept ? 
"TRUE" : "FALSE"); #endif *ptrscoreBefore = scoreBefore; *ptrscoreAfter = scoreAfter; #endif if (bAccept) msaIn.Copy(msaRealigned); delete[] Ids1; delete[] Ids2; return bAccept; } static void RefineHeightParts(MSA &msaIn, const Tree &tree, const unsigned InternalNodeIndexes[], bool bReversed, bool bRight, unsigned uIter, ScoreHistory &History, bool *ptrbAnyChanges, bool *ptrbOscillating, bool bLockLeft, bool bLockRight) { *ptrbOscillating = false; const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uInternalNodeCount = uSeqCount - 1; unsigned *Leaves1 = new unsigned[uSeqCount]; unsigned *Leaves2 = new unsigned[uSeqCount]; const unsigned uRootNodeIndex = tree.GetRootNodeIndex(); bool bAnyAccepted = false; for (unsigned i = 0; i < uInternalNodeCount; ++i) { const unsigned uInternalNodeIndex = InternalNodeIndexes[i]; unsigned uNeighborNodeIndex; if (tree.IsRoot(uInternalNodeIndex) && !bRight) continue; else if (bRight) uNeighborNodeIndex = tree.GetRight(uInternalNodeIndex); else uNeighborNodeIndex = tree.GetLeft(uInternalNodeIndex); g_uTreeSplitNode1 = uInternalNodeIndex; g_uTreeSplitNode2 = uNeighborNodeIndex; unsigned uCount1; unsigned uCount2; GetLeaves(tree, uNeighborNodeIndex, Leaves1, &uCount1); GetLeavesExcluding(tree, uRootNodeIndex, uNeighborNodeIndex, Leaves2, &uCount2); #if TRACE Log("\nRefineHeightParts node %u\n", uInternalNodeIndex); Log("Group1="); for (unsigned n = 0; n < uCount1; ++n) Log(" %u(%s)", Leaves1[n], tree.GetName(Leaves1[n])); Log("\n"); Log("Group2="); for (unsigned n = 0; n < uCount2; ++n) Log(" %u(%s)", Leaves2[n], tree.GetName(Leaves2[n])); Log("\n"); #endif SCORE scoreBefore; SCORE scoreAfter; bool bAccepted = TryRealign(msaIn, tree, Leaves1, uCount1, Leaves2, uCount2, &scoreBefore, &scoreAfter, bLockLeft, bLockRight); SetCurrentAlignment(msaIn); ++g_uRefineHeightSubtree; Progress(g_uRefineHeightSubtree, g_uRefineHeightSubtreeTotal); #if TRACE if (uIter > 0) Log("Before %g %g\n", scoreBefore, History.GetScore(uIter - 1, 
uInternalNodeIndex, bReversed, bRight)); #endif SCORE scoreMax = scoreAfter > scoreBefore? scoreAfter : scoreBefore; bool bRepeated = History.SetScore(uIter, uInternalNodeIndex, bRight, scoreMax); if (bRepeated) { *ptrbOscillating = true; break; } if (bAccepted) bAnyAccepted = true; } delete[] Leaves1; delete[] Leaves2; *ptrbAnyChanges = bAnyAccepted; } // Return true if any changes made bool RefineHoriz(MSA &msaIn, const Tree &tree, unsigned uIters, bool bLockLeft, bool bLockRight) { #if TRACE tree.LogMe(); #endif if (!tree.IsRooted()) Quit("RefineHeight: requires rooted tree"); const unsigned uSeqCount = msaIn.GetSeqCount(); if (uSeqCount < 3) return false; const unsigned uInternalNodeCount = uSeqCount - 1; unsigned *InternalNodeIndexes = new unsigned[uInternalNodeCount]; unsigned *InternalNodeIndexesR = new unsigned[uInternalNodeCount]; GetInternalNodesInHeightOrder(tree, InternalNodeIndexes); ScoreHistory History(uIters, 2*uSeqCount - 1); bool bAnyChangesAnyIter = false; for (unsigned n = 0; n < uInternalNodeCount; ++n) InternalNodeIndexesR[uInternalNodeCount - 1 - n] = InternalNodeIndexes[n]; for (unsigned uIter = 0; uIter < uIters; ++uIter) { bool bAnyChangesThisIter = false; IncIter(); SetProgressDesc("Refine biparts"); g_uRefineHeightSubtree = 0; g_uRefineHeightSubtreeTotal = uInternalNodeCount*2 - 1; bool bReverse = (uIter%2 != 0); unsigned *Internals; if (bReverse) Internals = InternalNodeIndexesR; else Internals = InternalNodeIndexes; bool bOscillating; for (unsigned i = 0; i < 2; ++i) { bool bAnyChanges = false; bool bRight; switch (i) { case 0: bRight = true; break; case 1: bRight = false; break; default: Quit("RefineHeight default case"); } RefineHeightParts(msaIn, tree, Internals, bReverse, bRight, uIter, History, &bAnyChanges, &bOscillating, bLockLeft, bLockRight); if (bOscillating) { ProgressStepsDone(); goto Osc; } if (bAnyChanges) { bAnyChangesThisIter = true; bAnyChangesAnyIter = true; } } ProgressStepsDone(); if (bOscillating) break; if 
(!bAnyChangesThisIter) break; } Osc: delete[] InternalNodeIndexes; delete[] InternalNodeIndexesR; return bAnyChangesAnyIter; } refinesubfams.cpp0000664000175000017500000001355712360262614012520 0ustar bobbob#include "muscle.h" #include "msa.h" #include "tree.h" #include "clust.h" #include "profile.h" #include "pwpath.h" #define TRACE 0 static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[], unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa); // Identify subfamilies in a tree. // Returns array of internal node indexes, one for each subfamily. // First try is to select groups by height (which should approximate // minimum percent identity), if this gives too many subfamilies then // we cut at a point that gives the maximum allowed number of subfams. static void GetSubfams(const Tree &tree, double dMaxHeight, unsigned uMaxSubfamCount, unsigned **ptrptrSubfams, unsigned *ptruSubfamCount) { const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Subfams = new unsigned[uNodeCount]; unsigned uSubfamCount; ClusterByHeight(tree, dMaxHeight, Subfams, &uSubfamCount); if (uSubfamCount > uMaxSubfamCount) ClusterBySubfamCount(tree, uMaxSubfamCount, Subfams, &uSubfamCount); *ptrptrSubfams = Subfams; *ptruSubfamCount = uSubfamCount; } static void LogSubfams(const Tree &tree, const unsigned Subfams[], unsigned uSubfamCount) { const unsigned uNodeCount = tree.GetNodeCount(); Log("%u subfamilies found\n", uSubfamCount); Log("Subfam Sequence\n"); Log("------ --------\n"); unsigned *Leaves = new unsigned[uNodeCount]; for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uSubfamNodeIndex = Subfams[uSubfamIndex]; unsigned uLeafCount; GetLeaves(tree, uSubfamNodeIndex, Leaves, &uLeafCount); for (unsigned uLeafIndex = 0; uLeafIndex < uLeafCount; ++uLeafIndex) Log("%6u %s\n", uSubfamIndex + 1, tree.GetLeafName(Leaves[uLeafIndex])); Log("\n"); } delete[] Leaves; } bool RefineSubfams(MSA &msa, const Tree &tree, unsigned uIters) 
{ const unsigned uSeqCount = msa.GetSeqCount(); if (uSeqCount < 3) return false; const double dMaxHeight = 0.6; const unsigned uMaxSubfamCount = 16; const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Subfams; unsigned uSubfamCount; GetSubfams(tree, dMaxHeight, uMaxSubfamCount, &Subfams, &uSubfamCount); assert(uSubfamCount <= uSeqCount); if (g_bVerbose) LogSubfams(tree, Subfams, uSubfamCount); MSA *SubfamMSAs = new MSA[uSubfamCount]; unsigned *Leaves = new unsigned[uSeqCount]; unsigned *Ids = new unsigned[uSeqCount]; bool bAnyChanges = false; for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uSubfam = Subfams[uSubfamIndex]; unsigned uLeafCount; GetLeaves(tree, uSubfam, Leaves, &uLeafCount); assert(uLeafCount <= uSeqCount); LeafIndexesToIds(tree, Leaves, uLeafCount, Ids); MSA &msaSubfam = SubfamMSAs[uSubfamIndex]; MSASubsetByIds(msa, Ids, uLeafCount, msaSubfam); DeleteGappedCols(msaSubfam); #if TRACE Log("Subfam %u MSA=\n", uSubfamIndex); msaSubfam.LogMe(); #endif if (msaSubfam.GetSeqCount() <= 2) continue; // TODO ///////////////////////////////////////// // Try using existing tree, may actually hurt to // re-estimate, may also be a waste of CPU & mem. 
///////////////////////////////////////////////// Tree SubfamTree; TreeFromMSA(msaSubfam, SubfamTree, g_Cluster2, g_Distance2, g_Root2); bool bAnyChangesThisSubfam; if (g_bAnchors) bAnyChangesThisSubfam = RefineVert(msaSubfam, SubfamTree, uIters); else bAnyChangesThisSubfam = RefineHoriz(msaSubfam, SubfamTree, uIters, false, false); #if TRACE Log("Subfam %u Changed %d\n", uSubfamIndex, bAnyChangesThisSubfam); #endif if (bAnyChangesThisSubfam) bAnyChanges = true; } if (bAnyChanges) ProgressiveAlignSubfams(tree, Subfams, uSubfamCount, SubfamMSAs, msa); delete[] Leaves; delete[] Subfams; delete[] SubfamMSAs; return bAnyChanges; } static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[], unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa) { const unsigned uNodeCount = tree.GetNodeCount(); bool *Ready = new bool[uNodeCount]; MSA **MSAs = new MSA *[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { Ready[uNodeIndex] = false; MSAs[uNodeIndex] = 0; } for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uNodeIndex = Subfams[uSubfamIndex]; Ready[uNodeIndex] = true; MSA *ptrMSA = new MSA; // TODO: Wasteful copy, needs re-design ptrMSA->Copy(SubfamMSAs[uSubfamIndex]); MSAs[uNodeIndex] = ptrMSA; } for (unsigned uNodeIndex = tree.FirstDepthFirstNode(); NULL_NEIGHBOR != uNodeIndex; uNodeIndex = tree.NextDepthFirstNode(uNodeIndex)) { if (tree.IsLeaf(uNodeIndex)) continue; unsigned uRight = tree.GetRight(uNodeIndex); unsigned uLeft = tree.GetLeft(uNodeIndex); if (!Ready[uRight] || !Ready[uLeft]) continue; MSA *ptrLeft = MSAs[uLeft]; MSA *ptrRight = MSAs[uRight]; assert(ptrLeft != 0 && ptrRight != 0); MSA *ptrParent = new MSA; PWPath Path; AlignTwoMSAs(*ptrLeft, *ptrRight, *ptrParent, Path); MSAs[uNodeIndex] = ptrParent; Ready[uNodeIndex] = true; Ready[uLeft] = false; Ready[uRight] = false; delete MSAs[uLeft]; delete MSAs[uRight]; MSAs[uLeft] = 0; MSAs[uRight] = 0; } #if DEBUG { unsigned 
uReadyCount = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (Ready[uNodeIndex]) { assert(tree.IsRoot(uNodeIndex)); ++uReadyCount; assert(0 != MSAs[uNodeIndex]); } else assert(0 == MSAs[uNodeIndex]); } assert(1 == uReadyCount); } #endif const unsigned uRoot = tree.GetRootNodeIndex(); MSA *ptrRootAlignment = MSAs[uRoot]; msa.Copy(*ptrRootAlignment); delete ptrRootAlignment; #if TRACE Log("After refine subfamilies, root alignment=\n"); msa.LogMe(); #endif } refinetree.cpp0000664000175000017500000000243512360262614012010 0ustar bobbob#include "muscle.h" #include "msa.h" #include "tree.h" #include "profile.h" #include void RefineTree(MSA &msa, Tree &tree) { const unsigned uSeqCount = msa.GetSeqCount(); if (tree.GetLeafCount() != uSeqCount) Quit("Refine tree, tree has different number of nodes"); if (uSeqCount < 3) return; #if DEBUG ValidateMuscleIds(msa); ValidateMuscleIds(tree); #endif unsigned *IdToDiffsLeafNodeIndex = new unsigned[uSeqCount]; unsigned uDiffsCount = uSeqCount; Tree Tree2; for (unsigned uIter = 0; uIter < g_uMaxTreeRefineIters; ++uIter) { TreeFromMSA(msa, Tree2, g_Cluster2, g_Distance2, g_Root2, g_pstrDistMxFileName2); #if DEBUG ValidateMuscleIds(Tree2); #endif Tree Diffs; DiffTrees(Tree2, tree, Diffs, IdToDiffsLeafNodeIndex); tree.Copy(Tree2); const unsigned uNewDiffsNodeCount = Diffs.GetNodeCount(); const unsigned uNewDiffsCount = (uNewDiffsNodeCount - 1)/2; if (0 == uNewDiffsCount || uNewDiffsCount >= uDiffsCount) { ProgressStepsDone(); break; } uDiffsCount = uNewDiffsCount; MSA msa2; RealignDiffs(msa, Diffs, IdToDiffsLeafNodeIndex, msa2); #if DEBUG ValidateMuscleIds(msa2); #endif msa.Copy(msa2); SetCurrentAlignment(msa); } delete[] IdToDiffsLeafNodeIndex; } refinetreee.cpp0000664000175000017500000000224212360262614012151 0ustar bobbob#include "muscle.h" #include "msa.h" #include "tree.h" #include "profile.h" #include #define TRACE 0 void RefineTreeE(MSA &msa, const SeqVect &v, Tree &tree, ProgNode *ProgNodes) { const 
unsigned uSeqCount = msa.GetSeqCount(); if (tree.GetLeafCount() != uSeqCount) Quit("Refine tree, tree has different number of nodes"); if (uSeqCount < 3) return; #if DEBUG ValidateMuscleIds(msa); ValidateMuscleIds(tree); #endif const unsigned uNodeCount = tree.GetNodeCount(); unsigned *uNewNodeIndexToOldNodeIndex= new unsigned[uNodeCount]; Tree Tree2; TreeFromMSA(msa, Tree2, g_Cluster2, g_Distance2, g_Root2, g_pstrDistMxFileName2); #if DEBUG ValidateMuscleIds(Tree2); #endif DiffTreesE(Tree2, tree, uNewNodeIndexToOldNodeIndex); unsigned uRoot = Tree2.GetRootNodeIndex(); if (NODE_CHANGED == uNewNodeIndexToOldNodeIndex[uRoot]) { MSA msa2; RealignDiffsE(msa, v, Tree2, tree, uNewNodeIndexToOldNodeIndex, msa2, ProgNodes); tree.Copy(Tree2); msa.Copy(msa2); #if DEBUG ValidateMuscleIds(msa2); #endif } delete[] uNewNodeIndexToOldNodeIndex; SetCurrentAlignment(msa); ProgressStepsDone(); } refinevert.cpp0000664000175000017500000000776512360262613012043 0ustar bobbob#include "muscle.h" #include "profile.h" #include "msa.h" #include "pwpath.h" #include "seqvect.h" #include "clust.h" #include "tree.h" #define TRACE 0 struct Range { unsigned m_uBestColLeft; unsigned m_uBestColRight; }; static void ListVertSavings(unsigned uColCount, unsigned uAnchorColCount, const Range *Ranges, unsigned uRangeCount) { if (!g_bVerbose || !g_bAnchors) return; double dTotalArea = uColCount*uColCount; double dArea = 0.0; for (unsigned i = 0; i < uRangeCount; ++i) { unsigned uLength = Ranges[i].m_uBestColRight - Ranges[i].m_uBestColLeft; dArea += uLength*uLength; } double dPct = (dTotalArea - dArea)*100.0/dTotalArea; Log("Anchor columns found %u\n", uAnchorColCount); Log("DP area saved by anchors %-4.1f%%\n", dPct); } static void ColsToRanges(const unsigned BestCols[], unsigned uBestColCount, unsigned uColCount, Range Ranges[]) { // N best columns produces N+1 vertical blocks. 
const unsigned uRangeCount = uBestColCount + 1; for (unsigned uIndex = 0; uIndex < uRangeCount ; ++uIndex) { unsigned uBestColLeft = 0; if (uIndex > 0) uBestColLeft = BestCols[uIndex-1]; unsigned uBestColRight = uColCount; if (uIndex < uBestColCount) uBestColRight = BestCols[uIndex]; Ranges[uIndex].m_uBestColLeft = uBestColLeft; Ranges[uIndex].m_uBestColRight = uBestColRight; } } // Return true if any changes made bool RefineVert(MSA &msaIn, const Tree &tree, unsigned uIters) { bool bAnyChanges = false; const unsigned uColCountIn = msaIn.GetColCount(); const unsigned uSeqCountIn = msaIn.GetSeqCount(); if (uColCountIn < 3 || uSeqCountIn < 3) return false; unsigned *AnchorCols = new unsigned[uColCountIn]; unsigned uAnchorColCount; SetMSAWeightsMuscle(msaIn); FindAnchorCols(msaIn, AnchorCols, &uAnchorColCount); const unsigned uRangeCount = uAnchorColCount + 1; Range *Ranges = new Range[uRangeCount]; #if TRACE Log("%u ranges\n", uRangeCount); #endif ColsToRanges(AnchorCols, uAnchorColCount, uColCountIn, Ranges); ListVertSavings(uColCountIn, uAnchorColCount, Ranges, uRangeCount); #if TRACE { Log("Anchor cols: "); for (unsigned i = 0; i < uAnchorColCount; ++i) Log(" %u", AnchorCols[i]); Log("\n"); Log("Ranges:\n"); for (unsigned i = 0; i < uRangeCount; ++i) Log("%4u - %4u\n", Ranges[i].m_uBestColLeft, Ranges[i].m_uBestColRight); } #endif delete[] AnchorCols; MSA msaOut; msaOut.SetSize(uSeqCountIn, 0); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCountIn; ++uSeqIndex) { const char *ptrName = msaIn.GetSeqName(uSeqIndex); unsigned uId = msaIn.GetSeqId(uSeqIndex); msaOut.SetSeqName(uSeqIndex, ptrName); msaOut.SetSeqId(uSeqIndex, uId); } for (unsigned uRangeIndex = 0; uRangeIndex < uRangeCount; ++uRangeIndex) { MSA msaRange; const Range &r = Ranges[uRangeIndex]; const unsigned uFromColIndex = r.m_uBestColLeft; const unsigned uRangeColCount = r.m_uBestColRight - uFromColIndex; if (0 == uRangeColCount) continue; else if (1 == uRangeColCount) { MSAFromColRange(msaIn, 
uFromColIndex, 1, msaRange); MSAAppend(msaOut, msaRange); continue; } MSAFromColRange(msaIn, uFromColIndex, uRangeColCount, msaRange); #if TRACE Log("\n-------------\n"); Log("Range %u - %u count=%u\n", r.m_uBestColLeft, r.m_uBestColRight, uRangeColCount); Log("Before:\n"); msaRange.LogMe(); #endif bool bLockLeft = (0 != uRangeIndex); bool bLockRight = (uRangeCount - 1 != uRangeIndex); bool bAnyChangesThisBlock = RefineHoriz(msaRange, tree, uIters, bLockLeft, bLockRight); bAnyChanges = (bAnyChanges || bAnyChangesThisBlock); #if TRACE Log("After:\n"); msaRange.LogMe(); #endif MSAAppend(msaOut, msaRange); #if TRACE Log("msaOut after Cat:\n"); msaOut.LogMe(); #endif } #if DEBUG // Sanity check AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut); #endif delete[] Ranges; if (bAnyChanges) msaIn.Copy(msaOut); return bAnyChanges; } refinew.cpp0000664000175000017500000001245012360262614011315 0ustar bobbob#include "muscle.h" #include "msa.h" #include "seqvect.h" #include "textfile.h" #define MEMDEBUG 0 #if MEMDEBUG #include #endif void MUSCLE(SeqVect &v, MSA &msaOut); // Append msa2 at the end of msa1 void AppendMSA(MSA &msa1, const MSA &msa2) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2; bool bFound = msa2.GetSeqIndex(uId, &uSeqIndex2); if (bFound) { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } else { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } } static void SeqFromMSACols(const MSA &msa, unsigned uSeqIndex, unsigned uColFrom, unsigned uColTo, Seq &s) { s.Clear(); 
s.SetName(msa.GetSeqName(uSeqIndex)); s.SetId(msa.GetSeqId(uSeqIndex)); for (unsigned uColIndex = uColFrom; uColIndex <= uColTo; ++uColIndex) { char c = msa.GetChar(uSeqIndex, uColIndex); if (!IsGapChar(c)) s.AppendChar(c); } } static void SeqVectFromMSACols(const MSA &msa, unsigned uColFrom, unsigned uColTo, SeqVect &v) { v.Clear(); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq s; SeqFromMSACols(msa, uSeqIndex, uColFrom, uColTo, s); v.AppendSeq(s); } } void RefineW(const MSA &msaIn, MSA &msaOut) { const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uColCount = msaIn.GetColCount(); // Reserve same nr seqs, 20% more cols const unsigned uReserveColCount = (uColCount*120)/100; msaOut.SetSize(uSeqCount, uReserveColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { msaOut.SetSeqName(uSeqIndex, msaIn.GetSeqName(uSeqIndex)); msaOut.SetSeqId(uSeqIndex, msaIn.GetSeqId(uSeqIndex)); } const unsigned uWindowCount = (uColCount + g_uRefineWindow - 1)/g_uRefineWindow; if (0 == g_uWindowTo) g_uWindowTo = uWindowCount - 1; #if MEMDEBUG _CrtSetBreakAlloc(1560); #endif if (g_uWindowOffset > 0) { MSA msaTmp; MSAFromColRange(msaIn, 0, g_uWindowOffset, msaOut); } fprintf(stderr, "\n"); for (unsigned uWindowIndex = g_uWindowFrom; uWindowIndex <= g_uWindowTo; ++uWindowIndex) { fprintf(stderr, "Window %d of %d \r", uWindowIndex, uWindowCount); const unsigned uColFrom = g_uWindowOffset + uWindowIndex*g_uRefineWindow; unsigned uColTo = uColFrom + g_uRefineWindow - 1; if (uColTo >= uColCount) uColTo = uColCount - 1; assert(uColTo >= uColFrom); SeqVect v; SeqVectFromMSACols(msaIn, uColFrom, uColTo, v); #if MEMDEBUG _CrtMemState s1; _CrtMemCheckpoint(&s1); #endif MSA msaTmp; MUSCLE(v, msaTmp); AppendMSA(msaOut, msaTmp); if (uWindowIndex == g_uSaveWindow) { MSA msaInTmp; unsigned uOutCols = msaOut.GetColCount(); unsigned un = uColTo - uColFrom + 1; MSAFromColRange(msaIn, uColFrom, un, 
msaInTmp); char fn[256]; sprintf(fn, "win%d_inaln.tmp", uWindowIndex); TextFile fIn(fn, true); msaInTmp.ToFile(fIn); sprintf(fn, "win%d_inseqs.tmp", uWindowIndex); TextFile fv(fn, true); v.ToFile(fv); sprintf(fn, "win%d_outaln.tmp", uWindowIndex); TextFile fOut(fn, true); msaTmp.ToFile(fOut); } #if MEMDEBUG void FreeDPMemSPN(); FreeDPMemSPN(); _CrtMemState s2; _CrtMemCheckpoint(&s2); _CrtMemState s; _CrtMemDifference(&s, &s1, &s2); _CrtMemDumpStatistics(&s); _CrtMemDumpAllObjectsSince(&s1); exit(1); #endif //#if DEBUG // AssertMSAEqIgnoreCaseAndGaps(msaInTmp, msaTmp); //#endif } fprintf(stderr, "\n"); // AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);//@@uncomment! } void DoRefineW() { SetOutputFileName(g_pstrOutFileName); SetInputFileName(g_pstrInFileName); SetStartTime(); SetMaxIters(g_uMaxIters); SetSeqWeightMethod(g_SeqWeight1); TextFile fileIn(g_pstrInFileName); MSA msa; msa.FromFile(fileIn); const unsigned uSeqCount = msa.GetSeqCount(); if (0 == uSeqCount) Quit("No sequences in input file"); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. 
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); SetMuscleInputMSA(msa); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = msa.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); msa.FixAlpha(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) SetPPScore(PPSCORE_SPN); MSA msaOut; RefineW(msa, msaOut); // ValidateMuscleIds(msa); // TextFile fileOut(g_pstrOutFileName, true); // msaOut.ToFile(fileOut); MuscleOutput(msaOut); } savebest.cpp0000664000175000017500000000264612360262614011500 0ustar bobbob#include "muscle.h" #include "msa.h" #include "textfile.h" #include MSA *ptrBestMSA; static const char *pstrOutputFileName; void SetOutputFileName(const char *out) { pstrOutputFileName = out; } void SetCurrentAlignment(MSA &msa) { ptrBestMSA = &msa; } void SaveCurrentAlignment() { static bool bCalled = false; if (bCalled) { fprintf(stderr, "\nRecursive call to SaveCurrentAlignment, giving up attempt to save.\n"); exit(EXIT_FatalError); } if (0 == ptrBestMSA) { fprintf(stderr, "\nAlignment not completed, cannot save.\n"); Log("Alignment not completed, cannot save.\n"); exit(EXIT_FatalError); } if (0 == pstrOutputFileName) { fprintf(stderr, "\nOutput file name not specified, cannot save.\n"); exit(EXIT_FatalError); } fprintf(stderr, "\nSaving current alignment ...\n"); TextFile fileOut(pstrOutputFileName, true); ptrBestMSA->ToFASTAFile(fileOut); fprintf(stderr, "Current alignment saved to \"%s\".\n", pstrOutputFileName); Log("Current alignment saved to \"%s\".\n", pstrOutputFileName); } void CheckMaxTime() { if (0 == g_ulMaxSecs) return; time_t Now = time(0); time_t ElapsedSecs = Now - GetStartTime(); if (ElapsedSecs <= (time_t) g_ulMaxSecs) return; Log("Max time %s exceeded, elapsed seconds = %ul\n", MaxSecsToStr(), ElapsedSecs); 
SaveCurrentAlignment(); exit(EXIT_Success); } scoredist.cpp0000664000175000017500000000665212360262614011664 0ustar bobbob#include #include #include "muscle.h" #include "msa.h" #include "distfunc.h" #include "msa.h" #include "seqvect.h" #include "pwpath.h" // ScoreDist // E. Sonnhammer & V. Hollich, Scoredist: A simple and robust protein sequence // distance estimator, BMC Bioinformatics 2005, 6:108. extern int BLOSUM62[20][20]; extern double BLOSUM62_Expected; static const double Dayhoff_CalibrationFactor = 1.3370; static const double JTT_CalibrationFactor = 1.2873; static const double MV_CalibrationFactor = 1.1775; static const double LARGE_D = 3.0; static double CalibrationFactor = JTT_CalibrationFactor; // Similarity score static double Sigma(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2, unsigned *ptrLength) { unsigned Length = 0; double Score = 0; const unsigned ColCount = msa.GetColCount(); for (unsigned ColIndex = 0; ColIndex < ColCount; ++ColIndex) { unsigned Letter1 = msa.GetLetterEx(SeqIndex1, ColIndex); unsigned Letter2 = msa.GetLetterEx(SeqIndex2, ColIndex); if (Letter1 >= 20 || Letter2 >= 20) continue; ++Length; Score += BLOSUM62[Letter1][Letter2]; } *ptrLength = Length; return Score; } // Normalized score static double Sigma_N(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2) { unsigned Length = UINT_MAX; double Score = Sigma(msa, SeqIndex1, SeqIndex2, &Length); double RandomScore = Length*BLOSUM62_Expected; return Score - RandomScore; } // Upper limit static double Sigma_U(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2, unsigned *ptrLength) { double Score11 = Sigma(msa, SeqIndex1, SeqIndex1, ptrLength); double Score22 = Sigma(msa, SeqIndex2, SeqIndex2, ptrLength); return (Score11 + Score22)/2; } // Normalized upper limit static double Sigma_UN(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2) { unsigned Length = UINT_MAX; double Score = Sigma_U(msa, SeqIndex1, SeqIndex2, &Length); double RandomScore = 
Length*BLOSUM62_Expected; return Score - RandomScore; } double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2) { if (g_Alpha != ALPHA_Amino) Quit("Scoredist is only for amino acid sequences"); double s_N = Sigma_N(msa, SeqIndex1, SeqIndex2); double s_UN = Sigma_UN(msa, SeqIndex1, SeqIndex2); double d = 0.0; if (s_UN != 0) { double Ratio = s_N/s_UN; if (Ratio < 0.001) d = LARGE_D; else d = -log(Ratio); } return d*CalibrationFactor; } void DistPWScoreDist(const SeqVect &v, DistFunc &DF) { SEQWEIGHT SeqWeightSave = GetSeqWeightMethod(); SetSeqWeightMethod(SEQWEIGHT_Henikoff); const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; unsigned uCount = 0; SetProgressDesc("PW ScoreDist"); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const Seq &s1 = v.GetSeq(uSeqIndex1); MSA msa1; msa1.FromSeq(s1); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) { if (0 == uCount%20) Progress(uCount, uPairCount); ++uCount; const Seq &s2 = v.GetSeq(uSeqIndex2); MSA msa2; msa2.FromSeq(s2); PWPath Path; MSA msaOut; AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false); float d = (float) GetScoreDist(msaOut, 0, 1); DF.SetDist(uSeqIndex1, uSeqIndex2, d); } } ProgressStepsDone(); SetSeqWeightMethod(SeqWeightSave); } scoregaps.cpp0000664000175000017500000001063212360262614011644 0ustar bobbob#include "muscle.h" #include "msa.h" #include "objscore.h" #define TRACE 0 struct GAPINFO { GAPINFO *Next; unsigned Start; unsigned End; }; static GAPINFO **g_Gaps; static GAPINFO *g_FreeList; static unsigned g_MaxSeqCount; static unsigned g_MaxColCount; static unsigned g_ColCount; static bool *g_ColDiff; static GAPINFO *NewGapInfo() { if (0 == g_FreeList) { const int NEWCOUNT = 256; GAPINFO *NewList = new GAPINFO[NEWCOUNT]; g_FreeList = &NewList[0]; for (int i = 0; i < NEWCOUNT-1; ++i) NewList[i].Next = &NewList[i+1]; NewList[NEWCOUNT-1].Next = 0; } GAPINFO *GI = g_FreeList; 
g_FreeList = g_FreeList->Next; return GI; } static void FreeGapInfo(GAPINFO *GI) { GI->Next = g_FreeList; g_FreeList = GI; } // TODO: This could be much faster, no need to look // at all columns. static void FindIntersectingGaps(const MSA &msa, unsigned SeqIndex) { const unsigned ColCount = msa.GetColCount(); bool InGap = false; bool Intersects = false; unsigned Start = uInsane; for (unsigned Col = 0; Col <= ColCount; ++Col) { bool Gap = ((Col != ColCount) && msa.IsGap(SeqIndex, Col)); if (Gap) { if (!InGap) { InGap = true; Start = Col; } if (g_ColDiff[Col]) Intersects = true; } else if (InGap) { InGap = false; if (Intersects) { GAPINFO *GI = NewGapInfo(); GI->Start = Start; GI->End = Col - 1; GI->Next = g_Gaps[SeqIndex]; g_Gaps[SeqIndex] = GI; } Intersects = false; } } } static SCORE Penalty(unsigned Length, bool Term) { if (0 == Length) return 0; SCORE s1 = g_scoreGapOpen + g_scoreGapExtend*(Length - 1); #if DOUBLE_AFFINE SCORE s2 = g_scoreGapOpen2 + g_scoreGapExtend2*(Length - 1); if (s1 > s2) return s1; return s2; #else return s1; #endif } //static SCORE ScorePair(unsigned Seq1, unsigned Seq2) // { //#if TRACE // { // Log("ScorePair(%d,%d)\n", Seq1, Seq2); // Log("Gaps seq 1: "); // for (GAPINFO *GI = g_Gaps[Seq1]; GI; GI = GI->Next) // Log(" %d-%d", GI->Start, GI->End); // Log("\n"); // Log("Gaps seq 2: "); // for (GAPINFO *GI = g_Gaps[Seq2]; GI; GI = GI->Next) // Log(" %d-%d", GI->Start, GI->End); // Log("\n"); // } //#endif // return 0; // } SCORE ScoreGaps(const MSA &msa, const unsigned DiffCols[], unsigned DiffColCount) { #if TRACE { Log("ScoreGaps\n"); Log("DiffCols "); for (unsigned i = 0; i < DiffColCount; ++i) Log(" %u", DiffCols[i]); Log("\n"); Log("msa=\n"); msa.LogMe(); Log("\n"); } #endif const unsigned SeqCount = msa.GetSeqCount(); const unsigned ColCount = msa.GetColCount(); g_ColCount = ColCount; if (SeqCount > g_MaxSeqCount) { delete[] g_Gaps; g_MaxSeqCount = SeqCount + 256; g_Gaps = new GAPINFO *[g_MaxSeqCount]; } memset(g_Gaps, 0, 
SeqCount*sizeof(GAPINFO *)); if (ColCount > g_MaxColCount) { delete[] g_ColDiff; g_MaxColCount = ColCount + 256; g_ColDiff = new bool[g_MaxColCount]; } memset(g_ColDiff, 0, g_ColCount*sizeof(bool)); for (unsigned i = 0; i < DiffColCount; ++i) { unsigned Col = DiffCols[i]; assert(Col < ColCount); g_ColDiff[Col] = true; } for (unsigned SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) FindIntersectingGaps(msa, SeqIndex); #if TRACE { Log("\n"); Log("Intersecting gaps:\n"); Log(" "); for (unsigned Col = 0; Col < ColCount; ++Col) Log("%c", g_ColDiff[Col] ? '*' : ' '); Log("\n"); Log(" "); for (unsigned Col = 0; Col < ColCount; ++Col) Log("%d", Col%10); Log("\n"); for (unsigned Seq = 0; Seq < SeqCount; ++Seq) { Log("%3d: ", Seq); for (unsigned Col = 0; Col < ColCount; ++Col) Log("%c", msa.GetChar(Seq, Col)); Log(" :: "); for (GAPINFO *GI = g_Gaps[Seq]; GI; GI = GI->Next) Log(" (%d,%d)", GI->Start, GI->End); Log(" >%s\n", msa.GetSeqName(Seq)); } Log("\n"); } #endif SCORE Score = 0; for (unsigned Seq1 = 0; Seq1 < SeqCount; ++Seq1) { const WEIGHT w1 = msa.GetSeqWeight(Seq1); for (unsigned Seq2 = Seq1 + 1; Seq2 < SeqCount; ++Seq2) { const WEIGHT w2 = msa.GetSeqWeight(Seq2); // const SCORE Pair = ScorePair(Seq1, Seq2); const SCORE Pair = ScoreSeqPairGaps(msa, Seq1, msa, Seq2); Score += w1*w2*Pair; #if TRACE Log("Seq1=%u Seq2=%u ScorePair=%.4g w1=%.4g w2=%.4g Sum=%.4g\n", Seq1, Seq2, Pair, w1, w2, Score); #endif } } return Score; } scorehistory.cpp0000664000175000017500000000473612360262614012423 0ustar bobbob#include "muscle.h" #include "scorehistory.h" #include #define TRACE 0 ScoreHistory::ScoreHistory(unsigned uIters, unsigned uNodeCount) { m_uNodeCount = uNodeCount; m_uIters = uIters; m_Score = new SCORE *[uIters]; m_bScoreSet = new bool *[uIters]; for (unsigned n = 0; n < uIters; ++n) { m_Score[n] = new SCORE[uNodeCount*2]; m_bScoreSet[n] = new bool[uNodeCount*2]; memset(m_bScoreSet[n], 0, uNodeCount*2*sizeof(bool)); } } ScoreHistory::~ScoreHistory() { for (unsigned n = 0; 
n < m_uIters; ++n) { delete[] m_Score[n]; delete[] m_bScoreSet[n]; } delete[] m_Score; delete[] m_bScoreSet; } bool ScoreHistory::SetScore(unsigned uIter, unsigned uNodeIndex, bool bRight, SCORE Score) { #if TRACE Log("ScoreHistory::SetScore(Iter=%u Node=%u Right=%d Score=%g)\n", uIter, uNodeIndex, bRight, Score); #endif if (uIter >= m_uIters) Quit("ScoreHistory::SetScore-1"); if (uNodeIndex >= m_uNodeCount) Quit("ScoreHistory::SetScore-2"); const unsigned uIndex = uNodeIndex*2 + bRight; for (unsigned n = 1; n < uIter; ++n) { const unsigned uPrevIter = n - 1; if (!m_bScoreSet[uPrevIter][uIndex]) { LogMe(); Quit("ScoreHistory::SetScore-3"); } if (m_Score[uPrevIter][uIndex] == Score) { ProgressStepsDone(); #if TRACE Log("Oscillating\n"); #endif return true; } } m_Score[uIter][uIndex] = Score; m_bScoreSet[uIter][uIndex] = true; return false; } void ScoreHistory::LogMe() const { Log("ScoreHistory\n"); Log("Iter Node Right Score\n"); Log("---- ---- ----- ---------\n"); for (unsigned uIter = 0; uIter < m_uIters; ++uIter) { bool bAnySet = false; for (unsigned n = 0; n < m_uNodeCount*2; ++n) if (m_bScoreSet[uIter][n]) { bAnySet = true; break; } if (!bAnySet) return; for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { const unsigned uBase = 2*uNodeIndex; if (m_bScoreSet[uIter][uBase]) Log("%4u %4u F %9.3f\n", uIter, uNodeIndex, m_Score[uIter][uBase]); if (m_bScoreSet[uIter][uBase+1]) Log("%4u %4u T %9.3f\n", uIter, uNodeIndex, m_Score[uIter][uBase+1]); } } } SCORE ScoreHistory::GetScore(unsigned uIter, unsigned uNodeIndex, bool bReverse, bool bRight) const { const unsigned uIndex = uNodeIndex*2 + bRight; if (!m_bScoreSet[uIter][uIndex]) Quit("ScoreHistory::GetScore"); return m_Score[uIter][uIndex]; } scorepp.cpp0000664000175000017500000000504212360262614011330 0ustar bobbob#include "muscle.h" #include "profile.h" char ConsensusChar(const ProfPos &PP) { unsigned uMostCommonLetter = 0; FCOUNT fcMostCommon = PP.m_fcCounts[0]; bool bMoreThanOneLetter = 
false; bool bAnyLetter = false; for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) { const FCOUNT fc = PP.m_fcCounts[uLetter]; if (fc > 0) { if (bAnyLetter) bMoreThanOneLetter = true; bAnyLetter = true; } if (fc > fcMostCommon) { uMostCommonLetter = uLetter; fcMostCommon = fc; } } if (!bAnyLetter) return '-'; char c = LetterToChar(uMostCommonLetter); if (bMoreThanOneLetter) return UnalignChar(c); return c; } SCORE ScoreProfPos2LA(const ProfPos &PPA, const ProfPos &PPB) { SCORE Score = 0; for (unsigned n = 0; n < 20; ++n) { const unsigned uLetter = PPA.m_uSortOrder[n]; const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; if (0 == fcLetter) break; Score += fcLetter*PPB.m_AAScores[uLetter]; } if (0 == Score) return -2.5; SCORE logScore = logf(Score); return (SCORE) ((logScore - g_scoreCenter)*(PPA.m_fOcc * PPB.m_fOcc)); } SCORE ScoreProfPos2NS(const ProfPos &PPA, const ProfPos &PPB) { SCORE Score = 0; for (unsigned n = 0; n < 20; ++n) { const unsigned uLetter = PPA.m_uSortOrder[n]; const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; if (0 == fcLetter) break; Score += fcLetter*PPB.m_AAScores[uLetter]; } return Score - g_scoreCenter; } SCORE ScoreProfPos2SP(const ProfPos &PPA, const ProfPos &PPB) { SCORE Score = 0; for (unsigned n = 0; n < 20; ++n) { const unsigned uLetter = PPA.m_uSortOrder[n]; const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; if (0 == fcLetter) break; Score += fcLetter*PPB.m_AAScores[uLetter]; } return Score - g_scoreCenter; } SCORE ScoreProfPos2SPN(const ProfPos &PPA, const ProfPos &PPB) { SCORE Score = 0; for (unsigned n = 0; n < 4; ++n) { const unsigned uLetter = PPA.m_uSortOrder[n]; const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; if (0 == fcLetter) break; Score += fcLetter*PPB.m_AAScores[uLetter]; } return Score - g_scoreCenter; } SCORE ScoreProfPos2(const ProfPos &PPA, const ProfPos &PPB) { if (PPSCORE_SP == g_PPScore) return ScoreProfPos2NS(PPA, PPB); else if (PPSCORE_LE == g_PPScore) return ScoreProfPos2LA(PPA, PPB); else if (PPSCORE_SV == 
g_PPScore) return ScoreProfPos2SP(PPA, PPB); else if (PPSCORE_SPN == g_PPScore) return ScoreProfPos2SPN(PPA, PPB); Quit("Invalid g_PPScore"); return 0; } seq.cpp0000664000175000017500000001552412360262614010453 0ustar bobbob#include "muscle.h" #include "seq.h" #include "textfile.h" #include "msa.h" //#include const size_t MAX_FASTA_LINE = 16000; void Seq::SetName(const char *ptrName) { delete[] m_ptrName; size_t n = strlen(ptrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, ptrName); } void Seq::ToFASTAFile(TextFile &File) const { File.PutFormat(">%s\n", m_ptrName); unsigned uColCount = Length(); for (unsigned n = 0; n < uColCount; ++n) { if (n > 0 && n%60 == 0) File.PutString("\n"); File.PutChar(at(n)); } File.PutString("\n"); } // Return true on end-of-file bool Seq::FromFASTAFile(TextFile &File) { Clear(); char szLine[MAX_FASTA_LINE]; bool bEof = File.GetLine(szLine, sizeof(szLine)); if (bEof) return true; if ('>' != szLine[0]) Quit("Expecting '>' in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); size_t n = strlen(szLine); if (1 == n) Quit("Missing annotation following '>' in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); m_ptrName = new char[n]; strcpy(m_ptrName, szLine + 1); TEXTFILEPOS Pos = File.GetPos(); for (;;) { bEof = File.GetLine(szLine, sizeof(szLine)); if (bEof) { if (0 == size()) { Quit("Empty sequence in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); return true; } return false; } if ('>' == szLine[0]) { if (0 == size()) Quit("Empty sequence in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); // Rewind to beginning of this line, it's the start of the // next sequence. 
File.SetPos(Pos); return false; } const char *ptrChar = szLine; while (char c = *ptrChar++) { if (isspace(c)) continue; if (IsGapChar(c)) continue; if (!IsResidueChar(c)) { if (isprint(c)) { char w = GetWildcardChar(); Warning("Invalid residue '%c' in FASTA file %s line %d, replaced by '%c'", c, File.GetFileName(), File.GetLineNr(), w); c = w; } else Quit("Invalid byte hex %02x in FASTA file %s line %d", (unsigned char) c, File.GetFileName(), File.GetLineNr()); } c = toupper(c); push_back(c); } Pos = File.GetPos(); } } void Seq::ExtractUngapped(MSA &msa) const { msa.Clear(); unsigned uColCount = Length(); msa.SetSize(1, 1); unsigned uUngappedPos = 0; for (unsigned n = 0; n < uColCount; ++n) { char c = at(n); if (!IsGapChar(c)) msa.SetChar(0, uUngappedPos++, c); } msa.SetSeqName(0, m_ptrName); } void Seq::Copy(const Seq &rhs) { clear(); const unsigned uLength = rhs.Length(); for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) push_back(rhs.at(uColIndex)); const char *ptrName = rhs.GetName(); size_t n = strlen(ptrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, ptrName); SetId(rhs.GetId()); } void Seq::CopyReversed(const Seq &rhs) { clear(); const unsigned uLength = rhs.Length(); const unsigned uBase = rhs.Length() - 1; for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) push_back(rhs.at(uBase - uColIndex)); const char *ptrName = rhs.GetName(); size_t n = strlen(ptrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, ptrName); } void Seq::StripGaps() { for (CharVect::iterator p = begin(); p != end(); ) { char c = *p; if (IsGapChar(c)) erase(p); else ++p; } } void Seq::StripGapsAndWhitespace() { for (CharVect::iterator p = begin(); p != end(); ) { char c = *p; if (isspace(c) || IsGapChar(c)) erase(p); else ++p; } } void Seq::ToUpper() { for (CharVect::iterator p = begin(); p != end(); ++p) { char c = *p; if (islower(c)) *p = toupper(c); } } unsigned Seq::GetLetter(unsigned uIndex) const { assert(uIndex < Length()); char c = 
operator[](uIndex); return CharToLetter(c); } bool Seq::EqIgnoreCase(const Seq &s) const { const unsigned n = Length(); if (n != s.Length()) return false; for (unsigned i = 0; i < n; ++i) { const char c1 = at(i); const char c2 = s.at(i); if (IsGapChar(c1)) { if (!IsGapChar(c2)) return false; } else { if (toupper(c1) != toupper(c2)) return false; } } return true; } bool Seq::Eq(const Seq &s) const { const unsigned n = Length(); if (n != s.Length()) return false; for (unsigned i = 0; i < n; ++i) { const char c1 = at(i); const char c2 = s.at(i); if (c1 != c2) return false; } return true; } bool Seq::EqIgnoreCaseAndGaps(const Seq &s) const { const unsigned uThisLength = Length(); const unsigned uOtherLength = s.Length(); unsigned uThisPos = 0; unsigned uOtherPos = 0; int cThis; int cOther; for (;;) { if (uThisPos == uThisLength && uOtherPos == uOtherLength) break; // Set cThis to next non-gap character in this string // or -1 if end-of-string. for (;;) { if (uThisPos == uThisLength) { cThis = -1; break; } else { cThis = at(uThisPos); ++uThisPos; if (!IsGapChar(cThis)) { cThis = toupper(cThis); break; } } } // Set cOther to next non-gap character in s // or -1 if end-of-string. 
for (;;) { if (uOtherPos == uOtherLength) { cOther = -1; break; } else { cOther = s.at(uOtherPos); ++uOtherPos; if (!IsGapChar(cOther)) { cOther = toupper(cOther); break; } } } // Compare characters are corresponding ungapped position if (cThis != cOther) return false; } return true; } unsigned Seq::GetUngappedLength() const { unsigned uUngappedLength = 0; for (CharVect::const_iterator p = begin(); p != end(); ++p) { char c = *p; if (!IsGapChar(c)) ++uUngappedLength; } return uUngappedLength; } void Seq::LogMe() const { Log(">%s\n", m_ptrName); const unsigned n = Length(); for (unsigned i = 0; i < n; ++i) Log("%c", at(i)); Log("\n"); } void Seq::FromString(const char *pstrSeq, const char *pstrName) { clear(); const unsigned uLength = (unsigned) strlen(pstrSeq); for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) push_back(pstrSeq[uColIndex]); size_t n = strlen(pstrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, pstrName); } bool Seq::HasGap() const { for (CharVect::const_iterator p = begin(); p != end(); ++p) { char c = *p; if (IsGapChar(c)) return true; } return false; } void Seq::FixAlpha() { for (CharVect::iterator p = begin(); p != end(); ++p) { char c = *p; if (!IsResidueChar(c)) { char w = GetWildcardChar(); // Warning("Invalid residue '%c', replaced by '%c'", c, w); InvalidLetterWarning(c, w); *p = w; } } } seqvect.cpp0000664000175000017500000001433012360262614011327 0ustar bobbob#include "muscle.h" #include "seqvect.h" #include "textfile.h" #include "msa.h" const size_t MAX_FASTA_LINE = 16000; SeqVect::~SeqVect() { Clear(); } void SeqVect::Clear() { for (size_t n = 0; n < size(); ++n) delete (*this)[n]; } void SeqVect::ToFASTAFile(TextFile &File) const { unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); ptrSeq->ToFASTAFile(File); } } void SeqVect::FromFASTAFile(TextFile &File) { Clear(); FILE *f = File.GetStdioFile(); for (;;) { char *Label; unsigned uLength; char 
*SeqData = GetFastaSeq(f, &uLength, &Label); if (0 == SeqData) return; Seq *ptrSeq = new Seq; for (unsigned i = 0; i < uLength; ++i) { char c = SeqData[i]; ptrSeq->push_back(c); } ptrSeq->SetName(Label); push_back(ptrSeq); delete[] SeqData; delete[] Label; } } void SeqVect::PadToMSA(MSA &msa) { unsigned uSeqCount = Length(); if (0 == uSeqCount) { msa.Clear(); return; } unsigned uLongestSeqLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); unsigned uColCount = ptrSeq->Length(); if (uColCount > uLongestSeqLength) uLongestSeqLength = uColCount; } msa.SetSize(uSeqCount, uLongestSeqLength); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); msa.SetSeqName(uSeqIndex, ptrSeq->GetName()); unsigned uColCount = ptrSeq->Length(); unsigned uColIndex; for (uColIndex = 0; uColIndex < uColCount; ++uColIndex) { char c = ptrSeq->at(uColIndex); msa.SetChar(uSeqIndex, uColIndex, c); } while (uColIndex < uLongestSeqLength) msa.SetChar(uSeqIndex, uColIndex++, '.'); } } void SeqVect::Copy(const SeqVect &rhs) { clear(); unsigned uSeqCount = rhs.Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = rhs.at(uSeqIndex); Seq *ptrSeqCopy = new Seq; ptrSeqCopy->Copy(*ptrSeq); push_back(ptrSeqCopy); } } void SeqVect::StripGaps() { unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); ptrSeq->StripGaps(); } } void SeqVect::StripGapsAndWhitespace() { unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); ptrSeq->StripGapsAndWhitespace(); } } void SeqVect::ToUpper() { unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); ptrSeq->ToUpper(); } } bool SeqVect::FindName(const char *ptrName, unsigned *ptruIndex) const { unsigned uSeqCount = Length(); 
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const Seq *ptrSeq = at(uSeqIndex); if (0 == stricmp(ptrSeq->GetName(), ptrName)) { *ptruIndex = uSeqIndex; return true; } } return false; } void SeqVect::AppendSeq(const Seq &s) { Seq *ptrSeqCopy = new Seq; ptrSeqCopy->Copy(s); push_back(ptrSeqCopy); } void SeqVect::LogMe() const { unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const Seq *ptrSeq = at(uSeqIndex); ptrSeq->LogMe(); } } const char *SeqVect::GetSeqName(unsigned uSeqIndex) const { assert(uSeqIndex < size()); const Seq *ptrSeq = at(uSeqIndex); return ptrSeq->GetName(); } unsigned SeqVect::GetSeqId(unsigned uSeqIndex) const { assert(uSeqIndex < size()); const Seq *ptrSeq = at(uSeqIndex); return ptrSeq->GetId(); } unsigned SeqVect::GetSeqIdFromName(const char *Name) const { const unsigned uSeqCount = GetSeqCount(); for (unsigned i = 0; i < uSeqCount; ++i) { if (!strcmp(Name, GetSeqName(i))) return GetSeqId(i); } Quit("SeqVect::GetSeqIdFromName(%s): not found", Name); return 0; } Seq &SeqVect::GetSeqById(unsigned uId) { const unsigned uSeqCount = GetSeqCount(); for (unsigned i = 0; i < uSeqCount; ++i) { if (GetSeqId(i) == uId) return GetSeq(i); } Quit("SeqVect::GetSeqIdByUd(%d): not found", uId); return (Seq &) *((Seq *) 0); } unsigned SeqVect::GetSeqLength(unsigned uSeqIndex) const { assert(uSeqIndex < size()); const Seq *ptrSeq = at(uSeqIndex); return ptrSeq->Length(); } Seq &SeqVect::GetSeq(unsigned uSeqIndex) { assert(uSeqIndex < size()); return *at(uSeqIndex); } const Seq &SeqVect::GetSeq(unsigned uSeqIndex) const { assert(uSeqIndex < size()); return *at(uSeqIndex); } void SeqVect::SetSeqId(unsigned uSeqIndex, unsigned uId) { assert(uSeqIndex < size()); Seq *ptrSeq = at(uSeqIndex); return ptrSeq->SetId(uId); } ALPHA SeqVect::GuessAlpha() const { // If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap // letters belong to the nucleotide alphabet, guess nucleo. // Otherwise amino. 
const unsigned CHAR_COUNT = 100; const unsigned MIN_NUCLEO_PCT = 95; const unsigned uSeqCount = GetSeqCount(); if (0 == uSeqCount) return ALPHA_Amino; unsigned uSeqIndex = 0; unsigned uPos = 0; unsigned uSeqLength = GetSeqLength(0); unsigned uDNACount = 0; unsigned uRNACount = 0; unsigned uTotal = 0; const Seq *ptrSeq = &GetSeq(0); for (;;) { while (uPos >= uSeqLength) { ++uSeqIndex; if (uSeqIndex >= uSeqCount) break; ptrSeq = &GetSeq(uSeqIndex); uSeqLength = ptrSeq->Length(); uPos = 0; } if (uSeqIndex >= uSeqCount) break; char c = ptrSeq->at(uPos++); if (IsGapChar(c)) continue; if (IsDNA(c)) ++uDNACount; if (IsRNA(c)) ++uRNACount; ++uTotal; if (uTotal >= CHAR_COUNT) break; } if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_DNA; if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_RNA; return ALPHA_Amino; } void SeqVect::FixAlpha() { ClearInvalidLetterWarning(); unsigned uSeqCount = Length(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq *ptrSeq = at(uSeqIndex); ptrSeq->FixAlpha(); } ReportInvalidLetters(); } setblosumweights.cpp0000664000175000017500000001147512360262614013274 0ustar bobbob/*** Code for implementing HMMer's "BLOSUM weighting" algorithm. The algorithm was deduced by reverse-engineering the HMMer code. The HMMer documentation refers to BLOSUM weighting as "Henikoff simple filter weighting" The name BLOSUM implied to me that HMMer would be using a substitution probability matrix to compute distances, but this turned out not to be the case. It is notable, not to say puzzling, that the HMMer BLOSUM weighting algorithm is guaranteed to produce an integral NIC (number-of-indepdent- counts, also known as effective sequence count). Presumably Eddy must have known this, though he doesn't comment on it and he computes & stores the value in a float. 
Here's the algorithm: Distances between two sequences are based on the average of a simple binary equal (one) / not equal (zero) at each position. The only thing that has anything to do with BLOSUM in this calculation is an obscure (to me) constant value of 0.62. The sequences are clustered using this distance. If the pairwise identity (fraction of identical positions) is less than 0.62, they get assigned to disjoint clusters, the final number of disjoint clusters is the NIC. This makes some intuitive sense: I would interpret this by saying that if a set of sequences are close enough they count as one sequence. The weight for each sequence within a disjoint cluster is then determined to be 1 / (clustersize), from which it follows that the sum of all weights is equal to the number of disjoint clusters and is thus guaranteed to be an integer value. It is exactly this sum that HMMer uses for the NIC, by default. The individual BLOSUM sequence weights are not used for anything else in HMMer, unless you specify that BLOSUM weighting should override the default GSC weighting. GSC weighting uses a different clustering algorithm to determine relative weights. The BLOSUM NIC is then distributed over the GSC tree according to those relative weights. ***/ #include "muscle.h" #include "msa.h" #include "cluster.h" #include "distfunc.h" // Set weights of all sequences in the subtree under given node. 
void MSA::SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const { if (0 == ptrNode) return; const ClusterNode *ptrRight = ptrNode->GetRight(); const ClusterNode *ptrLeft = ptrNode->GetLeft(); // If leaf, set weight if (0 == ptrRight && 0 == ptrLeft) { unsigned uIndex = ptrNode->GetIndex(); WEIGHT w = DoubleToWeight(dWeight); m_Weights[uIndex] = w; return; } // Otherwise, recursively set subtrees SetBLOSUMSubtreeWeight(ptrLeft, dWeight); SetBLOSUMSubtreeWeight(ptrRight, dWeight); } // Traverse a subtree looking for clusters where all // the leaves are sufficiently similar that they // should be weighted as a group, i.e. given a weight // of 1/N where N is the cluster size. The idea is // to avoid sample bias where we have closely related // sequences in the input alignment. // The weight at a node is the distance between // the two closest sequences in the left and right // subtrees under that node. "Sufficiently similar" // is defined as being where that minimum distance // is less than the dMinDist threshhold. I don't know // why the clustering is done using a minimum rather // than a maximum or average, either of which would // seem more natural to me. // Return value is number of groups under this node. // A "group" is the cluster found under a node with a // weight less than the minimum. unsigned MSA::SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const { if (0 == ptrNode) return 0; if (ptrNode->GetWeight() < dMinDist) { unsigned uClusterSize = ptrNode->GetClusterSize(); assert(uClusterSize > 0); double dWeight = 1.0 / uClusterSize; SetBLOSUMSubtreeWeight(ptrNode, dWeight); return 1; } const ClusterNode *ptrLeft = ptrNode->GetLeft(); const ClusterNode *ptrRight = ptrNode->GetRight(); unsigned uLeftGroupCount = SetBLOSUMNodeWeight(ptrLeft, dMinDist); unsigned uRightGroupCount = SetBLOSUMNodeWeight(ptrRight, dMinDist); return uLeftGroupCount + uRightGroupCount; } // Return value is the group count, i.e. 
the effective number // of distinctly different sequences. unsigned MSA::CalcBLOSUMWeights(ClusterTree &BlosumCluster) const { // Build distance matrix DistFunc DF; unsigned uSeqCount = GetSeqCount(); DF.SetCount(uSeqCount); for (unsigned i = 0; i < uSeqCount; ++i) for (unsigned j = i+1; j < uSeqCount; ++j) { double dDist = GetPctIdentityPair(i, j); assert(dDist >= 0.0 && dDist <= 1.0); DF.SetDist(i, j, (float) (1.0 - dDist)); } // Cluster based on the distance function BlosumCluster.Create(DF); // Return value is HMMer's "effective sequence count". return SetBLOSUMNodeWeight(BlosumCluster.GetRoot(), 1.0 - BLOSUM_DIST); } setgscweights.cpp0000664000175000017500000001366312360262613012547 0ustar bobbob/*** Gerstein/Sonnhammer/Chothia ad hoc sequence weighting. The algorithm was deduced by reverse-engineering the HMMer code. I used an alternative representation that I prefer over HMMer's. The HMMer code is full of tree manipulations that do something to the left child and then the equivalent thing to the right child. It was clear that there must be a re-formulation that does everything once for each node, which would reduce the number of operations expressed in the code by a factor of two. This gives a more elegant and less error-prone way to code it. These notes explain the correspondence between my design and Eddy's. HMMer stores a data structure phylo_s for each non-leaf node in the cluster tree. This structure contains the following fields: diff Weight of the node lblen Left branch length rblen Right branch length The lblen and rblen branch lengths are calculated as: this.lblen = this.diff - left.diff this.rblen = this.diff - right.diff My code stores one ClusterNode data structure per node in the cluster tree, including leaves. I store only the weight. 
I can recover the HMMer branch length fields in a trivial O(1) calculation as follows: lblen = Node.GetWeight() - Node.GetLeft()->GetWeight() rblen = Node.GetWeight() - Node.GetRight()->GetWeight() For the GSC weights calculation, HMMer constructs the following vectors, which have entries for all nodes, including leaves: lwt Left weight rwt Right weight The "left weight" is calculated as the sum of the weights in all the nodes reachable through the left branch, including the node itself. (This is not immediately obvious from the code, which does the calculation using branch lengths rather than weights, but this is an equivalent, and to my mind clearer, statement of what they are). Similarly, the "right weight" is the sum of all weights reachable via the right branch. I define the "cluster weight" to be the summed weight of all nodes in the subtree under the node, including the node itself. I provide a function Node.GetClusterWeight() which calculates the cluster weight using a O(ln N) recursion through the tree. The lwt and rwt values can be recovered as follows: lwt = Node.GetLeft()->GetClusterWeight() + Node.GetWeight() rwt = Node.GetRight()->GetClusterWeight() + Node.GetWeight() HMMer calculates a further vector fwt as follows. this.fwt = parent.fwt * parent.lwt / (parent.lwt + parent.rwt) This applies to nodes reached via a left branch, for nodes reached via a right branch: this.fwt = parent.fwt * parent.rwt / (parent.lwt + parent.rwt) The values of fwt at the leaf nodes are the final GSC weights. We derive the various terms using our equivalents.
parent.lwt = Parent.GetLeft()->GetClusterWeight() + Parent.GetWeight() parent.rwt = Parent.GetRight()->GetClusterWeight() + Parent.GetWeight() parent.lwt + parent.rwt = { Parent.GetLeft()->GetClusterWeight() + Parent.GetRight()->GetClusterWeight() + Parent.GetWeight() } + Parent.GetWeight() We recognize the term {...} as the cluster weight of the parent, so parent.lwt + parent.rwt = Parent.GetClusterWeight() + Parent.GetWeight() As you would expect, repeating this exercise for parent.rwt gives exactly the same expression. The GSC weights (fwt) are stored in the Weight2 field of the cluster tree, the Weight field stores the original (BLOSUM) weights used as input to this algorithm. ***/ #include "muscle.h" #include "msa.h" #include "cluster.h" #include "distfunc.h" // Set weights of all sequences in the subtree under given node. void MSA::SetSubtreeWeight2(const ClusterNode *ptrNode) const { if (0 == ptrNode) return; const ClusterNode *ptrRight = ptrNode->GetRight(); const ClusterNode *ptrLeft = ptrNode->GetLeft(); // If leaf, set weight if (0 == ptrRight && 0 == ptrLeft) { unsigned uIndex = ptrNode->GetIndex(); double dWeight = ptrNode->GetWeight2(); WEIGHT w = DoubleToWeight(dWeight); m_Weights[uIndex] = w; return; } // Otherwise, recursively set subtrees SetSubtreeWeight2(ptrLeft); SetSubtreeWeight2(ptrRight); } void MSA::SetSubtreeGSCWeight(ClusterNode *ptrNode) const { if (0 == ptrNode) return; ClusterNode *ptrParent = ptrNode->GetParent(); double dParentWeight2 = ptrParent->GetWeight2(); double dParentClusterWeight = ptrParent->GetClusterWeight(); if (0.0 == dParentClusterWeight) { double dThisClusterSize = ptrNode->GetClusterSize(); double dParentClusterSize = ptrParent->GetClusterSize(); double dWeight2 = dParentWeight2*dThisClusterSize/dParentClusterSize; ptrNode->SetWeight2(dWeight2); } else { // Could cache cluster weights for better performance. // We calculate cluster weight of each node twice, so this // would give x2 improvement. 
// As weighting is not very expensive, we don't care. double dThisClusterWeight = ptrNode->GetClusterWeight(); double dParentWeight = ptrParent->GetWeight(); double dNum = dThisClusterWeight + dParentWeight; double dDenom = dParentClusterWeight + dParentWeight; double dWeight2 = dParentWeight2*(dNum/dDenom); ptrNode->SetWeight2(dWeight2); } SetSubtreeGSCWeight(ptrNode->GetLeft()); SetSubtreeGSCWeight(ptrNode->GetRight()); } void MSA::SetGSCWeights() const { ClusterTree CT; CalcBLOSUMWeights(CT); // Calculate weights and store in tree. ClusterNode *ptrRoot = CT.GetRoot(); ptrRoot->SetWeight2(1.0); SetSubtreeGSCWeight(ptrRoot->GetLeft()); SetSubtreeGSCWeight(ptrRoot->GetRight()); // Copy weights from tree to MSA. SetSubtreeWeight2(ptrRoot); } void MSA::ListWeights() const { const unsigned uSeqCount = GetSeqCount(); Log("Weights:\n"); WEIGHT wTotal = 0; for (unsigned n = 0; n < uSeqCount; ++n) { wTotal += GetSeqWeight(n); Log("%6.3f %s\n", GetSeqWeight(n), GetSeqName(n)); } Log("Total weights = %6.3f, should be 1.0\n", wTotal); } setnewhandler.cpp0000664000175000017500000000114412360262613012516 0ustar bobbob#include "muscle.h" #include #include const int ONE_MB = 1024*1024; const size_t RESERVE_BYTES = 8*ONE_MB; static void *EmergencyReserve = 0; void OnOutOfMemory() { free(EmergencyReserve); fprintf(stderr, "\n*** OUT OF MEMORY ***\n"); fprintf(stderr, "Memory allocated so far %g MB\n", GetMemUseMB()); extern MSA *ptrBestMSA; if (ptrBestMSA == 0) fprintf(stderr, "No alignment generated\n"); else SaveCurrentAlignment(); exit(EXIT_FatalError); } void SetNewHandler() { EmergencyReserve = malloc(RESERVE_BYTES); std::set_new_handler(OnOutOfMemory); } spfast.cpp0000664000175000017500000001503412360262614011157 0ustar bobbob#include "muscle.h" #include "profile.h" #define TRACE 0 enum { LL = 0, LG = 1, GL = 2, GG = 3, }; static const char *GapTypeToStr(int GapType) { switch (GapType) { case LL: return "LL"; case LG: return "LG"; case GL: return "GL"; case GG: return "GG"; 
} Quit("Invalid gap type"); return "?"; } static SCORE GapScoreMatrix[4][4]; static void InitGapScoreMatrix() { const SCORE t = (SCORE) 0.2; GapScoreMatrix[LL][LL] = 0; GapScoreMatrix[LL][LG] = g_scoreGapOpen; GapScoreMatrix[LL][GL] = 0; GapScoreMatrix[LL][GG] = 0; GapScoreMatrix[LG][LL] = g_scoreGapOpen; GapScoreMatrix[LG][LG] = 0; GapScoreMatrix[LG][GL] = g_scoreGapOpen; GapScoreMatrix[LG][GG] = t*g_scoreGapOpen; // approximation! GapScoreMatrix[GL][LL] = 0; GapScoreMatrix[GL][LG] = g_scoreGapOpen; GapScoreMatrix[GL][GL] = 0; GapScoreMatrix[GL][GG] = 0; GapScoreMatrix[GG][LL] = 0; GapScoreMatrix[GG][LG] = t*g_scoreGapOpen; // approximation! GapScoreMatrix[GG][GL] = 0; GapScoreMatrix[GG][GG] = 0; for (int i = 0; i < 4; ++i) for (int j = 0; j < i; ++j) if (GapScoreMatrix[i][j] != GapScoreMatrix[j][i]) Quit("GapScoreMatrix not symmetrical"); } static SCORE SPColBrute(const MSA &msa, unsigned uColIndex) { SCORE Sum = 0; const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); unsigned uLetter1 = msa.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter1 >= 20) continue; for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) { const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); unsigned uLetter2 = msa.GetLetterEx(uSeqIndex2, uColIndex); if (uLetter2 >= 20) continue; SCORE t = w1*w2*(*g_ptrScoreMatrix)[uLetter1][uLetter2]; #if TRACE Log("Check %c %c w1=%.3g w2=%.3g Mx=%.3g t=%.3g\n", LetterToCharAmino(uLetter1), LetterToCharAmino(uLetter2), w1, w2, (*g_ptrScoreMatrix)[uLetter1][uLetter2], t); #endif Sum += t; } } return Sum; } static SCORE SPGapFreqs(const FCOUNT Freqs[]) { #if TRACE Log("Freqs="); for (unsigned i = 0; i < 4; ++i) if (Freqs[i] != 0) Log(" %s=%.3g", GapTypeToStr(i), Freqs[i]); Log("\n"); #endif SCORE TotalOffDiag = 0; SCORE TotalDiag = 0; for (unsigned i = 0; i < 4; ++i) { const FCOUNT fi = Freqs[i]; if (0 == fi) continue; const float *Row = 
GapScoreMatrix[i]; SCORE diagt = fi*fi*Row[i]; TotalDiag += diagt; #if TRACE Log("SPFGaps %s %s + Mx=%.3g TotalDiag += %.3g\n", GapTypeToStr(i), GapTypeToStr(i), Row[i], diagt); #endif SCORE Sum = 0; for (unsigned j = 0; j < i; ++j) { SCORE t = Freqs[j]*Row[j]; #if TRACE if (Freqs[j] != 0) Log("SPFGaps %s %s + Mx=%.3g Sum += %.3g\n", GapTypeToStr(i), GapTypeToStr(j), Row[j], fi*t); #endif Sum += t; } TotalOffDiag += fi*Sum; } #if TRACE Log("SPFGap TotalOffDiag=%.3g + TotalDiag=%.3g = %.3g\n", TotalOffDiag, TotalDiag, TotalOffDiag + TotalDiag); #endif return TotalOffDiag*2 + TotalDiag; } static SCORE SPFreqs(const FCOUNT Freqs[]) { #if TRACE Log("Freqs="); for (unsigned i = 0; i < 20; ++i) if (Freqs[i] != 0) Log(" %c=%.3g", LetterToCharAmino(i), Freqs[i]); Log("\n"); #endif SCORE TotalOffDiag = 0; SCORE TotalDiag = 0; for (unsigned i = 0; i < 20; ++i) { const FCOUNT fi = Freqs[i]; if (0 == fi) continue; const float *Row = (*g_ptrScoreMatrix)[i]; SCORE diagt = fi*fi*Row[i]; TotalDiag += diagt; #if TRACE Log("SPF %c %c + Mx=%.3g TotalDiag += %.3g\n", LetterToCharAmino(i), LetterToCharAmino(i), Row[i], diagt); #endif SCORE Sum = 0; for (unsigned j = 0; j < i; ++j) { SCORE t = Freqs[j]*Row[j]; #if TRACE if (Freqs[j] != 0) Log("SPF %c %c + Mx=%.3g Sum += %.3g\n", LetterToCharAmino(i), LetterToCharAmino(j), Row[j], fi*t); #endif Sum += t; } TotalOffDiag += fi*Sum; } #if TRACE Log("SPF TotalOffDiag=%.3g + TotalDiag=%.3g = %.3g\n", TotalOffDiag, TotalDiag, TotalOffDiag + TotalDiag); #endif return TotalOffDiag*2 + TotalDiag; } static SCORE ObjScoreSPCol(const MSA &msa, unsigned uColIndex) { FCOUNT Freqs[20]; FCOUNT GapFreqs[4]; memset(Freqs, 0, sizeof(Freqs)); memset(GapFreqs, 0, sizeof(GapFreqs)); const unsigned uSeqCount = msa.GetSeqCount(); #if TRACE Log("Weights="); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) Log(" %u=%.3g", uSeqIndex, msa.GetSeqWeight(uSeqIndex)); Log("\n"); #endif SCORE SelfOverCount = 0; SCORE GapSelfOverCount = 0; for (unsigned 
uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { WEIGHT w = msa.GetSeqWeight(uSeqIndex); bool bGapThisCol = msa.IsGap(uSeqIndex, uColIndex); bool bGapPrevCol = (uColIndex == 0 ? false : msa.IsGap(uSeqIndex, uColIndex - 1)); int GapType = bGapThisCol + 2*bGapPrevCol; assert(GapType >= 0 && GapType < 4); GapFreqs[GapType] += w; SCORE gapt = w*w*GapScoreMatrix[GapType][GapType]; GapSelfOverCount += gapt; if (bGapThisCol) continue; unsigned uLetter = msa.GetLetterEx(uSeqIndex, uColIndex); if (uLetter >= 20) continue; Freqs[uLetter] += w; SCORE t = w*w*(*g_ptrScoreMatrix)[uLetter][uLetter]; #if TRACE Log("FastCol compute freqs & SelfOverCount %c w=%.3g M=%.3g SelfOverCount += %.3g\n", LetterToCharAmino(uLetter), w, (*g_ptrScoreMatrix)[uLetter][uLetter], t); #endif SelfOverCount += t; } SCORE SPF = SPFreqs(Freqs); SCORE Col = SPF - SelfOverCount; SCORE SPFGaps = SPGapFreqs(GapFreqs); SCORE ColGaps = SPFGaps - GapSelfOverCount; #if TRACE Log("SPF=%.3g - SelfOverCount=%.3g = %.3g\n", SPF, SelfOverCount, Col); Log("SPFGaps=%.3g - GapsSelfOverCount=%.3g = %.3g\n", SPFGaps, GapSelfOverCount, ColGaps); #endif return Col + ColGaps; } SCORE ObjScoreSPDimer(const MSA &msa) { static bool bGapScoreMatrixInit = false; if (!bGapScoreMatrixInit) InitGapScoreMatrix(); SCORE Total = 0; const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { SCORE Col = ObjScoreSPCol(msa, uColIndex); #if TRACE { SCORE ColCheck = SPColBrute(msa, uColIndex); Log("FastCol=%.3g CheckCol=%.3g\n", Col, ColCheck); } #endif Total += Col; } #if TRACE Log("Total/2 = %.3g (final result from fast)\n", Total/2); #endif return Total/2; } sptest.cpp0000664000175000017500000000740512360262614011204 0ustar bobbob#include "muscle.h" #include "objscore.h" #include "msa.h" #include "textfile.h" #include "pwpath.h" const unsigned INDELS = 1; static void GetPos(const char Str[], unsigned L, int *pi1, int *pi2) { int i1; 
for (;;) { i1 = rand()%(L-2) + 1; if (Str[i1] == 'M') break; } int i2; for (;;) { i2 = rand()%(L-2) + 1; if (i1 != i2 && Str[i2] == 'M') break; } *pi1 = i1; *pi2 = i2; } static void MakePath(unsigned uSeqLength, unsigned uIndelCount, char Str[]) { unsigned uPathLength = uSeqLength + uIndelCount; for (unsigned i = 0; i < uPathLength; ++i) Str[i] = 'M'; for (unsigned i = 0; i < uIndelCount; ++i) { int i1, i2; GetPos(Str, uPathLength, &i1, &i2); Str[i1] = 'D'; Str[i2] = 'I'; } Str[uPathLength] = 0; Log("MakePath=%s\n", Str); } void SPTest() { SetPPScore(PPSCORE_SV); SetListFileName("c:\\tmp\\muscle.log", false); TextFile file1("c:\\tmp\\msa1.afa"); TextFile file2("c:\\tmp\\msa2.afa"); MSA msa1; MSA msa2; msa1.FromFile(file1); msa2.FromFile(file2); Log("msa1=\n"); msa1.LogMe(); Log("msa2=\n"); msa2.LogMe(); const unsigned uColCount = msa1.GetColCount(); if (msa2.GetColCount() != uColCount) Quit("Different lengths"); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); const unsigned uSeqCount = uSeqCount1 + uSeqCount2; MSA::SetIdCount(uSeqCount); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) { msa1.SetSeqWeight(uSeqIndex1, 1.0); msa1.SetSeqId(uSeqIndex1, uSeqIndex1); } for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) { msa2.SetSeqWeight(uSeqIndex2, 1.0); msa2.SetSeqId(uSeqIndex2, uSeqCount1 + uSeqIndex2); } MSA alnA; MSA alnB; char strPathA[1024]; char strPathB[1024]; MakePath(uColCount, INDELS, strPathA); MakePath(uColCount, INDELS, strPathB); PWPath PathA; PWPath PathB; PathA.FromStr(strPathA); PathB.FromStr(strPathB); Log("PathA=\n"); PathA.LogMe(); Log("PathB=\n"); PathB.LogMe(); AlignTwoMSAsGivenPath(PathA, msa1, msa2, alnA); AlignTwoMSAsGivenPath(PathB, msa1, msa2, alnB); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { alnA.SetSeqWeight(uSeqIndex, 1.0); alnB.SetSeqWeight(uSeqIndex, 1.0); } unsigned Seqs1[1024]; unsigned Seqs2[1024]; for (unsigned uSeqIndex1 = 
0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) Seqs1[uSeqIndex1] = uSeqIndex1; for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) Seqs2[uSeqIndex2] = uSeqCount1 + uSeqIndex2; MSA msaA1; MSA msaA2; MSA msaB1; MSA msaB2; MSAFromSeqSubset(alnA, Seqs1, uSeqCount1, msaA1); MSAFromSeqSubset(alnB, Seqs1, uSeqCount1, msaB1); MSAFromSeqSubset(alnA, Seqs2, uSeqCount2, msaA2); MSAFromSeqSubset(alnB, Seqs2, uSeqCount2, msaB2); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) { msaA1.SetSeqWeight(uSeqIndex1, 1.0); msaB1.SetSeqWeight(uSeqIndex1, 1.0); } for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) { msaA2.SetSeqWeight(uSeqIndex2, 1.0); msaB2.SetSeqWeight(uSeqIndex2, 1.0); } Log("msaA1=\n"); msaA1.LogMe(); Log("msaB1=\n"); msaB1.LogMe(); Log("msaA2=\n"); msaA2.LogMe(); Log("msaB2=\n"); msaB2.LogMe(); Log("alnA=\n"); alnA.LogMe(); Log("AlnB=\n"); alnB.LogMe(); Log("\nSPA\n---\n"); SCORE SPA = ObjScoreSP(alnA); Log("\nSPB\n---\n"); SCORE SPB = ObjScoreSP(alnB); Log("\nXPA\n---\n"); SCORE XPA = ObjScoreXP(msaA1, msaA2); Log("\nXPB\n---\n"); SCORE XPB = ObjScoreXP(msaB1, msaB2); Log("SPA=%.4g SPB=%.4g Diff=%.4g\n", SPA, SPB, SPA - SPB); Log("XPA=%.4g XPB=%.4g Diff=%.4g\n", XPA, XPB, XPA - XPB); } stabilize.cpp0000664000175000017500000000115512360262614011644 0ustar bobbob#include "muscle.h" #include "msa.h" void Stabilize(const MSA &msa, MSA &msaStable) { const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); msaStable.SetSize(uSeqCount, uColCount); for (unsigned uId = 0; uId < uSeqCount; ++uId) { const unsigned uSeqIndex = msa.GetSeqIndex(uId); msaStable.SetSeqName(uId, msa.GetSeqName(uSeqIndex)); msaStable.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msa.GetChar(uSeqIndex, uColIndex); msaStable.SetChar(uId, uColIndex, c); } } } subfam.cpp0000664000175000017500000002314512360262614011136 0ustar bobbob#include "muscle.h" #include 
"tree.h" #include "textfile.h" // for test only #include "msa.h" #include "seqvect.h" #include "profile.h" #ifndef _MSC_VER #include // for unlink #endif #define TRACE 0 /*** Find subfamilies from tree by following criteria: (a) number of leaves <= max, (b) is monophyletic, i.e. most recent common ancestor is parent of no more than one subfamily. ***/ static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount, unsigned SubFams[], unsigned &uSubFamCount) { if (tree.IsLeaf(uNodeIndex)) return 1; unsigned uLeft = tree.GetLeft(uNodeIndex); unsigned uRight = tree.GetRight(uNodeIndex); unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount); unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount); unsigned uLeafCount = uLeftCount + uRightCount; if (uLeftCount + uRightCount > uMaxLeafCount) { if (uLeftCount <= uMaxLeafCount) SubFams[uSubFamCount++] = uLeft; if (uRightCount <= uMaxLeafCount) SubFams[uSubFamCount++] = uRight; } else if (tree.IsRoot(uNodeIndex)) { if (uSubFamCount != 0) Quit("Error in SubFamRecurse"); SubFams[uSubFamCount++] = uNodeIndex; } return uLeafCount; } void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount) { *ptruSubFamCount = 0; SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount); #if TRACE { Log("\n"); Log("Tree:\n"); tree.LogMe(); //void DrawTree(const Tree &tree); //DrawTree(tree); Log("\n"); Log("%d subfams:\n", *ptruSubFamCount); for (unsigned i = 0; i < *ptruSubFamCount; ++i) Log(" %d=%d", i, SubFams[i]); Log("\n"); } #endif } //unsigned SubFams[9999]; //unsigned uSubFamCount; // //static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex) // { // const unsigned uRoot = tree.GetRootNodeIndex(); // unsigned uDist = 0; // while (uNodeIndex != uRoot) // { // ++uDist; // uNodeIndex = tree.GetParent(uNodeIndex); // } // return uDist; // } // //static void 
DrawNode(const Tree &tree, unsigned uNodeIndex) // { // if (!tree.IsLeaf(uNodeIndex)) // DrawNode(tree, tree.GetLeft(uNodeIndex)); // // unsigned uDist = DistFromRoot(tree, uNodeIndex); // for (unsigned i = 0; i < 5*uDist; ++i) // Log(" "); // Log("%d", uNodeIndex); // for (unsigned i = 0; i < uSubFamCount; ++i) // if (uNodeIndex == SubFams[i]) // { // Log("*"); // break; // } // Log("\n"); // // if (!tree.IsLeaf(uNodeIndex)) // DrawNode(tree, tree.GetRight(uNodeIndex)); // } // //static void DrawTree(const Tree &tree) // { // unsigned uRoot = tree.GetRootNodeIndex(); // DrawNode(tree, uRoot); // } // //void TestSubFams(const char *FileName) // { // Tree tree; // TextFile f(FileName); // tree.FromFile(f); // SubFam(tree, 5, SubFams, &uSubFamCount); // DrawTree(tree); // } static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[]) { if (tree.IsLeaf(uNodeIndex)) return; unsigned uLeft = tree.GetLeft(uNodeIndex); unsigned uRight = tree.GetRight(uNodeIndex); NodeInSubFam[uLeft] = true; NodeInSubFam[uRight] = true; SetInFam(tree, uLeft, NodeInSubFam); SetInFam(tree, uRight, NodeInSubFam); } void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex, MSA &msaOut) { const unsigned uSeqCount = vAll.GetSeqCount(); const char *InTmp = "asf_in.tmp"; const char *OutTmp = "asf_out.tmp"; unsigned *Leaves = new unsigned[uSeqCount]; unsigned uLeafCount; GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount); SeqVect v; for (unsigned i = 0; i < uLeafCount; ++i) { unsigned uLeafNodeIndex = Leaves[i]; unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex); Seq &s = vAll.GetSeqById(uId); v.AppendSeq(s); } #if TRACE { Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount); for (unsigned i = 0; i < uLeafCount; ++i) Log(" %s", v.GetSeqName(i)); Log("\n"); } #endif TextFile fIn(InTmp, true); v.ToFASTAFile(fIn); fIn.Close(); char CmdLine[4096]; sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp); // sprintf(CmdLine, "muscle -in %s -out 
%s -maxiters 1", InTmp, OutTmp); int NotUsed = system(CmdLine); TextFile fOut(OutTmp); msaOut.FromFile(fOut); for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex) { const char *Name = msaOut.GetSeqName(uSeqIndex); unsigned uId = vAll.GetSeqIdFromName(Name); msaOut.SetSeqId(uSeqIndex, uId); } unlink(InTmp); unlink(OutTmp); delete[] Leaves; } void ProgAlignSubFams() { MSA msaOut; SetOutputFileName(g_pstrOutFileName); SetInputFileName(g_pstrInFileName); SetMaxIters(g_uMaxIters); SetSeqWeightMethod(g_SeqWeight1); TextFile fileIn(g_pstrInFileName); SeqVect v; v.FromFASTAFile(fileIn); const unsigned uSeqCount = v.Length(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = v.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); v.FixAlpha(); PTR_SCOREMATRIX UserMatrix = 0; if (0 != g_pstrMatrixFileName) { const char *FileName = g_pstrMatrixFileName; const char *Path = getenv("MUSCLE_MXPATH"); if (Path != 0) { size_t n = strlen(Path) + 1 + strlen(FileName) + 1; char *NewFileName = new char[n]; sprintf(NewFileName, "%s/%s", Path, FileName); FileName = NewFileName; } TextFile File(FileName); UserMatrix = ReadMx(File); g_Alpha = ALPHA_Amino; g_PPScore = PPSCORE_SP; } SetPPScore(); if (0 != UserMatrix) g_ptrScoreMatrix = UserMatrix; if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) { SetPPScore(PPSCORE_SPN); g_Distance1 = DISTANCE_Kmer4_6; } unsigned uMinL = 0; unsigned uMaxL = 0; unsigned uTotL = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned L = v.GetSeq(uSeqIndex).Length(); uTotL += L; if (uMinL == 0 || L < uMinL) uMinL = L; if (L > uMaxL) uMaxL = L; } SetIter(1); g_bDiags = g_bDiags1; SetSeqStats(uSeqCount, uMinL, uMaxL, uTotL/uSeqCount); SetMuscleSeqVect(v); MSA::SetIdCount(uSeqCount); // 
Initialize sequence ids. // From this point on, ids must somehow propogate from here. for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) v.SetSeqId(uSeqIndex, uSeqIndex); if (uSeqCount > 1) MHackStart(v); if (0 == uSeqCount) { msaOut.Clear(); return; } if (1 == uSeqCount && ALPHA_Amino == Alpha) { const Seq &s = v.GetSeq(0); msaOut.FromSeq(s); return; } Tree GuideTree; TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1); SetMuscleTree(GuideTree); MSA msa; if (g_bLow) { ProgNode *ProgNodes = 0; ProgNodes = ProgressiveAlignE(v, GuideTree, msa); delete[] ProgNodes; } else ProgressiveAlign(v, GuideTree, msa); SetCurrentAlignment(msa); TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2); SetMuscleTree(GuideTree); unsigned *SubFams = new unsigned[uSeqCount]; unsigned uSubFamCount; SubFam(GuideTree, g_uMaxSubFamCount, SubFams, &uSubFamCount); SetProgressDesc("Align node"); const unsigned uNodeCount = 2*uSeqCount - 1; ProgNode *ProgNodes = new ProgNode[uNodeCount]; bool *NodeIsSubFam = new bool[uNodeCount]; bool *NodeInSubFam = new bool[uNodeCount]; for (unsigned i = 0; i < uNodeCount; ++i) { NodeIsSubFam[i] = false; NodeInSubFam[i] = false; } for (unsigned i = 0; i < uSubFamCount; ++i) { unsigned uNodeIndex = SubFams[i]; assert(uNodeIndex < uNodeCount); NodeIsSubFam[uNodeIndex] = true; SetInFam(GuideTree, uNodeIndex, NodeInSubFam); } unsigned uJoin = 0; unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); do { if (NodeIsSubFam[uTreeNodeIndex]) { #if TRACE Log("Node %d: align subfam\n", uTreeNodeIndex); #endif ProgNode &Node = ProgNodes[uTreeNodeIndex]; AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA); Node.m_uLength = Node.m_MSA.GetColCount(); } else if (!NodeInSubFam[uTreeNodeIndex]) { #if TRACE Log("Node %d: align two subfams\n", uTreeNodeIndex); #endif Progress(uJoin, uSubFamCount - 1); ++uJoin; const unsigned uMergeNodeIndex = uTreeNodeIndex; ProgNode &Parent = ProgNodes[uMergeNodeIndex]; const unsigned uLeft = 
GuideTree.GetLeft(uTreeNodeIndex); const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); ProgNode &Node1 = ProgNodes[uLeft]; ProgNode &Node2 = ProgNodes[uRight]; PWPath Path; AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path); Parent.m_uLength = Parent.m_MSA.GetColCount(); Node1.m_MSA.Clear(); Node2.m_MSA.Clear(); } else { #if TRACE Log("Node %d: in subfam\n", uTreeNodeIndex); #endif ; } uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); ProgressStepsDone(); unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; TextFile fOut(g_pstrOutFileName, true); MHackEnd(RootProgNode.m_MSA); RootProgNode.m_MSA.ToFile(fOut); delete[] NodeInSubFam; delete[] NodeIsSubFam; delete[] ProgNodes; delete[] SubFams; ProgNodes = 0; NodeInSubFam = 0; NodeIsSubFam = 0; SubFams = 0; } subfams.cpp0000664000175000017500000000254512360262614011322 0ustar bobbob#include "muscle.h" #include "distfunc.h" const float INFINITY = float(1e29); const unsigned NILL = uInsane; static float *ShortestPathEstimate; static unsigned *Predecessor; static void GetMostDistantPair(DistFunc &DF, unsigned *ptrIndex1, unsigned *ptrIndex2) { const unsigned uNodeCount = DF.GetCount(); if (uNodeCount < 2) Quit("GetMostDistantPair: < 2 seqs"); float MaxDist = -1; unsigned Index1 = uInsane; unsigned Index2 = uInsane; for (unsigned i = 0; i < uNodeCount; ++i) { for (unsigned j = i + 1; j < uNodeCount; ++j) { float d = DF.GetDist(i, j); if (d > MaxDist) { MaxDist = d; Index1 = i; Index2 = j; } } } assert(Index1 != uInsane); assert(Index2 != uInsane); *ptrIndex1 = Index1; *ptrIndex2 = Index2; } static void InitializeSingleSource(DistFunc &DF, unsigned uIndex) { const unsigned uNodeCount = 0; for (unsigned i = 0; i < uNodeCount; ++i) { ShortestPathEstimate[i] = INFINITY; Predecessor[i] = NILL; } ShortestPathEstimate[uIndex] = 0; } static void Relax(DistFunc &DF, unsigned u, unsigned v) { float w = 
DF.GetDist(u, v); float d = ShortestPathEstimate[u] + w; if (ShortestPathEstimate[v] > d) { ShortestPathEstimate[v] = d; Predecessor[v] = u; } } void ShortestPath(DistFunc &DF, unsigned uIndex) { } sw.cpp0000664000175000017500000001314712360262614010313 0ustar bobbob#include "muscle.h" #include #include "pwpath.h" #include "profile.h" #include // Textbook Smith-Waterman affine gap implementation. #define TRACE 0 static const char *LocalScoreToStr(SCORE s) { static char str[16]; if (MINUS_INFINITY == s) return " *"; sprintf(str, "%6.2f", s); return str; } static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, unsigned uPrefixCountA, unsigned uPrefixCountB) { Log(" "); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { char c = ' '; if (uPrefixLengthB > 0) c = ConsensusChar(PB[uPrefixLengthB - 1]); Log(" %4u:%c", uPrefixLengthB, c); } Log("\n"); for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { char c = ' '; if (uPrefixLengthA > 0) c = ConsensusChar(PA[uPrefixLengthA - 1]); Log("%4u:%c ", uPrefixLengthA, c); for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); Log("\n"); } } SCORE SW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; // Allocate DP matrices const size_t LM = uPrefixCountA*uPrefixCountB; SCORE *DPM_ = new SCORE[LM]; SCORE *DPD_ = new SCORE[LM]; SCORE *DPI_ = new SCORE[LM]; DPM(0, 0) = 0; DPD(0, 0) = MINUS_INFINITY; DPI(0, 0) = MINUS_INFINITY; DPM(1, 0) = MINUS_INFINITY; DPD(1, 0) = MINUS_INFINITY; DPI(1, 0) = MINUS_INFINITY; DPM(0, 1) = MINUS_INFINITY; DPD(0, 1) = MINUS_INFINITY; DPI(0, 1) = MINUS_INFINITY; // Empty prefix of B is special case for (unsigned uPrefixLengthA = 2; uPrefixLengthA < 
uPrefixCountA; ++uPrefixLengthA) { // M=LetterA+LetterB, impossible with empty prefix DPM(uPrefixLengthA, 0) = MINUS_INFINITY; // D=LetterA+GapB, never optimal in local alignment with gap penalties DPD(uPrefixLengthA, 0) = MINUS_INFINITY; // I=GapA+LetterB, impossible with empty prefix DPI(uPrefixLengthA, 0) = MINUS_INFINITY; } // Empty prefix of A is special case for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { // M=LetterA+LetterB, impossible with empty prefix DPM(0, uPrefixLengthB) = MINUS_INFINITY; // D=LetterA+GapB, impossible with empty prefix DPD(0, uPrefixLengthB) = MINUS_INFINITY; // I=GapA+LetterB, never optimal in local alignment with gap penalties DPI(0, uPrefixLengthB) = MINUS_INFINITY; } SCORE scoreMax = MINUS_INFINITY; unsigned uPrefixLengthAMax = uInsane; unsigned uPrefixLengthBMax = uInsane; // ============ // Main DP loop // ============ SCORE scoreGapCloseB = MINUS_INFINITY; for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreGapCloseA = MINUS_INFINITY; for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; { // Match M=LetterA+LetterB SCORE scoreLL = ScoreProfPos2(PPA, PPB); SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1); SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA; SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB; SCORE scoreBest; if (scoreMM >= scoreDM && scoreMM >= scoreIM) scoreBest = scoreMM; else if (scoreDM >= scoreMM && scoreDM >= scoreIM) scoreBest = scoreDM; else { assert(scoreIM >= scoreMM && scoreIM >= scoreDM); scoreBest = scoreIM; } if (scoreBest < 0) scoreBest = 0; scoreBest += scoreLL; if (scoreBest > scoreMax) { scoreMax = scoreBest; uPrefixLengthAMax = uPrefixLengthA; uPrefixLengthBMax = uPrefixLengthB; } DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest; } { // Delete 
D=LetterA+GapB SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen; SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB); SCORE scoreBest; if (scoreMD >= scoreDD) scoreBest = scoreMD; else { assert(scoreDD >= scoreMD); scoreBest = scoreDD; } DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert I=GapA+LetterB { SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB - 1].m_scoreGapOpen; SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1); SCORE scoreBest; if (scoreMI >= scoreII) scoreBest = scoreMI; else { assert(scoreII > scoreMI); scoreBest = scoreII; } DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; } scoreGapCloseA = PPA.m_scoreGapClose; } scoreGapCloseB = PPB.m_scoreGapClose; } #if TRACE Log("DPM:\n"); ListDP(DPM_, PA, PB, uPrefixLengthA, uPrefixLengthB); Log("DPD:\n"); ListDP(DPD_, PA, PB, uPrefixLengthA, uPrefixLengthB); Log("DPI:\n"); ListDP(DPI_, PA, PB, uPrefixLengthA, uPrefixLengthB); #endif assert(scoreMax == DPM(uPrefixLengthAMax, uPrefixLengthBMax)); TraceBackSW(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_, uPrefixLengthAMax, uPrefixLengthBMax, Path); #if TRACE SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path); Path.LogMe(); Log("Score = %s Path = %s\n", LocalScoreToStr(scoreMax), LocalScoreToStr(scorePath)); #endif delete[] DPM_; delete[] DPD_; delete[] DPI_; return scoreMax; } termgaps.cpp0000664000175000017500000000143212360262614011476 0ustar bobbob#include "muscle.h" #include "profile.h" void SetTermGaps(const ProfPos *Prof, unsigned uLength) { if (0 == uLength) return; ProfPos *First = (ProfPos *) Prof; ProfPos *Last = (ProfPos *) (Prof + uLength - 1); switch (g_TermGaps) { case TERMGAPS_Full: break; case TERMGAPS_Half: // -infinity check for lock left/right if (First->m_scoreGapOpen != MINUS_INFINITY) First->m_scoreGapOpen = 0; if (uLength > 1 && Last->m_scoreGapClose != MINUS_INFINITY) Last->m_scoreGapClose = 0; case TERMGAPS_Ext: if (First->m_scoreGapOpen != 
MINUS_INFINITY) First->m_scoreGapOpen *= -1; if (uLength > 1 && Last->m_scoreGapClose != MINUS_INFINITY) Last->m_scoreGapClose *= -1; break; default: Quit("Invalid g_TermGaps"); } } textfile.cpp0000664000175000017500000001507512360262614011510 0ustar bobbob#include "muscle.h" #include "textfile.h" #include TextFile::TextFile(const char szFileName[], bool bWrite) { FILE *ptrFile = 0; if (bWrite) { if (0 == strcmp(szFileName, "-")) ptrFile = stdout; else ptrFile = fopen(szFileName, "wb"); } else { if (0 == strcmp(szFileName, "-")) ptrFile = stdin; else ptrFile = fopen(szFileName, "rb"); } if (0 == ptrFile) Quit("Cannot open '%s' errno=%d\n", szFileName, errno); Init(ptrFile, szFileName); } void TextFile::Init(FILE *ptrFile, const char *ptrFileName) { m_ptrFile = ptrFile; m_ptrName = strdup(ptrFileName); m_uLineNr = 1; m_uColNr = 0; m_bLastCharWasEOL = true; m_cPushedBack = -1; #if DEBUG setbuf(m_ptrFile, 0); #endif } TextFile::TextFile(FILE *ptrFile, const char *ptrFileName) { Init(ptrFile, "-"); } TextFile::~TextFile() { if (m_ptrFile && m_ptrFile != stdin && m_ptrFile != stdout && m_ptrFile != stderr) fclose(m_ptrFile); free(m_ptrName); } // Get line from file. // Return true if end-of-file, quit if line too long. bool TextFile::GetLine(char szLine[], unsigned uBytes) { if (0 == uBytes) Quit("TextFile::GetLine, buffer zero size"); int FillVal = 0; // suppress warning from gcc that I don't understand memset(szLine, FillVal, (size_t) uBytes); unsigned uBytesCopied = 0; // Loop until end of line or end of file. 
for (;;) { char c; bool bEof = GetChar(c); if (bEof) return true; if ('\r' == c) continue; if ('\n' == c) return false; if (uBytesCopied < uBytes - 1) szLine[uBytesCopied++] = (char) c; else Quit("TextFile::GetLine: input buffer too small, line %u", m_uLineNr); } } // As GetLine, but trim leading and trailing blanks; skip empty lines bool TextFile::GetTrimLine(char szLine[], unsigned uBytes) { if (uBytes == 0) Quit("GetTrimLine"); for (;;) { bool bEOF = GetLine(szLine, uBytes); if (bEOF) return true; TrimBlanks(szLine); if (0 != szLine[0]) break; } return false; } void TextFile::Rewind() { fseek(m_ptrFile, 0, SEEK_SET); m_uLineNr = 1; m_bLastCharWasEOL = true; } void TextFile::PutChar(char c) { int i = fputc(c, m_ptrFile); assert(i == c); if ('\n' == c) { ++m_uLineNr; m_uColNr = 1; } else ++m_uColNr; } void TextFile::PutString(const char szLine[]) { int iError = fputs(szLine, m_ptrFile); assert(iError >= 0); } void TextFile::PutFormat(const char szFormat[], ...) { char szStr[4096]; va_list ArgList; va_start(ArgList, szFormat); vsprintf(szStr, szFormat, ArgList); PutString(szStr); } void TextFile::GetLineX(char szLine[], unsigned uBytes) { if (uBytes == 0) Quit("GetLineX"); bool bEof = GetLine(szLine, uBytes); if (bEof) Quit("end-of-file in GetLineX"); } bool TextFile::GetToken(char szToken[], unsigned uBytes, const char szCharTokens[]) { // Skip leading white space char c; for (;;) { bool bEof = GetChar(c); if (bEof) return true; if (!isspace(c)) break; } // Check for special case single-character tokens if (0 != strchr(szCharTokens, c)) { assert(uBytes >= 2); szToken[0] = c; szToken[1] = 0; return false; } // Loop until token terminated by white space, EOF or special unsigned uBytesCopied = 0; for (;;) { if (uBytesCopied < uBytes - 1) szToken[uBytesCopied++] = c; else Quit("TextFile::GetToken: input buffer too small, line %u", m_uLineNr); bool bEof = GetChar(c); if (bEof) { szToken[uBytesCopied] = 0; return true; } // Check for special case single-character tokens 
if (0 != strchr(szCharTokens, c)) { PushBack(c); assert(uBytesCopied > 0 && uBytesCopied < uBytes); szToken[uBytesCopied] = 0; return false; } if (isspace(c)) { assert(uBytesCopied > 0 && uBytesCopied < uBytes); szToken[uBytesCopied] = 0; return false; } } } void TextFile::GetTokenX(char szToken[], unsigned uBytes, const char szCharTokens[]) { bool bEof = GetToken(szToken, uBytes, szCharTokens); if (bEof) Quit("End-of-file in GetTokenX"); } void TextFile::Skip() { for (;;) { char c; bool bEof = GetChar(c); if (bEof || '\n' == c) return; assert(isspace(c)); } } #ifdef _WIN32 TEXTFILEPOS TextFile::GetPos() { fpos_t p; int i = fgetpos(m_ptrFile, &p); assert(0 == i); assert(p >= 0); TEXTFILEPOS Pos; Pos.uOffset = (unsigned) p; Pos.uLineNr = m_uLineNr; Pos.uColNr = m_uColNr; return Pos; } void TextFile::SetPos(TEXTFILEPOS Pos) { fpos_t p = (fpos_t) Pos.uOffset; int i = fsetpos(m_ptrFile, &p); assert(0 == i); m_uLineNr = Pos.uLineNr; m_uColNr = Pos.uColNr; } #else TEXTFILEPOS TextFile::GetPos() { TEXTFILEPOS Pos; Pos.uOffset = ftell(m_ptrFile); Pos.uLineNr = m_uLineNr; Pos.uColNr = m_uColNr; return Pos; } void TextFile::SetPos(TEXTFILEPOS Pos) { fseek(m_ptrFile, Pos.uOffset, SEEK_SET); m_uLineNr = Pos.uLineNr; m_uColNr = Pos.uColNr; } #endif bool TextFile::GetChar(char &c) { if (-1 != m_cPushedBack) { c = (char) m_cPushedBack; m_cPushedBack = -1; return false; } int ic = fgetc(m_ptrFile); if (ic < 0) { if (feof(m_ptrFile)) { // Hack to fix up a non-empty text file that is missing // and end-of-line character in the last line. 
if (!m_bLastCharWasEOL && m_uLineNr > 0) { c = '\n'; m_bLastCharWasEOL = true; return false; } return true; } Quit("TextFile::GetChar, error %s", strerror(errno)); } c = (char) ic; if ('\n' == c) { m_bLastCharWasEOL = true; ++m_uLineNr; m_uColNr = 1; } else { m_bLastCharWasEOL = false; ++m_uColNr; } return false; } void TextFile::GetCharX(char &c) { bool bEof = GetChar(c); if (bEof) Quit("End-of-file in GetCharX"); } void TextFile::GetNonblankChar(char &c) { do { bool bEof = GetChar(c); if (bEof) Quit("End-of-file in GetCharX"); } while (isspace(c)); } void TextFile::SkipLine() { if (m_bLastCharWasEOL) return; for (;;) { char c; bool bEof = GetChar(c); if (bEof) Quit("End-of-file in SkipLine"); if ('\n' == c) break; } } void TextFile::SkipWhite() { bool bEof = SkipWhiteX(); if (bEof) Quit("End-of-file skipping white space"); } bool TextFile::SkipWhiteX() { for (;;) { char c; bool bEof = GetChar(c); if (bEof) return true; if (!isspace(c)) { PushBack(c); break; } } return false; } threewaywt.cpp0000664000175000017500000002426112360262613012063 0ustar bobbob#include "muscle.h" #include "tree.h" #include #define TRACE 0 /*** Sequence weights derived from a tree using Gotoh's three-way method. Gotoh (1995) CABIOS 11(5), 543-51. Each edge e is assigned a weight w(e). Consider first a tree with three leaves A,B and C having branch lengths a, b and c, as follows. B | b | A---a---R---c---C The internal node is denoted by R. Define: S = (ab + ca + ab) x = bc(a + b)(a + c) y = a(b + c)FS Here F is a tunable normalization factor which is approximately 1.0. Then the edge weight for AR is computed as: w(AR) = sqrt(x/y) Similar expressions for the other edges follow by symmetry. For a tree with more than three edges, the weight of an edge that ends in a leaf is computed from the three-way tree that includes the edge and its two neighbors. 
The weight of an internal edge is computed as the product of the weights for that edge derived from the two three-way subtrees that include that edge. For example, consider the following tree. B | A--R--V--C | D Here, w(RV) is computed as the product of the two values for w(RV) derived from the three-way trees with leaves ABV and RCD respectively. The calculation is done using "Gotoh lengths", not the real edge lengths. The Gotoh length G of a directed edge is calculated recursively as: G = d + LR/(L + R) where d is the length of the edge, and L and R are the Gotoh lengths of the left and right edges adjoining the terminal end of the edge. If the edge terminates on a leaf, then G=d. Pairwise sequence weights are computed as the product of edge weights on the path that connects their leaves. If the tree is split into two subtrees by deleting a given edge e, then the pairwise weights factorize. For operations on profiles formed from the two subtrees, it is possible to assign a weight to a sequence as the product of edge weights on a path from e to its leaf. ***/ // The xxxUnrooted functions present a rooted tree as // if it had been unrooted by deleting the root node. 
static unsigned GetFirstNeighborUnrooted(const Tree &tree, unsigned uNode1, unsigned uNode2) { if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) Quit("GetFirstNeighborUnrooted, should never be called with root"); if (!tree.IsEdge(uNode1, uNode2)) { if (!tree.IsRoot(tree.GetParent(uNode1)) || !tree.IsRoot(tree.GetParent(uNode2))) Quit("GetFirstNeighborUnrooted, not edge"); const unsigned uRoot = tree.GetRootNodeIndex(); return tree.GetFirstNeighbor(uNode1, uRoot); } unsigned uNeighbor = tree.GetFirstNeighbor(uNode1, uNode2); if (tree.IsRoot(uNeighbor)) return tree.GetFirstNeighbor(uNeighbor, uNode1); return uNeighbor; } static unsigned GetSecondNeighborUnrooted(const Tree &tree, unsigned uNode1, unsigned uNode2) { if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) Quit("GetFirstNeighborUnrooted, should never be called with root"); if (!tree.IsEdge(uNode1, uNode2)) { if (!tree.IsRoot(tree.GetParent(uNode1)) || !tree.IsRoot(tree.GetParent(uNode2))) Quit("GetFirstNeighborUnrooted, not edge"); const unsigned uRoot = tree.GetRootNodeIndex(); return tree.GetSecondNeighbor(uNode1, uRoot); } unsigned uNeighbor = tree.GetSecondNeighbor(uNode1, uNode2); if (tree.IsRoot(uNeighbor)) return tree.GetFirstNeighbor(uNeighbor, uNode1); return uNeighbor; } static unsigned GetNeighborUnrooted(const Tree &tree, unsigned uNode1, unsigned uSub) { unsigned uNeighbor = tree.GetNeighbor(uNode1, uSub); if (tree.IsRoot(uNeighbor)) return tree.GetFirstNeighbor(uNeighbor, uNode1); return uNeighbor; } static unsigned GetNeighborSubscriptUnrooted(const Tree &tree, unsigned uNode1, unsigned uNode2) { if (tree.IsEdge(uNode1, uNode2)) return tree.GetNeighborSubscript(uNode1, uNode2); if (!tree.IsRoot(tree.GetParent(uNode1)) || !tree.IsRoot(tree.GetParent(uNode2))) Quit("GetNeighborSubscriptUnrooted, not edge"); for (unsigned uSub = 0; uSub < 3; ++uSub) if (GetNeighborUnrooted(tree, uNode1, uSub) == uNode2) return uSub; Quit("GetNeighborSubscriptUnrooted, not a neighbor"); return NULL_NEIGHBOR; } static 
double GetEdgeLengthUnrooted(const Tree &tree, unsigned uNode1, unsigned uNode2) { if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) Quit("GetEdgeLengthUnrooted, should never be called with root"); if (!tree.IsEdge(uNode1, uNode2)) { if (!tree.IsRoot(tree.GetParent(uNode1)) || !tree.IsRoot(tree.GetParent(uNode2))) Quit("GetEdgeLengthUnrooted, not edge"); const unsigned uRoot = tree.GetRootNodeIndex(); return tree.GetEdgeLength(uNode1, uRoot) + tree.GetEdgeLength(uNode2, uRoot); } return tree.GetEdgeLength(uNode1, uNode2); } double GetGotohLength(const Tree &tree, unsigned R, unsigned A) { double dThis = GetEdgeLengthUnrooted(tree, R, A); // Enforce non-negative edge lengths if (dThis < 0) dThis = 0; if (tree.IsLeaf(A)) return dThis; const unsigned uFirst = GetFirstNeighborUnrooted(tree, A, R); const unsigned uSecond = GetSecondNeighborUnrooted(tree, A, R); const double dFirst = GetGotohLength(tree, A, uFirst); const double dSecond = GetGotohLength(tree, A, uSecond); const double dSum = dFirst + dSecond; const double dThird = dSum == 0 ? 0 : (dFirst*dSecond)/dSum; return dThis + dThird; } // Return weight of edge A-R in three-way subtree that has // leaves A,B,C and internal node R. static double GotohWeightThreeWay(const Tree &tree, unsigned A, unsigned B, unsigned C, unsigned R) { const double F = 1.0; if (tree.IsLeaf(R)) Quit("GotohThreeWay: R must be internal node"); double a = GetGotohLength(tree, R, A); double b = GetGotohLength(tree, R, B); double c = GetGotohLength(tree, R, C); double S = b*c + c*a + a*b; double x = b*c*(a + b)*(a + c); double y = a*(b + c)*F*S; // y is zero iff all three branch lengths are zero. 
if (y < 0.001) return 1.0; return sqrt(x/y); } static double GotohWeightEdge(const Tree &tree, unsigned uNodeIndex1, unsigned uNodeIndex2) { double w1 = 1.0; double w2 = 1.0; if (!tree.IsLeaf(uNodeIndex1)) { unsigned R = uNodeIndex1; unsigned A = uNodeIndex2; unsigned B = GetFirstNeighborUnrooted(tree, R, A); unsigned C = GetSecondNeighborUnrooted(tree, R, A); w1 = GotohWeightThreeWay(tree, A, B, C, R); } if (!tree.IsLeaf(uNodeIndex2)) { unsigned R = uNodeIndex2; unsigned A = uNodeIndex1; unsigned B = GetFirstNeighborUnrooted(tree, R, A); unsigned C = GetSecondNeighborUnrooted(tree, R, A); w2 = GotohWeightThreeWay(tree, A, B, C, R); } return w1*w2; } void CalcThreeWayEdgeWeights(const Tree &tree, WEIGHT **EdgeWeights) { const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex1 = 0; uNodeIndex1 < uNodeCount; ++uNodeIndex1) { if (tree.IsRoot(uNodeIndex1)) continue; for (unsigned uSub1 = 0; uSub1 < 3; ++uSub1) { const unsigned uNodeIndex2 = GetNeighborUnrooted(tree, uNodeIndex1, uSub1); if (NULL_NEIGHBOR == uNodeIndex2) continue; // Avoid computing same edge twice in reversed order if (uNodeIndex2 < uNodeIndex1) continue; const WEIGHT w = (WEIGHT) GotohWeightEdge(tree, uNodeIndex1, uNodeIndex2); const unsigned uSub2 = GetNeighborSubscriptUnrooted(tree, uNodeIndex2, uNodeIndex1); #if DEBUG { assert(uNodeIndex2 == GetNeighborUnrooted(tree, uNodeIndex1, uSub1)); assert(uNodeIndex1 == GetNeighborUnrooted(tree, uNodeIndex2, uSub2)); const WEIGHT wRev = (WEIGHT) GotohWeightEdge(tree, uNodeIndex2, uNodeIndex1); if (!BTEq(w, wRev)) Quit("CalcThreeWayWeights: rev check failed %g %g", w, wRev); } #endif EdgeWeights[uNodeIndex1][uSub1] = w; EdgeWeights[uNodeIndex2][uSub2] = w; } } } static void SetSeqWeights(const Tree &tree, unsigned uNode1, unsigned uNode2, double dPathWeight, WEIGHT *Weights) { if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) Quit("SetSeqWeights, should never be called with root"); const double dThisLength = GetEdgeLengthUnrooted(tree, uNode1, 
uNode2); if (tree.IsLeaf(uNode2)) { const unsigned Id = tree.GetLeafId(uNode2); Weights[Id] = (WEIGHT) (dPathWeight + dThisLength); return; } const unsigned uFirst = GetFirstNeighborUnrooted(tree, uNode2, uNode1); const unsigned uSecond = GetSecondNeighborUnrooted(tree, uNode2, uNode1); dPathWeight *= dThisLength; SetSeqWeights(tree, uNode2, uFirst, dPathWeight, Weights); SetSeqWeights(tree, uNode2, uSecond, dPathWeight, Weights); } void CalcThreeWayWeights(const Tree &tree, unsigned uNode1, unsigned uNode2, WEIGHT *Weights) { #if TRACE Log("CalcThreeWayEdgeWeights\n"); tree.LogMe(); #endif if (tree.IsRoot(uNode1)) uNode1 = tree.GetFirstNeighbor(uNode1, uNode2); else if (tree.IsRoot(uNode2)) uNode2 = tree.GetFirstNeighbor(uNode2, uNode1); const unsigned uNodeCount = tree.GetNodeCount(); WEIGHT **EdgeWeights = new WEIGHT *[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) EdgeWeights[uNodeIndex] = new WEIGHT[3]; CalcThreeWayEdgeWeights(tree, EdgeWeights); #if TRACE { Log("Node1 Node2 Length Gotoh EdgeWt\n"); Log("----- ----- ------ ------ ------\n"); for (unsigned uNodeIndex1 = 0; uNodeIndex1 < uNodeCount; ++uNodeIndex1) { if (tree.IsRoot(uNodeIndex1)) continue; for (unsigned uSub1 = 0; uSub1 < 3; ++uSub1) { const unsigned uNodeIndex2 = GetNeighborUnrooted(tree, uNodeIndex1, uSub1); if (NULL_NEIGHBOR == uNodeIndex2) continue; if (uNodeIndex2 < uNodeIndex1) continue; const WEIGHT ew = EdgeWeights[uNodeIndex1][uSub1]; const double d = GetEdgeLengthUnrooted(tree, uNodeIndex1, uNodeIndex2); const double g = GetGotohLength(tree, uNodeIndex1, uNodeIndex2); Log("%5u %5u %6.3f %6.3f %6.3f\n", uNodeIndex1, uNodeIndex2, d, g, ew); } } } #endif SetSeqWeights(tree, uNode1, uNode2, 0.0, Weights); SetSeqWeights(tree, uNode2, uNode1, 0.0, Weights); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) delete[] EdgeWeights[uNodeIndex]; delete[] EdgeWeights; } tomhydro.cpp0000664000175000017500000000465612360262614011534 0ustar 
bobbob#include "muscle.h" #include "profile.h" // Original: //HYDROPHILIC_CONTEXT 0 6 -0.3969495574 //HYDROPHILIC_CONTEXT 1 6 -0.9407126603 //HYDROPHILIC_CONTEXT 2 6 -0.4968150972 //HYDROPHILIC_CONTEXT 3 6 -0.271646023 //HYDROPHILIC_CONTEXT 4 6 0.006990406416 //HYDROPHILIC_CONTEXT 5 6 0.1381111256 //HYDROPHILIC_CONTEXT 6 6 0.2541439872 // Blosum62: //HYDROPHILIC_CONTEXT 0 6 -0.2448419585 //HYDROPHILIC_CONTEXT 1 6 -0.8734889946 //HYDROPHILIC_CONTEXT 2 6 -0.5724336598 //HYDROPHILIC_CONTEXT 3 6 -0.2670439975 //HYDROPHILIC_CONTEXT 4 6 0.004844647323 //HYDROPHILIC_CONTEXT 5 6 0.1812057148 //HYDROPHILIC_CONTEXT 6 6 0.1036540864 static SCORE Factors[7] = { (SCORE) -0.2448419585, (SCORE) -0.8734889946, (SCORE) -0.5724336598, (SCORE) -0.2670439975, (SCORE) 0.004844647323, (SCORE) 0.1812057148, (SCORE) 0.1036540864 }; static bool Hydrophilic[20] = { false, // A false, // C true, // D true, // E false, // F true, // G false, // H false, // I true, // K false, // L false, // M true, // N true, // P true, // Q true, // R true, // S false, // T false, // V false, // Y false, // W }; bool IsHydrophilic(const FCOUNT fcCounts[]) { for (unsigned uLetter = 0; uLetter < 20; ++uLetter) if (fcCounts[uLetter] > 0.0 && Hydrophilic[uLetter]) return false; return true; } static double HydrophilicFraction(const FCOUNT fcCounts[]) { double TotalAll = 0.0; double TotalHydrophilic = 0.0; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) { FCOUNT Freq = fcCounts[uLetter]; TotalAll += Freq; if (Hydrophilic[uLetter]) TotalHydrophilic += Freq; } return TotalHydrophilic / TotalAll; } void TomHydro(ProfPos *Prof, unsigned uLength) { if (ALPHA_Amino != g_Alpha) return; if (uLength < 6) return; for (unsigned uColIndex = 3; uColIndex < uLength - 2; ++uColIndex) { // 6-residue window: // xxxxxx // AARNCARNGTAGCATNAC // AARN----------TNAC double dCount = 0.0; for (unsigned uColIndexW = uColIndex - 3; uColIndexW < uColIndex + 3; ++uColIndexW) { const ProfPos &PP = Prof[uColIndexW]; dCount += 
HydrophilicFraction(PP.m_fcCounts); } // Round to nearest integer unsigned uCount = (unsigned) (dCount + 0.5); if (uCount > 6) uCount = 6; SCORE dFactor = Factors[uCount]; ProfPos &PP = Prof[uColIndex]; PP.m_scoreGapOpen += dFactor; PP.m_scoreGapClose += dFactor; } } traceback.cpp0000664000175000017500000001230412360262614011573 0ustar bobbob#include "muscle.h" #include "profile.h" #include "pwpath.h" #include #define TRACE 0 #define EQ(a, b) (fabs(a-b) < 0.1) SCORE TraceBack(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, PWPath &Path) { #if TRACE Log("\n"); Log("TraceBack LengthA=%u LengthB=%u\n", uLengthA, uLengthB); #endif assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; Path.Clear(); unsigned uPrefixLengthA = uLengthA; unsigned uPrefixLengthB = uLengthB; const SCORE scoreM = DPM(uPrefixLengthA, uPrefixLengthB); SCORE scoreD = DPD(uPrefixLengthA, uPrefixLengthB); SCORE scoreI = DPI(uPrefixLengthA, uPrefixLengthB); const ProfPos &LastPPA = PA[uLengthA - 1]; const ProfPos &LastPPB = PB[uLengthB - 1]; scoreD += LastPPA.m_scoreGapClose; scoreI += LastPPB.m_scoreGapClose; char cEdgeType = cInsane; SCORE scoreMax; if (scoreM >= scoreD && scoreM >= scoreI) { scoreMax = scoreM; cEdgeType = 'M'; } else if (scoreD >= scoreM && scoreD >= scoreI) { scoreMax = scoreD; cEdgeType = 'D'; } else { assert(scoreI >= scoreM && scoreI >= scoreD); scoreMax = scoreI; cEdgeType = 'I'; } for (;;) { if ('S' == cEdgeType) break; PWEdge Edge; Edge.cType = cEdgeType; Edge.uPrefixLengthA = uPrefixLengthA; Edge.uPrefixLengthB = uPrefixLengthB; Path.PrependEdge(Edge); char cPrevEdgeType; unsigned uPrevPrefixLengthA = uPrefixLengthA; unsigned uPrevPrefixLengthB = uPrefixLengthB; switch (cEdgeType) { case 'M': { assert(uPrefixLengthA > 0); assert(uPrefixLengthB > 0); const ProfPos &PPA = PA[uPrefixLengthA - 1]; const ProfPos &PPB = 
PB[uPrefixLengthB - 1]; const SCORE Score = DPM(uPrefixLengthA, uPrefixLengthB); const SCORE scoreMatch = ScoreProfPos2(PPA, PPB); SCORE scoreSM; if (1 == uPrefixLengthA && 1 == uPrefixLengthB) scoreSM = scoreMatch; else scoreSM = MINUS_INFINITY; SCORE scoreMM = MINUS_INFINITY; SCORE scoreDM = MINUS_INFINITY; SCORE scoreIM = MINUS_INFINITY; if (uPrefixLengthA > 1 && uPrefixLengthB > 1) scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1) + scoreMatch; if (uPrefixLengthA > 1) { SCORE scoreTransDM = PA[uPrefixLengthA-2].m_scoreGapClose; scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransDM + scoreMatch; } if (uPrefixLengthB > 1) { SCORE scoreTransIM = PB[uPrefixLengthB-2].m_scoreGapClose; scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransIM + scoreMatch; } if (EQ(scoreMM, Score)) cPrevEdgeType = 'M'; else if (EQ(scoreDM, Score)) cPrevEdgeType = 'D'; else if (EQ(scoreIM, Score)) cPrevEdgeType = 'I'; else if (EQ(scoreSM, Score)) cPrevEdgeType = 'S'; else Quit("TraceBack: failed to match M score=%g M=%g D=%g I=%g S=%g", Score, scoreMM, scoreDM, scoreIM, scoreSM); --uPrevPrefixLengthA; --uPrevPrefixLengthB; break; } case 'D': { assert(uPrefixLengthA > 0); const SCORE Score = DPD(uPrefixLengthA, uPrefixLengthB); SCORE scoreMD = MINUS_INFINITY; SCORE scoreDD = MINUS_INFINITY; SCORE scoreSD = MINUS_INFINITY; if (uPrefixLengthB == 0) { if (uPrefixLengthA == 1) scoreSD = PA[0].m_scoreGapOpen; else scoreSD = DPD(uPrefixLengthA - 1, 0); } if (uPrefixLengthA > 1) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; SCORE scoreTransMD = PPA.m_scoreGapOpen; scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + scoreTransMD; scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB); } if (EQ(Score, scoreMD)) cPrevEdgeType = 'M'; else if (EQ(Score, scoreDD)) cPrevEdgeType = 'D'; else if (EQ(Score, scoreSD)) cPrevEdgeType = 'S'; else Quit("TraceBack: failed to match D"); --uPrevPrefixLengthA; break; } case 'I': { assert(uPrefixLengthB > 0); const SCORE Score = DPI(uPrefixLengthA, 
uPrefixLengthB); SCORE scoreMI = MINUS_INFINITY; SCORE scoreII = MINUS_INFINITY; SCORE scoreSI = MINUS_INFINITY; if (uPrefixLengthA == 0) { if (uPrefixLengthB == 1) scoreSI = PB[0].m_scoreGapOpen; else scoreSI = DPI(0, uPrefixLengthB - 1); } if (uPrefixLengthB > 1) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreTransMI = PPB.m_scoreGapOpen; scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + scoreTransMI; scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1); } if (EQ(Score, scoreMI)) cPrevEdgeType = 'M'; else if (EQ(Score, scoreII)) cPrevEdgeType = 'I'; else if (EQ(Score, scoreSI)) cPrevEdgeType = 'S'; else Quit("TraceBack: failed to match I"); --uPrevPrefixLengthB; break; } default: assert(false); } #if TRACE Log("Edge %c%c%u.%u", cPrevEdgeType, cEdgeType, uPrefixLengthA, uPrefixLengthB); Log("\n"); #endif cEdgeType = cPrevEdgeType; uPrefixLengthA = uPrevPrefixLengthA; uPrefixLengthB = uPrevPrefixLengthB; } return scoreMax; } tracebackopt.cpp0000664000175000017500000000266712360262613012330 0ustar bobbob#include "muscle.h" #include "pwpath.h" void TraceBackToPath(int **TraceBack, unsigned uLengthA, unsigned uLengthB, PWPath &Path) { Path.Clear(); PWEdge Edge; Edge.uPrefixLengthA = uLengthA; Edge.uPrefixLengthB = uLengthB; for (;;) { if (0 == Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB) break; int iDelta = TraceBack[Edge.uPrefixLengthA][Edge.uPrefixLengthB]; #if TRACE Log("TraceBack[%u][%u] = %d\n", Edge.uPrefixLengthA, Edge.uPrefixLengthB, iDelta); #endif if (0 == iDelta) { assert(Edge.uPrefixLengthA > 0); assert(Edge.uPrefixLengthB > 0); Edge.cType = 'M'; Path.PrependEdge(Edge); --(Edge.uPrefixLengthA); --(Edge.uPrefixLengthB); continue; } else if (iDelta > 0) { Edge.cType = 'D'; while (iDelta-- > 0) { assert(Edge.uPrefixLengthA > 0); Path.PrependEdge(Edge); --(Edge.uPrefixLengthA); } } else if (iDelta < 0) { Edge.cType = 'I'; while (iDelta++ < 0) { assert(Edge.uPrefixLengthB > 0); Path.PrependEdge(Edge); --(Edge.uPrefixLengthB); } } if (0 == 
Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB) break; assert(Edge.uPrefixLengthA > 0); assert(Edge.uPrefixLengthB > 0); Edge.cType = 'M'; Path.PrependEdge(Edge); --(Edge.uPrefixLengthA); --(Edge.uPrefixLengthB); } #if TRACE Log("TraceBackToPath "); Path.LogMe(); #endif } tracebacksw.cpp0000664000175000017500000001151412360262614012147 0ustar bobbob#include "muscle.h" #include "profile.h" #include "pwpath.h" #include #define TRACE 0 #define EQ(a, b) (fabs(a-b) < 0.1) void TraceBackSW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, unsigned uPrefixLengthAMax, unsigned uPrefixLengthBMax, PWPath &Path) { #if TRACE Log("\n"); Log("TraceBackSW LengthA=%u LengthB=%u PLAMax=%u PLBMax=%u\n", uLengthA, uLengthB, uPrefixLengthAMax, uPrefixLengthBMax); #endif assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; Path.Clear(); unsigned uPrefixLengthA = uPrefixLengthAMax; unsigned uPrefixLengthB = uPrefixLengthBMax; SCORE scoreMax = DPM(uPrefixLengthA, uPrefixLengthB); char cEdgeType = 'M'; for (;;) { if ('S' == cEdgeType) break; PWEdge Edge; Edge.cType = cEdgeType; Edge.uPrefixLengthA = uPrefixLengthA; Edge.uPrefixLengthB = uPrefixLengthB; Path.PrependEdge(Edge); char cPrevEdgeType; unsigned uPrevPrefixLengthA = uPrefixLengthA; unsigned uPrevPrefixLengthB = uPrefixLengthB; switch (cEdgeType) { case 'M': { assert(uPrefixLengthA > 0); assert(uPrefixLengthB > 0); const ProfPos &PPA = PA[uPrefixLengthA - 1]; const ProfPos &PPB = PB[uPrefixLengthB - 1]; const SCORE Score = DPM(uPrefixLengthA, uPrefixLengthB); const SCORE scoreMatch = ScoreProfPos2(PPA, PPB); SCORE scoreSM; if (1 == uPrefixLengthA && 1 == uPrefixLengthB) scoreSM = scoreMatch; else scoreSM = MINUS_INFINITY; SCORE scoreMM = MINUS_INFINITY; SCORE scoreDM = MINUS_INFINITY; SCORE scoreIM = MINUS_INFINITY; if (uPrefixLengthA > 1 && uPrefixLengthB > 1) { SCORE 
scoreTrans = DPM(uPrefixLengthA-1, uPrefixLengthB-1); scoreMM = scoreTrans + scoreMatch; } if (uPrefixLengthA > 1) { SCORE scoreTransDM = PA[uPrefixLengthA-2].m_scoreGapClose; scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransDM + scoreMatch; } if (uPrefixLengthB > 1) { SCORE scoreTransIM = PB[uPrefixLengthB-2].m_scoreGapClose; scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransIM + scoreMatch; } if (EQ(scoreMM, Score)) cPrevEdgeType = 'M'; else if (EQ(scoreDM, Score)) cPrevEdgeType = 'D'; else if (EQ(scoreIM, Score)) cPrevEdgeType = 'I'; else if (EQ(scoreSM, Score)) cPrevEdgeType = 'S'; else if (EQ(scoreMatch, Score)) cPrevEdgeType = 'S'; else Quit("TraceBack2: failed to match M score=%g M=%g D=%g I=%g S=%g", Score, scoreMM, scoreDM, scoreIM, scoreSM); --uPrevPrefixLengthA; --uPrevPrefixLengthB; break; } case 'D': { assert(uPrefixLengthA > 0); const SCORE Score = DPD(uPrefixLengthA, uPrefixLengthB); SCORE scoreMD = MINUS_INFINITY; SCORE scoreDD = MINUS_INFINITY; SCORE scoreSD = MINUS_INFINITY; if (uPrefixLengthB == 0) { if (uPrefixLengthA == 1) scoreSD = PA[0].m_scoreGapOpen; else scoreSD = DPD(uPrefixLengthA - 1, 0); } if (uPrefixLengthA > 1) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; SCORE scoreTransMD = PPA.m_scoreGapOpen; scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + scoreTransMD; scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB); } if (EQ(Score, scoreMD)) cPrevEdgeType = 'M'; else if (EQ(Score, scoreDD)) cPrevEdgeType = 'D'; else if (EQ(Score, scoreSD)) cPrevEdgeType = 'S'; else Quit("TraceBack2: failed to match D"); --uPrevPrefixLengthA; break; } case 'I': { assert(uPrefixLengthB > 0); const SCORE Score = DPI(uPrefixLengthA, uPrefixLengthB); SCORE scoreMI = MINUS_INFINITY; SCORE scoreII = MINUS_INFINITY; SCORE scoreSI = MINUS_INFINITY; if (uPrefixLengthA == 0) { if (uPrefixLengthB == 1) scoreSI = PB[0].m_scoreGapOpen; else scoreSI = DPI(0, uPrefixLengthB - 1); } if (uPrefixLengthB > 1) { const ProfPos &PPB = PB[uPrefixLengthB - 
1]; SCORE scoreTransMI = PPB.m_scoreGapOpen; scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + scoreTransMI; scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1); } if (EQ(Score, scoreMI)) cPrevEdgeType = 'M'; else if (EQ(Score, scoreII)) cPrevEdgeType = 'I'; else if (EQ(Score, scoreSI)) cPrevEdgeType = 'S'; else Quit("TraceBack2: failed to match I"); --uPrevPrefixLengthB; break; } default: assert(false); } #if TRACE Log("Edge %c%c%u.%u", cPrevEdgeType, cEdgeType, uPrefixLengthA, uPrefixLengthB); Log("\n"); #endif cEdgeType = cPrevEdgeType; uPrefixLengthA = uPrevPrefixLengthA; uPrefixLengthB = uPrevPrefixLengthB; } } treefrommsa.cpp0000664000175000017500000000441012360262613012176 0ustar bobbob#include "muscle.h" #include "msa.h" #include "tree.h" #include "clust.h" #include "clustsetmsa.h" #include "distcalc.h" static void SaveMSADist(const MSA &msa, MSADist &d, const char *FileName) { FILE *f = fopen(FileName, "w"); if (f == 0) Quit("Cannot create %s", FileName); unsigned n = msa.GetSeqCount(); for (unsigned i = 0; i < n; ++i) { fprintf(f, "%10.10s ", msa.GetSeqName(i)); for (unsigned j = 0; j < n; ++j) fprintf(f, " %9g", d.ComputeDist(msa, i, j)); fprintf(f, "\n"); } fclose(f); } static void TreeFromMSA_NJ(const MSA &msa, Tree &tree, CLUSTER Cluster, DISTANCE Distance, const char *SaveFileName) { MSADist MD(Distance); ClustSetMSA Set(msa, MD); if (SaveFileName != 0) SaveMSADist(msa, MD, SaveFileName); Clust C; C.Create(Set, Cluster); tree.FromClust(C); } static void SaveDC(const DistCalcMSA &DC, const char *FileName) { FILE *f = fopen(FileName, "w"); if (f == 0) Quit("Cannot create %s", FileName); unsigned n = DC.GetCount(); fprintf(f, "%u\n", n); float *Dist = new float[n]; for (unsigned i = 0; i < n; ++i) { fprintf(f, "%10.10s ", DC.GetName(i)); DC.CalcDistRange(i, Dist); for (unsigned j = 0; j < i; ++j) fprintf(f, " %9g", Dist[j]); fprintf(f, "\n"); } fclose(f); } static void TreeFromMSA_UPGMA(const MSA &msa, Tree &tree, CLUSTER Cluster, DISTANCE Distance, const 
// Format a duration given in seconds as "hh:mm:ss".
//
// The hour field is at least two digits wide and grows as needed; minutes
// and seconds are always two digits.
//
// Returns a pointer to a static buffer that is overwritten by the next call,
// so the result must be consumed (or copied) before calling again.
const char *SecsToStr(unsigned long Secs)
{
    // A 64-bit value divided by 3600 still has up to 16 digits, so the old
    // 16-byte buffer could be overrun; 32 bytes covers the worst case
    // (digits + ":mm:ss" + NUL) and snprintf bounds the write regardless.
    static char Str[32];

    const unsigned long hh = Secs/(60*60);
    const unsigned long mm = (Secs/60)%60;
    const unsigned long ss = Secs%60;

    snprintf(Str, sizeof(Str), "%02lu:%02lu:%02lu", hh, mm, ss);
    return Str;
}
const int iBufferCount = 16; const int iBufferLength = 16; static char szStr[iBufferCount*iBufferLength]; static int iBufferIndex = 0; iBufferIndex = (iBufferIndex + 1)%iBufferCount; char *pStr = szStr + iBufferIndex*iBufferLength; sprintf(pStr, "%.3g", Score); return pStr; } const char *WeightToStr(WEIGHT w) { return ScoreToStr(w); } upgma2.cpp0000664000175000017500000002425612360262613011057 0ustar bobbob#include "muscle.h" #include "tree.h" #include "distcalc.h" // UPGMA clustering in O(N^2) time and space. #define TRACE 0 #define MIN(x, y) ((x) < (y) ? (x) : (y)) #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define AVG(x, y) (((x) + (y))/2) static unsigned g_uLeafCount; static unsigned g_uTriangleSize; static unsigned g_uInternalNodeCount; static unsigned g_uInternalNodeIndex; // Triangular distance matrix is g_Dist, which is allocated // as a one-dimensional vector of length g_uTriangleSize. // TriangleSubscript(i,j) maps row,column=i,j to the subscript // into this vector. // Row / column coordinates are a bit messy. // Initially they are leaf indexes 0..N-1. // But each time we create a new node (=new cluster, new subtree), // we re-use one of the two rows that become available (the children // of the new node). This saves memory. // We keep track of this through the g_uNodeIndex vector. static dist_t *g_Dist; // Distance to nearest neighbor in row i of distance matrix. // Subscript is distance matrix row. static dist_t *g_MinDist; // Nearest neighbor to row i of distance matrix. // Subscript is distance matrix row. static unsigned *g_uNearestNeighbor; // Node index of row i in distance matrix. // Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes. // Subscript is distance matrix row. static unsigned *g_uNodeIndex; // The following vectors are defined on internal nodes, // subscripts are internal node index 0..N-2. // For g_uLeft/Right, value is the node index 0 .. 2N-2 // because a child can be internal or leaf. 
// Build a rooted tree over the DC.GetCount() items of DC by agglomerative
// clustering and hand the result to tree.Create().
//
// Each pass of the main loop picks the live matrix row whose cached
// nearest-neighbor distance (g_MinDist) is smallest, joins that row (Lmin)
// with its nearest neighbor (Rmin) into a new internal node, and overwrites
// Lmin's entries in the triangular matrix with the distances from the new
// node. Those distances are combined from the two children according to
// Linkage: average, minimum, maximum, or the g_dSUEFF-weighted blend of
// average and minimum.
//
// All working storage lives in the file-scope g_* variables, which are
// allocated here and released before returning; overlapping calls would
// therefore trample each other's state.
//
// NOTE(review): g_uLeafCount - 1 and g_uLeafCount - 2 are unsigned and wrap
// around for fewer than two items; presumably callers never pass such
// input -- confirm.
void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage)
{
    g_uLeafCount = DC.GetCount();

    g_uTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2;
    g_uInternalNodeCount = g_uLeafCount - 1;

    g_Dist = new dist_t[g_uTriangleSize];

    // Per-row bookkeeping (one entry per distance matrix row).
    g_uNodeIndex = new unsigned[g_uLeafCount];
    g_uNearestNeighbor = new unsigned[g_uLeafCount];
    g_MinDist = new dist_t[g_uLeafCount];
    unsigned *Ids = new unsigned [g_uLeafCount];
    char **Names = new char *[g_uLeafCount];

    // Per-internal-node results (one entry per join).
    g_uLeft = new unsigned[g_uInternalNodeCount];
    g_uRight = new unsigned[g_uInternalNodeCount];
    g_Height = new dist_t[g_uInternalNodeCount];
    g_LeftLength = new dist_t[g_uInternalNodeCount];
    g_RightLength = new dist_t[g_uInternalNodeCount];

    // Initially row i is leaf i, with no nearest neighbor known yet.
    for (unsigned i = 0; i < g_uLeafCount; ++i)
    {
        g_MinDist[i] = BIG_DIST;
        g_uNodeIndex[i] = i;
        g_uNearestNeighbor[i] = uInsane;
        Ids[i] = DC.GetId(i);
        Names[i] = strsave(DC.GetName(i));
    }

    // Mark every internal node as not yet created.
    for (unsigned i = 0; i < g_uInternalNodeCount; ++i)
    {
        g_uLeft[i] = uInsane;
        g_uRight[i] = uInsane;
        g_LeftLength[i] = BIG_DIST;
        g_RightLength[i] = BIG_DIST;
        g_Height[i] = BIG_DIST;
    }

    // Compute initial NxN triangular distance matrix.
    // Store minimum distance for each full (not triangular) row.
    // Loop from 1, not 0, because "row" is 0, 1 ... i-1,
    // so nothing to do when i=0.
    for (unsigned i = 1; i < g_uLeafCount; ++i)
    {
        // CalcDistRange writes straight into row i of the triangle; the
        // loop below reads back Row[0..i-1].
        dist_t *Row = g_Dist + TriangleSubscript(i, 0);
        DC.CalcDistRange(i, Row);
        for (unsigned j = 0; j < i; ++j)
        {
            // d is the distance between i and j, so it may improve the
            // cached minimum of either row.
            const dist_t d = Row[j];
            if (d < g_MinDist[i])
            {
                g_MinDist[i] = d;
                g_uNearestNeighbor[i] = j;
            }
            if (d < g_MinDist[j])
            {
                g_MinDist[j] = d;
                g_uNearestNeighbor[j] = i;
            }
        }
    }

#if TRACE
    Log("Initial state:\n");
    ListState();
#endif

    // One iteration per internal node to create.
    for (g_uInternalNodeIndex = 0; g_uInternalNodeIndex < g_uLeafCount - 1;
      ++g_uInternalNodeIndex)
    {
#if TRACE
        Log("\n");
        Log("Internal node index %5u\n", g_uInternalNodeIndex);
        Log("-------------------------\n");
#endif

        // Find nearest neighbors
        unsigned Lmin = uInsane;
        unsigned Rmin = uInsane;
        dist_t dtMinDist = BIG_DIST;
        for (unsigned j = 0; j < g_uLeafCount; ++j)
        {
            // Rows whose node index is uInsane have been merged away.
            if (uInsane == g_uNodeIndex[j])
                continue;

            dist_t d = g_MinDist[j];
            if (d < dtMinDist)
            {
                dtMinDist = d;
                Lmin = j;
                Rmin = g_uNearestNeighbor[j];
                assert(uInsane != Rmin);
                assert(uInsane != g_uNodeIndex[Rmin]);
            }
        }

        assert(Lmin != uInsane);
        assert(Rmin != uInsane);
        assert(dtMinDist != BIG_DIST);

#if TRACE
        Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n",
          Lmin,
          g_uNodeIndex[Lmin],
          Rmin,
          g_uNodeIndex[Rmin],
          dtMinDist);
#endif

        // Compute distances to new node
        // New node overwrites row currently assigned to Lmin
        dist_t dtNewMinDist = BIG_DIST;
        unsigned uNewNearestNeighbor = uInsane;
        for (unsigned j = 0; j < g_uLeafCount; ++j)
        {
            if (j == Lmin || j == Rmin)
                continue;
            if (uInsane == g_uNodeIndex[j])
                continue;

            const unsigned vL = TriangleSubscript(Lmin, j);
            const unsigned vR = TriangleSubscript(Rmin, j);
            const dist_t dL = g_Dist[vL];
            const dist_t dR = g_Dist[vR];
            dist_t dtNewDist;

            switch (Linkage)
            {
            case LINKAGE_Avg:
                dtNewDist = AVG(dL, dR);
                break;

            case LINKAGE_Min:
                dtNewDist = MIN(dL, dR);
                break;

            case LINKAGE_Max:
                dtNewDist = MAX(dL, dR);
                break;

            case LINKAGE_Biased:
                dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR);
                break;

            default:
                // NOTE(review): Quit presumably does not return; if it did,
                // dtNewDist would be used uninitialized below -- confirm.
                Quit("UPGMA2: Invalid LINKAGE_%u", Linkage);
            }

            // Nasty special case.
            // If nearest neighbor of j is Lmin or Rmin, then make the new
            // node (which overwrites the row currently occupied by Lmin)
            // the nearest neighbor. This situation can occur when there are
            // equal distances in the matrix. If we don't make this fix,
            // the nearest neighbor pointer for j would become invalid.
            // (We don't need to test for == Lmin, because in that case
            // the net change needed is zero due to the change in row
            // numbering).
            if (g_uNearestNeighbor[j] == Rmin)
                g_uNearestNeighbor[j] = Lmin;

#if TRACE
            Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n",
              j, Lmin, dL, Rmin, dR, dtNewDist);
#endif
            // Only the Lmin entry is rewritten; the Rmin row is retired below.
            g_Dist[vL] = dtNewDist;
            if (dtNewDist < dtNewMinDist)
            {
                dtNewMinDist = dtNewDist;
                uNewNearestNeighbor = j;
            }
        }

        // NOTE(review): the left operand of both asserts repeats the loop
        // condition, so they can never fire. They look intended to require a
        // new nearest neighbor on every join except the last -- confirm.
        assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist);
        assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor);

        // The new node sits at half the distance between its children; each
        // branch length is that height minus the child's own height (0 for
        // a leaf, the stored g_Height for an internal node).
        const unsigned v = TriangleSubscript(Lmin, Rmin);
        const dist_t dLR = g_Dist[v];
        const dist_t dHeightNew = dLR/2;
        const unsigned uLeft = g_uNodeIndex[Lmin];
        const unsigned uRight = g_uNodeIndex[Rmin];
        const dist_t HeightLeft =
          uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount];
        const dist_t HeightRight =
          uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount];

        g_uLeft[g_uInternalNodeIndex] = uLeft;
        g_uRight[g_uInternalNodeIndex] = uRight;
        g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft;
        g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight;
        g_Height[g_uInternalNodeIndex] = dHeightNew;

        // Row for left child overwritten by row for new node
        g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex;
        g_uNearestNeighbor[Lmin] = uNewNearestNeighbor;
        g_MinDist[Lmin] = dtNewMinDist;

        // Delete row for right child
        g_uNodeIndex[Rmin] = uInsane;

#if TRACE
        Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n",
          g_uInternalNodeIndex, Lmin, Rmin);
        ListState();
#endif
    }

    // The last join made (internal node index g_uLeafCount - 2) is the root.
    unsigned uRoot = g_uLeafCount - 2;
    tree.Create(g_uLeafCount, uRoot, g_uLeft, g_uRight, g_LeftLength, g_RightLength,
      Ids, Names);

#if TRACE
    tree.LogMe();
#endif

    // Everything handed to tree.Create() is released right away, so Create
    // presumably copies what it needs -- TODO confirm.
    delete[] g_Dist;
    delete[] g_uNodeIndex;
    delete[] g_uNearestNeighbor;
    delete[] g_MinDist;
    delete[] g_Height;

    delete[] g_uLeft;
    delete[] g_uRight;
    delete[] g_LeftLength;
    delete[] g_RightLength;

    // Names[i] came from strsave() and is released with free(); presumably
    // strsave allocates with malloc -- confirm.
    for (unsigned i = 0; i < g_uLeafCount; ++i)
        free(Names[i]);
    delete[] Names;
    delete[] Ids;
}
fprintf(stderr, "http://www.drive5.com/muscle\n"); fprintf(stderr, "This software is donated to the public domain.\n"); fprintf(stderr, "Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.\n\n"); Displayed = true; } void Usage() { Credits(); fprintf(stderr, "\n" "Basic usage\n" "\n" " muscle -in -out \n" "\n" "Common options (for a complete list please see the User Guide):\n" "\n" " -in Input file in FASTA format (default stdin)\n" " -out Output alignment in FASTA format (default stdout)\n" " -diags Find diagonals (faster for similar sequences)\n" " -maxiters Maximum number of iterations (integer, default 16)\n" " -maxhours Maximum time to iterate in hours (default no limit)\n" " -html Write output in HTML format (default FASTA)\n" " -msf Write output in GCG MSF format (default FASTA)\n" " -clw Write output in CLUSTALW format (default FASTA)\n" " -clwstrict As -clw, with 'CLUSTAL W (1.81)' header\n" " -log[a] Log to file (append if -loga, overwrite if -log)\n" " -quiet Do not write progress messages to stderr\n" " -version Display version information and exit\n" "\n" "Without refinement (very fast, avg accuracy similar to T-Coffee): -maxiters 2\n" "Fastest possible (amino acids): -maxiters 1 -diags -sv -distance1 kbit20_3\n" "Fastest possible (nucleotides): -maxiters 1 -diags\n"); } validateids.cpp0000664000175000017500000000617012360262614012151 0ustar bobbob#include "muscle.h" #include "msa.h" #include "tree.h" #include "seqvect.h" #if DEBUG static SeqVect *g_ptrMuscleSeqVect = 0; static MSA MuscleInputMSA; void SetMuscleInputMSA(MSA &msa) { MuscleInputMSA.Copy(msa); } void SetMuscleSeqVect(SeqVect &v) { g_ptrMuscleSeqVect = &v; } void ValidateMuscleIdsSeqVect(const MSA &msa) { const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const unsigned uId = msa.GetSeqId(uSeqIndex); const char *ptrNameMSA = msa.GetSeqName(uSeqIndex); const char *ptrName = g_ptrMuscleSeqVect->GetSeqName(uId); if (0 != 
strcmp(ptrNameMSA, ptrName)) Quit("ValidateMuscleIdsSeqVect, names don't match"); } } void ValidateMuscleIdsMSA(const MSA &msa) { const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const unsigned uId = msa.GetSeqId(uSeqIndex); const char *ptrNameMSA = msa.GetSeqName(uSeqIndex); const char *ptrName = MuscleInputMSA.GetSeqName(uId); if (0 != strcmp(ptrNameMSA, ptrName)) { Log("Input MSA:\n"); MuscleInputMSA.LogMe(); Log("MSA being tested:\n"); msa.LogMe(); Log("Id=%u\n", uId); Log("Input name=%s\n", ptrName); Log("Test name=%s\n", ptrNameMSA); Quit("ValidateMuscleIdsMSA, names don't match"); } } } void ValidateMuscleIds(const MSA &msa) { if (0 != g_ptrMuscleSeqVect) ValidateMuscleIdsSeqVect(msa); else if (0 != MuscleInputMSA.GetSeqCount()) ValidateMuscleIdsMSA(msa); else Quit("ValidateMuscleIds, ptrMuscleSeqVect=0 && 0 == MuscleInputMSA.SeqCount()"); } void ValidateMuscleIdsSeqVect(const Tree &tree) { const unsigned uSeqCount = g_ptrMuscleSeqVect->GetSeqCount(); const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!tree.IsLeaf(uNodeIndex)) continue; const unsigned uId = tree.GetLeafId(uNodeIndex); if (uId >= uSeqCount) { tree.LogMe(); Quit("Leaf with node index %u has id=%u, there are %u seqs", uNodeIndex, uId, uSeqCount); } const char *ptrNameTree = tree.GetLeafName(uNodeIndex); const char *ptrName = g_ptrMuscleSeqVect->GetSeqName(uId); if (0 != strcmp(ptrNameTree, ptrName)) Quit("ValidateMuscleIds: names don't match"); } } void ValidateMuscleIdsMSA(const Tree &tree) { const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!tree.IsLeaf(uNodeIndex)) continue; const unsigned uId = tree.GetLeafId(uNodeIndex); const char *ptrNameTree = tree.GetLeafName(uNodeIndex); const char *ptrName = MuscleInputMSA.GetSeqName(uId); if (0 != strcmp(ptrNameTree, ptrName)) 
Quit("ValidateMuscleIds: names don't match"); } } void ValidateMuscleIds(const Tree &tree) { if (0 != g_ptrMuscleSeqVect) ValidateMuscleIdsSeqVect(tree); else if (0 != MuscleInputMSA.GetSeqCount()) ValidateMuscleIdsMSA(tree); else Quit("ValidateMuscleIds, ptrMuscleSeqVect=0 && 0 == MuscleInputMSA.SeqCount"); } #endif vtml2.cpp0000664000175000017500000002417512360262614010731 0ustar bobbob#include "muscle.h" // Note: We use 32x32 arrays rather than 20x20 as this may give the compiler // optimizer an opportunity to make subscript arithmetic more efficient // (multiplying by 32 is same as shifting left by 5 bits). #define v(x) ((float) x) #define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ v(R), v(S), v(T), v(V), v(W), v(Y) }, // A C D E F G H I K L // M N P Q R S T V W Y // VTML200 float VTML_LA[32][32] = { ROW( 2.25080, 1.31180, 0.82704, 0.88740, 0.55520, 1.09860, 0.71673, 0.80805, 0.81213, 0.68712, 0.79105, 0.86777, 0.99328, 0.86644, 0.72821, 1.33924, 1.20373, 1.05956, 0.38107, 0.54373) // A ROW( 1.31180,15.79469, 0.39862, 0.42329, 0.49882, 0.65541, 0.67100, 0.97185, 0.46414, 0.55673, 0.90230, 0.63236, 0.54479, 0.47895, 0.56465, 1.18490, 0.99069, 1.21604, 0.28988, 0.91338) // C ROW( 0.82704, 0.39862, 4.18833, 2.06850, 0.25194, 0.90937, 1.01617, 0.32860, 1.03391, 0.31300, 0.42498, 1.80888, 0.81307, 1.20043, 0.63712, 1.03001, 0.88191, 0.43557, 0.26313, 0.37947) // D ROW( 0.88740, 0.42329, 2.06850, 3.08354, 0.33456, 0.77183, 0.94536, 0.43151, 1.35989, 0.45579, 0.53423, 1.15745, 0.82832, 1.66752, 0.84500, 0.98693, 0.88132, 0.54047, 0.24519, 0.52025) // E ROW( 0.55520, 0.49882, 0.25194, 0.33456, 6.08351, 0.30140, 1.02191, 1.10969, 0.37069, 1.50587, 1.41207, 0.42850, 0.41706, 0.48113, 0.41970, 0.56867, 0.57172, 0.91256, 2.02494, 3.44675) // F ROW( 1.09860, 0.65541, 0.90937, 0.77183, 0.30140, 5.62829, 0.64191, 0.28432, 0.67874, 0.30549, 0.37739, 1.01012, 0.60851, 
0.65996, 0.63660, 1.03448, 0.68435, 0.40728, 0.36034, 0.35679) // G ROW( 0.71673, 0.67100, 1.01617, 0.94536, 1.02191, 0.64191, 6.05494, 0.50783, 1.03822, 0.60887, 0.55685, 1.28619, 0.72275, 1.41503, 1.24635, 0.93344, 0.83543, 0.54817, 0.81780, 1.81552) // H ROW( 0.80805, 0.97185, 0.32860, 0.43151, 1.10969, 0.28432, 0.50783, 3.03766, 0.49310, 1.88886, 1.75039, 0.44246, 0.44431, 0.53213, 0.48153, 0.55603, 0.88168, 2.37367, 0.68494, 0.70035) // I ROW( 0.81213, 0.46414, 1.03391, 1.35989, 0.37069, 0.67874, 1.03822, 0.49310, 2.72883, 0.52739, 0.68244, 1.15671, 0.82911, 1.51333, 2.33521, 0.93858, 0.92730, 0.55467, 0.39944, 0.52549) // K ROW( 0.68712, 0.55673, 0.31300, 0.45579, 1.50587, 0.30549, 0.60887, 1.88886, 0.52739, 3.08540, 2.14480, 0.43539, 0.53630, 0.62771, 0.53025, 0.53468, 0.69924, 1.50372, 0.82822, 0.89854) // L ROW( 0.79105, 0.90230, 0.42498, 0.53423, 1.41207, 0.37739, 0.55685, 1.75039, 0.68244, 2.14480, 4.04057, 0.55603, 0.48415, 0.76770, 0.66775, 0.62409, 0.87759, 1.42742, 0.52278, 0.72067) // M ROW( 0.86777, 0.63236, 1.80888, 1.15745, 0.42850, 1.01012, 1.28619, 0.44246, 1.15671, 0.43539, 0.55603, 3.36000, 0.69602, 1.13490, 0.98603, 1.31366, 1.11252, 0.50603, 0.35810, 0.68349) // N ROW( 0.99328, 0.54479, 0.81307, 0.82832, 0.41706, 0.60851, 0.72275, 0.44431, 0.82911, 0.53630, 0.48415, 0.69602, 7.24709, 0.90276, 0.74827, 1.03719, 0.83014, 0.56795, 0.37867, 0.33127) // P ROW( 0.86644, 0.47895, 1.20043, 1.66752, 0.48113, 0.65996, 1.41503, 0.53213, 1.51333, 0.62771, 0.76770, 1.13490, 0.90276, 2.86937, 1.50116, 0.99561, 0.93103, 0.61085, 0.29926, 0.51971) // Q ROW( 0.72821, 0.56465, 0.63712, 0.84500, 0.41970, 0.63660, 1.24635, 0.48153, 2.33521, 0.53025, 0.66775, 0.98603, 0.74827, 1.50116, 4.28698, 0.84662, 0.80673, 0.51422, 0.47569, 0.59592) // R ROW( 1.33924, 1.18490, 1.03001, 0.98693, 0.56867, 1.03448, 0.93344, 0.55603, 0.93858, 0.53468, 0.62409, 1.31366, 1.03719, 0.99561, 0.84662, 2.13816, 1.52911, 0.67767, 0.45129, 0.66767) // S ROW( 1.20373, 0.99069, 0.88191, 
0.88132, 0.57172, 0.68435, 0.83543, 0.88168, 0.92730, 0.69924, 0.87759, 1.11252, 0.83014, 0.93103, 0.80673, 1.52911, 2.58221, 0.98702, 0.31541, 0.57954) // T ROW( 1.05956, 1.21604, 0.43557, 0.54047, 0.91256, 0.40728, 0.54817, 2.37367, 0.55467, 1.50372, 1.42742, 0.50603, 0.56795, 0.61085, 0.51422, 0.67767, 0.98702, 2.65580, 0.43419, 0.63805) // V ROW( 0.38107, 0.28988, 0.26313, 0.24519, 2.02494, 0.36034, 0.81780, 0.68494, 0.39944, 0.82822, 0.52278, 0.35810, 0.37867, 0.29926, 0.47569, 0.45129, 0.31541, 0.43419,31.39564, 2.51433) // W ROW( 0.54373, 0.91338, 0.37947, 0.52025, 3.44675, 0.35679, 1.81552, 0.70035, 0.52549, 0.89854, 0.72067, 0.68349, 0.33127, 0.51971, 0.59592, 0.66767, 0.57954, 0.63805, 2.51433, 7.50693) // Y }; const float VTML_SP_CENTER = (float) 22.0; #undef ROW #undef v #define v(x) ((float) (x + VTML_SP_CENTER)) #define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y, X) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ v(R), v(S), v(T), v(V), v(W), v(Y), v(X) }, // VTML 240 float VTML_SP[32][32] = { // A C D E F G H I K L M N P Q R S T V W Y X ROW( 58, 23, -12, -7, -44, 10, -23, -14, -14, -27, -17, -8, 1, -9, -22, 23, 15, 5, -74, -45, 0) // A ROW( 23, 224, -67, -63, -50, -30, -29, 1, -56, -41, -6, -33, -44, -53, -43, 15, 2, 18, -93, -6, 0) // C ROW( -12, -67, 111, 59,-104, -4, 4, -84, 6, -88, -65, 48, -13, 18, -29, 5, -7, -63,-105, -73, 0) // D ROW( -7, -63, 59, 85, -83, -17, -1, -63, 25, -60, -47, 15, -12, 40, -8, 1, -7, -47,-108, -51, 0) // E ROW( -44, -50,-104, -83, 144, -93, 4, 12, -74, 36, 30, -64, -67, -56, -65, -43, -41, -3, 63, 104, 0) // F ROW( 10, -30, -4, -17, -93, 140, -32, -95, -27, -91, -75, 4, -36, -29, -32, 5, -26, -68, -80, -79, 0) // G ROW( -23, -29, 4, -1, 4, -32, 137, -50, 6, -37, -42, 21, -23, 27, 19, -4, -12, -44, -13, 48, 0) // H ROW( -14, 1, -84, -63, 12, -95, -50, 86, -53, 53, 47, -62, -60, -47, -55, -43, -8, 69, -27, -24, 0) // I ROW( -14, -56, 6, 25, -74, -27, 6, -53, 
75, -48, -30, 13, -12, 34, 68, -3, -4, -44, -71, -49, 0) // K ROW( -27, -41, -88, -60, 36, -91, -37, 53, -48, 88, 62, -63, -48, -36, -48, -47, -25, 36, -11, -4, 0) // L ROW( -17, -6, -65, -47, 30, -75, -42, 47, -30, 62, 103, -45, -54, -21, -31, -35, -9, 31, -46, -20, 0) // M ROW( -8, -33, 48, 15, -64, 4, 21, -62, 13, -63, -45, 89, -25, 12, 2, 22, 10, -51, -79, -29, 0) // N ROW( 1, -44, -13, -12, -67, -36, -23, -60, -12, -48, -54, -25, 160, -6, -20, 5, -12, -42, -76, -83, 0) // P ROW( -9, -53, 18, 40, -56, -29, 27, -47, 34, -36, -21, 12, -6, 75, 34, 1, -4, -37, -92, -48, 0) // Q ROW( -22, -43, -29, -8, -65, -32, 19, -55, 68, -48, -31, 2, -20, 34, 113, -10, -14, -49, -58, -39, 0) // R ROW( 23, 15, 5, 1, -43, 5, -4, -43, -3, -47, -35, 22, 5, 1, -10, 53, 32, -28, -62, -31, 0) // S ROW( 15, 2, -7, -7, -41, -26, -12, -8, -4, -25, -9, 10, -12, -4, -14, 32, 68, 0, -87, -40, 0) // T ROW( 5, 18, -63, -47, -3, -68, -44, 69, -44, 36, 31, -51, -42, -37, -49, -28, 0, 74, -61, -32, 0) // V ROW( -74, -93,-105,-108, 63, -80, -13, -27, -71, -11, -46, -79, -76, -92, -58, -62, -87, -61, 289, 81, 0) // W ROW( -45, -6, -73, -51, 104, -79, 48, -24, -49, -4, -20, -29, -83, -48, -39, -31, -40, -32, 81, 162, 0) // Y ROW( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) // X }; #undef v #define v(x) ((float) (x)) #define RNC(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y, X) \ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ v(R), v(S), v(T), v(V), v(W), v(Y), v(X) }, float VTML_SPNoCenter[32][32] = { // A C D E F G H I K L M N P Q R S T V W Y X RNC( 58, 23, -12, -7, -44, 10, -23, -14, -14, -27, -17, -8, 1, -9, -22, 23, 15, 5, -74, -45, 0) // A RNC( 23, 224, -67, -63, -50, -30, -29, 1, -56, -41, -6, -33, -44, -53, -43, 15, 2, 18, -93, -6, 0) // C RNC( -12, -67, 111, 59,-104, -4, 4, -84, 6, -88, -65, 48, -13, 18, -29, 5, -7, -63,-105, -73, 0) // D RNC( -7, -63, 59, 85, -83, -17, -1, -63, 25, -60, -47, 15, -12, 40, -8, 1, -7, 
-47,-108, -51, 0) // E RNC( -44, -50,-104, -83, 144, -93, 4, 12, -74, 36, 30, -64, -67, -56, -65, -43, -41, -3, 63, 104, 0) // F RNC( 10, -30, -4, -17, -93, 140, -32, -95, -27, -91, -75, 4, -36, -29, -32, 5, -26, -68, -80, -79, 0) // G RNC( -23, -29, 4, -1, 4, -32, 137, -50, 6, -37, -42, 21, -23, 27, 19, -4, -12, -44, -13, 48, 0) // H RNC( -14, 1, -84, -63, 12, -95, -50, 86, -53, 53, 47, -62, -60, -47, -55, -43, -8, 69, -27, -24, 0) // I RNC( -14, -56, 6, 25, -74, -27, 6, -53, 75, -48, -30, 13, -12, 34, 68, -3, -4, -44, -71, -49, 0) // K RNC( -27, -41, -88, -60, 36, -91, -37, 53, -48, 88, 62, -63, -48, -36, -48, -47, -25, 36, -11, -4, 0) // L RNC( -17, -6, -65, -47, 30, -75, -42, 47, -30, 62, 103, -45, -54, -21, -31, -35, -9, 31, -46, -20, 0) // M RNC( -8, -33, 48, 15, -64, 4, 21, -62, 13, -63, -45, 89, -25, 12, 2, 22, 10, -51, -79, -29, 0) // N RNC( 1, -44, -13, -12, -67, -36, -23, -60, -12, -48, -54, -25, 160, -6, -20, 5, -12, -42, -76, -83, 0) // P RNC( -9, -53, 18, 40, -56, -29, 27, -47, 34, -36, -21, 12, -6, 75, 34, 1, -4, -37, -92, -48, 0) // Q RNC( -22, -43, -29, -8, -65, -32, 19, -55, 68, -48, -31, 2, -20, 34, 113, -10, -14, -49, -58, -39, 0) // R RNC( 23, 15, 5, 1, -43, 5, -4, -43, -3, -47, -35, 22, 5, 1, -10, 53, 32, -28, -62, -31, 0) // S RNC( 15, 2, -7, -7, -41, -26, -12, -8, -4, -25, -9, 10, -12, -4, -14, 32, 68, 0, -87, -40, 0) // T RNC( 5, 18, -63, -47, -3, -68, -44, 69, -44, 36, 31, -51, -42, -37, -49, -28, 0, 74, -61, -32, 0) // V RNC( -74, -93,-105,-108, 63, -80, -13, -27, -71, -11, -46, -79, -76, -92, -58, -62, -87, -61, 289, 81, 0) // W RNC( -45, -6, -73, -51, 104, -79, 48, -24, -49, -4, -20, -29, -83, -48, -39, -31, -40, -32, 81, 162, 0) // Y RNC( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) // X }; writescorefile.cpp0000664000175000017500000000323212360262613012701 0ustar bobbob#include "muscle.h" #include "msa.h" #include extern float VTML_SP[32][32]; extern float NUC_SP[32][32]; static double GetColScore(const MSA &msa, 
unsigned uCol) { const unsigned uSeqCount = msa.GetSeqCount(); unsigned uPairCount = 0; double dSum = 0.0; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { if (msa.IsGap(uSeq1, uCol)) continue; unsigned uLetter1 = msa.GetLetterEx(uSeq1, uCol); if (uLetter1 >= g_AlphaSize) continue; for (unsigned uSeq2 = uSeq1 + 1; uSeq2 < uSeqCount; ++uSeq2) { if (msa.IsGap(uSeq2, uCol)) continue; unsigned uLetter2 = msa.GetLetterEx(uSeq2, uCol); if (uLetter2 >= g_AlphaSize) continue; double Score; switch (g_Alpha) { case ALPHA_Amino: Score = VTML_SP[uLetter1][uLetter2]; break; case ALPHA_DNA: case ALPHA_RNA: Score = NUC_SP[uLetter1][uLetter2]; break; default: Quit("GetColScore: invalid alpha=%d", g_Alpha); } dSum += Score; ++uPairCount; } } if (0 == uPairCount) return 0; return dSum / uPairCount; } void WriteScoreFile(const MSA &msa) { FILE *f = fopen(g_pstrScoreFileName, "w"); if (0 == f) Quit("Cannot open score file '%s' errno=%d", g_pstrScoreFileName, errno); const unsigned uColCount = msa.GetColCount(); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uCol = 0; uCol < uColCount; ++uCol) { double Score = GetColScore(msa, uCol); fprintf(f, "%10.3f ", Score); for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq) { char c = msa.GetChar(uSeq, uCol); fprintf(f, "%c", c); } fprintf(f, "\n"); } fclose(f); } alpha.h0000664000175000017500000000376012360262613010413 0ustar bobbob#ifndef alpha_h #define alpha_h bool StrHasAmino(const char *Str); bool StrHasGap(const char *Str); void ClearInvalidLetterWarning(); void InvalidLetterWarning(char c, char w); void ReportInvalidLetters(); extern unsigned g_CharToLetter[]; extern unsigned g_CharToLetterEx[]; extern char g_LetterToChar[]; extern char g_LetterExToChar[]; extern char g_UnalignChar[]; extern char g_AlignChar[]; extern bool g_IsWildcardChar[]; extern bool g_IsResidueChar[]; #define CharToLetter(c) (g_CharToLetter[(unsigned char) (c)]) #define CharToLetterEx(c) (g_CharToLetterEx[(unsigned char) (c)]) #define 
// One node of a binary cluster tree.
//
// A node stores two weights, an index, links to its left/right children and
// parent, and previous/next links for a doubly linked "disjoint" list.
// A freshly constructed node has zero weights, index 0 and all links null.
// Mutators other than SetWeight2 are protected; ClusterTree is a friend and
// can call them.
class ClusterNode
{
    friend class ClusterTree;

public:
    // Start detached: no children, no parent, not on the disjoint list.
    ClusterNode()
      : m_dWeight(0.0),
        m_dWeight2(0.0),
        m_uIndex(0),
        m_ptrLeft(0),
        m_ptrRight(0),
        m_ptrParent(0),
        m_ptrNextDisjoint(0),
        m_ptrPrevDisjoint(0)
    {
    }
    ~ClusterNode() {}

public:
    // Plain accessors.
    unsigned GetIndex() const { return m_uIndex; }
    ClusterNode *GetLeft() const { return m_ptrLeft; }
    ClusterNode *GetRight() const { return m_ptrRight; }
    ClusterNode *GetParent() const { return m_ptrParent; }
    double GetWeight() const { return m_dWeight; }

    // Defined out of line.
    const ClusterNode *GetClusterLeaf(unsigned uLeafIndex) const;
    unsigned GetClusterSize() const;
    double GetClusterWeight() const;
    double GetLeftBranchWeight() const;
    double GetRightBranchWeight() const;
    double GetLeftWeight() const;
    double GetRightWeight() const;

    void LogMe() const;

    // The second weight is the only field with a public setter.
    double GetWeight2() const { return m_dWeight2; }
    void SetWeight2(double dWeight2) { m_dWeight2 = dWeight2; }

protected:
    void SetIndex(unsigned uIndex) { m_uIndex = uIndex; }
    void SetWeight(double dWeight) { m_dWeight = dWeight; }
    void SetLeft(ClusterNode *ptrLeft) { m_ptrLeft = ptrLeft; }
    void SetRight(ClusterNode *ptrRight) { m_ptrRight = ptrRight; }
    void SetParent(ClusterNode *ptrParent) { m_ptrParent = ptrParent; }
    void SetNextDisjoint(ClusterNode *ptrNode) { m_ptrNextDisjoint = ptrNode; }
    void SetPrevDisjoint(ClusterNode *ptrNode) { m_ptrPrevDisjoint = ptrNode; }

    ClusterNode *GetNextDisjoint() { return m_ptrNextDisjoint; }
    ClusterNode *GetPrevDisjoint() { return m_ptrPrevDisjoint; }

private:
    double m_dWeight;
    double m_dWeight2;
    unsigned m_uIndex;
    ClusterNode *m_ptrLeft;
    ClusterNode *m_ptrRight;
    ClusterNode *m_ptrParent;
    ClusterNode *m_ptrNextDisjoint;
    ClusterNode *m_ptrPrevDisjoint;
};
unsigned m_uSize; float m_dLength; ClustNode *m_ptrLeft; ClustNode *m_ptrRight; ClustNode *m_ptrParent; ClustNode *m_ptrNextCluster; ClustNode *m_ptrPrevCluster; unsigned *m_uLeafIndexes; }; class Clust { public: Clust(); virtual ~Clust(); void Create(ClustSet &Set, CLUSTER Method); unsigned GetLeafCount() const; unsigned GetClusterCount() const; unsigned GetClusterSize(unsigned uNodeIndex) const; unsigned GetLeaf(unsigned uClusterIndex, unsigned uLeafIndex) const; unsigned GetNodeCount() const { return 2*m_uLeafCount - 1; } const ClustNode &GetRoot() const { return m_Nodes[GetRootNodeIndex()]; } unsigned GetRootNodeIndex() const { return m_uNodeCount - 1; } const ClustNode &GetNode(unsigned uNodeIndex) const; bool IsLeaf(unsigned uNodeIndex) const; unsigned GetLeftIndex(unsigned uNodeIndex) const; unsigned GetRightIndex(unsigned uNodeIndex) const; float GetLength(unsigned uNodeIndex) const; float GetHeight(unsigned uNodeIndex) const; const char *GetNodeName(unsigned uNodeIndex) const; unsigned GetNodeId(unsigned uNodeIndex) const; JOIN GetJoinStyle() const { return m_JoinStyle; } LINKAGE GetCentroidStyle() const { return m_CentroidStyle; } void SetDist(unsigned uIndex1, unsigned uIndex2, float dDist); float GetDist(unsigned uIndex1, unsigned uIndex2) const; void ToPhylip(Phylip &tree); void LogMe() const; //private: void SetLeafCount(unsigned uLeafCount); void CreateCluster(); void JoinNodes(unsigned uLeftNodeIndex, unsigned uRightNodeIndex, float dLeftLength, float dRightLength, unsigned uNewNodeIndex); void ChooseJoin(unsigned *ptruLeftIndex, unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength); void ChooseJoinNeighborJoining(unsigned *ptruLeftIndex, unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength); void ChooseJoinNearestNeighbor(unsigned *ptruLeftIndex, unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength); float ComputeDist(unsigned uNewNodeIndex, unsigned uNodeIndex); float 
ComputeDistAverageLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex); float ComputeDistMinLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex); float ComputeDistMaxLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex); float ComputeDistNeighborJoining(unsigned uNewNewIndex, unsigned uNodeIndex); float ComputeDistMAFFT(unsigned uNewNewIndex, unsigned uNodeIndex); float Calc_r(unsigned uNodeIndex) const; unsigned VectorIndex(unsigned uIndex1, unsigned uIndex2) const; unsigned GetFirstCluster() const; unsigned GetNextCluster(unsigned uNodeIndex) const; float ComputeMetric(unsigned uIndex1, unsigned uIndex2) const; float ComputeMetricNearestNeighbor(unsigned i, unsigned j) const; float ComputeMetricNeighborJoining(unsigned i, unsigned j) const; void InitMetric(unsigned uMaxNodeIndex); void InsertMetric(unsigned uIndex1, unsigned uIndex2, float dMetric); float GetMinMetric(unsigned *ptruIndex1, unsigned *ptruIndex2) const; float GetMinMetricBruteForce(unsigned *ptruIndex1, unsigned *ptruIndex2) const; void DeleteMetric(unsigned uIndex); void DeleteMetric(unsigned uIndex1, unsigned uIndex2); void ListMetric() const; void DeleteFromClusterList(unsigned uNodeIndex); void AddToClusterList(unsigned uNodeIndex); void RBDelete(unsigned RBNode); unsigned RBInsert(unsigned i, unsigned j, float fMetric); unsigned RBNext(unsigned RBNode) const; unsigned RBPrev(unsigned RBNode) const; unsigned RBMin(unsigned RBNode) const; unsigned RBMax(unsigned RBNode) const; void ValidateRB(const char szMsg[] = 0) const; void ValidateRBNode(unsigned Node, const char szMsg[]) const; //private: JOIN m_JoinStyle; LINKAGE m_CentroidStyle; ClustNode *m_Nodes; unsigned *m_ClusterIndexToNodeIndex; unsigned *m_NodeIndexToClusterIndex; unsigned m_uLeafCount; unsigned m_uNodeCount; unsigned m_uClusterCount; unsigned m_uTriangularMatrixSize; float *m_dDist; ClustSet *m_ptrSet; ClustNode *m_ptrClusterList; }; #endif // Clust_h clustsetdf.h0000664000175000017500000000176212360262614011507 0ustar 
bobbob#ifndef ClustSetDF_h #define ClustSetDF_h class MSA; class Clust; #include "clustset.h" #include "distfunc.h" #include "msa.h" class ClustSetDF : public ClustSet { public: ClustSetDF(const DistFunc &DF) : m_ptrDF(&DF) { } public: virtual unsigned GetLeafCount() { return m_ptrDF->GetCount(); } virtual const char *GetLeafName(unsigned uNodeIndex) { return m_ptrDF->GetName(uNodeIndex); } virtual unsigned GetLeafId(unsigned uNodeIndex) { return m_ptrDF->GetId(uNodeIndex); } virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex, unsigned uRightNodeIndex, unsigned uJoinedNodeIndex, double *ptrdLeftLength, double *ptrdRightLength) { Quit("ClustSetDF::JoinNodes, should never be called"); } virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1, unsigned uNodeIndex2) { return m_ptrDF->GetDist(uNodeIndex1, uNodeIndex2); } private: const DistFunc *m_ptrDF; }; #endif // ClustSetDF_h clustset.h0000664000175000017500000000110012360262614011157 0ustar bobbob#ifndef ClustSet_h #define ClustSet_h enum JOIN; enum LINKAGE; class Clust; class ClustSet { public: virtual unsigned GetLeafCount() = 0; virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1, unsigned uNodeIndex2) = 0; virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex, unsigned uRightNodeIndex, unsigned uJoinedNodeIndex, double *ptrdLeftLength, double *ptrdRightLength) = 0; virtual const char *GetLeafName(unsigned uNodeIndex) = 0; virtual unsigned GetLeafId(unsigned uNodeIndex) = 0; }; #endif // ClustSet_h clustsetmsa.h0000664000175000017500000000235612360262614011676 0ustar bobbob#ifndef ClustSetMSA_h #define ClustSetMSA_h class MSA; class Clust; #include "clustset.h" #include "msadist.h" // Distance matrix based set. // Computes distances between leaves, never between // joined clusters (leaves this to distance matrix method). 
class ClustSetMSA : public ClustSet { public: ClustSetMSA(const MSA &msa, MSADist &MD) : m_ptrMSA(&msa), m_ptrMSADist(&MD) { } public: virtual unsigned GetLeafCount() { return m_ptrMSA->GetSeqCount(); } virtual const char *GetLeafName(unsigned uNodeIndex) { return m_ptrMSA->GetSeqName(uNodeIndex); } virtual unsigned GetLeafId(unsigned uNodeIndex) { return m_ptrMSA->GetSeqId(uNodeIndex); } virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex, unsigned uRightNodeIndex, unsigned uJoinedNodeIndex, double *ptrdLeftLength, double *ptrdRightLength) { Quit("ClustSetMSA::JoinNodes, should never be called"); } virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1, unsigned uNodeIndex2) { return m_ptrMSADist->ComputeDist(*m_ptrMSA, uNodeIndex1, uNodeIndex2); } public: const MSA &GetMSA(); private: const MSA *m_ptrMSA; MSADist *m_ptrMSADist; }; #endif // ClustSetMSA_h diaglist.h0000664000175000017500000000353012360262614011122 0ustar bobbob#ifndef diaglist_h #define diaglist_h const unsigned EMPTY = (unsigned) ~0; const unsigned MAX_DIAGS = 1024; struct Diag { unsigned m_uStartPosA; unsigned m_uStartPosB; unsigned m_uLength; }; struct Rect { unsigned m_uStartPosA; unsigned m_uStartPosB; unsigned m_uLengthA; unsigned m_uLengthB; }; class DiagList { public: DiagList() { m_uCount = 0; } ~DiagList() { Free(); } public: // Creation void Clear() { Free(); } void FromPath(const PWPath &Path); void Add(const Diag &d); void Add(unsigned uStartPosA, unsigned uStartPosB, unsigned uLength); void DeleteIncompatible(); // Accessors unsigned GetCount() const { return m_uCount; } const Diag &Get(unsigned uIndex) const; // Operations void Sort(); void Copy(const DiagList &DL); // Query // returns true iff given diagonal is included in the list // in whole or in part. 
bool NonZeroIntersection(const Diag &d) const; bool IsSorted() const; // Diagnostics void LogMe() const; private: void Free() { m_uCount = 0; } private: unsigned m_uCount; Diag m_Diags[MAX_DIAGS]; }; unsigned DiagOverlap(const Diag &d1, const Diag &d2); unsigned DiagOverlapA(const Diag &d1, const Diag &d2); unsigned DiagOverlapB(const Diag &d1, const Diag &d2); unsigned DiagBreak(const Diag &d1, const Diag &d2); bool DiagCompatible(const Diag &d1, const Diag &d2); void CheckDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const MSA &msaA, const MSA &msaB, const PWPath &Path); void FindDiags(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY, unsigned uLengthY, DiagList &DL); void FindDiagsNuc(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY, unsigned uLengthY, DiagList &DL); void MergeDiags(DiagList &DL); #endif // diaglist_h distcalc.h0000664000175000017500000000205612360262614011112 0ustar bobbob#ifndef DistCalc_h #define DistCalc_h typedef float dist_t; const dist_t BIG_DIST = (dist_t) 1e29; class DistFunc; class DistCalc { public: virtual void CalcDistRange(unsigned i, dist_t Dist[]) const = 0; virtual unsigned GetCount() const = 0; virtual unsigned GetId(unsigned i) const = 0; virtual const char *GetName(unsigned i) const = 0; }; class DistCalcDF : public DistCalc { public: void Init(const DistFunc &DF); virtual void CalcDistRange(unsigned i, dist_t Dist[]) const; virtual unsigned GetCount() const; virtual unsigned GetId(unsigned i) const; virtual const char *GetName(unsigned i) const; private: const DistFunc *m_ptrDF; }; class DistCalcMSA : public DistCalc { public: void Init(const MSA &msa, DISTANCE Distance); virtual void CalcDistRange(unsigned i, dist_t Dist[]) const; virtual unsigned GetCount() const; virtual unsigned GetId(unsigned i) const; virtual const char *GetName(unsigned i) const; private: const MSA *m_ptrMSA; DISTANCE m_Distance; }; #endif // DistCalc_h 
distfunc.h0000664000175000017500000000144712360262614011146 0ustar bobbob
#ifndef DistFunc_h
#define DistFunc_h

// Table of pairwise float distances between indexed items,
// together with a name and an id per item.
// NOTE(review): VectorIndex/VectorLength suggest the distances
// live in a flat vector; the layout is defined in the .cpp,
// which is not visible here.
class DistFunc
	{
public:
	DistFunc();
	virtual ~DistFunc();

public:
	virtual void SetCount(unsigned uCount);
	virtual void SetDist(unsigned uIndex1, unsigned uIndex2, float dDist);

	void SetName(unsigned uIndex, const char szName[]);
	void SetId(unsigned uIndex, unsigned uId);
	const char *GetName(unsigned uIndex) const;
	unsigned GetId(unsigned uIndex) const;

	virtual float GetDist(unsigned uIndex1, unsigned uIndex2) const;
	virtual unsigned GetCount() const;

	void LogMe() const;

protected:
	unsigned VectorIndex(unsigned uIndex, unsigned uIndex2) const;
	unsigned VectorLength() const;

private:
	unsigned m_uCount;
	unsigned m_uCacheCount;
	float *m_Dists;
	char **m_Names;
	unsigned *m_Ids;
	};

#endif	// DistFunc_h
dpregionlist.h0000664000175000017500000000177212360262614012033 0ustar bobbob
#ifndef DPRegionList_h
#define DPRegionList_h

#include "diaglist.h"

enum DPREGIONTYPE
	{
	DPREGIONTYPE_Unknown,
	DPREGIONTYPE_Diag,
	DPREGIONTYPE_Rect
	};

// A region that is either a Diag or a Rect; m_Type tags which
// member of the anonymous union is meaningful.
struct DPRegion
	{
	DPREGIONTYPE m_Type;
	union
		{
		Diag m_Diag;
		Rect m_Rect;
		};
	};

const unsigned MAX_DPREGIONS = 1024;

// List of regions held in a fixed array of MAX_DPREGIONS
// entries. Clear()/Free() only reset the count.
// NOTE(review): dpreglist.h (below) declares the same type
// names under a different include guard; including both in one
// translation unit would redefine them.
class DPRegionList
	{
public:
	DPRegionList()
		{
		m_uCount = 0;
		}
	~DPRegionList()
		{
		Free();
		}

public:
// Creation
	void Clear()
		{
		Free();
		}
	void Add(const DPRegion &r);

// Accessors
	unsigned GetCount() const
		{
		return m_uCount;
		}
	// Index is range-checked by assert only.
	const DPRegion &Get(unsigned uIndex) const
		{
		assert(uIndex < m_uCount);
		return m_DPRegions[uIndex];
		}

// Diagnostics
	void LogMe() const;

private:
	void Free()
		{
		m_uCount = 0;
		}

private:
	unsigned m_uCount;
	DPRegion m_DPRegions[MAX_DPREGIONS];
	};

void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL,
  unsigned uLengthA, unsigned uLengthB);

#endif	// DPRegionList_h
dpreglist.h0000664000175000017500000000202312360262613011312 0ustar bobbob
#ifndef dpreglist_h
#define dpreglist_h

#include "diaglist.h"

enum DPREGIONTYPE
	{
	DPREGIONTYPE_Unknown,
	DPREGIONTYPE_Diag,
	DPREGIONTYPE_Rect
	};

// A region that is either a Diag or a Rect; m_Type tags which
// member of the anonymous union is meaningful.
struct DPRegion
	{
	DPREGIONTYPE m_Type;
	union
		{
		Diag m_Diag;
		Rect m_Rect;
		};
	};

const unsigned MAX_DPREGIONS = 1024;

// Same list type as declared in dpregionlist.h, with the
// additional GetDPArea() accessor.
class DPRegionList
	{
public:
	DPRegionList()
		{
		m_uCount = 0;
		}
	~DPRegionList()
		{
		Free();
		}

public:
// Creation
	void Clear()
		{
		Free();
		}
	void Add(const DPRegion &r);

// Accessors
	unsigned GetCount() const
		{
		return m_uCount;
		}
	// Index is range-checked by assert only.
	const DPRegion &Get(unsigned uIndex) const
		{
		assert(uIndex < m_uCount);
		return m_DPRegions[uIndex];
		}

	unsigned GetDPArea() const;

// Diagnostics
	void LogMe() const;

private:
	void Free()
		{
		m_uCount = 0;
		}

private:
	unsigned m_uCount;
	DPRegion m_DPRegions[MAX_DPREGIONS];
	};

void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL,
  unsigned uLengthA, unsigned uLengthB);

#endif	// dpreglist_h
edgelist.h0000664000175000017500000000077312360262614011130 0ustar bobbob
#ifndef EdgeList_h
#define EdgeList_h

// List of edges, each a pair of unsigned node indexes kept in
// two parallel arrays (m_uNode1 / m_uNode2).
// NOTE(review): Expand() and m_uCacheSize suggest the arrays
// grow on demand; the policy is in the .cpp, not visible here.
class EdgeList
	{
public:
	EdgeList();
	virtual ~EdgeList();

public:
	void Clear();
	void Add(unsigned uNode1, unsigned uNode2);
	unsigned GetCount() const;
	void GetEdge(unsigned uIndex, unsigned *ptruNode1, unsigned *ptruNode2) const;
	void Copy(const EdgeList &rhs);
	void LogMe() const;

private:
	void Expand();

private:
	unsigned m_uCount;
	unsigned m_uCacheSize;
	unsigned *m_uNode1;
	unsigned *m_uNode2;
	};

#endif	// EdgeList_h
enumopts.h0000664000175000017500000000037412360262614011177 0ustar bobbob
#ifndef enumopts_h
#define enumopts_h

// One option-name / integer-value pair.
struct EnumOpt
	{
	const char *pstrOpt;
	int iValue;
	};

// Including enums.h with these definitions turns every s(T)
// into "extern EnumOpt T_Opts[];" while c() and e() expand to
// nothing.
#define	s(t)		extern EnumOpt t##_Opts[];
#define c(t, x)		/* empty */
#define e(t)		/* empty */
#include "enums.h"

#endif // enumopts_h
enums.h0000664000175000017500000000376112360262614010457 0ustar bobbob
// enums.h
// Define enum types.
// Exploit macro hacks to avoid lots of repetitive typing.
// Generally I am opposed to macro hacks because of the
// highly obscure code that results, but in this case it
// makes maintenance much easier and less error-prone.
// The idea is that this file can be included in different
// places with different definitions of s (Start), c (Case)
// and e (End). See types.h.

s(ALPHA)
c(ALPHA, Amino)
c(ALPHA, DNA)
c(ALPHA, RNA)
e(ALPHA)

s(SEQTYPE)
c(SEQTYPE, Protein)
c(SEQTYPE, DNA)
c(SEQTYPE, RNA)
c(SEQTYPE, Auto)
e(SEQTYPE)

s(ROOT)
c(ROOT, Pseudo)
c(ROOT, MidLongestSpan)
c(ROOT, MinAvgLeafDist)
e(ROOT)

s(CLUSTER)
c(CLUSTER, UPGMA)
c(CLUSTER, UPGMAMax)
c(CLUSTER, UPGMAMin)
c(CLUSTER, UPGMB)
c(CLUSTER, NeighborJoining)
e(CLUSTER)

s(JOIN)
c(JOIN, NearestNeighbor)
c(JOIN, NeighborJoining)
e(JOIN)

s(LINKAGE)
c(LINKAGE, Min)
c(LINKAGE, Avg)
c(LINKAGE, Max)
c(LINKAGE, NeighborJoining)
c(LINKAGE, Biased)
e(LINKAGE)

s(DISTANCE)
c(DISTANCE, Kmer6_6)
c(DISTANCE, Kmer20_3)
c(DISTANCE, Kmer20_4)
c(DISTANCE, Kbit20_3)
c(DISTANCE, Kmer4_6)
c(DISTANCE, PctIdKimura)
c(DISTANCE, PctIdLog)
c(DISTANCE, PWKimura)
c(DISTANCE, PWScoreDist)
c(DISTANCE, ScoreDist)
c(DISTANCE, Edit)
e(DISTANCE)

s(PPSCORE)
c(PPSCORE, LE)
c(PPSCORE, SP)
c(PPSCORE, SV)
c(PPSCORE, SPN)
e(PPSCORE)

s(SEQWEIGHT)
c(SEQWEIGHT, None)
c(SEQWEIGHT, Henikoff)
c(SEQWEIGHT, HenikoffPB)
c(SEQWEIGHT, GSC)
c(SEQWEIGHT, ClustalW)
c(SEQWEIGHT, ThreeWay)
e(SEQWEIGHT)

s(OBJSCORE)
c(OBJSCORE, SP)				// Sum of Pairs of sequences
c(OBJSCORE, DP)				// Dynamic Programming score
c(OBJSCORE, XP)				// Cross Pairs = sum of pairs between two MSAs
c(OBJSCORE, PS)				// sum of Prof-Seq score for all seqs in MSA
c(OBJSCORE, SPF)			// sum of pairs, fast approximation
c(OBJSCORE, SPM)			// sp if <= 100 seqs, spf otherwise
e(OBJSCORE)

s(TERMGAPS)
c(TERMGAPS, Full)
c(TERMGAPS, Half)
c(TERMGAPS, Ext)
e(TERMGAPS)

// Drop the includer's macro definitions so the next inclusion
// can supply different ones.
#undef s
#undef c
#undef e
estring.h0000664000175000017500000000076512360262614011004 0ustar bobbob
// NOTE(review): the include guard is named pathsum_h although
// the archive member is estring.h; if another header uses the
// same guard, one of the two would be silently skipped.
#ifndef pathsum_h
#define pathsum_h

// Conversions between a PWPath and a pair of "estrings"
// (int arrays), plus operations on estrings.
void PathToEstrings(const PWPath &Path, int **ptresA, int **ptresB);
void EstringsToPath(const int esA[], const int esB[], PWPath &Path);
void MulEstrings(const int es1[], const int es2[], int esp[]);
void EstringOp(const int es[], const Seq &sIn, Seq &sOut);
unsigned EstringOp(const int es[], const Seq &sIn, MSA &a);
void LogEstring(const int es[]);
unsigned LengthEstring(const int es[]);
int *EstringNewCopy(const int es[]);

#endif	// pathsum_h
gapscoredimer.h0000664000175000017500000000525312360262614012152 0ustar bobbob
// source code generated by dimer.py

// Each GapScoreXY below returns a weighted sum of products of
// the LL / LG / GL / GG members of two profile positions,
// weighted by g_scoreGapOpen, g_scoreGapExtend and
// g_scoreGapAmbig. The functions are static, so every
// translation unit that includes this file gets its own copy.
// NOTE(review): the two-letter suffix presumably names a pair
// of alignment states (M, D, I) -- confirm against dimer.py.

static SCORE GapScoreMM(const ProfPos &PPA, const ProfPos &PPB)
	{
	return
	  g_scoreGapOpen*(PPA.m_LL*PPB.m_LG + PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_LG) +
	  g_scoreGapExtend*(PPA.m_LL*PPB.m_GG + PPA.m_GG*PPB.m_LL) +
	  g_scoreGapAmbig*(PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_GL);
	}

static SCORE GapScoreMD(const ProfPos &PPA, const ProfPos &PPB)
	{
	return
	  g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) +
	  g_scoreGapExtend*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG) +
	  g_scoreGapAmbig*(PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG);
	}

static SCORE GapScoreMI(const ProfPos &PPA, const ProfPos &PPB)
	{
	return
	  g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) +
	  g_scoreGapExtend*(PPA.m_LG*PPB.m_LL + PPA.m_GG*PPB.m_LL) +
	  g_scoreGapAmbig*(PPA.m_LG*PPB.m_GL + PPA.m_GG*PPB.m_GL);
	}

static SCORE GapScoreDM(const ProfPos &PPA, const ProfPos &PPB)
	{
	return
	  g_scoreGapOpen*(PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL) +
	  g_scoreGapExtend*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG) +
	  g_scoreGapAmbig*(PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_LL + PPA.m_GG*PPB.m_GL);
	}

static SCORE GapScoreDD(const ProfPos &PPA, const ProfPos &PPB)
	{
	return
	  g_scoreGapExtend*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GL + PPA.m_LL*PPB.m_GG) +
	  g_scoreGapAmbig*(PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GL + PPA.m_GL*PPB.m_GG);
	}

static SCORE GapScoreDI(const ProfPos &PPA, const ProfPos &PPB)
	{
	return
	  g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) +
	  g_scoreGapAmbig*(PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL + PPA.m_GG*PPB.m_LL + PPA.m_GG*PPB.m_GL);
	}

static SCORE GapScoreIM(const ProfPos &PPA, const ProfPos &PPB)
	{
	return
	  g_scoreGapOpen*(PPA.m_LL*PPB.m_LG + PPA.m_GL*PPB.m_LG) +
	  g_scoreGapExtend*(PPA.m_LG*PPB.m_LL + PPA.m_GG*PPB.m_LL) +
	  g_scoreGapAmbig*(PPA.m_LL*PPB.m_GG + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_GL);
	}

static SCORE GapScoreID(const ProfPos &PPA, const ProfPos &PPB)
	{
	return
	  g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) +
	  g_scoreGapAmbig*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG + PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG);
	}

static SCORE GapScoreII(const ProfPos &PPA, const ProfPos &PPB)
	{
	return
	  g_scoreGapExtend*(PPA.m_LL*PPB.m_LL + PPA.m_LG*PPB.m_LL + PPA.m_GL*PPB.m_LL + PPA.m_GG*PPB.m_LL) +
	  g_scoreGapAmbig*(PPA.m_LL*PPB.m_GL + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_GL + PPA.m_GG*PPB.m_GL);
	}
gonnet.h0000664000175000017500000000037112360262613010613 0ustar bobbob
#ifndef Gonnet_h
#define Gonnet_h

// One row of 20 doubles; GetGonnetMatrix returns a pointer to
// an array of such rows.
typedef double t_ROW[20];

const t_ROW *GetGonnetMatrix(unsigned N);
SCORE GetGonnetGapOpen(unsigned N);
SCORE GetGonnetGapExtend(unsigned N);

extern double GonnetLookup[400][400];

#endif	// Gonnet_h
intmath.h0000664000175000017500000001343412360262614010772 0ustar bobbob
// IntMath.h: Header for doing fractional math with integers for speed.
// (With BASETYPE typedef'd to float and INTSCALE set to 1 below,
// the types declared here are currently floating point.)

#ifndef IntMath_h
#define IntMath_h

typedef float BASETYPE;
//typedef double BASETYPE;

// Scaling factor used to store certain floating point
// values as integers to a few significant figures.
//const int INTSCALE = 1000;
const int INTSCALE = 1;

// Type for a probability in range 0.0 to 1.0.
typedef BASETYPE PROB;

// Type for an log-odds integer score.
// Stored as log2(PROB)*INTSCALE.
//typedef int SCORE;
typedef BASETYPE SCORE;

// Type for a weight.
// Stored as w*INTSCALE where w is in range 0.0 to 1.0.
//typedef unsigned WEIGHT;
typedef BASETYPE WEIGHT;

// Type for a fractional weighted count stored as n*WEIGHT/N
// where n=measured count (integer >= 0) and N is total for
// the distribution (e.g., n=number of residues of a given
// type in a column, N=number of residues in the column).
// Hence values in an FCOUNT variable range from 0..INTSCALE
// as an integer, representing "true" values 0.0 to 1.0.
//typedef unsigned FCOUNT;
typedef BASETYPE FCOUNT;

// Representation of -infinity. Value should
// be large and negative, but not so large
// that adding a few of them overflows.
// TODO: Multiplied by 10 to work around bug
// when aligning Bali 1ckaA in ref4, which is
// so long that B->Mmax got to -infinity, causing
// traceback to fail.
//const int MINUS_INFINITY = -10000000;
const BASETYPE MINUS_INFINITY = (BASETYPE) -1e37;
const BASETYPE PLUS_INFINITY = (BASETYPE) 1e37;

// Probability relative to a null model
typedef double RPROB;

PROB ScoreToProb(SCORE Score);
SCORE ProbToScore(PROB Prob);
SCORE DoubleToScore(double d);
WEIGHT DoubleToWeight(double d);
double WeightToDouble(WEIGHT w);
// NOTE(review): a function-like macro with the same name but
// three parameters is #defined further down; after that point
// any use of the name followed by '(' expands the macro, not
// this function.
SCORE MulScoreWeight(SCORE Score, WEIGHT Weight);
bool ScoreEq(SCORE s1, SCORE s2);
bool BTEq(double b1, double b2);

// Undo the INTSCALE scaling of a score.
static double ScoreToDouble(SCORE Score)
	{
	return (double) Score / (double) INTSCALE;
	}

#if	0
// In-line assembler for Result = (x*y)/z
// Note that imul and idiv will do 64-bit arithmetic
// on 32-bit operands, so this shouldn't overflow
// Can't write this efficiently in C/C++ (would
// often overflow 32 bits).
#define MulDivAssign(Result, x, y, z)	\
	{									\
	int X = (x);						\
	int Y = (y);						\
	int Z = (z);						\
	_asm mov	eax,X					\
	_asm imul	Y						\
	_asm mov	ecx,Z					\
	_asm idiv	ecx						\
	_asm mov	Result,eax				\
	}
#else
// Portable form: plain expression, arguments parenthesised.
#define MulDivAssign(Result, x, y, z)	Result = (((x)*(y))/(z))
#endif

// r = (first operand * second operand) / INTSCALE
#define	MulScoreWeight(r, s, w)		MulDivAssign(r, s, w, INTSCALE)
#define MulWeightWCount(r, wt, wc)	MulDivAssign(r, wt, wc, INTSCALE)
#define MulFCountScore(r, fc, sc)	MulDivAssign(r, fc, sc, INTSCALE)

#if	_DEBUG

// Debug build: arithmetic helpers are real functions that treat
// MINUS_INFINITY as absorbing (any operand equal to it yields
// MINUS_INFINITY) and, for Add2/Sub2, clamp results below
// MINUS_INFINITY back to it.

static inline SCORE Add2(SCORE a, SCORE b)
	{
	if (MINUS_INFINITY == a)
		return MINUS_INFINITY;
	if (MINUS_INFINITY == b)
		return MINUS_INFINITY;
	SCORE sum = a + b;
	if (sum < MINUS_INFINITY)
		return MINUS_INFINITY;
//	assert(sum < OVERFLOW_WARN);
	return sum;
	}

// Add3..Add7 are built from Add2 in the pairings shown; the
// grouping fixes the order in which the operands are summed.

static inline SCORE Add3(SCORE a, SCORE b, SCORE c)
	{
	return Add2(Add2(a, b), c);
	}

static inline SCORE Add4(SCORE a, SCORE b, SCORE c, SCORE d)
	{
	return Add2(Add2(a, b), Add2(c, d));
	}

static inline SCORE Add5(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e)
	{
	return Add3(Add2(a, b), Add2(c, d), e);
	}

static inline SCORE Add6(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e, SCORE f)
	{
	return Add3(Add2(a, b), Add2(c, d), Add2(e, f));
	}

static inline SCORE Add7(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e, SCORE f, SCORE g)
	{
	return Add4(Add2(a, b), Add2(c, d), Add2(e, f), g);
	}

static inline SCORE Mul2(SCORE a, SCORE b)
	{
	if (MINUS_INFINITY == a)
		return MINUS_INFINITY;
	if (MINUS_INFINITY == b)
		return MINUS_INFINITY;
	//__int64 prod = (__int64) a * (__int64) b;
	//assert((SCORE) prod == prod);
	//return (SCORE) prod;
	return a*b;
	}

static inline SCORE Sub2(SCORE a, SCORE b)
	{
	if (MINUS_INFINITY == a)
		return MINUS_INFINITY;
	if (MINUS_INFINITY == b)
		return MINUS_INFINITY;
	SCORE diff = a - b;
	if (diff < MINUS_INFINITY)
		return MINUS_INFINITY;
//	assert(diff < OVERFLOW_WARN);
	return diff;
	}

// Only the dividend is checked against MINUS_INFINITY.
static inline SCORE Div2(SCORE a, int b)
	{
	if (MINUS_INFINITY == a)
		return MINUS_INFINITY;
	return a/b;
	}

//static inline SCORE MulScoreWeight(SCORE s, WEIGHT w)
//	{
//	SCORE Prod =
s*(SCORE) w; // assert(Prod < OVERFLOW_WARN); // extern void Log(const char Format[], ...); // if (Prod/(SCORE) w != s) // Log("**WARRNING MulScoreWeight Prod=%d w=%d Prod/w=%d s=%d\n", // Prod, // w, // Prod/(SCORE) w, // s); // assert(Prod/ (SCORE) w == s); // return Prod/INTSCALE; // } // //static inline WCOUNT MulWeightWCount(WEIGHT wt, WCOUNT wc) // { // return (wt*wc)/INTSCALE; // } #else #define Add2(a, b) ((a) + (b)) #define Sub2(a, b) ((MINUS_INFINITY == (a)) ? MINUS_INFINITY : ((a) - (b))) #define Div2(a, b) ((MINUS_INFINITY == (a)) ? MINUS_INFINITY : ((a) / (b))) #define Add3(a, b, c) ((a) + (b) + (c)) #define Add4(a, b, c, d) ((a) + (b) + (c) + (d)) #define Add5(a, b, c, d, e) ((a) + (b) + (c) + (d) + (e)) #define Add6(a, b, c, d, e, f) ((a) + (b) + (c) + (d) + (e) + (f)) #define Add7(a, b, c, d, e, f, g) ((a) + (b) + (c) + (d) + (e) + (f) + (g)) //#define MulScoreWeight(s, w) (((s)*(SCORE) (w))/INTSCALE) #define Mul2(a, b) ((a)*(b)) #endif //static inline SCORE MulFCountScore(FCOUNT fc, SCORE sc) // { //// Fast way to say "if (fc >= 2^15 || sc >= 2^15)": // if ((fc | sc) & 0xffff1000) // { // SCORE Score = ((fc+5)/10)*sc; // assert(Score < assert); // OVERFLOW_WARN(Score > MINUS_INFINITY); // return Score/(INTSCALE/10); // } // SCORE Score = fc*sc; // assert(Score < OVERFLOW_WARN); // assert(Score > MINUS_INFINITY); // return Score/INTSCALE; // } #endif // IntMath_h msadist.h0000664000175000017500000000146512360262613010772 0ustar bobbob#ifndef MSADist_h #define MSADist_h #include double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2); class MSADist { public: MSADist(DISTANCE Distance) { m_Distance = Distance; } double ComputeDist(const MSA &msa, unsigned uSeqIndex1, unsigned uSeqIndex2) { if (m_Distance == DISTANCE_ScoreDist) return GetScoreDist(msa, uSeqIndex1, uSeqIndex2); double dPctId = msa.GetPctIdentityPair(uSeqIndex1, uSeqIndex2); switch(m_Distance) { case DISTANCE_PctIdKimura: return KimuraDist(dPctId); case 
DISTANCE_PctIdLog: if (dPctId < 0.05) dPctId = 0.05; return -log(dPctId); } Quit("MSADist::ComputeDist, invalid DISTANCE_%u", m_Distance); return 0; } private: DISTANCE m_Distance; }; #endif // MSADist_h msadistkimura.h0000664000175000017500000000037512360262614012203 0ustar bobbob#ifndef MSADistKimura_h #define MSADistKimura_h #include "msadist.h" class MSADistKimura : public MSADist { public: virtual double ComputeDist(const MSA &msa, unsigned uSeqIndex1, unsigned uSeqIndex2); }; #endif // MSADistKimura_h msadistmafft.h0000664000175000017500000000102012360262614011774 0ustar bobbob#ifndef MSADistMAFFT_h #define MSADistMAFFT_h #include "msadist.h" #include extern double PctIdToMAFFTDist(double dPctId); class MSADistMAFFT : public MSADist { public: virtual double ComputeDist(const MSA &msa, unsigned uSeqIndex1, unsigned uSeqIndex2) { double dPctId = msa.GetPctIdentityPair(uSeqIndex1, uSeqIndex2); //if (dPctId < 0.05) // dPctId = 0.05; //double dDist = -log(dPctId); //return dDist; return PctIdToMAFFTDist(dPctId); } }; #endif // MSADistMAFFT_h msa.h0000664000175000017500000001413712360262614010107 0ustar bobbob#ifndef MSA_h #define MSA_h const int MAX_SEQ_NAME = 63; struct PathEdge; class TextFile; class Seq; class ClusterNode; class NodeCounts; class DataBuffer; class MSA { public: MSA(); virtual ~MSA(); public: // Ways to create an MSA void FromFile(TextFile &File); void FromFASTAFile(TextFile &File); void FromSeq(const Seq &s); void ToFile(TextFile &File) const; void ToFASTAFile(TextFile &File) const; void ToMSFFile(TextFile &File, const char *ptrComment = 0) const; void ToAlnFile(TextFile &File) const; void ToHTMLFile(TextFile &File) const; void ToPhySequentialFile(TextFile &File) const; void ToPhyInterleavedFile(TextFile &File) const; void SetSize(unsigned uSeqCount, unsigned uColCount); void SetSeqCount(unsigned uSeqCount); char GetChar(unsigned uSeqIndex, unsigned uIndex) const; unsigned GetLetter(unsigned uSeqIndex, unsigned uIndex) const; unsigned 
GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const; const char *GetSeqName(unsigned uSeqIndex) const; unsigned GetSeqId(unsigned uSeqIndex) const; unsigned GetSeqIndex(unsigned uId) const; bool GetSeqIndex(unsigned uId, unsigned *ptruIndex) const; double GetOcc(unsigned uColIndex) const; void GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize, FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd, FCOUNT *fcGapExtend, FCOUNT *ptrfOcc, FCOUNT *fcLL, FCOUNT *fcLG, FCOUNT *fcGL, FCOUNT *fcGG) const; bool IsGap(unsigned uSeqIndex, unsigned uColIndex) const; bool IsWildcard(unsigned uSeqIndex, unsigned uColIndex) const; bool IsGapColumn(unsigned uColIndex) const; bool ColumnHasGap(unsigned uColIndex) const; bool IsGapSeq(unsigned uSeqIndex) const; void SetChar(unsigned uSeqIndex, unsigned uColIndex, char c); void SetSeqName(unsigned uSeqIndex, const char szName[]); void SetSeqId(unsigned uSeqIndex, unsigned uId); bool HasGap() const; bool IsLegalLetter(unsigned uLetter) const; void GetSeq(unsigned uSeqIndex, Seq &seq) const; void Copy(const MSA &msa); double GetCons(unsigned uColIndex) const; double GetAvgCons() const; double GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const; bool GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const; void DeleteCol(unsigned uColIndex); void DeleteColumns(unsigned uColIndex, unsigned uColCount); void CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex); void DeleteSeq(unsigned uSeqIndex); // void DeleteEmptyCols(bool bProgress = false); bool IsEmptyCol(unsigned uColIndex) const; WEIGHT GetSeqWeight(unsigned uSeqIndex) const; WEIGHT GetTotalSeqWeight() const; void SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const; void NormalizeWeights(WEIGHT wTotal) const; bool WeightsSet() const; unsigned GetGCGCheckSum(unsigned uSeqIndex) const; ALPHA GuessAlpha() const; void FixAlpha(); unsigned UniqueResidueTypes(unsigned uColIndex) const; void UnWeight(); void 
GetNodeCounts(unsigned uAlignedColIndex, NodeCounts &Counts) const; void ValidateBreakMatrices() const; unsigned GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const; const char *GetSeqBuffer(unsigned uSeqIndex) const; unsigned AlignedColIndexToColIndex(unsigned uAlignedColIndex) const; unsigned GetSeqLength(unsigned uSeqIndex) const; void GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrdPWID, unsigned *ptruPosCount) const; void GetPairMap(unsigned uSeqIndex1, unsigned uSeqIndex2, int iMap1[], int iMap2[]) const; void LogMe() const; void ListWeights() const; void GapInfoToDataBuffer(DataBuffer &Buffer) const; void GapInfoFromDataBuffer(const DataBuffer &Buffer); double GetPctGroupIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const; void Clear() { Free(); } unsigned GetSeqCount() const { return m_uSeqCount; } unsigned GetColCount() const { return m_uColCount; } static bool SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2, unsigned uSeqIndex2); static void SetIdCount(unsigned uIdCount); private: friend void SetMSAWeightsMuscle(MSA &msa); friend void SetThreeWayWeightsMuscle(MSA &msa); void SetHenikoffWeightsPB() const; void SetHenikoffWeights() const; void SetGSCWeights() const; void SetUniformWeights() const; void SetClustalWWeights(const Tree &tree); void Free(); void AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel); void ExpandCache(unsigned uSeqCount, unsigned uColCount); void CalcWeights() const; void GetNameFromFASTAAnnotationLine(const char szLine[], char szName[], unsigned uBytes); void CopyCol(unsigned uFromCol, unsigned uToCol); unsigned CalcBLOSUMWeights(ClusterTree &BlosumCluster) const; void SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const; unsigned SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const; void SetSubtreeWeight2(const ClusterNode *ptrNode) const; void SetSubtreeGSCWeight(ClusterNode *ptrNode) const; void CalcHenikoffWeightsColPB(unsigned uColIndex) 
const; void CalcHenikoffWeightsCol(unsigned uColIndex) const; private: unsigned m_uSeqCount; unsigned m_uColCount; unsigned m_uCacheSeqLength; unsigned m_uCacheSeqCount; char **m_szSeqs; char **m_szNames; static unsigned m_uIdCount; unsigned *m_IdToSeqIndex; unsigned *m_SeqIndexToId; WEIGHT *m_Weights; }; void SeqVectFromMSA(const MSA &msa, SeqVect &v); void DeleteGappedCols(MSA &msa); void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount, MSA &msaOut); void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat); void MSAAppend(MSA &msa1, const MSA &msa2); void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount, MSA &msaOut); void AssertMSAEq(const MSA &msa1, const MSA &msa2); void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2); void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount, MSA &msaOut); void SetMSAWeightsMuscle(MSA &msa); void SetClustalWWeightsMuscle(MSA &msa); void SetThreeWayWeightsMuscle(MSA &msa); #endif // MSA_h muscle.h0000664000175000017500000002706512360262614010623 0ustar bobbob#if DEBUG && !_DEBUG #define _DEBUG 1 #endif #if _DEBUG && !DEBUG #define DEBUG 1 #endif #if _MSC_VER #define TIMING 0 #endif #define VER_3_52 0 #ifdef _MSC_VER // Miscrosoft compiler #pragma warning(disable : 4800) // int-bool conversion #pragma warning(disable : 4996) // deprecated names like strdup, isatty. 
#endif extern const char *MUSCLE_LONG_VERSION; #define SHORT_VERSION "3.8" #include #include #include #include #include #define DOUBLE_AFFINE 0 #define SINGLE_AFFINE 1 #define PAF 0 #include "types.h" #include "intmath.h" #include "alpha.h" #include "params.h" #ifndef _WIN32 #define stricmp strcasecmp #define strnicmp strncasecmp #define _snprintf snprintf #define _fsopen(name, mode, share) fopen((name), (mode)) #endif #if DEBUG #undef assert #define assert(b) Call_MY_ASSERT(__FILE__, __LINE__, b, #b) void Call_MY_ASSERT(const char *file, int line, bool b, const char *msg); #else #define assert(exp) ((void)0) #endif extern int g_argc; extern char **g_argv; #define Rotate(a, b, c) { SCORE *tmp = a; a = b; b = c; c = tmp; } const double VERY_LARGE_DOUBLE = 1e20; extern unsigned g_uTreeSplitNode1; extern unsigned g_uTreeSplitNode2; // Number of elements in array a[] #define countof(a) (sizeof(a)/sizeof(a[0])) // Maximum of two of any type #define Max2(a, b) ((a) > (b) ? (a) : (b)) // Maximum of three of any type #define Max3(a, b, c) Max2(Max2(a, b), c) // Minimum of two of any type #define Min2(a, b) ((a) < (b) ? (a) : (b)) // Maximum of four of any type #define Max4(a, b, c, d) Max2(Max2(a, b), Max2(c, d)) const double VERY_NEGATIVE_DOUBLE = -9e29; const float VERY_NEGATIVE_FLOAT = (float) -9e29; const double BLOSUM_DIST = 0.62; // todo settable // insane value for uninitialized variables const unsigned uInsane = 8888888; const int iInsane = 8888888; const SCORE scoreInsane = 8888888; const char cInsane = (char) 0xcd; // int 3 instruction, used e.g. for unint. 
memory const double dInsane = VERY_NEGATIVE_DOUBLE; const float fInsane = VERY_NEGATIVE_FLOAT; const char INVALID_STATE = '*'; const BASETYPE BTInsane = (BASETYPE) dInsane; const WEIGHT wInsane = BTInsane; extern double g_dNAN; extern unsigned long g_tStart; void Quit(const char szFormat[], ...); void Warning(const char szFormat[], ...); void TrimBlanks(char szStr[]); void TrimLeadingBlanks(char szStr[]); void TrimTrailingBlanks(char szStr[]); void Log(const char szFormat[], ...); bool Verbose(); const char *ScoreToStr(SCORE Score); const char *ScoreToStrL(SCORE Score); SCORE StrToScore(const char *pszStr); void Break(); double VecSum(const double v[], unsigned n); bool IsValidInteger(const char *Str); bool IsValidSignedInteger(const char *Str); bool IsValidIdentifier(const char *Str); bool IsValidFloatChar(char c); bool isident(char c); bool isidentf(char c); void TreeFromSeqVect(const SeqVect &c, Tree &tree, CLUSTER Cluster, DISTANCE Distance, ROOT Root, const char *SaveFileName = 0); void TreeFromMSA(const MSA &msa, Tree &tree, CLUSTER Cluster, DISTANCE Distance, ROOT Root, const char *SaveFileName = 0); void StripGaps(char szStr[]); void StripWhitespace(char szStr[]); const char *GetTimeAsStr(); unsigned CalcBLOSUMWeights(MSA &Aln, ClusterTree &BlosumCluster); void CalcGSCWeights(MSA &Aln, const ClusterTree &BlosumCluster); void AssertNormalized(const PROB p[]); void AssertNormalizedOrZero(const PROB p[]); void AssertNormalized(const double p[]); bool VectorIsZero(const double dValues[], unsigned n); void VectorSet(double dValues[], unsigned n, double d); bool VectorIsZero(const float dValues[], unsigned n); void VectorSet(float dValues[], unsigned n, float d); // @@TODO should be "not linux" #if _WIN32 double log2(double x); // Defined in on Linux #endif double pow2(double x); double lnTolog2(double ln); double lp2(double x); SCORE SumLog(SCORE x, SCORE y); SCORE SumLog(SCORE x, SCORE y, SCORE z); SCORE SumLog(SCORE w, SCORE x, SCORE y, SCORE z); double 
lp2Fast(double x); double SumLogFast(double x, double y); double SumLogFast(double x, double y, double z); double SumLogFast(double w, double x, double y, double z); void chkmem(const char szMsg[] = ""); void Normalize(PROB p[], unsigned n); void Normalize(PROB p[], unsigned n, double dRequiredTotal); void NormalizeUnlessZero(PROB p[], unsigned n); void DebugPrintf(const char szFormat[], ...); void SetListFileName(const char *ptrListFileName, bool bAppend); void ModelFromAlign(const char *strInputFileName, const char *strModelFileName, double dMaxNIC); double GetMemUseMB(); double GetRAMSizeMB(); double GetPeakMemUseMB(); void CheckMemUse(); const char *ElapsedTimeAsString(); char *SecsToHHMMSS(long lSecs, char szStr[]); double GetCPUGHz(); SCORE GetBlosum62(unsigned uLetterA, unsigned uLetterB); SCORE GetBlosum62d(unsigned uLetterA, unsigned uLetterB); SCORE GetBlosum50(unsigned uLetterA, unsigned uLetterB); void AssertNormalizedDist(const PROB p[], unsigned N); void CmdLineError(const char *Format, ...); void Fatal(const char *Format, ...); void InitCmd(); void ExecCommandLine(int argc, char *argv[]); void DoCmd(); void SetLogFile(); void NameFromPath(const char szPath[], char szName[], unsigned uBytes); char *strsave(const char *s); void DistKmer20_3(const SeqVect &v, DistFunc &DF); void DistKbit20_3(const SeqVect &v, DistFunc &DF); void DistKmer6_6(const SeqVect &v, DistFunc &DF); void DistKmer4_6(const SeqVect &v, DistFunc &DF); void DistPWKimura(const SeqVect &v, DistFunc &DF); void FastDistKmer(const SeqVect &v, DistFunc &DF); void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, DistFunc &DF); double PctIdToMAFFTDist(double dPctId); double KimuraDist(double dPctId); void SetFastParams(); void AssertProfsEq(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB); void ValidateMuscleIds(const MSA &msa); void ValidateMuscleIds(const Tree &tree); void TraceBackToPath(int **TraceBack, unsigned uLengthA, unsigned uLengthB, PWPath &Path); 
void BitTraceBack(char **TraceBack, unsigned uLengthA, unsigned uLengthB, char LastEdge, PWPath &Path); SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path, bool bLockLeft = false, bool bLockRight = false); SCORE AlignTwoProfs( const ProfPos *PA, unsigned uLengthA, WEIGHT wA, const ProfPos *PB, unsigned uLengthB, WEIGHT wB, PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut); void AlignTwoProfsGivenPath(const PWPath &Path, const ProfPos *PA, unsigned uLengthA, WEIGHT wA, const ProfPos *PB, unsigned uLengthB, WEIGHT wB, ProfPos **ptrPOut, unsigned *ptruLengthOut); void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB, MSA &msaCombined); void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB, MSA &msaCombined); SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const PWPath &Path); SCORE GlobalAlignDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignSP(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignSPN(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); SCORE GlobalAlignLE(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); void CalcThreeWayWeights(const Tree &tree, unsigned uNode1, unsigned uNode2, WEIGHT *Weights); SCORE GlobalAlignSS(const Seq &seqA, const Seq &seqB, PWPath &Path); bool RefineHoriz(MSA &msaIn, const Tree &tree, unsigned uIters, bool bLockLeft, bool bLockRight); bool RefineVert(MSA &msaIn, const Tree &tree, unsigned uIters); SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); void SetInputFileName(const char *pstrFileName); void SetIter(unsigned 
uIter); void IncIter(); void SetMaxIters(unsigned uMaxIters); void Progress(unsigned uStep, unsigned uTotalSteps); void Progress(const char *szFormat, ...); void SetStartTime(); void ProgressStepsDone(); void SetProgressDesc(const char szDesc[]); void SetSeqStats(unsigned uSeqCount, unsigned uMinL, unsigned uMaxL, unsigned uAvgL); void SetNewHandler(); void SaveCurrentAlignment(); void SetCurrentAlignment(MSA &msa); void SetOutputFileName(const char *out); #if DEBUG void SetMuscleSeqVect(SeqVect &v); void SetMuscleInputMSA(MSA &msa); void ValidateMuscleIds(const MSA &msa); void ValidateMuscleIds(const Tree &tree); #else #define SetMuscleSeqVect(x) /* empty */ #define SetMuscleInputMSA(x) /* empty */ #define ValidateMuscleIds(x) /* empty */ #endif void ProcessArgVect(int argc, char *argv[]); void ProcessArgStr(const char *Str); void Usage(); void SetParams(); void SortCounts(const FCOUNT fcCounts[], unsigned SortOrder[]); unsigned ResidueGroupFromFCounts(const FCOUNT fcCounts[]); FCOUNT SumCounts(const FCOUNT Counts[]); bool FlagOpt(const char *Name); const char *ValueOpt(const char *Name); void DoMuscle(); void ProfDB(); void DoSP(); void ProgAlignSubFams(); void Run(); void ListParams(); void OnException(); void SetSeqWeightMethod(SEQWEIGHT Method); SEQWEIGHT GetSeqWeightMethod(); WEIGHT GetMuscleSeqWeightById(unsigned uId); void ListDiagSavings(); void CheckMaxTime(); const char *MaxSecsToStr(); unsigned long GetStartTime(); void ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a); ProgNode *ProgressiveAlignE(const SeqVect &v, const Tree &GuideTree, MSA &a); void CalcDistRangeKmer6_6(const MSA &msa, unsigned uRow, float Dist[]); void CalcDistRangeKmer20_3(const MSA &msa, unsigned uRow, float Dist[]); void CalcDistRangeKmer20_4(const MSA &msa, unsigned uRow, float Dist[]); void CalcDistRangePctIdKimura(const MSA &msa, unsigned uRow, float Dist[]); void CalcDistRangePctIdLog(const MSA &msa, unsigned uRow, float Dist[]); void MakeRootMSA(const SeqVect 
&v, const Tree &GuideTree, ProgNode Nodes[], MSA &a); void MakeRootMSABrenner(SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a); void Refine(); void Local(); void Profile(); void PPScore(); void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage); char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, bool DeleteGaps = true); SCORE SW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); void TraceBackSW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, unsigned uPrefixLengthAMax, unsigned uPrefixLengthBMax, PWPath &Path); void DiffPaths(const PWPath &p1, const PWPath &p2, unsigned Edges1[], unsigned *ptruDiffCount1, unsigned Edges2[], unsigned *ptruDiffCount2); void SetPPScore(bool bRespectFlagOpts = true); void SetPPScore(PPSCORE p); SCORE GlobalAlignDimer(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); bool MissingCommand(); void Credits(); void ProfileProfile(MSA &msa1, MSA &msa2, MSA &msaOut); void MHackStart(SeqVect &v); void MHackEnd(MSA &msa); void WriteScoreFile(const MSA &msa); char ConsensusChar(const ProfPos &PP); void Stabilize(const MSA &msa, MSA &msaStable); void MuscleOutput(MSA &msa); PTR_SCOREMATRIX ReadMx(TextFile &File); void MemPlus(size_t Bytes, char *Where); void MemMinus(size_t Bytes, char *Where); objscore.h0000664000175000017500000000237712360262613011137 0ustar bobbob#ifndef ObjScore_h #define ObjScore_h SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2); SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2); SCORE ScoreGaps(const MSA &msa, const unsigned Cols[], unsigned ColCount); SCORE ObjScore(const MSA &msa, const unsigned SeqIndexes1[], unsigned uSeqCount1, const unsigned SeqIndexes2[], unsigned uSeqCount2); SCORE ObjScoreIds(const MSA &msa, const 
unsigned Ids1[], unsigned uCount1, const unsigned Ids2[], unsigned uCount2); void GetLetterScores(const MSA &msa, SCORE LetterScores[]); SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[] = 0); SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[] = 0); SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[] = 0); SCORE ObjScoreXP(const MSA &msa, const MSA &msa2); SCORE ObjScoreSPDimer(const MSA &msa); SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount, SCORE MatchScore[] = 0); SCORE DiffObjScore( const MSA &msa1, const PWPath &Path1, const unsigned Edges1[], unsigned uEdgeCount1, const MSA &msa2, const PWPath &Path2, const unsigned Edges2[], unsigned uEdgeCount2); #endif // ObjScore_h params.h0000664000175000017500000000556412360262613010615 0ustar bobbob#ifndef params_h #define params_h extern const char *g_pstrInFileName; extern const char *g_pstrOutFileName; extern const char *g_pstrFASTAOutFileName; extern const char *g_pstrMSFOutFileName; extern const char *g_pstrClwOutFileName; extern const char *g_pstrClwStrictOutFileName; extern const char *g_pstrHTMLOutFileName; extern const char *g_pstrPHYIOutFileName; extern const char *g_pstrPHYSOutFileName; extern const char *g_pstrDistMxFileName1; extern const char *g_pstrDistMxFileName2; extern const char *g_pstrFileName1; extern const char *g_pstrFileName2; extern const char *g_pstrSPFileName; extern const char *g_pstrMatrixFileName; extern const char *g_pstrUseTreeFileName; extern bool g_bUseTreeNoWarn; extern const char *g_pstrComputeWeightsFileName; extern const char *g_pstrScoreFileName; extern SCORE g_scoreGapOpen; extern SCORE g_scoreCenter; extern SCORE g_scoreGapExtend; extern SCORE g_scoreGapAmbig; #if DOUBLE_AFFINE extern SCORE g_scoreGapOpen2; extern SCORE g_scoreGapExtend2; #endif extern unsigned g_uSmoothWindowLength; extern unsigned g_uAnchorSpacing; extern unsigned g_uMaxTreeRefineIters; extern unsigned g_uMinDiagLength; extern unsigned g_uMaxDiagBreak; 
extern unsigned g_uDiagMargin; extern unsigned g_uRefineWindow; extern unsigned g_uWindowFrom; extern unsigned g_uWindowTo; extern unsigned g_uSaveWindow; extern unsigned g_uWindowOffset; extern unsigned g_uMaxSubFamCount; extern unsigned g_uHydrophobicRunLength; extern float g_dHydroFactor; extern float g_dSmoothScoreCeil; extern float g_dMinBestColScore; extern float g_dMinSmoothScore; extern float g_dSUEFF; extern bool g_bPrecompiledCenter; extern bool g_bNormalizeCounts; extern bool g_bDiags1; extern bool g_bDiags2; extern bool g_bDiags; extern bool g_bAnchors; extern bool g_bCatchExceptions; extern bool g_bMSF; extern bool g_bAln; extern bool g_bClwStrict; extern bool g_bHTML; extern bool g_bPHYI; extern bool g_bPHYS; extern bool g_bQuiet; extern bool g_bVerbose; extern bool g_bRefine; extern bool g_bRefineW; extern bool g_bRefineX; extern bool g_bLow; extern bool g_bSW; extern bool g_bClusterOnly; extern bool g_bProfile; extern bool g_bProfDB; extern bool g_bPPScore; extern bool g_bBrenner; extern bool g_bDimer; extern bool g_bVersion; extern bool g_bStable; extern bool g_bFASTA; extern bool g_bPAS; extern bool g_bTomHydro; extern bool g_bMakeTree; extern PPSCORE g_PPScore; extern OBJSCORE g_ObjScore; extern DISTANCE g_Distance1; extern CLUSTER g_Cluster1; extern ROOT g_Root1; extern SEQWEIGHT g_SeqWeight1; extern DISTANCE g_Distance2; extern CLUSTER g_Cluster2; extern ROOT g_Root2; extern SEQWEIGHT g_SeqWeight2; extern unsigned g_uMaxIters; extern unsigned long g_ulMaxSecs; extern unsigned g_uMaxMB; extern SEQTYPE g_SeqType; extern TERMGAPS g_TermGaps; #endif // params_h profile.h0000664000175000017500000001063612360262614010767 0ustar bobbob#ifndef FastProf2_h #define FastProf2_h #include "msa.h" #include "pwpath.h" #include // for log function class DiagList; class WeightList; struct ProfPos { bool m_bAllGaps; unsigned m_uSortOrder[21]; FCOUNT m_fcCounts[20]; FCOUNT m_LL; FCOUNT m_LG; FCOUNT m_GL; FCOUNT m_GG; SCORE m_AAScores[20]; unsigned 
m_uResidueGroup; FCOUNT m_fOcc; FCOUNT m_fcStartOcc; FCOUNT m_fcEndOcc; SCORE m_scoreGapOpen; SCORE m_scoreGapClose; #if DOUBLE_AFFINE SCORE m_scoreGapOpen2; SCORE m_scoreGapClose2; #endif // SCORE m_scoreGapExtend; }; struct ProgNode { ProgNode() { m_Prof = 0; m_EstringL = 0; m_EstringR = 0; } MSA m_MSA; ProfPos *m_Prof; PWPath m_Path; int *m_EstringL; int *m_EstringR; unsigned m_uLength; WEIGHT m_Weight; }; extern unsigned ResidueGroup[]; const unsigned RESIDUE_GROUP_MULTIPLE = (unsigned) ~0; extern PTR_SCOREMATRIX g_ptrScoreMatrix; ProfPos *ProfileFromMSA(const MSA &a); SCORE TraceBack(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, PWPath &Path); SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path); void ProgressiveAlign(const SeqVect &v, const Tree &tree, MSA &a); SCORE MSAPairSP(const MSA &msa1, const MSA &msa2); void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB, MSA &msaCombined); void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA = 0); SCORE ScoreProfPos2(const ProfPos &PPA, const ProfPos &PPB); SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, const PWPath &Path); bool IsHydrophilic(const FCOUNT fcCounts[]); int PAM200_Letter(unsigned uLetter1, unsigned uLetter2); SCORE AverageMatchScore(const PWPath &Path, unsigned uEdgeIndex, unsigned uWindowLength); void WindowSmooth(const SCORE Score[], unsigned uCount, unsigned uWindowLength, SCORE SmoothScore[], double dCeil = 9e29); SCORE FastScoreMSA_LA(const MSA &msa, SCORE MatchScore[] = 0); SCORE FastScoreMSA_NS(const MSA &msa, SCORE MatchScore[] = 0); SCORE FastScoreMSA_SP(const MSA &msa, SCORE MatchScore[] = 0); bool RefineMSA(MSA &msa, const Tree &tree); SCORE MSAQScore(const MSA &msa, SCORE MatchScore[] = 0); bool RefineBiParts(MSA &msa, const Tree &tree, bool R); void 
FindAnchorCols(const MSA &msa, unsigned AnchorCols[], unsigned *ptruAnchorColCount); double PctIdToHeight(double dPctId); double PctIdToHeightKimura(double dPctId); double PctIdToHeightMAFFT(double dPctId); double PctIdToMAFFTDist(double dPctId); bool RefineBlocks(MSA &msa, const Tree &tree); bool RefineSubfams(MSA &msaIn, const Tree &tree, unsigned uIters); void SetMuscleTree(const Tree &tree); void CalcClustalWWeights(const Tree &tree, WEIGHT Weights[]); void RealignDiffs(const MSA &msaIn, const Tree &Diffs, const unsigned IdToDiffsTreeNodeIndex[], MSA &msaOut); void RealignDiffsE(const MSA &msaIn, const SeqVect &v, const Tree &NewTree, const Tree &OldTree, const unsigned uNewNodeIndexToOldNodeIndex[], MSA &msaOut, ProgNode *OldProgNodes); void RefineTree(MSA &msa, Tree &tree); void RefineTreeE(MSA &msa, const SeqVect &v, Tree &tree, ProgNode *ProgNodes); bool IsHydrophobic(const FCOUNT fcCounts[]); void Hydro(ProfPos *Prof, unsigned uLength); void SetTermGaps(const ProfPos *Prof, unsigned uLength); // Macros to simulate 2D matrices #define DPL(PLA, PLB) DPL_[(PLB)*uPrefixCountA + (PLA)] #define DPM(PLA, PLB) DPM_[(PLB)*uPrefixCountA + (PLA)] #define DPD(PLA, PLB) DPD_[(PLB)*uPrefixCountA + (PLA)] #define DPE(PLA, PLB) DPE_[(PLB)*uPrefixCountA + (PLA)] #define DPI(PLA, PLB) DPI_[(PLB)*uPrefixCountA + (PLA)] #define DPJ(PLA, PLB) DPJ_[(PLB)*uPrefixCountA + (PLA)] #define DPU(PLA, PLB) DPU_[(PLB)*uPrefixCountA + (PLA)] #define TBM(PLA, PLB) TBM_[(PLB)*uPrefixCountA + (PLA)] #define TBD(PLA, PLB) TBD_[(PLB)*uPrefixCountA + (PLA)] #define TBE(PLA, PLB) TBE_[(PLB)*uPrefixCountA + (PLA)] #define TBI(PLA, PLB) TBI_[(PLB)*uPrefixCountA + (PLA)] #define TBJ(PLA, PLB) TBJ_[(PLB)*uPrefixCountA + (PLA)] SCORE ScoreProfPos2LA(const ProfPos &PPA, const ProfPos &PPB); SCORE ScoreProfPos2NS(const ProfPos &PPA, const ProfPos &PPB); SCORE ScoreProfPos2SP(const ProfPos &PPA, const ProfPos &PPB); SCORE ScoreProfPos2SPN(const ProfPos &PPA, const ProfPos &PPB); #endif // FastProf_h 
pwpath.h0000664000175000017500000000457612360262614010640 0ustar bobbob#ifndef PWPath_h #define PWPath_h /***
Each PWEdge in a PWPath specifies a column in a pair-wise (PW) alignment.
"Path" is by analogy with the path through an HMM.

Edge types are:

    'M'    LetterA + LetterB
    'D'    LetterA + GapB
    'I'    GapA + LetterB

The mnemonic is Match, Delete, Insert (with respect to A).

Here is a global alignment of sequences A and B.

    A:    AMQT-F
    B:    -M-TIF

The path for this example is:

    Edge    cType    uPrefixLengthA    uPrefixLengthB
    0       D        1                 0
    1       M        2                 1
    2       D        3                 1
    3       M        4                 2
    4       I        4                 3
    5       M        5                 4

Given the starting positions in each alignment (e.g., column zero for
a global alignment), the prefix length fields are redundant; they are
included only for convenience and as a sanity check; we are not trying
to optimize for speed or space here. We use prefix lengths rather than
column indexes because of the problem of representing the special case
of a gap in the first position.
***/ class Seq; class MSA; class SatchmoParams; class PW; class TextFile; class PWScore; class PWEdge { public: char cType; unsigned uPrefixLengthA; unsigned uPrefixLengthB; bool Equal(const PWEdge &e) const { return uPrefixLengthA == e.uPrefixLengthA && uPrefixLengthB == e.uPrefixLengthB && cType == e.cType; } }; class PWPath { // Disable compiler defaults private: PWPath &operator=(const PWPath &rhs); PWPath(const PWPath &rhs); public: PWPath(); virtual ~PWPath(); public: void Clear(); void FromStr(const char Str[]); void Copy(const PWPath &Path); void AppendEdge(const PWEdge &Edge); void AppendEdge(char cType, unsigned uPrefixLengthA, unsigned uPrefixLengthB); void PrependEdge(const PWEdge &Edge); unsigned GetEdgeCount() const { return m_uEdgeCount; } const PWEdge &GetEdge(unsigned uEdgeIndex) const; void Validate(const PWScore &PWS) const; void Validate() const; void LogMe() const; void FromFile(TextFile &File); void ToFile(TextFile &File) const; void FromMSAPair(const MSA &msaA, const MSA &msaB); void AssertEqual(const PWPath &Path) 
const; bool Equal(const PWPath &Path) const; unsigned GetMatchCount() const; unsigned GetDeleteCount() const; unsigned GetInsertCount() const; private: void ExpandPath(unsigned uAdditionalEdgeCount); private: unsigned m_uEdgeCount; unsigned m_uArraySize; PWEdge *m_Edges; }; #endif // PWPath_h scorehistory.h0000664000175000017500000000077412360262613012065 0ustar bobbob#ifndef ScoreHistory_h #define ScoreHistory_h class ScoreHistory { public: ScoreHistory(unsigned uIters, unsigned uInternalNodeCount); ~ScoreHistory(); bool SetScore(unsigned uIter, unsigned uInternalNodeIndex, bool bRight, SCORE Score); void LogMe() const; SCORE GetScore(unsigned uIter, unsigned uInternalNodeIndex, bool bReversed, bool bRight) const; private: SCORE **m_Score; bool **m_bScoreSet; unsigned m_uIters; unsigned m_uNodeCount; }; #endif // ScoreHistory_h seq.h0000664000175000017500000000351412360262614010114 0ustar bobbob#ifndef Seq_h #define Seq_h #include class TextFile; class MSA; typedef std::vector CharVect; class Seq : public CharVect { public: Seq() { m_ptrName = 0; // Start with moderate size to avoid // thrashing the heap. reserve(200); } virtual ~Seq() { delete[] m_ptrName; } private: // Not implemented; prevent use of copy c'tor and assignment. 
Seq(const Seq &); Seq &operator=(const Seq &); public: void Clear() { clear(); delete[] m_ptrName; m_ptrName = 0; m_uId = uInsane; } const char *GetName() const { return m_ptrName; } unsigned GetId() const { if (uInsane == m_uId) Quit("Seq::GetId, id not set"); return m_uId; } void SetId(unsigned uId) { m_uId = uId; } bool FromFASTAFile(TextFile &File); void ToFASTAFile(TextFile &File) const; void ExtractUngapped(MSA &msa) const; void FromString(const char *pstrSeq, const char *pstrName); void Copy(const Seq &rhs); void CopyReversed(const Seq &rhs); void StripGaps(); void StripGapsAndWhitespace(); void ToUpper(); void SetName(const char *ptrName); unsigned GetLetter(unsigned uIndex) const; unsigned Length() const { return (unsigned) size(); } bool Eq(const Seq &s) const; bool EqIgnoreCase(const Seq &s) const; bool EqIgnoreCaseAndGaps(const Seq &s) const; bool HasGap() const; unsigned GetUngappedLength() const; void LogMe() const; char GetChar(unsigned uIndex) const { return operator[](uIndex); } void SetChar(unsigned uIndex, char c) { operator[](uIndex) = c; } void AppendChar(char c) { push_back(c); } void FixAlpha(); #ifndef _WIN32 reference at(size_type i) { return operator[](i); } const_reference at(size_type i) const { return operator[](i); } #endif private: char *m_ptrName; unsigned m_uId; }; #endif // Seq.h seqvect.h0000664000175000017500000000301612360262613010772 0ustar bobbob#ifndef SeqVect_h #define SeqVect_h #include #include "seq.h" typedef std::vector SeqVectBase; class SeqVect : public SeqVectBase { public: SeqVect() {} virtual ~SeqVect(); private: // Not implemented; prevent use of copy c'tor and assignment. 
SeqVect(const SeqVect &); SeqVect &operator=(const SeqVect &); public: void FromFile(TextFile &File) { FromFASTAFile(File); } void FromFASTAFile(TextFile &File); void ToFASTAFile(TextFile &File) const; void ToFile(TextFile &File) const { ToFASTAFile(File); } void PadToMSA(MSA &msa); void Copy(const SeqVect &rhs); void StripGaps(); void StripGapsAndWhitespace(); void ToUpper(); void Clear(); unsigned Length() const { return (unsigned) size(); } unsigned GetSeqCount() const { return (unsigned) size(); } void AppendSeq(const Seq &s); bool FindName(const char *ptrName, unsigned *ptruIndex) const; void LogMe() const; const char *GetSeqName(unsigned uSeqIndex) const; unsigned GetSeqId(unsigned uSeqIndex) const; unsigned GetSeqIdFromName(const char *Name) const; unsigned GetSeqLength(unsigned uSeqIndex) const; void SetSeqId(unsigned uSeqIndex, unsigned uId); Seq &GetSeq(unsigned uIndex); Seq &GetSeqById(unsigned uId); const Seq &GetSeq(unsigned uIndex) const; ALPHA GuessAlpha() const; void FixAlpha(); #ifndef _WIN32 reference at(size_type i) { return operator[](i); } const_reference at(size_type i) const { return operator[](i); } #endif }; #endif // SeqVect_h svnversion.h0000664000175000017500000000000712470507645011542 0ustar bobbob"1551" textfile.h0000664000175000017500000000306212360262614011146 0ustar bobbob#ifndef TextFile_h #define TextFile_h #include struct TEXTFILEPOS { unsigned uOffset; unsigned uLineNr; unsigned uColNr; }; const unsigned TextFileBufferSize = 256; class TextFile { private: // no default c'tor, not implemented TextFile(); public: virtual ~TextFile(); TextFile(const char szFileName[], bool bWrite = false); TextFile(FILE *ptrFile, const char *ptrFileName = "-"); void Close() { fclose(m_ptrFile); m_ptrFile = 0; } bool GetLine(char szLine[], unsigned uBytes); bool GetTrimLine(char szLine[], unsigned uBytes); void GetLineX(char szLine[], unsigned uBytes); bool GetToken(char szToken[], unsigned uBytes, const char szCharTokens[] = "{}"); void 
GetTokenX(char szToken[], unsigned uBytes, const char szCharTokens[] = "{}"); void Skip(); void SkipLine(); void SkipWhite(); bool SkipWhiteX(); void Rewind(); TEXTFILEPOS GetPos(); void SetPos(TEXTFILEPOS Pos); bool GetChar(char &c); void GetCharX(char &c); void GetNonblankChar(char &c); unsigned GetLineNr() { return m_uLineNr; } void PutString(const char szLine[]); void PutFormat(const char szFormat[], ...); void PutChar(char c); const char *GetFileName() { return m_ptrName; } void PushBack(int c) { m_cPushedBack = c; } FILE *GetStdioFile() const { return m_ptrFile; } private: void Init(FILE *ptrFile, const char *ptrFileName); private: FILE *m_ptrFile; unsigned m_uLineNr; unsigned m_uColNr; char *m_ptrName; bool m_bLastCharWasEOL; int m_cPushedBack; }; #endif // TextFile_h timing.h0000664000175000017500000000055412360262614010614 0ustar bobbob#if WIN32 typedef unsigned __int64 TICKS; #pragma warning(disable:4035) inline TICKS GetClockTicks() { _asm { _emit 0x0f _emit 0x31 } } #define StartTimer() __int64 t1__ = GetClockTicks() #define GetElapsedTicks() (GetClockTicks() - t1__) static double TicksToSecs(TICKS t) { return (__int64) t/2.5e9; } #endif // WIN32 tree.h0000664000175000017500000002253112360262614010263 0ustar bobbob#ifndef tree_h #define tree_h #include class Clust; const unsigned NULL_NEIGHBOR = UINT_MAX; enum NEWICK_TOKEN_TYPE { NTT_Unknown, // Returned from Tree::GetToken: NTT_Lparen, NTT_Rparen, NTT_Colon, NTT_Comma, NTT_Semicolon, NTT_String, // Following are never returned from Tree::GetToken: NTT_SingleQuotedString, NTT_DoubleQuotedString, NTT_Comment }; class Tree { public: Tree() { m_uNodeCount = 0; m_uCacheCount = 0; m_uNeighbor1 = 0; m_uNeighbor2 = 0; m_uNeighbor3 = 0; m_dEdgeLength1 = 0; m_dEdgeLength2 = 0; m_dEdgeLength3 = 0; m_dHeight = 0; m_bHasEdgeLength1 = 0; m_bHasEdgeLength2 = 0; m_bHasEdgeLength3 = 0; m_bHasHeight = 0; m_ptrName = 0; m_Ids = 0; } virtual ~Tree() { Clear(); } void Clear() { for (unsigned n = 0; n < m_uNodeCount; ++n) 
free(m_ptrName[n]); m_uNodeCount = 0; m_uCacheCount = 0; delete[] m_uNeighbor1; delete[] m_uNeighbor2; delete[] m_uNeighbor3; delete[] m_dEdgeLength1; delete[] m_dEdgeLength2; delete[] m_dEdgeLength3; delete[] m_bHasEdgeLength1; delete[] m_bHasEdgeLength2; delete[] m_bHasEdgeLength3; delete[] m_ptrName; delete[] m_Ids; delete[] m_bHasHeight; delete[] m_dHeight; m_uNeighbor1 = 0; m_uNeighbor2 = 0; m_uNeighbor3 = 0; m_dEdgeLength1 = 0; m_dEdgeLength2 = 0; m_dEdgeLength3 = 0; m_ptrName = 0; m_Ids = 0; m_uRootNodeIndex = 0; m_bHasHeight = 0; m_dHeight = 0; m_bRooted = false; } // Creation and manipulation void CreateRooted(); void CreateUnrooted(double dEdgeLength); void FromFile(TextFile &File); void FromClust(Clust &C); void Copy(const Tree &tree); void Create(unsigned uLeafCount, unsigned uRoot, const unsigned Left[], const unsigned Right[], const float LeftLength[], const float RightLength[], const unsigned LeafIds[], char *LeafNames[]); unsigned AppendBranch(unsigned uExistingNodeIndex); void SetLeafName(unsigned uNodeIndex, const char *ptrName); void SetLeafId(unsigned uNodeIndex, unsigned uId); void SetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2, double dLength); void RootUnrootedTree(unsigned uNodeIndex1, unsigned uNodeIndex2); void RootUnrootedTree(ROOT Method); void UnrootByDeletingRoot(); // Saving to file void ToFile(TextFile &File) const; // Accessor functions unsigned GetNodeCount() const { return m_uNodeCount; } unsigned GetLeafCount() const { if (m_bRooted) { assert(m_uNodeCount%2 == 1); return (m_uNodeCount + 1)/2; } else { assert(m_uNodeCount%2 == 0); return (m_uNodeCount + 2)/2; } } unsigned GetNeighbor(unsigned uNodeIndex, unsigned uNeighborSubscript) const; unsigned GetNeighbor1(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); return m_uNeighbor1[uNodeIndex]; } unsigned GetNeighbor2(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); return m_uNeighbor2[uNodeIndex]; } unsigned GetNeighbor3(unsigned uNodeIndex) 
const { assert(uNodeIndex < m_uNodeCount); return m_uNeighbor3[uNodeIndex]; } unsigned GetParent(unsigned uNodeIndex) const { assert(m_bRooted && uNodeIndex < m_uNodeCount); return m_uNeighbor1[uNodeIndex]; } bool IsRooted() const { return m_bRooted; } unsigned GetLeft(unsigned uNodeIndex) const { assert(m_bRooted && uNodeIndex < m_uNodeCount); return m_uNeighbor2[uNodeIndex]; } unsigned GetRight(unsigned uNodeIndex) const { assert(m_bRooted && uNodeIndex < m_uNodeCount); return m_uNeighbor3[uNodeIndex]; } const char *GetName(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); return m_ptrName[uNodeIndex]; } unsigned GetRootNodeIndex() const { assert(m_bRooted); return m_uRootNodeIndex; } unsigned GetNeighborCount(unsigned uNodeIndex) const { const unsigned n1 = m_uNeighbor1[uNodeIndex]; const unsigned n2 = m_uNeighbor2[uNodeIndex]; const unsigned n3 = m_uNeighbor3[uNodeIndex]; return (NULL_NEIGHBOR != n1) + (NULL_NEIGHBOR != n2) + (NULL_NEIGHBOR != n3); } bool IsLeaf(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); if (1 == m_uNodeCount) return true; return 1 == GetNeighborCount(uNodeIndex); } bool IsRoot(unsigned uNodeIndex) const { return IsRooted() && m_uRootNodeIndex == uNodeIndex; } unsigned GetLeafId(unsigned uNodeIndex) const; unsigned GetLeafNodeIndex(const char *ptrName) const; bool IsEdge(unsigned uNodeIndex1, unsigned uNodeIndex2) const; bool HasEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const; double GetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const; const char *GetLeafName(unsigned uNodeIndex) const; unsigned GetNeighborSubscript(unsigned uNodeIndex, unsigned uNeighborIndex) const; double GetNodeHeight(unsigned uNodeIndex) const; // Depth-first traversal unsigned FirstDepthFirstNode() const; unsigned NextDepthFirstNode(unsigned uNodeIndex) const; unsigned FirstDepthFirstNodeR() const; unsigned NextDepthFirstNodeR(unsigned uNodeIndex) const; // Equivalent of GetLeft/Right in unrooted tree, works in 
rooted tree too. unsigned GetFirstNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const; unsigned GetSecondNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const; // Getting parent node in unrooted tree defined iff leaf unsigned GetLeafParent(unsigned uNodeIndex) const; // Misc const char *NTTStr(NEWICK_TOKEN_TYPE NTT) const; void FindCenterByLongestSpan(unsigned *ptrNodeIndex1, unsigned *ptrNodeIndex2) const; void PruneTree(const Tree &tree, unsigned Subfams[], unsigned uSubfamCount); unsigned LeafIndexToNodeIndex(unsigned uLeafIndex) const; // Debugging & trouble-shooting support void Validate() const; void ValidateNode(unsigned uNodeIndex) const; void AssertAreNeighbors(unsigned uNodeIndex1, unsigned uNodeIndex2) const; void LogMe() const; private: unsigned UnrootFromFile(); NEWICK_TOKEN_TYPE GetTokenVerbose(TextFile &File, char szToken[], unsigned uBytes) const { NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, uBytes); Log("GetToken %10.10s %s\n", NTTStr(NTT), szToken); return NTT; } void InitCache(unsigned uCacheCount); void ExpandCache(); NEWICK_TOKEN_TYPE GetToken(TextFile &File, char szToken[], unsigned uBytes) const; bool GetGroupFromFile(TextFile &File, unsigned uNodeIndex, double *ptrdEdgeLength); unsigned GetLeafCountUnrooted(unsigned uNodeIndex1, unsigned uNodeIndex2, double *ptrdTotalDistance) const; void ToFileNodeRooted(TextFile &File, unsigned uNodeIndex) const; void ToFileNodeUnrooted(TextFile &File, unsigned uNodeIndex, unsigned uParent) const; void OrientParent(unsigned uNodeIndex, unsigned uParentNodeIndex); double FromClustNode(const Clust &C, unsigned uClustNodeIndex, unsigned uPhyNodeIndex); unsigned GetAnyNonLeafNode() const; // Yuck. Data is made public for the convenience of Tree::Copy. // There has to be a better way. 
public: unsigned m_uNodeCount; unsigned m_uCacheCount; unsigned *m_uNeighbor1; unsigned *m_uNeighbor2; unsigned *m_uNeighbor3; double *m_dEdgeLength1; double *m_dEdgeLength2; double *m_dEdgeLength3; double *m_dHeight; bool *m_bHasEdgeLength1; bool *m_bHasEdgeLength2; bool *m_bHasEdgeLength3; bool *m_bHasHeight; unsigned *m_Ids; char **m_ptrName; bool m_bRooted; unsigned m_uRootNodeIndex; }; struct PhyEnumEdgeState { PhyEnumEdgeState() { m_bInit = false; m_uNodeIndex1 = NULL_NEIGHBOR; m_uNodeIndex2 = NULL_NEIGHBOR; } bool m_bInit; unsigned m_uNodeIndex1; unsigned m_uNodeIndex2; }; const unsigned NODE_CHANGED = (unsigned) (~0); extern bool PhyEnumBiParts(const Tree &tree, PhyEnumEdgeState &ES, unsigned Leaves1[], unsigned *ptruCount1, unsigned Leaves2[], unsigned *ptruCount2); extern bool PhyEnumBiPartsR(const Tree &tree, PhyEnumEdgeState &ES, unsigned Leaves1[], unsigned *ptruCount1, unsigned Leaves2[], unsigned *ptruCount2); extern void ClusterByHeight(const Tree &tree, double dMaxHeight, unsigned Subtrees[], unsigned *ptruSubtreeCount); void ClusterBySubfamCount(const Tree &tree, unsigned uSubfamCount, unsigned Subfams[], unsigned *ptruSubfamCount); void GetLeaves(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[], unsigned *ptruLeafCount); void GetLeavesExcluding(const Tree &tree, unsigned uNodeIndex, unsigned uExclude, unsigned Leaves[], unsigned *ptruCount); void GetInternalNodesInHeightOrder(const Tree &tree, unsigned NodeIndexes[]); void ApplyMinEdgeLength(Tree &tree, double dMinEdgeLength); void LeafIndexesToLeafNames(const Tree &tree, const unsigned Leaves[], unsigned uCount, char *Names[]); void LeafIndexesToIds(const Tree &tree, const unsigned Leaves[], unsigned uCount, unsigned Ids[]); void MSASeqSubset(const MSA &msaIn, char *Names[], unsigned uSeqCount, MSA &msaOut); void DiffTrees(const Tree &Tree1, const Tree &Tree2, Tree &Diffs, unsigned IdToDiffsLeafNodeIndex[]); void DiffTreesE(const Tree &NewTree, const Tree &OldTree, unsigned 
NewNodeIndexToOldNodeIndex[]); void FindRoot(const Tree &tree, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2, ROOT RootMethod); void FixRoot(Tree &tree, ROOT RootMethod); #endif // tree_h types.h0000664000175000017500000000417412360262613010472 0ustar bobbob#ifndef types_h #define types_h typedef unsigned char byte; // typedef unsigned int ushort; typedef float SCOREMATRIX[32][32]; typedef SCOREMATRIX *PTR_SCOREMATRIX; class MSA; class Seq; class ClusterTree; class DistFunc; class TextFile; class PWPath; class Tree; class SeqVect; class DistCalc; struct ProgNode; struct ProfPos; #if SINGLE_AFFINE // Compress M, D and I trace-back matrices into 4 bits enum { BIT_MM = 0x00, BIT_DM = 0x01, BIT_IM = 0x02, BIT_xM = 0x03, BIT_DD = 0x00, BIT_MD = 0x04, // ID not allowed BIT_xD = 0x04, BIT_II = 0x00, BIT_MI = 0x08, // DI not allowed BIT_xI = 0x08, }; #endif #if DOUBLE_AFFINE // Compress M, D, E, I and J trace-back matrices into 7 bits enum { BIT_MM = 0x00, BIT_DM = 0x01, BIT_EM = 0x02, BIT_IM = 0x03, BIT_JM = 0x04, BIT_xM = 0x07, BIT_DD = 0x00, BIT_MD = 0x08, // [EIJ]D not sallowed BIT_xD = 0x08, BIT_EE = 0x00, BIT_ME = 0x10, // [DDJ]E not allowed BIT_xE = 0x10, BIT_II = 0x00, BIT_MI = 0x20, // [EDJ]I not allowed BIT_xI = 0x20, BIT_JJ = 0x00, BIT_MJ = 0x40, // [EDI]J not allowed BIT_xJ = 0x40, }; #endif enum EXIT { EXIT_Success = 0, EXIT_NotStarted = 1, EXIT_FatalError = 2, EXIT_Except = 3, }; enum NODECMP { NODECMP_Undefined = 0, NODECMP_Same = 0, // equivalent to node in old tree NODECMP_Diff = 1, // equivalent & parent is changed NODECMP_Changed = 2 // no equivalent node in old tree }; // Declare enums using macro hacks (see enums.h). #define s(t) enum t { t##_Undefined = 0, #define c(t, x) t##_##x, #define e(t) }; #include "enums.h" // Declare conversion function XXXToStr(XXX x) // for each enum type XXX. 
#define s(t) const char *t##ToStr(t x); #define c(t, x) /* empty */ #define e(t) /* empty */ #include "enums.h" // Declare conversion function StrToXXX(const char *Str) // for each enum type XXX. #define s(t) t StrTo##t(const char *Str); #define c(t, x) /* empty */ #define e(t) /* empty */ #include "enums.h" const char *BoolToStr(bool b); const char *SecsToStr(unsigned long Secs); #endif // types_h unixio.h0000664000175000017500000000026412360262614010636 0ustar bobbob#ifdef WIN32 #include #include #else #include #include #endif #if !defined(WIN32) && !defined(O_BINARY) #define O_BINARY 0 #endif Makefile0000775000175000017500000000162112360262614010613 0ustar bobbob# Porting notes: # For Solaris and other platforms where the logf function # is missing from the math library, add the following line # to the end of muscle.h: # #define logf(x) ((float) log(x)) # Using -static increases the executable size and thus gives a very # small increase in start time, but is more portable (the binding # to dynamic libraries often breaks when a new library is released). # On OSX, using -static gives the error "ld: can't locate file for: -lcrt0.o", # this is fixed by deleting "-static" from the LDLIBS line. CFLAGS = -O3 -funroll-loops -Winline -DNDEBUG=1 LDLIBS = -lm -static # LDLIBS = -lm OBJ = .o EXE = RM = rm -f CP = cp GPP = g++ LD = $(GPP) $(CFLAGS) CPP = $(GPP) -c $(CFLAGS) all: muscle CPPSRC = $(sort $(wildcard *.cpp)) CPPOBJ = $(subst .cpp,.o,$(CPPSRC)) $(CPPOBJ): %.o: %.cpp $(CPP) $< -o $@ muscle: $(CPPOBJ) $(LD) -o muscle $(CPPOBJ) $(LDLIBS) strip muscle