annot.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000232211447530632014706 0ustar ????????????????#include "piler2.h" void Annot() { const char *InputFileName = RequiredValueOpt("annot"); const char *RepeatFileName = RequiredValueOpt("rep"); const char *OutputFileName = RequiredValueOpt("out"); ProgressStart("Reading repeat file"); int RepCount; RepData *Reps = ReadReps(RepeatFileName, &RepCount); ProgressDone(); Progress("%d records", RepCount); FILE *fInput = OpenStdioFile(InputFileName); FILE *fOutput = OpenStdioFile(OutputFileName, FILEIO_MODE_WriteOnly); ProgressStart("Transferring annotation"); GFFRecord Rec; while (GetNextGFFRecord(fInput, Rec)) { const bool Rev = Rec.Strand == '-'; const char *Annot = MakeAnnot(Rec.SeqName, Rec.Start-1, Rec.End-1, Rev, Reps, RepCount); fprintf(fOutput, "%s\t%s\t%s\t%d\t%d\t%.3g\t%c", // 0 1 2 3 4 5 6 Rec.SeqName, // 0 Rec.Source, // 1 Rec.Feature, // 2 Rec.Start, // 3 Rec.End, // 4 Rec.Score, // 5 Rec.Strand); // 6 if (-1 == Rec.Frame) fprintf(fOutput, "\t."); else fprintf(fOutput, "\t%d", Rec.Frame); fprintf(fOutput, "\t%s ; Annot \"%s\"\n", Rec.Attrs, Annot); } fclose(fInput); fclose(fOutput); ProgressDone(); } annotedge.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000234011447530632015533 0ustar ????????????????#include "piler2.h" void AnnotEdge() { const char *InputFileName = RequiredValueOpt("annotedge"); const char *RepeatFileName = RequiredValueOpt("rep"); const char *OutputFileName = RequiredValueOpt("out"); ProgressStart("Reading repeat file"); int RepCount; RepData *Reps = ReadReps(RepeatFileName, &RepCount); ProgressDone(); Progress("%d records", RepCount); FILE *fInput = OpenStdioFile(InputFileName); FILE *fOutput = OpenStdioFile(OutputFileName, FILEIO_MODE_WriteOnly); ProgressStart("Transferring annotation"); GFFRecord Rec; while (GetNextGFFRecord(fInput, Rec)) { const bool Rev = (Rec.Strand == '-'); const char *Annot = MakeAnnotEdge(Rec.SeqName, Rec.Start-1, Rec.End-1, Rev, Reps, RepCount); fprintf(fOutput, "%s\t%s\t%s\t%d\t%d\t%.3g\t%c", // 0 1 2 3 4 5 6 Rec.SeqName, // 0 Rec.Source, // 1 Rec.Feature, // 2 Rec.Start, // 3 Rec.End, // 4 Rec.Score, // 5 Rec.Strand); // 6 if (-1 == Rec.Frame) fprintf(fOutput, "\t."); else fprintf(fOutput, "\t%d", Rec.Frame); fprintf(fOutput, "\t%s ; Annot \"%s\"\n", Rec.Attrs, Annot); } fclose(fInput); fclose(fOutput); ProgressDone(); } cons.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000540511447530632014536 0ustar ????????????????#include "piler2.h" static const char Letter[5] = { 'A', 'C', 'G', 'T', '-'}; static void GetCounts(const char *Seqs, int ColCount, int SeqCount, int ColIndex, int Counts[5]) { memset(Counts, 0, 5*sizeof(unsigned)); for (int SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { char c = Seqs[SeqIndex*ColCount + ColIndex]; c = toupper(c); switch (c) { case 'a': case 'A': ++(Counts[0]); break; case 'c': case 'C': ++(Counts[1]); break; case 'g': case 'G': ++(Counts[2]); break; case 't': case 'T': ++(Counts[3]); break; case '-': ++(Counts[4]); break; } } } static bool Conserved(const char *Seqs, int ColCount, int SeqCount, int Col) { int Counts[5]; GetCounts(Seqs, ColCount, SeqCount, Col, Counts); if (Counts[4] > 0) return false; for (int i = 0; i < 4; ++i) if (Counts[i] != 0 && Counts[i] != SeqCount) return false; return true; } static void FindStartEndExact(const char *Seqs, int ColCount, int SeqCount, int *ptrStart, int *ptrEnd) { int BestLength = 0; int BestStart = 0; int BestEnd = 0; int Length = 0; int Start = 0; for (int Col = 0; Col <= ColCount; ++Col) { if (Col != ColCount && Conserved(Seqs, ColCount, SeqCount, Col)) { if (Start == -1) { Start = Col; Length = 1; } else ++Length; } else { if (Length > BestLength) { BestStart = Start; BestEnd = Col - 1; if (BestEnd - BestStart + 1 != Length) Quit("BestEnd"); } Length = -1; Start = -1; } } *ptrStart = BestStart; *ptrEnd = BestEnd; } void Cons() { const char *InputFileName = RequiredValueOpt("cons"); const char *OutputFileName = RequiredValueOpt("out"); const char *Label = RequiredValueOpt("label"); bool Exact = FlagOpt("exact"); ProgressStart("Reading alignment"); int ColCount; int SeqCount; const char *Seqs = ReadAFA(InputFileName, &ColCount, &SeqCount); ProgressDone(); Progress("%d seqs, length %d", SeqCount, ColCount); char *ConsSeq = all(char, ColCount+1); int ConsSeqLength = 0; int StartCol = 0; int EndCol = ColCount - 1; if (Exact) FindStartEndExact(Seqs, ColCount, SeqCount, &StartCol, &EndCol); for (int Col = StartCol; Col <= EndCol; ++Col) { int Counts[5]; GetCounts(Seqs, ColCount, SeqCount, Col, Counts); int MaxCount = 0; char MaxLetter = 'A'; for (int i = 0; i < 4; ++i) { if (Counts[i] > MaxCount) { MaxLetter = Letter[i]; MaxCount = Counts[i]; } } if (MaxLetter == '-') continue; ConsSeq[ConsSeqLength++] = MaxLetter; } FILE *f = OpenStdioFile(OutputFileName, FILEIO_MODE_WriteOnly); WriteFasta(f, ConsSeq, ConsSeqLength, Label); } contigs.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000001052011447530632015234 0ustar ????????????????#include "piler2.h" const int INSANE_LENGTH = 500000000; // Arbitrarily chosen prime number static ContigData *HashTable[HASH_TABLE_SIZE]; static ContigData **ContigMap; static int SeqLength; void SetSeqLength(int Length) { SeqLength = Length; } static ContigData *HashLookup(const char *Label) { unsigned h = Hash(Label); assert(h < HASH_TABLE_SIZE); ContigData *p = HashTable[h]; while (p != 0) { if (0 == strcmp(Label, p->Label)) return p; p = p->Next; } return 0; } void LogContigs() { Log(" Label Hash Pos Length\n"); Log("-------------------- ------ ---------- ----------\n"); for (int h = 0; h < HASH_TABLE_SIZE; ++h) { for (const ContigData *p = HashTable[h]; p != 0; p = p->Next) { const ContigData &Contig = *p; Log("%20.20s %6d %10d %10d", Contig.Label, h, Contig.From, Contig.Length); if (Hash(Contig.Label) != h) Log(" ** Hash=%d", Hash(Contig.Label)); Log("\n"); } } } static void AppendContig(const char *Label, int Pos) { ContigData *Contig = all(ContigData, 1); Contig->Label = strsave(Label); char *Space = strchr(Contig->Label, ' '); if (0 != Space) *Space = 0; unsigned h = Hash(Label); assert(h < HASH_TABLE_SIZE); ContigData *p = HashTable[h]; Contig->Length = Pos + 1; Contig->Next = p; HashTable[h] = Contig; } void AddContigPos(const char *Label, int Pos) { ContigData *Contig = HashLookup(Label); if (0 == Contig) AppendContig(Label, Pos); else { if (Pos > Contig->Length) Contig->Length = Pos; } } void AddContig(const char *Label, int GlobalPos, int Length) { ContigData *Contig = all(ContigData, 1); Contig->Label = strsave(Label); char *Space = strchr(Contig->Label, ' '); if (0 != Space) *Space = 0; unsigned h = Hash(Contig->Label); assert(h < HASH_TABLE_SIZE); ContigData *p = HashTable[h]; Contig->From = GlobalPos; Contig->Length = Length; Contig->Next = p; HashTable[h] = Contig; } int GlobalizeContigs() { int GlobalPos = 0; for (int h = 0; h < HASH_TABLE_SIZE; ++h) { for (ContigData *p = HashTable[h]; p != 0; p = p->Next) { ContigData &Contig = *p; const int Length = p->Length; if (Length < 1 || Length >= INSANE_LENGTH) Quit("GlobalizeContigs, insane length %d", Length); p->From = GlobalPos; GlobalPos += Length; // Pad up to start of next bin // (required for contig map) int BinRemainder = GlobalPos%CONTIG_MAP_BIN_SIZE; if (BinRemainder > 0) GlobalPos += CONTIG_MAP_BIN_SIZE - BinRemainder; if (GlobalPos%CONTIG_MAP_BIN_SIZE) Quit("Dumb mistake rounding contig"); } } SeqLength = GlobalPos; return SeqLength; } void MakeContigMap() { if (SeqLength%CONTIG_MAP_BIN_SIZE) Quit("MakeContigMap: expects rounded size"); const int BinCount = SeqLength/CONTIG_MAP_BIN_SIZE; ContigMap = all(ContigData *, BinCount); zero(ContigMap, ContigData *, BinCount); for (int h = 0; h < HASH_TABLE_SIZE; ++h) { for (ContigData *p = HashTable[h]; p != 0; p = p->Next) { ContigData &Contig = *p; int From = p->From; int To = From + p->Length - 1; if (From%CONTIG_MAP_BIN_SIZE) Quit("MakeContigMap: expected rounded contig from"); int BinFrom = From/CONTIG_MAP_BIN_SIZE; int BinTo = To/CONTIG_MAP_BIN_SIZE; for (int Bin = BinFrom; Bin <= BinTo; ++Bin) { if (ContigMap[Bin] != 0) Quit("MakeContigMap: overlap error"); ContigMap[Bin] = p; } } } } const char *GlobalToContig(int GlobalPos, int *ptrContigPos) { if (GlobalPos < 0 || GlobalPos >= SeqLength) Quit("GlobalToContig: invalid pos"); const int Bin = GlobalPos/CONTIG_MAP_BIN_SIZE; ContigData *Contig = ContigMap[Bin]; if (0 == Contig) Quit("GlobalToContig: doesn't map"); int ContigPos = GlobalPos - Contig->From; if (ContigPos < 0 || ContigPos >= Contig->Length) Quit("GlobalToContig: out of bounds"); *ptrContigPos = ContigPos; return Contig->Label; } int ContigToGlobal(int ContigPos, const char *Label) { const ContigData *Contig = HashLookup(Label); if (0 == Contig) Quit("ContigToGlobal: contig not found (%s)", Label); if (ContigPos < 0 || ContigPos >= Contig->Length) Quit("ContigToGlobal: out of bounds"); return Contig->From + ContigPos; } crisp.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000004014511447530632014714 0ustar ????????????????#include "piler2.h" #include "bitfuncs.h" #include #define TRACE 0 #define GFFRecord GFFRecord2 /*** Find CRISPR families. CRISPR candidate: <-----spacer-------> ------====--------------------====------------ genome Pile Pile ^-----------------------^ Hit Candidate if (a) hit length >= MIN_LENGTH_CRISPR and <= MAX_LENGTH_CRISPR, and (b) offset is >= MIN_LENGTH_SPACER and <= MAX_LENGTH_SPACER. ***/ static int MIN_CRISPR_LENGTH = 10; static int MAX_CRISPR_LENGTH = 200; static int MIN_SPACER_LENGTH = 10; static int MAX_SPACER_LENGTH = 200; static double MIN_CRISPR_RATIO = 0.8; static double MIN_SPACER_RATIO = 0.8; static int MIN_FAM_SIZE = 3; static int MAX_SPACE_DIFF = 20; static int g_paramMinFamSize = 3; static int g_paramMaxLengthDiffPct = 5; static bool g_paramSingleHitCoverage = true; static PileData *Piles; static int PileCount; static int EdgeCount; static int MaxImageCount = 0; static int SeqLength; static int SeqLengthChunks; static int GetHitLength(const HitData &Hit) { const int QueryHitLength = Hit.QueryTo - Hit.QueryFrom + 1; const int TargetHitLength = Hit.TargetTo - Hit.TargetFrom + 1; return (QueryHitLength + TargetHitLength)/2; } static int GetSpacerLength(const HitData &Hit) { if (Hit.QueryFrom > Hit.TargetTo) return Hit.QueryFrom - Hit.TargetTo; else return Hit.TargetFrom - Hit.QueryTo; } static bool IsCand(const HitData &Hit) { if (Hit.Rev) return false; int HitLength = GetHitLength(Hit); if (HitLength > MAX_CRISPR_LENGTH || HitLength < MIN_CRISPR_LENGTH) return false; int SpacerLength = GetSpacerLength(Hit); if (SpacerLength > MAX_SPACER_LENGTH || SpacerLength < MIN_SPACER_LENGTH) return false; return true; } static PILE_INDEX_TYPE *IdentifyPiles(int *CopyCount) { PILE_INDEX_TYPE *PileIndexes = all(PILE_INDEX_TYPE, SeqLengthChunks); #if DEBUG memset(PileIndexes, 0xff, SeqLengthChunks*sizeof(PILE_INDEX_TYPE)); #endif int PileIndex = -1; bool InPile = false; for (int i = 0; i < SeqLengthChunks; ++i) { if (BitIsSet(CopyCount, i)) { if (!InPile) { ++PileIndex; if (PileIndex > MAX_STACK_INDEX) Quit("Too many stacks"); InPile = true; } PileIndexes[i] = PileIndex; } else InPile = false; } PileCount = PileIndex + 1; return PileIndexes; } static void IncCopyCountImage(int *CopyCount, int From, int To) { if (From < 0) Quit("From < 0"); From /= CHUNK_LENGTH; To /= CHUNK_LENGTH; if (From >= SeqLengthChunks) { Warning("IncCopyCountImage: From=%d, SeqLength=%d", From, SeqLengthChunks); From = SeqLengthChunks - 1; } if (To >= SeqLengthChunks) { Warning("IncCopyCountImage: To=%d, SeqLength=%d", To, SeqLengthChunks); To = SeqLengthChunks - 1; } if (From > To) Quit("From > To"); for (int i = From; i <= To; ++i) SetBit(CopyCount, i); } static void IncCopyCount(int *CopyCount, const HitData &Hit) { IncCopyCountImage(CopyCount, Hit.TargetFrom, Hit.TargetTo); IncCopyCountImage(CopyCount, Hit.QueryFrom, Hit.QueryTo); } static int CmpHits(const void *ptrHit1, const void *ptrHit2) { HitData *Hit1 = (HitData *) ptrHit1; HitData *Hit2 = (HitData *) ptrHit2; return Hit1->QueryFrom - Hit2->QueryFrom; } static int CmpImages(const void *ptrImage1, const void *ptrImage2) { PileImageData *Image1 = (PileImageData *) ptrImage1; PileImageData *Image2 = (PileImageData *) ptrImage2; return Image1->SIPile - Image2->SIPile; } static void AssertImagesSorted(PileImageData *Images, int ImageCount) { for (int i = 0; i < ImageCount - 1; ++i) if (Images[i].SIPile > Images[i+1].SIPile) Quit("Images not sorted"); } static void SortImagesPile(PileImageData *Images, int ImageCount) { qsort(Images, ImageCount, sizeof(PileImageData), CmpImages); } static void SortImages() { for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex) { PileData &Pile = Piles[PileIndex]; SortImagesPile(Pile.Images, Pile.ImageCount); #if DEBUG AssertImagesSorted(Pile.Images, Pile.ImageCount); #endif } } static bool FamMemberLt(const FamMemberData &Mem1, const FamMemberData &Mem2) { const PileData &Pile1 = Piles[Mem1.PileIndex]; const PileData &Pile2 = Piles[Mem2.PileIndex]; return Pile1.From < Pile2.From; } static void CreatePiles(const HitData *Hits, int HitCount, PILE_INDEX_TYPE *PileIndexes) { Piles = all(PileData, PileCount); zero(Piles, PileData, PileCount); for (int i = 0; i < PileCount; ++i) { Piles[i].FamIndex = -1; Piles[i].SuperFamIndex = -1; Piles[i].Rev = -1; } // Count images in stack ProgressStart("Create piles: count images"); for (int HitIndex = 0; HitIndex < HitCount; ++HitIndex) { const HitData &Hit = Hits[HitIndex]; int Pos = Hit.QueryFrom/CHUNK_LENGTH; PILE_INDEX_TYPE PileIndex = PileIndexes[Pos]; assert(PileIndex == PileIndexes[Hit.QueryTo/CHUNK_LENGTH]); assert(PileIndex >= 0 && PileIndex < PileCount); ++(Piles[PileIndex].ImageCount); Pos = Hit.TargetFrom/CHUNK_LENGTH; PileIndex = PileIndexes[Pos]; assert(PileIndex >= 0 && PileIndex < PileCount); assert(PileIndex == PileIndexes[Hit.TargetTo/CHUNK_LENGTH]); ++(Piles[PileIndex].ImageCount); } ProgressDone(); // Allocate memory for image list int TotalImageCount = 0; ProgressStart("Create piles: allocate image memory"); for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex) { PileData &Pile = Piles[PileIndex]; const int ImageCount = Pile.ImageCount; TotalImageCount += ImageCount; assert(ImageCount > 0); Pile.Images = all(PileImageData, ImageCount); } ProgressDone(); // Build image list ProgressStart("Create piles: build image list"); for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex) { PileData &Pile = Piles[PileIndex]; Pile.ImageCount = 0; Pile.From = -1; Pile.To = -1; } for (int HitIndex = 0; HitIndex < HitCount; ++HitIndex) { const HitData &Hit = Hits[HitIndex]; const bool Rev = Hit.Rev; const int Length1 = Hit.QueryTo - Hit.QueryFrom; const int Length2 = Hit.TargetTo - Hit.TargetFrom; const int From1 = Hit.QueryFrom; const int From2 = Hit.TargetFrom; const int To1 = Hit.QueryTo; const int To2 = Hit.TargetTo; const int Pos1 = From1/CHUNK_LENGTH; const int Pos2 = From2/CHUNK_LENGTH; PILE_INDEX_TYPE PileIndex1 = PileIndexes[Pos1]; PILE_INDEX_TYPE PileIndex2 = PileIndexes[Pos2]; assert(PileIndex1 == PileIndexes[(From1 + Length1 - 1)/CHUNK_LENGTH]); assert(PileIndex1 >= 0 && PileIndex1 < PileCount); assert(PileIndex2 == PileIndexes[(From2 + Length2 - 1)/CHUNK_LENGTH]); assert(PileIndex2 >= 0 && PileIndex2 < PileCount); PileData &Pile1 = Piles[PileIndex1]; PileImageData &Image1 = Pile1.Images[Pile1.ImageCount++]; Image1.SILength = Length2; Image1.SIPile = PileIndex2; Image1.SIRev = Rev; PileData &Pile2 = Piles[PileIndex2]; PileImageData &Image2 = Pile2.Images[Pile2.ImageCount++]; Image2.SILength = Length1; Image2.SIPile = PileIndex1; Image2.SIRev = Rev; if (Pile1.From == -1 || From1 < Pile1.From) Pile1.From = From1; if (Pile1.To == -1 || To1 > Pile1.To) Pile1.To = To1; if (Pile2.From == -1 || From2 < Pile2.From) Pile2.From = From2; if (Pile2.To == -1 || To2 > Pile2.To) Pile2.To = To2; if (Pile1.ImageCount > MaxImageCount) MaxImageCount = Pile1.ImageCount; if (Pile2.ImageCount > MaxImageCount) MaxImageCount = Pile2.ImageCount; } ProgressDone(); } static int PileDist(const PileData &Pile1, const PileData &Pile2) { if (Pile1.From > Pile2.To) return Pile1.From - Pile2.To; else return Pile2.From - Pile1.From; } static int FindEdgesPile(PileData &Pile, int PileIndex, PILE_INDEX_TYPE Partners[], bool PartnersRev[]) { const int PileLength = Pile.To - Pile.From + 1; if (PileLength < MIN_CRISPR_LENGTH || PileLength > MAX_CRISPR_LENGTH) return 0; const int ImageCount = Pile.ImageCount; int PartnerCount = 0; for (int ImageIndex = 0; ImageIndex < ImageCount; ++ImageIndex) { const PileImageData &Image = Pile.Images[ImageIndex]; const int PartnerImageLength = Image.SILength; const int PartnerPileIndex = Image.SIPile; const PileData &PartnerPile = Piles[PartnerPileIndex]; const int PartnerPileLength = PartnerPile.To - PartnerPile.From + 1; const int Dist = PileDist(Pile, PartnerPile); if (PartnerPileLength >= MIN_CRISPR_LENGTH && PartnerPileLength <= MAX_CRISPR_LENGTH && Dist >= MIN_SPACER_LENGTH && Dist <= MAX_SPACER_LENGTH) { PartnersRev[PartnerCount] = Image.SIRev; Partners[PartnerCount] = PartnerPileIndex; ++PartnerCount; } } return PartnerCount; } static void AddEdges(EdgeList &Edges, PILE_INDEX_TYPE PileIndex, PILE_INDEX_TYPE Partners[], bool PartnersRev[], int PartnerCount) { EdgeCount += PartnerCount; for (int i = 0; i < PartnerCount; ++i) { int PileIndex2 = Partners[i]; EdgeData Edge; Edge.Node1 = PileIndex; Edge.Node2 = PileIndex2; Edge.Rev = PartnersRev[i]; Edges.push_back(Edge); } } static void FindCandEdges(EdgeList &Edges, int MaxImageCount) { Edges.clear(); PILE_INDEX_TYPE *Partners = all(PILE_INDEX_TYPE, MaxImageCount); bool *PartnersRev = all(bool, MaxImageCount); for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex) { PileData &Pile = Piles[PileIndex]; int PartnerCount; PartnerCount = FindEdgesPile(Pile, PileIndex, Partners, PartnersRev); AddEdges(Edges, PileIndex, Partners, PartnersRev, PartnerCount); } freemem(Partners); freemem(PartnersRev); } static void AssignFamsToPiles(FamList &Fams) { int FamIndex = 0; for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p) { FamData *Fam = *p; if (Fam->size() < (size_t) g_paramMinFamSize) Quit("Fam size"); for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q) { FamMemberData &FamMember = *q; int PileIndex = FamMember.PileIndex; PileData &Pile = Piles[PileIndex]; Pile.FamIndex = FamIndex; Pile.Rev = (int) FamMember.Rev; } ++FamIndex; } } // Discard families that don't have regular spacing typedef std::vector FAMVEC; static void FilterCrispFam(const FamData &Fam, FamList &OutFams) { int PileCount = (int) Fam.size(); FAMVEC FamVec; FamVec.reserve(PileCount); for (FamData::const_iterator p = Fam.begin(); p != Fam.end(); ++p) FamVec.push_back(*p); std::sort(FamVec.begin(), FamVec.end(), FamMemberLt); #if TRACE Log("\n"); Log("FilterCrispFam fam size=%d\n", (int) Fam.size()); Log("\n"); Log("After sort:\n"); Log(" Pile From To\n"); Log("===== ======= =======\n"); for (int i = 0; i < PileCount; ++i) { const FamMemberData &Mem = FamVec[i]; int PileIndex = Mem.PileIndex; const PileData &Pile = Piles[PileIndex]; Log("%5d %7d %7d\n", PileIndex, Pile.From, Pile.To); } Log("\n"); #endif FamData OutFam; for (int i = 0; i < PileCount; ++i) { const FamMemberData &Mem = FamVec[i]; if (i == 0 || i == 1) { OutFam.push_back(Mem); continue; } const FamMemberData &Mem_1 = FamVec[i-1]; const FamMemberData &Mem_2 = FamVec[i-2]; int PileIndex = Mem.PileIndex; int PileIndex_1 = Mem_1.PileIndex; int PileIndex_2 = Mem_2.PileIndex; const PileData &Pile = Piles[PileIndex]; const PileData &Pile_1 = Piles[PileIndex_1]; const PileData &Pile_2 = Piles[PileIndex_2]; int Space_12 = Pile_1.From - Pile_2.To; int Space_1 = Pile.From - Pile_1.To; #if TRACE { Log(" \n", PileIndex_2, Pile_2.From, Pile_2.To, Space_12, PileIndex_1, Pile_1.From, Pile_2.To, Space_1, PileIndex, Pile.From, Pile.To); } #endif if (iabs(Space_12 - Space_1) <= MAX_SPACE_DIFF) { #if TRACE Log("Add %d to current family\n", PileIndex); #endif OutFam.push_back(Mem); } else { #if TRACE Log("Space difference too big, fam size so far %d\n", (int) OutFam.size()); #endif if (OutFam.size() >= (size_t) g_paramMinFamSize) { FamData *NewFam = new FamData; *NewFam = OutFam; OutFams.push_back(NewFam); OutFam = *new FamData; } else OutFam.clear(); } } if (OutFam.size() >= (size_t) g_paramMinFamSize) { FamData *NewFam = new FamData; *NewFam = OutFam; OutFams.push_back(NewFam); } } static void FilterCrispFams(const FamList &InFams, FamList &OutFams) { OutFams.clear(); for (FamList::const_iterator p = InFams.begin(); p != InFams.end(); ++p) { FamData *Fam = *p; FilterCrispFam(*Fam, OutFams); } } void Crisp() { const char *InputFileName = RequiredValueOpt("crisp"); const char *OutputFileName = ValueOpt("out"); const char *PilesFileName = ValueOpt("piles"); const char *ImagesFileName = ValueOpt("images"); const char *ArraysFileName = ValueOpt("arrays"); const char *strMinFamSize = ValueOpt("famsize"); if (0 == OutputFileName && 0 == PilesFileName && 0 == ImagesFileName) Quit("No output file specified, must be at least one of -out, -piles, -images"); if (0 != strMinFamSize) g_paramMinFamSize = atoi(strMinFamSize); ProgressStart("Read hit file"); int HitCount; int SeqLength; HitData *Hits = ReadHits(InputFileName, &HitCount, &SeqLength); ProgressDone(); Progress("%d hits", HitCount); ProgressStart("Filter candidate hits"); int NewHitCount = 0; for (int i = 0; i < HitCount; ++i) { HitData &Hit = Hits[i]; if (IsCand(Hit)) Hits[NewHitCount++] = Hit; } ProgressDone(); Progress("%d of %d candidate hits", NewHitCount, HitCount); HitCount = NewHitCount; SeqLengthChunks = (SeqLength + CHUNK_LENGTH - 1)/CHUNK_LENGTH; const int BitVectorLength = (SeqLengthChunks + BITS_PER_INT - 1)/BITS_PER_INT; int *CopyCount = all(int, BitVectorLength); zero(CopyCount, int, BitVectorLength); ProgressStart("Compute copy counts"); for (int i = 0; i < HitCount; ++i) IncCopyCount(CopyCount, Hits[i]); ProgressDone(); ProgressStart("Identify piles"); PILE_INDEX_TYPE *PileIndexes = IdentifyPiles(CopyCount); ProgressDone(); Progress("%d stacks", PileCount); freemem(CopyCount); CopyCount = 0; CreatePiles(Hits, HitCount, PileIndexes); if (0 != ImagesFileName) { ProgressStart("Writing images file"); WriteImages(ImagesFileName, Hits, HitCount, PileIndexes); ProgressDone(); } freemem(Hits); Hits = 0; if (0 != PilesFileName) { ProgressStart("Writing piles file"); WritePiles(PilesFileName, Piles, PileCount); ProgressDone(); } freemem(PileIndexes); PileIndexes = 0; if (0 == OutputFileName) return; ProgressStart("Find edges"); EdgeList Edges; FindCandEdges(Edges, MaxImageCount); ProgressDone(); Progress("%d edges", (int) Edges.size()); Progress("Find connected components"); FamList Fams; FindConnectedComponents(Edges, Fams, g_paramMinFamSize); Progress("Filter"); FamList OutFams; FilterCrispFams(Fams, OutFams); AssignFamsToPiles(OutFams); ProgressDone(); Progress("%d arrays", (int) OutFams.size()); ProgressStart("Write crisp file"); WriteCrispFile(OutputFileName, Piles, PileCount); ProgressDone(); if (0 != ArraysFileName) { FILE *fArray = OpenStdioFile(ArraysFileName, FILEIO_MODE_WriteOnly); ProgressStart("Writing arrays file"); int FamIndex = 0; for (PtrFamList p = OutFams.begin(); p != OutFams.end(); ++p) { int Lo = -1; int Hi = -1; FamData *Fam = *p; if (Fam->size() < (size_t) g_paramMinFamSize) Quit("Fam size"); for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q) { FamMemberData &FamMember = *q; int PileIndex = FamMember.PileIndex; PileData &Pile = Piles[PileIndex]; if (Lo == -1 || Pile.From < Lo) Lo = Pile.From; if (Hi == -1 || Pile.To > Hi) Hi = Pile.To; } WriteArray(fArray, FamIndex, Lo, Hi); ++FamIndex; } fclose(fArray); ProgressDone(); } } findcc.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000001146611447530636015032 0ustar ????????????????#include "piler2.h" enum REVSTATE { REVSTATE_Unknown = 0, REVSTATE_Normal = 1, REVSTATE_Reversed = 2 }; struct NeighborData { int NodeIndex; bool Rev; }; typedef std::list NeighborList; typedef NeighborList::iterator PtrNeighborList; struct NodeData { REVSTATE Rev; int Index; NeighborList *Neighbors; NodeData *Next; NodeData *Prev; NodeData **List; }; static NodeData *Nodes; static FamData *MakeFam(NodeData *Nodes) { FamData *Fam = new FamData; for (const NodeData *Node = Nodes; Node; Node = Node->Next) { FamMemberData FamMember; FamMember.PileIndex = Node->Index; switch (Node->Rev) { case REVSTATE_Unknown: Quit("REVSTATE_Unknown"); case REVSTATE_Normal: FamMember.Rev = false; break; case REVSTATE_Reversed: FamMember.Rev = true; break; } Fam->push_back(FamMember); } return Fam; } static void AddNodeToList(NodeData *Node, NodeData **List) { NodeData *Head = *List; Node->Next = Head; Node->Prev = 0; if (Head != 0) Head->Prev = Node; Node->List = List; *List = Node; } static void DeleteNodeFromList(NodeData *Node, NodeData **List) { assert(Node->List == List); NodeData *Head = *List; if (Node->Next != 0) Node->Next->Prev = Node->Prev; if (Node->Prev != 0) Node->Prev->Next = Node->Next; else *List = Node->Next; Node->List = 0; } static NodeData *ListHead(NodeData **List) { return *List; } static void MoveBetweenLists(NodeData *Node, NodeData **FromList, NodeData **ToList) { DeleteNodeFromList(Node, FromList); AddNodeToList(Node, ToList); } static bool ListIsEmpty(NodeData **List) { return 0 == *List; } static bool NodeIsInList(NodeData *Node, NodeData **List) { return Node->List == List; } static void LogList(NodeData **List) { for (const NodeData *Node = *List; Node; Node = Node->Next) Log(" %d", Node->Index); Log("\n"); } static int GetMaxIndex(EdgeList &Edges) { int MaxIndex = -1; for (PtrEdgeList p = Edges.begin(); p != Edges.end(); ++p) { EdgeData &Edge = *p; if (Edge.Node1 > MaxIndex) MaxIndex = Edge.Node1; if (Edge.Node2 > MaxIndex) MaxIndex = Edge.Node2; } return MaxIndex; } static REVSTATE RevState(REVSTATE Rev1, bool Rev2) { switch (Rev1) { case REVSTATE_Normal: if (Rev2) return REVSTATE_Reversed; return REVSTATE_Normal; case REVSTATE_Reversed: if (Rev2) return REVSTATE_Normal; return REVSTATE_Reversed; } assert(false); return REVSTATE_Unknown; } int FindConnectedComponents(EdgeList &Edges, FamList &Fams, int MinComponentSize) { Fams.clear(); if (0 == Edges.size()) return 0; int NodeCount = GetMaxIndex(Edges) + 1; Nodes = new NodeData[NodeCount]; for (int i = 0; i < NodeCount; ++i) { Nodes[i].Neighbors = new NeighborList; Nodes[i].Rev = REVSTATE_Unknown; Nodes[i].Index = i; } NodeData *NotVisitedList = 0; NodeData *PendingList = 0; NodeData *CurrentList = 0; for (PtrEdgeList p = Edges.begin(); p != Edges.end(); ++p) { EdgeData &Edge = *p; int From = Edge.Node1; int To = Edge.Node2; assert(From >= 0 && From < NodeCount); assert(To >= 0 && From < NodeCount); NeighborData NTo; NTo.NodeIndex = To; NTo.Rev = Edge.Rev; Nodes[From].Neighbors->push_back(NTo); NeighborData NFrom; NFrom.NodeIndex = From; NFrom.Rev = Edge.Rev; Nodes[To].Neighbors->push_back(NFrom); } for (int i = 0; i < NodeCount; ++i) AddNodeToList(&Nodes[i], &NotVisitedList); int FamCount = 0; while (!ListIsEmpty(&NotVisitedList)) { int ClassSize = 0; NodeData *Node = ListHead(&NotVisitedList); // This node becomes the first in the family // By convention, the first member defines reversal or lack thereof. Node->Rev = REVSTATE_Normal; assert(ListIsEmpty(&PendingList)); MoveBetweenLists(Node, &NotVisitedList, &PendingList); while (!ListIsEmpty(&PendingList)) { Node = ListHead(&PendingList); assert(REVSTATE_Normal == Node->Rev || REVSTATE_Reversed == Node->Rev); NeighborList *Neighbors = Node->Neighbors; for (PtrNeighborList p = Neighbors->begin(); p != Neighbors->end(); ++p) { NeighborData &Neighbor = *p; int NeighborIndex = Neighbor.NodeIndex; NodeData *NeighborNode = &(Nodes[NeighborIndex]); if (NodeIsInList(NeighborNode, &NotVisitedList)) { NeighborNode->Rev = RevState(Node->Rev, Neighbor.Rev); MoveBetweenLists(NeighborNode, &NotVisitedList, &PendingList); } } ++ClassSize; MoveBetweenLists(Node, &PendingList, &CurrentList); } if (ClassSize >= MinComponentSize) { FamData *Fam = MakeFam(CurrentList); Fams.push_back(Fam); ++FamCount; } CurrentList = 0; } return FamCount; } gff.cpp0000777000076500010010000000616611527243121011104 0ustar GuestNone#include "piler2.h" // GFF fields are: // [attributes] [comments] // 0 1 2 3 4 5 6 7 8 9 int GFFLineNr; // Destructive read -- pokes nuls onto FS int GetFields(char *Line, char **Fields, int MaxFields, char FS) { char *p = Line; for (int FieldIndex = 0; FieldIndex < MaxFields; ++FieldIndex) { Fields[FieldIndex] = p; char *Tab = strchr(p, FS); char *End = Tab; if (0 == End) End = strchr(p, '\0'); size_t FieldLength = End - p; if (FieldLength > MAX_GFF_FEATURE_LENGTH) Quit("Max GFF field length exceeded, field is %d chars, max=%d, line %d", FieldLength, MAX_GFF_FEATURE_LENGTH, GFFLineNr); if (0 == Tab) return FieldIndex + 1; *Tab = 0; p = Tab + 1; } return MaxFields; } bool GetNextGFFRecord(FILE *f, GFFRecord &Rec) { for (;;) { ++GFFLineNr; const char TAB = '\t'; char Line[MAX_GFF_LINE+1]; char *Ok = fgets(Line, sizeof(Line), f); if (NULL == Ok) { if (feof(f)) return false; Quit("Error reading GFF file, line=%d feof=%d ftell=%d ferror=%d errno=%d", GFFLineNr, feof(f), ftell(f), ferror(f), errno); } if ('#' == Line[0]) continue; size_t n = strlen(Line); if (0 == n) Quit("fgets returned zero-length line"); if (Line[n-1] != '\n' && !feof(f)) Quit("Max line length in GFF file exceeded, line %d is %d chars long, max=%d", GFFLineNr, n - 1, MAX_GFF_LINE); Line[n-1] = 0; // delete newline char *Fields[9]; int FieldCount = GetFields(Line, Fields, 9, '\t'); if (FieldCount < 8) Quit("GFF record has < 8 fields, line %d", GFFLineNr); const char *SeqName = Fields[0]; const char *Source = Fields[1]; const char *Feature = Fields[2]; const char *Start = Fields[3]; const char *End = Fields[4]; const char *Score = Fields[5]; const char *Strand = Fields[6]; const char *Frame = Fields[7]; const char *Attrs = ""; if (FieldCount > 8) { // Truncate attrs if comment found Attrs = Fields[8]; // char *Pound = strchr(Attrs, '#'); // if (0 != Pound) // *Pound = 0; } strcpy(Rec.SeqName, SeqName); strcpy(Rec.Source, Source); strcpy(Rec.Feature, Feature); Rec.Start = atoi(Start); Rec.End = atoi(End); Rec.Score = (float) atof(Score); Rec.Strand = Strand[0]; Rec.Frame = Frame[0] == '.' ? -1 : atoi(Frame); strcpy(Rec.Attrs, Attrs); return true; } } // GFF fields are: // [attributes] [comments] // 0 1 2 3 4 5 6 7 8 9 void WriteGFFRecord(FILE *f, const GFFRecord &Rec) { fprintf(f, "%s\t%s\t%s\t%d\t%d\t%.3g\t%c", // 0 1 2 3 4 5 6 7 8 Rec.SeqName, // 0 Rec.Source, // 1 Rec.Feature, // 2 Rec.Start, // 3 Rec.End, // 4 Rec.Score, // 5 Rec.Strand); // 6 if (-1 == Rec.Frame) fprintf(f, "\t."); else fprintf(f, "\t%d", Rec.Frame); fprintf(f, "\t%s\n", Rec.Attrs); } gff2.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000001611611447530636014425 0ustar ????????????????#include "piler2.h" #define GFFRecord GFFRecord2 // GFF fields are: // [attributes] [comments] // 0 1 2 3 4 5 6 7 8 9 static int GFFLineNr2; bool HasTargetAttrs(const char *Attrs) { const char *ptrTarget = strstr(Attrs, "Target "); return 0 != ptrTarget; } // Piles 123 456 // 0123456 void ParsePilesAttrs(const char *Attrs, int *ptrQueryPile, int *ptrTargetPile) { const char *ptrPiles = strstr(Attrs, "Piles "); if (0 == ptrPiles) Quit("Piles attribute not found"); const char *ptrValues = ptrPiles + 6; int n = sscanf(ptrValues, "%d %d", ptrQueryPile, ptrTargetPile); if (n != 2) Quit("ParsePilesAttrs, sscanf(%s)=%d", ptrValues, n); } // BandClust 123 // 01234567890 void ParseBandClustAttrs(const char *Attrs, int *ptrBandClustIndex) { const char *ptrBandClust = strstr(Attrs, "BandClust "); if (0 == ptrBandClust) Quit("BandClust attribute not found"); const char *ptrIndex = ptrBandClust + 10; *ptrBandClustIndex = atoi(ptrIndex); } // Pyramid 123 // 01234567890 void ParsePyramidAttrs(const char *Attrs, int *ptrClustIndex) { const char *p = strstr(Attrs, "Pyramid "); if (0 == p) Quit("Pyramid attribute not found"); const char *ptrIndex = p + 8; *ptrClustIndex = atoi(ptrIndex); } // Target SeqName 123 456 // 01234567 void ParseTargetAttrs(const char *Attrs, char SeqName[], int SeqNameBytes, int *ptrStart, int *ptrEnd) { const char *ptrTarget = strstr(Attrs, "Target "); if (0 == ptrTarget) Quit("Target attribute not found"); const char *ptrRest = ptrTarget + 7; const char *ptrBlank = strchr(ptrRest, ' '); if (0 == ptrBlank) Quit("Invalid Target attributes '%s', missing blank", Attrs); size_t NameLength = ptrBlank - ptrRest; if (NameLength >= (size_t) SeqNameBytes) Quit("Target name length too long '%s'", Attrs); memcpy(SeqName, ptrRest, NameLength); SeqName[NameLength] = 0; int n = sscanf(ptrBlank+1, "%d %d", ptrStart, ptrEnd); if (n != 2) Quit("Invalid Target start/end attributes '%s', sscanf=%d", ptrBlank+1); } // Destructive read -- pokes nuls onto FS static int GetFields2(char *Line, char **Fields, int MaxFields, char FS) { char *p = Line; for (int FieldIndex = 0; FieldIndex < MaxFields; ++FieldIndex) { Fields[FieldIndex] = p; char *Tab = strchr(p, FS); char *End = Tab; if (0 == End) End = strchr(p, '\0'); size_t FieldLength = End - p; if (FieldLength > MAX_GFF_FEATURE_LENGTH) Quit("Max GFF field length exceeded, field is %d chars, max=%d, line %d", FieldLength, MAX_GFF_FEATURE_LENGTH, GFFLineNr2); if (0 == Tab) return FieldIndex + 1; *Tab = 0; p = Tab + 1; } return MaxFields; } // WARNING: Line[] and Fields[]are overwritten // by each call to GetNextGFFRecord. // The Rec agument to GetNextGFFRecord returns // pointers into Line[]. static char Line[MAX_GFF_LINE+1]; static char *Fields[9]; bool GetNextGFFRecord(FILE *f, GFFRecord &Rec) { for (;;) { ++GFFLineNr2; const char TAB = '\t'; char *Ok = fgets(Line, sizeof(Line), f); if (NULL == Ok) { if (feof(f)) return false; Quit("Error reading GFF file, line=%d feof=%d ftell=%d ferror=%d errno=%d", GFFLineNr2, feof(f), ftell(f), ferror(f), errno); } if ('#' == Line[0]) continue; size_t n = strlen(Line); if (0 == n) Quit("fgets returned zero-length line"); if (Line[n-1] != '\n') Quit("Max line length in GFF file exceeded, line %d is %d chars long, max=%d", GFFLineNr2, n - 1, MAX_GFF_LINE); Line[n-1] = 0; // delete newline int FieldCount = GetFields2(Line, Fields, 9, '\t'); if (FieldCount < 8) Quit("GFF record has < 8 fields, line %d", GFFLineNr2); char *SeqName = Fields[0]; char *Source = Fields[1]; char *Feature = Fields[2]; char *Start = Fields[3]; char *End = Fields[4]; char *Score = Fields[5]; char *Strand = Fields[6]; char *Frame = Fields[7]; char *Attrs = Fields[8]; // Truncate attrs if comment found char *Pound = strchr(Attrs, '#'); if (0 != Pound) *Pound = 0; Rec.SeqName = SeqName; Rec.Source = Source; Rec.Feature = Feature; Rec.Start = atoi(Start); Rec.End = atoi(End); Rec.Score = (float) atof(Score); Rec.Strand = Strand[0]; Rec.Frame = Frame[0] == '.' ? -1 : atoi(Frame); Rec.Attrs = Attrs; return true; } } // GFF fields are: // [attributes] [comments] // 0 1 2 3 4 5 6 7 8 9 void WriteGFFRecord(FILE *f, const GFFRecord &Rec) { fprintf(f, "%s\t%s\t%s\t%d\t%d\t%.3g\t%c", // 0 1 2 3 4 5 6 7 8 Rec.SeqName, // 0 Rec.Source, // 1 Rec.Feature, // 2 Rec.Start, // 3 Rec.End, // 4 Rec.Score, // 5 Rec.Strand); // 6 if (-1 == Rec.Frame) fprintf(f, "\t."); else fprintf(f, "\t%d", Rec.Frame); fprintf(f, "\t%s\n", Rec.Attrs); } void SaveGFFStrings(GFFRecord &Rec) { Rec.SeqName = strsave(Rec.SeqName); Rec.Feature = strsave(Rec.Feature); Rec.Source = strsave(Rec.Source); Rec.Attrs = strsave(Rec.Attrs); } void FreeGFFStrings(GFFRecord &Rec) { free((void *) Rec.SeqName); free((void *) Rec.Feature); free((void *) Rec.Source); free((void *) Rec.Attrs); Rec.SeqName = 0; Rec.Feature = 0; Rec.Source = 0; Rec.Attrs = 0; } void GFFRecordToHit(const GLIX &Glix, const GFFRecord &Rec, HitData &Hit) { if (0 != strcmp(Rec.Feature, "hit")) Quit("GFFRecordToHit: feature=%s", Rec.Feature); const int QueryLength = Rec.End - Rec.Start + 1; Hit.QueryFrom = Glix.LocalToGlobal(Rec.SeqName, Rec.Start - 1); Hit.QueryTo = Hit.QueryFrom + QueryLength - 1; char TargetName[MAX_GFF_FEATURE_LENGTH+1]; int TargetStart; int TargetEnd; ParseTargetAttrs(Rec.Attrs, TargetName, sizeof(TargetName), &TargetStart, &TargetEnd); const int TargetLength = TargetEnd - TargetStart + 1; Hit.TargetFrom = Glix.LocalToGlobal(TargetName, TargetStart - 1); Hit.TargetTo = Hit.TargetFrom + TargetLength - 1; Hit.Rev = (Rec.Strand == '-'); } void HitToGFFRecord(const GLIX &Glix, const HitData &Hit, GFFRecord &Rec, char AnnotBuffer[]) { Rec.Source = "piler"; Rec.Feature = "hit"; Rec.Score = 0; Rec.Frame = '.'; Rec.Strand = (Hit.Rev ? '-' : '+'); const int QueryLength = Hit.QueryTo - Hit.QueryFrom + 1; int SeqQueryFrom; const char *QueryLabel = Glix.GlobalToSeqPadded(Hit.QueryFrom, &SeqQueryFrom); const int SeqQueryTo = SeqQueryFrom + QueryLength - 1; Rec.SeqName = QueryLabel; Rec.Start = SeqQueryFrom + 1; Rec.End = SeqQueryTo + 1; const int TargetLength = Hit.TargetTo - Hit.TargetFrom + 1; int SeqTargetFrom; const char *TargetLabel = Glix.GlobalToSeqPadded(Hit.TargetFrom, &SeqTargetFrom); const int SeqTargetTo = SeqTargetFrom + TargetLength - 1; sprintf(AnnotBuffer, "Target %s %d %d", TargetLabel, SeqTargetFrom + 1, SeqTargetTo + 1); Rec.Attrs = AnnotBuffer; } gffset.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000404111447530636015051 0ustar ????????????????/*** GFFSet is a container class for a set of GFF records. This is to hide issues to do with retrieving from a file, efficient memory use etc. The initial implementation here is designed for simplicity; by hiding the details we can improve it later as needed without a big impact on the calling code. ***/ #include "piler2.h" #define GFFRecord GFFRecord2 GFFSet::GFFSet() { m_Recs = 0; m_RecordCount = 0; m_RecordBufferSize = 0; m_GLIX = 0; m_IIX = 0; } GFFSet::~GFFSet() { Free(); } void GFFSet::Free() { for (int i = 0; i < m_RecordCount; ++i) FreeGFFStrings(m_Recs[i]); freemem(m_Recs); delete m_GLIX; delete m_IIX; m_Recs = 0; m_RecordCount = 0; m_RecordBufferSize = 0; m_GLIX = 0; m_IIX = 0; } void GFFSet::FromFile(FILE *f) { Free(); GFFRecord Rec; while (GetNextGFFRecord(f, Rec)) { SaveGFFStrings(Rec); Add(Rec); } } // Caller's respsonsibility to allocate memory // for string fields, Add() doesn't make copies. // SaveGFFStrings() makes copies if needed. void GFFSet::Add(const GFFRecord &Rec) { if (m_RecordCount > m_RecordBufferSize) Quit("GFFSet::Add, m_RecordCount > m_BufferSize"); if (m_RecordCount == m_RecordBufferSize) { m_RecordBufferSize += BUFFER_SIZE_INC; reall(m_Recs, GFFRecord, m_RecordBufferSize); } m_Recs[m_RecordCount++] = Rec; } const GFFRecord &GFFSet::Get(int RecordIndex) const { if (RecordIndex < 0 || RecordIndex >= m_RecordCount) Quit("GFFRecord::Get(%d) out of range %d", RecordIndex, m_RecordCount); return m_Recs[RecordIndex]; } void GFFSet::MakeGLIX() { m_GLIX = new GLIX; m_GLIX->Init(); m_GLIX->FromGFFSet(*this); } void GFFSet::MakeIIX() { const int GlobalLength = m_GLIX->GetGlobalLength(); m_IIX = new IIX; m_IIX->Init(GlobalLength); m_IIX->SetGLIX(m_GLIX); for (int RecordIndex = 0; RecordIndex < m_RecordCount; ++RecordIndex) { const GFFRecord &Rec = Get(RecordIndex); m_IIX->AddLocal(Rec.SeqName, Rec.Start - 1, Rec.End - 1, RecordIndex); } } glix.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000001657311447530636014553 0ustar ????????????????/*** GLIX = Global Local IndeX This index provides an efficient mapping of a set of sequences to and from a single set of global coordinates such that each sequence has a unique range of positions. Global coordinates are convenient in a number of algorithms, and there may be quite large numbers of sequences (e.g., contigs or scaffolds in an early assembly), so efficiency is useful here. "Local" coordinate is (Label, SeqPos), where SeqPos is 0-based, i.e. is 0, 1 ... (SeqLength-1) for each sequence. "Global" coordindate is GlobalPos, where GlobalPos = 0, 1 ... (GlobalLength - 1). To convert from Local to Global, a hash table is used to look up by sequence name. To convert from Global to Local, the global sequence is divided into blocks of length B. A vector of length GlobalLength/B contains the sequence index for each block. Padding is therefore required to ensure that there are no blocks that map to two different sequences. ***/ #include "piler2.h" #define GFFRecord GFFRecord2 GLIX::GLIX() { m_SeqCount = 0; m_GlobalLength = 0; m_HashTableSize = 0; m_Pad = 0; m_HashTable = 0; m_SeqMap = 0; } GLIX::~GLIX() { } void GLIX::Free() { // TODO@@ } void GLIX::Init(int Pad, int HashTableSize) { Free(); m_Pad = Pad; m_HashTableSize = HashTableSize; m_HashTable = all(SEQDATA *, HashTableSize); zero(m_HashTable, SEQDATA *, HashTableSize); } SEQDATA *GLIX::SeqIndexLookup(const char *Label) const { int h = Hash(Label, m_HashTableSize); assert(h >= 0 && h < m_HashTableSize); for (SEQDATA *i = m_HashTable[h]; i; i = i->Next) if (0 == strcmp(i->Label, Label)) return i; return 0; } SEQDATA *GLIX::AddToIndex(const char *Label, int Length) { int h = Hash(Label, m_HashTableSize); assert(h >= 0 && h < m_HashTableSize); SEQDATA *i = all(SEQDATA, 1); i->Label = strsave(Label); i->Length = Length; i->Next = m_HashTable[h]; m_HashTable[h] = i; ++m_SeqCount; return i; } void GLIX::AssignOffsets() { int Offset = 0; for (int h = 0; h < m_HashTableSize; ++h) { for (SEQDATA *i = m_HashTable[h]; i; i = i->Next) { i->Offset = Offset; Offset += i->Length; const int NewOffset = RoundUp(Offset, m_Pad, SEQMAP_BLOCKSIZE); i->RoundedLength = i->Length + NewOffset - Offset; Offset = NewOffset; } } m_GlobalLength = Offset; } // Add() is typically used when creating a GLIX from GFF records. // It can be called multiple times with the same label, the // maximum position determines the sequence length. void GLIX::Add(const char *Label, int Pos) { SEQDATA *IE = SeqIndexLookup(Label); if (0 == IE) AddToIndex(Label, Pos + 1); else { if (Pos + 1 > IE->Length) IE->Length = Pos + 1; } } // Insert() is typically used when creating a GLIX from // a sequence file. It must be called exactly once per label. void GLIX::Insert(const char *Label, int Offset, int Length) { const SEQDATA *IE = SeqIndexLookup(Label); if (0 != IE) Quit("Duplicate sequence label %s", Label); int h = Hash(Label, m_HashTableSize); assert(h >= 0 && h < m_HashTableSize); SEQDATA *i = all(SEQDATA, 1); i->Label = strsave(Label); i->Length = Length; i->Offset = Offset; i->Next = m_HashTable[h]; m_HashTable[h] = i; ++m_SeqCount; } int GLIX::FromGFFFile(FILE *f) { int RecordCount = 0; GFFRecord Rec; while (GetNextGFFRecord(f, Rec)) { ++RecordCount; Add(Rec.SeqName, Rec.End - 1); if (HasTargetAttrs(Rec.Attrs)) { char TargetName[MAX_GFF_FEATURE_LENGTH+1]; int Start; int End; ParseTargetAttrs(Rec.Attrs, TargetName, sizeof(TargetName), &Start, &End); Add(TargetName, End - 1); } } AssignOffsets(); return RecordCount; } int GLIX::FromGFFSet(const GFFSet &Set) { const int RecordCount = Set.GetRecordCount(); for (int i = 0; i < RecordCount; ++i) { const GFFRecord &Rec = Set.Get(i); Add(Rec.SeqName, Rec.End - 1); } AssignOffsets(); return RecordCount; } void GLIX::LogMe() const { Log("m_SeqCount=%d\n", m_SeqCount); Log(" Label Length Hash\n"); Log("-------------------- ---------- -------\n"); int Count = 0; for (int h = 0; h < m_HashTableSize; ++h) { for (SEQDATA *i = m_HashTable[h]; i; i = i->Next) { Log("%20.20s %10d %7d\n", i->Label, i->Length, h); ++Count; } } if (Count != m_SeqCount) Log("\n**** ERROR **** Count=%d\n", Count); } int GLIX::GetGlobalLength() const { return m_GlobalLength; } int GLIX::GetSeqCount() const { return m_SeqCount; } int GLIX::GetSeqOffset(const char *Label) const { const SEQDATA *IE = SeqIndexLookup(Label); if (0 == IE) Quit("Label not found %s", Label); return IE->Offset; } int GLIX::GetSeqLength(const char *Label) const { const SEQDATA *IE = SeqIndexLookup(Label); if (0 == IE) Quit("Label not found %s", Label); return IE->Length; } void GLIX::MakeGlobalToLocalIndex() { if (m_GlobalLength%SEQMAP_BLOCKSIZE) Quit("MakeGlobalToLocalIndex: expects rounded size"); const int BinCount = (m_GlobalLength + SEQMAP_BLOCKSIZE - 1)/SEQMAP_BLOCKSIZE; m_SeqMap = all(SEQDATA *, BinCount); zero(m_SeqMap, SEQDATA *, BinCount); for (int h = 0; h < m_HashTableSize; ++h) { for (SEQDATA *p = m_HashTable[h]; p != 0; p = p->Next) { SEQDATA &IE = *p; int From = p->Offset; int To = From + p->Length - 1; if (From%SEQMAP_BLOCKSIZE) Quit("MakeGlobalToLocalIndex: expected rounded contig from"); int BinFrom = From/SEQMAP_BLOCKSIZE; int BinTo = To/SEQMAP_BLOCKSIZE; for (int Bin = BinFrom; Bin <= BinTo; ++Bin) { if (m_SeqMap[Bin] != 0) Quit("MakeGlobalToLocalIndex: overlap error"); m_SeqMap[Bin] = p; } } } } const char *GLIX::GlobalToSeq(int GlobalPos, int *ptrSeqPos) const { if (GlobalPos < 0 || GlobalPos >= m_GlobalLength) Quit("GlobalToSeqPos: invalid pos"); if (0 == m_SeqMap) Quit("GLIX::MakeGlobalToLocalIndex not called"); const int Bin = GlobalPos/SEQMAP_BLOCKSIZE; SEQDATA *IE = m_SeqMap[Bin]; if (0 == IE) Quit("GlobalToSeqPos: doesn't map"); int SeqPos = GlobalPos - IE->Offset; if (SeqPos < 0 || SeqPos >= IE->Length) Quit("GlobalToSeqPos: out of bounds"); *ptrSeqPos = SeqPos; return IE->Label; } const char *GLIX::GlobalToSeqPadded(int GlobalPos, int *ptrSeqPos) const { if (GlobalPos < 0 || GlobalPos >= m_GlobalLength) Quit("GlobalToSeqPos: invalid pos"); if (0 == m_SeqMap) Quit("GLIX::MakeGlobalToLocalIndex not called"); const int Bin = GlobalPos/SEQMAP_BLOCKSIZE; SEQDATA *IE = m_SeqMap[Bin]; if (0 == IE) Quit("GlobalToSeqPos: doesn't map"); int SeqPos = GlobalPos - IE->Offset; if (SeqPos < 0 || SeqPos >= IE->RoundedLength) Quit("GlobalToSeqPos: out of bounds"); *ptrSeqPos = SeqPos; return IE->Label; } int GLIX::LocalToGlobal(const char *Label, int LocalPos) const { const SEQDATA *IE = SeqIndexLookup(Label); if (0 == IE) Quit("ContigToGlobal: contig not found (%s)", Label); if (LocalPos < 0 || LocalPos >= IE->Length) Quit("ContigToGlobal: out of bounds"); return IE->Offset + LocalPos; } int GLIX::LocalToGlobalNoFail(const char *Label, int LocalPos) const { const SEQDATA *IE = SeqIndexLookup(Label); if (0 == IE) return -1; if (LocalPos < 0 || LocalPos >= IE->Length) return -1; return IE->Offset + LocalPos; } hash.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000001403611447530636014523 0ustar ????????????????#include "piler2.h" typedef unsigned long int ub4; /* unsigned 4-byte quantities */ typedef unsigned char ub1; /* unsigned 1-byte quantities */ #define hashsize(n) ((ub4)1<<(n)) #define hashmask(n) (hashsize(n)-1) /* -------------------------------------------------------------------- mix -- mix 3 32-bit values reversibly. For every delta with one or two bits set, and the deltas of all three high bits or all three low bits, whether the original value of a,b,c is almost all zero or is uniformly distributed, * If mix() is run forward or backward, at least 32 bits in a,b,c have at least 1/4 probability of changing. * If mix() is run forward, every bit of c will change between 1/3 and 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.) mix() was built out of 36 single-cycle latency instructions in a structure that could supported 2x parallelism, like so: a -= b; a -= c; x = (c>>13); b -= c; a ^= x; b -= a; x = (a<<8); c -= a; b ^= x; c -= b; x = (b>>13); ... Unfortunately, superscalar Pentiums and Sparcs can't take advantage of that parallelism. They've also turned some of those single-cycle latency instructions into multi-cycle latency instructions. Still, this is the fastest good hash I could find. There were about 2^^68 to choose from. I only looked at a billion or so. -------------------------------------------------------------------- */ #define mix(a,b,c) \ { \ a -= b; a -= c; a ^= (c>>13); \ b -= c; b -= a; b ^= (a<<8); \ c -= a; c -= b; c ^= (b>>13); \ a -= b; a -= c; a ^= (c>>12); \ b -= c; b -= a; b ^= (a<<16); \ c -= a; c -= b; c ^= (b>>5); \ a -= b; a -= c; a ^= (c>>3); \ b -= c; b -= a; b ^= (a<<10); \ c -= a; c -= b; c ^= (b>>15); \ } /* -------------------------------------------------------------------- hash() -- hash a variable-length key into a 32-bit value k : the key (the unaligned variable-length array of bytes) len : the length of the key, counting by bytes initval : can be any 4-byte value Returns a 32-bit value. Every bit of the key affects every bit of the return value. Every 1-bit and 2-bit delta achieves avalanche. About 6*len+35 instructions. The best hash table sizes are powers of 2. There is no need to do mod a prime (mod is sooo slow!). If you need less than 32 bits, use a bitmask. For example, if you need only 10 bits, do h = (h & hashmask(10)); In which case, the hash table should have hashsize(10) elements. If you are hashing n strings (ub1 **)k, do it like this: for (i=0, h=0; i= 12) { a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24)); b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24)); c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24)); mix(a,b,c); k += 12; len -= 12; } /*------------------------------------- handle the last 11 bytes */ c += length; switch(len) /* all the case statements fall through */ { case 11: c+=((ub4)k[10]<<24); case 10: c+=((ub4)k[9]<<16); case 9 : c+=((ub4)k[8]<<8); /* the first byte of c is reserved for the length */ case 8 : b+=((ub4)k[7]<<24); case 7 : b+=((ub4)k[6]<<16); case 6 : b+=((ub4)k[5]<<8); case 5 : b+=k[4]; case 4 : a+=((ub4)k[3]<<24); case 3 : a+=((ub4)k[2]<<16); case 2 : a+=((ub4)k[1]<<8); case 1 : a+=k[0]; /* case 0: nothing left to add */ } mix(a,b,c); /*-------------------------------------------- report the result */ return c%HASH_TABLE_SIZE; } unsigned Hash(const char *key, unsigned TableSize) { register ub1 *k = (ub1 *) key; register ub4 length = (ub4) strlen(key); /* the length of the key */ const ub4 initval = 12345; /* the previous hash, or an arbitrary value */ register ub4 a,b,c,len; /* Set up the internal state */ len = length; a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */ c = initval; /* the previous hash value */ /*---------------------------------------- handle most of the key */ while (len >= 12) { a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24)); b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24)); c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24)); mix(a,b,c); k += 12; len -= 12; } /*------------------------------------- handle the last 11 bytes */ c += length; switch(len) /* all the case statements fall through */ { case 11: c+=((ub4)k[10]<<24); case 10: c+=((ub4)k[9]<<16); case 9 : c+=((ub4)k[8]<<8); /* the first byte of c is reserved for the length */ case 8 : b+=((ub4)k[7]<<24); case 7 : b+=((ub4)k[6]<<16); case 6 : b+=((ub4)k[5]<<8); case 5 : b+=k[4]; case 4 : a+=((ub4)k[3]<<24); case 3 : a+=((ub4)k[2]<<16); case 2 : a+=((ub4)k[1]<<8); case 1 : a+=k[0]; /* case 0: nothing left to add */ } mix(a,b,c); /*-------------------------------------------- report the result */ return c%TableSize; } iix.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000001010011447530636014355 0ustar ????????????????/*** IIX = Interval IndeX. Given a fixed set of intervals on a sequence of length L, provides an efficient way to retrieve all intervals in the set that have non-zero overlap with given interval. This implementation is designed to be simple to implement and efficient for the typical sets of intervals that I will be dealing with -- this is not a good general solution to the problem. The main assumption is that there are few long intervals. The sequence is divided into blocks of length B. For each block, there is a list of intervals that overlap that block. So an indexed interval (ii) of length > B will appear in more than one list, and in fact any ii of length > 1 may appear in more than one list if it happens to cross a block boundary. Intervals of length >> B are a problem because they will appear in many lists, but for my purposes they are rare. Each interval is associated with a user-defined integer. This can be used by the calling code to map an ii to some data, e.g. a GFF record. How this is done is up to the caller. ***/ #include "piler2.h" IIX::IIX() { m_GlobalLength = 0; m_BlockLength = 0; m_BlockCount = 0; m_Blocks = 0; m_GLIX = 0; } IIX::~IIX() { Free(); } void IIX::Free() { // TODO@@ m_GlobalLength = 0; } int IIX::PosToBlockIndex(int Pos) const { return Pos/m_BlockLength; } void IIX::Init(int GlobalLength, int BlockLength) { m_GlobalLength = GlobalLength; m_BlockLength = BlockLength; m_BlockCount = (GlobalLength + BlockLength - 1)/BlockLength; m_Blocks = all(INTERVAL *, m_BlockCount); zero(m_Blocks, INTERVAL *, m_BlockCount); m_GLIX = 0; } bool IIX::ValidInterval(int From, int To) const { if (From < 0 || From >= m_GlobalLength) return false; if (To < 0 || To >= m_GlobalLength) return false; if (From > To) return false; return true; } void IIX::AddGlobal(int GlobalFrom, int GlobalTo, int User) { assert(ValidInterval(GlobalFrom, GlobalTo)); const int BlockFrom = PosToBlockIndex(GlobalFrom); const int BlockTo = PosToBlockIndex(GlobalTo); for (int BlockIndex = BlockFrom; BlockIndex <= BlockTo; ++BlockIndex) { INTERVAL *ii = all(INTERVAL, 1); ii->From = GlobalFrom; ii->To = GlobalTo; ii->User = User; ii->Next = m_Blocks[BlockIndex]; m_Blocks[BlockIndex] = ii; } } void IIX::AddLocal(const char *Label, int LocalFrom, int LocalTo, int User) { if (0 == m_GLIX) Quit("IIX::AddLocal: no GLIX"); int GlobalFrom = m_GLIX->LocalToGlobal(Label, LocalFrom); int GlobalTo = GlobalFrom + (LocalTo - LocalFrom + 1); return AddGlobal(GlobalFrom, GlobalTo, User); } int IIX::LookupGlobal(int GlobalFrom, int GlobalTo, int **ptrHits) const { *ptrHits = 0; if (!ValidInterval(GlobalFrom, GlobalTo)) return 0; int *Hits = 0; int HitCount = 0; int HitBufferSize = 0; const int BlockFrom = PosToBlockIndex(GlobalFrom); const int BlockTo = PosToBlockIndex(GlobalTo); for (int BlockIndex = BlockFrom; BlockIndex <= BlockTo; ++BlockIndex) { for (INTERVAL *ii = m_Blocks[BlockIndex]; ii; ii = ii->Next) { if (Overlap(GlobalFrom, GlobalTo, ii->From, ii->To) > 0) { if (HitCount >= HitBufferSize) { HitBufferSize += 128; reall(Hits, int, HitBufferSize); } Hits[HitCount++] = ii->User; } } } *ptrHits = Hits; return HitCount; } int IIX::LookupLocal(const char *Label, int LocalFrom, int LocalTo, int **ptrHits) const { int GlobalFrom = m_GLIX->LocalToGlobalNoFail(Label, LocalFrom); if (-1 == GlobalFrom) return 0; int GlobalTo = GlobalFrom + (LocalTo - LocalFrom + 1); return LookupGlobal(GlobalFrom, GlobalTo, ptrHits); } void IIX::LogMe() const { Log("IIX %d blocks\n", m_BlockCount); Log(" Block From To User\n"); Log("====== ========== ========== ==========\n"); for (int BlockIndex = 0; BlockIndex < m_BlockCount; ++BlockIndex) { for (INTERVAL *ii = m_Blocks[BlockIndex]; ii; ii = ii->Next) Log("%6d %10d %10d %10d\n", BlockIndex, ii->From, ii->To, ii->User); } } log.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000167411447530636014365 0ustar ????????????????#include "piler2.h" static FILE *g_fLog = 0; void SetLog() { bool Append = false; const char *FileName = ValueOpt("log"); if (0 == FileName) { FileName = ValueOpt("loga"); if (0 == FileName) return; Append = true; } g_fLog = fopen(FileName, Append ? "a" : "w"); if (NULL == g_fLog) { fprintf(stderr, "\n*** FATAL ERROR *** Cannot open log file\n"); perror(FileName); exit(1); } } void Log(const char Format[], ...) { if (0 == g_fLog) return; char Str[4096]; va_list ArgList; va_start(ArgList, Format); vsprintf(Str, Format, ArgList); fprintf(g_fLog, "%s", Str); fflush(g_fLog); } void Warning(const char Format[], ...) { char Str[4096]; va_list ArgList; va_start(ArgList, Format); vsprintf(Str, Format, ArgList); fprintf(stderr, "\n** WARNING ** %s\n", Str); if (0 != g_fLog) { fprintf(g_fLog, "** WARNING ** %s\n", Str); fflush(g_fLog); } } main.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000264711447530636014531 0ustar ????????????????#include "piler2.h" #if WIN32 #include #endif bool g_Quiet = false; const char *g_ProcessName = "piler2"; int main(int argc, char *argv[]) { g_ProcessName = argv[0]; #if WIN32 // Multi-tasking does not work well in CPU-bound // console apps running under Win32. // Reducing the process priority allows GUI apps // to run responsively in parallel. SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS); #endif ProcessArgVect(argc - 1, argv + 1); SetLog(); g_Quiet = FlagOpt("quiet"); for (int i = 0; i < argc; ++i) Log("%s ", argv[i]); Log("\n"); if (ValueOpt("trs") != 0) { TRS(); return 0; } else if (ValueOpt("tan") != 0) { Tan(); return 0; } else if (ValueOpt("tr") != 0) { TR(); return 0; } else if (ValueOpt("trs2fasta")) { TRS2Fasta(); return 0; } else if (ValueOpt("tanmotif2fasta")) { Tanmotif2Fasta(); return 0; } else if (ValueOpt("cons")) { Cons(); return 0; } else if (ValueOpt("annot")) { Annot(); return 0; } else if (ValueOpt("annotedge")) { AnnotEdge(); return 0; } else if (ValueOpt("crisp")) { Crisp(); return 0; } else if (FlagOpt("help")) { Usage(); exit(0); } else if (FlagOpt("version")) { fprintf(stderr, PILER_LONG_VERSION "\n"); exit(0); } else CommandLineError("No command specified"); } makeannot.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000002224211447530637015554 0ustar ????????????????#include "piler2.h" #define MAX(i, j) ((i) >= (j) ? (i) : (j)) #define MIN(i, j) ((i) <= (j) ? (i) : (j)) static double MIN_OVERLAP = 0.05; const int MAX_HITS = 8; const int MAX_STR = 256; const int MAP_CHARS = 20; const double MAX_MARGIN = 0.05; const int EDGE_BASES = 100; const int MIN_OVERLAP_EDGE = 32; struct AnnotHit { int Overlap; int RepIndex; }; static AnnotHit AHs[MAX_HITS]; static int AnnotHitCount; static char Str[MAX_STR+1]; static int StrPos; static const RepData *g_Reps; static int g_RepCount; static bool g_Rev; static int g_Mid; static int InRegion(int ContigFrom, int ContigTo, int Pos) { if (Pos >= ContigFrom && Pos <= ContigTo) return true; if (Pos < ContigFrom && ContigFrom - Pos <= MAX_MARGIN) return true; if (Pos > ContigTo && Pos - ContigTo <= MAX_MARGIN) return true; return false; } static int CmpAH(const void *a1, const void *a2) { const AnnotHit *h1 = (const AnnotHit *) a1; const AnnotHit *h2 = (const AnnotHit *) a2; int i1 = h1->RepIndex; int i2 = h2->RepIndex; assert(i1 >= 0 && i1 < g_RepCount); assert(i2 >= 0 && i2 < g_RepCount); if (g_Rev) return g_Reps[i2].ContigFrom - g_Reps[i1].ContigFrom; return g_Reps[i1].ContigFrom - g_Reps[i2].ContigFrom; } static int CmpAHEdge(const void *a1, const void *a2) { const AnnotHit *h1 = (const AnnotHit *) a1; const AnnotHit *h2 = (const AnnotHit *) a2; int i1 = h1->RepIndex; int i2 = h2->RepIndex; assert(i1 >= 0 && i1 < g_RepCount); assert(i2 >= 0 && i2 < g_RepCount); const RepData &Rep1 = g_Reps[i1]; const RepData &Rep2 = g_Reps[i2]; int df1 = iabs(Rep1.ContigFrom - g_Mid); int df2 = iabs(Rep2.ContigFrom - g_Mid); int dt1 = iabs(Rep1.ContigTo - g_Mid); int dt2 = iabs(Rep2.ContigTo - g_Mid); int d1 = MIN(df1, dt1); int d2 = MIN(df2, dt2); return d1 - d2; } static void SafeCatChar(char c) { if (StrPos >= MAX_STR) return; Str[StrPos++] = c; Str[StrPos] = 0; } static void SafeCat(const char *s) { while (char c = *s++) { if (StrPos >= MAX_STR) break; Str[StrPos++] = c; } Str[StrPos] = 0; } static int GetAnnots(const char *Label, int ContigFrom, int ContigTo, const RepData *Reps, int RepCount, int MinOverlap) { int AnnotHitCount = 0; for (int RepIndex = 0; RepIndex < RepCount; ++RepIndex) { const RepData &Rep = Reps[RepIndex]; if (0 != strcmp(Label, Rep.ContigLabel)) continue; int ov = Overlap(ContigFrom, ContigTo, Rep.ContigFrom, Rep.ContigTo); if (ov < MinOverlap) continue; // Unconditionally accept if not maxed out if (AnnotHitCount < MAX_HITS) { AHs[AnnotHitCount].Overlap = ov; AHs[AnnotHitCount].RepIndex = RepIndex; ++AnnotHitCount; } else { // Overwrite shortest hit if this is longer int SmallestOverlap = AHs[0].Overlap; int SmallestIndex = 0; for (int i = 1; i < MAX_HITS; ++i) { if (AHs[i].Overlap < SmallestOverlap) { SmallestOverlap = AHs[i].Overlap; SmallestIndex = 0; } } if (ov > SmallestOverlap) { AHs[SmallestIndex].RepIndex = RepIndex; AHs[SmallestIndex].Overlap = ov; } } } return AnnotHitCount; } static int PosToCharIndex(int RegionFrom, int RegionTo, int Pos, int MapChars) { if (MapChars <= 1) return 0; int RegionLength = RegionTo - RegionFrom + 1; if (RegionLength <= 1) return 0; if (Pos < RegionFrom) return 0; if (Pos >= RegionTo) return MapChars - 1; int CharIndex = ((Pos - RegionFrom)*MapChars)/RegionLength; if (CharIndex < 0 || CharIndex >= MapChars) Quit("PosToCharIndex messed up"); return CharIndex; } static void RevStr() { for (int i = 0; i < MAP_CHARS/2; ++i) { char c1 = Str[i]; char c2 = Str[MAP_CHARS-i-1]; Str[i] = c2; Str[MAP_CHARS-i-1] = c1; } } const char *MakeAnnotEdge(const char *Label, int SeqFrom, int SeqTo, bool Rev, const RepData *Reps, int RepCount) { StrPos = 0; for (int i = 0; i < MAP_CHARS; ++i) SafeCatChar('-'); const int Mid = (SeqFrom + SeqTo)/2; int ContigFrom = SeqFrom - EDGE_BASES/2; int ContigTo = SeqFrom + EDGE_BASES/2; int From = ContigFrom; int To = ContigTo; if (From < 0) From = 0; AnnotHitCount = GetAnnots(Label, From, To, Reps, RepCount, MIN_OVERLAP_EDGE); if (0 == AnnotHitCount) return Str; // Global vars needed by CmpAHEdge g_Rev = Rev; g_Reps = Reps; g_RepCount = RepCount; g_Mid = Mid; // Sort by edge qsort((void *) AHs, AnnotHitCount, sizeof(AnnotHit), CmpAHEdge); for (int AnnotIndex = 0; AnnotIndex < AnnotHitCount; ++AnnotIndex) { const int RepIndex = AHs[AnnotIndex].RepIndex; const RepData &Rep = Reps[RepIndex]; bool LeftEnd = false; bool RightEnd = false; int RepFrom = Rep.RepeatFrom; if (-1 != RepFrom) { int RepTo = Rep.RepeatTo; int RepLeft = Rep.RepeatLeft; int RepLength = RepTo + RepLeft - 1; if (RepLength <= 0) RepLength = 9999; // hack to avoid div by 0 int RepContigFrom = Rep.ContigFrom; int RepContigTo = Rep.ContigTo; if (Rep.Rev) { int Tmp = RepFrom; RepFrom = RepLeft; RepLeft = Tmp; } const double LeftMargin = (double) RepFrom / RepLength; const double RightMargin = (double) RepLeft / RepLength; bool LeftInRegion = InRegion(ContigFrom, ContigTo, Rep.ContigFrom); bool RightInRegion = InRegion(ContigFrom, ContigTo, Rep.ContigTo); LeftEnd = (LeftInRegion && LeftMargin <= MAX_MARGIN); RightEnd = (RightInRegion && RightMargin <= MAX_MARGIN); } int MapFrom = PosToCharIndex(ContigFrom, ContigTo, Rep.ContigFrom, MAP_CHARS); int MapTo = PosToCharIndex(ContigFrom, ContigTo, Rep.ContigTo, MAP_CHARS); const char cLower = 'a' + AnnotIndex; const char cUpper = 'A' + AnnotIndex; if (LeftEnd) Str[MapFrom] = cUpper; else Str[MapFrom] = cLower; for (int i = MapFrom + 1; i <= MapTo - 1; ++i) Str[i] = cLower; if (RightEnd) Str[MapTo] = cUpper; else Str[MapTo] = cLower; SafeCat(" "); SafeCat(Rep.RepeatName); if (Rep.RepeatFrom >= 0) { int FullRepLength = Rep.RepeatTo + Rep.RepeatLeft; int RepMissing = Rep.RepeatFrom + Rep.RepeatLeft; double Pct = ((FullRepLength - RepMissing)*100.0)/FullRepLength; char s[32]; sprintf(s, "(%.0f%%)", Pct); SafeCat(s); } } if (Rev) RevStr(); return Str; } const char *MakeAnnot(const char *Label, int SeqFrom, int SeqTo, bool Rev, const RepData *Reps, int RepCount) { int ContigFrom = SeqFrom; int ContigTo = SeqTo; int Length = ContigTo - ContigFrom + 1; const char *strMinRatio = ValueOpt("minratio"); if (strMinRatio != 0) MIN_OVERLAP = atof(strMinRatio)/100.0; const int MinOverlap = (int) (Length*MIN_OVERLAP); StrPos = 0; for (int i = 0; i < MAP_CHARS; ++i) SafeCatChar('-'); AnnotHitCount = GetAnnots(Label, ContigFrom, ContigTo, Reps, RepCount, MinOverlap); if (0 == AnnotHitCount) return Str; // Global vars needed by CmpAH g_Rev = Rev; g_Reps = Reps; g_RepCount = RepCount; // Sort by start pos qsort((void *) AHs, AnnotHitCount, sizeof(AnnotHit), CmpAH); for (int AnnotIndex = 0; AnnotIndex < AnnotHitCount; ++AnnotIndex) { const int RepIndex = AHs[AnnotIndex].RepIndex; const RepData &Rep = Reps[RepIndex]; const int From = MAX(Rep.ContigFrom, ContigFrom); const int To = MIN(Rep.ContigTo, ContigTo); bool LeftEnd = false; bool RightEnd = false; int RepFrom = Rep.RepeatFrom; if (-1 != RepFrom) { int RepTo = Rep.RepeatTo; int RepLeft = Rep.RepeatLeft; int RepLength = RepTo + RepLeft - 1; if (RepLength <= 0) RepLength = 9999; // hack to avoid div by 0 if (Rep.ContigFrom < ContigFrom) RepFrom += (ContigFrom - Rep.ContigFrom); if (Rep.ContigTo > ContigTo) RepTo -= Rep.ContigTo - ContigTo; const double LeftMargin = (double) RepFrom / RepLength; const double RightMargin = (double) RepLeft / RepLength; LeftEnd = (LeftMargin <= MAX_MARGIN); RightEnd = (RightMargin <= MAX_MARGIN); } int MapFrom = ((From - ContigFrom)*MAP_CHARS)/Length; int MapTo = ((To - ContigFrom)*MAP_CHARS)/Length; if (Rev) { int Tmp = MapFrom; MapFrom = MAP_CHARS - MapTo - 1; MapTo = MAP_CHARS - Tmp - 1; } if (MapFrom < 0 || MapFrom >= MAP_CHARS || MapTo < 0 || MapTo >= MAP_CHARS) Quit("MakeAnnot: failed to map"); const char cLower = 'a' + AnnotIndex; const char cUpper = 'A' + AnnotIndex; if (LeftEnd) Str[MapFrom] = cUpper; else Str[MapFrom] = cLower; for (int i = MapFrom + 1; i <= MapTo - 1; ++i) Str[i] = cLower; if (RightEnd) Str[MapTo] = cUpper; else Str[MapTo] = cLower; SafeCat(" "); SafeCat(Rep.RepeatName); if (Rep.RepeatFrom >= 0) { int FullRepLength = Rep.RepeatTo + Rep.RepeatLeft; int RepMissing = Rep.RepeatFrom + Rep.RepeatLeft; if (ContigFrom > Rep.ContigFrom) RepMissing += ContigFrom - Rep.ContigFrom; if (Rep.ContigTo > ContigTo) RepMissing += Rep.ContigTo - ContigTo; double Pct = ((FullRepLength - RepMissing)*100.0)/FullRepLength; char s[32]; sprintf(s, "(%.0f%%)", Pct); SafeCat(s); } } return Str; } mem.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000204611447530637014355 0ustar ????????????????#include "piler2.h" #ifdef _MSC_VER #include #endif static int AllocatedBytes; static int PeakAllocatedBytes; // Allocate memory, fail on error, track total void *allocmem(int bytes) { assert(bytes < 0xfffffff0); // check for overlow char *p = (char *) malloc((size_t) (bytes + 4)); if (0 == p) Quit("Out of memory (%d)", bytes); int *pSize = (int *) p; *pSize = bytes; AllocatedBytes += bytes; if (AllocatedBytes > PeakAllocatedBytes) PeakAllocatedBytes = AllocatedBytes; return p + 4; } void freemem(void *p) { if (0 == p) return; int *pSize = (int *) ((char *) p - 4); int bytes = *pSize; assert(bytes <= AllocatedBytes); AllocatedBytes -= bytes; free(((char *) p) - 4); } void chkmem() { #ifdef _MSC_VER assert(_CrtCheckMemory()); #endif } void *reallocmem(void *p, int bytes) { if (0 == p) return allocmem(bytes); int *pSize = (int *) ((char *) p - 4); int oldbytes = *pSize; void *newp = allocmem(bytes); memcpy(newp, p, oldbytes); return newp; } options.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000624611447530640015272 0ustar ????????????????#include "piler2.h" struct VALUE_OPT { const char *m_pstrName; const char *m_pstrValue; }; struct FLAG_OPT { const char *m_pstrName; bool m_bSet; }; static VALUE_OPT ValueOpts[] = { "trs", 0, "trs2", 0, "trs2fasta", 0, "tanmotif2fasta", 0, "cons", 0, "arrays", 0, "out", 0, "piles", 0, "images", 0, "log", 0, "loga", 0, "seq", 0, "path", 0, "rep", 0, "mincover", 0, "maxlengthdiffpct", 0, "maxfam", 0, "label", 0, "annot", 0, "annotedge", 0, "famsize", 0, "prefix", 0, "tan", 0, "pyramid", 0, "motif", 0, "minhits", 0, "maxmargin", 0, "minratio", 0, "tr", 0, "cand", 0, "mintrspacing", 0, "maxtrspacing", 0, "mintrlength", 0, "minspacingratio", 0, "minfam", 0, "minhitratio", 0, "mindistpairs", 0, "crisp", 0, }; static int ValueOptCount = sizeof(ValueOpts)/sizeof(ValueOpts[0]); static FLAG_OPT FlagOpts[] = { "multihit", 0, "quiet", 0, "version", 0, "help", 0, "edge", 0, "pilesonly", 0, "exact", 0, }; static int FlagOptCount = sizeof(FlagOpts)/sizeof(FlagOpts[0]); void CommandLineError(const char *Format, ...) { va_list ArgList; va_start(ArgList, Format); char Str[4096]; vsprintf(Str, Format, ArgList); Usage(); fprintf(stderr, "\n** Command line error** %s\n", Str); exit(1); } static bool TestSetFlagOpt(const char *Arg) { for (int i = 0; i < FlagOptCount; ++i) if (!stricmp(Arg, FlagOpts[i].m_pstrName)) { FlagOpts[i].m_bSet = true; return true; } return false; } static bool TestSetValueOpt(const char *Arg, const char *Value) { for (int i = 0; i < ValueOptCount; ++i) if (!stricmp(Arg, ValueOpts[i].m_pstrName)) { if (0 == Value) CommandLineError("Option -%s must have value\n", Arg); ValueOpts[i].m_pstrValue = strsave(Value); return true; } return false; } bool FlagOpt(const char *Name) { for (int i = 0; i < FlagOptCount; ++i) if (!stricmp(Name, FlagOpts[i].m_pstrName)) return FlagOpts[i].m_bSet; Quit("FlagOpt(%s) invalid", Name); return false; } const char *ValueOpt(const char *Name) { for (int i = 0; i < ValueOptCount; ++i) if (!stricmp(Name, ValueOpts[i].m_pstrName)) return ValueOpts[i].m_pstrValue; Quit("ValueOpt(%s) invalid", Name); return 0; } const char *RequiredValueOpt(const char *Name) { const char *s = ValueOpt(Name); if (0 == s) CommandLineError("Required option -%s not specified\n", Name); return s; } void ProcessArgVect(int argc, char *argv[]) { for (int iArgIndex = 0; iArgIndex < argc; ) { const char *Arg = argv[iArgIndex]; if (Arg[0] != '-') Quit("Command-line option \"%s\" must start with '-'\n", Arg); const char *ArgName = Arg + 1; if (TestSetFlagOpt(ArgName)) { ++iArgIndex; continue; } char *Value = 0; if (iArgIndex < argc - 1) Value = argv[iArgIndex+1]; if (TestSetValueOpt(ArgName, Value)) { iArgIndex += 2; continue; } CommandLineError("Invalid command line option \"%s\"\n", ArgName); } } progress.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000264211447530641015440 0ustar ????????????????#include "piler2.h" #include static time_t g_StartTime = time(0); static time_t g_StartTimeStep; static char *g_Desc = 0; int GetElapsedSecs() { return (int) (time(0) - g_StartTime); } static void ProgressPrefix() { int ElapsedSecs = (int) (time(0) - g_StartTime); unsigned MemUseMB = (GetMemUseBytes() + 500000)/1000000; unsigned RAMMB = (unsigned) ((GetRAMSize() + 500000)/1000000); unsigned MemUsePct = (MemUseMB*100)/RAMMB; Log("%4d s %4d Mb (%3d%%) ", ElapsedSecs, MemUseMB, MemUsePct); if (!g_Quiet) fprintf(stderr, "%4d s %6d Mb (%3d%%) ", ElapsedSecs, MemUseMB, MemUsePct); } void ProgressStart(const char *Format, ...) { ProgressPrefix(); g_StartTimeStep = time(0); char Str[4096]; va_list ArgList; va_start(ArgList, Format); vsprintf(Str, Format, ArgList); if (g_Desc != 0) free(g_Desc); g_Desc = strsave(Str); Log("%s\n", Str); if (!g_Quiet) fprintf(stderr, "%s\n", Str); } void ProgressDone() { if (0 == g_Desc) return; ProgressPrefix(); int Secs = (int) (time(0) - g_StartTimeStep); Log("%s done (%d s).\n", g_Desc, Secs); if (!g_Quiet) fprintf(stderr, "%s done (%d s).\n", g_Desc, Secs); } void Progress(const char *Format, ...) { char Str[4096]; va_list ArgList; va_start(ArgList, Format); vsprintf(Str, Format, ArgList); Log("%s\n", Str); if (!g_Quiet) fprintf(stderr, "%s\n", Str); } quit.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000126211447530641014553 0ustar ????????????????#include "piler2.h" #ifdef WIN32 #define _WIN32_WINNT 0x0400 #include void Break() { DebugBreak(); } #endif // Exit immediately with error message, printf-style. void Quit(const char szFormat[], ...) { va_list ArgList; char szStr[4096]; va_start(ArgList, szFormat); vsprintf(szStr, szFormat, ArgList); fprintf(stderr, "\n*** FATAL ERROR *** %s %s\n", g_ProcessName, szStr); Log("\n*** FATAL ERROR *** "); Log("%s\n", szStr); #if DEBUG #ifdef WIN32 if (IsDebuggerPresent()) { int iBtn = MessageBox(NULL, szStr, "piler", MB_ICONERROR | MB_OKCANCEL); if (IDCANCEL == iBtn) Break(); } #endif #endif exit(1); } readafa.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000334711447530641015162 0ustar ????????????????#include "piler2.h" char *ReadAFA(FILE *f, int *ptrSeqLength, int *ptrSeqCount) { rewind(f); int FileSize = GetFileSize(f); int BufferSize = FileSize; char *Buffer = all(char, BufferSize); char prev_c = '\n'; bool InLabel = false; int Pos = 0; int SeqStart = 0; int SeqCount = 0; int SeqLength; for (;;) { int c = fgetc(f); if (EOF == c) { if (feof(f)) break; Quit("Stream error"); } if (InLabel) { if (c == '\r') continue; if ('\n' == c) InLabel = false; } else { if ('>' == c && '\n' == prev_c) { int ThisSeqLength = Pos - SeqStart; if (0 == SeqCount) ; else if (1 == SeqCount) SeqLength = ThisSeqLength; else if (SeqCount > 1) { if (SeqLength != ThisSeqLength) Quit("ReadAFA: %u seqs, sequence lengths differ (a) %d %d", SeqCount, SeqLength, ThisSeqLength); } ++SeqCount; SeqStart = Pos; InLabel = true; } else if (!isspace(c)) { if (Pos >= BufferSize) Quit("ReadAFA: buffer too small"); Buffer[Pos++] = (c); } } prev_c = c; } int ThisSeqLength = Pos - SeqStart; if (1 == SeqCount) SeqLength = ThisSeqLength; else { if (SeqLength != ThisSeqLength) Quit("ReadAFA: %u seqs, sequence lengths differ (b) %d %d", SeqCount, SeqLength, ThisSeqLength); } *ptrSeqCount = SeqCount; *ptrSeqLength = SeqLength; if (Pos != SeqCount*SeqLength) Quit("ReadAFA: Internal error"); return Buffer; } char *ReadAFA(const char FileName[], int *ptrSeqLength, int *ptrSeqCount) { FILE *f = OpenStdioFile(FileName); char *Seqs = ReadAFA(f, ptrSeqLength, ptrSeqCount); fclose(f); return Seqs; } readhits.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000645511447530641015405 0ustar ????????????????#include "piler2.h" // Destructive: pokes Rec.Attrs static char *GetTarget(GFFRecord &Rec, int *ptrStart, int *ptrEnd) { char *Attrs = Rec.Attrs; char *SemiColon = strchr(Attrs, ';'); if (0 != SemiColon) *SemiColon = 0; char *Space = strchr(Attrs, ' '); if (0 == Space) Quit("GFF hit record line %d, missing space in attrs", GFFLineNr); *Space = 0; if (0 != strcmp(Attrs, "Target")) Quit("GFF hit record line %d, expected Target as first attr", GFFLineNr); char *Label = Space + 1; Space = strchr(Label, ' '); if (0 == Space) Quit("GFF hit record line %d, missing space following Target label", GFFLineNr); *Space = 0; char *Start = Space + 1; Space = strchr(Start, ' '); if (0 == Space) Quit("GFF hit record line %d, missing space following Target start", GFFLineNr); *Space = 0; char *End = Space + 1; *ptrStart = atoi(Start); *ptrEnd = atoi(End); return Label; } static int ReadHitsPass1(FILE *f) { GFFLineNr = 0; int HitCount = 0; GFFRecord Rec; while (GetNextGFFRecord(f, Rec)) { if (0 != strcmp(Rec.Feature, "hit")) continue; if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End) Warning("GFF line %d: invalid start %d / end %d", GFFLineNr, Rec.Start, Rec.End); int TargetStart; int TargetEnd; const char *TargetLabel = GetTarget(Rec, &TargetStart, &TargetEnd); if (TargetStart <= 0 || TargetEnd <= 0 || TargetStart > TargetEnd) Warning("GFF line %d: invalid target start %d / end %d", GFFLineNr, Rec.Start, Rec.End); ++HitCount; AddContigPos(Rec.SeqName, Rec.End - 1); AddContigPos(TargetLabel, TargetEnd - 1); } return HitCount; } static int ReadHitsPass2(FILE *f, HitData *Hits) { rewind(f); GFFLineNr = 0; int HitCount = 0; GFFRecord Rec; while (GetNextGFFRecord(f, Rec)) { if (0 != strcmp(Rec.Feature, "hit")) continue; if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End) Warning("GFF line %d: invalid start %d / end %d", GFFLineNr, Rec.Start, Rec.End); int TargetStart; int TargetEnd; const char *TargetLabel = GetTarget(Rec, &TargetStart, &TargetEnd); if (TargetStart <= 0 || TargetEnd <= 0 || TargetStart > TargetEnd) Warning("GFF line %d: invalid target start %d / end %d", GFFLineNr, Rec.Start, Rec.End); HitData &Hit = Hits[HitCount]; const int QueryFrom = ContigToGlobal(Rec.Start, Rec.SeqName); const int TargetFrom = ContigToGlobal(TargetStart, TargetLabel); const int QueryLength = Rec.End - Rec.Start + 1; const int TargetLength = TargetEnd - TargetStart + 1; Hit.QueryFrom = QueryFrom; Hit.QueryTo = QueryFrom + QueryLength - 1; Hit.TargetFrom = TargetFrom; Hit.TargetTo = TargetFrom + TargetLength - 1; if (Rec.Strand == '+') Hit.Rev = false; else if (Rec.Strand == '-') Hit.Rev = true; else Quit("GFF line %d, Invalid strand %c", GFFLineNr, Rec.Strand); ++HitCount; } return HitCount; } HitData *ReadHits(const char *FileName, int *ptrHitCount, int *ptrSeqLength) { FILE *f = OpenStdioFile(FileName); int HitCount = ReadHitsPass1(f); int SeqLength = GlobalizeContigs(); MakeContigMap(); HitData *Hits = all(HitData, HitCount); ReadHitsPass2(f, Hits); fclose(f); *ptrSeqLength = SeqLength; *ptrHitCount = HitCount; return Hits; } readmfa.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000354711447530641015200 0ustar ????????????????#include "piler2.h" #define APPEND_CHAR(c) \ { \ if (Pos >= BufferSize) \ Quit("ReadMFA: buffer too small"); \ Buffer[Pos++] = (c); \ } #define APPEND_LABEL(c) \ { \ if (LabelLength >= LabelBufferLength-1) \ { \ LabelBufferLength += 128; \ char *NewLabel = all(char, LabelBufferLength); \ memcpy(NewLabel, Label, LabelLength); \ freemem(Label); \ Label = NewLabel; \ } \ Label[LabelLength] = c; \ ++LabelLength; \ } char *ReadMFA(FILE *f, int *ptrLength) { rewind(f); int FileSize = GetFileSize(f); int BufferSize = FileSize; char *Buffer = all(char, BufferSize); char prev_c = '\n'; bool InLabel = false; int ContigFrom = 0; char *Label = 0; int LabelLength = 0; int LabelBufferLength = 0; int Pos = 0; int ContigStart = 0; for (;;) { int c = fgetc(f); if (EOF == c) { if (feof(f)) break; Quit("Stream error"); } if (InLabel) { if (c == '\r') continue; if ('\n' == c) { Label[LabelLength] = 0; InLabel = false; } else { APPEND_LABEL(c) } } else { if ('>' == c && '\n' == prev_c) { int ContigLength = Pos - ContigStart; if (ContigLength > 0) AddContig(Label, ContigStart, ContigLength); ContigStart = Pos; InLabel = true; LabelLength = 0; } else if (!isspace(c)) { APPEND_CHAR(c) } } prev_c = c; } int ContigLength = Pos - ContigStart; if (ContigLength > 0) AddContig(Label, ContigStart, ContigLength); *ptrLength = Pos; SetSeqLength(Pos); return Buffer; } char *ReadMFA(const char FileName[], int *ptrSeqLength) { FILE *f = OpenStdioFile(FileName); char *Seq = ReadMFA(f, ptrSeqLength); fclose(f); return Seq; } readmotif.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000325611447530641015550 0ustar ????????????????#include "piler2.h" static int GetFam(GFFRecord &Rec) { char *Attrs = Rec.Attrs; char *Pyr = strstr(Attrs, "Pyramid"); if (0 == Pyr) Quit("GFF tandemmotif record line %d, failed to find Pyramid attr", GFFLineNr); // Pyramid 123 // 012345678 return atoi(Pyr + 8); } static int ReadMotifPass1(FILE *f) { GFFLineNr = 0; int MotifCount = 0; GFFRecord Rec; while (GetNextGFFRecord(f, Rec)) { if (0 != strcmp(Rec.Feature, "tandemmotif")) continue; if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End) Warning("GFF line %d: invalid start %d / end %d", GFFLineNr, Rec.Start, Rec.End); ++MotifCount; } return MotifCount; } static int ReadMotifPass2(FILE *f, MotifData *Motifs) { rewind(f); GFFLineNr = 0; int MotifCount = 0; GFFRecord Rec; while (GetNextGFFRecord(f, Rec)) { if (0 != strcmp(Rec.Feature, "tandemmotif")) continue; if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End) Warning("GFF line %d: invalid start %d / end %d", GFFLineNr, Rec.Start, Rec.End); int FamIndex = GetFam(Rec); MotifData &Motif = Motifs[MotifCount]; const int Length = Rec.End - Rec.Start + 1; Motif.ContigLabel = strsave(Rec.SeqName); Motif.ContigFrom = Rec.Start - 1; Motif.ContigTo = Motif.ContigFrom + Length - 1; Motif.FamIndex = FamIndex; ++MotifCount; } return MotifCount; } MotifData *ReadMotif(const char *FileName, int *ptrMotifCount) { FILE *f = OpenStdioFile(FileName); int MotifCount = ReadMotifPass1(f); MotifData *Motifs = all(MotifData, MotifCount); ReadMotifPass2(f, Motifs); fclose(f); *ptrMotifCount = MotifCount; return Motifs; } readreps.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000575211447530641015406 0ustar ????????????????#include "piler2.h" // Destructive: pokes Rec.Attrs static char *GetRepeat(GFFRecord &Rec) { char *Attrs = Rec.Attrs; char *Start = strstr(Attrs, "Repeat"); if (0 == Start) Quit("GFF line %d, repeat record does not have Repeat attribute", GFFLineNr); char *SemiColon = strchr(Start, ';'); if (0 != SemiColon) *SemiColon = 0; char *Space = strchr(Start, ' '); if (0 == Space) Quit("GFF repeat record line %d, missing space in attrs", GFFLineNr); return Space + 1; } // "Repeat HETRP_DM Satellite 1518 1669 0" // 0 1 2 3 4 void ParseRepeat(char *Str, RepData &Rep) { char *Fields[5]; int FieldCount = GetFields(Str, Fields, 5, ' '); if (FieldCount != 5) Quit("GFF line %d, Repeat attribute does not have 5 fields"); const char *RepeatName = Fields[0]; const char *RepeatClass = Fields[1]; const char *RepeatFrom = Fields[2]; const char *RepeatTo = Fields[3]; const char *RepeatLeft = Fields[4]; Rep.RepeatName = strsave(RepeatName); Rep.RepeatClass = strsave(RepeatClass); if (0 == strcmp(RepeatFrom, ".")) { Rep.RepeatFrom = -1; Rep.RepeatTo = -1; Rep.RepeatLeft = -1; } else { Rep.RepeatFrom = atoi(RepeatFrom) - 1; Rep.RepeatTo = atoi(RepeatTo) - 1; Rep.RepeatLeft = atoi(RepeatLeft); } } static int ReadRepsPass1(FILE *f) { GFFLineNr = 0; int RepCount = 0; GFFRecord Rec; while (GetNextGFFRecord(f, Rec)) { if (0 != strcmp(Rec.Feature, "repeat")) { static bool WarningIssued = false; if (!WarningIssued) { Warning("GFF record feature '%s' is not a repeat", Rec.Feature); WarningIssued = true; } continue; } if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End) Warning("GFF line %d: invalid start %d / end %d", GFFLineNr, Rec.Start, Rec.End); ++RepCount; } return RepCount; } static int ReadRepsPass2(FILE *f, RepData *Reps) { rewind(f); GFFLineNr = 0; int RepCount = 0; GFFRecord Rec; while (GetNextGFFRecord(f, Rec)) { if (0 != strcmp(Rec.Feature, "repeat")) continue; static char *Repeat = GetRepeat(Rec); RepData &Rep = Reps[RepCount]; ParseRepeat(Repeat, Rep); if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End) Warning("GFF line %d: invalid start %d / end %d", GFFLineNr, Rec.Start, Rec.End); const int Length = Rec.End - Rec.Start + 1; Rep.ContigLabel = strsave(Rec.SeqName); Rep.ContigFrom = Rec.Start - 1; Rep.ContigTo = Rep.ContigFrom + Length - 1; if (Rec.Strand == '+') Rep.Rev = false; else if (Rec.Strand == '-') Rep.Rev = true; else Quit("GFF line %d, Invalid strand %c", GFFLineNr, Rec.Strand); ++RepCount; } return RepCount; } RepData *ReadReps(const char *FileName, int *ptrRepCount) { FILE *f = OpenStdioFile(FileName); int RepCount = ReadRepsPass1(f); RepData *Reps = all(RepData, RepCount); ReadRepsPass2(f, Reps); fclose(f); *ptrRepCount = RepCount; return Reps; } readtrs.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000430111447530641015232 0ustar ????????????????#include "piler2.h" // Destructive: pokes Rec.Attrs static char *GetFam(GFFRecord &Rec) { char *Attrs = Rec.Attrs; char *SemiColon = strchr(Attrs, ';'); if (0 != SemiColon) *SemiColon = 0; char *Space = strchr(Attrs, ' '); if (0 == Space) Quit("GFF trs record line %d, missing space in attrs", GFFLineNr); *Space = 0; if (0 != strcmp(Attrs, "Family")) Quit("GFF trs record line %d, expected Family as first attr", GFFLineNr); return Space + 1; } static int ReadTRSPass1(FILE *f) { GFFLineNr = 0; int TRSCount = 0; GFFRecord Rec; while (GetNextGFFRecord(f, Rec)) { if (0 != strcmp(Rec.Feature, "trs")) continue; if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End) Warning("GFF line %d: invalid start %d / end %d", GFFLineNr, Rec.Start, Rec.End); ++TRSCount; } return TRSCount; } static int ReadTRSPass2(FILE *f, TRSData *TRSs) { rewind(f); GFFLineNr = 0; int TRSCount = 0; GFFRecord Rec; while (GetNextGFFRecord(f, Rec)) { if (0 != strcmp(Rec.Feature, "trs")) continue; if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End) Warning("GFF line %d: invalid start %d / end %d", GFFLineNr, Rec.Start, Rec.End); static char *Fam = GetFam(Rec); int FamIndex = 0; int SuperFamIndex = 0; int n = sscanf(Fam, "%d.%d", &FamIndex, &SuperFamIndex); if (n == 1) SuperFamIndex = -1; else if (n != 2) Quit("Invalid Family %s", Fam); TRSData &TRS = TRSs[TRSCount]; const int Length = Rec.End - Rec.Start + 1; TRS.ContigLabel = strsave(Rec.SeqName); TRS.ContigFrom = Rec.Start - 1; TRS.ContigTo = TRS.ContigFrom + Length - 1; TRS.FamIndex = FamIndex; TRS.SuperFamIndex = SuperFamIndex; if (Rec.Strand == '+') TRS.Rev = false; else if (Rec.Strand == '-') TRS.Rev = true; else Quit("GFF line %d, Invalid strand %c", GFFLineNr, Rec.Strand); ++TRSCount; } return TRSCount; } TRSData *ReadTRS(const char *FileName, int *ptrTRSCount) { FILE *f = OpenStdioFile(FileName); int TRSCount = ReadTRSPass1(f); TRSData *TRSs = all(TRSData, TRSCount); ReadTRSPass2(f, TRSs); fclose(f); *ptrTRSCount = TRSCount; return TRSs; } tan.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000003312711447530642014361 0ustar ????????????????#include "piler2.h" #define GFFRecord GFFRecord2 static int MIN_HIT_COUNT = 2; static double MAX_FRACT_MARGIN = 0.05; static double MIN_RATIO = 0.5; struct TanPile { char *Label; int From; int To; HitData *Hits; int HitCount; }; static TanPile *Piles; static int PileCount; static int PileBufferSize; static int PyramidIndex; FILE *fOut; FILE *fPyramid; FILE *fMotif; static inline int min3(int i, int j, int k) { return min(i, min(j, k)); } static inline int max3(int i, int j, int k) { return max(i, max(j, k)); } static void TouchPiles(int PileIndex) { if (PileIndex < PileBufferSize) { if (PileIndex >= PileCount) PileCount = PileIndex + 1; return; } int NewPileBufferSize = PileBufferSize + 10000; TanPile *NewPiles = all(TanPile, NewPileBufferSize); zero(NewPiles, TanPile, NewPileBufferSize); memcpy(NewPiles, Piles, PileCount*sizeof(TanPile)); Piles = NewPiles; PileBufferSize = NewPileBufferSize; PileCount = PileIndex + 1; } static void AssignHit(int PileIndex, const char *Label, int QueryFrom, int QueryTo, int TargetFrom, int TargetTo, bool Rev) { TanPile &Pile = Piles[PileIndex]; int HitIndex = Pile.HitCount; HitData &Hit = Pile.Hits[HitIndex]; Hit.QueryFrom = QueryFrom; Hit.QueryTo = QueryTo; Hit.TargetFrom = TargetFrom; Hit.TargetTo = TargetTo; Hit.Rev = Rev; ++(Pile.HitCount); } static void AddHit(int PileIndex, const char *Label, int QueryFrom, int QueryTo, int TargetFrom, int TargetTo, bool Rev) { TouchPiles(PileIndex); TanPile &Pile = Piles[PileIndex]; if (0 == Pile.HitCount) { Pile.Label = strsave(Label); Pile.From = min(QueryFrom, TargetFrom); Pile.To = max(QueryTo, TargetTo); } else { if (0 != strcmp(Label, Pile.Label)) Quit("Labels disagree"); Pile.From = min3(Pile.From, QueryFrom, TargetFrom); Pile.To = max3(Pile.To, QueryTo, TargetTo); } ++(Pile.HitCount); } bool TandemPair(const HitData &Hit1, const HitData &Hit2) { int Length1 = (Hit1.QueryTo - Hit1.QueryFrom + Hit1.TargetTo- Hit1.TargetFrom)/2; int Length2 = (Hit2.QueryTo - Hit2.QueryFrom + Hit2.TargetTo - Hit2.TargetFrom)/2; int ShorterLength = min(Length1, Length2); int LongerLength = max(Length1, Length2); if ((double) ShorterLength / (double) LongerLength < MIN_RATIO) return false; int StartDist = iabs(Hit1.TargetFrom - Hit2.TargetFrom); int EndDist = iabs(Hit1.QueryTo - Hit2.QueryTo); double StartMargin = (double) StartDist / (double) ShorterLength; double EndMargin = (double) EndDist / (double) ShorterLength; return StartMargin <= MAX_FRACT_MARGIN && EndMargin <= MAX_FRACT_MARGIN; } static void WritePyramid(int PyramidIndex, const char *Label, int From, int To) { if (0 == fPyramid) return; GFFRecord Rec; Rec.Source = "piler"; Rec.Feature = "pyramid"; Rec.Score = 0; Rec.Frame = -1; Rec.Strand = '.'; Rec.SeqName = Label; Rec.Start = From + 1; Rec.End = To + 1; char s[32]; sprintf(s, "PyramidIndex %d", PyramidIndex); Rec.Attrs = s; WriteGFFRecord(fPyramid, Rec); } static void InferMotif(int PyramidIndex, const TanPile &Pile, FamData *Fam) { const int MAX_PAIRS = 1024; int DiagDists[MAX_PAIRS]; int PairCount = 0; int MinDiagDist = -1; for (PtrFamData q1 = Fam->begin(); q1 != Fam->end(); ++q1) { FamMemberData &FamMember1 = *q1; int HitIndex1 = FamMember1.PileIndex; if (HitIndex1 < 0 || HitIndex1 >= Pile.HitCount) Quit("Hit index out of range"); const HitData &Hit1 = Pile.Hits[HitIndex1]; int Diag1Start = Hit1.TargetFrom - Hit1.QueryFrom; int Diag1End = Hit1.TargetTo - Hit1.QueryTo; int Diag1 = (Diag1Start + Diag1End)/2; PtrFamData q2 = q1; for (++q2; q2 != Fam->end(); ++q2) { FamMemberData &FamMember2 = *q2; int HitIndex2 = FamMember2.PileIndex; if (HitIndex2 < 0 || HitIndex2 >= Pile.HitCount) Quit("Hit index out of range"); const HitData &Hit2 = Pile.Hits[HitIndex2]; int Diag2Start = Hit2.TargetFrom - Hit2.QueryFrom; int Diag2End = Hit2.TargetTo - Hit2.QueryTo; int Diag2 = (Diag2Start + Diag2End)/2; int DiagDist = iabs(Diag1 - Diag2); if (-1 == MinDiagDist || DiagDist < MinDiagDist) MinDiagDist = DiagDist; DiagDists[PairCount++] = DiagDist; if (PairCount == MAX_PAIRS) goto Done; } } Done: // Find all dists that are no more than 10% bigger than MinDist. // The average of these distances is the estimated repeat length. int Count = 0; int Sum = 0; int MaxCandidateDist = (MinDiagDist*110)/100; for (int i = 0; i < PairCount; ++i) { const int Dist = DiagDists[i]; if (Dist <= MaxCandidateDist) { Sum += Dist; ++Count; } } if (0 == Count) Quit("Huh?"); const int EstimatedRepeatLength = Sum/Count; Log("\n"); Log("Pyramid %d: min dist = %d ", PyramidIndex, MinDiagDist); Log(" Count=%d Sum=%d Estd=%d\n", Count, Sum, EstimatedRepeatLength); GFFRecord Rec; Rec.Source = "piler"; Rec.Feature = "tandemmotif"; Rec.Score = 0; Rec.Frame = -1; Rec.SeqName = Pile.Label; Rec.Strand = '.'; // By definition, a Pyramid is set of hits for which QueryTo // and TargetFrom values are approximately equal. // We arbitrarily choose the canonical motif to end // at QueryTo. We output this motif and the Target motifs // to which it aligns. FamMemberData &FamMember = *(Fam->begin()); int HitIndex = FamMember.PileIndex; if (HitIndex < 0 || HitIndex >= Pile.HitCount) Quit("Hit index out of range"); const HitData &Hit = Pile.Hits[HitIndex]; const int MotifQueryTo = Hit.QueryTo; const int MotifQueryFrom = MotifQueryTo - EstimatedRepeatLength + 1; const int TargetTo = Hit.TargetTo; const int TargetFrom = TargetTo - EstimatedRepeatLength + 1; char s[1024]; sprintf(s, "Target %s %d %d ; Pyramid %d", Pile.Label, TargetFrom + 1, TargetTo + 1, PyramidIndex); Rec.Start = MotifQueryFrom + 1; Rec.End = MotifQueryTo + 1; Rec.Attrs = s; WriteGFFRecord(fMotif, Rec); for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q) { FamMemberData &FamMember = *q; int HitIndex = FamMember.PileIndex; if (HitIndex < 0 || HitIndex >= Pile.HitCount) Quit("Hit index out of range"); const HitData &Hit = Pile.Hits[HitIndex]; const int To = Hit.TargetTo; const int From = To - EstimatedRepeatLength + 1; char s[1024]; sprintf(s, "Target %s %d %d ; Pyramid %d", Pile.Label, MotifQueryFrom + 1, MotifQueryTo + 1, PyramidIndex); Rec.Start = From + 1; Rec.End = To + 1; Rec.Attrs = s; WriteGFFRecord(fMotif, Rec); } } static void LogPyramid(int PyramidIndex, const TanPile &Pile, FamData *Fam) { Log("\n"); Log("Pyramid %d\n", PyramidIndex); Log( " Label QStart1 QEnd1 TStart1 TEnd1 QStart1 QEnd1 TStart1 TEnd1 Diag Start End\n"); Log( "---------- -------- -------- -------- -------- -------- -------- -------- -------- ------ ------ ------\n"); for (PtrFamData q1 = Fam->begin(); q1 != Fam->end(); ++q1) { FamMemberData &FamMember1 = *q1; int HitIndex1 = FamMember1.PileIndex; if (HitIndex1 < 0 || HitIndex1 >= Pile.HitCount) Quit("Hit index out of range"); const HitData &Hit1 = Pile.Hits[HitIndex1]; int Diag1Start = Hit1.TargetFrom - Hit1.QueryFrom; int Diag1End = Hit1.TargetTo - Hit1.QueryTo; int Diag1 = (Diag1Start + Diag1End)/2; PtrFamData q2 = q1; for (++q2; q2 != Fam->end(); ++q2) { FamMemberData &FamMember2 = *q2; int HitIndex2 = FamMember2.PileIndex; if (HitIndex2 < 0 || HitIndex2 >= Pile.HitCount) Quit("Hit index out of range"); const HitData &Hit2 = Pile.Hits[HitIndex2]; int Diag2Start = Hit2.TargetFrom - Hit2.QueryFrom; int Diag2End = Hit2.TargetTo - Hit2.QueryTo; int Diag2 = (Diag2Start + Diag2End)/2; int DiagDist = iabs(Diag1 - Diag2); int StartDist = iabs(Hit1.TargetFrom - Hit2.TargetFrom); int EndDist = iabs(Hit1.QueryTo - Hit2.QueryTo); Log("%10.10s %8d %8d %8d %8d %8d %8d %8d %8d %6d", Pile.Label, Hit1.QueryFrom, Hit1.QueryTo, Hit1.TargetFrom, Hit1.TargetTo, Hit2.QueryFrom, Hit2.QueryTo, Hit2.TargetFrom, Hit2.TargetTo, DiagDist); Log(" %6d", StartDist); Log(" %6d", EndDist); Log("\n"); } } } static void FindPyramids(int PileIndex) { const TanPile &Pile = Piles[PileIndex]; const int HitCount = Pile.HitCount; if (HitCount < MIN_HIT_COUNT) return; EdgeList Edges; EdgeData Edge; Edge.Rev = false; const HitData *Hits = Pile.Hits; for (int HitIndex1 = 0; HitIndex1 < HitCount; ++HitIndex1) { const HitData &Hit1 = Hits[HitIndex1]; Edge.Node1 = HitIndex1; for (int HitIndex2 = 0; HitIndex2 < HitIndex1; ++HitIndex2) { const HitData &Hit2 = Hits[HitIndex2]; if (TandemPair(Hit1, Hit2)) { Edge.Node2 = HitIndex2; Edges.push_back(Edge); } } } FamList Fams; FindConnectedComponents(Edges, Fams, MIN_HIT_COUNT); if (0 == Fams.size()) return; GFFRecord Rec; Rec.Source = "piler"; Rec.Feature = "hit"; Rec.Score = 0; Rec.Frame = -1; Rec.SeqName = Pile.Label; for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p) { FamData *Fam = *p; if (0 != fMotif) InferMotif(PyramidIndex, Pile, Fam); LogPyramid(PyramidIndex, Pile, Fam); int PyramidFrom = -1; int PyramidTo = -1; for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q) { FamMemberData &FamMember = *q; int HitIndex = FamMember.PileIndex; if (HitIndex < 0 || HitIndex >= Pile.HitCount) Quit("Hit index out of range"); const HitData &Hit = Pile.Hits[HitIndex]; char s[1024]; sprintf(s, "Target %s %d %d ; Pile %d ; Pyramid %d", Pile.Label, Hit.TargetFrom + 1, Hit.TargetTo + 1, PileIndex, PyramidIndex); Rec.Strand = (Hit.Rev ? '-' : '+'); Rec.Start = Hit.QueryFrom + 1; Rec.End = Hit.QueryTo + 1; Rec.Attrs = s; WriteGFFRecord(fOut, Rec); if (PyramidFrom == -1) PyramidFrom = min(Hit.QueryFrom, Hit.TargetFrom); else PyramidFrom = min3(PyramidFrom, Hit.QueryFrom, Hit.TargetFrom); if (PyramidTo == -1) PyramidTo = max(Hit.QueryTo, Hit.TargetTo); else PyramidTo = max3(PyramidTo, Hit.QueryTo, Hit.TargetTo); } WritePyramid(PyramidIndex, Pile.Label, PyramidFrom, PyramidTo); ++PyramidIndex; } } void Tan() { // Image file annotated with from-to pile indexes // Produced by: // piler2 -trs banded_hits.gff -images mainband_images.gff const char *HitFileName = RequiredValueOpt("tan"); const char *OutFileName = RequiredValueOpt("out"); const char *PyramidFileName = ValueOpt("pyramid"); const char *MotifFileName = ValueOpt("motif"); const char *strMinHits = ValueOpt("minhits"); const char *strMaxMargin = ValueOpt("maxmargin"); const char *strMinRatio = ValueOpt("minratio"); if (0 != strMinHits) MIN_HIT_COUNT = atoi(strMinHits); if (0 != strMaxMargin) MAX_FRACT_MARGIN = atof(strMaxMargin); if (0 != strMinRatio) MIN_RATIO = atof(strMinRatio); FILE *fInput = OpenStdioFile(HitFileName); ProgressStart("Initialize piles"); GFFRecord Rec; int HitCount = 0; while (GetNextGFFRecord(fInput, Rec)) { if (0 != strcmp(Rec.Feature, "hit")) continue; int QueryPileIndex = -1; int TargetPileIndex = -1; ParsePilesAttrs(Rec.Attrs, &QueryPileIndex, &TargetPileIndex); if (QueryPileIndex != TargetPileIndex) continue; char TargetLabel[128]; int TargetStart; int TargetEnd; ParseTargetAttrs(Rec.Attrs, TargetLabel, sizeof(TargetLabel), &TargetStart, &TargetEnd); if (0 != strcmp(Rec.SeqName, TargetLabel)) Quit("Labels don't match"); const int QueryFrom = Rec.Start - 1; const int QueryTo = Rec.End - 1; const int TargetFrom = TargetStart - 1; const int TargetTo = TargetEnd - 1; const bool Rev = (Rec.Strand == '-'); AddHit(QueryPileIndex, Rec.SeqName, QueryFrom, QueryTo, TargetFrom, TargetTo, Rev); ++HitCount; } ProgressDone(); Progress("%d hits, %d piles", HitCount, PileCount); ProgressStart("Allocate piles"); for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex) { TanPile &Pile = Piles[PileIndex]; Pile.Hits = all(HitData, Pile.HitCount); Pile.HitCount = 0; } ProgressDone(); ProgressStart("Assign hits to piles"); Rewind(fInput); while (GetNextGFFRecord(fInput, Rec)) { if (0 != strcmp(Rec.Feature, "hit")) continue; int QueryPileIndex = -1; int TargetPileIndex = -1; ParsePilesAttrs(Rec.Attrs, &QueryPileIndex, &TargetPileIndex); if (QueryPileIndex != TargetPileIndex) continue; char TargetLabel[128]; int TargetStart; int TargetEnd; ParseTargetAttrs(Rec.Attrs, TargetLabel, sizeof(TargetLabel), &TargetStart, &TargetEnd); if (0 != strcmp(Rec.SeqName, TargetLabel)) Quit("Labels don't match"); const int QueryFrom = Rec.Start - 1; const int QueryTo = Rec.End - 1; const int TargetFrom = TargetStart - 1; const int TargetTo = TargetEnd - 1; const bool Rev = (Rec.Strand == '-'); AssignHit(QueryPileIndex, Rec.SeqName, QueryFrom, QueryTo, TargetFrom, TargetTo, Rev); } ProgressDone(); fOut = OpenStdioFile(OutFileName, FILEIO_MODE_WriteOnly); fPyramid = (0 == PyramidFileName ? 0 : OpenStdioFile(PyramidFileName, FILEIO_MODE_WriteOnly)); fMotif = (0 == PyramidFileName ? 0 : OpenStdioFile(MotifFileName, FILEIO_MODE_WriteOnly)); ProgressStart("Find pyramids"); for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex) FindPyramids(PileIndex); int PyramidCount = PyramidIndex; ProgressDone(); Progress("%d pyramids", PyramidCount); } tanmotif2fasta.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000613711447530642016522 0ustar ????????????????#include "piler2.h" static void LogRecords(const MotifData *Motifs, int MotifCount) { Log(" Contig From To Length Family\n"); Log("-------------------- ---------- ---------- ------ ------\n"); for (int MotifIndex = 0; MotifIndex < MotifCount; ++MotifIndex) { const MotifData &Motif = Motifs[MotifIndex]; Log("%20.20s %10d %10d %6d %d\n", Motif.ContigLabel, Motif.ContigFrom, Motif.ContigTo, Motif.ContigTo - Motif.ContigFrom + 1, Motif.FamIndex); } } static int CmpMotif(const void *s1, const void *s2) { const MotifData *Motif1 = (const MotifData *) s1; const MotifData *Motif2 = (const MotifData *) s2; return Motif1->FamIndex - Motif2->FamIndex; } static char *FamFileName(const char *Path, int Fam) { static char FileName[256]; char s[128]; sprintf(s, "/%d", Fam); if (strlen(Path) + strlen(s) + 3 >= sizeof(FileName)) Quit("Path name too long"); strcpy(FileName, Path); strcat(FileName, s); fprintf(stderr, "\n\n*** FILE NAME *** '%s'\n", FileName); return FileName; } static char *MotifLabel(const char *Prefix, const MotifData &Motif) { int n = (int) strlen(Motif.ContigLabel) + 128; char *s = all(char, n); if (0 != Prefix) sprintf(s, "%s.%d %s:%d", Prefix, Motif.FamIndex, Motif.ContigLabel, Motif.ContigFrom+1); else sprintf(s, "%d %s:%d", Motif.FamIndex, Motif.ContigLabel, Motif.ContigFrom+1); return s; } void Tanmotif2Fasta() { const char *MotifFileName = RequiredValueOpt("tanmotif2fasta"); const char *SeqFileName = RequiredValueOpt("seq"); const char *Path = ValueOpt("path"); const char *strMaxFam = ValueOpt("maxfam"); const char *Prefix = ValueOpt("prefix"); int MaxFam = DEFAULT_MAX_FAM; if (strMaxFam != 0) MaxFam = atoi(strMaxFam); if (0 == Path) Path = "."; ProgressStart("Reading seq file"); int SeqLength; const char *Seq = ReadMFA(SeqFileName, &SeqLength); ProgressDone(); Progress("Seq length %d bases, %.3g Mb", SeqLength, SeqLength/1e6); ProgressStart("Read Motif file"); int MotifCount; MotifData *Motifs = ReadMotif(MotifFileName, &MotifCount); ProgressDone(); Progress("%d records", MotifCount); ProgressStart("Sorting by family"); qsort((void *) Motifs, MotifCount, sizeof(MotifData), CmpMotif); ProgressDone(); FILE *f = 0; int CurrentFamily = -1; int MemberCount = 0; for (int MotifIndex = 0; MotifIndex < MotifCount; ++MotifIndex) { const MotifData &Motif = Motifs[MotifIndex]; if (Motif.FamIndex != CurrentFamily) { if (f != 0) fclose(f); char *FastaFileName = FamFileName(Path, Motif.FamIndex); f = OpenStdioFile(FastaFileName, FILEIO_MODE_WriteOnly); CurrentFamily = Motif.FamIndex; MemberCount = 0; } ++MemberCount; if (MemberCount > MaxFam) continue; const int From = ContigToGlobal(Motif.ContigFrom, Motif.ContigLabel); const int Length = Motif.ContigTo - Motif.ContigFrom + 1; char *Label = MotifLabel(Prefix, Motif); WriteFasta(f, Seq + From, Length, Label, false); freemem(Label); } } tr.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000002211211447530642014214 0ustar ????????????????#include "piler2.h" #if defined(DEBUG) && defined(_MSC_VER) #include #endif #define GFFRecord GFFRecord2 /*** Find LTR families. 1. LTR candidates ------===--------------------===------------ genome ^----------------------^ Hit Hit is candidate pair of LTRs bounding a candidate LINE if: (a) hit length >= MIN_LENGTH_LTR and <= MAX_LENGTH_LTR, and (b) offset is >= MIN_LENGTH_LINE and <= MAX_LENGTH_LINE. 2. Same family if a local alignment connects two LTR candidates of similar length. Cand A Cand B ------===........===----------===........===----- genome ^------------^ Hit ***/ static int MIN_LENGTH_LINE = 50; static int MAX_LENGTH_LINE = 12000; static int MIN_LENGTH_LTR = 50; static int MAX_LENGTH_LTR = 2000; static double MIN_LINE_RATIO = 0.9; static int MIN_FAM_SIZE = 3; static double MIN_HIT_LENGTH_RATIO = 0.5; static int MIN_DIST_EDGE = 50000; static EdgeList Edges; static double Ratio(int i, int j) { if (i == 0 && j == 0) return 0; if (i < j) return (double) i / (double) j; else return (double) j / (double) i; } static int GetHitLength(const HitData &Hit) { const int QueryHitLength = Hit.QueryTo - Hit.QueryFrom + 1; const int TargetHitLength = Hit.TargetTo - Hit.TargetFrom + 1; return (QueryHitLength + TargetHitLength)/2; } static int GetLTRLength(const HitData &Hit) { const int QueryLTRLength = Hit.QueryTo - Hit.QueryFrom + 1; const int TargetLTRLength = Hit.TargetTo - Hit.TargetFrom + 1; return (QueryLTRLength + TargetLTRLength)/2; } static int GetHitFrom(const HitData &Hit) { return min(Hit.QueryFrom, Hit.TargetFrom); } static int GetHitTo(const HitData &Hit) { return max(Hit.QueryTo, Hit.TargetTo); } static int GetLINELength(const HitData &Hit) { return GetHitTo(Hit) - GetHitFrom(Hit) + 1; } static bool IsCandLTR(const HitData &Hit) { const int LTRLength = GetLTRLength(Hit); if (LTRLength < MIN_LENGTH_LTR || LTRLength > MAX_LENGTH_LTR) return false; const int LINELength = GetLINELength(Hit); if (LINELength < MIN_LENGTH_LINE || LINELength > MAX_LENGTH_LINE) return false; return true; } static HitData *Cands; static int CandCount = 0; static int CandBufferSize = 0; static void AddCand(const HitData &Hit, IIX &IntervalIndex) { if (CandCount >= CandBufferSize) { CandBufferSize += 10000; Cands = reall(Cands, HitData, CandBufferSize); } int From; int To; if (Hit.QueryFrom < Hit.TargetFrom) { From = Hit.QueryFrom; To = Hit.TargetTo; } else { From = Hit.TargetFrom; To = Hit.TargetTo; } IntervalIndex.AddGlobal(From, To, CandCount); Cands[CandCount++] = Hit; } static bool TooClose(const GLIX &HitGlix, const HitData &Hit1, const HitData &Hit2) { const int Hit1From = Hit1.QueryFrom; const int Hit2From = Hit2.QueryFrom; int SeqFrom1; int SeqFrom2; const char *Label1 = HitGlix.GlobalToSeq(Hit1From, &SeqFrom1); const char *Label2 = HitGlix.GlobalToSeq(Hit2From, &SeqFrom2); if (0 != strcmp(Label1, Label2)) return false; return iabs(SeqFrom2 - SeqFrom1) < MIN_DIST_EDGE; } static bool IsEdge(const GLIX &HitGlix, const HitData &Hit, int i, int j) { if (i == j) return false; assert(i >= 0 && i < CandCount); assert(j >= 0 && j < CandCount); const HitData &Hit_i = Cands[i]; const HitData &Hit_j = Cands[j]; const int HitLength = GetHitLength(Hit); const int HitLength_i = GetHitLength(Hit_i); const int HitLength_j = GetHitLength(Hit_j); if (Ratio(HitLength, HitLength_i) < MIN_HIT_LENGTH_RATIO || Ratio(HitLength, HitLength_j) < MIN_HIT_LENGTH_RATIO || Ratio(HitLength_i, HitLength_j) < MIN_HIT_LENGTH_RATIO) return false; const int LINELength_i = GetLINELength(Hit_i); const int LINELength_j = GetLINELength(Hit_j); if (Ratio(LINELength_i, LINELength_j) < MIN_LINE_RATIO) return false; if (TooClose(HitGlix, Hit_i, Hit_j)) return false; return true; } static void FindEdges(const HitData &Hit, const GLIX &HitGlix, const IIX &IntervalIndex) { int *QueryMatches; int *TargetMatches; const int QueryMatchCount = IntervalIndex.LookupGlobal(Hit.QueryFrom, Hit.QueryTo, &QueryMatches); const int TargetMatchCount = IntervalIndex.LookupGlobal(Hit.TargetFrom, Hit.TargetTo, &TargetMatches); const int MatchCount = QueryMatchCount + TargetMatchCount; int *Matches = all(int, MatchCount); memcpy(Matches, QueryMatches, QueryMatchCount*sizeof(int)); memcpy(Matches + QueryMatchCount, TargetMatches, TargetMatchCount*sizeof(int)); for (int i = 0; i < MatchCount; ++i) { const int m = Matches[i]; if (m < 0 || m > CandCount) Quit("m=%d count=%d", m, CandCount); } for (int i = 0; i < MatchCount; ++i) { const int CandIndex1 = Matches[i]; for (int j = 0; j < i; ++j) { const int CandIndex2 = Matches[j]; if (IsEdge(HitGlix, Hit, CandIndex1, CandIndex2)) { EdgeData Edge; Edge.Node1 = CandIndex1; Edge.Node2 = CandIndex2; Edge.Rev = Hit.Rev; Edges.push_back(Edge); } } } } static void WriteOutputFile(FILE *fOut, const GLIX &HitGlix, FamList &Fams) { GFFRecord Rec; Rec.Feature = "tr"; Rec.Source = "piler"; Rec.Score = 0; Rec.Strand = '.'; Rec.Frame = -1; int FamIndex = 0; for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p) { FamData *Fam = *p; for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q) { FamMemberData &FamMember = *q; int CandIndex = FamMember.PileIndex; assert(CandIndex >= 0 && CandIndex < CandCount); const HitData &Hit = Cands[CandIndex]; const int GlobalFrom = GetHitFrom(Hit); const int GlobalTo = GetHitTo(Hit); const int Length = GlobalTo - GlobalFrom + 1; int SeqFrom; const char *Label = HitGlix.GlobalToSeq(GlobalFrom, &SeqFrom); int SeqTo = SeqFrom + Length - 1; Rec.SeqName = Label; Rec.Start = SeqFrom + 1; Rec.End = SeqTo + 1; Rec.Strand = (FamMember.Rev ? '-' : '+'); char Attrs[1024]; sprintf(Attrs, "Family %d ; Cand %d", FamIndex, CandIndex); Rec.Attrs = Attrs; WriteGFFRecord(fOut, Rec); } ++FamIndex; } } static void WriteCands(FILE *f, const GLIX &HitGlix) { GFFRecord Rec; for (int CandIndex = 0; CandIndex < CandCount; ++CandIndex) { const HitData &Hit = Cands[CandIndex]; char AnnotBuffer[1024]; HitToGFFRecord(HitGlix, Hit, Rec, AnnotBuffer); char s[4096]; sprintf(s, "%s ; Cand %d", Rec.Attrs, CandIndex); Rec.Attrs = s; WriteGFFRecord(f, Rec); } } void TR() { #if defined(DEBUG) && defined(_MSC_VER) _CrtSetDbgFlag(0); // too expensive #endif const char *HitFileName = RequiredValueOpt("tr"); const char *OutFileName = RequiredValueOpt("out"); const char *CandFileName = ValueOpt("cand"); const char *strMinTrSpacing = ValueOpt("mintrspacing"); const char *strMaxTrSpacing = ValueOpt("maxtrspacing"); const char *strMinTrLength = ValueOpt("mintrlength"); const char *strMaxTrLength = ValueOpt("minspacingratio"); const char *strMinFam = ValueOpt("minfam"); const char *strMinHitRatio = ValueOpt("minhitratio"); const char *strMinDistPairs = ValueOpt("mindistpairs"); if (0 != strMinTrSpacing) MIN_LENGTH_LINE = atoi(strMinTrSpacing); if (0 != strMaxTrSpacing) MAX_LENGTH_LINE = atoi(strMaxTrSpacing); if (0 != strMinTrLength) MIN_LENGTH_LTR = atoi(strMinTrLength); if (0 != strMaxTrLength) MAX_LENGTH_LTR = atoi(strMaxTrLength); if (0 != strMinFam) MIN_FAM_SIZE = atoi(strMinFam); if (0 != strMinHitRatio) MIN_HIT_LENGTH_RATIO = atoi(strMinHitRatio); if (0 != strMinDistPairs) MIN_DIST_EDGE = atoi(strMinDistPairs); FILE *fHit = OpenStdioFile(HitFileName, FILEIO_MODE_ReadOnly); ProgressStart("Index hits"); GLIX HitGlix; HitGlix.Init(); HitGlix.FromGFFFile(fHit); HitGlix.MakeGlobalToLocalIndex(); ProgressDone(); const int GlobalLength = HitGlix.GetGlobalLength(); IIX IntervalIndex; IntervalIndex.Init(GlobalLength); ProgressStart("Find candidate TRs"); Rewind(fHit); GFFRecord Rec; while (GetNextGFFRecord(fHit, Rec)) { HitData Hit; GFFRecordToHit(HitGlix, Rec, Hit); if (IsCandLTR(Hit)) AddCand(Hit, IntervalIndex); } ProgressDone(); Progress("%d candidates", CandCount); if (0 != CandFileName) { ProgressStart("Write candidates"); FILE *fCand = OpenStdioFile(CandFileName, FILEIO_MODE_WriteOnly); WriteCands(fCand, HitGlix); ProgressDone(); } ProgressStart("Make graph"); Rewind(fHit); while (GetNextGFFRecord(fHit, Rec)) { HitData Hit; GFFRecordToHit(HitGlix, Rec, Hit); FindEdges(Hit, HitGlix, IntervalIndex); } fclose(fHit); fHit = 0; ProgressDone(); Progress("%d edges", (int) Edges.size()); ProgressStart("Find families"); FamList Fams; FindConnectedComponents(Edges, Fams, MIN_FAM_SIZE); ProgressDone(); Progress("%d families", (int) Fams.size()); FILE *fOut = OpenStdioFile(OutFileName, FILEIO_MODE_WriteOnly); WriteOutputFile(fOut, HitGlix, Fams); } trs.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000004512211447530642014405 0ustar ????????????????#include "piler2.h" #include "bitfuncs.h" #define TRACE 0 static int g_paramMinFamSize = 3; static int g_paramMaxLengthDiffPct = 5; static bool g_paramSingleHitCoverage = true; static PileData *Piles; static int PileCount; static int EdgeCount; static int MaxImageCount = 0; static int SeqLength; static int SeqLengthChunks; static PILE_INDEX_TYPE *IdentifyPiles(int *CopyCount) { PILE_INDEX_TYPE *PileIndexes = all(PILE_INDEX_TYPE, SeqLengthChunks); #if DEBUG memset(PileIndexes, 0xff, SeqLengthChunks*sizeof(PILE_INDEX_TYPE)); #endif int PileIndex = -1; bool InPile = false; for (int i = 0; i < SeqLengthChunks; ++i) { if (BitIsSet(CopyCount, i)) { if (!InPile) { ++PileIndex; if (PileIndex > MAX_STACK_INDEX) Quit("Too many stacks"); InPile = true; } PileIndexes[i] = PileIndex; } else InPile = false; } PileCount = PileIndex + 1; return PileIndexes; } static void IncCopyCountImage(int *CopyCount, int From, int To) { if (From < 0) Quit("From < 0"); From /= CHUNK_LENGTH; To /= CHUNK_LENGTH; if (From >= SeqLengthChunks) { Warning("IncCopyCountImage: From=%d, SeqLength=%d", From, SeqLengthChunks); From = SeqLengthChunks - 1; } if (To >= SeqLengthChunks) { Warning("IncCopyCountImage: To=%d, SeqLength=%d", To, SeqLengthChunks); To = SeqLengthChunks - 1; } if (From > To) Quit("From > To"); for (int i = From; i <= To; ++i) SetBit(CopyCount, i); } static void IncCopyCount(int *CopyCount, const HitData &Hit) { IncCopyCountImage(CopyCount, Hit.TargetFrom, Hit.TargetTo); IncCopyCountImage(CopyCount, Hit.QueryFrom, Hit.QueryTo); } static int CmpHits(const void *ptrHit1, const void *ptrHit2) { HitData *Hit1 = (HitData *) ptrHit1; HitData *Hit2 = (HitData *) ptrHit2; return Hit1->QueryFrom - Hit2->QueryFrom; } static int CmpImages(const void *ptrImage1, const void *ptrImage2) { PileImageData *Image1 = (PileImageData *) ptrImage1; PileImageData *Image2 = (PileImageData *) ptrImage2; return Image1->SIPile - Image2->SIPile; } static void AssertImagesSorted(PileImageData *Images, int ImageCount) { for (int i = 0; i < ImageCount - 1; ++i) if (Images[i].SIPile > Images[i+1].SIPile) Quit("Images not sorted"); } static void SortImagesPile(PileImageData *Images, int ImageCount) { qsort(Images, ImageCount, sizeof(PileImageData), CmpImages); } static void SortImages() { for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex) { PileData &Pile = Piles[PileIndex]; SortImagesPile(Pile.Images, Pile.ImageCount); #if DEBUG AssertImagesSorted(Pile.Images, Pile.ImageCount); #endif } } static void CreatePiles(const HitData *Hits, int HitCount, PILE_INDEX_TYPE *PileIndexes) { Piles = all(PileData, PileCount); zero(Piles, PileData, PileCount); for (int i = 0; i < PileCount; ++i) { Piles[i].FamIndex = -1; Piles[i].SuperFamIndex = -1; Piles[i].Rev = -1; } // Count images in stack ProgressStart("Create stacks: count images"); for (int HitIndex = 0; HitIndex < HitCount; ++HitIndex) { const HitData &Hit = Hits[HitIndex]; int Pos = Hit.QueryFrom/CHUNK_LENGTH; PILE_INDEX_TYPE PileIndex = PileIndexes[Pos]; assert(PileIndex == PileIndexes[Hit.QueryTo/CHUNK_LENGTH]); assert(PileIndex >= 0 && PileIndex < PileCount); ++(Piles[PileIndex].ImageCount); Pos = Hit.TargetFrom/CHUNK_LENGTH; PileIndex = PileIndexes[Pos]; assert(PileIndex >= 0 && PileIndex < PileCount); assert(PileIndex == PileIndexes[Hit.TargetTo/CHUNK_LENGTH]); ++(Piles[PileIndex].ImageCount); } ProgressDone(); // Allocate memory for image list int TotalImageCount = 0; ProgressStart("Create stacks: allocate image memory"); for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex) { PileData &Pile = Piles[PileIndex]; const int ImageCount = Pile.ImageCount; TotalImageCount += ImageCount; assert(ImageCount > 0); Pile.Images = all(PileImageData, ImageCount); } ProgressDone(); // Build image list ProgressStart("Create stacks: build image list"); for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex) { PileData &Pile = Piles[PileIndex]; Pile.ImageCount = 0; Pile.From = -1; Pile.To = -1; } for (int HitIndex = 0; HitIndex < HitCount; ++HitIndex) { const HitData &Hit = Hits[HitIndex]; const bool Rev = Hit.Rev; const int Length1 = Hit.QueryTo - Hit.QueryFrom; const int Length2 = Hit.TargetTo - Hit.TargetFrom; const int From1 = Hit.QueryFrom; const int From2 = Hit.TargetFrom; const int To1 = Hit.QueryTo; const int To2 = Hit.TargetTo; const int Pos1 = From1/CHUNK_LENGTH; const int Pos2 = From2/CHUNK_LENGTH; PILE_INDEX_TYPE PileIndex1 = PileIndexes[Pos1]; PILE_INDEX_TYPE PileIndex2 = PileIndexes[Pos2]; assert(PileIndex1 == PileIndexes[(From1 + Length1 - 1)/CHUNK_LENGTH]); assert(PileIndex1 >= 0 && PileIndex1 < PileCount); assert(PileIndex2 == PileIndexes[(From2 + Length2 - 1)/CHUNK_LENGTH]); assert(PileIndex2 >= 0 && PileIndex2 < PileCount); PileData &Pile1 = Piles[PileIndex1]; PileImageData &Image1 = Pile1.Images[Pile1.ImageCount++]; Image1.SILength = Length2; Image1.SIPile = PileIndex2; Image1.SIRev = Rev; PileData &Pile2 = Piles[PileIndex2]; PileImageData &Image2 = Pile2.Images[Pile2.ImageCount++]; Image2.SILength = Length1; Image2.SIPile = PileIndex1; Image2.SIRev = Rev; if (Pile1.From == -1 || From1 < Pile1.From) Pile1.From = From1; if (Pile1.To == -1 || To1 > Pile1.To) Pile1.To = To1; if (Pile2.From == -1 || From2 < Pile2.From) Pile2.From = From2; if (Pile2.To == -1 || To2 > Pile2.To) Pile2.To = To2; if (Pile1.ImageCount > MaxImageCount) MaxImageCount = Pile1.ImageCount; if (Pile2.ImageCount > MaxImageCount) MaxImageCount = Pile2.ImageCount; } ProgressDone(); } static int FindGlobalEdgesPileMulti(PileData &Pile, int PileIndex, PILE_INDEX_TYPE Partners[], bool PartnersRev[]) { const int PileLength = Pile.To - Pile.From + 1; const int MinLength = (PileLength*(100 - g_paramMaxLengthDiffPct))/100; const int MaxLength = (PileLength*(100 + g_paramMaxLengthDiffPct))/100; const int ImageCount = Pile.ImageCount; SortImagesPile(Pile.Images, ImageCount); #if TRACE Log("Pile1 Pile2 Pile1L Pile2L Fract1 Fract2 Global [Multi]\n"); Log("------ ------ ------- ------- ------ ------ ------\n"); #endif int CurrentPartnerPileIndex = -1; int BasesCovered = 0; int PartnerCount = 0; for (int ImageIndex = 0; ; ++ImageIndex) { int PartnerPileIndex; int ImageLength = 0; bool Rev = false; if (ImageIndex < ImageCount) { const PileImageData &Image = Pile.Images[ImageIndex]; PartnerPileIndex = Image.SIPile; ImageLength = Image.SILength; Rev = Image.SIRev; } else PartnerPileIndex = -1; if (PartnerPileIndex == CurrentPartnerPileIndex) BasesCovered += ImageLength; else { if (CurrentPartnerPileIndex != -1) { const PileData &PartnerPile = Piles[CurrentPartnerPileIndex]; const int PartnerPileLength = PartnerPile.To - PartnerPile.From + 1; bool IsGlobalMatch = PartnerPileLength >= MinLength && PartnerPileLength <= MaxLength && BasesCovered >= MinLength && PartnerPileIndex != PileIndex; #if TRACE Log("%6d %6d %7d %7d %5.0f%% %5.0f%% %c\n", PileIndex, CurrentPartnerPileIndex, PileLength, PartnerPileLength, (BasesCovered*100.0)/PileLength, (BasesCovered*100.0)/PartnerPileLength, IsGlobalMatch ? 'Y' : 'N'); #endif if (IsGlobalMatch) { PartnersRev[PartnerCount] = Rev; // TODO Partners[PartnerCount] = CurrentPartnerPileIndex; ++PartnerCount; } } CurrentPartnerPileIndex = PartnerPileIndex; BasesCovered = ImageLength; } if (ImageIndex == ImageCount) break; } return PartnerCount; } static int FindGlobalEdgesPileSingle(PileData &Pile, int PileIndex, PILE_INDEX_TYPE Partners[], bool PartnersRev[]) { const int ImageCount = Pile.ImageCount; const int PileLength = Pile.To - Pile.From + 1; const int MinLength = (PileLength*(100 - g_paramMaxLengthDiffPct))/100; const int MaxLength = (PileLength*(100 + g_paramMaxLengthDiffPct))/100; #if TRACE Log("Pile1 Pile2 Pile1L Pile2L Fract1 Fract2 Global [Single]\n"); Log("------ ------ ------- ------- ------ ------ ------\n"); #endif int PartnerCount = 0; for (int ImageIndex = 0; ImageIndex < ImageCount; ++ImageIndex) { const PileImageData &Image = Pile.Images[ImageIndex]; const int PartnerImageLength = Image.SILength; const int PartnerPileIndex = Image.SIPile; const PileData &PartnerPile = Piles[PartnerPileIndex]; const int PartnerPileLength = PartnerPile.To - PartnerPile.From + 1; bool IsGlobalImage = PartnerPileLength >= MinLength && PartnerPileLength <= MaxLength && PartnerImageLength >= MinLength && PartnerImageLength <= MaxLength && PartnerPileIndex != PileIndex; #if TRACE Log("%6d %6d %7d %7d %5.0f%% %5.0f%% %c\n", PileIndex, PartnerPileIndex, PileLength, PartnerPileLength, (PartnerImageLength*100.0)/PileLength, (PartnerImageLength*100.0)/PartnerPileLength, IsGlobalImage ? 'Y' : 'N'); #endif if (IsGlobalImage) { PartnersRev[PartnerCount] = Image.SIRev; Partners[PartnerCount] = PartnerPileIndex; ++PartnerCount; } } return PartnerCount; } static void AddEdges(EdgeList &Edges, PILE_INDEX_TYPE PileIndex, PILE_INDEX_TYPE Partners[], bool PartnersRev[], int PartnerCount) { EdgeCount += PartnerCount; for (int i = 0; i < PartnerCount; ++i) { int PileIndex2 = Partners[i]; EdgeData Edge; Edge.Node1 = PileIndex; Edge.Node2 = PileIndex2; Edge.Rev = PartnersRev[i]; Edges.push_back(Edge); } } static void FindGlobalEdges(EdgeList &Edges, int MaxImageCount) { Edges.clear(); PILE_INDEX_TYPE *Partners = all(PILE_INDEX_TYPE, MaxImageCount); bool *PartnersRev = all(bool, MaxImageCount); for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex) { PileData &Pile = Piles[PileIndex]; int PartnerCount; if (g_paramSingleHitCoverage) PartnerCount = FindGlobalEdgesPileSingle(Pile, PileIndex, Partners, PartnersRev); else PartnerCount = FindGlobalEdgesPileMulti(Pile, PileIndex, Partners, PartnersRev); AddEdges(Edges, PileIndex, Partners, PartnersRev, PartnerCount); } freemem(Partners); freemem(PartnersRev); } static void AssignFamsToPiles(FamList &Fams) { int FamIndex = 0; for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p) { FamData *Fam = *p; for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q) { FamMemberData &FamMember = *q; int PileIndex = FamMember.PileIndex; PileData &Pile = Piles[PileIndex]; Pile.FamIndex = FamIndex; Pile.Rev = (int) FamMember.Rev; } ++FamIndex; } } static inline unsigned TriangleSubscript(unsigned FamCount, unsigned i, unsigned j) { assert(i >= 0 && j >= 0 && i < FamCount && j < FamCount); unsigned v; if (i >= j) v = j + (i*(i - 1))/2; else v = i + (j*(j - 1))/2; assert(v < (FamCount*(FamCount - 1))/2); return v; } static void FindSuperFamEdges(FamList &Fams, EdgeList &Edges) { const int FamCount = (int) Fams.size(); // Allocate triangular array Related[i][j], value is true // iff families i and j are related (i.e., there is a local // alignment between some member of i and some member of j). const int TriangleSize = (FamCount*(FamCount - 1))/2; bool *Related = all(bool, TriangleSize); zero(Related, bool, TriangleSize); for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p) { FamData *Fam = *p; for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q) { FamMemberData &FamMember = *q; int PileIndex = FamMember.PileIndex; const PileData &Pile = Piles[PileIndex]; const int FamIndex = Pile.FamIndex; if (-1 == FamIndex) continue; const int ImageCount = Pile.ImageCount; for (int ImageIndex = 0; ImageIndex < ImageCount; ++ImageIndex) { const PileImageData &Image = Pile.Images[ImageIndex]; const int PartnerPileIndex = Image.SIPile; if (PartnerPileIndex == PileIndex) continue; const PileData &PartnerPile = Piles[PartnerPileIndex]; const int PartnerFamIndex = PartnerPile.FamIndex; if (-1 == PartnerFamIndex || PartnerFamIndex == FamIndex) continue; const int Index = TriangleSubscript(FamCount, FamIndex, PartnerFamIndex); assert(Index >= 0 && Index < TriangleSize); Related[Index] = true; } } } Edges.clear(); for (int i = 0; i < FamCount; ++i) for (int j = i + 1; j < FamCount; ++j) { const int Index = TriangleSubscript(FamCount, i, j); if (Related[Index]) { // Log("R %d %d\n", i, j); EdgeData Edge; Edge.Node1 = i; Edge.Node2 = j; Edge.Rev = false; Edges.push_back(Edge); } } } static void AssignSuperFamsToPiles(FamList &Fams, FamList &SuperFams) { const int FamCount = (int) Fams.size(); FamData **FamVect = all(FamData *, FamCount); int FamIndex = 0; for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p) { FamVect[FamIndex] = *p; ++FamIndex; } int SuperFamIndex = 0; for (PtrFamList pSF = SuperFams.begin(); pSF != SuperFams.end(); ++pSF) { FamData &SFFams = *(*pSF); for (PtrFamData p = SFFams.begin(); p != SFFams.end(); ++p) { FamMemberData &FamMember = *p; int FamIndex = FamMember.PileIndex; assert(FamIndex >= 0 && FamIndex < FamCount); FamData *Fam = FamVect[FamIndex]; for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q) { FamMemberData &FamMember = *q; int PileIndex = FamMember.PileIndex; assert(PileIndex >= 0 && PileIndex < PileCount); PileData &Pile = Piles[PileIndex]; assert(Pile.FamIndex == FamIndex); Pile.SuperFamIndex = SuperFamIndex; } } ++SuperFamIndex; } } static void FindSingletonSuperFams(FamList &Fams, FamList &SuperFams) { const int FamCount = (int) Fams.size(); FamData **FamVect = all(FamData *, FamCount); bool *FamAssigned = all(bool, FamCount); int FamIndex = 0; for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p) { FamVect[FamIndex] = *p; FamAssigned[FamIndex] = false; ++FamIndex; } // Flag families that have been assigned to superfamilies for (PtrFamList pSF = SuperFams.begin(); pSF != SuperFams.end(); ++pSF) { FamData &SFFams = *(*pSF); for (PtrFamData p = SFFams.begin(); p != SFFams.end(); ++p) { FamMemberData &FamMember = *p; int FamIndex = FamMember.PileIndex; assert(FamIndex >= 0 && FamIndex < FamCount); FamAssigned[FamIndex] = true; } } // Create new superfamily for each unassigned family for (int FamIndex = 0; FamIndex < FamCount; ++FamIndex) { if (FamAssigned[FamIndex]) continue; FamMemberData Fam; Fam.PileIndex = FamIndex; Fam.Rev = false; FamData *SuperFam = new FamData; SuperFam->push_back(Fam); SuperFams.push_back(SuperFam); } // Validate int SuperFamIndex = 0; for (PtrFamList pSF = SuperFams.begin(); pSF != SuperFams.end(); ++pSF) { FamData &SFFams = *(*pSF); for (PtrFamData p = SFFams.begin(); p != SFFams.end(); ++p) { FamMemberData &FamMember = *p; int FamIndex = FamMember.PileIndex; assert(FamIndex >= 0 && FamIndex < FamCount); FamData *Fam = FamVect[FamIndex]; for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q) { FamMemberData &FamMember = *q; int PileIndex = FamMember.PileIndex; if (PileIndex == 5354) Log(""); PileData &Pile = Piles[PileIndex]; assert(Pile.FamIndex == FamIndex); } } ++SuperFamIndex; } } void TRS() { const char *InputFileName = RequiredValueOpt("trs"); const char *OutputFileName = ValueOpt("out"); const char *PilesFileName = ValueOpt("piles"); const char *ImagesFileName = ValueOpt("images"); const char *strMinFamSize = ValueOpt("famsize"); const char *strMaxLengthDiffPct = ValueOpt("maxlengthdiffpct"); g_paramSingleHitCoverage = !FlagOpt("multihit"); if (0 == OutputFileName && 0 == PilesFileName && 0 == ImagesFileName) Quit("No output file specified, must be at least one of -out, -piles, -images"); if (0 != strMinFamSize) g_paramMinFamSize = atoi(strMinFamSize); if (0 != strMaxLengthDiffPct) g_paramMaxLengthDiffPct = atoi(strMaxLengthDiffPct); Log("singlehit=%s famsize=%d maxlengthdiffpct=%d\n", g_paramSingleHitCoverage ? "True" : "False", g_paramMinFamSize, g_paramMaxLengthDiffPct); ProgressStart("Read hit file"); int HitCount; int SeqLength; HitData *Hits = ReadHits(InputFileName, &HitCount, &SeqLength); ProgressDone(); Progress("%d hits", HitCount); SeqLengthChunks = (SeqLength + CHUNK_LENGTH - 1)/CHUNK_LENGTH; const int BitVectorLength = (SeqLengthChunks + BITS_PER_INT - 1)/BITS_PER_INT; int *CopyCount = all(int, BitVectorLength); zero(CopyCount, int, BitVectorLength); ProgressStart("Compute copy counts"); for (int i = 0; i < HitCount; ++i) IncCopyCount(CopyCount, Hits[i]); ProgressDone(); ProgressStart("Identify piles"); PILE_INDEX_TYPE *PileIndexes = IdentifyPiles(CopyCount); ProgressDone(); Progress("%d stacks", PileCount); freemem(CopyCount); CopyCount = 0; CreatePiles(Hits, HitCount, PileIndexes); if (0 != ImagesFileName) { ProgressStart("Writing images file"); WriteImages(ImagesFileName, Hits, HitCount, PileIndexes); ProgressDone(); } freemem(Hits); Hits = 0; if (0 != PilesFileName) { ProgressStart("Writing piles file"); WritePiles(PilesFileName, Piles, PileCount); ProgressDone(); } freemem(PileIndexes); PileIndexes = 0; if (0 == OutputFileName) return; ProgressStart("Find edges"); EdgeList Edges; FindGlobalEdges(Edges, MaxImageCount); ProgressDone(); Progress("%d edges", (int) Edges.size()); ProgressStart("Find families"); FamList Fams; FindConnectedComponents(Edges, Fams, g_paramMinFamSize); AssignFamsToPiles(Fams); ProgressDone(); Progress("%d families", (int) Fams.size()); ProgressStart("Find superfamilies"); EdgeList SuperEdges; FindSuperFamEdges(Fams, SuperEdges); FamList SuperFams; FindConnectedComponents(SuperEdges, SuperFams, 1); FindSingletonSuperFams(Fams, SuperFams); AssignSuperFamsToPiles(Fams, SuperFams); ProgressDone(); Progress("%d superfamilies", (int) SuperFams.size()); ProgressStart("Write TRS output file"); WriteTRSFile(OutputFileName, Piles, PileCount); ProgressDone(); } trs2fasta.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000676211447530642015515 0ustar ????????????????#include "piler2.h" static void LogRecords(const TRSData *TRSs, int TRSCount) { Log(" Contig From To Length + Family\n"); Log("-------------------- ---------- ---------- ------ - ------\n"); for (int TRSIndex = 0; TRSIndex < TRSCount; ++TRSIndex) { const TRSData &TRS = TRSs[TRSIndex]; Log("%20.20s %10d %10d %6d %c %d.%d\n", TRS.ContigLabel, TRS.ContigFrom, TRS.ContigTo, TRS.ContigTo - TRS.ContigFrom + 1, TRS.Rev ? '-' : '+', TRS.SuperFamIndex, TRS.FamIndex); } } static int CmpTRS(const void *s1, const void *s2) { const TRSData *TRS1 = (const TRSData *) s1; const TRSData *TRS2 = (const TRSData *) s2; return TRS1->FamIndex - TRS2->FamIndex; } static char *FamFileName(const char *Path, int Fam, int SuperFam) { static char FileName[256]; char s[128]; if (SuperFam == -1) sprintf(s, "/%d", Fam); else sprintf(s, "/%d.%d", SuperFam, Fam); if (strlen(Path) + strlen(s) + 3 >= sizeof(FileName)) Quit("Path name too long"); strcpy(FileName, Path); strcat(FileName, s); return FileName; } static char *TRSLabel(const char *Prefix, const TRSData &TRS) { int n = (int) strlen(TRS.ContigLabel) + 128; char *s = all(char, n); if (TRS.SuperFamIndex == -1) { if (0 != Prefix) sprintf(s, "%s.%d %s:%d%c", Prefix, TRS.FamIndex, TRS.ContigLabel, TRS.ContigFrom+1, TRS.Rev ? '-' : '+'); else sprintf(s, "%d.%s:%d%c", TRS.FamIndex, TRS.ContigLabel, TRS.ContigFrom+1, TRS.Rev ? '-' : '+'); } else { if (0 != Prefix) sprintf(s, "%s.%d.%d %s:%d%c", Prefix, TRS.SuperFamIndex, TRS.FamIndex, TRS.ContigLabel, TRS.ContigFrom+1, TRS.Rev ? '-' : '+'); else sprintf(s, "%d.%d %s:%d%c", TRS.SuperFamIndex, TRS.FamIndex, TRS.ContigLabel, TRS.ContigFrom+1, TRS.Rev ? '-' : '+'); } return s; } void TRS2Fasta() { const char *TRSFileName = RequiredValueOpt("trs2fasta"); const char *SeqFileName = RequiredValueOpt("seq"); const char *Path = ValueOpt("path"); const char *strMaxFam = ValueOpt("maxfam"); const char *Prefix = ValueOpt("prefix"); int MaxFam = DEFAULT_MAX_FAM; if (strMaxFam != 0) MaxFam = atoi(strMaxFam); if (0 == Path) Path = "."; ProgressStart("Reading seq file"); int SeqLength; const char *Seq = ReadMFA(SeqFileName, &SeqLength); ProgressDone(); Progress("Seq length %d bases, %.3g Mb", SeqLength, SeqLength/1e6); ProgressStart("Read TRS file"); int TRSCount; TRSData *TRSs = ReadTRS(TRSFileName, &TRSCount); ProgressDone(); Progress("%d records", TRSCount); ProgressStart("Sorting by family"); qsort((void *) TRSs, TRSCount, sizeof(TRSData), CmpTRS); ProgressDone(); FILE *f = 0; int CurrentFamily = -1; int MemberCount = 0; for (int TRSIndex = 0; TRSIndex < TRSCount; ++TRSIndex) { const TRSData &TRS = TRSs[TRSIndex]; if (TRS.FamIndex != CurrentFamily) { if (f != 0) fclose(f); char *FastaFileName = FamFileName(Path, TRS.FamIndex, TRS.SuperFamIndex); f = OpenStdioFile(FastaFileName, FILEIO_MODE_WriteOnly); CurrentFamily = TRS.FamIndex; MemberCount = 0; } ++MemberCount; if (MemberCount > MaxFam) continue; const int From = ContigToGlobal(TRS.ContigFrom, TRS.ContigLabel); const int Length = TRS.ContigTo - TRS.ContigFrom + 1; char *Label = TRSLabel(Prefix, TRS); WriteFasta(f, Seq + From, Length, Label, TRS.Rev); freemem(Label); } } usage.cpp0000755€ÿÿÿÿ€ÿÿÿÿ0000000203711447530643014700 0ustar ????????????????#include "piler2.h" void Credits() { static bool Displayed = false; if (Displayed) return; fprintf(stderr, "\n" PILER_LONG_VERSION "\n"); fprintf(stderr, "http://www.drive5.com/piler\n"); fprintf(stderr, "Written by Robert C. Edgar\n"); fprintf(stderr, "This software is donated to the public domain.\n"); fprintf(stderr, "Please visit web site for requested citation.\n"); Displayed = true; } void Usage() { Credits(); fprintf(stderr, "\n" "Usage:\n" " piler -trs -out [options]\n" " piler -trs2fasta -seq [-path ] [-maxfam ]\n" " piler -cons -out -label