pax_global_header00006660000000000000000000000064136124115250014512gustar00rootroot0000000000000052 comment=199fdeaeece6d6a2f507f314063dd90f8a6c6420 gffread-0.11.7/000077500000000000000000000000001361241152500131765ustar00rootroot00000000000000gffread-0.11.7/LICENSE000066400000000000000000000020601361241152500142010ustar00rootroot00000000000000MIT License Copyright (c) 2001-2018 Geo Pertea Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. gffread-0.11.7/Makefile000066400000000000000000000056651361241152500146520ustar00rootroot00000000000000GCLDIR := $(if $(GCLDIR),$(GCLDIR),../gclib) SEARCHDIRS := -I. 
-I${GCLDIR} SYSTYPE := $(shell uname) CXX := $(if $(CXX),$(CXX),g++) LINKER := $(if $(LINKER),$(LINKER),g++) LDFLAGS := $(if $(LDFLAGS),$(LDFLAGS),-g) BASEFLAGS := -Wall -Wextra ${SEARCHDIRS} -D_FILE_OFFSET_BITS=64 \ -D_LARGEFILE_SOURCE -D_REENTRANT -fno-strict-aliasing \ -std=c++0x -fno-exceptions -fno-rtti GCCV8 := $(shell expr `g++ -dumpversion | cut -f1 -d.` \>= 8) ifeq "$(GCCV8)" "1" BASEFLAGS += -Wno-class-memaccess endif CXXFLAGS := $(if $(CXXFLAGS),$(BASEFLAGS) $(CXXFLAGS),$(BASEFLAGS)) ifneq (,$(filter %release %static, $(MAKECMDGOALS))) # -- release build CXXFLAGS := -g -O3 -DNDEBUG $(CXXFLAGS) else ifneq (,$(filter %profile %gprof %prof, $(MAKECMDGOALS))) CXXFLAGS += -pg -O0 -DNDEBUG LDFLAGS += -pg else #CXXFLAGS += -g -O0 -DNDEBUG CXXFLAGS += -g -O0 -DDEBUG -D_DEBUG -DGDEBUG endif ifneq (,$(filter %memcheck %memdebug, $(MAKECMDGOALS))) #use sanitizer in gcc 4.9+ MEMCHECK_BUILD := 1 GCCVER49 := $(shell expr `g++ -dumpversion | cut -f1,2 -d.` \>= 4.9) ifeq "$(GCCVER49)" "0" $(error gcc version 4.9 or greater is required for this build target) endif CXXFLAGS += -fno-omit-frame-pointer -fsanitize=undefined -fsanitize=address GCCVER5 := $(shell expr `g++ -dumpversion | cut -f1 -d.` \>= 5) ifeq "$(GCCVER5)" "1" CXXFLAGS += -fsanitize=bounds -fsanitize=float-divide-by-zero -fsanitize=vptr CXXFLAGS += -fsanitize=float-cast-overflow -fsanitize=object-size #CXXFLAGS += -fcheck-pointer-bounds -mmpx endif CXXFLAGS += -fno-common -fstack-protector LIBS := -lasan -lubsan -ldl $(LIBS) else #just plain debug build DEBUG_BUILD := 1 endif endif #ifneq (,$(filter %memtrace %memusage %memuse, $(MAKECMDGOALS))) # CXXFLAGS += -DGMEMTRACE # OBJS += ${GDIR}/proc_mem.o #endif #ifdef DEBUG_BUILD # #$(warning Building DEBUG version of stringtie.. ) # DBG_WARN=@echo # DBG_WARN+='WARNING: built DEBUG version [much slower], use "make clean release" for a faster, optimized version of the program.' 
#endif %.o : %.cpp ${CXX} ${CXXFLAGS} -c $< -o $@ # C/C++ linker OBJS := ${GCLDIR}/GBase.o ${GCLDIR}/GArgs.o ${GCLDIR}/GFaSeqGet.o \ ${GCLDIR}/gdna.o ${GCLDIR}/codons.o ${GCLDIR}/gff.o ${GCLDIR}/GStr.o \ ${GCLDIR}/GFastaIndex.o gff_utils.o .PHONY : all nodebug: release all release debug memcheck memdebug profile gprof prof: gffread $(OBJS) : $(GCLDIR)/GBase.h $(GCLDIR)/gff.h gffread.o : gff_utils.h $(GCLDIR)/GBase.h $(GCLDIR)/gff.h gff_utils.o : gff_utils.h $(GCLDIR)/gff.h ${GCLDIR}/gff.o : ${GCLDIR}/gff.h ${GCLDIR}/GFaSeqGet.h ${GCLDIR}/GList.hh ${GCLDIR}/GHash.hh ${GCLDIR}/GFaSeqGet.o : ${GCLDIR}/GFaSeqGet.h gffread: $(OBJS) gffread.o ${LINKER} ${LDFLAGS} -o $@ ${filter-out %.a %.so, $^} ${LIBS} # @echo # ${DBG_WARN} # target for removing all object files .PHONY : clean clean: @${RM} gffread gffread.o* gffread.exe $(OBJS) @${RM} core.* gffread-0.11.7/README.md000066400000000000000000000007651361241152500144650ustar00rootroot00000000000000# gffread GFF/GTF parsing utility providing format conversions, region filtering, FASTA sequence extraction and more. Use gffread -h to check the usage options. Compiling this program from source requires my other code library, [GCLib](../../../gclib). It can be done like this: ``` cd /some/build/dir git clone https://github.com/gpertea/gclib git clone https://github.com/gpertea/gffread cd gffread make release ``` This should build the **gffread** binary in the current directory. 
gffread-0.11.7/gff_utils.cpp000066400000000000000000000667701361241152500157040ustar00rootroot00000000000000#include "gff_utils.h" bool verbose=false; //same with GffReader::showWarnings and GffLoader::beVserbose //bool debugState=false; /* void printTabFormat(FILE* f, GffObj* t) { static char dbuf[1024]; fprintf(f, "%s\t%s\t%c\t%d\t%d\t%d\t", t->getID(), t->getGSeqName(), t->strand, t->start, t->end, t->exons.Count()); t->printExonList(f); if (t->hasCDS()) fprintf(f, "\t%d:%d", t->CDstart, t->CDend); else fprintf(f, "\t."); if (t->getGeneID()!=NULL) fprintf(f, "\tgeneID=%s",t->getGeneID()); if (t->getGeneName()!=NULL) { GffObj::decodeHexChars(dbuf, t->getGeneName()); fprintf(f, "\tgene_name=%s", dbuf); } if (t->attrs!=NULL) { for (int i=0;iattrs->Count();i++) { const char* attrname=t->getAttrName(i); GffObj::decodeHexChars(dbuf, t->attrs->Get(i)->attr_val); fprintf(f,"\t%s=%s", attrname, dbuf); } } fprintf(f, "\n"); } */ void printFasta(FILE* f, GStr& defline, char* seq, int seqlen, bool useStar) { if (seq==NULL) return; int len=(seqlen>0)?seqlen:strlen(seq); if (len<=0) return; if (!defline.is_empty()) fprintf(f, ">%s\n",defline.chars()); int ilen=0; for (int i=0; i < len; i++, ilen++) { if (ilen == 70) { fputc('\n', f); ilen = 0; } if (useStar && seq[i]=='.') putc('*', f); else putc(seq[i], f); } //for fputc('\n', f); } int qsearch_gloci(uint x, GList& loci) { //binary search //do the simplest tests first: if (loci[0]->start>x) return 0; if (loci.Last()->start>1; istart=loci[i]->start; if (istart < x) l = i + 1; else { if (istart == x) { //found matching coordinate here idx=i; while (idx<=maxh && loci[idx]->start==x) { idx++; } return (idx>maxh) ? -1 : idx; } h = i - 1; } } //while idx = l; while (idx<=maxh && loci[idx]->start<=x) { idx++; } return (idx>maxh) ? 
-1 : idx; } int qsearch_rnas(uint x, GList& rnas) { //binary search //do the simplest tests first: if (rnas[0]->start>x) return 0; if (rnas.Last()->start>1; istart=rnas[i]->start; if (istart < x) l = i + 1; else { if (istart == x) { //found matching coordinate here idx=i; while (idx<=maxh && rnas[idx]->start==x) { idx++; } return (idx>maxh) ? -1 : idx; } h = i - 1; } } //while idx = l; while (idx<=maxh && rnas[idx]->start<=x) { idx++; } return (idx>maxh) ? -1 : idx; } int cmpRedundant(GffObj& a, GffObj& b) { if (a.exons.Count()==b.exons.Count()) { if (a.covlen==b.covlen) { return strcmp(a.getID(), b.getID()); } else return (a.covlen>b.covlen)? 1 : -1; } else return (a.exons.Count()>b.exons.Count())? 1: -1; } bool tMatch(GffObj& a, GffObj& b) { //strict intron chain match, or single-exon perfect match int imax=a.exons.Count()-1; int jmax=b.exons.Count()-1; int ovlen=0; if (imax!=jmax) return false; //different number of introns if (imax==0) { //single-exon mRNAs //if (equnspl) { //fuzz match for single-exon transfrags: // it's a match if they overlap at least 80% of max len ovlen=a.exons[0]->overlapLen(b.exons[0]); int maxlen=GMAX(a.covlen,b.covlen); return (ovlen>=maxlen*0.8); /*} else { //only exact match ovlen=a.covlen; return (a.exons[0]->start==b.exons[0]->start && a.exons[0]->end==b.exons[0]->end); }*/ } //check intron overlaps ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1; ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start; for (int i=1;i<=imax;i++) { if (ilen(); if ((a.exons[i-1]->end!=b.exons[i-1]->end) || (a.exons[i]->start!=b.exons[i]->start)) { return false; //intron mismatch } } return true; } bool GffLoader::unsplContained(GffObj& ti, GffObj& tj) { //returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons //but it does not cross any intron-exon boundary of tj int imax=ti.exons.Count()-1; int jmax=tj.exons.Count()-1; if (imax>0) GError("Error: bad unsplContained() call, 1st parameter must be single-exon 
transcript!\n"); if (fuzzSpan) { int maxIntronOvl=dOvlSET ? 25 : 0; //int minovl = dOvlSET ? 5 : (int)(0.8 * ti.len()); //minimum overlap to declare "redundancy" for (int j=0;j<=jmax;j++) { bool exonOverlap=false; if (dOvlSET) { exonOverlap= (tj.exons[j]->overlapLen(ti.start-1, ti.end+1) > 0); } else { exonOverlap=(ti.overlapLen(tj.exons[j])>=0.8 * ti.len()); } if (exonOverlap) { //must not overlap the introns if ((j>0 && ti.start+maxIntronOvlstart) || (jtj.exons[j]->end+maxIntronOvl)) return false; return true; } } //for each exon } else { // not fuzzSpan, strict containment required for (int j=0;j<=jmax;j++) { if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start) return true; } } return false; } GffObj* GffLoader::redundantTranscripts(GffObj& ti, GffObj& tj) { // matchAllIntrons==true: transcripts are considered "redundant" only if // they have the exact same number of introns and same splice sites (or none) // (single-exon transcripts should be also fully contained to be considered matching) // matchAllIntrons==false: an intron chain could be a subset of a "container" chain, // as long as no intron-exon boundaries are violated; also, a single-exon // transcript will be collapsed if it's contained in one of the exons of the another transcript // fuzzSpan==false: the genomic span of one transcript MUST BE contained in or equal with the genomic // span of the other // // fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained // (i.e. they may extend each-other in opposite directions) //if redundancy is detected, the "bigger" transcript is returned (otherwise NULL is returned) int adj=dOvlSET ? 1 : 0; if (ti.start>tj.end+adj || tj.start>ti.end+adj || (tj.strand!='.' && ti.strand!='.' 
&& tj.strand!=ti.strand)) return NULL; //no span overlap int imax=ti.exons.Count()-1; int jmax=tj.exons.Count()-1; GffObj* bigger=NULL; GffObj* smaller=NULL; if (matchAllIntrons) { //full intron chain match expected, or full containment for SET if (imax!=jmax) return NULL; //must have the same number of exons! if (ti.covlen>tj.covlen) { bigger=&ti; if (!fuzzSpan && (ti.start>tj.start || ti.endti.start || tj.endend!=tj.exons[i]->end || ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL; } return bigger; } //--- matchAllIntrons==false: intron-chain containment is also considered redundancy int minlen=0; if (ti.covlen>tj.covlen) { if (tj.exons.Count()>ti.exons.Count()) { //exon count override bigger=&tj; smaller=&ti; } else { bigger=&ti; smaller=&tj; } //maxlen=ti.covlen; minlen=tj.covlen; } else { //tj has more bases covered if (ti.exons.Count()>tj.exons.Count()) { //exon count override bigger=&ti; smaller=&tj; } else { bigger=&tj; smaller=&ti; } //maxlen=tj.covlen; minlen=ti.covlen; } if (imax==0 && jmax==0) { //single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other if (fuzzSpan) { if (dOvlSET) { return (ti.exons[0]->overlapLen(tj.exons[0]->start-1, tj.exons[0]->end+1)>0) ? bigger : NULL; } else { return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL; } } else { //boundary containment required return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL; } } //containment is also considered redundancy if (smaller->exons.Count()==1) { //check if this single exon is contained in any of tj exons //without violating any intron-exon boundaries return (unsplContained(*smaller, *bigger) ? 
bigger : NULL); } //--- from here on: both are multi-exon transcripts: imax>0 && jmax>0 if (ti.exons[imax]->startend || tj.exons[jmax]->startend ) return NULL; //intron chains do not overlap at all //checking full intron chain containment uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries int i=1; //exon idx to the right of the current intron of ti int j=1; //exon idx to the right of the current intron of tj //find the first intron overlap: while (i<=imax && j<=jmax) { eistart=ti.exons[i-1]->end; eiend=ti.exons[i]->start; ejstart=tj.exons[j-1]->end; ejend=tj.exons[j]->start; if (ejendstart>smaller->start || bigger->end < smaller->end)) return NULL; if ((i>1 && j>1) || i>imax || j>jmax) { return NULL; //either no intron overlaps found at all //or it's not the first intron for at least one of the transcripts } if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match int maxIntronOvl=dOvlSET ? 25 : 0; if (j>i) { //i==1, ti's start must not conflict with the previous intron of tj if (ti.start+maxIntronOvlstart) return NULL; //comment out the line above if you just want "intron compatibility" (i.e. 
extension of intron chains ) //so i's first intron starts AFTER j's first intron // then j must contain i, so i's last intron must end with or before j's last intron if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL; } else if (i>j) { //j==1, tj's start must not conflict with the previous intron of ti if (tj.start+maxIntronOvlstart) return NULL; //comment out the line above for just "intronCompatible()" check (allowing extension of intron chain) //so j's intron chain starts AFTER i's // then i must contain j, so j's last intron must end with or before j's last intron if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL; } //now check if the rest of the introns overlap, in the same sequence i++; j++; while (i<=imax && j<=jmax) { if (ti.exons[i-1]->end!=tj.exons[j-1]->end || ti.exons[i]->start!=tj.exons[j]->start) return NULL; i++; j++; } i--; j--; if (i==imax && jtj.exons[j]->end+maxIntronOvl) return NULL; } else if (j==jmax && iti.exons[i]->end+maxIntronOvl) return NULL; } return bigger; } int gseqCmpName(const pointer p1, const pointer p2) { return strcmp(((GenomicSeqData*)p1)->gseq_name, ((GenomicSeqData*)p2)->gseq_name); } void printLocus(GffLocus* loc, const char* pre) { if (pre!=NULL) fprintf(stderr, "%s", pre); GMessage(" [%d-%d] : ", loc->start, loc->end); GMessage("%s",loc->rnas[0]->getID()); for (int i=1;irnas.Count();i++) { GMessage(",%s",loc->rnas[i]->getID()); } GMessage("\n"); } void preserveContainedCDS(GffObj* tcontainer, GffObj* t) { //transfer contained CDS info to the container if t has a CDS but container does not if (!t->hasCDS()) return; if (!tcontainer->hasCDS())//no CDS info on container, just copy it from the contained tcontainer->setCDS(t); } bool exonOverlap2Gene(GffObj* t, GffObj& g) { if (t->exons.Count()>0) { return t->exonOverlap(g.start, g.end); } else return g.overlap(*t); } bool GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata) { bool keep=false; GTData* tdata=NULL; //int tidx=-1; /* if (debug) { 
GMessage(">>Placing transcript %s\n", t->getID()); debugState=true; } else debugState=false; */ //dumb TRNA case for RefSeq: gene parent link missing //try to restore it here; BUT this only works if gene feature comes first ////DEBUG ONLY: //if (strcmp(t->getID(),"id24448")==0) { //&& t->start==309180) { // GMessage("placeGf %s (%d, %d) (%d exons)\n", t->getID(),t->start, t->end, t->exons.Count()); //} //GMessage("DBG>>Placing transcript %s(%d-%d, %d exons)\n", t->getID(), t->start, t->end, t->exons.Count()); if (t->parent==NULL && t->isTranscript() && trAdoption) { int gidx=gdata->gfs.Count()-1; while (gidx>=0 && gdata->gfs[gidx]->end>=t->start) { GffObj& g = *(gdata->gfs[gidx]); //try to find a container gene object for this transcript //if (g.isGene() && t->strand==g.strand && exonOverlap2Gene(t, g)) { if (g.isGene() && (t->strand=='.' || t->strand==g.strand) && g.exons.Count()==0 && t->start>=g.start && t->end<=g.end) { if (g.children.IndexOf(t)<0) g.children.Add(t); keep=true; if (tdata==NULL) { tdata=new GTData(t); //additional transcript data gdata->tdata.Add(tdata); } t->parent=&g; //disable printing of gene if transcriptsOnly and --keep-genes wasn't given if (transcriptsOnly && !keepGenes) { T_NO_PRINT(g.udata); //tag it as non-printable //keep gene ID and Name into transcript, when we don't print genes const char* geneName=g.getAttr("Name"); if (t->getAttr("Name")==NULL && geneName) { t->addAttr("Name", geneName); if (t->getAttr("gene_name")==NULL) t->addAttr("gene_name", geneName); } t->addAttr("geneID", g.getID()); } break; } --gidx; } } bool noexon_gfs=false; if (t->exons.Count()>0) { //treating this entry as a transcript gdata->rnas.Add(t); //added it in sorted order if (tdata==NULL) { tdata=new GTData(t); //additional transcript data gdata->tdata.Add(tdata); } keep=true; } else { if (t->isGene() || !this->transcriptsOnly) { gdata->gfs.Add(t); keep=true; //GTData* tdata=new GTData(t); //additional transcript data if (tdata==NULL) { tdata=new 
GTData(t); //additional transcript data gdata->tdata.Add(tdata); } noexon_gfs=true; //gene-like record, no exons defined keep=true; } else { return false; //nothing to do with these non-transcript objects } } if (!doCluster) return keep; if (!keep) return false; //---- place into a locus if (dOvlSET && t->exons.Count()==1) { //for single exon transcripts temporarily set the strand to '.' //so we can check both strands for overlap/locus T_SET_OSTRAND(t->udata, t->strand); t->strand='.'; } if (gdata->loci.Count()==0) { gdata->loci.Add(new GffLocus(t)); return true; //new locus on this ref seq } //--- look for any existing loci overlapping t uint t_end=t->end; uint t_start=t->start; if (dOvlSET) { t_end++; t_start--; } int nidx=qsearch_gloci(t_end, gdata->loci); //get index of nearest locus starting just ABOVE t->end //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx); if (nidx==0) { //cannot have any overlapping loci //if (debug) GMessage(" <start, t->end); gdata->loci.Add(new GffLocus(t)); return true; } if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end int lfound=0; //count of parent loci GArray mrgloci(false); GList tloci(true); //candidate parent loci to adopt this //if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1); for (int l=nidx-1;l>=0;l--) { GffLocus& loc=*(gdata->loci[l]); if ((loc.strand=='+' || loc.strand=='-') && t->strand!='.'&& loc.strand!=t->strand) continue; if (t_start>loc.end) { if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already continue; } if (loc.start>t_end) { //this should never be the case if nidx was found correctly GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID()); continue; } if (loc.add_gfobj(t, dOvlSET)) { //will add this transcript to loc lfound++; mrgloci.Add(l); if (collapseRedundant && !noexon_gfs) { //compare to every single transcript in this locus for (int ti=0;tiuptr); //GMessage(" ..redundant check vs overlapping 
transcript %s\n",loc.rnas[ti]->getID()); GffObj* container=NULL; if (odata->replaced_by==NULL && (container=redundantTranscripts(*t, *(loc.rnas[ti])))!=NULL) { if (container==t) { odata->replaced_by=t; preserveContainedCDS(t, loc.rnas[ti]); } else {// t is being replaced by previously defined transcript tdata->replaced_by=loc.rnas[ti]; preserveContainedCDS(loc.rnas[ti], t); } } }//for each transcript in the exon-overlapping locus } //if doCollapseRedundant } //overlapping locus } //for each existing locus if (lfound==0) { //overlapping loci not found, create a locus with only this mRNA int addidx=gdata->loci.Add(new GffLocus(t)); if (addidx<0) { //should never be the case! GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end); } } else { //found at least one overlapping locus lfound--; int locidx=mrgloci[lfound]; GffLocus& loc=*(gdata->loci[locidx]); //last locus index found is also the smallest index if (lfound>0) { //more than one loci found parenting this mRNA, merge loci /* if (debug) GMessage(" merging %d loci \n",lfound); */ for (int l=0;lloci[mlidx]), t); gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove } } int i=locidx; while (i>0 && loc<*(gdata->loci[i-1])) { //bubble down until it's in the proper order i--; gdata->loci.Swap(i,i+1); } }//found at least one overlapping locus return true; } void collectLocusData(GList& ref_data, bool covInfo) { int locus_num=0; for (int g=0;gloci.Count();l++) { GffLocus& loc=*(gdata->loci[l]); GHash gnames(true); //gene names in this locus //GHash geneids(true); //Entrez GeneID: numbers GHash geneids(true); int fstrand=0,rstrand=0,ustrand=0; for (int i=0;i0 && rstrand>0) || (fstrand==0 && rstrand==0)) loc.strand='.'; else if (fstrand==0 && rstrand>0) loc.strand='-'; else loc.strand='+'; for (int i=0;if_bases+=loc.mexons[m].len(); else if (loc.strand=='-') gdata->r_bases+=loc.mexons[m].len(); else gdata->u_bases+=loc.mexons[m].len(); } } locus_num++; 
loc.locus_num=locus_num; if (gnames.Count()>0) { //collect all gene names associated to this locus gnames.startIterate(); int* gfreq=NULL; char* key=NULL; while ((gfreq=gnames.NextData(key))!=NULL) { loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq)); } } //added collected gene_names if (geneids.Count()>0) { //collect all GeneIDs names associated to this locus geneids.startIterate(); int* gfreq=NULL; char* key=NULL; while ((gfreq=geneids.NextData(key))!=NULL) { loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq)); } } } //for each locus }//for each genomic sequence } void GffLoader::loadRefNames(GStr& flst) { //load the whole file and split by (' \t\n\r,' int64_t fsize=fileSize(flst.chars()); if (fsize<0) GError("Error: could not get file size for %s !\n", flst.chars()); GStr slurp("", fsize+1); //sanity check for file size? FILE* f=fopen(flst.chars(), "r"); if (f==NULL) GError("Error: could not open file %s !\n", flst.chars()); slurp.read(f, NULL); fclose(f); slurp.startTokenize(" ,;\t\r\n", tkCharSet); GStr refname; while (slurp.nextToken(refname)) { if (refname.is_empty()) continue; names->gseqs.addName(refname.chars()); } } GenomicSeqData* getGSeqData(GList& seqdata, int gseq_id) { int i=-1; GenomicSeqData f(gseq_id); GenomicSeqData* gdata=NULL; if (seqdata.Found(&f,i)) gdata=seqdata[i]; else { //entry not created yet for this genomic seq gdata=new GenomicSeqData(gseq_id); seqdata.Add(gdata); } return gdata; } void warnPseudo(GffObj& m) { GMessage("Info: pseudo gene/transcript record with ID=%s discarded.\n",m.getID()); } void GffLoader::load(GList& seqdata, GFValidateFunc* gf_validate, GFFCommentParser* gf_parsecomment) { if (f==NULL) GError("Error: GffLoader::load() cannot be called before ::openFile()!\n"); GffReader* gffr=new GffReader(f, this->transcriptsOnly, true); //not only mRNA features, sorted clearHeaderLines(); gffr->showWarnings(verbose); // keepAttrs mergeCloseExons noExonAttr gffr->gene2Exon(gene2exon); if (BEDinput) gffr->isBED(true); //if (TLFinput) 
gffr->isTLF(true); gffr->mergeCloseExons(mergeCloseExons); gffr->keepAttrs(fullAttributes, gatherExonAttrs, keep_AllExonAttrs); gffr->keepGenes(keepGenes); gffr->setIgnoreLocus(ignoreLocus); gffr->setRefAlphaSorted(this->sortRefsAlpha); if (keepGff3Comments && gf_parsecomment!=NULL) gffr->setCommentParser(gf_parsecomment); gffr->readAll(); GVec pseudoFeatureIds; //feature type: pseudo* GVec pseudoAttrIds; // attribute: [is]pseudo*=true/yes/1 GVec pseudoTypeAttrIds; // attribute: *_type=pseudo* if (this->noPseudo) { GffNameList& fnames = GffObj::names->feats; //gffr->names->feats; for (int i=0;iname; if (startsWith(n, "pseudo")) { pseudoFeatureIds.Add(fnames[i]->idx); } } GffNameList& attrnames = GffObj::names->attrs;//gffr->names->attrs; for (int i=0;iname; if (endsiWith(n, "type")) { pseudoTypeAttrIds.Add(attrnames[i]->idx); }// else { char* p=strifind(n, "pseudo"); if (p==n || (p==n+2 && tolower(n[0])=='i' && tolower(n[1])=='s') || (p==n+3 && startsiWith(n, "is_")) ) { pseudoAttrIds.Add(attrnames[i]->idx); } //} } } //int redundant=0; //redundant annotation discarded if (verbose) GMessage(" .. 
loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars()); //int rna_deleted=0; //add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates for (int k=0;kgflst.Count();k++) { GffObj* m=gffr->gflst[k]; if (strcmp(m->getFeatureName(), "locus")==0 && m->getAttr("transcripts")!=NULL) { continue; //discard locus meta-features } if (this->noPseudo) { bool is_pseudo=false; for (int i=0;iftype_id) { is_pseudo=true; break; } } if (is_pseudo) { if (verbose) warnPseudo(*m); continue; } for (int i=0;iattrs!=NULL) attrv=m->attrs->getAttr(pseudoAttrIds[i]); if (attrv!=NULL) { char fc=tolower(attrv[0]); if (fc=='t' || fc=='y' || fc=='1') { is_pseudo=true; break; } } } if (is_pseudo) { if (verbose) warnPseudo(*m); continue; } // *type=*_pseudogene //find all attributes ending with _type and have value like: *_pseudogene for (int i=0;iattrs!=NULL) attrv=m->attrs->getAttr(pseudoTypeAttrIds[i]); if (attrv!=NULL && (startsWith(attrv, "pseudogene") || endsWith(attrv, "_pseudogene")) ) { is_pseudo=true; break; } } if (is_pseudo) { if (verbose) warnPseudo(*m); continue; } } //pseudogene detection requested char* rloc=m->getAttr("locus"); if (rloc!=NULL && startsWith(rloc, "RLOC_")) { m->removeAttr("locus", rloc); } if (forceExons) { m->subftype_id=gff_fid_exon; } //GList gfadd(false,false); -- for gf_validate()? if (gf_validate!=NULL && !(*gf_validate)(m, NULL)) { continue; } m->isUsed(true); //so the gffreader won't destroy it GenomicSeqData* gdata=getGSeqData(seqdata, m->gseq_id); bool keep=placeGf(m, gdata); if (!keep) { m->isUsed(false); //DEBUG //GMessage("Feature %s(%d-%d) is going to be discarded..\n",m->getID(), m->start, m->end); } } //for each read gffObj //if (verbose) GMessage(" .. 
%d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars()); //if (f && f!=stdin) { fclose(f); f=NULL; } delete gffr; } gffread-0.11.7/gff_utils.h000066400000000000000000000504301361241152500153330ustar00rootroot00000000000000#ifndef GFF_UTILS_H #define GFF_UTILS_H #include "gff.h" #include "GStr.h" #include "GFaSeqGet.h" extern bool verbose; extern bool debugMode; typedef bool GFValidateFunc(GffObj* gf, GList* gfadd); //test if a transcript should be printed (and not printed yet) #define T_PRINTABLE(d) (((d) & 0x100)==0) //set a transcript to not be printed #define T_NO_PRINT(d) d |= 0x100 //test if a duplicate transcript should be shown in the duplicate info file #define T_DUPSHOWABLE(d) (((d) & 0x200)==0) //set a duplicate transcript to not be shown in the duplicate info file #define T_NO_DUPSHOW(d) d |= 0x200 //check original/old strand: #define T_OSTRAND(d) (d & 0xFF) //keep/set original/old strand #define T_SET_OSTRAND(d, s) d |= s class GeneInfo { //for Ensembl GTF conversion public: int flag; GffObj* gf; GList gene_names; GList transcripts; //list of transcript IDs GeneInfo():gene_names(true, true, true), transcripts(true,true,true) { gf=NULL; flag=0; } GeneInfo(GffObj* gfrec, bool ensembl_convert=false):gene_names(true, true, true), transcripts(true,true,true) { flag=0; if (gfrec->getGeneName()) gene_names.Add(new GStr(gfrec->getGeneName())); transcripts.Add(new GStr(gfrec->getID())); create_gf(gfrec, ensembl_convert); } void create_gf(GffObj* gfrec, bool ensembl_convert) { gf=new GffObj(gfrec->getGeneID()); gf->gseq_id=gfrec->gseq_id; gf->track_id=gfrec->track_id; gf->start=gfrec->start; gf->end=gfrec->end; gf->strand=gfrec->strand; gf->setFeatureName("gene"); gf->isGene(true); gf->isUsed(true); gf->uptr=this; gfrec->incLevel(); gfrec->parent=gf; gf->children.Add(gfrec); if (ensembl_convert) { //gf->addAttr("type", gf->getTrackName()); const char* biotype=gfrec->getAttr("type"); if (biotype) gf->addAttr("type", biotype); } 
//gf->children.Add(gfrec); } //~GeneInfo() { // } void update(GffObj* gfrec) { if (transcripts.AddedIfNew(new GStr(gfrec->getID()))<0) return; gene_names.AddedIfNew(new GStr(gfrec->getGeneName())); if (gf==NULL) { GError("GeneInfo::update() called on uninitialized gf!\n"); //create_gf(gfrec); //return; } gfrec->parent=gf; gf->children.Add(gfrec); gfrec->incLevel(); if (gf->start>gfrec->start) gf->start=gfrec->start; if (gf->endend) gf->end=gfrec->end; } void finalize() { //prepare attributes for printing //must be called right before printing if (gf==NULL || transcripts.Count()==0) return; if (gene_names.Count()>0) { gf->addAttr("Name", gene_names[0]->chars()); /* GStr s(gene_names[0]->chars()); for (int i=1;ichars()); } gf->addAttr("genes", s.chars()); */ } //has gene names GStr t(transcripts[0]->chars()); for (int i=1;ichars()); } gf->addAttr("transcripts", t.chars()); } }; class GffLocus; class GTData { //transcript associated data public: GffObj* rna; GffLocus* locus; GffObj* replaced_by; GeneInfo* geneinfo; //int flag; GTData(GffObj* t=NULL) { rna=t; //flag=0; locus=NULL; replaced_by=NULL; geneinfo=NULL; if (rna!=NULL) { geneinfo=(GeneInfo*)rna->uptr; //take over geneinfo, if there rna->uptr=this; } } bool operator<(GTData& b) { return (rna < b.rna); } bool operator==(GTData& b) { return (rna==b.rna); } }; class CGeneSym { public: GStr name; int freq; CGeneSym(const char* n=NULL, int f=0):name(n), freq(f) { } bool operator<(CGeneSym& b) { return (freq==b.freq)? ( (name.length()==b.name.length()) ? 
(nameb.freq ); } bool operator==(CGeneSym& b) { return name==b.name; } }; const char* getGeneDescr(const char* gsym); void printLocus(GffLocus* loc, const char* pre=NULL); class GffLocus:public GSeg { public: int gseq_id; //id of underlying genomic sequence int locus_num; bool is_mrna; char strand; GffObj* t_maxcov; //transcript with maximum coverage (for main "ref" transcript) GList gfs; //list of non-transcripts (genes) in this locus GList rnas; //list of transcripts (isoforms) for this locus GArray mexons; //list of merged exons in this region GList gene_names; GList gene_ids; int v; //user flag/data /* bool operator==(GffLocus& d){ return (gseq_id==d.gseq_id && strand==d.strand && start==d.start && end==d.end); } bool operator<(GffLocus& d){ if (gseq_id!=d.gseq_id) return (gseq_idname.chars(); } const char* get_tmax_id() { return t_maxcov->getID(); } const char* get_descr() { if (gene_names.Count()>0) { for (int i=0;iname.chars()); if (gn!=NULL) return gn; } } char* s=t_maxcov->getAttr("product"); if (s!=NULL) return s; s=t_maxcov->getAttr("descr"); if (s!=NULL) return s; s=t_maxcov->getAttr("description"); if (s!=NULL) return s; s=t_maxcov->getAttr("info"); if (s!=NULL) return s; return NULL; } GffLocus(GffObj* t=NULL):gfs(true,false,false), rnas(true,false,false),mexons(true,true), gene_names(true,true,false), gene_ids(true,true,false) { //this will NOT free rnas! 
t_maxcov=NULL; gseq_id=-1; v=0; locus_num=0; start=0; end=0; strand=0; is_mrna=false; if (t!=NULL) { GSeg seg; bool is_t=(t->exons.Count()>0); if (is_t) { start=t->exons.First()->start; end=t->exons.Last()->end; for (int i=0;iexons.Count();i++) { seg.start=t->exons[i]->start; seg.end=t->exons[i]->end; mexons.Add(seg); } rnas.Add(t); } else { start=t->start; end=t->end; seg.start=start; seg.end=end; mexons.Add(seg); gfs.Add(t); } gseq_id=t->gseq_id; ((GTData*)(t->uptr))->locus=this; t_maxcov=t; strand=t->strand; //if (t->ftype_id==gff_fid_mRNA) { if (t->isTranscript()) is_mrna=true; } } void print(FILE *f, int idxfirstvalid, GStr& locname, GStr& loctrack) { const char* gseqname=NULL; if (rnas.Count()>0) gseqname=rnas[0]->getGSeqName(); else gseqname=gfs[0]->getGSeqName(); fprintf(f,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s", gseqname, loctrack.chars(), this->start, this->end, this->strand, locname.chars()); //const char* loc_gname=loc.getGeneName(); if (this->gene_names.Count()>0) { //print all gene names associated to this locus fprintf(f, ";genes=%s",this->gene_names.First()->name.chars()); for (int i=1;igene_names.Count();i++) { fprintf(f, ",%s",this->gene_names[i]->name.chars()); } } if (this->gene_ids.Count()>0) { //print all GeneIDs names associated to this locus fprintf(f, ";geneIDs=%s",this->gene_ids.First()->name.chars()); for (int i=1;igene_ids.Count();i++) { fprintf(f, ",%s",this->gene_ids[i]->name.chars()); } } if (idxfirstvalid>=0) { GVec tidx; //set of printable (non-discarded) rnas indexes for (int i=idxfirstvalid;irnas.Count();i++) if (((GTData*)this->rnas[i]->uptr)->replaced_by==NULL) tidx.Add(i); if (tidx.Count()>0) { fprintf(f, ";transcripts=%s",this->rnas[tidx[0]]->getID()); for (int i=1;irnas[tidx[i]]->getID()); } } fprintf(f, "\n"); } void addMerge(GffLocus& locus, GffObj* lnkrna) { //add all the elements of the other locus (merging) //-- merge mexons GArray ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons int i=0; 
//index of first mexons with a merge int j=0; //index current mrna exon while (iiend) { //mexons[i] end extend mexons[i].end=jend; //now this could overlap the next mexon(s), so we have to merge them all while (imexons[i+1].start) { uint nextend=mexons[i+1].end; mexons.Delete(i+1); if (nextend>mexons[i].end) { mexons[i].end=nextend; break; //no need to check next mexons } } //while next mexons merge } // mexons[i] end extend j++; //check the next locus.mexon } //-- add the rest of the non-overlapping mexons: GSeg seg; for (int i=0;iuptr))->locus=this; if (locus.rnas[i]!=lnkrna) rnas.Add(locus.rnas[i]); } for (int i=0;iuptr))->locus=this; if (locus.gfs[i]!=lnkrna) gfs.Add(locus.gfs[i]); } // -- adjust start/end as needed if (start>locus.start) start=locus.start; if (endcovlencovlen) t_maxcov=locus.t_maxcov; } bool add_gfobj(GffObj* t, bool adj) { //if (rnas.Count()==0) return true; //? should never be called on an empty locus uint t_start=t->start; uint t_end=t->end; if (adj) { t_start--; t_end++; } if (t->gseq_id!=gseq_id || /* t->strand!=strand || */ t_start>end || start>t_end) return false; //rna must be on the same genomic seq //check for exon overlap with existing mexons //also update mexons accordingly if t is to be added bool hasovl=false; if (t->exons.Count()>0) { //transcript-like entity if (adj) { t->exons.First()->start--; t->exons.Last()->end++; } int i=0; //index of first mexons with a merge int j=0; //index current t exon GArray ovlexons(true,true); //list of mrna exon indexes overlapping mexons while (iexons.Count()) { uint istart=mexons[i].start; uint iend=mexons[i].end; uint jstart=t->exons[j]->start; uint jend=t->exons[j]->end; if (iendiend) { //mexon stretch up mexons[i].end=jend; //now this could overlap the next mexon(s), so we have to merge them all while (imexons[i+1].start) { uint nextend=mexons[i+1].end; mexons.Delete(i+1); if (nextend>mexons[i].end) { mexons[i].end=nextend; break; //no need to check next mexons } } //while next mexons merge 
} //possible mexons merge j++; //check the next t exon }//all vs all exon check loop if (adj) { t->exons.First()->start++; t->exons.Last()->end--; } if (hasovl) { GSeg seg; //add the rest of the non-overlapping exons for (int i=0;iexons.Count();i++) { seg.start=t->exons[i]->start; seg.end=t->exons[i]->end; if (!ovlexons.Exists(i)) mexons.Add(seg); } t_add(t); // add to rnas ((GTData*)t->uptr)->locus=this; gseq_id=t->gseq_id; } } else { //gene overlap check uint jstart=t->start; uint jend=t->end; for (int i=0;ijend) break; //exon overlap found: hasovl=true; //extend mexons[i] as needed if (jstartiend) { //mexon stretch up mexons[i].end=jend; //now this could overlap the next mexon(s), so we have to merge them all while (imexons[i+1].start) { uint nextend=mexons[i+1].end; mexons.Delete(i+1); if (nextend>mexons[i].end) { mexons[i].end=nextend; break; //no need to check next mexons } } //while next mexons merge } //possible mexons merge } if (hasovl) { t_add(t); // add to locus rnas or gfs ((GTData*)t->uptr)->locus=this; gseq_id=t->gseq_id; } } return hasovl; } //basic adding of a GffObj to a locus void t_add(GffObj* t) { if (t->exons.Count()>0) rnas.Add(t); else gfs.Add(t); // adjust start/end //if (start==0 || start>t->start) start=t->start; if (start==0) start=t->start; else if (start>t->start) { start=t->start; } if (endend) end=t->end; if (t_maxcov->covlencovlen) t_maxcov=t; if (strand==0 || (strand=='.' 
&& t->strand!='.')) strand=t->strand; //if (t->ftype_id==gff_fid_mRNA) is_mrna=true; if (t->isTranscript()) is_mrna=true; } }; class GenomicSeqData { int gseq_id; public: const char* gseq_name; int seqreg_start; //if given by ##sequence-region comment int seqreg_end; GList gfs; //all non-transcript features -> usually gene features GList rnas; //all transcripts on this genomic sequence GList loci; //all loci clusters GList tdata; //transcript data (uptr holder for all rnas loaded here) uint64 f_bases;//base coverage on forward strand uint64 r_bases;//base coverage on reverse strand uint64 u_bases;//base coverage on undetermined strand //GenomicSeqData(int gid=-1):rnas(true,true,false),loci(true,true,true), GenomicSeqData(int gid=-1):gseq_id(gid), gseq_name(NULL), seqreg_start(0), seqreg_end(0), gfs(true, true, false),rnas((GCompareProc*)gfo_cmpByLoc),loci(true,true,false), tdata(false,true,false), f_bases(0), r_bases(0), u_bases(0) { if (gseq_id>=0) gseq_name=GffObj::names->gseqs.getName(gseq_id); } bool operator==(GenomicSeqData& d){ return gseq_id==d.gseq_id; } bool operator<(GenomicSeqData& d){ return (gseq_id& seqdata, int gseq_id); class GSpliceSite { public: char nt[3]; GSpliceSite(const char* c, bool revc=false) { nt[2]=0; if (c==NULL) { nt[0]=0; nt[1]=0; return; } if (revc) { nt[0]=toupper(ntComplement(c[1])); nt[1]=toupper(ntComplement(c[0])); } else { nt[0]=toupper(c[0]); nt[1]=toupper(c[1]); } } GSpliceSite(const char* intron, int intronlen, bool getAcceptor, bool revc=false) { nt[2]=0; if (intron==NULL || intronlen==0) GError("Error: invalid intron or intron len for GSpliceSite()!\n"); const char* c=intron; if (revc) { if (!getAcceptor) c+=intronlen-2; nt[0]=toupper(ntComplement(c[1])); nt[1]=toupper(ntComplement(c[0])); } else { //on forward strand if (getAcceptor) c+=intronlen-2; nt[0]=toupper(c[0]); nt[1]=toupper(c[1]); }//forward strand } GSpliceSite(const char n1, const char n2) { nt[2]=0; nt[0]=toupper(n1); nt[1]=toupper(n2); } bool 
canonicalDonor() { return (nt[0]=='G' && (nt[1]=='C' || nt[1]=='T')); } bool operator==(GSpliceSite& c) { return (c.nt[0]==nt[0] && c.nt[1]==nt[1]); } bool operator==(GSpliceSite* c) { return (c->nt[0]==nt[0] && c->nt[1]==nt[1]); } bool operator==(const char* c) { //return (nt[0]==toupper(c[0]) && nt[1]==toupper(c[1])); //assumes given const nucleotides are uppercase already! return (nt[0]==c[0] && nt[1]==c[1]); } bool operator!=(const char* c) { //assumes given const nucleotides are uppercase already! return (nt[0]!=c[0] || nt[1]!=c[1]); } }; class GffLoader { public: GVec headerLines; //for GFF3 we keep the first few header lines (not the sequence-region one) GStr fname; FILE* f; GffNames* names; union { unsigned int options; struct { bool transcriptsOnly:1; bool gene2exon:1; bool fullAttributes:1; bool keep_AllExonAttrs:1; bool gatherExonAttrs:1; bool mergeCloseExons:1; bool ignoreLocus:1; bool noPseudo:1; bool BEDinput:1; bool TLFinput:1; bool keepGenes:1; bool trAdoption:1; //orphan transcript adoption by the container gene bool keepGff3Comments:1; bool sortRefsAlpha:1; bool doCluster:1; bool collapseRedundant:1; //discard "redundant" transcripts (-M/--merge activated) bool matchAllIntrons:1; //if true, contained transcripts are NOT discarded bool fuzzSpan:1; //matching/contained redundancy relaxed to disregard full boundary containment bool dOvlSET:1; //discard overlapping Single Exon Transcripts on any strand bool forceExons:1; }; }; GffLoader():fname(),f(NULL), names(NULL), options(0) { transcriptsOnly=true; gffnames_ref(GffObj::names); names=GffObj::names; } void loadRefNames(GStr& flst); void openFile(GStr& file_name) { //if (f!=NULL) closeFile(); fname=file_name; if (fname=="-" || fname=="stdin") { f=stdin; fname="stdin"; } else { if ((f=fopen(fname.chars(), "r"))==NULL) { GError("Error: cannot open GFF file %s!\n",fname.chars()); } } } void load(GList&seqdata, GFValidateFunc* gf_validate=NULL, GFFCommentParser* gf_parsecomment=NULL); bool 
placeGf(GffObj* t, GenomicSeqData* gdata); bool unsplContained(GffObj& ti, GffObj& tj); GffObj* redundantTranscripts(GffObj& ti, GffObj& tj); void terminate() { //if (f!=NULL) closeFile(); GffReader is going to close the file gffnames_unref(GffObj::names); names=NULL; } void clearHeaderLines() { if (headerLines.Count()>0) { for (int i=0;iterminate(); clearHeaderLines(); } }; void printFasta(FILE* f, GStr& defline, char* seq, int seqlen=-1, bool useStar=false); //void printTabFormat(FILE* f, GffObj* t); //"position" a given coordinate x within a list of transcripts sorted by their start (lowest) //coordinate, using quick-search; the returned int is the list index of the closest *higher* //GffObj - i.e. starting right *ABOVE* the given coordinate //Convention: returns -1 if there is no such GffObj (i.e. last GffObj starts below x) int qsearch_rnas(uint x, GList& rnas); int qsearch_gloci(uint x, GList& loci); GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons=true, bool fuzzSpan=false); //void loadGFF(FILE* f, GList& seqdata, const char* fname); void collectLocusData(GList& ref_data, bool covInfo=false); #endif gffread-0.11.7/gffread.cpp000066400000000000000000001470461361241152500153140ustar00rootroot00000000000000#include "GArgs.h" #include "gff_utils.h" #include #define __STDC_FORMAT_MACROS #include #define VERSION "0.11.7" #define USAGE "gffread v" VERSION ". Usage:\n\ gffread [-g | ][-s ] \n\ [-o ] [-t ] [-r [[]:].. [-R]]\n\ [-CTVNJMKQAFPGUBHZWTOLE] [-w ] [-x ] [-y ]\n\ [-i ] [--bed] [--table ] [--sort-by ]\n\ \n\ Filter, convert or cluster GFF/GTF/BED records, extract the sequence of\n\ transcripts (exon or CDS) and more.\n\ By default (i.e. without -O) only transcripts are processed, discarding any\n\ other non-transcript features. 
Default output is a simplified GFF3 with only\n\ the basic attributes.\n\ \n\ is a GFF file, use '-' for stdin\n\ \n\ Options:\n\ -i discard transcripts having an intron larger than \n\ -l discard transcripts shorter than bases\n\ -r only show transcripts overlapping coordinate range ..\n\ (on chromosome/contig , strand if provided)\n\ -R for -r option, discard all transcripts that are not fully \n\ contained within the given range\n\ -U discard single-exon transcripts\n\ -C coding only: discard mRNAs that have no CDS features\n\ --nc non-coding only: discard mRNAs that have CDS features\n\ --ignore-locus : discard locus features and attributes found in the input\n\ -A use the description field from and add it\n\ as the value for a 'descr' attribute to the GFF record\n\ -s is a tab-delimited file providing this info\n\ for each of the mapped sequences:\n\ \n\ (useful for -A option with mRNA/EST/protein mappings)\n\ Sorting: (by default, chromosomes are kept in the order they were found)\n\ --sort-alpha : chromosomes (reference sequences) are sorted alphabetically\n\ --sort-by : sort the reference sequences by the order in which their\n\ names are given in the file\n\ Misc options: \n\ -F preserve all GFF attributes (for non-exon features)\n\ --keep-exon-attrs : for -F option, do not attempt to reduce redundant\n\ exon/CDS attributes\n\ -G do not keep exon attributes, move them to the transcript feature\n\ (for GFF3 output)\n\ --keep-genes : in transcript-only mode (default), also preserve gene records\n\ --keep-comments: for GFF3 input/output, try to preserve comments\n\ -O process other non-transcript GFF records (by default non-transcript\n\ records are ignored)\n\ -V discard any mRNAs with CDS having in-frame stop codons (requires -g)\n\ -H for -V option, check and adjust the starting CDS phase\n\ if the original phase leads to a translation with an \n\ in-frame stop codon\n\ -B for -V option, single-exon transcripts are also checked on the\n\ opposite strand 
(requires -g)\n\ -P add transcript level GFF attributes about the coding status of each\n\ transcript, including partialness or in-frame stop codons (requires -g)\n\ --add-hasCDS : add a \"hasCDS\" attribute with value \"true\" for transcripts\n\ that have CDS features\n\ --adj-stop stop codon adjustment: enables -P and performs automatic\n\ adjustment of the CDS stop coordinate if premature or downstream\n\ -N discard multi-exon mRNAs that have any intron with a non-canonical\n\ splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n\ -J discard any mRNAs that either lack initial START codon\n\ or the terminal STOP codon, or have an in-frame stop codon\n\ (i.e. only print mRNAs with a complete CDS)\n\ --no-pseudo: filter out records matching the 'pseudo' keyword\n\ --in-bed: input should be parsed as BED format (automatic if the input\n\ filename ends with .bed*)\n\ --in-tlf: input GFF-like one-line-per-transcript format without exon/CDS\n\ features (see --tlf option below); automatic if the input\n\ filename ends with .tlf)\n\ Clustering:\n\ -M/--merge : cluster the input transcripts into loci, discarding\n\ \"duplicated\" transcripts (those with the same exact introns\n\ and fully contained or equal boundaries)\n\ -d : for -M option, write duplication info to file \n\ --cluster-only: same as -M/--merge but without discarding any of the\n\ \"duplicate\" transcripts, only create \"locus\" features\n\ -K for -M option: also discard as redundant the shorter, fully contained\n\ transcripts (intron chains matching a part of the container)\n\ -Q for -M option, no longer require boundary containment when assessing\n\ redundancy (can be combined with -K); only introns have to match for\n\ multi-exon transcripts, and >=80% overlap for single-exon transcripts\n\ -Y for -M option, enforce -Q but also discard overlapping single-exon \n\ transcripts, even on the opposite strand (can be combined with -K)\n\ Output options:\n\ --force-exons: make sure that the lowest level GFF 
features are considered\n\ \"exon\" features\n\ --gene2exon: for single-line genes not parenting any transcripts, add an\n\ exon feature spanning the entire gene (treat it as a transcript)\n\ --t-adopt: try to find a parent gene overlapping/containing a transcript\n\ that does not have any explicit gene Parent\n\ -D decode url encoded characters within attributes\n\ -Z merge very close exons into a single exon (when intron size<4)\n\ -g full path to a multi-fasta file with the genomic sequences\n\ for all input mappings, OR a directory with single-fasta files\n\ (one per genomic sequence, with file names matching sequence names)\n\ -w write a fasta file with spliced exons for each transcript\n\ --w-add for the -w option, extract additional bases\n\ both upstream and downstream of the transcript boundaries\n\ -x write a fasta file with spliced CDS for each GFF transcript\n\ -y write a protein fasta file with the translation of CDS for each record\n\ -W for -w and -x options, write in the FASTA defline the exon\n\ coordinates projected onto the spliced sequence;\n\ for -y option, write transcript attributes in the FASTA defline\n\ -S for -y option, use '*' instead of '.' 
as stop codon translation\n\ -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\ -m is a name mapping table for converting reference \n\ sequence names, having this 2-column format:\n\ \n\ -t use in the 2nd column of each GFF/GTF output line\n\ -o write the records into instead of stdout\n\ -T main output will be GTF instead of GFF3\n\ --bed output records in BED format instead of default GFF3\n\ --tlf output \"transcript line format\" which is like GFF\n\ but exons, CDS features and related data are stored as GFF \n\ attributes in the transcript feature line, like this:\n\ exoncount=N;exons=;CDSphase=;CDS= \n\ is a comma-delimited list of exon_start-exon_end coordinates;\n\ is CDS_start:CDS_end coordinates or a list like \n\ --table output a simple tab delimited format instead of GFF, with columns\n\ having the values of GFF attributes given in ; special\n\ pseudo-attributes (prefixed by @) are recognized:\n\ @id, @geneid, @chr, @start, @end, @strand, @numexons, @exons, \n\ @cds, @covlen, @cdslen\n\ -v,-E expose (warn about) duplicate transcript IDs and other potential\n\ problems with the given GFF/GTF records\n\ " class SeqInfo { //populated from the -s option of gffread public: int len; char* descr; SeqInfo( int l, char* s): len(l), descr(NULL) { if (s!=NULL) descr=Gstrdup(s); } ~SeqInfo() { GFREE(descr); } }; class RefTran { public: char* new_name; RefTran(char *ns) { new_name=NULL; if (ns!=NULL) new_name=Gstrdup(ns); } ~RefTran() { GFREE(new_name); } }; enum ETableFieldType { ctfGFF_Attr=0, // attribute name as is ctfGFF_ID, //ID or @id or transcript_id ctfGFF_geneID, //geneID or @gene_id ctfGFF_Parent, //Parent or @parent ctfGFF_chr, //@chr ctfGFF_feature, //@feature ctfGFF_start, //@start ctfGFF_end, //@end ctfGFF_strand, //@strand ctfGFF_numexons, //@numexons ctfGFF_exons, //@exons ctfGFF_cds, //@cds ctfGFF_covlen, //@covlen ctfGFF_cdslen//@cdslen }; class CTableField { public: ETableFieldType type; GStr name; //only for type 
ctfGFF_Attr CTableField(ETableFieldType atype=ctfGFF_Attr):type(atype) { } CTableField(GStr& attrname):type(ctfGFF_Attr),name(attrname) { } }; FILE* ffasta=NULL; FILE* f_in=NULL; FILE* f_out=NULL; FILE* f_w=NULL; //writing fasta with spliced exons (transcripts) int wPadding = 0; //padding for -w option FILE* f_x=NULL; //writing fasta with spliced CDS FILE* f_y=NULL; //wrting fasta with translated CDS bool wCDSonly=false; bool wNConly=false; int minLen=0; //minimum transcript length bool validCDSonly=false; // translation with no in-frame STOP bool bothStrands=false; //for single-exon mRNA validation, check the other strand too bool altPhases=false; //if original phase fails translation validation, //try the other 2 phases until one makes it bool addCDSattrs=false; bool add_hasCDS=false; bool adjustStop=false; //automatic adjust the CDS stop coordinate bool covInfo=false; // --cov-info : only report genome coverage //bool transcriptsOnly=true; //bool keepGenes=false; //for transcriptsOnly //bool sortAlpha=false; GStr sortBy; //file name with chromosomes listed in the desired order //bool keepRefOrder=false; //sort within chromosomes, but follow the input chromosome order -- default! GStr tableFormat; //list of "attributes" to print in tab delimited format //bool NoPseudo=false; bool spliceCheck=false; //only known splice-sites bool decodeChars=false; //decode url-encoded chars in attrs (-D) bool StarStop=false; //use * instead of . 
for stop codon translation bool fullCDSonly=false; // starts with START, ends with STOP codon //bool fullattr=false; //-F //bool gatherExonAttrs=false; //-G //bool sortByLoc=false; // if the GFF output should be sorted by location bool ensembl_convert=false; //-L, assist in converting Ensembl GTF to GFF3 bool BEDinput=false; bool TLFinput=false; bool fmtGFF3=true; //default output: GFF3 //other formats only make sens in transcriptOnly mode bool fmtGTF=false; bool fmtBED=false; bool fmtTLF=false; bool fmtTable=false; bool addDescr=false; //bool protmap=false; bool multiExon=false; bool writeExonSegs=false; char* tracklabel=NULL; int maxintron=999000000; //bool mergeCloseExons=false; //range filter: char* rfltGSeq=NULL; char rfltStrand=0; uint rfltStart=0; uint rfltEnd=MAX_UINT; bool rfltWithin=false; //check for full containment within given range GffLoader gffloader; GList g_data(true,true,true); //list of GFF records by genomic seq //hash with sequence info GHash seqinfo; GHash isoCounter; //counts the valid isoforms GHash reftbl; GHash gene_ids; //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion) bool debugMode=false; //bool verbose=false; GVec tableCols; //table output format fields void loadSeqInfo(FILE* f, GHash &si) { GLineReader fr(f); while (!fr.isEof()) { char* line=fr.getLine(); if (line==NULL) break; char* id=line; char* lenstr=NULL; char* text=NULL; char* p=line; while (*p!=0 && !isspace(*p)) p++; if (*p==0) continue; *p=0;p++; while (*p==' ' || *p=='\t') p++; if (*p==0) continue; lenstr=p; while (*p!=0 && !isspace(*p)) p++; if (*p!=0) { *p=0;p++; } while (*p==' ' || *p=='\t') p++; if (*p!=0) text=p; //else text remains NULL int len=0; if (!parseInt(lenstr,len)) { GMessage("Warning: could not parse sequence length: %s %s\n", id, lenstr); continue; } // --- here we have finished parsing the line si.Add(id, new SeqInfo(len,text)); } //while lines } void setTableFormat(GStr& s) { if (s.is_empty()) return; GHash specialFields; 
specialFields.Add("chr", new ETableFieldType(ctfGFF_chr)); specialFields.Add("id", new ETableFieldType(ctfGFF_ID)); specialFields.Add("geneid", new ETableFieldType(ctfGFF_geneID)); specialFields.Add("parent", new ETableFieldType(ctfGFF_Parent)); specialFields.Add("feature", new ETableFieldType(ctfGFF_feature)); specialFields.Add("start", new ETableFieldType(ctfGFF_start)); specialFields.Add("end", new ETableFieldType(ctfGFF_end)); specialFields.Add("strand", new ETableFieldType(ctfGFF_strand)); specialFields.Add("numexons", new ETableFieldType(ctfGFF_numexons)); specialFields.Add("exons", new ETableFieldType(ctfGFF_exons)); specialFields.Add("cds", new ETableFieldType(ctfGFF_cds)); specialFields.Add("covlen", new ETableFieldType(ctfGFF_covlen)); specialFields.Add("cdslen", new ETableFieldType(ctfGFF_cdslen)); s.startTokenize(" ,;.:", tkCharSet); GStr w; while (s.nextToken(w)) { if (w[0]=='@') { w=w.substr(1); ETableFieldType* v=specialFields.Find(w.chars()); if (v!=NULL) { CTableField tcol(*v); tableCols.Add(tcol); } else GMessage("Warning: table field '@%s' not recognized!\n",w.chars()); continue; } if (w=="ID" || w=="transcript_id") { CTableField tcol(ctfGFF_ID); tableCols.Add(tcol); continue; } if (w=="geneID" || w=="gene_id") { CTableField tcol(ctfGFF_geneID); tableCols.Add(tcol); continue; } if (w=="Parent") { CTableField tcol(ctfGFF_Parent); tableCols.Add(tcol); continue; } CTableField col(w); tableCols.Add(col); } } void loadRefTable(FILE* f, GHash& rt) { GLineReader fr(f); char* line=NULL; while ((line=fr.getLine())) { char* orig_id=line; char* p=line; while (*p!=0 && !isspace(*p)) p++; if (*p==0) continue; *p=0;p++;//split the line here while (*p==' ' || *p=='\t') p++; if (*p==0) continue; rt.Add(orig_id, new RefTran(p)); } //while lines } char* getSeqDescr(char* seqid) { static char charbuf[128]; if (seqinfo.Count()==0) return NULL; char* suf=rstrchr(seqid, '.'); if (suf!=NULL) *suf=0; SeqInfo* seqd=seqinfo.Find(seqid); if (suf!=NULL) *suf='.'; if 
(seqd!=NULL) { GStr s(seqd->descr); //cleanup some Uniref gunk if (s[0]=='[') { int r=s.index(']'); if (r>=0 && r<8 && isdigit(s[1])) s.remove(0,r+1); } if (s.length()>80) { int r=s.index(';'); if (r>5) s.cut(r); } if (s.length()>127) { s.cut(127); int r=s.rindex(' '); if (r>0) s.cut(r); } strcpy(charbuf, s.chars()); return charbuf; } else return NULL; } char* getSeqName(char* seqid) { static char charbuf[128]; char* suf=rstrchr(seqid, '.'); if (suf!=NULL) *suf=0; strcpy(charbuf, seqid); if (suf!=NULL) *suf='.'; return charbuf; } int adjust_stopcodon(GffObj& gffrec, int adj, GList* seglst=NULL) { //adj>0, extend CDS to include a potential stop codon //when CDS is expanded, the terminal exon might have to be adjusted too int realadj=0; if (gffrec.strand=='-') { if ((int)gffrec.CDstart>adj) { gffrec.CDstart-=adj; realadj=adj; if (gffrec.exons.First()->start>gffrec.CDstart) { gffrec.covlen+=gffrec.exons.First()->start - gffrec.CDstart; gffrec.exons.First()->start=gffrec.CDstart; gffrec.start=gffrec.CDstart; } } } else { // forward strand //expand beyond realadj=adj; gffrec.CDend+=adj; if (adj<0) {//restore if (gffrec.exons.Last()->end==gffrec.CDend-adj) { gffrec.exons.Last()->end+=adj; gffrec.end=gffrec.exons.Last()->end; gffrec.covlen+=adj; } } else if (gffrec.exons.Last()->endend; gffrec.exons.Last()->end=gffrec.CDend; gffrec.end=gffrec.CDend; } } if (seglst!=NULL) seglst->Last()->end+=realadj; return realadj; } bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) { if (!gffrec.isTranscript()) return false; //shouldn't call this function unless it's a transcript //returns true if the transcript passed the filter char* gname=gffrec.getGeneName(); if (gname==NULL) gname=gffrec.getGeneID(); if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) { const char* biotype=gffrec.getAttr("gene_biotype"); if (biotype) { gffrec.addAttr("type", biotype); gffrec.removeAttr("gene_biotype"); } else { //old Ensembl files lacking gene_biotype gffrec.addAttr("type", 
gffrec.getTrackName()); } //bool is_gene=false; bool is_pseudo=false; if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS()) gffrec.setFeatureName("mRNA"); else { if (strcmp(biotype, "processed_transcript")==0) gffrec.setFeatureName("proc_RNA"); else { //is_gene=endsWith(biotype, "gene"); is_pseudo=strifind(biotype, "pseudo"); if (is_pseudo) { gffrec.setFeatureName("pseudo_RNA"); } else if (endsWith(biotype, "RNA")) { gffrec.setFeatureName(biotype); } else gffrec.setFeatureName("misc_RNA"); } } } if (gname && strcmp(gname, gffrec.getID())!=0) { int* isonum=isoCounter.Find(gname); if (isonum==NULL) { isonum=new int(1); isoCounter.Add(gname,isonum); } else (*isonum)++; //defline.appendfmt(" gene=%s", gname); } int seqlen=0; const char* tlabel=tracklabel; if (tlabel==NULL) tlabel=gffrec.getTrackName(); //defline.appendfmt(" track:%s",tlabel); char* cdsnt = NULL; char* cdsaa = NULL; int aalen=0; for (int i=1;istart-gffrec.exons[i-1]->end-1; if (verbose && ilen>4000000) GMessage("Warning: very large intron (%d) for transcript %s\n", ilen, gffrec.getID()); if (ilen>maxintron) { return false; } } GMapSegments seglst(gffrec.strand); GFaSeqGet* faseq=NULL; if (f_x!=NULL || f_y!=NULL || f_w!=NULL || spliceCheck || validCDSonly || addCDSattrs) { faseq=fastaSeqGet(gfasta, gffrec.getGSeqName()); if (faseq==NULL) GError("Error: no genomic sequence available (check -g option!).\n"); } if (spliceCheck && gffrec.exons.Count()>1) { //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC ) int glen=gffrec.end-gffrec.start+1; const char* gseq=faseq->subseq(gffrec.start, glen); bool revcompl=(gffrec.strand=='-'); bool ssValid=true; for (int e=1;eend+1-gffrec.start; int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1; GSpliceSite acceptorSite(intron,intronlen,true, revcompl); GSpliceSite donorSite(intron,intronlen, false, revcompl); //GMessage("%c intron %d-%d : %s .. 
%s\n", // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt); if (acceptorSite=="AG") { // GT-AG or GC-AG if (!donorSite.canonicalDonor()) { ssValid=false;break; } } else if (acceptorSite=="AC") { //AT-AC also accepted if (donorSite!="AT") { ssValid=false; break; } } else { ssValid=false; break; } } if (!ssValid) { if (verbose) GMessage("Unrecognized splice sites found for '%s'\n",gffrec.getID()); return false; //don't print this one! } } bool trprint=true; bool inframeStop=false; //int stopCodonAdjust=0; int mCDphase=0; bool fullCDS=false; bool endStop=false; bool stopAdjusted=false; if (add_hasCDS && gffrec.hasCDS()) gffrec.addAttr("hasCDS", "true"); if (gffrec.CDphase=='1' || gffrec.CDphase=='2') mCDphase = gffrec.CDphase-'0'; //CDS partialness only added when -y -x -V options are given if (gffrec.hasCDS() && (f_y!=NULL || f_x!=NULL || validCDSonly || addCDSattrs)) { int strandNum=0; int phaseNum=0; CDS_CHECK: uint cds_olen=0; cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, &cds_olen, &seglst, adjustStop); //if adjustStop, seqlen has the CDS+3'UTR length, but cds_olen still has the original CDS length if (cdsnt!=NULL && cdsnt[0]!='\0') { //has CDS cdsaa=translateDNA(cdsnt, aalen, seqlen); char* p=strchr(cdsaa,'.'); int cds_aalen=aalen; if (adjustStop) cds_aalen=cds_olen/3; //originally stated CDS length endStop=false; if (p!=NULL) { //stop codon found if (p-cdsaa==cds_aalen-1) { //stop found as the stated last CDS codon *p='\0';//remove it endStop=true; if (adjustStop) { seqlen=cds_aalen*3; aalen=cds_aalen; } cds_aalen--; aalen--; //no need to adjust stop codon } else {//stop found in a different position than the last codon if (p-cdsaarestore(); if (stopCodonAdjust!=0 && !endStop) { //restore stop codon location //adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst); if (seglst.Count()>0) seglst.Last()->end-=stopCodonAdjust; if (cdsnt!=NULL && seqlen>0) { seqlen-=stopCodonAdjust; cdsnt[seqlen]=0; } if (cdsaa!=NULL) aalen--; } } if (adjstop!=NULL) 
delete adjstop; */ if (cdsnt!=NULL) { // && !inframeStop) { if (f_y!=NULL) { //CDS translation fasta output requested if (cdsaa==NULL) { //translate now if not done before cdsaa=translateDNA(cdsnt, aalen, seqlen); } GStr defline(gffrec.getID()); if (gffrec.attrs!=NULL) { //append all attributes found for each transcripts for (int i=0;iCount();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); char* s=gffrec.getAttrValue(i); if (s[0]=='"') defline.append(s); else defline.appendQuoted(s, '{', true); } } if (aalen>0) { if (cdsaa[aalen-1]=='.' || cdsaa[aalen-1]=='\0') --aalen; //avoid printing the stop codon printFasta(f_y, defline, cdsaa, aalen, StarStop); } } if (f_x!=NULL) { //CDS only GStr defline(gffrec.getID()); if (writeExonSegs) { defline.append(" loc:"); defline.append(gffrec.getGSeqName()); defline.appendfmt("(%c)",gffrec.strand); //warning: not CDS coordinates are written here, but the exon ones defline+=(int)gffrec.start; defline+=(char)'-'; defline+=(int)gffrec.end; // -- here these are CDS substring coordinates on the spliced sequence: defline.append(" segs:"); for (int i=0;i0) defline.append(","); defline+=(int)seglst[i].start; defline.append("-"); defline+=(int)seglst[i].end; } } if (gffrec.attrs!=NULL) { //append all attributes found for each transcript for (int i=0;iCount();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); char* s=gffrec.getAttrValue(i); if (s[0]=='"') defline.append(s); else defline.appendQuoted(s, '{', true); } } printFasta(f_x, defline, cdsnt, seqlen); } GFREE(cdsnt); GFREE(cdsaa); } //writing CDS or its translation if (f_w!=NULL) { //write spliced exons uint cds_start=0; uint cds_end=0; seglst.Clear(); //TODO: ? if wPadding is set, *temporarily* change first and last exon coordinates ?!? // or perhaps getSpliced() should take an additional padding parameter ?!? int padLeft=0; int padRight=0; if (wPadding>0) { padLeft= (gffrec.start>(uint)wPadding) ? 
wPadding : gffrec.start - 1; int ediff=faseq->getseqlen()-gffrec.end; padRight=(wPadding>ediff) ? ediff : wPadding; gffrec.addPadding(padLeft, padRight); } char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst); //restore exons to normal (remove padding) if (wPadding>0) gffrec.removePadding(padLeft, padRight); GStr defline(gffrec.getID()); if (exont!=NULL) { if (gffrec.CDstart>0) { defline.appendfmt(" CDS=%d-%d", cds_start, cds_end); } if (writeExonSegs) { defline.append(" loc:"); defline.append(gffrec.getGSeqName()); defline+=(char)'|'; defline+=(int)gffrec.start; defline+=(char)'-'; defline+=(int)gffrec.end; defline+=(char)'|'; defline+=(char)gffrec.strand; defline.append(" exons:"); for (int i=0;i0) defline.append(","); defline+=(int)gffrec.exons[i]->start; defline.append("-"); defline+=(int)gffrec.exons[i]->end; } if (wPadding>0) { defline.append(" padding:"); defline.append(padLeft); defline+=(char)'|'; defline.append(padRight); } defline.append(" segs:"); for (int i=0;i0) defline.append(","); defline+=(int)seglst[i].start; defline.append("-"); defline+=(int)seglst[i].end; } } if (gffrec.attrs!=NULL) { //append all attributes found for each transcripts for (int i=0;iCount();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); char* s=gffrec.getAttrValue(i); if (s[0]=='"') defline.append(s); else defline.appendQuoted(s, '{', true); } } printFasta(f_w, defline, exont, seqlen); GFREE(exont); } } //writing f_w (spliced exons) return true; } void openfw(FILE* &f, GArgs& args, char opt) { GStr s=args.getOpt(opt); if (!s.is_empty()) { if (s=='-') f=stdout; else { f=fopen(s,"w"); if (f==NULL) GError("Error creating file: %s\n", s.chars()); } } } #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh) void printGff3Header(FILE* f, GArgs& args) { if (gffloader.keepGff3Comments) { for (int i=0;iseqreg_start>0 && gdata->seqreg_end>0) fprintf(f, "##sequence-region %s %d %d\n", gdata->gseq_name, 
gdata->seqreg_start, gdata->seqreg_end); } void processGffComment(const char* cmline, GfList* gflst) { if (cmline[0]!='#') return; const char* p=cmline; while (*p=='#') p++; GStr s(p); //this can be called only after gffloader initialization // so we can use gffloader.names->gseqs.addName() s.startTokenize("\t ", tkCharSet); GStr w; if (s.nextToken(w) && w=="sequence-region") { GStr chr, wend; if (s.nextToken(chr) && s.nextToken(w) && s.nextToken(wend)) { int gseq_id=gffloader.names->gseqs.addName(chr.chars()); if (gseq_id>=0) { GenomicSeqData* gseqdata=getGSeqData(g_data, gseq_id); gseqdata->seqreg_start=w.asInt(); gseqdata->seqreg_end=wend.asInt(); } else GError("Error adding ref seq ID %s\n", chr.chars()); } return; } if (gflst->Count()==0) { //initial Gff3 header, store it char* hl=Gstrdup(cmline); gffloader.headerLines.Add(hl); } } bool validateGffRec(GffObj* gffrec, GList* gfnew) { if (reftbl.Count()>0) { //check if we need to reject by ref seq filter GStr refname(gffrec->getRefName()); RefTran* rt=reftbl.Find(refname.chars()); if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) { //try removing the version suffix refname.cut(-2); //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars()); rt=reftbl.Find(refname.chars()); } if (rt) { gffrec->setRefName(rt->new_name); } /* //no, do not discard non-matching entries, let them pass through! 
else { if (verbose) GMessage("Info: %s discarded due to reference %s not being mapped\n", gffrec->getID(), refname.chars()); return false; //discard, ref seq not in the given translation table }*/ } if (gffloader.transcriptsOnly && gffrec->isDiscarded()) { //discard generic "locus" features with no other detailed subfeatures //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",gffrec->getID()); return false; } if (minLen>0 && gffrec->covlengetID(), minLen); return false; } if (rfltGSeq!=NULL) { //filter by gseqName if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) { return false; } } if (rfltStrand>0 && gffrec->strand !=rfltStrand) { return false; } //check coordinates if (rfltStart!=0 || rfltEnd!=MAX_UINT) { if (rfltWithin) { if (gffrec->startend>rfltEnd) { return false; //not within query range } } else { if (gffrec->start>rfltEnd || gffrec->endexons.Count()<=1) { return false; } if (wCDSonly && gffrec->CDstart==0) { return false; } if (wNConly && gffrec->hasCDS()) return false; if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) { //keep track of chr|gene_id data -- coordinate range char* geneid=gffrec->getGeneID(); if (geneid!=NULL) { GeneInfo* ginfo=gene_ids.Find(geneid); if (ginfo==NULL) {//first time seeing this gene ID GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert); gene_ids.Add(geneid, geneinfo); if (gfnew!=NULL) gfnew->Add(geneinfo->gf); //do we really need this? } else ginfo->update(gffrec); } } return true; } void printGffObj(FILE* f, GffObj* gfo, GStr& locname, GffPrintMode exonPrinting, int& out_counter) { GffObj& t=*gfo; GTData* tdata=(GTData*)(t.uptr); if (tdata->replaced_by!=NULL || !T_PRINTABLE(t.udata)) return; //if (t.exons.Count()==0 && t.children.Count()==0 && forceExons) // t.addExonSegment(t.start,t.end); T_NO_PRINT(t.udata); if (!fmtGFF3 && !gfo->isTranscript()) return; //only GFF3 prints non-transcript records (incl. 
parent genes) t.addAttr("locus", locname.chars()); out_counter++; if (fmtGFF3) { //print the parent first, if any and if not printed already if (t.parent!=NULL && T_PRINTABLE(t.parent->udata)) { GTData* pdata=(GTData*)(t.parent->uptr); if (pdata && pdata->geneinfo!=NULL) pdata->geneinfo->finalize(); t.parent->addAttr("locus", locname.chars()); t.parent->printGxf(f, exonPrinting, tracklabel, NULL, decodeChars); T_NO_PRINT(t.parent->udata); } } t.printGxf(f, exonPrinting, tracklabel, NULL, decodeChars); } void printGxfTab(FILE* f, GffObj& g) { //using attribute list in tableCols char* av=NULL; for(int i=0;i0) fprintf(f,"\t"); switch(tableCols[i].type) { case ctfGFF_Attr: av=g.getAttr(tableCols[i].name.chars()); if (av!=NULL) fprintf(f,"%s",av); else fprintf(f, "."); break; case ctfGFF_chr: fprintf(f,"%s",g.getGSeqName()); break; case ctfGFF_ID: fprintf(f,"%s",g.getID()); break; case ctfGFF_geneID: fprintf(f,"%s",g.getGeneID()); break; case ctfGFF_Parent: if (g.parent!=NULL) fprintf(f,"%s",g.parent->getID()); else fprintf(f, "."); break; case ctfGFF_feature: fprintf(f,"%s",g.getFeatureName()); break; case ctfGFF_start: fprintf(f,"%d",g.start); break; case ctfGFF_end: fprintf(f,"%d",g.end); break; case ctfGFF_strand: fprintf(f,"%c",g.strand); break; case ctfGFF_numexons: fprintf(f,"%d",g.exons.Count()); break; case ctfGFF_exons: if (g.exons.Count()>0) { for (int x=0;x0) fprintf(f,","); fprintf(f,"%d-%d",g.exons[x]->start, g.exons[x]->end); } } else fprintf(f,"."); break; case ctfGFF_cds: if (g.hasCDS()) { GVec cds; g.getCDSegs(cds); for (int x=0;x0) fprintf(f,","); fprintf(f,"%d-%d",cds[x].start, cds[x].end); } } else fprintf(f,"."); break; case ctfGFF_covlen: fprintf(f, "%d", g.covlen); break; case ctfGFF_cdslen: if (g.hasCDS()) { GVec cds; g.getCDSegs(cds); int clen=0; for (int x=0;xreplaced_by!=NULL || !T_PRINTABLE(t.udata)) return; T_NO_PRINT(t.udata); if (out_counter!=NULL) (*out_counter)++; //print the parent first, if any and if not printed already if 
(t.parent!=NULL && T_PRINTABLE(t.parent->udata)) { GTData* pdata=(GTData*)(t.parent->uptr); if (pdata && pdata->geneinfo!=NULL) pdata->geneinfo->finalize(); //t.parent->addAttr("locus", locname.chars()); //(*out_counter)++; ? printGxfTab(f, *t.parent); T_NO_PRINT(t.parent->udata); } printGxfTab(f, *gfo); } int main(int argc, char* argv[]) { GArgs args(argc, argv, "version;debug;merge;adj-stop;bed;in-bed;tlf;in-tlf;cluster-only;nc;cov-info;help;" "sort-alpha;keep-genes;w-add=;keep-comments;keep-exon-attrs;force-exons;t-adopt;gene2exon;" "ignore-locus;no-pseudo;table=sort-by=hvOUNHPWCVJMKQYTDARSZFGLEBm:g:i:r:s:l:t:o:w:x:y:d:"); args.printError(USAGE, true); if (args.getOpt('h') || args.getOpt("help")) { GMessage("%s",USAGE); exit(1); } debugMode=(args.getOpt("debug")!=NULL); decodeChars=(args.getOpt('D')!=NULL); gffloader.forceExons=(args.getOpt("force-exons")!=NULL); gffloader.noPseudo=(args.getOpt("no-pseudo")!=NULL); gffloader.ignoreLocus=(args.getOpt("ignore-locus")!=NULL); gffloader.transcriptsOnly=(args.getOpt('O')==NULL); //sortByLoc=(args.getOpt('S')!=NULL); addDescr=(args.getOpt('A')!=NULL); verbose=(args.getOpt('v')!=NULL || args.getOpt('E')!=NULL); wCDSonly=(args.getOpt('C')!=NULL); wNConly=(args.getOpt("nc")!=NULL); addCDSattrs=(args.getOpt('P')!=NULL); add_hasCDS=(args.getOpt("add-hasCDS")!=NULL); adjustStop=(args.getOpt("adj-stop")!=NULL); if (adjustStop) addCDSattrs=true; validCDSonly=(args.getOpt('V')!=NULL); altPhases=(args.getOpt('H')!=NULL); fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF fmtBED=(args.getOpt("bed")!=NULL); fmtTLF=(args.getOpt("tlf")!=NULL); if (fmtGTF || fmtBED || fmtTLF) { if (!gffloader.transcriptsOnly) { GMessage("Error: option -O is only supported with GFF3 output"); exit(1); } fmtGFF3=false; } BEDinput=(args.getOpt("in-bed")!=NULL); TLFinput=(args.getOpt("in-tlf")!=NULL); bothStrands=(args.getOpt('B')!=NULL); fullCDSonly=(args.getOpt('J')!=NULL); spliceCheck=(args.getOpt('N')!=NULL); 
StarStop=(args.getOpt('S')!=NULL); gffloader.keepGenes=(args.getOpt("keep-genes")!=NULL); gffloader.trAdoption=(args.getOpt("t-adopt")!=NULL); gffloader.keepGff3Comments=(args.getOpt("keep-comments")!=NULL); gffloader.sortRefsAlpha=(args.getOpt("sort-alpha")!=NULL); if (args.getOpt("sort-by")!=NULL) { if (gffloader.sortRefsAlpha) GError("Error: options --sort-by and --sort-alpha are mutually exclusive!\n"); sortBy=args.getOpt("sort-by"); } if (!sortBy.is_empty()) gffloader.loadRefNames(sortBy); gffloader.gene2exon=(args.getOpt("gene2exon")!=NULL); gffloader.matchAllIntrons=(args.getOpt('K')==NULL); gffloader.fuzzSpan=(args.getOpt('Q')!=NULL); gffloader.dOvlSET=(args.getOpt('Y')!=NULL); if (args.getOpt('M') || args.getOpt("merge")) { gffloader.doCluster=true; gffloader.collapseRedundant=true; } else { if (!gffloader.matchAllIntrons || gffloader.fuzzSpan || gffloader.dOvlSET) { GMessage("%s",USAGE); GMessage("Error: options -K,-Q,-Y require -M/--merge option!\n"); exit(1); } } if (args.getOpt("cluster-only")) { gffloader.doCluster=true; gffloader.collapseRedundant=false; if (!gffloader.matchAllIntrons || gffloader.fuzzSpan || gffloader.dOvlSET) { GMessage("%s",USAGE); GMessage("Error: option -K,-Q,-Y have no effect with --cluster-only.\n"); exit(1); } } if (gffloader.dOvlSET) gffloader.fuzzSpan=true; //-Q enforced by -Y covInfo=(args.getOpt("cov-info")); if (covInfo) gffloader.doCluster=true; //need to collapse overlapping exons if (fullCDSonly) validCDSonly=true; if (verbose) { fprintf(stderr, "Command line was:\n"); args.printCmdLine(stderr); } if (args.getOpt("version")) { GMessage(VERSION"\n"); exit(0); } gffloader.fullAttributes=(args.getOpt('F')!=NULL); gffloader.keep_AllExonAttrs=(args.getOpt("keep-exon-attrs")!=NULL); if (gffloader.keep_AllExonAttrs && !gffloader.fullAttributes) { GMessage("Error: option --keep-exon-attrs requires option -F !\n"); exit(0); } if (args.getOpt('G')==NULL) gffloader.gatherExonAttrs=!gffloader.fullAttributes; else { 
gffloader.gatherExonAttrs=true; gffloader.fullAttributes=true; } if (gffloader.noPseudo && !gffloader.fullAttributes) { gffloader.gatherExonAttrs=true; gffloader.fullAttributes=true; } ensembl_convert=(args.getOpt('L')!=NULL); if (ensembl_convert) { gffloader.fullAttributes=true; gffloader.gatherExonAttrs=false; //sortByLoc=true; } tableFormat=args.getOpt("table"); if (!tableFormat.is_empty()) { setTableFormat(tableFormat); fmtTable=true; fmtGFF3=false; gffloader.fullAttributes=true; } gffloader.mergeCloseExons=(args.getOpt('Z')!=NULL); multiExon=(args.getOpt('U')!=NULL); writeExonSegs=(args.getOpt('W')!=NULL); tracklabel=args.getOpt('t'); GFastaDb gfasta(args.getOpt('g')); //if (gfasta.fastaPath!=NULL) // sortByLoc=true; //enforce sorting by chromosome/contig GStr s=args.getOpt('i'); if (!s.is_empty()) maxintron=s.asInt(); s=args.getOpt('l'); if (!s.is_empty()) minLen=s.asInt(); FILE* f_repl=NULL; s=args.getOpt('d'); if (!s.is_empty()) { if (s=="-") f_repl=stdout; else { f_repl=fopen(s.chars(), "w"); if (f_repl==NULL) GError("Error creating file %s\n", s.chars()); } } rfltWithin=(args.getOpt('R')!=NULL); s=args.getOpt('r'); if (!s.is_empty()) { s.trim(); if (s[0]=='+' || s[0]=='-') { rfltStrand=s[0]; s.cut(0,1); } int isep=s.index(':'); if (isep>0) { //gseq name given if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) { isep--; rfltStrand=s[isep]; s.cut(isep,1); } if (isep>0) rfltGSeq=Gstrdup((s.substr(0,isep)).chars()); s.cut(0,isep+1); } GStr gsend; char slast=s[s.length()-1]; if (rfltStrand==0 && (slast=='+' || slast=='-')) { s.chomp(slast); rfltStrand=slast; } if (s.index("..")>=0) gsend=s.split(".."); else gsend=s.split('-'); if (!s.is_empty()) rfltStart=(uint)s.asInt(); if (!gsend.is_empty()) { rfltEnd=(uint)gsend.asInt(); if (rfltEnd==0) rfltEnd=MAX_UINT; } } //gseq/range filtering else { if (rfltWithin) GError("Error: option -R requires -r!\n"); //if (rfltWholeTranscript) // GError("Error: option -P requires -r!\n"); } s=args.getOpt('m'); if 
(!s.is_empty()) { FILE* ft=fopen(s,"r"); if (ft==NULL) GError("Error opening reference table: %s\n",s.chars()); loadRefTable(ft, reftbl); fclose(ft); } s=args.getOpt('s'); if (!s.is_empty()) { FILE* fsize=fopen(s,"r"); if (fsize==NULL) GError("Error opening info file: %s\n",s.chars()); loadSeqInfo(fsize, seqinfo); fclose(fsize); } openfw(f_out, args, 'o'); //if (f_out==NULL) f_out=stdout; if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL)) GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n"); openfw(f_w, args, 'w'); openfw(f_x, args, 'x'); openfw(f_y, args, 'y'); s=args.getOpt("w-add"); if (!s.is_empty()) { if (f_w==NULL) GError("Error: --w-add option requires -w option!\n"); wPadding=s.asInt(); } if (f_out==NULL && f_w==NULL && f_x==NULL && f_y==NULL && !covInfo) f_out=stdout; //if (f_y!=NULL || f_x!=NULL) wCDSonly=true; //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL); int numfiles = args.startNonOpt(); //GList gfkept(false,true); //unsorted, free items on delete int out_counter=0; //number of records printed while (true) { GStr infile; if (numfiles) { infile=args.nextNonOpt(); if (infile.is_empty()) break; if (infile=="-") { f_in=stdin; infile="stdin"; } else if ((f_in=fopen(infile, "r"))==NULL) GError("Error: cannot open input file %s!\n",infile.chars()); else fclose(f_in); numfiles--; } else infile="-"; const char* fext=getFileExt(infile.chars()); if (BEDinput || (Gstricmp(fext, "bed")==0)) gffloader.BEDinput=true; if (TLFinput || (Gstricmp(fext, "tlf")==0)) gffloader.TLFinput=true; gffloader.openFile(infile); gffloader.load(g_data, &validateGffRec, &processGffComment); // will also place the transcripts in loci, if doCluster is enabled if (gffloader.doCluster) collectLocusData(g_data, covInfo); if (numfiles==0) break; } if (covInfo) { //report coverage info at STDOUT uint64 f_bases=0; uint64 r_bases=0; uint64 
u_bases=0; for (int g=0;gf_bases; r_bases+=g_data[g]->r_bases; u_bases+=g_data[g]->u_bases; } fprintf(stdout, "Total bases covered by transcripts:\n"); if (f_bases>0) fprintf(stdout, "\t%" PRIu64 " on + strand\n", f_bases); if (r_bases>0) fprintf(stdout, "\t%" PRIu64 " on - strand\n", r_bases); if (u_bases>0) fprintf(stdout, "\t%" PRIu64 " on . strand\n", u_bases); } GStr loctrack("gffcl"); if (tracklabel) loctrack=tracklabel; if (gffloader.sortRefsAlpha) g_data.setSorted(&gseqCmpName); GffPrintMode exonPrinting; if (fmtGTF) exonPrinting = pgtfAny; else if (fmtBED) exonPrinting=pgffBED; else if (fmtTLF) exonPrinting=pgffTLF; else { //printing regular GFF3 exonPrinting = gffloader.forceExons ? pgffBoth : pgffAny; } bool firstGff3Print=fmtGFF3; if (gffloader.doCluster) { //grouped in loci for (int g=0;gseqreg_start>0) fprintf(f_out, "##sequence-region %s %d %d\n", gdata->gseq_name, gdata->seqreg_start, gdata->seqreg_end); for (int l=0;lloci.Count();l++) { bool firstLocusPrint=true; GffLocus& loc=*(gdata->loci[l]); //check all non-replaced transcripts in this locus: int numvalid=0; int idxfirstvalid=-1; for (int i=0;ireplaced_by!=NULL) { if (f_repl && T_DUPSHOWABLE(t.udata)) { fprintf(f_repl, "%s", t.getID()); GTData* rby=tdata; while (rby->replaced_by!=NULL) { fprintf(f_repl," => %s", rby->replaced_by->getID()); T_NO_DUPSHOW(rby->rna->udata); rby=(GTData*)(rby->replaced_by->uptr); } fprintf(f_repl, "\n"); } T_NO_PRINT(t.udata); if (verbose) { GMessage("Info: %s discarded: superseded by %s\n", t.getID(), tdata->replaced_by->getID()); } continue; } //restore strand for dOvlSET char orig_strand=T_OSTRAND(t.udata); if (orig_strand!=0) t.strand=orig_strand; if (process_transcript(gfasta, t)) { numvalid++; if (idxfirstvalid<0) idxfirstvalid=i; } } //for each transcript int rnas_i=0; if (idxfirstvalid>=0) rnas_i=idxfirstvalid; int gfs_i=0; if (f_out) { GStr locname("RLOC_"); locname.appendfmt("%08d",loc.locus_num); //GMessage("Locus: %s (%d-%d), %d rnas, %d gfs\n", 
locname.chars(), loc.start, loc.end, // loc.rnas.Count(), loc.gfs.Count()); while (gfs_i=loc.rnas.Count() || loc.gfs[gfs_i]->start<=loc.rnas[rnas_i]->start) ) { //print the gene object first if (fmtGFF3) { //BED, TLF and GTF: only show transcripts if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (firstGSeqHeader) { printGSeqHeader(f_out, gdata); firstGSeqHeader=false; } if (firstLocusPrint) { loc.print(f_out, idxfirstvalid, locname, loctrack); firstLocusPrint=false; } printGffObj(f_out, loc.gfs[gfs_i], locname, exonPrinting, out_counter); } ++gfs_i; continue; } if (rnas_iprintGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); if (fmtGFF3) { if (firstGff3Print) { printGff3Header(f_out, args); firstGff3Print=false; } if (firstGSeqHeader) { printGSeqHeader(f_out, gdata); firstGSeqHeader=false; } if (firstLocusPrint) { loc.print(f_out, idxfirstvalid, locname, loctrack); firstLocusPrint=false; } } if (fmtTable) printAsTable(f_out, loc.rnas[rnas_i], &out_counter); else printGffObj(f_out, loc.rnas[rnas_i], locname, exonPrinting, out_counter); ++rnas_i; } } } }//for each locus } //for each genomic sequence } //if Clustering enabled else { //no clustering //not grouped into loci, print the rnas with their parents, if any int numvalid=0; for (int g=0;grnas.Count();m++) { GffObj& t=*(gdata->rnas[m]); if (f_out && (fmtGFF3 || fmtTable)) { //print other non-transcript (gene?) 
feature that might be there before t while (gfs_igfs.Count() && gdata->gfs[gfs_i]->start<=t.start) { GffObj& gfst=*(gdata->gfs[gfs_i]); if T_PRINTABLE(gfst.udata) { //never printed T_NO_PRINT(gfst.udata); if (fmtGFF3) { if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (firstGSeqHeader) { printGSeqHeader(f_out, gdata); firstGSeqHeader=false; } gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } else printGxfTab(f_out, gfst); } ++gfs_i; } } GTData* tdata=(GTData*)(t.uptr); if (tdata->replaced_by!=NULL) continue; if (process_transcript(gfasta, t)) { numvalid++; if (f_out && T_PRINTABLE(t.udata) ) { T_NO_PRINT(t.udata); if (fmtGFF3 || fmtTable || t.isTranscript()) { if (tdata->geneinfo) tdata->geneinfo->finalize(); out_counter++; if (fmtGFF3) { if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (firstGSeqHeader) { printGSeqHeader(f_out, gdata); firstGSeqHeader=false; } } //for GFF3 && table output, print the parent first, if any if ((fmtGFF3 || fmtTable) && t.parent!=NULL && T_PRINTABLE(t.parent->udata)) { GTData* pdata=(GTData*)(t.parent->uptr); if (pdata && pdata->geneinfo!=NULL) pdata->geneinfo->finalize(); if (fmtTable) printGxfTab(f_out, *(t.parent)); else t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); T_NO_PRINT(t.parent->udata); } if (fmtTable) printGxfTab(f_out, t); else t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } }//GFF/GTF output requested } //valid transcript } //for each rna //print the rest of the isolated pseudo/gene/region features not printed yet if (f_out && (fmtGFF3 || fmtTable)) { while (gfs_igfs.Count()) { GffObj& gfst=*(gdata->gfs[gfs_i]); if T_PRINTABLE(gfst.udata) { //never printed T_NO_PRINT(gfst.udata); if (fmtGFF3) { if (firstGff3Print) { printGff3Header(f_out, args); firstGff3Print=false; } if (firstGSeqHeader) { printGSeqHeader(f_out, gdata); firstGSeqHeader=false; } gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, 
decodeChars);
            // NOTE(review): the C++ lines down to the closing brace at column 0 are
            // the tail of main(), whose head precedes this line; carried over
            // token-for-token.
            }
            else printGxfTab(f_out, gfst);
          }
          ++gfs_i;
        }
      }
    } //for each genomic seq
  } //no clustering
  if (f_repl && f_repl!=stdout) fclose(f_repl);
  seqinfo.Clear();
  //if (faseq!=NULL) delete faseq;
  //if (gcdb!=NULL) delete gcdb;
  GFREE(rfltGSeq);
  FWCLOSE(f_out);
  FWCLOSE(f_w);
  FWCLOSE(f_x);
  FWCLOSE(f_y);
}
gffread-0.11.7/prep_linux.sh000077500000000000000000000012111361241152500157150ustar00rootroot00000000000000
#!/bin/bash -e
# Builds the Linux x86_64 binary package gffread-<ver>.Linux_x86_64.tar.gz from the
# source package that prep_source.sh assembles. Run from the gffread source directory.
set -e  # shebang flags are not applied when the script is run as "bash prep_linux.sh"

# Version = first double-quoted string on the "#define VERSION" line of gffread.cpp.
# "|| true" lets the explicit check below report a missing line under set -e.
ver=$(grep -F '#define VERSION ' gffread.cpp || true)
ver=${ver#*\"}
ver=${ver%%\"*}
if [ -z "$ver" ]; then
  echo "Error: cannot determine VERSION from gffread.cpp" >&2
  exit 1
fi
srcpack=gffread-$ver
# creates the $srcpack directory and $srcpack.tar.gz, and sets $pack
source prep_source.sh
linpack=$pack.Linux_x86_64
echo "preparing $linpack.tar.gz"
echo "-------------------"
/bin/rm -rf "$linpack"
/bin/rm -f "$linpack.tar.gz"
mkdir "$linpack"
cd "$srcpack"
make clean
make release
cp LICENSE README.md gffread "../$linpack/"
cd ..
tar cvfz "$linpack.tar.gz" "$linpack"
ls -l "$srcpack.tar.gz" "$linpack.tar.gz"
# the remaining lines only print the manual upload steps
echo "scp $linpack.tar.gz $pack.tar.gz salz:~/html/software/stringtie/dl/"
echo ".. then on the server: "
echo "perl -i -pe 's/gffread\-\d\.\d+\.\d+\w?\./gffread-$ver./g' ~/html/software/stringtie/gff*.shtml"
gffread-0.11.7/prep_mac.sh000077500000000000000000000011101361241152500153140ustar00rootroot00000000000000
#!/bin/sh
# Builds the macOS x86_64 binary package gffread-<ver>.OSX_x86_64.tar.gz in the
# current directory.
set -e  # stop at the first failing step, so a failed build is never packaged

# Version = first double-quoted string on the "#define VERSION" line of gffread.cpp.
ver=$(grep -F '#define VERSION ' gffread.cpp || true)
ver=${ver#*\"}
ver=${ver%%\"*}
if [ -z "$ver" ]; then
  echo "Error: cannot determine VERSION from gffread.cpp" >&2
  exit 1
fi
pack=gffread-$ver
macpack=$pack.OSX_x86_64
echo "preparing $macpack.tar.gz"
echo "-------------------"
/bin/rm -rf "$macpack"
/bin/rm -f "$macpack.tar.gz"
mkdir "$macpack"
make clean
make release
cp gffread "$macpack/"
tar cvfz "$macpack.tar.gz" "$macpack"
ls -l "$macpack.tar.gz"
#echo "If you're on igmN machines you can also update the web files:"
echo "scp $macpack.tar.gz salz:~/html/software/stringtie/dl/"
#echo "perl -i -pe 's/gffread\-\d\.\d+\.\d+\./gffread-$ver./g' ~/html/software/gffutils/home.shtml"
gffread-0.11.7/prep_source.sh000077500000000000000000000010631361241152500160630ustar00rootroot00000000000000
#!/bin/sh
# Assembles the source package directory gffread-<ver>/ (with the needed ../gclib
# files under gclib/) and packs it as gffread-<ver>.tar.gz.
# Also sourced by prep_linux.sh, which uses the $pack variable set here.
set -e  # stop at the first failing step instead of packing an incomplete tree

# Version = first double-quoted string on the "#define VERSION" line of gffread.cpp.
ver=$(grep -F '#define VERSION ' gffread.cpp || true)
ver=${ver#*\"}
ver=${ver%%\"*}
if [ -z "$ver" ]; then
  echo "Error: cannot determine VERSION from gffread.cpp" >&2
  exit 1
fi
pack=gffread-$ver
echo " preparing souce $pack.tar.gz"
echo "----------------------------------"
/bin/rm -rf "$pack"
/bin/rm -f "$pack.tar.gz"
mkdir "$pack"
mkdir "$pack/gclib"
libdir=$pack/gclib/
# File lists are spelled out: brace expansion is not available in every /bin/sh.
cp LICENSE README.md gffread.cpp gff_utils.h gff_utils.cpp "$pack/"
# the packaged Makefile looks for gclib inside the package
sed 's|\.\./gclib|./gclib|' Makefile > "$pack/Makefile"
cp ../gclib/GVec.hh ../gclib/GList.hh ../gclib/GHash.hh "$libdir"
for m in GArgs GBase gdna GStr gff codons GFaSeqGet GFastaIndex; do
  cp "../gclib/$m.h" "../gclib/$m.cpp" "$libdir"
done
tar cvfz "$pack.tar.gz" "$pack"
ls -l "$pack.tar.gz"
gffread-0.11.7/tag_git.sh000077500000000000000000000002431361241152500151520ustar00rootroot00000000000000
#!/bin/bash -e
# Checks out master, creates the annotated tag v<ver> (version read from
# gffread.cpp) and pushes the tags.
set -e  # shebang flags are not applied when the script is run as "bash tag_git.sh"

git checkout master
# Version = first double-quoted string on the "#define VERSION" line of gffread.cpp.
ver=$(grep -F '#define VERSION ' gffread.cpp || true)
ver=${ver#*\"}
ver=${ver%%\"*}
if [ -z "$ver" ]; then
  echo "Error: cannot determine VERSION from gffread.cpp" >&2
  exit 1
fi
git tag -a "v$ver" -m "release $ver"
git push --tags