cdbfasta/0000775002442700244270000000000011306017430012476 5ustar gperteagperteacdbfasta/Makefile0000664002442700244270000000347211306016357014153 0ustar gperteagpertea# Useful directories THISCODEDIR := . GCLDIR := ./gclib # Directory where libz.a can be found ZDIR := ../zlib # Directories to search for header files SEARCHDIRS := -I${ZDIR} -I${THISCODEDIR} -I${GCLDIR} SYSTYPE := $(shell uname) MACHTYPE := $(shell uname -m) ifeq ($(MACHTYPE), i686) MARCH = -march=i686 else MARCH = endif CC := g++ BASEFLAGS = -Wall ${SEARCHDIRS} $(MARCH) -DENABLE_COMPRESSION=0 -D_FILE_OFFSET_BITS=64 \ -D_LARGEFILE_SOURCE -fno-exceptions -fno-rtti -fno-strict-aliasing \ -D_REENTRANT ifeq ($(findstring debug,$(MAKECMDGOALS)),) DBGFLAGS = -O2 -DNDEBUG LDFLAGS = else DBGFLAGS = -g -DDEBUG LDFLAGS = -g endif ifeq ($(findstring nommap,$(MAKECMDGOALS)),) CFLAGS = $(DBGFLAGS) $(BASEFLAGS) else CFLAGS = $(DBGFLAGS) $(BASEFLAGS) -DNO_MMAP endif %.o : %.c ${CC} ${CFLAGS} -c $< -o $@ %.o : %.cc ${CC} ${CFLAGS} -c $< -o $@ %.o : %.C ${CC} ${CFLAGS} -c $< -o $@ %.o : %.cpp ${CC} ${CFLAGS} -c $< -o $@ %.o : %.cxx ${CC} ${CFLAGS} -c $< -o $@ # C/C++ linker LINKER := g++ LDFLAGS = -lz #if ENABLE_COMPRESSION is not needed, # LDFLAGS = .PHONY : all all: cdbfasta cdbyank debug: all nommap: all cdbfasta: ./cdbfasta.o ./gcdbz.o $(LIBS) ${GCLDIR}/gcdb.o ${GCLDIR}/GBase.o ${GCLDIR}/GStr.o ${GCLDIR}/GArgs.o ${LINKER} -o $@ ${filter-out %.a %.so, $^} $(LDFLAGS) cdbyank : ./cdbyank.o ./gcdbz.o ${GCLDIR}/gcdb.o ${GCLDIR}/GBase.o ${GCLDIR}/GStr.o ${GCLDIR}/GArgs.o ${LINKER} -o $@ ${filter-out %.a %.so, $^} $(LDFLAGS) # target for removing all object files .PHONY : tidy tidy:: @${RM} core cdbfasta cdbyank *.o ${GCLDIR}/gcdb.o ${GCLDIR}/GBase.o ${GCLDIR}/GStr.o ${GCLDIR}/GArgs.o # target for removing all object files .PHONY : clean clean:: tidy @${RM} core cdbfasta cdbyank *.o ${GCLDIR}/gcdb.o ${GCLDIR}/GBase.o ${GCLDIR}/GStr.o ${GCLDIR}/GArgs.o 
cdbfasta/cdbfasta.cpp0000664002442700244270000006222211263452413014763 0ustar gperteagpertea#include #include #include #include #include "GBase.h" #include "GArgs.h" #include "GHash.hh" #include "gcdb.h" #ifdef ENABLE_COMPRESSION #include "gcdbz.h" #endif #ifdef __WIN32__ #define VERSION "cdbfasta version 0.99w" #else #define VERSION "cdbfasta version 0.99" #endif #define USAGE "Usage:\n\ cdbfasta [-o ] [-r ]\n\ [-z ] [-i] [-m|-n |-f]|-c|-C]\n\ [-w ] [-s ] [-v]\n\ \n\ Creates an index file for records from a multi-fasta file.\n\ By default (without -m/-n/-c/-C option), only the first \n\ space-delimited token from the defline is used as a key.\n\ \n\ is the multi-fasta file to index; \n\ -o the index file will be named ; if not given,\n\ the index filename is database name plus the suffix '.cidx'\n\ -r a string of characters at the beginning of line\n\ marking the start of a record (default: '>')\n\ -Q treat input as fastq format, i.e. with '@' as record delimiter\n\ and with records expected to have at least 4 lines\n\ -z database is compressed into the file \n\ before indexing ( can be \"-\" or \"stdin\" \n\ in order to get the input records from stdin)\n\ -s strip extraneous characters from *around* the space delimited\n\ tokens, for the multikey options below (-m,-n,-f);\n\ Default set is: '\",`.(){}/[]!:;~|><+-\n\ -m (\"multi-key\" option) create hash entries pointing to \n\ the same record for all tokens found in\n\ the defline\n\ -n same as -m, but only takes the first \n\ tokens from the defline\n\ -f indexes *space* delimited tokens (fields) in the defline as given\n\ by LIST of fields or fields ranges (the same syntax as UNIX 'cut')\n\ -w exclude from indexing all the words found\n\ in the file (for options -m, -n and -k)\n\ -i do case insensitive indexing (i.e. 
create additional keys for \n\ all-lowercase tokens used for indexing from the defline \n\ -c for deflines in the format: db1|accession1|db2|accession2|...,\n\ only the first db-accession pair ('db1|accession1') is taken as key\n\ -C like -c, but also subsequent db|accession constructs are indexed,\n\ along with the full (default) token; additionally,\n\ all nrdb concatenated accessions found in the defline \n\ are parsed and stored (assuming 0x01 or '^|^' as separators)\n\ -a accession mode: like -C option, but indexes the 'accession'\n\ part for all 'db|accession' constructs found\n\ -A like -a and -C together (both accessions and 'db|accession'\n\ constructs are used as keys\n\ -v show program version and exit\n" #define ERR_TOOMANYFIELDS "Error: too many fields for -f option\n" //16K initial defline buffer #define KBUFSIZE 0x4000 #ifndef O_BINARY #define O_BINARY 0x0000 #endif #define MAX_KEYLEN 1024 //64K input buffer #define GREADBUF_SIZE 0x010000 typedef void (*addFuncType)(char*, off_t, uint32); // for passing around index data: struct IdxData32 { uint32 fpos; uint32 reclen; }; struct IdxData { off_t fpos; //64bit value on Linux uint32 reclen; }; int IdxDataSIZE=offsetof(IdxData, reclen)+sizeof(uint32); int IdxDataSIZE32=offsetof(IdxData32, reclen)+sizeof(uint32); char ftmp[365]; char fztmp[365]; char record_marker[127]; //record delimiter int record_marker_len=1; int num_recs; int num_keys; off_t last_cdbfpos=0; int compact_plus; //shortcut key and bool acc_mode=false; bool acc_only=false; bool do_compress=false; // compression used bool fastq=false; FILE* zf=NULL; //compressed file handle //store just offset and record length const char* defWordJunk="'\",`.(){}/[]!:;~|><+-"; char* wordJunk=NULL; bool caseInsensitive=false; //case insensitive storage bool useStopWords=false; unsigned int numFields=0; // we have fields[numFields-1]=MAX_UINT as defined in gcdb.h -- // as an indicator of taking every single token in the defline, // or for open ended ranges 
(e.g. -f5- ) unsigned int fields[255]; //array of numFields field indices (1-based) GHash stopList; //static int datalen=sizeof(uint32)+sizeof(off_t); char lastKey[MAX_KEYLEN]; //keep a copy of the last valid written key GCdbWrite* cdbidx; addFuncType addKeyFunc; #define ERR_W_DBSTAT "Error writing the database statististics!\n" void die_write(const char* fname) { GError("Error: cdbhash was unable to write into file %s\n", fname); } void die_read(const char* infile) { GError("Error: cdbhash was unable to read the input file %s\n",infile); } void die_readformat(const char* infile) { GError("Error: bad format for file %s; is it a fastA file?\n", infile); } bool add_cdbkey(char* key, off_t fpos, uint32 reclen) { unsigned int klen=strlen(key); if (fpos==last_cdbfpos && strcmp(key, lastKey)==0) return true; if (klen<1) { GMessage("Warning: zero length key found following key '%s'\n", lastKey); return false; } //------------ adding record ----------------- num_keys++; strncpy(lastKey, key, MAX_KEYLEN-1); lastKey[MAX_KEYLEN-1]='\0'; if ((uint64)fpos>(uint64)MAX_UINT) { //64 bit file offset IdxData recdata; uint64 v= (uint64) fpos; //needed for Solaris' off_t issues with gcc/32 recdata.fpos=gcvt_offt(&v); recdata.reclen=gcvt_uint(&reclen); if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE)==-1) GError("Error adding cdb record with key '%s'\n",key); } else {//32 bit file offset IdxData32 recdata; uint32 v=(uint32) fpos; recdata.fpos=gcvt_uint(&v); recdata.reclen=gcvt_uint(&reclen); //GMessage("Adding 32bit: '%s' reclen=%d\n", key, recdata.reclen); if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE32)==-1) GError("Error adding cdb record with key '%s'\n",key); } last_cdbfpos=fpos; return true; } //default indexing: key directly passed -- // as the first space delimited token void addKey(char* key, off_t fpos, uint32 reclen) { num_recs++; add_cdbkey(key, fpos, reclen); if (caseInsensitive) { char* lckey=loCase(key); if (strcmp(lckey, key)!=0) add_cdbkey(lckey, 
fpos, reclen); GFREE(lckey); } } //the whole defline is passed void addKeyMulti(char* defline, off_t fpos, uint32 reclen) { char* p=defline; unsigned int fieldno=0; char* pn; num_recs++; bool stillParsing=true; unsigned int fidx=0; //index in fields[] array while (stillParsing) { while ((*p)==' ' || (*p)=='\t') p++; if (*p == '\0') break; //skip any extraneous characters at the beginning of the token while (chrInStr(*p, wordJunk)) p++; //skip any padding spaces or other extraneous characters //at the beginning of the word if (*p == '\0') break; pn=p; while (*pn!='\0' && !isspace(*pn)) pn++; //found next space or end of string fieldno++; while (fields[fidx] p=>'%s' [%d, %d, %d] (next='%s')\n",p, numFields, // fieldno, fields[numFields-1], pn+1); stillParsing = (((*pn)!='\0') && (fieldno+1<=fields[numFields-1])); char* pend = pn-1; //pend is on the last non-space in the current token //--strip the ending junk, if any while (chrInStr(*pend, wordJunk) && pend>p) pend--; if (pend0) { if (fields[fidx]==MAX_UINT || fields[fidx]==fieldno) { if (useStopWords && stopList.hasKey(p)) { p=pn+1; continue; } //--- store this key with the same current record data: add_cdbkey(p, fpos, reclen); //---storage code ends here if (caseInsensitive) { char* lcp=loCase(p); if (strcmp(lcp,p)!=0) add_cdbkey(lcp, fpos, reclen); GFREE(lcp); } } //if (isEnd) break; //if all the token were stored } p=pn+1;//p is now on the next token's start } //token parsing loop } int qcmpInt(const void *p1, const void *p2) { //int n1=*((int*)p1); //int n2=*((int*)p2); const unsigned int *a = (unsigned int *) p1; const unsigned int *b = (unsigned int *) p2; if (*a < *b) return -1; else if (*a > *b) return 1; else return 0; } char* parse_dbacc(char* pstart, char*& end_acc, char*& accst) { if (pstart==NULL || *pstart==0) return NULL; bool hasDigits=false; char* pend=pstart; end_acc=NULL; //end of first accession accst=NULL; while (*pstart=='|') pstart++; for(char* p=pstart;;p++) { if (hasDigits==false && *p>='0' 
&& *p<='9') hasDigits=true; /* if (*p==0) { //end of seq_name pend=p; //doesn't matter if it's accession or "db"-like break; }*/ if (*p=='|' || *p==0) { int curlen=p-pstart; if (*p==0 || (hasDigits && curlen>3) || curlen>7 || accst!=NULL) {//previous token was an accession pend=p; //advance pend if (end_acc==NULL) end_acc=p; if (accst==NULL) accst=pstart; break; } else { //first pipe char or no digits accst=p+1; } hasDigits=false;//reset flag } // | separator } if (pend!=pstart) return pend; else return NULL; } char* parseSpToken(char* str) { if (str==NULL) return NULL; char* p=str; while (*p!=' ' && *p!='\t' && *p!='\v' && *p!=0) p++; *p=0; return p; } #define NRDB_CHARSEP "\1\2\3\4" #define NRDB_STRSEP "^|^" //nrdbp is positioned at the end of current nrdb concatenated //defline, or NULL if there is no more inline void NRDB_Rec(char* &nrdbp, char* defline) { nrdbp=strpbrk(defline, NRDB_CHARSEP); if (nrdbp==NULL) { nrdbp=strstr(defline, NRDB_STRSEP); if (nrdbp!=NULL) { *nrdbp='\0'; nrdbp+=2; } } else *nrdbp='\0'; } //-c/-C/-a/-A indexing: key up to the first space or second '|' //receives the full defline void addKeyCompact(char* defline, off_t fpos, uint32 reclen) { //we got the first token found on the defline num_recs++; char* nrdb_end; //breaks defline at the next nrdb concatenation point NRDB_Rec(nrdb_end, defline); //isolate the first token in this nrdb concatenation char* token_end=parseSpToken(defline); if (!compact_plus) { //shortcut key //only the first db|accession construct will be indexed, if found char* end_acc1=NULL; //end of first accession char* acc1st=NULL; char* dbacc_end=parse_dbacc(defline, end_acc1, acc1st); if (end_acc1!=NULL) { //has acceptable shortcut *end_acc1=0; add_cdbkey(defline, fpos, reclen); return; } if (dbacc_end!=NULL) { *dbacc_end=0; add_cdbkey(defline, fpos, reclen); return; } //store this whole non-space token as key: add_cdbkey(defline, fpos, reclen); return; } //from now on only -C/-a/-A treatment: for(;;) { //defline is on 
the first token if (strlen(defline)>0) //add whole non-space token as the "full key" add_cdbkey(defline, fpos, reclen); //add the db|accession constructs as keys char* dbacc_start=defline; char* firstacc_end=NULL; char* accst=NULL; char* dbacc_end=parse_dbacc(dbacc_start, firstacc_end, accst); while (dbacc_end!=NULL) { if (firstacc_end!=NULL && firstacc_end& xhash) { int c; int count=0; char name[256]; int len=0; while ((c=getc(f))!=EOF) { if (isspace(c) || c==',' || c==';') { if (len>0) { name[len]='\0'; xhash.Add(name, new int(1)); count++; len=0; } continue; } //a non-space name[len]=(char) c; len++; if (len>255) { name[len-1]='\0'; GError("Error reading words file: token too long ('%s') !\n",name); } } if (len>0) { name[len]='\0'; xhash.Add(name, new int(1)); count++; } return count; } //========================== MAIN =============================== int main(int argc, char **argv) { FILE* f_read=NULL; off_t fdbsize; int ch; char* zfilename; char* fname; char* marker; //record marker int maxkeys=0; int multikey; record_marker[0]='>'; record_marker[1]=0; GArgs args(argc, argv, "mn:o:r:z:w:f:s:icvQCaA"); int e=args.isError(); if (e>0) GError("%s Invalid argument: %s\n", USAGE, argv[e] ); if (args.getOpt('v')!=NULL) { printf("%s\n",VERSION); return 0; } fastq = (args.getOpt('Q')!=NULL); multikey = (args.getOpt('m')!=NULL); if (multikey) { fields[numFields]=1; numFields++; fields[numFields]=MAX_UINT; numFields++; } caseInsensitive = (args.getOpt('i')!=NULL); acc_only=(args.getOpt('a')!=NULL); acc_mode=(acc_only || args.getOpt('A')!=NULL); compact_plus=(args.getOpt('C')!=NULL || acc_mode); wordJunk = (char *)args.getOpt('s'); if (wordJunk==NULL) wordJunk=(char*)defWordJunk; int compact=(args.getOpt('c')!=NULL || compact_plus); if (compact && multikey) { GError("%s Error: invalid flags combination.\n", USAGE); } char* s = (char*)args.getOpt('n'); if (s!=NULL) { maxkeys = atoi(s); if (maxkeys<=1 || compact || multikey) GError("%s Error: invalid options (-m, -c/C, -n 
and -f are exclusive)\n", USAGE); multikey=1; numFields=maxkeys; if (numFields>254) GError(ERR_TOOMANYFIELDS); for (unsigned int i=1;i<=numFields;i++) fields[i-1]=i; } char* argfields = (char*)args.getOpt('f'); if (argfields!=NULL) { //parse all the field #s char* pbrk; int prevnum=0; char prevsep='\0'; numFields=0; char sep; char *p=argfields; do { pbrk=strpbrk(p,",-"); if (pbrk==NULL) { sep='\0'; pbrk=p+strlen(p); if (prevsep == '-' && *p=='\0' && prevnum>0) { //open ended range -- ending with '-' fields[numFields]=prevnum; numFields++; if (numFields>253) GError(ERR_TOOMANYFIELDS); fields[numFields]=MAX_UINT; numFields++; //GMessage("--- stored %d, %d\n",prevnum, MAX_UINT); break; }// ending with '-' } // '\0' else { sep=*pbrk; *pbrk = '\0'; } int num = atoi(p); if (num<=0 || num>254 ) GError("%s Error: invalid syntax for -f option.\n", USAGE); if (prevsep == '-') { //store a range for (int i=prevnum;i<=num;i++) { fields[numFields]=i; numFields++; if (numFields>254) GError(ERR_TOOMANYFIELDS); } } else if (sep!='-') { fields[numFields]=num; numFields++; if (numFields>254) GError(ERR_TOOMANYFIELDS); } prevsep=sep; prevnum=num; p=pbrk+1; } while (sep != '\0'); //range parsing loop if (numFields<=0 || numFields>254 ) GError("%s Error at parsing -f option.\n", USAGE); //GMessage("[%d] Fields parsed (%d values):\n", sizeof(fields[0]), numFields); qsort(fields, numFields, sizeof(fields[0]), &qcmpInt); multikey=1; /*-- --------debug: for (unsigned int i=0;i126) GError("Error: the specified record delimiter is too long. 
" "Maximum accepted is 126\n"); //special case: hex (0xXX) and octal codes (\XXX) are accepted, only if by themselves if (strlen(marker)==4 && marker[0]=='\\' || (marker[0]=='0' && (toupper(marker[1])=='X'))) { if (marker[0]=='\\') { marker++; v=strtol(marker, NULL, 8); } else v=strtol(marker, NULL, 16); if (v==0 || v>255) GError("Invalid record delimiter: should be only one character,\n" "'\\NNN' (octal value), '0xNN' (hexadecimal value)"); record_marker[0]=v; record_marker_len=1; } else { strcpy(record_marker, marker); record_marker_len=strlen(record_marker); } } } char* stopwords=(char*)args.getOpt('w'); //stop words filename given? if (stopwords!=NULL) { FILE* fstopwords=NULL; if ((fstopwords=fopen(stopwords, "r"))==NULL) GError("Cannot open stop words file '%s'!\n", stopwords); int c=readWords(fstopwords, stopList); GMessage("Loaded %d stop words.\n", c); fclose(fstopwords); useStopWords=(c>0); } if ((zfilename=(char*)args.getOpt('z')) !=NULL) { do_compress=true; #ifndef ENABLE_COMPRESSION GError("Error: compression requested but not enabled when cdbfasta was compiled\n") #endif strcpy(fztmp,zfilename); strcat(fztmp,"_ztmp"); zf=fopen(fztmp,"wb"); if (zf==NULL) GError("Error creating file '%s'\n'", fztmp); } char* outfile=(char*) args.getOpt('o'); int numfiles = args.startNonOpt(); if (numfiles==0) GError("%sError: no fasta file given.\n", USAGE); fname=(char*) args.nextNonOpt(); //first fasta file given if (do_compress) { //-------- compression case ------------------- if (strcmp(fname, "-")==0 || strcmp(fname, "stdin")==0) f_read=stdin; else f_read= fopen(fname, "rb"); if (f_read == NULL) die_read(fname); fname=zfilename; //forget the input file name, keep the output } else {// int fdread= open(fname, O_RDONLY|O_BINARY); if (fdread == -1) die_read(fname); struct stat dbstat; fstat(fdread, &dbstat); fdbsize=dbstat.st_size; close(fdread); f_read= fopen(fname, "rb"); if (f_read == NULL) die_read(fname); } char idxfile[365]; if (outfile==NULL) { if (do_compress) 
{ strcpy(ftmp, zfilename); strcat(ftmp, ".cidx"); strcpy(idxfile, ftmp); strcat(ftmp, "_tmp"); } else { strcpy(ftmp, fname); strcat(ftmp, ".cidx"); strcpy(idxfile, ftmp); strcat(ftmp, "_tmp"); } //should add the process ID, time and user to make this unique? } else { strcpy(ftmp, outfile); strcpy(idxfile, outfile); strcat(ftmp, "_tmp"); } cdbidx=new GCdbWrite(ftmp); //test if this was successful? if (compact) addKeyFunc=&addKeyCompact; else if (multikey) addKeyFunc = &addKeyMulti; else addKeyFunc = &addKey; off_t recpos=0; off_t r=0; unsigned int recsize=0; char* key=NULL; bool fullDefline=(multikey || compact_plus); GReadBuf *readbuf = new GReadBuf(f_read, GREADBUF_SIZE); if (do_compress) { //---------------- compression case ------------- if (fastq) GError("Error: sorry, compression is not supported with fastq format\n"); fdbsize=0; GCdbz cdbz(zf); // zlib interface recpos=cdbz.getZRecPos(); while ((key=cdbz.compress(readbuf,record_marker))!=NULL) { recsize=cdbz.getZRecSize(); if (!fullDefline) { //find first space after the record_marker and place a '\0' there for (int i=record_marker_len; key[i]!='\0';i++) { if (isspace(key[i])) { key[i]='\0';break; } } } addKeyFunc(key, recpos, recsize); recpos = cdbz.getZRecPos(); } remove(zfilename); cdbz.compress_end(); fclose(zf); //determine the size of this file: int ftmp= open(fztmp, O_RDONLY|O_BINARY); if (ftmp == -1) die_read(fztmp); struct stat dbstat; fstat(ftmp, &dbstat); fdbsize=dbstat.st_size; //rename it to the intended file name if (rename(fztmp,zfilename) != 0) { GMessage("Error: unable to rename '%s' to '%s'\n",fztmp,zfilename); perror("rename"); } } //compression requested else { // not compressed -- buffered file access bool defline=false; int kbufsize=KBUFSIZE; if (fullDefline) { GMALLOC(key, KBUFSIZE); }//large defline storage buffer, just in case else { GMALLOC(key, 1024); } int kidx=-1; num_recs=0; num_keys=0; char lastchar=0; //first iteration -- for the beginning of file case if 
(readbuf->peekCmp(record_marker, record_marker_len)==0) { //new record start found (defline) recpos=readbuf->getPos(); //new record pos defline=true; //we're in defline readbuf->skip(record_marker_len); kidx=0; }//new record start int linecounter=0; bool checkNewRec=true; while ((ch=readbuf->getch())>0) { if (defline && kidx>=0) { //on the defline here, still parsing possible keys key[kidx]=(char)ch; kidx++; if (kidx>=kbufsize) { kbufsize+=KBUFSIZE; GREALLOC(key, kbufsize); } if (((isspace(ch) || ch<31) && fullDefline==false) || ch=='\n' || ch=='\r') { //end key here, don't care about the rest key[kidx-1]='\0'; kidx=-1; } } if (ch=='\n') { // newline! //TODO: should this be '\r' on MacOS ?! linecounter++; //check ahead if a record delimiter follows checkNewRec = (!fastq || linecounter>3); if (checkNewRec && readbuf->peekCmp(record_marker, record_marker_len)==0) { //new record start (defline) recsize = readbuf->getPos()-recpos-1; //previous recsize if (recsize>off_t(record_marker_len+1) && key[0]!='\0') { //add previous record, if there addKeyFunc(key, recpos, recsize); linecounter=0; //GMessage("adding key=%s\n",key); } recpos=readbuf->getPos(); //new record pos defline=true; //we're in defline readbuf->skip(record_marker_len); //if (r<0) die_readformat(fname); kidx=0; } //new record start else { //after newline but not a new record start if (defline) { //we just finished a defline if (fullDefline) { //close the defline string if (kidx>0) key[kidx-1]='\0'; kidx=-1; } defline=false; } } //after newline but not a new record } // was newline lastchar=ch; }//while char recsize=readbuf->getPos()-recpos; if (recsize>0) {//add last record, if there if (lastchar=='\n') recsize--; if (fullDefline && kidx>0) {//close the defline string if (lastchar!='\n') kidx++; key[kidx-1]='\0'; } addKeyFunc(key, recpos, recsize); linecounter=0; //GMessage("adding key=%s\n",key); } delete readbuf; } if (f_read!=stdin) fclose(f_read); if (cdbidx->finish() == -1) die_write(""); // === add 
some statistics at the end of the cdb index file! r=lseek(cdbidx->getfd(), 0, SEEK_END); cdbInfo info; memcpy((void*)info.tag, (void*)"CDBX", 4); info.idxflags=0; if (multikey) info.idxflags |= CDBMSK_OPT_MULTI; if (do_compress) { info.idxflags |= CDBMSK_OPT_COMPRESS; GMessage("Input data were compressed into file '%s'\n",fname); } if (compact) { if (compact_plus) info.idxflags |= CDBMSK_OPT_CADD; else info.idxflags |= CDBMSK_OPT_C; } info.num_records=gcvt_uint(&num_recs); info.num_keys=gcvt_uint(&num_keys); info.dbsize=gcvt_offt(&fdbsize); info.idxflags=gcvt_uint(&info.idxflags); int nlen=strlen(fname); info.dbnamelen=gcvt_uint(&nlen); r=write(cdbidx->getfd(), fname, nlen); if (r!=nlen) GError(ERR_W_DBSTAT); r=write(cdbidx->getfd(), &info, cdbInfoSIZE); if (r!=cdbInfoSIZE) GError(ERR_W_DBSTAT); delete cdbidx; GFREE(key); remove(idxfile); if (rename(ftmp,idxfile) == -1) GError("Error: unable to rename %s to %s",ftmp,idxfile); GMessage("%d entries from file %s were indexed in file %s\n", num_recs, fname, idxfile); return 0; } cdbfasta/cdbyank.cpp0000664002442700244270000005455011263126143014632 0ustar gperteagpertea#include "gcdb.h" #include "GBase.h" #include "GArgs.h" #include "ctype.h" #include #include #ifdef ENABLE_COMPRESSION #include "gcdbz.h" #else const char err_COMPRESSION[]="Error: compression requested but not compiled in!\n" #endif #ifdef __WIN32__ #define VERSION "cdbyank version 0.981w" #else #define VERSION "cdbyank version 0.981" #endif #define USAGE "Usage:\n\ cdbyank [-d ] [-a |-n|-l|-s]\n\ [-o ] [-q |-Q][-F] [-R] [-P] [-x] [-w] \n\ [-z \n\n\ is the index file created previously with cdbfasta\n\ (usually having a \".cidx\" suffix)\n\ -a the sequence name (accession) for a fasta record to be\n\ retrieved; if not given, a list of accessions is expected\n\ at stdin\n\ -d is the fasta file to pull records from; \n\ if not specified, cdbyank will look in the same directory\n\ where resides, for a file with the same name\n\ but without the \".cidx\" 
suffix\n\ -o the records found are written to file instead of stdout\n\ -x allows retrieval of multiple records per key, if the indexed \n\ database had records with the same key (non-unique keys);\n\ (without -x only one record for a given key is retrieved)\n\ -i case insensitive query (expects the to have been \n\ created with cdbfasta -i option)\n\ -Q output the query key surrounded by character '%' before the\n\ corresponding record\n\ -q same as -Q but use character instead of '%'\n\ -w enable warnings (sent to stderr) when a key is not found\n\ -F pulls only the defline for each record (discard the sequence)\n\ -P only displays the position(s) (file offset) within the \n\ database file, for the requested record(s)\n\ -R sequence range extraction: expects the input to have \n\ the format: ' '\n\ and pulls only the specified sequence range\n\ -z decompress the entire file \n\ (assumes it was built using cdbfasta with '-z' option)\n\ -v show version number and exit\n\ \n\ Index file statistics (no database file needed):\n\ -n display the number of records indexed\n\ -l list all keys stored in \n\ -s display indexing summary info\n\n" /* -E same as -R but assumes FASTA records have a fixed line length\n\ (faster extraction of distant ranges for long records)\n\ */ #define ERR_READ "cdbyank: error reading from file.\n" #define ERR_READFMT "cdbyank read error: incorrect file format.\n" #define ERR_RANGEFMT "Sequence range parsing error for key '%s'\n" #define ERR_RANGE_INVALID "Invalid range (%d-%d) specified for sequence '%s' of length %d\n" // 1MB memory buffer: #define MAX_MEM_RECSIZE 1048576 #ifndef O_BINARY #define O_BINARY 0x0000 #endif static char* idxfile; static int warnings; bool is_compressed=false; bool defline_only=false; bool rec_pos_only=false; bool use_range=false; bool fixed_linelen=false; bool caseInsensitive=false; bool showQuery=false; char delimQuery='%'; off_t lastfpos=-1; //to avoid pulling the same record twice in a row.. 
FILE* fout=NULL; GCdbRead* cdb=NULL; #ifdef ENABLE_COMPRESSION GCdbz* cdbz=NULL; #endif int fdb=-1; FILE* fz=NULL; void inplace_Lower(char* c) { char *p=c; while (*p!='\0') { *p=tolower(*p);p++; } } void buf_get(GCDBuffer* b, uint32& pos, char *buf, unsigned int len) { int r; while (len > 0) { r = b->get(buf,len); if (r == -1) GError(ERR_READ); if (r == 0) GError(ERR_READFMT); pos += r; buf += r; len -= r; } } void buf_getnum(GCDBuffer* b, uint32& pos, uint32 *num) { char buf[4]; buf_get(b, pos, buf, 4); uint32_unpack(buf,num); } int fetch_record(char* key, char* dbname, int many, int r_start=0, int r_end=0) { //assumes fdb is open, cdb was created on the index file if (caseInsensitive) inplace_Lower(key); int r=cdb->find(key); if (r==0 && warnings) { GMessage("cdbyank: key \"%s\" not found in %s\n", key, idxfile); return 0; } if (r==-1) GError("cdbyank: error searching for key %s in %s\n", key, idxfile); while (r>0) { off_t pos = cdb->datapos(); //position of this key's record in the index file unsigned int len=cdb->datalen(); // length of this key's record char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length if (cdb->read(bytes,len,pos) == -1) GError("cdbyank: error at GCbd::read (%s)!\n", idxfile); off_t fpos; //this will be the fastadb offset uint32 reclen; //this will be the fasta record offset if (len>8) { //64 bit file offset was used fpos=gcvt_offt(bytes); if (rec_pos_only) { fprintf(fout, "%lld\n", fpos); return 1; } reclen=gcvt_uint(&bytes[sizeof(uint32)<<1]); } else { //32bit offset used fpos=gcvt_uint(bytes); if (rec_pos_only) { fprintf(fout, "%lld\n", fpos); return 1; } reclen=gcvt_uint(&bytes[sizeof(uint32)]); } //GMessage("reclen=%d\n", reclen); if (fpos == lastfpos) { if (many) r=cdb->findnext(key, strlen(key)); else r=0; continue; } lastfpos=fpos; if (showQuery) fprintf(fout, "%c%s%c\t", delimQuery, key, delimQuery); if (is_compressed) { #ifdef ENABLE_COMPRESSION //for now: ignore special retrievals, just print the 
whole record cdbz->decompress(fout, reclen, fpos); if (many) r=cdb->findnext(key, strlen(key)); else r=0; #endif continue; } lseek(fdb, fpos, SEEK_SET); if (reclen<=MAX_MEM_RECSIZE) { char* p; GMALLOC(p,reclen+1); //errno=0; r=read(fdb, p, reclen); if (r<=0) GError("cdbyank: Error reading from database file [%s] for %s (returned %d, offset %d) !\n", dbname, idxfile, r, fpos); p[reclen]='\0'; //--- now we have the whole record, check if some special options were given: if (defline_only) { char* q=strchr(p,'\n'); if (q!=NULL) *q='\0'; //skip '>' char fprintf(fout, "%s\n",p+1); } else if (use_range && r_start>0) { //range case if (r_end<=0) r_end=reclen; //extract only a substring of the sequence char* r=strchr(p,'\n'); if (r!=NULL) *r='\0'; //now p only has the defline fprintf(fout, "%s\n", p); //output the defline r++; unsigned int recpos=r-p; //p[recpos] MUST be a nucleotide or aminoacid now! int seqpos=0; char linebuf[61]; int linelen=0; while (recpos=r_start && seqpos<=r_end) { linebuf[linelen]=p[recpos]; linelen++; if (linelen==60 || seqpos==r_end) { linebuf[linelen]='\0'; linelen=0; fprintf(fout, "%s\n", linebuf); if (seqpos==r_end) break; } } recpos++; } }//while if (linelen>0) { linebuf[linelen]='\0'; linelen=0; fprintf(fout, "%s\n", linebuf); } } else { //not range display fprintf(fout, "%s\n",p); } GFREE(p); } //small record else { //large record, read it char by char and return it as output char c='\0'; if (defline_only) { reclen--; read(fdb, &c, 1); } while (reclen-- && read(fdb, &c, 1)==1) { fprintf(fout, "%c", c); if (c=='\n') break; } //defline written if (!defline_only) { int seqpos=1; if (use_range) { while (reclen-- && read(fdb, &c, 1)==1 && seqpos<=r_end) { if (isspace(c)) continue; if (seqpos>=r_start) { int written=seqpos-r_start; if (written && written%60 == 0) fprintf(fout,"\n"); fprintf(fout, "%c", c); } seqpos++; }//while } //range case else { //no range, just copy all chars to output while (reclen-- && read(fdb, &c, 1)==1) { fprintf(fout, 
"%c", c); } } fprintf(fout, "\n"); } } if (many) r=cdb->findnext(key, strlen(key)); else r=0; } return 1; } int read_dbinfo(int fd, char** fnameptr, cdbInfo& dbstat) { //this is messy due to the need of compatibility with the //old 32bit file-length char* dbname=*fnameptr; //read just the tag first: 4 bytes ID lseek(fd, -cdbInfoSIZE, SEEK_END); int r=read(fd, &dbstat, cdbInfoSIZE ); if (r!=cdbInfoSIZE) return 2; //GMessage("Size of dbstat=%d\n", cdbInfoSIZE); if (strncmp(dbstat.oldtag, "CIDX", 4)==0) { //old dbstat structure -- convert it dbstat.num_keys=gcvt_uint(&dbstat.oldnum[0]); dbstat.num_records=gcvt_uint(&dbstat.oldnum[1]); dbstat.dbsize=gcvt_uint(&dbstat.old_dbsize); dbstat.idxflags = gcvt_uint(&dbstat.old_idxflags); //position on the dbnamelen entry dbstat.dbnamelen = gcvt_uint(&dbstat.old_dbnamelen); //GMessage("dbnamelen=%d\n", dbstat.dbnamelen); lseek(fd, -(off_t)(cdbInfoSIZE-4+dbstat.dbnamelen), SEEK_END); } else if (strncmp(dbstat.tag, "CDBX", 4)!=0) { GMessage("Error: this doesn't appear to be a cdbfasta created file!\n"); return 1; } else { // new CDBX type: dbstat.dbsize = gcvt_offt(&dbstat.dbsize); dbstat.num_keys=gcvt_uint(&dbstat.num_keys); dbstat.num_records=gcvt_uint(&dbstat.num_records); dbstat.idxflags = gcvt_uint(&dbstat.idxflags); //position on the dbnamelen entry dbstat.dbnamelen = gcvt_uint(&dbstat.dbnamelen); //GMessage("dbnamelen=%d\n", dbstat.dbnamelen); lseek(fd, -(off_t)(cdbInfoSIZE+dbstat.dbnamelen), SEEK_END); } GMALLOC(dbname, dbstat.dbnamelen+1); dbname[dbstat.dbnamelen]='\0'; r=read(fd, dbname, dbstat.dbnamelen); *fnameptr=dbname; if (r!=dbstat.dbnamelen) return 2; return 0; } int parse_int(FILE* f, char* buf, char* key, int& e) { char* p, *q; while (e!=EOF && isspace(e)) { //skip any spaces if (e=='\n') return 0; //GError(ERR_RANGEFMT, key); e=fgetc(stdin); } if (e==EOF) return 0; //GError(ERR_RANGEFMT, key); //now e is the first non-space p=buf; q=p; while (e!=EOF && !isspace(e)) { *q=e; q++; e=fgetc(stdin); } *q='\0'; //now 
p is the starting coordinate string return atoi(p); //now the file pointer should be on the first space after the parsed value } int parse_int(char*& f, char* key, int& e) { char* p, *q; char buf[16]; while (e!='\0' && isspace(e)) { //skip any spaces //if (e=='\n') GError(ERR_RANGEFMT, key); if (e=='\n') return 0; f++; e=*f; } //if (e=='\0') GError(ERR_RANGEFMT, key); if (e=='\0') return 0; //now e is the first non-space char p=buf; q=p; while (e!='\0' && !isspace(e)) { *q=e; q++; f++; e=*f; } *q='\0'; return atoi(p); //now f and e should be on the first space after the parsed value (or '\0') } #ifdef ENABLE_COMPRESSION GCdbz* openCdbz(char* p) { //in case this was not done before: gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86; FILE* zf=fopen(p, "rb"); if (zf==NULL) { GMessage("Error: cannot open compressed file '%s'!\n",p); return NULL; } //check if the file is valid and read the length of the first record // char ztag[5]; ztag[4]='\0'; if (fread(ztag, 1, 4, zf)<4) { GMessage("Error reading header of compressed file '%s'\n",p); return NULL; } if (strcmp(ztag, "CDBZ")!=0) { GMessage("Error: file '%s' doesn't appear to be a zlib compressed cdb?\n",p); return NULL; } unsigned int zrecsize; if (fread((void*) &zrecsize,1,4,zf)<4) { GMessage("Error reading 1st compressed record size for file '%s'!\n",p); return NULL; } zrecsize=gcvt_uint(&zrecsize); return new GCdbz(zf, true, zrecsize); } #endif int main(int argc, char **argv) { char namebuf[1024]; int r_start, r_end; char* p; char* dbname=NULL; int result=0; int r=0; cdbInfo dbstat; dbstat.dbsize=0; GArgs args(argc, argv, "a:d:o:z:q:nlsxwvFREiPQ"); int e=args.isError(); if (e>0) GError("%s Invalid argument: %s\n", USAGE, argv[e]); if (args.getOpt('v')!=NULL) { printf("%s\n",VERSION); return 0; } char* outfile=(char*)args.getOpt('o'); if (outfile!=NULL) { if ((fout=fopen(outfile, "wb"))==NULL) GError("Cannot create file '%s'!", outfile); } else fout=stdout; if ((p=(char*)args.getOpt('z'))!=NULL) { //simply 
stream-decompress cdbz #ifndef ENABLE_COMPRESSION GError(err_COMPRESSION); #else GCdbz* cdbz=openCdbz(p); if (cdbz==NULL) GError("Error opening the cdbz file '%s'\n"); FILE* zf=cdbz->getZFile(); int numrecs=0; int xcode; while ((xcode=cdbz->decompress(fout))>0) numrecs++; delete cdbz; fclose(zf); #endif return 0; } int numfiles = args.startNonOpt(); if (numfiles==0) GError("%s Error: an index file must be provided !\n", USAGE); idxfile=(char*)args.nextNonOpt(); //first fasta file given char* key=(char*)args.getOpt('a'); defline_only=(args.getOpt('F')!=NULL); rec_pos_only=(args.getOpt('P')!=NULL); showQuery=(args.getOpt('Q')!=NULL); const char* q; if ((q=args.getOpt('q'))!=NULL) { delimQuery=*q; showQuery=true; } use_range=((args.getOpt('R')!=NULL) || (args.getOpt('E')!=NULL)); fixed_linelen=(args.getOpt('E')!=NULL); caseInsensitive=(args.getOpt('i')!=NULL); /*is_compressed=((args.getOpt('Z')!=NULL) || (strstr(idxfile,".cidxz")!=NULL));*/ int listQuery=(args.getOpt('l')!=NULL); warnings=(args.getOpt('w')!=NULL); int dataQuery=(!listQuery && args.getOpt('n')==NULL && args.getOpt('l')==NULL &&args.getOpt('s')==NULL); //exclude the possibility of index-only stats query dbname=(char*)args.getOpt('d'); int fd; cdb=new GCdbRead(idxfile); fd=cdb->getfd(); char* info_dbname=NULL; off_t db_size=0; dbstat.dbsize=0; r=read_dbinfo(fd, &info_dbname, dbstat); lseek(fd, 0, SEEK_SET); if (r==1) GError("This file does not seem to be a cdbfasta generated file.\n"); else if (r==2) GError("Error reading info chunk!\n"); if (dataQuery) { //--------------- DB QUERY MODE: (always read the cdb stored info!) 
/*try to find the database file rules: if given, only the -d given filename is used otherwise: 1) the same directory with the given index file(stripping the suffix) 2) the dbstat filepath/name stored by cdbfasta */ if (!rec_pos_only && dbname==NULL) { // no -d database given, find it // 1) try to rip the suffix: p = rstrchr(idxfile, '.'); if (p!=NULL) { /*GError("%s\ncdbyank error: cannot use %s as an index file. When no -d is\n\ given, so the database file can be located in the same directory \n\ by removing the index file suffix (.cidx)\n", USAGE, idxfile);*/ int nlen=p-idxfile; strncpy(namebuf, idxfile, nlen); namebuf[nlen]='\0'; if (fileExists(namebuf)) dbname=namebuf; } // 2) try the stored dbstat name if (dbname==NULL) { if (fileExists(info_dbname)) dbname=info_dbname; else GError("Cannot locate the database file for this index\n"); } } if (!rec_pos_only) { if (!is_compressed) { if (r==0 && (dbstat.idxflags & CDBMSK_OPT_COMPRESS)) is_compressed=true; } if (is_compressed) { //try to open the dbname as a compressed file #ifndef ENABLE_COMPRESSION GError(err_COMPRESSION); #endif fz=fopen(dbname, "rb"); } else fdb=open(dbname, O_RDONLY|O_BINARY); if (fdb==-1 && fz==NULL) GError("Error: cannot open database file %s\n",dbname); if (is_compressed) { fclose(fz);//just to start fresh here if (use_range) GError("Error: cannot use range extraction with compressed records, sorry.\n"); if (defline_only) GError("Error: cannot use defline-only retrieval with compressed records (sorry).\n"); //determine size: int ftmp = open(dbname, O_RDONLY|O_BINARY); if (ftmp == -1) GError("Error reopening db '%s'?\n",dbname); struct stat fdbstat; fstat(ftmp, &fdbstat); db_size=fdbstat.st_size; close(ftmp); //-------- reopen here #ifdef ENABLE_COMPRESSION cdbz=openCdbz(dbname); if (cdbz==NULL) GError("Error opening the cdbz file '%s'\n"); fz=cdbz->getZFile(); #endif } else { struct stat fdbstat; if (stat(dbname, &fdbstat)!=0) { perror("stat()"); exit(1); } db_size=fdbstat.st_size; } 
//abort if the database size was read and it doesn't match the cdbfasta stored size if (dbstat.dbsize>0 && dbstat.dbsize!=db_size) GError("Error: invalid %d database size - (%lld vs %lld) please rerun cdbfasta for '%s'\n", fdb, dbstat.dbsize, db_size, dbname); } int many=(args.getOpt('x')!=NULL); int keypos=0; if (key==NULL) { //key not given GMALLOC(key, 2048); //get the keys at stdin if (use_range) { //expects the key and its sequence range on a single line! while ((e=fgetc(stdin)) != EOF) { if (isspace(e)) { //word end, close it key[keypos]='\0'; if (keypos==0) continue; r_start=parse_int(stdin, &key[keypos+1], key, e); if (r_start<=0) GError(ERR_RANGEFMT, key); //if (e==EOF || e=='\n') GError(ERR_RANGEFMT, key); r_end=0; r_end=parse_int(stdin, &key[keypos+1], key, e); //if (r_end<=0 || r_end<=r_start) GError(ERR_RANGEFMT, key); fetch_record(key, dbname, many, r_start, r_end); //if (rec_pos_only) break; if (e==EOF) break; keypos=0; } else { //extend the key string key[keypos]=e; keypos++; } } //while } //range case else { //no range, accept any space delimiter while ((e=fgetc(stdin)) != EOF) { if (isspace(e)) { //word end, close it key[keypos]='\0'; fetch_record(key, dbname, many); //if (rec_pos_only) break; keypos=0; } else { //extend the key string key[keypos]=e; keypos++; } } //while } GFREE(key); } //stdin case else { //key given already on command line //get only the first word of it: size_t keylen=strlen(key); p=key; while (!isspace(*p) && *p!='\0') p++; if (*p!='\0') *p='\0'; if (use_range) { //parse the range from the query string if (keylen==strlen(p)) GError(ERR_RANGEFMT, key); p++;e=*p; r_start=parse_int(p, key, e); if (r_start<=0) GError(ERR_RANGEFMT, key); //if (e=='\0' || e=='\n') GError(ERR_RANGEFMT, key); r_end=parse_int(p, key, e); //if (r_end<=0 || r_end<=r_start) GError(ERR_RANGEFMT, key); } else { r_start=0; r_end=0; } if (fetch_record(key, dbname, many, r_start, r_end)==0) result=1; //the only key given not found } //end data query: if 
(!rec_pos_only) { if (is_compressed) { fclose(fz); #ifdef ENABLE_COMPRESSION delete cdbz; #endif } else close(fdb); } if (fout!=NULL) fclose(fout); } //--------------- INDEX ONLY QUERY MODE: else { //index query mode: just retrieve some statistics or key names if (listQuery) { //request for list keys uint32 eod; uint32 pos=0; uint32 klen; uint32 dlen; char* bufspace; GMALLOC(bufspace, GCDBUFFER_INSIZE); GCDBuffer* readbuf=new GCDBuffer((opfunc)&read, fd, bufspace, GCDBUFFER_INSIZE); buf_getnum(readbuf, pos, &eod); GMALLOC(key, 1024); //!!! hopefully we don't have keys larger than that while (pos < 2048) buf_getnum(readbuf, pos, &dlen); while (pos < eod) { buf_getnum(readbuf, pos,&klen); buf_getnum(readbuf, pos,&dlen); //read key: buf_get(readbuf, pos, key, klen); key[klen]='\0'; printf("%s\n", key); //read data (and ignore it) //assume that data is always shorter than 1K (should be just 4 bytes) buf_get(readbuf, pos, key, dlen); } GFREE(key); GFREE(bufspace); delete readbuf; } else { //dig up the info written at the end of the database file if (args.getOpt('n')!=NULL) { printf("%d\n",dbstat.num_records); } else {//must be -s printf("-= Indexing information: =-\n"); printf("Number of records:%12d\n", dbstat.num_records); printf("Number of keys :%12d\n", dbstat.num_keys); if (dbstat.idxflags & CDBMSK_OPT_COMPRESS) printf("Database records are compressed.\n"); if (dbstat.idxflags & CDBMSK_OPT_MULTI) printf("Index was built with \"multi-key\" option enabled.\n"); if (dbstat.idxflags & CDBMSK_OPT_C) printf("Index was built with \"shortcut keys\" only.\n"); else if (dbstat.idxflags & CDBMSK_OPT_CADD) printf("The index was built with full keys and \"shortcut keys\".\n"); printf("Database file: %s\n", info_dbname); printf("Database size: %lld bytes\n", dbstat.dbsize); } } } GFREE(info_dbname); delete cdb; close(fd); //getc(stdin); return result; } cdbfasta/gcdbz.cpp0000664002442700244270000002131011263126143014274 0ustar gperteagpertea#include "gcdbz.h" GCdbz::GCdbz(FILE* 
azf, bool uc, int zrsize) { uncompress=uc; zrecsize=-1; zpos=0; defline_cap=1024; begin_defline(); GMALLOC(defline, defline_cap); zf=azf; // FULL_FLUSH method instead of finish: if (uncompress) decomp_start(zrsize); else compress_start(); } GCdbz::~GCdbz() { //if (zf!=NULL && zf!=stdout && zf!=stdin) fclose(zf); // FULL_FLUSH method instead of finish if (uncompress) decomp_end(); else if (!zclosed) compress_end(); GFREE(defline); } void GCdbz::extend_defline(int ch) { if (defline_len+1 >= defline_cap) { defline_cap+=(defline_cap>>2); GREALLOC(defline, defline_cap); } defline[defline_len]= ch; defline_len++; } #define DUMMY_ZREC ">AA1234567890 DNA protein\n\ ACGTTGCTAGCT\n\ NRMTPYYHEIEP\n\ RTASNTSPTPNS\n\ IKSAHPAEPPKR\n" void GCdbz::compress_start() { //initialize zstream compression zstream.zalloc = (alloc_func)0; //no alloc function to use zstream.zfree = (free_func)0; //no free function to use zstream.opaque = (voidpf)0; //no private object to pass to zalloc/zfree int err=deflateInit(&zstream, Z_DEFAULT_COMPRESSION); if (err!=Z_OK) GError("GCdbz error: deflateInit failed!(err=%d)\n",err); zclosed=false; //write a dummy record as the first record, //so we can use random access (FULL_FLUSH style) later char ztag[5];strcpy(ztag, "CDBZ"); uint32 zsize=0; zstream.next_in = (Bytef*)sbuf; strcpy(sbuf, DUMMY_ZREC); zstream.avail_in=strlen(sbuf); zstream.next_out = (Bytef*)lbuf; zstream.avail_out = GCDBZ_LBUF_LEN; uLong t_out=zstream.total_out; err = deflate(&zstream, Z_FULL_FLUSH); zsize=zstream.total_out-t_out; if ((err !=Z_OK && err!=Z_STREAM_END) || zsize<=0) GError("GCdbz error: deflate 1st record failed! (err=%d)\n", err); //now write the header and the dummy record //in case this was not done before: gcvt_uint=(endian_test())? 
&uint32_sun : &uint32_x86; uint32 zfv = gcvt_uint(&zsize); if (fwrite(ztag, 1, 4, zf)<4 || fwrite(&zfv,1,sizeof(uint32), zf) < sizeof(uint32) || fwrite(lbuf, 1, zsize, zf) < zsize) GError("Error writing 1st deflated record!\n"); zpos+=4+sizeof(uint32)+zsize; } void GCdbz::compress_end() { zstream.next_out = (Bytef*)lbuf; zstream.avail_out = GCDBZ_LBUF_LEN; zstream.avail_in = 0; uLong t_out=zstream.total_out; int err = deflate(&zstream, Z_FINISH); if (err != Z_STREAM_END) { GError("GCdbz error: deflate/Z_FINISH() failed! (err=%d) \n", err); } uLong toWrite=zstream.total_out-t_out; if (toWrite>0) { if (fwrite(lbuf, 1, toWrite, zf)defline or NULL if error encountered //-- WARNING: this subrutine assumes that inf file position // is at the beginning of the record, right AFTER the delim // (exactly as left after a previous call) if (zf==NULL || uncompress) GError("GCdbz Error: cannot use compress() method !\n"); unsigned int total_out=0; int c=0; bool in_rec=true; int delimlen=strlen(delim); zrecsize=0; if ((c=readbuf->peekCmp(delim, delimlen))!=0) { if (c<-1) return NULL; //end of file reached GError("GCdbZ::compress error: delimiter '%s' expected at record start!\n", delim); } bool bol=false; //beginning of line flag int deflate_flag=0; begin_defline(); int rec_pos=0; int err=0; while (in_rec) { // main read loop int bytes_read=0; while ((c=readbuf->getch())>=0) { sbuf[bytes_read++]=c; if (c=='\n' || c=='\r') { //beginning of line bol = true; if (in_defline) end_defline(); //look_ahead for record delimiter: if (readbuf->peekCmp(delim, delimlen)==0) { in_rec=false; break; } } else bol = false; if (rec_pos>delimlen-1 && in_defline) extend_defline(c); rec_pos++; if (bytes_read == GCDBZ_SBUF_LEN) break; }//while not EOF or space in buffer /*if (bytes_read==0) return NULL;*/ if (c==EOF) { in_rec=false; if (in_defline) end_defline(); } zstream.next_in = (Bytef*)sbuf; zstream.avail_in = bytes_read; //deflate_flag = in_rec ? 0 : Z_FINISH; deflate_flag = in_rec ? 
0 : Z_FULL_FLUSH; do { //compression loop zstream.next_out = (Bytef*)lbuf; zstream.avail_out = GCDBZ_LBUF_LEN; uLong t_out=zstream.total_out; err = deflate(&zstream, deflate_flag); if (err !=Z_OK && err!=Z_STREAM_END) GError("GCdbz error: deflate failed! (err=%d)\n", err); uLong toWrite=zstream.total_out-t_out; if (toWrite>0) { if (fwrite(lbuf, 1, toWrite, zf)=0) { if (fseek(zf, zfofs, 0)) GError("GCdbz::decompress: error fseek() to %d\n", zfofs); } else if (feof(zf)) return 0; bool in_rec=true; int err=0; int total_read=0; int total_written=0; while (in_rec) { // main read loop int to_read=0; int bytes_read=0; if (csize<=0) { //read one byte at a time to_read=1; int c; if ((c =fgetc(zf))!=EOF) { bytes_read = 1; sbuf[0]=c; } else { //bytes_read=0; return 0; //eof } total_read+=bytes_read; } else { to_read = csize-total_read>GCDBZ_SBUF_LEN ? GCDBZ_SBUF_LEN : csize-total_read; // check for csize vs bytes_read match: if (to_read==0) return 0; bytes_read=fread(sbuf, 1, to_read, zf); if (bytes_read!=to_read) GError("Error reading from zrec file\n"); total_read+=bytes_read; in_rec=(total_read0) { if (fwrite(lbuf, 1, toWrite, outf) #include class GCdbz { private: char lbuf[GCDBZ_LBUF_LEN]; //larger buffer char sbuf[GCDBZ_SBUF_LEN]; //smaller buffer char* defline; //defline copy storage -- compression only int defline_cap; //currently allocated length of defline int defline_len; //currently used length of defline z_stream zstream; // de/compression stream FILE* zf; //compressed file, could be input or output bool uncompress; // compression or decompression long zpos; //current position in zf int zrecsize; // the size of the compressed record bool in_defline; bool zclosed; // if compress_end() was issued or not! 
void begin_defline() { defline_len=0; in_defline=true; } // initialize the defline storage void extend_defline(int ch); //append character ch to defline //reallocating as necessary void end_defline() { defline[defline_len]='\0'; in_defline=false; } // add \0 public: GCdbz(FILE* af, bool dc = false, int zrsize=0); ~GCdbz(); void compress_start(); void compress_end(); char* compress(GReadBuf *readbuf, char* delim); // returns a pointer to the defline copy or // NULL if nothing was compressed; // (getZRecSize should be called to find out the // actual number of compressed bytes written to zf) int getZRecSize() { return zrecsize; } //to be called AFTER compress() long getZRecPos() { return zpos; } //to be called BEFORE compress() FILE* getZFile() { return zf; } void decomp_start(int zrsize); void decomp_end(); int decompress(FILE* outf, int csize=0, int zfofs=-1); // uncompress csize bytes from zf, from optional offset zfofs, // and send the uncompressed stream to outf }; #endif cdbfasta/README0000644002442700244270000002603711306017455013373 0ustar gperteagperteaCDB (Constant DataBase) indexing and retrieval tools for FASTA files ===================================================================== This is a brief introduction to a couple of platform independent file-based hashing tools (cdbfasta and cdbyank) that can be used for creating indices for quick retrieval of any particular sequences from large multi-FASTA files. The last version has the option to compress data records in order to save space. The index files are now architecture independent, the same index file can be created and used on many different Unix platform (be it 32bit/64bit, big-endian or little-endian architectures) and even Windows. 
1.Install instructions 2.Typical usage 3.Retrieving sequence ranges or only the defline 4.Data compression option 5.Development notes 1.Install instructions =============================== Before running 'make' in the source directory, please take a look at the Makefile and note the following: * GCLDIR must point to the directory containing the gclib source files (should be included in this source package already as a subdirectory) * in order to support record compression, change the BASEFLAGS variable to have -DENABLE_COMPRESSION=1 instead of -DENABLE_COMPRESSION=0 (default is: no compression support) * if compression was enabled, ZDIR should point to the directory where the zlib library (libz.a and all the zlib header files like zlib.h) can be found. This is only needed if your system does not have the zlib library installed already (most systems do). In case you get zlib related errors when you try to compile cdbfasta you might have to download zlib and install/build it in a directory that should then be specified as ZDIR in the Makefile. Running 'make' should produce the binaries 'cdbfasta' (the indexer program) and 'cdbyank' (the query program) in the current directory. 2.Typical usage =============== Use cdbfasta to create the index file for a multi-FASTA file and cdbyank to pull records based on that index file. A usage message is displayed if the commands cdbfasta or cdbyank are run without any parameters. In order to create an index file, only the name of the fasta file must be provided: cdbfasta fasta_file The fasta file can be specified with the whole path (if it's not in the current directory), e.g. cdbfasta /usr/local/db/GUDB.human By default cdbfasta creates an index file with the same name as the database file but with the .cidx suffix added to the original name. So in the example above, a file GUDB.human.cidx will be created in /usr/local/db/. 
The default usage considers the key for a FASTA record to be the first space-delimited token following the ">" starting character from the definition line. For example, if a FASTA record had a defline like this: >AA141526 Then we can use the string 'AA141526' with cdbyank to retrieve the full FASTA record associated to that sequence name: cdbyank -a 'AA141526' /usr/local/db/GUDB.human.cidx Sometimes all the space delimited tokens in the defline need to be declared as keys in the index file, pointing to the same fasta record. This can be accomplished by cdbfasta by using the "-m" switch. For long and complex fastA file accessions like this: EGAD|61|GP|186739|gb|AAA63210.1||M60828 there is an option to create the index file in such a way that there is no need to provide the full string to cdbyank in order to retrieve such a sequence, but only the first "|" pair (i.e. a substring ending at the second '|' character) should be enough. (EGAD|61 in the example above). In order to enable this feature, there are two alternative options for cdbfasta: -c : the index file is built only by storing the "shortcut key" (the first "db|accession" pair found in the defline of each fasta record). In this case, cdbyank will only be able to accept these "shortcut" accessions for record retrieval. -C : the index file is built by storing both the "shortcut key" and the full keys (which are considered to end at the first space character in the defline). In this case, two strings are stored as keys for each fastA record so any of them can be used as an accession for retrieval of the same record with cdbyank. 
In order to retrieve records from the database file, cdbyank should be provided with the name of the index file created previously with cdbfasta, e.g.: cdbyank -a 'human|Z98492' /usr/local/db/GUDB.human.cidx A list of accessions is expected at stdin if -a option is not provided, e.g.: cat seq_list | cdbyank /usr/local/db/GUDB.human.cidx This way the output will be a series of fasta records at stdout. By redirecting this output to a file a multifasta file is obtained. cdbyank locates the database file by stripping the '.cidx' suffix off the index filename. But this is not enforced, because by using the -d option, cdbyank can make use of a user-provided database to be used by the given index file. In the example above, if the index file "GUDB.human.cidx" is moved into another directory, a cdbyank command (in that other directory) can be issued like this: cdbyank -a 'human|Z98492' -d /usr/local/db/GUDB.human GUDB.human.cidx The position of the index file in the list of arguments of cdbyank is not enforced. For the -a usage, the error status returned by cdbyank to the shell will be 1 if the given key was not found and 0 for success. The total number of fasta records indexed and the list of the keys stored in a specific cdb index file can be retrieved with cdbyank's -n and -l switches, respectively. This information is obtained from the index file directly (the database file is not needed for that). There is also a -s option that displays a summary of the indexing information stored in the index at index time. These are the initial name of the fastA file, its size, how the index was created (e.g. was -m (multiple keys) option given? was -c or -C (shortcut keys) option given?), the number of keys stored in the file as well as the number of fasta records indexed - the latter being the same as what -n option returns. 
As an extra feature, cdbfasta and cdbyank can also be used for some special cases where databases may have different records but with the same key (non-unique keys). Although the performance will degrade a little, cdbfasta is able to index this kind of file, but by default cdbyank only outputs the first record found. If you want all the possible records sharing the same key (accession) to be retrieved and displayed, the -x option should be given to cdbyank. 3.Retrieving sequence ranges or only the defline ================================================ There are two cdbyank options added for convenience: -F option returns the definition line of each requested FASTA record (the first line for each record). The -R option of cdbyank is intended for FASTA files containing actual genetic sequences (nucleotide or protein) and expects each of the retrieval commands to have the following format (space delimited): key start_coordinate end_coordinate For example, if we only want to retrieve the sequence range 24...178 (letter numbering starts at 1) from sequence with the name 'human|Z98492', then the cdbyank command would look like this: cdbyank -a 'human|Z98492 24 178' -R GUDB.human.cidx Multiple sequence ranges can be extracted this way by providing a file having each line following the format above (key followed by the two coordinates). Then, as before, such file can be piped into cdbyank with -R option to pull specific sequence ranges for each of the sequences specified in the input file. cat seqlistranges | cdbyank -R GUDB.human.cidx Note that this range option works by actually parsing and looping through the retrieved record characters internally - so the performance is poor when some terminal range is pulled from a very large record. 
4.Data compression option ========================= (This only applies if the programs were built with compression support enabled) The indexing program cdbfasta has the -z option which creates a compressed file from the input file and at the same time creates an index file for this compressed file. The original input file can then be discarded (if it is only needed for random access through cdbyank). The entire input file can be recovered from the resulting compressed file by using the -z option of cdbyank. Because each record is compressed separately, compression is poor if the records are small. Compression is only advised when: * data records are large enough for the compression algorithm to adapt (at least 1KB, the more the better) * only random access is needed to the data records (so the original file can be discarded) The compression can be quite slow for large files and there is also some performance penalty for cdbyank as it has to decompress the retrieved records on the fly. The input data for cdbfasta compression can be collected from stdin if '-' is used instead of a file name: cat my_data_files* | cdbfasta - -z mydata.cdbz This option is useful especially when the total size of input data files is extremely large (over the file-system limits or over the 4GB internal limit of cdbfasta) while the compressed output can be small enough to fall under such limits. With compressed databases cdbyank can be used normally without extra options as it will auto-detect the compression (from the index file info) and activate on-the-fly decompression of the retrieved records. The -F and -R options are not yet accepted when working with compressed records. 
The original C source was (rather crudely) wrapped into C++ classes and adjusted to automatically index fasta records and to create an external index instead of compacting the original data file like the original cdb library code does. Also the "endianness" is now checked at runtime and the bytes are swapped accordingly such that the file offsets and record sizes are always read/written in the same way in the index file. The compression option uses zlib's "deflate" method. The program uses deflate() with Z_FULL_FLUSH after each record, such that random record decompression is possible after the first dummy record is decompressed. The index file contains an info chunk (actually stored at the end of the file) which holds summary data and flags about the indexing process (the -s option of cdbyank retrieves this information). Since the compression option was added, cdbyank always tries to read this information first (before opening the data file) in order to determine if the data records are compressed or not. Please let me know if you notice problems running these tools. -- Geo Pertea gpertea@tigr.org 06/09/2003 7. Copyright ============ Copyright (c) 2002-2003, The Institute for Genomic Research, All Rights Reserved This software is OSI Certified Open Source Software. OSI Certified is a certification mark of the Open Source Initiative. 
cdbfasta/gclib/0000775002442700244270000000000011306016001013547 5ustar gperteagperteacdbfasta/gclib/gcdb.h0000664002442700244270000003000211306015633014624 0ustar gperteagpertea#ifndef __GCDB_H #define __GCDB_H #include #include #include #include #include "GBase.h" #if defined(__WIN32__) || defined(WIN32) #define PROT_READ 1 #define PROT_WRITE 2 #define PROT_READWRITE 3 #define MAP_SHARED 1 #define MAP_PRIVATE 2 #define F_OK 0 #define R_OK 4 #define W_OK 2 #define RW_OK 6 #if !defined(MAP_FAILED) #define MAP_FAILED ((void *) -1) #endif void *mmap(char *,size_t,int,int,int,off_t); int munmap(void *,size_t); #else #include #endif #define MAX_UINT 0xFFFFFFFFUL //===================================================== //------------- buffer stuff ------------------- //===================================================== #define GCDBUFFER_INSIZE 8192 #define GCDBUFFER_OUTSIZE 8192 typedef int (*opfunc)(int, char*, size_t); //typedef unsigned long gcdb_seek_pos; typedef off_t gcdb_seek_pos; typedef unsigned int (*uint_conv)(void*); //uint conversion function pointer typedef off_t (*offt_conv)(void*); //uint conversion function pointer //conversion function --> to platform independent uint extern uint_conv gcvt_uint; extern offt_conv gcvt_offt; int endian_test(void); unsigned int uint32_sun(void* x86int); unsigned int uint32_x86(void* x86int); //for file offsets: off_t runtime conversions: off_t offt_sun(void* offt); off_t offt_x86(void* offt); class GCDBuffer { public: char *x; unsigned int p; unsigned int n; int fd; opfunc op; //methods: GCDBuffer() { x=NULL; fd=0; op=NULL; n=0; //check endianness gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86; gcvt_offt=(endian_test())? &offt_sun : &offt_x86; } GCDBuffer(opfunc aop,int afd,char *buf,unsigned int len) { //check endianness gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86; gcvt_offt=(endian_test())? 
&offt_sun : &offt_x86; init(aop, afd, buf, len); } void init(opfunc aop,int afd,char *buf,unsigned int len) { x=buf; fd=afd; op=aop; p=0; n=len; } int flush(); int write_all(char* buf, unsigned int pt); int put(char* buf,unsigned int len); int putalign(char* buf,unsigned int len); int putflush(char* buf,unsigned int len); int puts(char *buf); int putsalign(char *buf); int putsflush(char *buf); int oneRead(char* buf, unsigned int len); int getthis(char* buf,unsigned int len); int get(char* buf,unsigned int len); int bget(char* buf,unsigned int len); int feed(); char *peek(); void seek(unsigned int len); int copy(GCDBuffer* bin); }; //===================================================== //------------- cdb utils ------------------- //===================================================== #ifndef __WIN32__ extern int errno; #endif extern int error_intr; extern int error_nomem; extern int error_proto; //additional data to be appended to the cdb file: #define CDBMSK_OPT_MULTI 0x00000001 #define CDBMSK_OPT_C 0x00000002 #define CDBMSK_OPT_CADD 0x00000004 #define CDBMSK_OPT_COMPRESS 0x00000008 //creates a compressed version of the database //uses plenty of unions for ensuring compatibility with // the old 'CIDX' info structure //damn, sun and 64bit machines // align this to 64bit -- so sizeof() is misled! #pragma pack(4) // I wish, but stupid gcc 2.95.3 alpha-decosf version does not // recognize this pragma directive !!? 
// struct cdbInfo { uint32 num_keys; union { uint32 num_records; char oldtag[4]; // 'CIDX' for old tag style }; // data file size -- used to be uint32, now it could be 64bit union { off_t dbsize; uint32 oldnum[2]; //num_keys, num_records }; union { uint32 idxflags; uint32 old_dbsize; }; union { int dbnamelen; int old_idxflags; }; // -- the actual db name precedes this fixed-size record union { char tag[4]; //'CDBX' for new files with LFS uint32 old_dbnamelen; }; }; #pragma pack() extern int cdbInfoSIZE; void uint32_pack(char *,uint32); void uint32_pack_big(char *,uint32); void uint32_unpack(char *,uint32 *); void uint32_unpack_big(char *,uint32 *); //===================================================== //------------- cdb index ------------------- //===================================================== #define CDB_HPLIST 1000 struct cdb_hp { uint32 h; uint32 p; } ; struct cdb_hplist { struct cdb_hp hp[CDB_HPLIST]; struct cdb_hplist *next; int num; }; //the index file should always be smaller than 4GB ! 
class GCdbWrite { GCDBuffer* cdbuf; char bspace[8192]; char fname[1024]; char final[2048]; uint32 count[256]; uint32 start[256]; struct cdb_hplist *head; struct cdb_hp *split; /* includes space for hash */ struct cdb_hp *hash; uint32 numentries; uint32 pos; //file position int posplus(uint32 len); int fd; //file descriptor public: //methods: GCdbWrite(int afd); //was: init GCdbWrite(char* fname); ~GCdbWrite(); int addbegin(unsigned int keylen,unsigned int datalen); int addend(unsigned int keylen,unsigned int datalen,uint32 h); int addrec(const char *key,unsigned int keylen,char *data,unsigned int datalen); int add(const char *key, char *data, unsigned int datalen); int getNumEntries() { return numentries; } int finish(); int close(); int getfd() { return fd; } char* getfile() { return fname; } }; //===================================================== //------------- cdb ------------------- //===================================================== #define CDB_HASHSTART 5381 uint32 cdb_hashadd(uint32,unsigned char); uint32 cdb_hash(const char *,unsigned int); class GCdbRead { uint32 size; // initialized if map is nonzero uint32 loop; // number of hash slots searched under this key uint32 khash; // initialized if loop is nonzero uint32 kpos; // initialized if loop is nonzero uint32 hpos; // initialized if loop is nonzero uint32 hslots; // initialized if loop is nonzero uint32 dpos; // initialized if cdb_findnext() returns 1 uint32 dlen; // initialized if cdb_findnext() returns 1 char fname[1024]; char *map; // 0 if no map is available int fd; public: //methods: GCdbRead(int fd); //was cdb_init GCdbRead(char* afname); //was cdb_init ~GCdbRead(); //was cdb_free int read(char *,unsigned int,uint32); int match(const char *key, unsigned int len, uint32 pos); void findstart() { loop =0; } int findnext(const char *key,unsigned int len); int find(const char *key); int datapos() { return dpos; } int datalen() { return dlen; } int getfd() { return fd; } char* getfile() { return 
fname; } }; class GReadBuf { protected: FILE* f; uchar* buf; int buflen; int bufused; // int bufpos; off_t fpos; bool eof; bool eob; int refill(bool repos=false) { //refill the buffer----------- if (repos && bufpos==0) return 0; //no need to repos if (eof) return 0; int fr=0; if (repos && bufposreturns the number of bytes read int get(uchar *outbuf, int len) { if (eob) return 0; int rd=0; //bytes read while (!eob && rd=bufused) { if (eof) eob=true; else refill(); } }//while return rd; } uchar* getStr(uchar *outbuf, int len) { int rd=get(outbuf,len); if (rd==0) return NULL; else { outbuf[rd]='\0'; return outbuf; } } // getc equivalent int getch() { if (eob) return -1; int ch=(int)(uchar)buf[bufpos]; bufpos++; if (bufpos>=bufused) { if (eof) eob=true; else refill(); } return ch; } //--- bool isEof() { return eob; } bool ended() { return eob; } off_t getPos() { //returns the virtual file position // = the actual file offset of the byte at bufpos return fpos-(bufused-bufpos); } //skip into the stream the specified number of bytes int skip(int skiplen) { if (eob) return 0; int r=0; //the actual number of bytes skipped while (skiplen && !eob) { int dif=GMIN(bufused-bufpos,skiplen); skiplen-=dif; bufpos+=dif; r+=dif; if (bufpos>=bufused) { if (eof) { eob=true; return r; } refill(); } } return r; } //look ahead without updating the read pointer (bufpos) //Cannot peek more than buflen! 
int peek(uchar* outbuf, int len) { if (eob) return -1; //if (eob || len>buflen) return -1; if (len>bufused-bufpos) refill(true); int mlen=GMIN((bufused-bufpos),len); memcpy((void*)outbuf, (void*)(buf+bufpos), mlen); return mlen; } uchar* peekStr(uchar* outbuf, int len) { int rd=peek(outbuf,len); if (rd>0) { outbuf[rd]='\0'; return outbuf; } else return NULL; } //looks ahead to check if what follows matches int peekCmp(char* cmpstr, int cmplen=0) { if (eob) //GError("GReadBuf::peekcmp error: eob!\n"); return -2; if (!cmplen) cmplen=strlen(cmpstr); if (cmplen>bufused-bufpos) { refill(true); if (cmplen>bufused-bufpos) return -2; } //use memcmp return memcmp((void*)(buf+bufpos), cmpstr, cmplen); } }; //circular line buffer, with read-ahead (peeking) capability class GReadBufLine { protected: struct BufLine { off_t fpos; int len; char* chars; }; int bufcap; //total number of lines in the buf array int bufidx; // the "current line" index in buf array bool isEOF; int lno; FILE* file; off_t filepos; //current file/stream offset for the first char of buf[bufidx] BufLine* buf; //array of bufferred lines char* readline(int idx);//read line from file into the buffer int fillbuf(); bool isEOB; public: const char* line(); //gets current line and advances the "current line" pointer //use putLine() to revert/undo this advancement off_t fpos(); //gets current line's byte offset in the file // does NOT advance the "current line" pointer int len(); //gets current line's length // does NOT advance the "current line" pointer bool isEof() { return isEOB; } bool eof() { return isEOB; } off_t getfpos() { return fpos(); } const char* getline() { return line(); } const char* getLine() { return line(); } int getLen() { return len(); } int linenumber() { return lno; } int lineno() { return lno; } int getLineNo() { return lno; } void putLine(); GReadBufLine(FILE* stream, int bcap=20) { if (bcap<2) bcap=2; //at least 1 prev line is needed for putLine() bufcap=bcap; bufidx=-1; isEOB=false; 
isEOF=false; lno=0; GMALLOC(buf, bufcap * sizeof(BufLine)); for (int i=0;i=fCount) GError(SLISTINDEX_ERR, __FILE__,__LINE__, x) #else #define SLISTINDEX_ERR "GList error:Invalid list index: %d" #define TEST_INDEX(x) \ if (x<0 || x>=fCount) GError(SLISTINDEX_ERR, x, __FILE__,__LINE__) #endif #define SLISTCAPACITY_ERR "GList error: invalid capacity: %d" #define SLISTCOUNT_ERR "GList error: invalid count: %d" #define SLISTSORTED_ERR "Operation not allowed on a sorted list!" #define SLISTUNSORTED_ERR "Operation not allowed on an unsorted list!" // ------ macros: #define BE_UNSORTED if (fCompareProc!=NULL) { GError(SLISTSORTED_ERR); return; } #define BE_SORTED if (fCompareProc==NULL) { GError(SLISTUNSORTED_ERR); return; } #define MAXLISTSIZE INT_MAX-1 #define SORTED (fCompareProc!=NULL) #define UNSORTED (fCompareProc==NULL) #define FREEDATA (fFreeProc!=NULL) /* #define TEST_INDEX(x) assert(x>=0 && x=fCount) GError(SLISTINDEX_ERR, x) */ //template for array of objects template class GArray { protected: OBJ* fArray; int fCount; int fCapacity; bool fUnique; static int DefaultCompareProc(OBJ& item1, OBJ& item2) { //the comparison operators MUST be defined for OBJ class! if ( item1 > item2) return 1; else return (item2 > item1) ? -1 : 0 ; } public: typedef int CompareProc(OBJ& item1, OBJ& item2); protected: CompareProc* fCompareProc; void idxInsert(int idx, OBJ& item); void Grow(); void Grow(int idx, OBJ& item); void qSort(int L, int R); public: GArray(CompareProc* cmpFunc=NULL); GArray(bool sorted, bool unique=false); GArray(int init_capacity, bool sorted, bool unique=false); GArray(GArray& array); //copy constructor const GArray& operator=(GArray& array); virtual ~GArray(); //assignment operator void setSorted(CompareProc* cmpFunc); //sort the array if cmpFunc not NULL or changes void Reverse(); //WARNING: will break the sort order if SORTED! 
int Add(OBJ* item); // specific implementation if sorted int Add(OBJ& item) { return Add(&item); } //both will CREATE a new OBJ and COPY to it // using OBJ new operator= void Add(GArray& list); //add copies of all items from another list OBJ& Get(int idx) { TEST_INDEX(idx); return fArray[idx]; } OBJ& operator[](int i) { TEST_INDEX(i); return fArray[i]; } void Clear(); void Delete(int index); void Exchange(int idx1, int idx2); int Capacity() { return fCapacity; } int Unique() { return fUnique; } //this will reject identical items in sorted lists only! void setUnique(bool beUnique) { fUnique = beUnique; }; void setCapacity(int NewCapacity); int Count() { return fCount; } void setCount(int NewCount); void Sort(); //explicit sort may be requested bool Sorted() { return fCompareProc!=NULL; } int IndexOf(OBJ& item); //this needs the == operator to have been defined for OBJ bool Found(OBJ& item, int& idx); // for sorted arrays only; //search by content; if found, returns true and idx will be the index //of the first item found matching for which CompareProc returns 0 bool Exists(OBJ& item); //same as above without existing index info //unsorted only, place item at position idx: void Insert(int idx, OBJ* item); void Insert(int idx, OBJ& item) { Insert(idx,&item); } void Replace(int idx, OBJ& item); //Put, use operator= to copy void Move(int curidx, int newidx); }; //------- template for array of pointers to objects --------- template class GList { protected: OBJ** fList; //pointer to an array of pointers to objects int fCount; //total number of entries in list int fCapacity; //current allocated size bool fUnique; GCompareProc* fCompareProc; //a pointer to a Compare function GFreeProc* fFreeProc; //useful for deleting objects static int DefaultCompareProc(const pointer item1, const pointer item2) { //the comparison operators MUST be defined for OBJ class! 
if (*((OBJ*)item1) > *((OBJ*)item2)) return 1; else if (*((OBJ*)item2) > *((OBJ*)item1)) return -1; else return 0; } void Expand(); void Grow(); void QuickSort(int L, int R); public: void sortInsert(int idx, OBJ* item); static void DefaultFreeProc(pointer item) { delete (OBJ*)item; } GList(GCompareProc* compareProc=NULL); //free by default GList(GCompareProc* compareProc, //unsorted by default GFreeProc *freeProc, bool beUnique=false); GList(bool sorted, bool free_elements=true, bool beUnique=false); GList(int init_capacity, bool sorted, bool free_elements=true, bool beUnique=false); GList(GList& list); //copy constructor? GList(GList* list); //kind of a copy constructor virtual ~GList(); void Reverse(); //reverse pointer array; WARNING: will break the sort order if sorted! void freeItem(int idx); void setSorted(GCompareProc* compareProc); //sorted if compareProc not NULL; sort the list if compareProc changes ! void setFreeItem(GFreeProc *freeProc) { fFreeProc=freeProc; } void setFreeItem(bool doFree) { if (doFree) fFreeProc=DefaultFreeProc; else fFreeProc=NULL; } bool Sorted() { return fCompareProc!=NULL; } void setSorted(bool sorted) { if (sorted) { if (fCompareProc!=&DefaultCompareProc) { fCompareProc=&DefaultCompareProc; Sort(); } } else fCompareProc=NULL; } int Add(OBJ* item); //-- specific implementation if sorted void Add(GList& list); //add all pointers from another list OBJ* AddIfNew(OBJ* item, bool deleteIfFound=true, int* fidx=NULL); // default: delete item if Found() (and pointers are not equal)! 
//returns the equal (==) object if it's in the list already //or the item itself if it is unique, and it addsit // -- stack usage: int Push(OBJ* item) { return Add(item); } OBJ* Pop();// Stack use; removes and returns last item,but does NOT FREE it OBJ* Shift(); //Queue use: removes and returns first item, but does NOT FREE it void Clear(); void Delete(int index); void Forget(int idx); void Exchange(int idx1, int idx2); OBJ* First() { return (fCount>0)?fList[0]:NULL; } OBJ* Last() { return (fCount>0)?fList[fCount-1]:NULL;} bool isEmpty() { return fCount==0; } bool notEmpty() { return fCount>0; } int Capacity() { return fCapacity; } int Unique() { return fUnique; } //this will reject identical items in sorted lists only! void setUnique(bool beUnique) { fUnique = beUnique; }; void setCapacity(int NewCapacity); int Count() { return fCount; } void setCount(int NewCount); GCompareProc* GetCompareProc() {return fCompareProc;} OBJ* Get(int idx); OBJ* operator[](int i); void Grow(int idx, OBJ* item); int IndexOf(OBJ* item); //this has a specific implementation for sorted lists //if list is sorted, item data is located by binary search //based on the Compare function //if not, a linear search is performed, but //this needs the == operator to have been defined for OBJ bool Found(OBJ* item, int & idx); // sorted only; //search by content; if found, returns true and idx will be the index //of the first item found matching for which GTCompareProc returns 0 bool Exists(OBJ* item); //same as above without existing index info bool Exists(OBJ& item); //same as above without existing index info void Insert(int idx, OBJ* item); //unsorted only, place item at position idx void Move(int curidx, int newidx); void Put(int idx, OBJ* item, bool re_sort=false); int Remove(OBJ* item); //search for pointer, using binary search if sorted int RemovePtr(OBJ* item); //always use linear search to find the pointer! 
void Pack(); void Sort(); //explicit sort may be requested using this function const GList& operator=(GList& list); }; //basic template for a Stack of pointers template class GStack { protected: struct StackOBJ { OBJ* obj; StackOBJ* prev; }; int fCount; //total number of elements in stack StackOBJ* base; StackOBJ* top; public: GStack(OBJ* po=NULL) { base=NULL; top=NULL; fCount=0; if (po!=NULL) Push(po); } ~GStack() { while (fCount>0) Pop(); } bool isEmpty() { return fCount==0; } int Size() { return fCount; } int Count() { return fCount; } OBJ* Pop() { if (top==NULL) return NULL; fCount--; StackOBJ* ctop=top; if (top==base) base=NULL; OBJ* r=top->obj; top=top->prev; GFREE(ctop); return r; } OBJ* Push(OBJ* o) { fCount++; StackOBJ* ctop=top; //could be NULL GMALLOC(top, sizeof(StackOBJ)); top->obj=o; top->prev=ctop; if (base==NULL) base=top; return o; } OBJ* Top() { return ((top==NULL)? NULL : top->obj); } OBJ* Base() { return ((base==NULL)? NULL : base->obj); } }; //-------------------- TEMPLATE IMPLEMENTATION------------------------------- template GArray::GArray(GArray& array) { //copy constructor fCount=array.fCount; fCapacity=array.fCapacity; if (fCapacity>0) { GMALLOC(fArray, fCapacity*sizeof(OBJ)); } fUnique=array.fUnique; fCompareProc=array.fCompareProc; fCount=array.fCount; // uses OBJ operator= for (int i=0;i const GArray& GArray::operator=(GArray& array) { if (&array==this) return *this; Clear(); fCount=array.fCount; fUnique=array.fUnique; fCapacity=array.fCapacity; if (fCapacity>0) { GMALLOC(fArray, fCapacity*sizeof(OBJ)); } fCompareProc=array.fCompareProc; fCount=array.fCount; // uses OBJ operator= for (int i=0;i GArray::GArray(CompareProc* cmpFunc) { fCount=0; fCapacity=0; fArray=NULL; fCompareProc = cmpFunc; fUnique = false; //only affects sorted lists } template GArray::GArray(bool sorted, bool unique) { fCount=0; fCapacity=0; fArray=NULL; fUnique=unique; fCompareProc=sorted? 
&DefaultCompareProc : NULL; } template GArray::GArray(int init_capacity, bool sorted, bool unique) { fCount=0; fCapacity=0; fArray=NULL; fUnique=unique; fCompareProc=sorted? &DefaultCompareProc : NULL; setCapacity(init_capacity); } template GArray::~GArray() { Clear();//this will free the items if fFreeProc is defined } template void GArray::setCapacity(int NewCapacity) { if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE) GError(SLISTCAPACITY_ERR, NewCapacity); //error: capacity not within range if (NewCapacity!=fCapacity) { if (NewCapacity==0) { GFREE(fArray); } else { GREALLOC(fArray, NewCapacity*sizeof(OBJ)); } fCapacity=NewCapacity; } } template void GArray::Clear() { CompareProc* fcmp=fCompareProc; fCompareProc=NULL; setCount(0); setCapacity(0); //so the array itself is deallocated too! fCompareProc=fcmp; } template void GArray::setSorted(CompareProc* cmpFunc) { CompareProc* old_proc=fCompareProc; fCompareProc=cmpFunc; if (fCompareProc!=old_proc && fCompareProc!=NULL) Sort(); //new compare method } template void GArray::Grow() { int delta; if (fCapacity > 64) delta = fCapacity/4; else if (fCapacity > 8) delta = 16; else delta = 4; setCapacity(fCapacity + delta); } template void GArray::Reverse() { int l=0; int r=fCount-1; OBJ c; while (l void GArray::Grow(int idx, OBJ& item) { int delta; if (fCapacity > 64) delta = fCapacity/4; else if (fCapacity > 8) delta = 16; else delta = 4; int NewCapacity=fCapacity+delta; if (NewCapacity <= fCount || NewCapacity >= MAXLISTSIZE) GError(SLISTCAPACITY_ERR, NewCapacity); //error: capacity not within range if (NewCapacity!=fCapacity) { if (NewCapacity==0) GFREE(fArray); else { //add the new item if (idx==fCount) { //append item GREALLOC(fArray, NewCapacity*sizeof(OBJ)); fArray[idx]=item; } else { //insert item at idx OBJ* newList; GMALLOC(newList, NewCapacity*sizeof(OBJ)); //copy data before idx memmove(&newList[0],&fArray[0], idx*sizeof(OBJ)); newList[idx]=item; // operator= //copy data after idx 
memmove(&newList[idx+1],&fArray[idx], (fCount-idx)*sizeof(OBJ)); memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ)); //data copied: GFREE(fArray); fArray=newList; } fCount++; } fCapacity=NewCapacity; } } template int GArray::IndexOf(OBJ& item) { int result=0; if (Found(item, result)) return result; else return -1; } template bool GArray::Exists(OBJ& item) { int result=0; if (Found(item, result)) return true; else return false; } template int GArray::Add(OBJ* item) { if (item==NULL) return -1; int result; if (SORTED) { if (Found(*item, result)) if (fUnique) return -1; //cannot add a duplicate! //Found sets result to the position where the item should be! idxInsert(result, *item); } else { if (fUnique && Found(*item,result)) return -1; //set behaviour result = fCount; if (result==fCapacity) Grow(); fArray[result] = *item; //operator=, copies the item fCount++; } return result; } template void GArray::Add(GArray& list) { if (list.Count()==0) return; if (SORTED) { for (int i=0;i bool GArray::Found(OBJ& item, int& idx) { //search the list by using CompareProc (if defined) //or == operator for a non-sortable list //for sorted lists, even when the result is false, the idx is //set to the closest matching object! int i; idx=-1; if (fCount==0) { idx=0;return false;} if (SORTED) { //binary search based on CompareProc //do the simplest tests first: if ((*fCompareProc)(fArray[0],item)>0) { idx=0; return false; } if ((*fCompareProc)(item, fArray[fCount-1])>0) { idx=fCount; return false; } int l=0; int h = fCount - 1; int c; while (l <= h) { i = (l + h) >> 1; c = (*fCompareProc)(fArray[i], item); if (c < 0) l = i + 1; else { h = i - 1; if (c == 0) { //found! idx=i; return true; } } } //while idx = l; return false; } else {//not sorted: use linear search // needs == operator to compare user defined objects ! 
i=0; while (i void GArray::idxInsert(int idx, OBJ& item) { //idx must be the new position this new item must have //so the allowed range is [0..fCount] //the old idx item all the above will be shifted to idx+1 if (idx<0 || idx>fCount) GError(SLISTINDEX_ERR, idx); if (fCount==fCapacity) { //need to resize Grow(idx, item); //expand and also copy/move data and insert the new item return; } //move data around to make room for the new item if (idx void GArray::Insert(int idx, OBJ* item) { //idx can be [0..fCount] so an item can be actually added BE_UNSORTED; //forbid this operation on sorted data idxInsert(idx, item); } template void GArray::Move(int curidx, int newidx) { BE_UNSORTED; //cannot do this in a sorted list! if (curidx!=newidx || newidx>=fCount) GError(SLISTINDEX_ERR, newidx); OBJ tmp=fArray[curidx]; //copy constructor here fArray[curidx]=fArray[newidx]; fArray[newidx]=tmp; } template void GArray::Replace(int idx, OBJ& item) { TEST_INDEX(idx); fArray[idx]=item; if ( SORTED ) Sort(); //re-sort ! 
} template void GArray::Delete(int index) { TEST_INDEX(index); //fArray[index]=NULL; fCount--; if (index void GArray::setCount(int NewCount) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(SLISTCOUNT_ERR, NewCount); if (NewCount > fCapacity) setCapacity(NewCount); if (NewCount > fCount) memset(&fArray[fCount], 0, (NewCount - fCount) * sizeof(OBJ)); fCount = NewCount; } template void GArray::qSort(int l, int r) { int i, j; OBJ p,t; do { i = l; j = r; p = fArray[(l + r) >> 1]; do { while (fCompareProc(fArray[i], p) < 0) i++; while (fCompareProc(fArray[j], p) > 0) j--; if (i <= j) { t = fArray[i]; fArray[i] = fArray[j]; fArray[j] = t; i++; j--; } } while (i <= j); if (l < j) qSort(l, j); l = i; } while (i < r); } template void GArray::Sort() { if (fArray!=NULL && fCount>0 && fCompareProc!=NULL) qSort(0, fCount-1); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //*=> GList implementation -- sortable array of pointers to OBJ template OBJ* GList::operator[](int i) { TEST_INDEX(i); return fList[i]; } template GList::GList(GList& list) { //copy constructor fCount=list.fCount; fUnique=list.fUnique; fCapacity=list.fCapacity; if (fCapacity>0) { GMALLOC(fList, fCapacity*sizeof(OBJ*)); } fCompareProc=list.fCompareProc; fFreeProc=list.fFreeProc; fCount=list.fCount; memcpy(fList, list.fList, fCount*sizeof(OBJ*)); //for (int i=0;i GList::GList(GList* plist) { //another copy constructor fCount=0; fCapacity=plist->fCapacity; if (fCapacity>0) { GMALLOC(fList, fCapacity*sizeof(OBJ*)); } fUnique=plist->fUnique; fCompareProc=plist->fCompareProc; fFreeProc=plist->fFreeProc; fCount=plist->fCount; memcpy(fList, plist->fList, fCount*sizeof(OBJ*)); //for (int i=0;ifCount;i++) Add(plist->Get(i)); } template void GList::Add(GList& list) { if (list.Count()==0) return; if (SORTED) { for (int i=0;i GList::GList(GCompareProc* compareProc, GFreeProc* freeProc, bool beUnique) { fCount=0; fCapacity=0; fList=NULL; fCompareProc = compareProc; fFreeProc = freeProc; fUnique = 
beUnique; //only affects sorted lists } template GList::GList(GCompareProc* compareProc) { fCount=0; fCapacity=0; fList=NULL; fCompareProc = compareProc; fFreeProc = &DefaultFreeProc; fUnique = false; //only affects sorted lists } template void GList::Reverse() { int l=0; int r=fCount-1; OBJ* c; while (l GList::GList(bool sorted, bool free_elements, bool beUnique) { fCount=0; fCapacity=0; fList=NULL; if (sorted) { if (free_elements) { fCompareProc=&DefaultCompareProc; fFreeProc=&DefaultFreeProc; fUnique=beUnique; } else { fCompareProc=&DefaultCompareProc; fFreeProc=NULL; fUnique=beUnique; } } else { if (free_elements) { fCompareProc=NULL; fFreeProc=&DefaultFreeProc; fUnique=beUnique; } else { fCompareProc=NULL; fFreeProc=NULL; fUnique=beUnique; } } } template GList::GList(int init_capacity, bool sorted, bool free_elements, bool beUnique) { fCount=0; fCapacity=0; fList=NULL; if (sorted) { if (free_elements) { fCompareProc=&DefaultCompareProc; fFreeProc=&DefaultFreeProc; fUnique=beUnique; } else { fCompareProc=&DefaultCompareProc; fFreeProc=NULL; fUnique=beUnique; } } else { if (free_elements) { fCompareProc=NULL; fFreeProc=&DefaultFreeProc; fUnique=beUnique; } else { fCompareProc=NULL; fFreeProc=NULL; fUnique=beUnique; } } setCapacity(init_capacity); } template GList::~GList() { Clear();//this will free the items if fFreeProc is defined } template void GList::setCapacity(int NewCapacity) { if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE) GError(SLISTCAPACITY_ERR, NewCapacity); //error: capacity not within range if (NewCapacity!=fCapacity) { if (NewCapacity==0) GFREE(fList); else GREALLOC(fList, NewCapacity*sizeof(OBJ*)); fCapacity=NewCapacity; } } template void GList::freeItem(int idx) { TEST_INDEX(idx); (*fFreeProc)(fList[idx]); fList[idx]=NULL; } template void GList::Clear() { if (FREEDATA) { for (int i=0; i void GList::Exchange(int idx1, int idx2) { BE_UNSORTED; //cannot do that in a sorted list! 
TEST_INDEX(idx1); TEST_INDEX(idx2); OBJ* item=fList[idx1]; fList[idx1]=fList[idx2]; fList[idx2]=item; } template void GList::Expand() { if (fCount==fCapacity) Grow(); //return this; } template OBJ* GList::Get(int idx) { TEST_INDEX(idx); return fList[idx]; } template const GList& GList::operator=(GList& list) { if (&list!=this) { Clear(); fCompareProc=list.fCompareProc; fFreeProc=list.fFreeProc; //Attention: the object pointers are copied directly, //but the actual objects are NOT duplicated for (int i=0;i void GList::setSorted(GCompareProc* compareProc) { GCompareProc* old_proc=fCompareProc; fCompareProc=compareProc; if (fCompareProc!=old_proc && fCompareProc!=NULL) Sort(); //new compare method } template void GList::Grow() { int delta; if (fCapacity > 64) delta = fCapacity/4; else if (fCapacity > 8) delta = 16; else delta = 4; setCapacity(fCapacity + delta); } template void GList::Grow(int idx, OBJ* newitem) { int delta; if (fCapacity > 64) delta = fCapacity/4; else if (fCapacity > 8) delta = 16; else delta = 4; // setCapacity(fCapacity + delta); int NewCapacity=fCapacity+delta; if (NewCapacity <= fCount || NewCapacity > MAXLISTSIZE) GError(SLISTCAPACITY_ERR, NewCapacity); //error: capacity not within range if (NewCapacity!=fCapacity) { if (NewCapacity==0) GFREE(fList); else {//add the new item if (idx==fCount) { GREALLOC(fList, NewCapacity*sizeof(OBJ*)); fList[idx]=newitem; } else { OBJ** newList; GMALLOC(newList, NewCapacity*sizeof(OBJ*)); //copy data before idx memmove(&newList[0],&fList[0], idx*sizeof(OBJ*)); newList[idx]=newitem; //copy data after idx memmove(&newList[idx+1],&fList[idx], (fCount-idx)*sizeof(OBJ*)); memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ*)); //data copied: GFREE(fList); fList=newList; } fCount++; } fCapacity=NewCapacity; } } template int GList::IndexOf(OBJ* item) { int result=0; if (Found(item, result)) return result; else return -1; } template bool GList::Exists(OBJ& item) { int result=0; if (Found(&item, result)) 
return true; else return false; } template bool GList::Exists(OBJ* item) { int result=0; if (Found(item, result)) return true; else return false; } template int GList::Add(OBJ* item) { int result; if (item==NULL) return -1; if (SORTED) { if (Found(item, result)) if (fUnique) return -1; //duplicates forbidden //Found sets result to the position where the item should be! sortInsert(result, item); } else { if (fUnique && Found(item,result)) return -1; //set behaviour result = fCount; if (result==fCapacity) Grow(); fList[result]=item; fCount++; } return result; } //by default, it deletes the item if it has an equal in the list! //returns the existing equal (==) object if it's in the list already //or returns the item itself if it's unique (and adds it) template OBJ* GList::AddIfNew(OBJ* item, bool deleteIfFound, int* fidx) { int r; if (Found(item, r)) { if (deleteIfFound && (pointer)item != (pointer)fList[r]) delete item; if (fidx!=NULL) *fidx=r; return fList[r]; //found } //not found: if (SORTED) { //Found() set result to the position where the item should be inserted: sortInsert(r, item); } else { r = fCount; if (r==fCapacity) Grow(); fList[r]=item; fCount++; } if (fidx!=NULL) *fidx=r; return item; } template bool GList::Found(OBJ* item, int& idx) { //search the list by using CompareProc (if defined) //or == operator for a non-sortable list //for sorted lists, even when the result is false, the idx is //set to the closest matching object! 
int i; idx=-1; if (fCount==0) { idx=0;return false;} if (SORTED) { //binary search based on CompareProc //do the simple test first: if ((*fCompareProc)(fList[0],item)>0) { idx=0; return false; } if ((*fCompareProc)(item, fList[fCount-1])>0) { idx=fCount; return false; } int l, h, c; l = 0; h = fCount - 1; while (l <= h) { i = (l + h) >> 1; c = (*fCompareProc)(fList[i], item); if (c < 0) l = i + 1; else { h = i - 1; if (c == 0) { idx=i; return true; } } } //while idx = l; return false; } else {//not sorted: use linear search // needs == operator to compare user defined objects ! i=0; while (i void GList::sortInsert(int idx, OBJ* item) { //idx must be the new position this new item must have //so the allowed range is [0..fCount] //the old idx item all the above will be shifted to idx+1 if (idx<0 || idx>fCount) GError(SLISTINDEX_ERR, idx); if (fCount==fCapacity) { Grow(idx, item); //expand and also copy/move data and insert the new item return; } //room still left, just move data around and insert the new one if (idx void GList::Insert(int idx, OBJ* item) { //idx can be [0..fCount] so an item can be actually added BE_UNSORTED; //cannot do that with a sorted list! if (idx<0 || idx>fCount) GError(SLISTINDEX_ERR, idx); if (fCount==fCapacity) { Grow(idx, item); return; } if (idx void GList::Move(int curidx, int newidx) { BE_UNSORTED; //cannot do that in a sorted list! if (curidx!=newidx || newidx>=fCount) GError(SLISTINDEX_ERR, newidx); OBJ* p; p=Get(curidx); //this is a delete: fCount--; if (curidx void GList::Put(int idx, OBJ* item, bool re_sort) { //WARNING: this will never free the replaced item!!! 
TEST_INDEX(idx); fList[idx]=item; if (SORTED && item!=NULL && re_sort) Sort(); //re-sort } template void GList::Forget(int idx) { TEST_INDEX(idx); fList[idx]=NULL; } template void GList::Delete(int index) { TEST_INDEX(index); if (fFreeProc!=NULL && fList[index]!=NULL) { (*fFreeProc)(fList[index]); //freeItem } fList[index]=NULL; fCount--; if (index OBJ* GList::Pop() { if (fCount<=0) return NULL; fCount--; OBJ* o=fList[fCount]; fList[fCount]=NULL; return o; } //Queue usage: template OBJ* GList::Shift() { if (fCount<=0) return NULL; fCount--; OBJ* o=fList[0]; if (fCount>0) memmove(&fList[0], &fList[1], (fCount)*sizeof(OBJ*)); fList[fCount]=NULL; //not that it matters.. return o; } template int GList::Remove(OBJ* item) { //removes an item if it's in our list int result=IndexOf(item); if (result>=0) Delete(result); return result; } //linear search for the pointer template int GList::RemovePtr(OBJ* item) { int i; if (item==NULL) return -1; for (i=0;i void GList::Pack() {//also frees items! for (int i=fCount-1; i>=0; i--) if (fList[i]==NULL) Delete(i); //also shift contents of fList accordingly } template void GList::setCount(int NewCount) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(SLISTCOUNT_ERR, NewCount); if (NewCount > fCapacity) setCapacity(NewCount); if (NewCount > fCount) memset(fList[fCount], 0, (NewCount - fCount) * sizeof(OBJ*)); fCount = NewCount; } template void GList::QuickSort(int L, int R) { int I, J; OBJ* P; OBJ* T; do { I = L; J = R; P = fList[(L + R) >> 1]; do { while (fCompareProc(fList[I], P) < 0) I++; while (fCompareProc(fList[J], P) > 0) J--; if (I <= J) { T = fList[I]; fList[I] = fList[J]; fList[J] = T; I++; J--; } } while (I <= J); if (L < J) QuickSort(L, J); L = I; } while (I < R); } template void GList::Sort() { if (fList!=NULL && fCount>0 && fCompareProc!=NULL) QuickSort(0, fCount-1); } //--------------------------------------------------------------------------- #endif 
cdbfasta/gclib/GArgs.cpp0000664002442700244270000001644711306015621015301 0ustar gperteagpertea#include #include #include "GArgs.h" #include #define TRACE 1 #include "GBase.h" //GArgs::is_opt="1"; //just to have a non-NULL value for switch testing GArgs::GArgs(int argc, char* const argv[], const char* format, bool nodigitopts) { /* format is: [:] for e.g. p:hT <- -p testing -ptesting -h -T = for e.g. PID=S= <- PID=50 S=3.5 This means that the = options, if present, must NEVER be given after dashed switches (non-value) directly */ //parse format string first: const char* fstr=format; fmtcount=0; count=0; nonOptCount=0; nonOptPos=0; optPos=0; errarg=0; args=NULL; fmt=NULL; int fmtlen=strlen(format); while (fstr-format < fmtlen ) { int l=strcspn(fstr, ":="); if (fstr[l]=='\0') { //end of string reached //all previous chars are just switches: GREALLOC(fmt, (fmtcount+l)*sizeof(fmtdef)); //store each switches for (int i=0; i=0) { if (fmt[f].type==0) {//switch type GREALLOC(args, (count+1)*sizeof(argdata)); GCALLOC(args[count].opt, 2); args[count].opt[0]=c; GCALLOC(args[count].value, 1); count++; // only switches can be grouped with some other switches or options if (argv[p][cpos+1]!='\0') { cpos++; c=argv[p][cpos]; goto COLLAPSED; } } else if (fmt[f].type==1) { //dash argument GREALLOC(args, (count+1)*sizeof(argdata)); GCALLOC(args[count].opt, 2); args[count].opt[0]=c; if (argv[p][cpos+1]=='\0') { if (p+1=0 && fmt[f].type==2) { GREALLOC(args, (count+1)*sizeof(argdata)); args[count].opt=Gstrdup(part); if (strlen(argv[p])-strlen(part)>0) { GMALLOC(args[count].value, strlen(argv[p])-strlen(part)+1); strcpy(args[count].value, e+1); } else { args[count].value=NULL; } count++; } else { //error - format does not match this '=' argument errarg=p; return; } } else { //it seems it's just a plain argument, like a filename, etc. 
GREALLOC(args, (count+1)*sizeof(argdata)); args[count].opt=NULL; //it's not an option args[count].value=Gstrdup(argv[p]); count++; nonOptCount++; } } p++;//check next arg string } } GArgs::~GArgs() { int i; for (i=0; i[ ][] = */ #ifndef G_ARGS_DEFINED #define G_ARGS_DEFINED class GArgs { //structure for parsing arguments format definition struct fmtdef { int type; // 0=dashed switch, 1=dashed value, 2='=' value char* opt; //switch/opt char/string }; int fmtcount; fmtdef* fmt; //this will store format definition after parsing it struct argdata { char* opt; // this is NULL for non-dashed arguments //one character for dashed style arguments //one string for = style arguments char* value; // is NULL for switches (dashed flags) }; argdata* args; //arguments table after parsing it int count; //total count of elements in 'args' array int nonOptCount; //count of non-dashed, non= arguments int nonOptPos; //current position for nonOpt arguments iterator int optPos; //current position for options iterator int errarg; //argv error position after parsing static const char* is_opt; // = non NULL just for getOpt easy testing int validOpt(char o); //parsing helper function int validOpt(char* o); public: GArgs(int argc, char* const argv[], const char* format, bool nodigitopts=false); /* format is: [:] for e.g. p:hT <= -p testing -ptesting -h -T = for e.g. 
PID=S= <= PID=50 S=3.5 This means that the = options, if present, must NEVER be given after dashed (non-value) switches directly */ ~GArgs(); int isError(); // returns the offending argv position or 0 if no error int getCount() { return count; } int getFmtCount() { return fmtcount; } int getNonOptCount() { return nonOptCount; } char* getOpt(const char* o); /* retrieve the value for option o returns NULL if option not given at all !=NULL if boolean option was given opt's value if value option was given */ char* getOpt(const char o); int startOpt(); //init iteration through option arguments char* nextOpt(); //get next option argument int startNonOpt(); //init iteration through non-option arguments //returns the number of non-option arguments char* nextNonOpt(); //get the next non-option argument }; #endif cdbfasta/gclib/GBase.cpp0000664002442700244270000003506411306015625015257 0ustar gperteagpertea#include #include #include "GBase.h" #include static char msg[4069]; //************************* Debug helpers ************************** // Assert failed routine void GAssert(const char* expression, const char* filename, unsigned int lineno){ sprintf(msg,"%s(%d): ASSERT(%s) failed.\n",filename,lineno,expression); fprintf(stderr,"%s",msg); //abort(); } // Error routine (prints error message and exits!) 
void GError(const char* format,...){ #ifdef __WIN32__ va_list arguments; va_start(arguments,format); vsprintf(msg,format,arguments); va_end(arguments); OutputDebugString(msg); fprintf(stderr,"%s",msg); // if a console is available MessageBox(NULL,msg,NULL,MB_OK|MB_ICONEXCLAMATION|MB_APPLMODAL); #else va_list arguments; va_start(arguments,format); vfprintf(stderr,format,arguments); va_end(arguments); #ifdef DEBUG // modify here if you want a core dump abort(); #endif #endif exit(1); } // Warning routine (just print message without exiting) void GMessage(const char* format,...){ va_list arguments; va_start(arguments,format); vsprintf(msg,format,arguments); va_end(arguments); #ifdef __WIN32__ OutputDebugString(msg); #endif fprintf(stderr,"%s",msg);fflush(stderr); } /*************** Memory management routines *****************/ // Allocate memory bool GMalloc(pointer* ptr,unsigned long size){ //GASSERT(ptr); if (size!=0) *ptr=malloc(size); return *ptr!=NULL; } // Allocate cleaned memory (0 filled) bool GCalloc(pointer* ptr,unsigned long size){ GASSERT(ptr); *ptr=calloc(size,1); return *ptr!=NULL; } // Resize memory bool GRealloc(pointer* ptr,unsigned long size){ //GASSERT(ptr); if (size==0) { GFree(ptr); return true; } if (*ptr==NULL) {//simple malloc void *p=malloc(size); if (p != NULL) { *ptr=p; return true; } else return false; }//malloc else {//realloc void *p=realloc(*ptr,size); if (p) { *ptr=p; return true; } return false; } } // Free memory, resets ptr to NULL afterward void GFree(pointer* ptr){ GASSERT(ptr); if (*ptr) free(*ptr); *ptr=NULL; } char* Gstrdup(const char* str) { if (str==NULL) return NULL; char *copy; GMALLOC(copy, strlen(str)+1); strcpy(copy,str); return copy; } char* newEmptyStr() { char* zs; GMALLOC(zs,1); zs[0]=0; return zs; } char* Gstrdup(const char* sfrom, const char* sto) { if (sfrom==NULL || sto==NULL) return NULL; char *copy; if (sfrom[0]==0) return newEmptyStr(); GMALLOC(copy, sto-sfrom+2); strncpy(copy, sfrom, sto-sfrom+1); 
copy[sto-sfrom+1]=0; return copy; } int Gstrcmp(char* a, char* b) { if (a==NULL || b==NULL) { return a==NULL ? -1 : 1; } else return strcmp(a,b); } int Gstricmp(const char* a, const char* b) { if (a==NULL || b==NULL) return a==NULL ? -1 : 1; register int ua, ub; while ((*a!=0) && (*b!=0)) { ua=tolower((unsigned char)*a); ub=tolower((unsigned char)*b); a++;b++; if (ua!=ub) return ua < ub ? -1 : 1; } return (*a == 0) ? ( (*b == 0) ? 0 : -1 ) : 1 ; } int strsplit(char* str, char** fields, int maxfields, const char* delim) { //splits by placing 0 where delim chars are found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int tidx=0; bool afterdelim=true; int i=0; while (str[i]!=0 && tidx=str) { if (*p==ch) return p; p--; } return NULL; } /* DOS/UNIX safer fgets : reads a text line from a (binary) file and update the file position accordingly and the buffer capacity accordingly. The given buf is resized to read the entire line in memory -- even when it's abnormally long */ char* fgetline(char* & buf, int& buf_cap, FILE *stream, off_t* f_pos, int* linelen) { //reads a char at a time until \n and/or \r are encountered int i=0; int c=0; off_t fpos=(f_pos!=NULL) ? *f_pos : 0; while ((c=getc(stream))!=EOF) { if (i>=buf_cap-1) { buf_cap+=1024; GREALLOC(buf, buf_cap); } if (c=='\n' || c=='\r') { if (c=='\r') { if ((c=getc(stream))!='\n') ungetc(c,stream); else fpos++; } fpos++; break; } fpos++; buf[i]=(char)c; i++; } //while i=allocated-1) { allocated+=1024; GREALLOC(buf, allocated); } if (c=='\n' || c=='\r') { buf[len]='\0'; if (c=='\r') { //DOS file -- special case if ((c=getc(stream))!='\n') ungetc(c,stream); else f_pos++; } f_pos++; lcount++; return buf; } f_pos++; buf[len]=(char)c; len++; } //while i=str) { for (i=0; i=lend) { for (i=0;i>24; h&=0x0fffffff; } GASSERT(h<=0x0fffffff); return h; } // removes the directory part from a full-path file name // this is a destructive operation for the given string!!! 
// the trailing '/' is guaranteed to be there void delFileName(char* filepath) { char *p, *sep; if (filepath==NULL) return; for (p=filepath, sep=filepath;*p!='\0';p++) if (*p==CHPATHSEP) sep=p+1; *sep='\0'; // truncate filepath } // returns a pointer to the file name part in a full-path filename char* getFileName(char* filepath) { char *p, *sep; if (filepath==NULL) return NULL; for (p=filepath, sep=filepath;*p!='\0';p++) if (*p==CHPATHSEP) sep=p+1; return sep; } int fileExists(const char* fname) { struct stat stFileInfo; int r=0; // Attempt to get the file attributes int fs = stat(fname,&stFileInfo); if (fs == 0) { r=3; // We were able to get the file attributes // so the file obviously exists. if (S_ISREG (stFileInfo.st_mode)) { r=2; } if (S_ISDIR (stFileInfo.st_mode)) { r=1; } } return r; } /*bool fileExists(const char* filepath) { if (filepath==NULL) return false; FILE* ft=fopen(filepath, "rb"); if (ft==NULL) return false; fclose(ft); return true; } */ off_t fileSize(const char* fpath) { struct stat results; if (stat(fpath, &results) == 0) // The size of the file in bytes is in return results.st_size; else // An error occurred //GError("Error at stat(%s)!\n", fpath) return 0; } bool parseNumber(char* &p, double& v) { //skip any spaces.. while (*p==' ' || *p=='\t') p++; char* start=p; /*if (*p=='-') p++; else if (*p=='+') { p++;start++; }*/ /* while ((*p>='1' && *p<='9') || *p=='0' || *p=='.' 
|| *p=='-' || tolower(*p)=='e') p++; */ int numlen=strspn(start, "0123456789eE.-+"); p=start+numlen; //now p is on a non-digit; if (*start=='-' && p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; v=strtod(start,&endptr); *p=saved; if (endptr!=p) return false; return true; } bool parseDouble(char* &p, double& v) { return parseNumber(p,v); } bool parseInt(char* &p, int& i) { while (*p==' ' || *p=='\t') p++; char* start=p; if (*p=='-') p++; else if (*p=='+') { p++;start++; } while ((*p>='1' && *p<='9') || *p=='0') p++; //now p is on a non-digit; if (*start=='-' && p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; long l=strtol(start,&endptr,10); i=(int)l; *p=saved; if (endptr!=p || i!=l) return false; return true; } bool parseUInt(char* &p, uint& i) { while (*p==' ' || *p=='\t') p++; char* start=p; if (*p=='-') return false; else if (*p=='+') { p++;start++; } while ((*p>='1' && *p<='9') || *p=='0') p++; //now p is on a non-digit; if (*start=='-' && p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; unsigned long l=strtoul(start,&endptr,10); i=(uint) l; *p=saved; if (endptr!=p || i!=l) return false; return true; } bool parseHex(char* &p, uint& i) { //skip initial spaces/prefix while (*p==' ' || *p=='\t' || *p=='0' || *p=='x') p++; char* start=p; if (*p=='-') return false; else if (*p=='+') { p++;start++; } while (isxdigit(*p)) p++; //now p is on a non-hexdigit; if (p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; unsigned long l=strtoul(start,&endptr,16); i=(uint) l; *p=saved; if (endptr!=p || i!=l) return false; return true; } cdbfasta/gclib/GBase.h0000664002442700244270000002357011306015625014723 0ustar gperteagpertea#ifndef G_BASE_DEFINED #define G_BASE_DEFINED #include #include #include #include #include #include #include #if defined __WIN32__ || defined _WIN32 #include #endif #ifdef DEBUG #undef NDEBUG #endif typedef unsigned int uint32; typedef int int32; typedef unsigned char uchar; typedef 
unsigned char byte; // If long is natively 64 bit, use the regular fseek and ftell #ifdef _NATIVE_64 #define ftello ftell #define fseeko fseek #endif #ifndef MAXUINT #define MAXUINT ((unsigned int)-1) #endif #if defined(_NATIVE_64) || defined(_LP64) || defined(__LP64__) typedef long int64; typedef unsigned long uint64; #else //assume 32bit environment with long long for int64 stuff typedef long long int64; typedef unsigned long long uint64; #endif /****************************************************************************/ #ifndef EXIT_FAILURE #define EXIT_FAILURE 1 #endif #ifndef EXIT_SUCCESS #define EXIT_SUCCESS 0 #endif /****************************************************************************/ #define ERR_ALLOC "Error allocating memory.\n" #if defined (__WIN32__) || defined (WIN32) #define CHPATHSEP '\\' #include #define ftello ftell #define fseeko fseek #else #define CHPATHSEP '/' #include #endif //------------------- // Debug helpers #ifndef NDEBUG #define GASSERT(exp) ((exp)?((void)0):(void)GAssert(#exp,__FILE__,__LINE__)) #ifdef TRACE #define GTRACE(exp) (GMessage exp) #else #define GTRACE(exp) ((void)0) #endif #else #define GASSERT(exp) ((void)0) #define GTRACE(exp) ((void)0) #endif #define GERROR(exp) (GError exp) /********************************** Macros ***********************************/ // Abolute value #define GABS(val) (((val)>=0)?(val):-(val)) // Min and Max #define GMAX(a,b) (((a)>(b))?(a):(b)) #define GMIN(a,b) (((a)>(b))?(b):(a)) // Min of three #define GMIN3(x,y,z) ((x)<(y)?GMIN(x,z):GMIN(y,z)) // Max of three #define GMAX3(x,y,z) ((x)>(y)?GMAX(x,z):GMAX(y,z)) // Return minimum and maximum of a, b #define GMINMAX(lo,hi,a,b) ((a)<(b)?((lo)=(a),(hi)=(b)):((lo)=(b),(hi)=(a))) // Clamp value x to range [lo..hi] #define GCLAMP(lo,x,hi) ((x)<(lo)?(lo):((x)>(hi)?(hi):(x))) typedef void* pointer; typedef unsigned int uint; typedef int GCompareProc(const pointer item1, const pointer item2); typedef void GFreeProc(pointer item); //usually just 
delete, //but may also support structures with embedded dynamic members #define GMALLOC(ptr,size) if (!GMalloc((pointer*)(&ptr),size)) \ GError(ERR_ALLOC) #define GCALLOC(ptr,size) if (!GCalloc((pointer*)(&ptr),size)) \ GError(ERR_ALLOC) #define GREALLOC(ptr,size) if (!GRealloc((pointer*)(&ptr),size)) \ GError(ERR_ALLOC) #define GFREE(ptr) GFree((pointer*)(&ptr)) inline char* min(char *arg1, char *arg2) { return (strcmp(arg1, arg2) < 0)? arg1 : arg2; } inline int iround(double x) { return (int)floor(x + 0.5); } /****************************************************************************/ inline char* max(char *arg1, char *arg2) { return (strcmp(arg2, arg1) < 0)? arg1 : arg2; } inline int Gintcmp(int a, int b) { //return (a>b)? 1 : ((a==b)?0:-1); return a-b; } int Gstrcmp(char* a, char* b); //same as strcmp but doesn't crash on NULL pointers int Gstricmp(const char* a, const char* b); inline void swap(int &arg1, int &arg2){ arg1 ^= arg2 ^= arg1 ^= arg2; } inline void swap(char* &arg1, char* &arg2){ register char* swp=arg1; arg1=arg2; arg2=swp; } inline void swap(unsigned int &arg1, unsigned int &arg2) { arg1 ^= arg2 ^= arg1 ^= arg2; } inline void swap(short &arg1, short &arg2) { arg1 ^= arg2 ^= arg1 ^= arg2; } inline void swap(unsigned short &arg1, unsigned short &arg2) { arg1 ^= arg2 ^= arg1 ^= arg2; } inline void swap(long &arg1, long &arg2) { arg1 ^= arg2 ^= arg1 ^= arg2; } inline void swap(unsigned long &arg1, unsigned long &arg2) { arg1 ^= arg2 ^= arg1 ^= arg2; } inline void swap(char &arg1, char &arg2) { arg1 ^= arg2 ^= arg1 ^= arg2; } inline void swap(unsigned char &arg1, unsigned char &arg2) { arg1 ^= arg2 ^= arg1 ^= arg2; } inline void swap(bool &arg1, bool &arg2) { arg1 ^= arg2 ^= arg1 ^= arg2; } /**************** Memory management ***************************/ bool GMalloc(pointer* ptr, unsigned long size); // Allocate memory bool GCalloc(pointer* ptr, unsigned long size); // Allocate and initialize memory bool GRealloc(pointer* ptr,unsigned long size); 
// Resize memory void GFree(pointer* ptr); // Free memory, resets ptr to NULL /********************* debug functions *********************/ void GError(const char* format,...); // Error routine (aborts program) void GMessage(const char* format,...);// Log message to stderr // Assert failed routine:- usually not called directly but through GASSERT void GAssert(const char* expression, const char* filename, unsigned int lineno); // ****************** string manipulation ************************* char *Gstrdup(const char* str); //duplicate a string by allocating a copy for it and returning it char* Gstrdup(const char* sfrom, const char* sto); //same as GStrdup, but with an early termination (e.g. on delimiter) char* Gsubstr(const char* str, char* from, char* to=NULL); //extracts a substring, allocating it, including boundaries (from/to) int strsplit(char* str, char** fields, int maxfields, const char* delim); int strsplit(char* str, char** fields, int maxfields, const char delim); int strsplit(char* str, char** fields, int maxfields); //splits by tab or space char* replaceStr(char* &str, char* newvalue); //conversion: to Lower/Upper case // creating a new string: char* upCase(const char* str); char* loCase(const char* str); // changing string in place: char* strlower(char * str); char* strupper(char * str); //strstr but for memory zones: scans a memory region //for a substring: void* Gmemscan(void *mem, unsigned int len, void *part, unsigned int partlen); // test if a char is in a string: bool chrInStr(char c, char* str); char* rstrchr(char* str, char ch); /* returns a pointer to the rightmost occurence of ch in str - like rindex for platforms missing it*/ char* strchrs(char* s, const char* chrs); //strchr but with a set of chars instead of only one char* rstrfind(char* str, const char *substr); /* like rindex() but for strings or like the right side version of strstr() */ //reverse character string or char* reverseChars(char* str, int slen=0); char* rstrstr(char* 
rstart, char *lend, char* substr); /*the reversed, rightside equivalent of strstr: starts searching from right end (rstart), going back to left end (lend) and returns a pointer to the last (right) matching character in str */ char* strifind(char* str, const char* substr); // the case insensitive version of strstr -- finding a string within a strin //Determines if a string begins with a given prefix //(returns false when any of the params is NULL, // but true when prefix is '' (empty string)!) bool startsWith(char* s, const char* prefix); // ELF hash function for strings int strhash(const char* str); //-------------------------------------------------------- // ************** simple line reading class for text files //GLineReader -- text line reading/buffering class class GLineReader { int len; int allocated; char* buf; bool isEOF; FILE* file; off_t filepos; //current position bool pushed; //pushed back int lcount; //line counter (read lines) public: char* chars() { return buf; } char* line() { return buf; } int readcount() { return lcount; } //number of lines read int length() { return len; } int size() { return len; } //same as size(); bool isEof() {return isEOF; } bool eof() { return isEOF; } off_t getfpos() { return filepos; } off_t getFpos() { return filepos; } char* nextLine() { return getLine(); } char* getLine() { if (pushed) { pushed=false; return buf; } else return getLine(file); } char* getLine(FILE* stream) { if (pushed) { pushed=false; return buf; } else return getLine(stream, filepos); } char* getLine(FILE* stream, off_t& f_pos); //read a line from a stream and update // the given file position void pushBack() { if (lcount>0) pushed=true; } // "undo" the last getLine request // so the next call will in fact return the same line GLineReader(FILE* stream=NULL, off_t fpos=0) { len=0; isEOF=false; allocated=1024; GMALLOC(buf,allocated); lcount=0; buf[0]=0; file=stream; filepos=fpos; pushed=false; } ~GLineReader() { GFREE(buf); } }; /* extended fgets() - to 
read one full line from a file and update the file position correctly ! buf will be reallocated as necessary, to fit the whole line */ char* fgetline(char* & buf, int& buflen, FILE* stream, off_t* f_pos=NULL, int* linelen=NULL); /*********************** File management functions *********************/ // removes the directory part from a full-path file name // this is a destructive operation for the given string! void delFileName(char* filepath); // returns a pointer to the file name part in a full-path filename char* getFileName(char* filepath); int fileExists(const char* fname); //returns 0 if file entry doesn't exist // 1 if it's a directory // 2 if it's a regular file // 3 otherwise (?) off_t fileSize(const char* fpath); //parses the next number found in a string at the current position //until a non-digit (and not a '.', 'e','E','-','+') is encountered; //updates the char* pointer to be after the last digit parsed bool parseNumber(char* &p, double& v); bool parseDouble(char* &p, double& v); //just an alias for parseNumber bool parseInt(char* &p, int& i); bool parseUInt(char* &p, uint& i); bool parseHex(char* &p, uint& i); #endif /* G_BASE_DEFINED */ cdbfasta/gclib/gcdb.cpp0000664002442700244270000005514311306015633015174 0ustar gperteagpertea#include "gcdb.h" #include #include #include #if defined(__WIN32__) || defined(WIN32) #include /* m m a p === got from imagick sources % Method mmap emulates the Unix method of the same name. 
% The format of the mmap method is: % void *mmap(char *address,size_t length,int protection, % int access,int file,off_t offset) */ void *mmap(char *address,size_t length,int protection,int access, int file, off_t offset) { void *map; HANDLE handle; map=(void *) NULL; handle=INVALID_HANDLE_VALUE; switch (protection) { case PROT_READ: default: { handle=CreateFileMapping((HANDLE) _get_osfhandle(file),0,PAGE_READONLY,0, length,0); if (!handle) break; map=(void *) MapViewOfFile(handle,FILE_MAP_READ,0,0,length); CloseHandle(handle); break; } case PROT_WRITE: { handle=CreateFileMapping((HANDLE) _get_osfhandle(file),0,PAGE_READWRITE,0, length,0); if (!handle) break; map=(void *) MapViewOfFile(handle,FILE_MAP_WRITE,0,0,length); CloseHandle(handle); break; } case PROT_READWRITE: { handle=CreateFileMapping((HANDLE) _get_osfhandle(file),0,PAGE_READWRITE,0, length,0); if (!handle) break; map=(void *) MapViewOfFile(handle,FILE_MAP_ALL_ACCESS,0,0,length); CloseHandle(handle); break; } } if (map == (void *) NULL) return((void *) MAP_FAILED); return((void *) ((char *) map+offset)); } /* =========== m u n m a p =========================== % % Method munmap emulates the Unix method with the same name. % The format of the munmap method is: % int munmap(void *map,size_t length) % A description of each parameter follows: % > status: Method munmap returns 0 on success; otherwise, it % returns -1 and sets errno to indicate the error. % > map: The address of the binary large object. % > length: The length of the binary large object. 
% */ int munmap(void *map,size_t length) { if (!UnmapViewOfFile(map)) return(-1); return(0); } #endif int cdbInfoSIZE=offsetof(cdbInfo, tag)+4; //===================================================== //------------- buffer stuff ------------------- //===================================================== //------------------------------------- //--------- misc utility functions ----- static int gcdb_seek_set(int fd,gcdb_seek_pos pos) { if (lseek(fd, pos, 0) == -1) return -1; return 0; } #define gcdb_seek_begin(fd) (gcdb_seek_set((fd),(gcdb_seek_pos) 0)) static unsigned int gcdb_strlen(const char *s) { register char *t; t = (char*)s; for (;;) { if (!*t) return t - s; ++t; if (!*t) return t - s; ++t; if (!*t) return t - s; ++t; if (!*t) return t - s; ++t; } } static int byte_diff(char *s, unsigned int n,char *t) { for (;;) { if (!n) return 0; if (*s != *t) break; ++s; ++t; --n; if (!n) return 0; if (*s != *t) break; ++s; ++t; --n; if (!n) return 0; if (*s != *t) break; ++s; ++t; --n; if (!n) return 0; if (*s != *t) break; ++s; ++t; --n; } return ((int)(unsigned int)(unsigned char) *s) - ((int)(unsigned int)(unsigned char) *t); } static void gcdb_byte_copy(char *to, unsigned int n, char *from) { for (;;) { if (!n) return; *to++ = *from++; --n; if (!n) return; *to++ = *from++; --n; if (!n) return; *to++ = *from++; --n; if (!n) return; *to++ = *from++; --n; } } static void gcdb_byte_copyr(char *to, unsigned int n, char *from) { to += n; from += n; for (;;) { if (!n) return; *--to = *--from; --n; if (!n) return; *--to = *--from; --n; if (!n) return; *--to = *--from; --n; if (!n) return; *--to = *--from; --n; } } #define ALIGNMENT 16 /* XXX: assuming that this alignment is enough */ #define SPACE 4096 /* must be multiple of ALIGNMENT */ typedef union { char irrelevant[ALIGNMENT]; double d; } aligned; static aligned realspace[SPACE / ALIGNMENT]; #define space ((char *) realspace) static unsigned int avail = SPACE; /* multiple of ALIGNMENT; 0<=avail<=SPACE */ offt_conv 
gcvt_offt; uint_conv gcvt_uint; char *gcdb_alloc(unsigned int n) { char *x; n = ALIGNMENT + n - (n & (ALIGNMENT - 1)); /* XXX: could overflow */ if (n <= avail) { avail -= n; return space + avail; } x = (char*) malloc(n); if (!x) return NULL; //if (!x) GError("Error: mgcdb_alloc(%d) failed !\n", n); return x; } int GCDBuffer::write_all(char* buf, unsigned int len) { int w; while (len) { w = op(fd,buf,len); if (w == -1) { if (errno == error_intr) continue; return -1; /* note that some data may have been written */ } /* if (w == 0) ; luser's fault */ buf += w; len -= w; } return 0; } int GCDBuffer::flush() { int pt=p; if (!pt) return 0; p = 0; //return allwrite(op,fd,x,pt); return write_all(x,pt); } int GCDBuffer::putalign(char *buf,unsigned int len) { unsigned int bn; while (len > (bn = n-p)) { gcdb_byte_copy(x + p,bn,buf); p += bn; buf += bn; len -= bn; if (GCDBuffer::flush() == -1) return -1; } /* now len <= s->n - s->p */ gcdb_byte_copy(x + p,len,buf); p += len; return 0; } int GCDBuffer::put(char *buf,unsigned int len) { unsigned int bn=n; if (len > bn - p) { if (GCDBuffer::flush() == -1) return -1; /* now s->p == 0 */ if (bn < GCDBUFFER_OUTSIZE) bn = GCDBUFFER_OUTSIZE; while (len > n) { if (bn > len) bn = len; if (write_all(buf, bn) == -1) return -1; buf += bn; len -= bn; } } /* now len <= s->n - s->p */ gcdb_byte_copy(x + p,len,buf); p += len; return 0; } int GCDBuffer::putflush(char *buf,unsigned int len) { if (flush() == -1) return -1; return write_all(buf,len); } int GCDBuffer::putsalign(char *buf) { return GCDBuffer::putalign(buf, gcdb_strlen(buf)); } int GCDBuffer::puts(char *buf) { return GCDBuffer::put(buf, gcdb_strlen(buf)); } int GCDBuffer::putsflush(char *buf) { return GCDBuffer::putflush(buf, gcdb_strlen(buf)); } static int oneread(opfunc op,int fd, char *buf,unsigned int len) { int r; for (;;) { r = op(fd,buf,len); if (r == -1 && errno == error_intr) continue; return r; } } int GCDBuffer::oneRead(char* buf, unsigned int len) { return 
op(fd,buf,len); /*int r; for (;;) { r = op(fd,buf,len); if (r == -1 && errno == error_intr) continue; return r; }*/ } int GCDBuffer::getthis(char *buf,unsigned int len) { if (len > p) len = p; p -= len; gcdb_byte_copy(buf, len,x + n); n += len; return len; } int GCDBuffer::feed() { int r; if (p) return p; r = oneRead(x,n); if (r <= 0) return r; p = r; n -= r; if (n > 0) gcdb_byte_copyr(x + n,r,x); return r; } int GCDBuffer::bget(char *buf,unsigned int len) { int r; if (p > 0) return getthis(buf,len); if (n <= len) return oneRead(buf,n); r = GCDBuffer::feed(); if (r <= 0) return r; return getthis(buf,len); } int GCDBuffer::get(char *buf,unsigned int len) { int r; if (p > 0) return getthis(buf,len); if (n <= len) return oneread(op,fd,buf,len); r = GCDBuffer::feed(); if (r <= 0) return r; return getthis(buf,len); } char* GCDBuffer::peek() { return x + n; } void GCDBuffer::seek(unsigned int len) { n += len; p -= len; } int GCDBuffer::copy(GCDBuffer* bin) { int n_in; char *x_in; for (;;) { n_in = bin->feed(); if (n_in < 0) return -2; if (!n_in) return 0; x_in = bin->peek(); if (GCDBuffer::put(x_in,n_in) == -1) return -3; bin->seek(n_in); } } //===================================================== //------------- cdb utils ------------------- //===================================================== int error_intr = #ifdef EINTR EINTR; #else -1; #endif int error_nomem = #ifdef ENOMEM ENOMEM; #else -2; #endif int error_proto = #ifdef EPROTO EPROTO; #else -15; #endif //------------------------------------------------ //------------ allocation routines: /* big/little endian check */ int endian_test(void) { unsigned short v=0x0001; unsigned char* b = (unsigned char*)&v; return b[1]; } /* conversion of unsigned int offsets read from a file can also be used to prepare unsigned integers to be written into a file in an independent platform manner */ unsigned int uint32_sun(void* x86int) { unsigned char b[4]; b[3]=((unsigned char*)x86int)[0]; b[0]=((unsigned char*)x86int)[3]; 
b[1]=((unsigned char*)x86int)[2]; b[2]=((unsigned char*)x86int)[1]; return *((unsigned int*)b); } unsigned int uint32_x86(void* offt) { return *((unsigned int*)offt); } //-------- 64bit types, if that's the case: off_t offt_sun(void* offt) { unsigned char b[8]; if (sizeof(off_t)==8) { //64 bit? // upper words: b[3]=((unsigned char*)offt)[4]; b[0]=((unsigned char*)offt)[7]; b[1]=((unsigned char*)offt)[6]; b[2]=((unsigned char*)offt)[5]; //-- b[7]=((unsigned char*)offt)[0]; b[4]=((unsigned char*)offt)[3]; b[5]=((unsigned char*)offt)[2]; b[6]=((unsigned char*)offt)[1]; } else { b[3]=((unsigned char*)offt)[0]; b[0]=((unsigned char*)offt)[3]; b[1]=((unsigned char*)offt)[2]; b[2]=((unsigned char*)offt)[1]; } return *((off_t*)b); } off_t offt_x86(void* offt) { return *((off_t*)offt); } //------------------------ platform independent uint32 : void uint32_pack(char s[4],uint32 u) { s[0] = u & 255; u >>= 8; s[1] = u & 255; u >>= 8; s[2] = u & 255; s[3] = u >> 8; } void uint32_pack_big(char s[4],uint32 u) { s[3] = u & 255; u >>= 8; s[2] = u & 255; u >>= 8; s[1] = u & 255; s[0] = u >> 8; } /* unpacking: */ void uint32_unpack(char s[4],uint32 *u) { uint32 result; result = (unsigned char) s[3]; result <<= 8; result += (unsigned char) s[2]; result <<= 8; result += (unsigned char) s[1]; result <<= 8; result += (unsigned char) s[0]; *u = result; } void uint32_unpack_big(char s[4],uint32 *u) { uint32 result; result = (unsigned char) s[0]; result <<= 8; result += (unsigned char) s[1]; result <<= 8; result += (unsigned char) s[2]; result <<= 8; result += (unsigned char) s[3]; *u = result; } //===================================================== //------------- cdb index ------------------- //===================================================== GCdbWrite::GCdbWrite(int afd) { //check endianness :) gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86; gcvt_offt=(endian_test())? 
&offt_sun : &offt_x86; cdbuf=new GCDBuffer((opfunc)&write,(int) afd,(char*)bspace,sizeof bspace); head = NULL; split = 0; hash = 0; numentries = 0; fd = afd; pos = sizeof final; gcdb_seek_set(fd, pos); fname[0]='\0'; //should return and test the result of gcdb_seek_set!!! } GCdbWrite::GCdbWrite(char* afname) { #if defined(__WIN32__) || defined(WIN32) fd = open(afname,O_WRONLY | O_TRUNC | O_BINARY | O_CREAT, S_IREAD|S_IWRITE); #else fd = open(afname,O_WRONLY | O_NDELAY | O_TRUNC | O_CREAT, 0664); #endif if (fd == -1) GError("GCdbWrite: Error creating file '%s'\n", fname); //check endianness :) gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86; gcvt_offt=(endian_test())? &offt_sun : &offt_x86; cdbuf=new GCDBuffer((opfunc)&write,(int) fd,(char*)bspace,sizeof bspace); head = NULL; split = 0; hash = 0; numentries = 0; pos = sizeof final; gcdb_seek_set(fd, pos); strcpy(fname, afname); //should return and test the result of gcdb_seek_set!!! } GCdbWrite::~GCdbWrite() { cdbuf->flush(); #if !(defined(__WIN32__) || defined(WIN32)) /* NFS silliness */ if (fsync(fd) == -1) GError("GCdbWrite: Error at fsync() for file '%s'\n", fname); #endif if (::close(fd) == -1) GError("GCdbWrite: Error at closing file '%s'\n", fname); delete cdbuf; if (head!=NULL) free(head); } int GCdbWrite::posplus(uint32 len) { uint32 newpos = pos + len; if (newpos < len) { //errno = error_nomem; return -1; } pos = newpos; return 0; } int GCdbWrite::addend(unsigned int keylen,unsigned int datalen,uint32 h) { struct cdb_hplist *chead = head; if (!chead || (chead->num >= CDB_HPLIST)) { chead = (struct cdb_hplist *) gcdb_alloc(sizeof(struct cdb_hplist)); if (!chead) return -1; chead->num = 0; chead->next = head; head = chead; } chead->hp[head->num].h = h; chead->hp[head->num].p = pos; ++chead->num; ++numentries; if (posplus(8) == -1) return -1; if (posplus(keylen) == -1) return -1; if (posplus(datalen) == -1) return -1; return 0; } int GCdbWrite::addbegin(unsigned int keylen,unsigned int datalen) { char 
buf[8]; //if (keylen > MAX_UINT) { /* errno = error_nomem; */return -1; } // if (datalen > MAX_UINT) { /*errno = error_nomem;*/ return -1; } uint32_pack(buf,keylen); uint32_pack(buf + 4,datalen); if (cdbuf->putalign(buf,8) == -1) return -1; return 0; } #define cdbuffer_PUTC(s,c) \ ( ((s).n != (s).p) \ ? ( (s).x[(s).p++] = (c), 0 ) \ : (s).put(&(c),1) \ ) int GCdbWrite::add(const char* key, char* recdata, unsigned int datalen) { unsigned int i; unsigned int klen=strlen(key); if (klen<1) { GMessage("Warning: zero length key found\n"); return 0; } //------------ adding record ----------------- if (addbegin(klen,datalen)==-1) GError("GCdbWrite: Error at addbegin(%d, %d)\n",klen, datalen); uint32 h=CDB_HASHSTART; for (i = 0;i < klen; ++i) { //if (cdbuffer_PUTC(c.cdbuf,key[i]) == -1) if ( ((cdbuf->n!=cdbuf->p) ? (cdbuf->x[cdbuf->p++]=(key[i]),0 ) : cdbuf->put((char*)&(key[i]),1) )==-1) GError("GCdbWrite: Error at cdbbuf.put, key '%s'\n", key); h = cdb_hashadd(h,key[i]); } if (cdbuf->put(recdata,datalen) == -1) GError("GCdbWrite: Error at final cdbuf.put() at key='%s', datalen=%d\n", key, datalen); if (addend(klen,datalen,h) == -1) GError("GCdbWrite: Error at addend(%d, %d, h)\n", klen, datalen); return 1; } int GCdbWrite::addrec(const char *key,unsigned int keylen,char *data,unsigned int datalen) { if (GCdbWrite::addbegin(keylen,datalen) == -1) return -1; if (cdbuf->putalign((char*)key,keylen) == -1) return -1; if (cdbuf->putalign(data,datalen) == -1) return -1; return GCdbWrite::addend(keylen,datalen,cdb_hash(key,keylen)); } int GCdbWrite::finish() { char buf[8]; int i; uint32 len; uint32 u; uint32 memsize; uint32 icount; uint32 where; struct cdb_hplist *x; struct cdb_hp *hp; for (i = 0;i < 256;++i) count[i] = 0; for (x = head;x;x = x->next) { i = x->num; while (i--) ++count[255 & x->hp[i].h]; } memsize = 1; for (i = 0;i < 256;++i) { u = count[i] * 2; if (u > memsize) memsize = u; } memsize += numentries; /* no overflow possible up to now */ u = (uint32) 0 - (uint32) 1; 
u /= sizeof(struct cdb_hp); if (memsize > u) { /* errno = error_nomem;*/ return -1; } split = (struct cdb_hp *) gcdb_alloc(memsize * sizeof(struct cdb_hp)); if (!split) return -1; hash = split + numentries; u = 0; for (i = 0;i < 256;++i) { u += count[i]; /* bounded by numentries, so no overflow */ start[i] = u; } for (x = head;x;x = x->next) { i = x->num; while (i--) split[--start[255 & x->hp[i].h]] = x->hp[i]; } for (i = 0;i < 256;++i) { icount = count[i]; len = icount + icount; /* no overflow possible */ uint32_pack(final + 8 * i,pos); uint32_pack(final + 8 * i + 4,len); for (u = 0;u < len;++u) hash[u].h = hash[u].p = 0; hp = split + start[i]; for (u = 0;u < icount;++u) { where = (hp->h >> 8) % len; while (hash[where].p) if (++where == len) where = 0; hash[where] = *hp++; } for (u = 0;u < len;++u) { uint32_pack(buf,hash[u].h); uint32_pack(buf + 4,hash[u].p); if (cdbuf->putalign(buf,8) == -1) return -1; if (posplus(8) == -1) return -1; } } if (cdbuf->flush() == -1) return -1; if (gcdb_seek_begin(fd) == -1) return -1; return cdbuf->putflush(final,sizeof final); } //===================================================== //------------- cdb ------------------- //===================================================== uint32 cdb_hashadd(uint32 h,unsigned char c) { h += (h << 5); return h ^ c; } uint32 cdb_hash(const char *buf,unsigned int len) { uint32 h; h = CDB_HASHSTART; while (len) { h = cdb_hashadd(h,*buf++); --len; } return h; } //--------------------------------------------------------------- //-------------------------- cdb methods ------------------------ GCdbRead::GCdbRead(int afd) { struct stat st; char *x; map=NULL; //check endianness :) gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86; gcvt_offt=(endian_test())? 
&offt_sun : &offt_x86; findstart(); fd = afd; if (fstat(fd,&st) == 0) if (st.st_size <= MAX_UINT) { #ifndef NO_MMAP x = (char *) mmap(0,st.st_size,PROT_READ,MAP_SHARED,fd,0); if (x + 1) { size = st.st_size; map = x; } else { GError("Error mapping the file (size=%ld)!\n",st.st_size); } #endif } else { GError("Error mapping the file (size %ld > MAX_UINT)\n", st.st_size); } } GCdbRead::GCdbRead(char* afname) { struct stat st; char *x; map=NULL; //check endianness :) gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86; gcvt_offt=(endian_test())? &offt_sun : &offt_x86; findstart(); #ifdef __WIN32__ fd = open(afname, O_RDONLY|O_BINARY); #else fd = open(afname, O_RDONLY); #endif if (fd == -1) GError("Error: cannot open file %s\n", afname); strcpy(fname, afname); if (fstat(fd,&st) == 0) if (st.st_size <= MAX_UINT) { #ifndef NO_MMAP x = (char *) mmap(0,st.st_size,PROT_READ,MAP_SHARED,fd,0); if (x + 1) { size = st.st_size; map = x; } else { GError("GCdbRead: Error mapping the file (size=%ld)!\n",st.st_size); } #endif } else { GError("GCdbRead: Error mapping the file (size %ld > MAX_UINT)\n", st.st_size); } } GCdbRead::~GCdbRead() { if (map!=NULL) { munmap(map,size); map = 0; } } int GCdbRead::read(char *buf,unsigned int len, uint32 pos) { #ifndef NO_MMAP if (map) { if ((pos > size) || (size - pos < len)) { /* errno = error_proto; */ return -1; } gcdb_byte_copy(buf, len, map + pos); } else #endif { if (gcdb_seek_set(fd,pos) == -1) return -1; while (len > 0) { int r; do { r = ::read(fd,buf,len); } while ((r == -1) && (errno == error_intr)); if (r == -1) return -1; if (r == 0) { //errno = error_proto; return -1; } buf += r; len -= r; } } return 0; } int GCdbRead::match(const char *key, unsigned int len, uint32 pos) { char buf[32]; unsigned int n; while (len > 0) { n = sizeof buf; if (n > len) n = len; if (GCdbRead::read(buf,n,pos) == -1) return -1; if (byte_diff(buf,n,(char*)key)) return 0; pos += n; key += n; len -= n; } return 1; } int GCdbRead::findnext(const char 
*key,unsigned int len) { char buf[8]; uint32 pos; uint32 u; if (!loop) { u = cdb_hash(key,len); if (GCdbRead::read(buf,8,(u << 3) & 2047) == -1) return -1; uint32_unpack(buf + 4,&hslots); if (!hslots) return 0; uint32_unpack(buf,&hpos); khash = u; u >>= 8; u %= hslots; u <<= 3; kpos = hpos + u; } while (loop < hslots) { if (GCdbRead::read(buf,8,kpos) == -1) return - 1; uint32_unpack(buf + 4, &pos); if (!pos) return 0; loop += 1; kpos += 8; if (kpos == hpos + (hslots << 3)) kpos = hpos; uint32_unpack(buf,&u); if (u == khash) { if (GCdbRead::read(buf,8,pos) == -1) return -1; uint32_unpack(buf,&u); if (u == len) switch(GCdbRead::match(key,len,pos + 8)) { case -1: return -1; case 1: uint32_unpack(buf + 4,&dlen); dpos = pos + 8 + len; return 1; } } } return 0; } int GCdbRead::find(const char *key) { GCdbRead::findstart(); return GCdbRead::findnext(key,gcdb_strlen(key)); } //----- GReadBuf and GReadBufLine char* GReadBufLine::readline(int idx) { //reads a char at a time until \n and/or \r are encountered GFREE(buf[idx].chars); buf[idx].len=0; if (isEOF) return NULL; int len=0; buf[idx].fpos=filepos; int c=0; int allocated=256; GMALLOC(buf[idx].chars, allocated); while ((c=getc(file))!=EOF) { if (len>=allocated-1) { allocated+=256; GREALLOC(buf[idx].chars, allocated); } if (c=='\n' || c=='\r') { buf[idx].chars[len]='\0'; if (c=='\r') { //DOS file -- special case if ((c=getc(file))!='\n') ungetc(c,file); else filepos++; } filepos++; buf[idx].len=len; return buf[idx].chars; } filepos++; buf[idx].chars[len]=(char)c; len++; } //while i0) { //preserve the lines already in buffer int bidx=bufidx-1;//always leave room for PREVIOUS line, for putLine() for (int i=0;i=0 && bufidx0 && bufidx0 && bufidx class GHash { protected: GHashEntry* hash; // Hash int fCapacity; // table size int fCount; // number of valid entries int fCurrentEntry; char* lastkeyptr; //pointer to last key string added //---------- Raw data retrieval (including empty entries // Return key at position pos. 
const char* Key(uint pos) const { return hash[pos].key; } // return data OBJ* at given position OBJ* Data(uint pos) const { return (OBJ*) hash[pos].data; } // Return mark flag of entry at position pos. bool Mark(uint pos) const { return hash[pos].mark; } // Return position of first filled slot, or >= fCapacity int First() const; // Return position of last filled slot or -1 int Last() const; // Return position of next filled slot in hash table // or a value greater than or equal to fCapacity if no filled // slot was found int Next(int pos) const; //Return position of previous filled slot in hash table //or a -1 if no filled slot was found int Prev(int pos) const; private: GHash(const GHash&); GHash &operator=(const GHash&); GFreeProc* fFreeProc; //procedure to free item data protected: public: static void DefaultFreeProc(pointer item) { delete (OBJ*)item; item=NULL; } public: GHash(GFreeProc* freeProc); // constructs of an empty hash GHash(bool doFree=true); // constructs of an empty hash (free the item objects) void setFreeItem(GFreeProc *freeProc) { fFreeProc=freeProc; } void setFreeItem(bool doFree) { fFreeProc=(doFree)? &DefaultFreeProc : NULL; } int Capacity() const { return fCapacity; } // table's size, including the empty slots. void Resize(int m); // Resize the table to the given size. int Count() const { return fCount; }// the total number of entries in the table. // Insert a new entry into the table given key and mark. // If there is already an entry with that key, leave it unchanged, const OBJ* Add(const char* ky, const OBJ* ptr, bool mrk=false); //same as Add, but the key pointer is stored directly, no string duplicate //is made (shared-key-Add) const OBJ* shkAdd(const char* ky, const OBJ* ptr, bool mrk=false); // Replace data at key, if the entry's mark is less than // or equal to the given mark. If there was no existing entry, // a new entry is inserted with the given mark. 
OBJ* Replace(const char* ky, const OBJ* ptr, bool mrk=false); // Remove a given key and its data OBJ* Remove(const char* ky); // Find data OBJ* given key. OBJ* Find(const char* ky); bool hasKey(const char* ky); const char* getLastKey() { return lastkeyptr; } OBJ* operator[](const char* ky) { return Find(ky); } void startIterate(); //iterator-like initialization char* NextKey(); //returns next valid key in the table (NULL if no more) OBJ* NextData(); //returns next valid hash[].data OBJ* NextData(char*& nextkey); //returns next valid hash[].data //or NULL if no more //nextkey is SET to the corresponding key GHashEntry* NextEntry(); //returns a pointer to a GHashEntry /// Clear all entries void Clear(); /// Destructor virtual ~GHash(); }; // //======================== method definitions ======================== // /* Notes: - The hash algorithm should yield a fCount in the range [0...GHash::EMPTY) GHash::EMPTY and GHash::UNUSED are needed for flag purposes. - Since the algorithm doubles the table size when exceeding MAX_LOAD, it would be prudent to keep MIN_LOAD less than 1/2 MAX_LOAD; otherwise, the algorithm might hip-hop between halving and doubling, which would be quite expensive!! - Not many people seem to know that hash tables don't have to be prime numbers; in fact, a table size of 2**n and odd probe distance are very easy to arrange, and this works just as well! - We store the hash key, so that 99.999% of the time we can compare hash numbers; only when hash numbers match do we need to compare keys. Thus, with a good hash function, the fCount of calls to strcmp() should be roughly the same as the fCount of successful lookups. - The hash table should NEVER get full, or stuff will loop forever!! 
*/ // Initial table size (MUST be power of 2) #define DEF_HASH_SIZE 32 // Maximum hash table load factor (%) #define MAX_LOAD 80 // Minimum hash table load factor (%) #define MIN_LOAD 10 // Probe Position [0..n-1] #define HASH1(x,n) (((unsigned int)(x)*13)%(n)) // Probe Distance [1..n-1] #define HASH2(x,n) (1|(((unsigned int)(x)*17)%((n)-1))) #define FREEDATA (fFreeProc!=NULL) /*******************************************************************************/ // Construct empty hash template GHash::GHash(GFreeProc* freeProc) { GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); fFreeProc=freeProc; for (uint i=0; i GHash::GHash(bool doFree) { GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); fFreeProc = (doFree)?&DefaultFreeProc : NULL; for (uint i=0; i void GHash::Resize(int m){ register int i,n,p,x,h; GHashEntry *k; GASSERT(fCount<=fCapacity); if(m>2)>m) n>>=1; // Shrink until n/4 <= m while((n>>1)>1)); GASSERT(DEF_HASH_SIZE<=n); if(n!=fCapacity){ GASSERT(m<=n); GMALLOC(k, sizeof(GHashEntry)*n); for(i=0; i const OBJ* GHash::Add(const char* ky, const OBJ* pdata,bool mrk){ register int p,i,x,h,n; if(!ky) GError("GHash::insert: NULL key argument.\n"); GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount const OBJ* GHash::shkAdd(const char* ky, const OBJ* pdata,bool mrk){ register int p,i,x,h,n; if(!ky) GError("GHash::insert: NULL key argument.\n"); GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::Replace(const char* ky,const OBJ* pdata, bool mrk){ register int p,i,x,h,n; if(!ky){ GError("GHash::replace: NULL key argument.\n"); } GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::Remove(const char* ky){ register int p,x,h,n; if(!ky){ GError("GHash::remove: NULL key argument.\n"); } if(0 bool GHash::hasKey(const char* ky) { register int p,x,h,n; if(!ky){ GError("GHash::find: NULL key argument.\n"); } if(0 OBJ* GHash::Find(const char* ky){ register int p,x,h,n; if(!ky){ GError("GHash::find: NULL key 
argument.\n"); } if(0 void GHash::startIterate() {// initialize a key iterator; call fCurrentEntry=0; } template char* GHash::NextKey() { register int pos=fCurrentEntry; while (pos OBJ* GHash::NextData() { register int pos=fCurrentEntry; while (pos OBJ* GHash::NextData(char* &nextkey) { register int pos=fCurrentEntry; while (pos GHashEntry* GHash::NextEntry() { register int pos=fCurrentEntry; while (pos int GHash::First() const { register int pos=0; while(pos int GHash::Last() const { register int pos=fCapacity-1; while(0<=pos){ if(0<=hash[pos].hash) break; pos--; } GASSERT(pos<0 || 0<=hash[pos].hash); return pos; } // Find next valid entry template int GHash::Next(int pos) const { GASSERT(0<=pos && pos int GHash::Prev(int pos) const { GASSERT(0<=pos && pos= 0){ if(0<=hash[pos].hash) break; } GASSERT(pos<0 || 0<=hash[pos].hash); return pos; } // Remove all template void GHash::Clear(){ register int i; for(i=0; i=0){ if (hash[i].keyalloc) GFREE((hash[i].key)); if (FREEDATA) (*fFreeProc)(hash[i].data); } } GFREE(hash); GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); //reinitialize it for (i=0; i=0){ uint len=strlen(hash[i].key); store << len; store << hash[i].mark; store.save(hash[i].key,len); } } } // Load data void GHash::Load(Stream& store){ Object::load(store); store >> fCapacity; store >> fCount; for(int i=0; i> hash[i].hash; if(hash[i].hash>=0){ uint len; store >> len; store >> hash[i].mark; GMALLOC(hash[i].key,len+1); store.load(hash[i].key,len); hash[i].key[len]='\0'; } } } */ // Destroy table template GHash::~GHash(){ register int i; for(i=0; i=0){ if (hash[i].keyalloc) GFREE((hash[i].key)); if (FREEDATA) (*fFreeProc)(hash[i].data); } } GFREE(hash); } #endif cdbfasta/gclib/GStr.cpp0000664002442700244270000010135311306015643015150 0ustar gperteagpertea//--------------------------------------------------------------------------- #include "GStr.h" #include #include #include #include "GBase.h" #include #include 
//--------------------------------------------------------------------------- GStr::Data GStr::null_data; //========================================= GStr::Data * GStr::new_data(int length) { //static method to return a new Data object (allocate length) //content is undefined, but it's null terminated if (length > 0) { Data* data; GMALLOC(data, sizeof(Data)+length); data->ref_count = 0; data->length = length; data->chars[length] = '\0'; return data; } else return &null_data; } GStr::Data* GStr::new_data(const char* str) { //static method to return a new Data object (allocate length) //as a copy of a given string if (str==NULL) return &null_data; int length=strlen(str); if (length > 0) { Data* data; GMALLOC(data, sizeof(Data)+length); strcpy(data->chars, str); data->ref_count = 0; data->length = length; data->chars[length] = '\0'; return data; } else return &null_data; } void GStr::replace_data(int len) { if (len == my_data->length && my_data->ref_count <= 1) return; if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); if (len > 0) { //my_data = (Data *) malloc(sizeof(Data) + len); GMALLOC(my_data, sizeof(Data) + len); my_data->ref_count = 1; my_data->length = len; my_data->chars[len] = '\0'; } else my_data = &null_data; } void GStr::replace_data(Data *data) { if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); if (data != &null_data) data->ref_count++; my_data = data; } void GStr::make_unique() {//make sure is not a reference to other string if (my_data->ref_count > 1) { Data *data = new_data(length()); ::memcpy(data->chars, chars(), length()); my_data->ref_count--; my_data = data; my_data->ref_count++; } } bool operator==(const char *s1, const GStr& s2){ if (s1==NULL) return s2.is_empty(); return (strcmp(s1, s2.chars()) == 0); } bool operator<(const char *s1, const GStr& s2) { if (s1==NULL) return !s2.is_empty(); return (strcmp(s1, s2.chars()) < 0); } bool operator<=(const char *s1, const GStr& s2){ if (s1==NULL) return true; 
return (strcmp(s1, s2.chars()) <= 0); } bool operator>(const char *s1, const GStr& s2) { if (s1==NULL) return false; return (strcmp(s1, s2.chars()) > 0); } GStr::GStr():my_data(&null_data) { fTokenDelimiter=NULL; fLastTokenStart=0; readbuf=NULL; } GStr::GStr(const GStr& s): my_data(&null_data){ fTokenDelimiter=NULL; fLastTokenStart=0; readbuf=NULL; replace_data(s.my_data); } GStr::GStr(const char *s): my_data(&null_data) { fTokenDelimiter=NULL; fLastTokenStart=0; readbuf=NULL; my_data=new_data(s); my_data->ref_count = 1; } GStr::GStr(const int i): my_data(&null_data) { fTokenDelimiter=NULL; fLastTokenStart=0; readbuf=NULL; char buf[20]; sprintf(buf,"%d",i); const int len = ::strlen(buf); replace_data(len); ::memcpy(chrs(), buf, len); } GStr::GStr(const double f): my_data(&null_data) { fTokenDelimiter=NULL; fLastTokenStart=0; readbuf=NULL; char buf[20]; sprintf(buf,"%f",f); const int len = ::strlen(buf); replace_data(len); ::memcpy(chrs(), buf, len); } GStr::GStr(char c, int n): my_data(&null_data) { fTokenDelimiter=NULL; fLastTokenStart=0; readbuf=NULL; replace_data(n); ::memset(chrs(), c, n); } GStr::~GStr() { if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); GFREE(fTokenDelimiter); GFREE(readbuf); } char& GStr::operator[](int idx){ //returns reference to char (can be l-value) if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("operator[]"); make_unique(); //because the user will probably modify this char! return chrs()[idx]; } char GStr::operator[](int idx) const { //returns char copy (cannot be l-value!) 
if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("operator[]"); return chars()[idx]; } GStr& GStr::operator=(const GStr& s) { make_unique(); //edit operation ahead replace_data(s.my_data); return *this; } GStr& GStr::operator=(const char *s) { make_unique(); //edit operation ahead if (s==NULL) { replace_data(0); return *this; } const int len = ::strlen(s); replace_data(len); ::memcpy(chrs(), s, len); return *this; } GStr& GStr::operator=(const double f) { make_unique(); //edit operation ahead char buf[20]; sprintf(buf,"%f",f); const int len = ::strlen(buf); replace_data(len); ::memcpy(chrs(), buf, len); return *this; } GStr& GStr::operator=(const int i) { make_unique(); //edit operation ahead char buf[20]; sprintf(buf,"%d",i); const int len = ::strlen(buf); replace_data(len); ::memcpy(chrs(), buf, len); return *this; } bool GStr::operator==(const GStr& s) const { if (s.is_empty()) return is_empty(); return (length() == s.length()) && (memcmp(chars(), s.chars(), length()) == 0); } bool GStr::operator==(const char *s) const { if (s==NULL) return is_empty(); return (strcmp(chars(), s) == 0); } bool GStr::operator<(const GStr& s) const { if (s.is_empty()) return false; return (strcmp(chars(), s.chars()) < 0); } bool GStr::operator<(const char *s) const { if (s==NULL) return false; return (strcmp(chars(), s) < 0); } bool GStr::operator<=(const GStr& s) const { if (s.is_empty()) return is_empty(); return (strcmp(chars(), s.chars()) <= 0); } bool GStr::operator<=(const char *s) const { if (s==NULL) return is_empty(); return (strcmp(chars(), s) <= 0); } bool GStr::operator>(const GStr& s) const { if (s.is_empty()) return !is_empty(); return (strcmp(chars(), s.chars()) > 0); } bool GStr::operator>(const char *s) const { if (s==NULL) return !is_empty(); return (strcmp(chars(), s) > 0); } bool GStr::operator>=(const GStr& s) const { if (s.is_empty()) return true; return (strcmp(chars(), s.chars()) >= 0); } bool GStr::operator>=(const char *s) 
const { if (s==NULL) return true; return (strcmp(chars(), s) >= 0); } bool GStr::operator!=(const GStr& s) const { if (s.is_empty()) return !is_empty(); return (length() != s.length()) || (memcmp(chars(), s.chars(), length()) != 0); } bool GStr::operator!=(const char *s) const { if (s==NULL) return !is_empty(); return (strcmp(chars(), s) != 0); } GStr& GStr::operator+=(const GStr& s) { return append((const char *)s); } GStr& GStr::operator+=(const char* s) { return append(s); } GStr& GStr::operator+=(const char c) { char buf[4]; sprintf(buf,"%c",c); return append(buf); } GStr& GStr::operator+=(const int i) { char buf[20]; sprintf(buf,"%d",i); return append(buf); } GStr& GStr::operator+=(const double f) { char buf[30]; sprintf(buf,"%f",f); return append(buf); } bool GStr::is_empty() const { //return my_data == &null_data; return (length()==0); } GStr GStr::copy() const { GStr newstring(*this); return newstring; } GStr& GStr::clear() { make_unique(); //edit operation ahead replace_data(0); return *this; } int GStr::index(const GStr& s, int start_index) const { return index(s.chars(), start_index); } bool GStr::contains(const GStr& s) const { return (index(s, 0) >= 0); } bool GStr::contains(const char *s) const { return (index(s, 0) >= 0); } bool GStr::startsWith(const char *s) const { return (index(s, 0) == 0); } bool GStr::contains(char c) const { return (index(c, 0) >= 0); } GStr& GStr::format(const char *fmt,...) { // Format as in sprintf make_unique(); //edit operation ahead char* buf; GMALLOC(buf, strlen(fmt)+1024); va_list arguments; va_start(arguments,fmt); //+1K buffer, should be enough for common expressions int len=vsprintf(buf,fmt,arguments); va_end(arguments); replace_data(len); //this also adds the '\0' at the end! //and sets the right len ::memcpy(chrs(), buf, len); GFREE(buf); return *this; } GStr& GStr::appendfmt(const char *fmt,...) 
{ // Format as in sprintf make_unique(); //edit operation ahead char* buf; GMALLOC(buf, strlen(fmt)+1024); va_list arguments; va_start(arguments,fmt); //+1K buffer, should be enough for common expressions vsprintf(buf,fmt,arguments); va_end(arguments); append(buf); GFREE(buf); return *this; } GStr& GStr::trim(char c) { register int istart; register int iend; for (istart=0; istartistart && chars()[iend]==c;iend--); int newlen=iend-istart+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, &chars()[istart], newlen); replace_data(data); return *this; } GStr& GStr::trim(const char* c) { register int istart; register int iend; for (istart=0; istartistart && strchr(c, chars()[iend])!=NULL;iend--); int newlen=iend-istart+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, &chars()[istart], newlen); replace_data(data); return *this; } GStr& GStr::trimR(char c) { //only trim the right end //register int istart; register int iend; for (iend=length()-1; iend>=0 && chars()[iend]==c;iend--); if (iend==-1) { replace_data(0); //string was entirely trimmed return *this; } int newlen=iend+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, chars(), newlen); replace_data(data); return *this; } GStr& GStr::trimR(const char* c) { register int iend; for (iend=length()-1; iend>=0 && strchr(c,chars()[iend])!=NULL;iend--); if (iend==-1) { replace_data(0); //string was entirely trimmed return *this; } int newlen=iend+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, chars(), newlen); replace_data(data); return *this; } GStr& GStr::chomp(const char* cstr) { register int iend; if (cstr==NULL || *cstr==0) return 
*this; //check if this ends with cstr int cend=strlen(cstr)-1; iend=my_data->length-1; while (iend>=0 && cend>=0) { if (my_data->chars[iend]!=cstr[cend]) return *this; iend--; cend--; } if (iend==-1) { replace_data(0); //string will be entirely trimmed return *this; } int newlen=iend+1; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, chars(), newlen); replace_data(data); return *this; } GStr& GStr::trimL(char c) { register int istart; for (istart=0; istartchars, &chars()[istart], newlen); replace_data(data); return *this; } GStr& GStr::trimL(const char* c) { register int istart; for (istart=0; istartchars, &chars()[istart], newlen); replace_data(data); return *this; } GStr& GStr::padR(int len, char c) { //actually means align right in len if (length()>=len) return *this; //no room for padding make_unique(); //edit operation ahead Data *data = new_data(len); ::memset(data->chars,c,len-length()); ::memcpy(&data->chars[len-length()], chars(), length()); replace_data(data); return *this; } GStr& GStr::padL(int len, char c) { //align left the string if (length()>=len) return *this; //no room for padding make_unique(); //edit operation ahead Data *data = new_data(len); ::memcpy(data->chars, chars(), length()); ::memset(&data->chars[length()],c,len-length()); replace_data(data); return *this; } GStr& GStr::padC(int len, char c) { if (length()>=len) return *this; //no room for padding make_unique(); //edit operation ahead int istart=(len-length())/2; Data *data = new_data(len); if (istart>0) ::memset(data->chars, c, istart); ::memcpy(&data->chars[istart], chars(), length()); int iend=istart+length(); if (iendchars[iend],c,len-iend); replace_data(data); return *this; } GStr operator+(const char *s1, const GStr& s2) { const int s1_length = ::strlen(s1); if (s1_length == 0) return s2; else { GStr newstring; newstring.replace_data(s1_length + s2.length()); ::memcpy(newstring.chrs(), s1, s1_length); 
::memcpy(&(newstring.chrs())[s1_length], s2.chars(), s2.length()); return newstring; } } //========================================= GStr GStr::operator+(const GStr& s) const { if (length() == 0) return s; else if (s.length() == 0) return *this; else { GStr newstring; newstring.replace_data(length() + s.length()); ::memcpy(newstring.chrs(), chars(), length()); ::memcpy(&(newstring.chrs())[length()], s.chars(), s.length()); return newstring; } } //========================================= GStr GStr::operator+(const char *s) const { const int s_length = ::strlen(s); if (s_length == 0) return *this; else { GStr newstring; newstring.replace_data(length() + s_length); ::memcpy(newstring.chrs(), chars(), length()); ::memcpy(&(newstring.chrs())[length()], s, s_length); return newstring; } } GStr GStr::operator+(const int i) const { char buf[20]; sprintf(buf, "%d", i); const int s_length = ::strlen(buf); GStr newstring; newstring.replace_data(length() + s_length); ::memcpy(newstring.chrs(), chars(), length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } GStr GStr::operator+(const char c) const { char buf[4]; sprintf(buf, "%c", c); const int s_length = ::strlen(buf); GStr newstring; newstring.replace_data(length() + s_length); ::memcpy(newstring.chrs(), chars(), length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } GStr GStr::operator+(const double f) const { char buf[30]; sprintf(buf, "%f", f); const int s_length = ::strlen(buf); GStr newstring; newstring.replace_data(length() + s_length); ::memcpy(newstring.chrs(), chars(), length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } //========================================= bool GStr::is_space() const { if (my_data == &null_data) return false; for (register const char *p = chars(); *p; p++) if (!isspace(*p)) return false; return true; } //========================================= GStr GStr::substr(int idx, int len) const { // A 
negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); // A length of -1 specifies the rest of the string. if (len == -1 || len>length()-idx) len = length() - idx; if (idx<0 || idx>=length() || len<0 ) invalid_args_error("substr()"); GStr newstring; newstring.replace_data(len); ::memcpy(newstring.chrs(), &chars()[idx], len); return newstring; } //transform: any character from 'from' is replaced with a coresponding //char from 'to' GStr& GStr::tr(const char *rfrom, const char* rto) { if (length() == 0 || rfrom==NULL || strlen(rfrom)==0) return *this; unsigned int l=strlen(rfrom); if (rto!=NULL && strlen(rto)!=l) invalid_args_error("tr()"); make_unique(); //edit operation ahead Data *data = new_data(length()); if (rto==NULL) { //deletion case char* s = my_data->chars; char* p; char* dest = data->chars; do { if ((p=strpbrk(s,rfrom))!=NULL) { memcpy(dest,s,p-s); dest+=p-s; s=p+1; } else { strcpy(dest, s); dest+=strlen(s); } } while (p!=NULL); (*dest)='\0'; } else { //char substitution case - easier! 
const char* p; for (int i=0; ichars[i]))!=NULL) my_data->chars[i]=rto[p-rfrom]; } } data->length=strlen(data->chars); replace_data(data); return *this; } // search and replace all the occurences of a string with another string // or just remove the given string (if replacement is NULL) GStr& GStr::replace(const char *rfrom, const char* rto) { if (length() == 0 || rfrom==NULL || strlen(rfrom)==0) return *this; unsigned int l=strlen(rfrom); unsigned int tl= (rto==NULL)?0:strlen(rto); make_unique(); //edit operation ahead char* p; char* dest; char* newdest=NULL; char* s = my_data->chars; if (tl!=l) { //reallocation if (tl>l) { //possible enlargement GMALLOC(newdest, length()*(tl-l+1)+1); } else {//delete or replace with a shorter string GMALLOC(newdest, length() + 1); } dest=newdest; if (tl==0) {//deletion while ((p=strstr(s,rfrom))!=NULL) { //rfrom found at position p memcpy(dest,s,p-s); dest+=p-s; s+=p-s+l; //s positioned in string after rfrom } //no more occurences, copy the remaining string strcpy(dest, s); } else { //replace with another string while ((p=strstr(s,rfrom))!=NULL) { memcpy(dest,s,p-s); //copy up rto the match dest+=p-s; memcpy(dest,rto,tl); //put the replacement string dest+=tl; s+=p-s+l; } //not found any more, copy rto end of string strcpy(dest, s); } Data* data=new_data(newdest); replace_data(data); GFREE(newdest); } else { //inplace editing: no need rto reallocate while ((p=strstr(s,rfrom))!=NULL) { memcpy(p,rto,l); s+=p-s+l; } } return *this; } GStr& GStr::cut(int idx, int len) { if (len == 0) return *this; make_unique(); //edit operation ahead // A negative idx specifies an idx from the right of the string, // so the left part will be cut out if (idx < 0) idx += length(); // A length of -1 specifies the rest of the string. 
if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("cut()"); Data *data = new_data(length() - len); if (idx > 0) ::memcpy(data->chars, chars(), idx); ::strcpy(&data->chars[idx], &chars()[idx+len]); replace_data(data); return *this; } //========================================= GStr& GStr::paste(const GStr& s, int idx, int len) { // A negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); make_unique(); //edit operation ahead // A length of -1 specifies the rest of the string. if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("replace()"); if (len == s.length() && my_data->ref_count == 1) ::memcpy(&chrs()[idx], s.chars(), len); else { Data *data = new_data(length() - len + s.length()); if (idx > 0) ::memcpy(data->chars, chars(), idx); if (s.length() > 0) ::memcpy(&data->chars[idx], s.chars(), s.length()); ::strcpy(&data->chars[idx+s.length()], &chars()[idx+len]); replace_data(data); } return *this; } //========================================= GStr& GStr::paste(const char *s, int idx, int len) { // A negative idx specifies an idx from the right of the string. make_unique(); //edit operation ahead if (idx < 0) idx += length(); // A length of -1 specifies the rest of the string. 
if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("replace()"); const int s_length = ::strlen(s); if (len == s_length && my_data->ref_count == 1) ::memcpy(&chrs()[idx], s, len); else { Data *data = new_data(length() - len + s_length); if (idx > 0) ::memcpy(data->chars, chars(), idx); if (s_length > 0) ::memcpy(&data->chars[idx], s, s_length); ::strcpy(&data->chars[idx+s_length], &chars()[idx+len]); replace_data(data); } return *this; } //========================================= GStr& GStr::insert(const GStr& s, int idx) { make_unique(); //edit operation ahead // A negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("insert()"); if (s.length() > 0) { Data *data = new_data(length() + s.length()); if (idx > 0) ::memcpy(data->chars, chars(), idx); ::memcpy(&data->chars[idx], s.chars(), s.length()); ::strcpy(&data->chars[idx+s.length()], &chars()[idx]); replace_data(data); } return *this; } //========================================= GStr& GStr::insert(const char *s, int idx) { // A negative idx specifies an idx from the right of the string. 
make_unique(); //edit operation ahead if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("insert()"); const int s_length = ::strlen(s); if (s_length > 0) { Data *data = new_data(length() + s_length); if (idx > 0) ::memcpy(data->chars, chars(), idx); ::memcpy(&data->chars[idx], s, s_length); ::strcpy(&data->chars[idx+s_length], &chars()[idx]); replace_data(data); } return *this; } //========================================= GStr& GStr::append(const char* s) { make_unique(); //edit operation ahead int len=::strlen(s); int newlength=len+my_data->length; if (newlength<=my_data->length) return *this; if (my_data->length==0) { replace_data(len); ::memcpy(my_data->chars, s, len); return *this; } //faster solution with realloc GREALLOC(my_data, sizeof(Data)+newlength); ::strcpy(&my_data->chars[my_data->length], s); my_data->length=newlength; my_data->chars[newlength]='\0'; return *this; } GStr& GStr::append(const GStr& s) { return append((const char *)s); } GStr& GStr::upper() { make_unique(); //edit operation ahead for (register char *p = chrs(); *p; p++) *p = (char) toupper(*p); return *this; } //========================================= GStr& GStr::lower() { make_unique(); for (register char *p = chrs(); *p; p++) *p = (char) tolower(*p); return *this; } //========================================= int GStr::index(const char *s, int start_index) const { // A negative index specifies an index from the right of the string. if (strlen(s)>(size_t)length()) return -1; if (start_index < 0) start_index += length(); if (start_index < 0 || start_index >= length()) invalid_index_error("index()"); const char* idx = strstr(&chars()[start_index], s); if (!idx) return -1; else return idx - chars(); } //========================================= int GStr::index(char c, int start_index) const { // A negative index specifies an index from the right of the string. 
if (length()==0) return -1; if (start_index < 0) start_index += length(); if (start_index < 0 || start_index >= length()) invalid_index_error("index()"); if (c == '\0') return -1; const char *idx=(char *) ::memchr(&chars()[start_index], c, length()-start_index); if (idx==NULL) return -1; else return idx - chars(); } int GStr::rindex(char c) const { if (c == '\0' || length()==0) return -1; char* idx= rstrchr((char*)chars(), c); if (idx==NULL) return -1; else return idx-chars(); } int GStr::rindex(const char* str) const { if (str==NULL || *str == '\0' || length()==0) return -1; char* idx= rstrfind((char*)chars(), str); if (idx==NULL) return -1; else return idx-chars(); } GStr GStr::split(const char* delim) { /* splits "this" in two parts, at the first (left) encounter of delim: 1st would stay in "this", 2nd part will be returned as a new string! */ GStr result; int i=index(delim); if (i>=0){ result=substr(i+strlen(delim)); cut(i); return result; } return result; } GStr GStr::split(char c) { /* splits "this" in two parts, at the first (left) encounter of delim: 1st would stay in "this", 2nd part will be returned as a new string! 
*/ GStr result; int i=index(c); if (i>=0){ result=substr(i+1); cut(i); return result; } return result; } GStr GStr::splitr(const char* delim) { GStr result; int i=rindex(delim); if (i>=0){ result=substr(i+strlen(delim)); cut(i); return result; } return result; } GStr GStr::splitr(char c) { GStr result; int i=rindex(c); if (i>=0){ result=substr(i+1); cut(i); return result; } return result; } void GStr::startTokenize(const char* delimiter, enTokenizeMode tokenizemode) { GFREE(fTokenDelimiter); GMALLOC(fTokenDelimiter,strlen(delimiter)+1); strcpy(fTokenDelimiter, delimiter); fLastTokenStart=0; fTokenizeMode=tokenizemode; } bool GStr::nextToken(GStr& token) { if (fTokenDelimiter==NULL) { GError("GStr:: no token delimiter; use StartTokenize first\n"); } if (fLastTokenStart>=length()) {//no more GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } int dlen=strlen(fTokenDelimiter); char* delpos=NULL; //delimiter position int tlen=0; if (fTokenizeMode==tkFullString) { //exact string as a delimiter delpos=(char*)strstr(chars()+fLastTokenStart,fTokenDelimiter); if (delpos==NULL) delpos=(char*)(chars()+length()); //empty records may be returned if (chars()+fLastTokenStart == delpos) { //empty token fLastTokenStart=(delpos-chars())+dlen; token=""; return true; } else { tlen=delpos-(chars()+fLastTokenStart); token.replace_data(tlen); ::memcpy(token.chrs(), &chars()[fLastTokenStart], tlen); fLastTokenStart=(delpos-chars())+dlen; return true; } } else { //tkCharSet - any character is a delimiter //empty records are never returned ! 
if (fLastTokenStart==0) {//skip any starting delimiters delpos=(char*)chars(); while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL) delpos++; if (*delpos!='\0') fLastTokenStart = delpos-chars(); else { //only delimiters here,no tokens GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } } //now fLastTokenStart is on a non-delimiter char //GMessage("String at fLastTokenStart=%d is %s\n", fLastTokenStart, delpos); char* token_end=NULL; delpos=(char*)strpbrk(chars()+fLastTokenStart,fTokenDelimiter); if (delpos==NULL) delpos=(char*)(chars()+length()); token_end=delpos-1; while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL) delpos++; //skip any other delimiters in the set! //now we know that delpos is on the beginning of next token tlen=(token_end-chars())-fLastTokenStart+1; if (tlen==0) { GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } token.replace_data(tlen); ::memcpy(token.chrs(), &chars()[fLastTokenStart], tlen); fLastTokenStart=delpos-chars(); return true; } //return true; } size_t GStr::read(FILE* stream, const char* delimiter, size_t bufsize) { //read up to (and including) the given delimiter string if (readbuf==NULL) { GMALLOC(readbuf, bufsize); readbufsize=bufsize; } else if (bufsize!=readbufsize) { GFREE(readbuf); if (bufsize>0) { GMALLOC(readbuf, bufsize); } readbufsize=bufsize; } if (bufsize==0) { replace_data(0); return 0; //clear the string and free the buffer } size_t numread; size_t acc_len=0; //accumulated length int seplen=strlen(delimiter); void* p=NULL; Data *data = new_data(0); do { numread=fread(readbuf, 1, bufsize, stream); if (numread) { p=Gmemscan(readbuf, bufsize, (void*) delimiter, seplen); if (p!=NULL) {//found the delimiter //position the stream after it int l = (char*)p-(char*)readbuf; fseek(stream, l+seplen-numread, SEEK_CUR); numread=l+seplen; } else {//not found, go back if not eof if (numread==bufsize) { fseek(stream, -seplen, SEEK_CUR); //check if this works! 
numread-=seplen; } } if (data==&null_data) { data=new_data(numread); ::memcpy(data->chars, readbuf, numread); acc_len+=numread; } else { GREALLOC(data, sizeof(Data)+acc_len+numread); memcpy(&data->chars[acc_len], readbuf, numread); acc_len+=numread; data->length=acc_len; data->chars[acc_len]='\0'; } } //if something read } while (p==NULL && numread!=0); replace_data(data); return acc_len; } int GStr::asInt(int base /*=10 */) { return strtol(text(), NULL, base); } bool GStr::asInt(int& r, int base) { errno=0; char*endptr; long val=strtol(text(), &endptr, base); if (errno!=0) return false; if (endptr == text()) return false; /* If we got here, strtol() successfully parsed a number */ r=val; return true; } double GStr::asReal() { return strtod(text(), NULL); } bool GStr::asReal(double& r) { errno=0; char* endptr; double val=strtod(text(), &endptr); if (errno!=0) return false; if (endptr == text()) return false; //no digits to parse r=val; return true; } int GStr::peelInt() const { if (is_empty()) return 0; char buf[24]; bool started=false; int j=0; int i; for (i=0;ichars[i])) j++; //set coord else break; //finished } else if (isdigit(my_data->chars[i])) { j++; started=true; } } if (j>0) { strncpy(buf, &my_data->chars[i-j], j); buf[j]='\0'; return strtol(buf, NULL, 10); } return 0; } int GStr::peelIntR() const { if (is_empty()) return 0; char buf[24]; bool started=false; int j=0; int i; for (i=length()-1;i>=0;i--) { if (started) { if (isdigit(my_data->chars[i])) j++; //set length else break; //finished } else if (isdigit(my_data->chars[i])) { j++; started=true; } } if (j>0) { strncpy(buf, &my_data->chars[i+1], j); buf[j]='\0'; return strtol(buf, NULL, 10); } return 0; } GStr GStr::to(char c) { //return the first part up to first occurence of c int i=index(c); if (i>=0) return substr(0,i); else return (*this); } //or whole string if c not found GStr GStr::from(char c) { //same as to, but starting from the right side int i=rindex(c); if (i>=0) return substr(i+1); else 
return (*this); } int GStr::count(char c){ //return the number of occurences of char c within the string int result=0; for (int i=0;ichars[i]==c) result++; return result; } //========================================= void GStr::invalid_args_error(const char *fname) { GError("GStr:: %s - invalid arguments\n", fname); } //**************************************************************************** void GStr::invalid_index_error(const char *fname) { GError("GStr:: %s - invalid index\n", fname); } //**************************************************************************** cdbfasta/gclib/GStr.h0000664002442700244270000002050511306015643014614 0ustar gperteagpertea//--------------------------------------------------------------------------- #ifndef GSTR_H #define GSTR_H //--------------------------------------------------------------------------- #include #include #include #include "GBase.h" // This class uses reference counting and copy-on-write semantics // All indexes are zero-based. For all functions that accept an index, a // negative index specifies an index from the right of the string. Also, // for all functions that accept a length, a length of -1 specifies the rest // of the string. 
// Tokenizing modes for GStr::startTokenize() / GStr::nextToken():
//  tkFullString - the delimiter is matched as an exact string
//  tkCharSet    - any character of the delimiter string is a delimiter
enum enTokenizeMode {
 tkFullString,
 tkCharSet
};

// String class whose character data lives in a separately allocated,
// reference-counted Data record (see struct Data below).
class GStr {
        // comparison and concatenation with a C string on the left-hand side
        friend GStr operator+(const char* s1, const GStr& s2);
        friend bool operator==(const char* s1, const GStr& s2);
        friend bool operator<(const char* s1, const GStr& s2);
        friend bool operator<=(const char* s1, const GStr& s2);
        friend bool operator>(const char* s1, const GStr& s2);
        friend bool operator>=(const char* s1, const GStr& s2);
        friend bool operator!=(const char* s1, const GStr& s2);
        // exchanges the Data pointers of the two strings
        friend void swap(GStr& s1, GStr& s2);
    public:
        GStr();
        GStr(const GStr& s);
        GStr(const char* s);
        GStr(const int i);
        GStr(const double f);
        GStr(char c, int n = 1);
        ~GStr();
        // implicit conversion to a read-only C string (the internal buffer)
        operator const char* () const { return my_data->chars;} //inline here
        char& operator[](int index);
        char operator[](int index) const;
        GStr& operator=(const GStr& s);
        GStr& operator=(const char* s);
        GStr& operator=(const int i);
        GStr& operator=(const double f);
        GStr operator+(const GStr& s) const;
        GStr operator+(const char* s) const;
        GStr operator+(const char c) const;
        GStr operator+(const int i) const;
        GStr operator+(const double f) const;
        bool operator==(const GStr& s) const;
        bool operator==(const char* s) const;
        bool operator<(const GStr& s) const;
        bool operator<(const char* s) const;
        bool operator<=(const GStr& s) const;
        bool operator<=(const char* s) const;
        bool operator>(const GStr& s) const;
        bool operator>(const char* s) const;
        bool operator>=(const GStr& s) const;
        bool operator>=(const char* s) const;
        bool operator!=(const GStr& s) const;
        bool operator!=(const char* s) const;
        GStr& operator+=(const GStr& s);
        GStr& operator+=(const char* s);
        GStr& operator+=(const char c);
        GStr& operator+=(const int i);
        GStr& operator+=(const double f);
      //interface:
      public:
        int length() const;
        bool is_empty() const;
        bool is_space() const;
        GStr substr(int index = 0, int len = -1) const;
        GStr to(char c); //return the first part up to first occurrence of c
                           //or whole string if c not found
        GStr from(char c); //same as to, but starting from the right side:
                           //returns the part after the last occurrence of c
        GStr copy() const;
        GStr& format(const char *fmt,...);
        GStr& appendfmt(const char *fmt,...);
        GStr& cut(int index = 0, int len = -1); //delete a specified length
        // deletes the characters from index 'from' to index 'to', inclusive
        GStr& remove(int from, int to) {
              return cut(from, to-from+1);
              }

        //paste a string at the specified position
        GStr& paste(const GStr& s, int index = 0, int len=-1);
        GStr& paste(const char* s, int index = 0, int len = -1);
        GStr& replace(const char* from, const char* to=NULL);
        GStr& insert(const GStr& s, int index = 0);
        GStr& insert(const char* s, int index = 0);
        GStr& append(const char* s);
        GStr& append(const GStr& s);
        GStr& upper();
        GStr& lower();
        GStr& clear();//make empty
        //character translation or removal:
        GStr& tr(const char* from, const char* to=NULL);
        //number of occurrences of a char in the string:
        int count(char c);
        // startTokenize() stores a copy of the delimiter and resets the
        // tokenizer; nextToken() then returns the tokens one by one and
        // returns false when there are no more tokens
        void startTokenize(const char* delimiter, enTokenizeMode tokenizemode=tkCharSet);
        bool nextToken(GStr& token);
        // numeric conversions; the bool-returning overloads store the value
        // in r and return false if the string could not be parsed
        int asInt(int base=10);
        double asReal();
        double asDouble() { return asReal(); }
        bool asReal(double& r);
        bool asDouble(double& r) { return asReal(r); }
        bool asInt(int& r, int base=10);
        int index(const GStr& s, int start_index = 0) const;
        int index(const char* s, int start_index = 0) const;
        int index(char c, int start_index = 0) const;
        int rindex(char c) const;
        int rindex(const char* str) const;
        bool contains(const GStr& s) const;
        bool contains(const char* s) const;
        bool contains(char c) const;
        bool startsWith(const char* s) const;
        GStr split(const char* delim);
        GStr split(char c);
           /* splits "this" in two parts, at the first (leftmost)
                 encounter of delim:
                 1st would stay in "this"
                 (which this way is truncated)
                 2nd will go to the returned string
           */
        GStr splitr(const char* delim);
        GStr splitr(char c);
           /* splits "this" in two parts, at the last (rightmost)
                 encounter of delim:
                 1st would stay in "this"
                 2nd will be returned
           */

        int peelInt() const; //extract an integer, (left to right), from a
                //mixed alphanumeric string, e.g. 'T24HC1234b'=> 24
        int peelIntR() const; //same as above, but starts from the right side
        //e.g. 'T2HC1234b'=> 1234
        GStr& trim(char c);
        GStr& trim(const char* c=" \t\n\r"); //trim both ends of characters in given set
        GStr& trimR(const char* c=" \t\n\r"); //trim only right end
        GStr& trimR(char c=' ');
        GStr& chomp(char c='\n') { return trimR(c); }
        GStr& chomp(const char* cstr); //like trimR, but given string is taken as a whole
        GStr& trimL(const char* c=" \t\n\r"); //trim only left end
        GStr& trimL(char c=' ');
        GStr& padR(int len, char c=' '); //align it in len spaces to the right
        GStr& padL(int len, char c=' '); //align it in len spaces to the left
        GStr& padC(int len, char c=' '); //center it
        size_t read(FILE* stream, const char* delimiter="\n", size_t bufsize=4096);
          //read next token from stream, using the given string as
          //a marker where the block should stop
          //(the delimiter is included in the data read; bufsize==0 clears
          // the string and frees the read buffer)
        static const int max_token_size = 200;
        static const int max_line_size = 600;
        const char* chars() const;
        const char* text() const;
    protected:
        char* fTokenDelimiter; //copy of the delimiter given to startTokenize()
        int fLastTokenStart;   //offset where nextToken() resumes scanning
        enTokenizeMode fTokenizeMode;
        void* readbuf; //file read buffer for the read() function
        size_t readbufsize; //last setting for the readbuf
        static void invalid_args_error(const char* fname);
        static void invalid_index_error(const char* fname);
        struct Data {//structure holding actual
                     //string data and reference count information
               Data() { ref_count=0; length=0; chars[0] = '\0'; }
               unsigned int ref_count;
               int length;
               char chars[1]; //first byte of the character storage; read()
                              //allocates sizeof(Data)+length bytes so the
                              //text extends past the end of the struct
              };
        static Data* new_data(int length); //alloc a specified length string's Data
        static Data* new_data(const char* str); //alloc a copy of a specified string
        void replace_data(int length);
        void replace_data(Data* data);
        void make_unique();
        char* chrs(); // this is dangerous, length should not be affected
        static Data null_data; //a null (empty) string Data is available here
        Data* my_data; //pointer to a Data object holding actual string data
};

/***************************************************************************/

// number of characters currently stored, as recorded in the Data record
inline int GStr::length() const {
 return my_data->length;
 }

// read-only access to the internal character buffer
inline const char *GStr::chars() const {
 return my_data->chars;
 }

inline char *GStr::chrs() { //protected version, allows modification of the chars
 return my_data->chars;
 }

// same as chars()
inline const char *GStr::text() const {
 return my_data->chars;
 }

// strcmp()-based comparisons with a C string on the left-hand side
inline bool operator>=(const char *s1, const GStr& s2) {
 return (strcmp(s1, s2.chars()) >= 0);
 }

inline bool operator!=(const char *s1, const GStr& s2) {
 return (strcmp(s1, s2.chars()) != 0);
 }

// exchanges the contents of two strings by swapping their Data pointers
inline void swap(GStr& s1, GStr& s2) {
 GStr::Data *tmp = s1.my_data; s1.my_data = s2.my_data;
 s2.my_data = tmp;
 }

#endif