DAZZ_DB-master/Catrack.c

/********************************************************************************************
 *
 *  Concatenate in block order all "block tracks" <DB>.<track>.# into a single track
 *    <DB>.<track>
 *
 *  Author:  Gene Myers
 *  Date  :  June 2014
 *
 ********************************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>

#include "DB.h"

#ifdef HIDE_FILES
#define PATHSEP "/."
#else
#define PATHSEP "/"
#endif

static char *Usage = "[-vfd] <path:db|dam> <track:name>";

int main(int argc, char *argv[])
{ char *prefix;
  FILE *aout, *dout;
  int   nblocks;
  int   nfiles;

  int   VERBOSE;
  int   FORCE;
  int   DELETE;

  //  Process arguments

  { int i, j, k;
    int flags[128];

    ARG_INIT("Catrack")

    j = 1;
    for (i = 1; i < argc; i++)
      if (argv[i][0] == '-')
        { ARG_FLAGS("vfd") }
      else
        argv[j++] = argv[i];
    argc = j;

    VERBOSE = flags['v'];
    FORCE   = flags['f'];
    DELETE  = flags['d'];

    if (argc != 3)
      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
        fprintf(stderr,"\n");
        fprintf(stderr,"      -v: verbose\n");
        fprintf(stderr,"      -d: delete individual blocks after a successful concatenation\n");
        fprintf(stderr,"      -f: force overwrite of track if already present\n");
        exit (1);
      }
  }

  { char *pwd, *root;
    int   i, plen, index, isdam;
    FILE *dstub;
    char *dstub_name;

    plen = strlen(argv[1]);
    if (strcmp(argv[1]+(plen-4),".dam") == 0)
      root = Root(argv[1],".dam");
    else
      root = Root(argv[1],".db");
    pwd    = PathTo(argv[1]);
    prefix = Strdup(Catenate(pwd,PATHSEP,root,"."),"Allocating track name");

    dstub = fopen(Catenate(pwd,"/",root,".db"),"r");
    isdam = 0;
    if (dstub == NULL)
      { dstub = fopen(Catenate(pwd,"/",root,".dam"),"r");
        isdam = 1;
        if (dstub == NULL)
          { fprintf(stderr,"%s: Cannot find %s either as a .db or a .dam\n",Prog_Name,root);
            exit (1);
          }
      }
    dstub_name = Strdup(Catenate(pwd,"/",root,isdam?".dam":".db"),"Allocating db file name");
    if (dstub_name == NULL)
      exit (1);

    FSCANF(dstub,DB_NFILE,&nblocks)
    for (i = 0; i < nblocks; i++)
      { char prolog[MAX_NAME], fname[MAX_NAME];

        FSCANF(dstub,DB_FDATA,&index,fname,prolog)
      }
    FSCANF(dstub,DB_NBLOCK,&nblocks)

    fclose(dstub);
    free(dstub_name);
    free(pwd);
    free(root);

    aout = fopen(Catenate(prefix,argv[2],".","anno"),"r");
    if (aout != NULL && !FORCE)
      { fprintf(stderr,"%s: Track file %s%s.anno already exists!\n",Prog_Name,prefix,argv[2]);
        exit (1);
      }
    dout = fopen(Catenate(prefix,argv[2],".","data"),"r");
    if (dout != NULL && !FORCE)
      { fprintf(stderr,"%s: Track file %s%s.data already exists!\n",Prog_Name,prefix,argv[2]);
        exit (1);
      }

    aout = Fopen(Catenate(prefix,argv[2],".","anno"),"w");
    if (aout == NULL)
      exit (1);
    dout = NULL;
  }

  { int         tracktot, tracksiz;
    int64       trackoff;
    char        data[1024];
    void       *anno;
    FILE       *lfile = NULL;
    DAZZ_EXTRA *extra;
    int         nextra;
    int64       extail;

    anno     = NULL;
    trackoff = 0;
    tracktot = tracksiz = 0;
    if (fwrite(&tracktot,sizeof(int),1,aout) != 1)
      SYSTEM_WRITE_ERROR
    if (fwrite(&tracksiz,sizeof(int),1,aout) != 1)
      SYSTEM_WRITE_ERROR

    nextra = 0;
    nfiles = 0;
    while (1)
      { FILE *dfile, *afile;
        char *dfile_name, *afile_name;
        int   i, size, esize, tracklen;

        afile_name = Strdup(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","anno")),
                            "Allocating .anno file name");
        dfile_name = Strdup(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","data")),
                            "Allocating .data file name");
        if
(afile_name == NULL || dfile_name == NULL) goto error; afile = fopen(afile_name,"r"); if (afile == NULL) break; dfile = fopen(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","data")),"r"); if (dfile == NULL && errno != ENOENT) { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,dfile_name); goto error; } if (nfiles > 0) fclose(lfile); lfile = afile; if (VERBOSE) { fprintf(stderr,"Concatenating %s%d.%s ...\n",prefix,nfiles+1,argv[2]); fflush(stderr); } FREAD(&tracklen,sizeof(int),1,afile) FREAD(&size,sizeof(int),1,afile) if (size == 0) esize = 8; else esize = size; if (nfiles == 0) { tracksiz = size; if (dfile != NULL) { dout = Fopen(Catenate(prefix,argv[2],".","data"),"w"); if (dout == NULL) goto error; } else { anno = Malloc(esize,"Allocating annotation record"); if (anno == NULL) goto error; } } else { int escape = 1; if (tracksiz != size) { fprintf(stderr,"%s: Track block %d does not have the same annotation size (%d)", Prog_Name,nfiles+1,size); fprintf(stderr," as previous blocks (%d)\n",tracksiz); } else if (dfile == NULL && dout != NULL) fprintf(stderr,"%s: Track block %d does not have data but previous blocks do\n", Prog_Name,nfiles+1); else if (dfile != NULL && dout == NULL) fprintf(stderr,"%s: Track block %d has data but previous blocks do not\n", Prog_Name,nfiles+1); else escape = 0; if (escape) goto error; } if (dfile != NULL) { int64 dlen; if (esize == 4) { int anno4; for (i = 0; i < tracklen; i++) { FREAD(&anno4,sizeof(int),1,afile) anno4 += trackoff; FWRITE(&anno4,sizeof(int),1,aout) } FREAD(&anno4,sizeof(int),1,afile) dlen = anno4; } else { int64 anno8; for (i = 0; i < tracklen; i++) { FREAD(&anno8,sizeof(int64),1,afile) anno8 += trackoff; FWRITE(&anno8,sizeof(int64),1,aout) } FREAD(&anno8,sizeof(int64),1,afile) dlen = anno8; } trackoff += dlen; for (i = 1024; i < dlen; i += 1024) { FREAD(data,1024,1,dfile) FWRITE(data,1024,1,dout) } i -= 1024; if (i < dlen) { FREAD(data,dlen-i,1,dfile) FWRITE(data,dlen-i,1,dout) } } else { for (i = 0; i < tracklen; i++) { FREAD(anno,esize,1,afile) FWRITE(anno,esize,1,aout) } } FSEEKO(afile,0,SEEK_END) if (dfile != NULL) extail = FTELLO(afile) - (esize*(tracklen+1) + 2*sizeof(int)); else extail = FTELLO(afile) - (esize*tracklen + 2*sizeof(int)); FSEEKO(afile,-extail,SEEK_END) if (extail >= 20) { if (nfiles == 0) { nextra = 0; while (1) if (Read_Extra(afile,afile_name,NULL)) break; else nextra += 1; extra = (DAZZ_EXTRA *) Malloc(sizeof(DAZZ_EXTRA)*(nextra+1),"Allocating extras"); if (extra == NULL) goto error; FSEEKO(afile,-extail,SEEK_END) for (i = 0; i < nextra; i++) { extra[i].nelem = 0; Read_Extra(afile,afile_name,extra+i); } } else { for (i = 0; i < nextra; i++) if (Read_Extra(afile,afile_name,extra+i)) { fprintf(stderr,"%s: File %s has fewer extras than previous .anno files\n", Prog_Name,afile_name); goto error; } if (Read_Extra(afile,afile_name,extra+nextra) == 0) { fprintf(stderr,"%s: File %s has more extras than previous .anno files\n", Prog_Name,afile_name); goto error; } } } tracktot += tracklen; nfiles += 1; if (dfile != NULL) fclose(dfile); } if (nfiles == 0) { fprintf(stderr,"%s: Couldn't find first track block %s1.%s.anno\n", Prog_Name,prefix,argv[2]); goto error; } else { char *byte; if (dout != NULL) { if (tracksiz == 4) { int anno4 = trackoff; FWRITE(&anno4,sizeof(int),1,aout) } else { int64 anno8 = trackoff; FWRITE(&anno8,sizeof(int64),1,aout) } } if (nextra == 0) { while (fread(&byte,1,1,lfile) == 1) FWRITE(&byte,1,1,aout) } else { int i; for (i = 0; i < nextra; i++) Write_Extra(aout,extra+i); } 
fclose(lfile); FSEEKO(aout,0,SEEK_SET) FWRITE(&tracktot,sizeof(int),1,aout) FWRITE(&tracksiz,sizeof(int),1,aout) } } FCLOSE(aout); if (dout != NULL) FCLOSE(dout); if (nfiles != nblocks) { fprintf(stderr,"%s: Did not catenate all tracks of DB (nfiles %d != nblocks %d)\n", Prog_Name, nfiles, nblocks); goto error; } if (DELETE) { int i; char *name; for (i = 1; i <= nblocks ;i++) { name = Numbered_Suffix(prefix,i,Catenate(".",argv[2],".","anno")); if (unlink(name) != 0) fprintf(stderr,"%s: [WARNING] Couldn't delete file %s\n",Prog_Name,name); if (dout != NULL) { name = Numbered_Suffix(prefix,i,Catenate(".",argv[2],".","data")); if (unlink(name) != 0) fprintf(stderr,"%s: [WARNING] Couldn't delete file %s\n",Prog_Name,name); } } } free(prefix); exit (0); error: { char *name; fclose(aout); name = Catenate(prefix,argv[2],".","anno"); if (unlink(name) != 0) fprintf(stderr,"%s: [WARNING] Couldn't delete file %s during abort\n",Prog_Name,name); if (dout != NULL) { fclose(dout); name = Catenate(prefix,argv[2],".","data"); if (unlink(name) != 0) fprintf(stderr,"%s: [WARNING] Couldn't delete file %s during abort\n",Prog_Name,name); } free(prefix); } exit (1); } DAZZ_DB-master/DAM2fasta.c000066400000000000000000000125231322703422500153350ustar00rootroot00000000000000/******************************************************************************************** * * Recreate all the .fasta files that are in a specified DAM. * * Author: Gene Myers * Date : May 2014 * ********************************************************************************************/ #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-vU] [-w] "; int main(int argc, char *argv[]) { DAZZ_DB _db, *db = &_db; FILE *dbfile, *hdrs; char *dbfile_name, *hdrs_name; int nfiles; int VERBOSE, UPPER, WIDTH; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DAM2fasta") WIDTH = 80; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vU") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; } else argv[j++] = argv[i]; argc = j; UPPER = 1 + flags['U']; VERBOSE = flags['v']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -U: Use upper case for DNA (default is lower case).\n"); fprintf(stderr," -w: Print -w bp per line (default is 80).\n"); exit (1); } } // Open db { int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 0) { fprintf(stderr,"%s: Cannot be called on a .db: %s\n",Prog_Name,argv[1]); exit (1); } if (db->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } } { char *pwd, *root; pwd = PathTo(argv[1]); root = Root(argv[1],".dam"); dbfile_name = Strdup(Catenate(pwd,"/",root,".dam"),"Allocating db file name"); hdrs_name = Strdup(Catenate(pwd,PATHSEP,root,".hdr"),"Allocating header file name"); dbfile = Fopen(dbfile_name,"r"); hdrs = Fopen(hdrs_name,"r"); free(pwd); free(root); if (dbfile_name == NULL || hdrs_name == NULL || dbfile == NULL || hdrs == NULL) exit (1); } // nfiles = # of files in data base FSCANF(dbfile,DB_NFILE,&nfiles) // For each file do: { DAZZ_READ *reads; char *read; int f, first; char nstring[WIDTH+1]; if (UPPER == 2) for (f = 0; f < WIDTH; f++) nstring[f] = 'N'; else for (f = 0; f < WIDTH; f++) nstring[f] = 'n'; nstring[WIDTH] = '\0'; reads = db->reads; read = New_Read_Buffer(db); first = 0; for (f = 0; f < nfiles; f++) { int i, 
last, wpos; FILE *ofile; char prolog[MAX_NAME], fname[MAX_NAME], header[MAX_NAME]; // Scan db image file line, create .fasta file for writing FSCANF(dbfile,DB_FDATA,&last,fname,prolog) if (strcmp(fname,"stdout") == 0) { ofile = stdout; if (VERBOSE) { fprintf(stderr,"Sending %d contigs to stdout ...\n",last-first); fflush(stdout); } } else { if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Creating %s.fasta ...\n",fname); fflush(stdout); } } // For the relevant range of reads, write each to the file // recreating the original headers with the index meta-data about each read wpos = 0; for (i = first; i < last; i++) { int j, len, nlen, w; DAZZ_READ *r; r = reads + i; len = r->rlen; if (r->origin == 0) { if (i != first && wpos != 0) { fprintf(ofile,"\n"); wpos = 0; } FSEEKO(hdrs,r->coff,SEEK_SET) FGETS(header,MAX_NAME,hdrs) FPUTS(header,ofile) } if (r->fpulse != 0) { if (r->origin != 0) nlen = r->fpulse - (reads[i-1].fpulse + reads[i-1].rlen); else nlen = r->fpulse; for (j = 0; j+(w = WIDTH-wpos) <= nlen; j += w) { FPRINTF(ofile,"%.*s\n",w,nstring) wpos = 0; } if (j < nlen) { FPRINTF(ofile,"%.*s",nlen-j,nstring) if (j == 0) wpos += nlen; else wpos = nlen-j; } } Load_Read(db,i,read,UPPER); for (j = 0; j+(w = WIDTH-wpos) <= len; j += w) { FPRINTF(ofile,"%.*s\n",w,read+j) wpos = 0; } if (j < len) { FPRINTF(ofile,"%s",read+j) if (j == 0) wpos += len; else wpos = len-j; } } if (wpos > 0) FPRINTF(ofile,"\n") if (ofile != stdout) FCLOSE(ofile) first = last; } } fclose(hdrs); fclose(dbfile); Close_DB(db); exit (0); } DAZZ_DB-master/DB.c000066400000000000000000001522361322703422500141260ustar00rootroot00000000000000/******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif /******************************************************************************************* * * GENERAL UTILITIES * ********************************************************************************************/ char *Prog_Name; #ifdef INTERACTIVE char Ebuffer[1000]; #endif int Count_Args(char *var) { int cnt, lev; char *s; cnt = 1; lev = 0; for (s = var; *s != '\0'; s++) if (*s == ',') { if (lev == 0) cnt += 1; } else if (*s == '(') lev += 1; else if (*s == ')') lev -= 1; return (cnt); } void *Malloc(int64 size, char *mesg) { void *p; if ((p = malloc(size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } void *Realloc(void *p, int64 size, char *mesg) { if (size <= 0) size = 1; if ((p = realloc(p,size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } char *Strdup(char *name, char *mesg) { char *s; if (name == NULL) return (NULL); if ((s = strdup(name)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (s); } FILE *Fopen(char *name, char *mode) { FILE *f; if (name == NULL || mode == NULL) return (NULL); if ((f = fopen(name,mode)) == NULL) EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); return (f); } char *PathTo(char *name) { char *path, *find; if (name == NULL) return (NULL); if ((find = rindex(name,'/')) != NULL) { *find = '\0'; path = Strdup(name,"Extracting path from"); *find = '/'; } else path = Strdup(".","Allocating default path"); return (path); } char *Root(char *name, char *suffix) { char *path, *find, *dot; int epos; if (name == NULL) return (NULL); find = rindex(name,'/'); if (find == NULL) find = name; else find += 1; if (suffix == NULL) { dot = strchr(find,'.'); if (dot != NULL) *dot = '\0'; path = Strdup(find,"Extracting root from"); if (dot != NULL) *dot = '.'; } else { epos = strlen(find); epos -= strlen(suffix); if (epos > 0 && strcasecmp(find+epos,suffix) == 0) { find[epos] = '\0'; path = Strdup(find,"Extracting root from"); find[epos] = suffix[0]; } else path = Strdup(find,"Allocating root"); } return (path); } char *Catenate(char *path, char *sep, char *root, char *suffix) { static char *cat = NULL; static int max = -1; int len; if (path == NULL || root == NULL || sep == NULL || suffix == NULL) return (NULL); len = strlen(path); len += strlen(sep); len += strlen(root); len += strlen(suffix); if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); return (NULL); } } sprintf(cat,"%s%s%s%s",path,sep,root,suffix); return (cat); } char *Numbered_Suffix(char *left, int num, char *right) { static char *suffix = NULL; static int max = -1; int len; if (left == NULL || right == NULL) return (NULL); len = strlen(left); len += strlen(right) + 40; if (len > max) { max = ((int) (1.2*len)) + 100; if ((suffix = (char *) realloc(suffix,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); return (NULL); } } sprintf(suffix,"%s%d%s",left,num,right); return (suffix); } #define COMMA ',' // Print big integers with commas/periods for better readability void Print_Number(int64 num, int width, FILE *out) { if (width == 0) { if (num < 1000ll) 
fprintf(out,"%lld",num); else if (num < 1000000ll) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else if (num < 1000000000ll) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll, COMMA,(num%1000000000ll)/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); } else { if (num < 1000ll) fprintf(out,"%*lld",width,num); else if (num < 1000000ll) { if (width <= 4) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll); } else if (num < 1000000000ll) { if (width <= 8) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); } else { if (width <= 12) fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); } } } // Return the number of digits, base 10, of num int Number_Digits(int64 num) { int digit; digit = 0; while (num >= 1) { num /= 10; digit += 1; } return (digit); } /******************************************************************************************* * * READ COMPRESSION/DECOMPRESSION UTILITIES * ********************************************************************************************/ // Compress read into 2-bits per base (from [0-3] per byte representation void Compress_Read(int len, char *s) { int i; char c, d; char *s0, *s1, *s2, *s3; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; c = s1[len]; d = s2[len]; s0[len] = s1[len] = s2[len] = 0; for (i = 0; i < len; i += 4) *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]); s1[len] = c; s2[len] = d; } // Uncompress read form 2-bits per base into [0-3] per byte representation void Uncompress_Read(int len, char *s) { int i, tlen, byte; char *s0, *s1, *s2, *s3; char *t; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; tlen = (len-1)/4; t = s+tlen; for (i = tlen*4; i >= 0; i -= 4) { byte = *t--; s0[i] = (char) ((byte >> 6) & 0x3); s1[i] = (char) ((byte >> 4) & 0x3); s2[i] = (char) ((byte >> 2) & 0x3); s3[i] = (char) (byte & 0x3); } s[len] = 4; } // Convert read in [0-3] representation to ascii representation (end with '\n') void Lower_Read(char *s) { static char letter[4] = { 'a', 'c', 'g', 't' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } void Upper_Read(char *s) { static char letter[4] = { 'A', 'C', 'G', 'T' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } void Letter_Arrow(char *s) { static char letter[4] = { '1', '2', '3', '4' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } // Convert read in ascii representation to [0-3] representation (end with 4) void Number_Read(char *s) { static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; for ( ; *s != '\0'; s++) *s = number[(int) *s]; *s = 4; } void Number_Arrow(char *s) { static char arrow[128] = { 3, 3, 
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, }; for ( ; *s != '\0'; s++) *s = arrow[(int) *s]; *s = 4; } /******************************************************************************************* * * DB OPEN, TRIM & CLOSE ROUTINES * ********************************************************************************************/ // Open the given database or dam, "path" into the supplied DAZZ_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. // Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char* path, DAZZ_DB *db) { DAZZ_DB dbcopy; char *root, *pwd, *bptr, *fptr, *cat; int nreads; FILE *index, *dbvis; int status, plen, isdam; int part, cutoff, all; int ufirst, tfirst, ulast, tlast; status = -1; dbcopy = *db; plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); pwd = PathTo(path); bptr = rindex(root,'.'); if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-') { part = strtol(bptr+1,&fptr,10); if (*fptr != '\0' || part == 0) part = 0; else *bptr = '\0'; } else part = 0; isdam = 0; cat = Catenate(pwd,"/",root,".db"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { cat = Catenate(pwd,"/",root,".dam"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path); goto error; } isdam = 1; } if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) goto error1; if (fread(db,sizeof(DAZZ_DB),1,index) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); goto error2; } { int p, nblocks, nfiles; int64 size; char fname[MAX_NAME], prolog[MAX_NAME]; nblocks = 0; if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } for (p = 0; p < nfiles; p++) if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) if (part == 0) { cutoff = 0; all = DB_ALL; } else { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", Prog_Name,root); goto error2; } else { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (part > nblocks) { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); goto error2; } } if (part > 0) { for (p = 1; p <= part; p++) if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } } else { ufirst = tfirst = 0; ulast = db->ureads; tlast = db->treads; } } db->trimmed = 0; db->tracks = NULL; db->part = part; db->cutoff = cutoff; db->allarr |= all; db->ufirst = ufirst; db->tfirst = tfirst; 
nreads = ulast-ufirst; if (part <= 0) { db->reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index"); if (db->reads == NULL) goto error2; db->reads += 1; if (fread(db->reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(db->reads-1); goto error2; } } else { DAZZ_READ *reads; int i, r, maxlen; int64 totlen; reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index"); if (reads == NULL) goto error2; reads += 1; fseeko(index,sizeof(DAZZ_READ)*ufirst,SEEK_CUR); if (fread(reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(reads-1); goto error2; } totlen = 0; maxlen = 0; for (i = 0; i < nreads; i++) { r = reads[i].rlen; totlen += r; if (r > maxlen) maxlen = r; } db->maxlen = maxlen; db->totlen = totlen; db->reads = reads; } ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part ((int *) (db->reads))[-2] = tlast - tfirst; db->nreads = nreads; db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); if (db->path == NULL) goto error2; db->bases = NULL; db->loaded = 0; status = isdam; error2: fclose(index); error1: fclose(dbvis); error: if (bptr != NULL) *bptr = '.'; free(pwd); free(root); if (status < 0) *db = dbcopy; return (status); } // Trim the DB or part thereof and all loaded tracks according to the cuttof and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(DAZZ_DB *db) { int i, j, r; int allflag, cutoff; int64 totlen; int maxlen, nreads; DAZZ_TRACK *record; DAZZ_READ *reads; if (db->trimmed) return; if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return; cutoff = db->cutoff; if ((db->allarr & DB_ALL) != 0) allflag = 0; else allflag = DB_BEST; reads = db->reads; nreads = db->nreads; for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,".@qvs") == 0) { uint16 *table = ((DAZZ_QV *) record)->table; j = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) table[j++] = table[i]; } else { int *anno4, size; int64 *anno8; char *anno, *data; size = record->size; data = (char *) record->data; if (data == NULL) { anno = (char *) record->anno; j = 0; for (i = r = 0; i < db->nreads; i++, r += size) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (record->anno); j = anno4[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (record->anno); j = anno8[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno8[j],NULL); } record->anno = Realloc(record->anno,record->size*(j+1),NULL); } totlen = maxlen = 0; for (j = i = 0; i < nreads; i++) { r = reads[i].rlen; if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff) { totlen += r; if (r > maxlen) 
maxlen = r; reads[j++] = reads[i]; } } db->totlen = totlen; db->maxlen = maxlen; db->nreads = j; db->trimmed = 1; if (j < nreads) { db->reads = Realloc(reads-1,sizeof(DAZZ_READ)*(j+2),NULL); db->reads += 1; } } // The DB has already been trimmed, but a track over the untrimmed DB needs to be loaded. // Trim the track by rereading the untrimmed DB index from the file system. static int Late_Track_Trim(DAZZ_DB *db, DAZZ_TRACK *track, int ispart) { int i, j, r; int allflag, cutoff; int ureads; char *root; DAZZ_READ read; FILE *indx; if (!db->trimmed) return (0); if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return (0); cutoff = db->cutoff; if ((db->allarr & DB_ALL) != 0) allflag = 0; else allflag = DB_BEST; root = rindex(db->path,'/') + 2; indx = Fopen(Catenate(db->path,"","",".idx"),"r"); fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*db->ufirst,SEEK_SET); if (ispart) ureads = ((int *) (db->reads))[-1]; else ureads = db->ureads; if (strcmp(track->name,".@qvs") == 0) { EPRINTF(EPLACE,"%s: Cannot load QV track after trimming\n",Prog_Name); fclose(indx); EXIT(1); } { int *anno4, size; int64 *anno8; char *anno, *data; size = track->size; data = (char *) track->data; if (data == NULL) { anno = (char *) track->anno; j = r = 0; for (i = r = 0; i < ureads; i++, r += size) { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } r += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (track->anno); j = anno4[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (track->anno); j = anno8[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno8[j],NULL); } track->anno = Realloc(track->anno,track->size*(j+1),NULL); } fclose(indx); return (0); } // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). 
void Close_DB(DAZZ_DB *db) { DAZZ_TRACK *t, *p; if (db->loaded) free(((char *) (db->bases)) - 1); else if (db->bases != NULL) fclose((FILE *) db->bases); if (db->reads != NULL) free(db->reads-1); free(db->path); Close_QVs(db); for (t = db->tracks; t != NULL; t = p) { p = t->next; free(t->anno); free(t->data); free(t); } } // Return the size in bytes of the memory occupied by a given DB int64 sizeof_DB(DAZZ_DB *db) { int64 s; DAZZ_TRACK *t; s = sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*(db->nreads+2) + strlen(db->path)+1 + (db->totlen+db->nreads+4); t = db->tracks; if (t != NULL && strcmp(t->name,".@qvs") == 0) { DAZZ_QV *q = (DAZZ_QV *) t; s += sizeof(DAZZ_QV) + sizeof(uint16) * db->nreads + q->ncodes * sizeof(QVcoding) + 6; t = t->next; } for (; t != NULL; t = t->next) { s += sizeof(DAZZ_TRACK) + strlen(t->name)+1 + t->size * (db->nreads+1); if (t->data != NULL) { if (t->size == 8) s += sizeof(int)*((int64 *) t->anno)[db->nreads]; else // t->size == 4 s += sizeof(int)*((int *) t->anno)[db->nreads]; } } return (s); } /******************************************************************************************* * * QV LOAD & CLOSE ROUTINES * ********************************************************************************************/ DAZZ_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" DAZZ_QV *Active_QV; // Becomes invalid after closing int Load_QVs(DAZZ_DB *db) { FILE *quiva, *istub, *indx; char *root; uint16 *table; DAZZ_QV *qvtrk; QVcoding *coding, *nx; int ncodes = 0; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) return (0); if (db->trimmed) { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); EXIT(1); } if (db->reads[db->nreads-1].coff < 0) { if (db->part > 0) { EPRINTF(EPLACE,"%s: All QVs for this block have not been added to the DB!\n",Prog_Name); EXIT(1); } else { EPRINTF(EPLACE,"%s: All QVs for this DB have not been added!\n",Prog_Name); EXIT(1); } } // Open .qvs, .idx, and .db files quiva = Fopen(Catenate(db->path,"","",".qvs"),"r"); if (quiva == NULL) return (-1); istub = NULL; indx = NULL; table = NULL; coding = NULL; qvtrk = NULL; root = rindex(db->path,'/'); if (root[1] == '.') { *root = '\0'; istub = Fopen(Catenate(db->path,"/",root+2,".db"),"r"); *root = '/'; } else istub = Fopen(Catenate(db->path,"","",".db"),"r"); if (istub == NULL) goto error; { int first, last, nfiles; char prolog[MAX_NAME], fname[MAX_NAME]; int i, j; if (fscanf(istub,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (db->part > 0) { int pfirst, plast; int fbeg, fend; int n, k; FILE *indx; // Determine first how many and which files span the block (fbeg to fend) pfirst = db->ufirst; plast = pfirst + db->nreads; first = 0; for (fbeg = 0; fbeg < nfiles; fbeg++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (last > pfirst) break; first = last; } for (fend = fbeg+1; fend <= nfiles; fend++) { if (last >= plast) break; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } first = last; } indx = Fopen(Catenate(db->path,"","",".idx"),"r"); ncodes = fend-fbeg; coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (indx == NULL || coding == NULL || table == NULL) { ncodes = 0; goto error; } // Carefully get the first 
coding scheme (its offset is most likely in a DAZZ_RECORD // in .idx that is *not* in memory). Get all the other coding schemes normally and // assign the tables # for each read in the block in "tables". rewind(istub); (void) fscanf(istub,DB_NFILE,&nfiles); first = 0; for (n = 0; n < fbeg; n++) { (void) fscanf(istub,DB_FDATA,&last,fname,prolog); first = last; } for (n = fbeg; n < fend; n++) { (void) fscanf(istub,DB_FDATA,&last,fname,prolog); i = n-fbeg; if (first < pfirst) { DAZZ_READ read; fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*first,SEEK_SET); if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); ncodes = i; goto error; } fseeko(quiva,read.coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; } else { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first-pfirst].coff = ftello(quiva); } j = first-pfirst; if (j < 0) j = 0; k = last-pfirst; if (k > db->nreads) k = db->nreads; while (j < k) table[j++] = (uint16) i; first = last; } fclose(indx); indx = NULL; } else { // Load in coding scheme for each file, adjust .coff of first read in the file, and // record which table each read uses ncodes = nfiles; coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (coding == NULL || table == NULL) goto error; first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } fseeko(quiva,db->reads[first].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first].coff = ftello(quiva); for (j = first; j < last; j++) table[j] = (uint16) i; first = last; } } // Allocate and fill in the DAZZ_QV record and add it to the front of the // track list qvtrk = (DAZZ_QV *) Malloc(sizeof(DAZZ_QV),"Allocating QV pseudo-track"); if (qvtrk == NULL) goto error; qvtrk->name = Strdup(".@qvs","Allocating QV pseudo-track name"); if (qvtrk->name == NULL) goto error; qvtrk->next = db->tracks; db->tracks = (DAZZ_TRACK *) qvtrk; qvtrk->ncodes = ncodes; qvtrk->table = table; qvtrk->coding = coding; qvtrk->quiva = quiva; } fclose(istub); return (0); error: if (qvtrk != NULL) free(qvtrk); if (table != NULL) free(table); if (coding != NULL) { int i; for (i = 0; i < ncodes; i++) Free_QVcoding(coding+i); free(coding); } if (indx != NULL) fclose(indx); if (istub != NULL) fclose(istub); fclose(quiva); EXIT(1); } // Close the QV stream, free the QV pseudo track and all associated memory void Close_QVs(DAZZ_DB *db) { DAZZ_TRACK *track; DAZZ_QV *qvtrk; int i; Active_DB = NULL; track = db->tracks; if (track != NULL && strcmp(track->name,".@qvs") == 0) { qvtrk = (DAZZ_QV *) track; for (i = 0; i < qvtrk->ncodes; i++) Free_QVcoding(qvtrk->coding+i); free(qvtrk->coding); free(qvtrk->table); fclose(qvtrk->quiva); db->tracks = track->next; free(track); } return; } /******************************************************************************************* * * TRACK LOAD & CLOSE ROUTINES * ********************************************************************************************/ // Return status of track: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or 
untrimmed // -2: Could not find the track int Check_Track(DAZZ_DB *db, char *track, int *kind) { FILE *afile; int tracklen, size, ispart; int ureads, treads; afile = NULL; if (db->part > 0) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) return (-2); if (fread(&tracklen,sizeof(int),1,afile) != 1) { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); exit (1); } if (fread(&size,sizeof(int),1,afile) != 1) { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); exit (1); } if (size == 0) *kind = MASK_TRACK; else if (size > 0) *kind = CUSTOM_TRACK; else { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); exit (1); } fclose(afile); if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (tracklen == ureads) return (0); else if (tracklen == treads) return (1); else return (-1); } // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created DAZZ_TRACK record. If the track does not exist or cannot be // opened for some reason, then NULL is returned. DAZZ_TRACK *Load_Track(DAZZ_DB *db, char *track) { FILE *afile, *dfile; int tracklen, size; int nreads, ispart; int treads, ureads; void *anno; void *data; char *name; DAZZ_TRACK *record; if (track[0] == '.') { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track); EXIT(NULL); } for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,track) == 0) return (record); afile = NULL; if (db->part) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track); return (NULL); } dfile = NULL; anno = NULL; data = NULL; record = NULL; if (ispart) name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data"); else name = Catenate(db->path,".",track,".data"); if (name == NULL) goto error; dfile = fopen(name,"r"); if (fread(&tracklen,sizeof(int),1,afile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (fread(&size,sizeof(int),1,afile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (size < 0) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (size == 0) size = 8; if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (db->trimmed) { if (tracklen != treads && tracklen != ureads) { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); goto error; } if ( ! ispart && db->part > 0) { if (tracklen == treads) fseeko(afile,size*db->tfirst,SEEK_CUR); else fseeko(afile,size*db->ufirst,SEEK_CUR); } } else { if (tracklen != ureads) { if (tracklen == treads) EPRINTF(EPLACE,"%s: Track '%s' is for a trimmed DB !\n",Prog_Name,track); else EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); goto error; } if ( ! 
ispart && db->part > 0) fseeko(afile,size*db->ufirst,SEEK_CUR); } if (tracklen == treads) nreads = ((int *) (db->reads))[-2]; else nreads = ((int *) (db->reads))[-1]; anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector"); if (anno == NULL) goto error; if (dfile != NULL) { int64 *anno8, off8, dlen; int *anno4, off4; int i; if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1)) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (size == 4) { anno4 = (int *) anno; off4 = anno4[0]; if (off4 != 0) { for (i = 0; i <= nreads; i++) anno4[i] -= off4; fseeko(dfile,off4,SEEK_SET); } dlen = anno4[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } else { anno8 = (int64 *) anno; off8 = anno8[0]; if (off8 != 0) { for (i = 0; i <= nreads; i++) anno8[i] -= off8; fseeko(dfile,off8,SEEK_SET); } dlen = anno8[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } if (data == NULL) goto error; if (dlen > 0) { if (fread(data,dlen,1,dfile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track); goto error; } } fclose(dfile); dfile = NULL; } else { if (fread(anno,size,nreads,afile) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } data = NULL; } fclose(afile); record = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK),"Allocating Track Record"); if (record == NULL) goto error; record->name = Strdup(track,"Allocating Track Name"); if (record->name == NULL) goto error; record->data = data; record->anno = anno; record->size = size; if (db->trimmed && tracklen != treads) { if (Late_Track_Trim(db,record,ispart)) goto error; } if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) { record->next = db->tracks->next; db->tracks->next = record; } else { record->next = db->tracks; db->tracks = record; } return (record); error: if (record != NULL) free(record); if (data != NULL) free(data); if (anno != NULL) free(anno); if (dfile != NULL) fclose(dfile); fclose(afile); EXIT (NULL); } // Assumming file pointer for afile is correctly positioned at the start of a extra item, // and aname is the name of the .anno file, decode the value present and places it in // extra if extra->nelem == 0, otherwise reduce the value just read into extra according // according the to the directive given by 'accum'. Leave the read poinrt at the next // extra or end-of-file. 
// Returns: // 1 if at the end of file, // 0 if item was read and folded correctly, // -1 if there was a system IO or allocation error (if interactive), and // -2 if the new value could not be reduced into the currenct value of extra (interactive) int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra) { int vtype, nelem, accum, slen; char *name; void *value; #define EREAD(v,s,n,file,ret) \ { if (fread(v,s,n,file) != (size_t) n) \ { if (ferror(file)) \ fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ else if (ret) \ return (1); \ else \ fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,aname); \ EXIT(-1); \ } \ } EREAD(&vtype,sizeof(int),1,afile,1) EREAD(&nelem,sizeof(int),1,afile,0) EREAD(&accum,sizeof(int),1,afile,0) EREAD(&slen,sizeof(int),1,afile,0) if (extra == NULL) { if (fseeko(afile,slen+8*nelem,SEEK_CUR) < 0) { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); EXIT(-1); } return (0); } name = (char *) Malloc(slen+1,"Allocating extra name"); value = Malloc(8*nelem,"Allocating extra value"); if (name == NULL || value == NULL) EXIT(-1); EREAD(name,1,slen,afile,0); EREAD(value,8,nelem,afile,0); name[slen] = '\0'; if (extra->nelem == 0) { extra->vtype = vtype; extra->nelem = nelem; extra->accum = accum; extra->name = name; extra->value = value; return (0); } if (vtype != extra->vtype) { fprintf(stderr,"%s: Type of extra %s does not agree with previous .anno block files\n", Prog_Name,name); goto error; } if (nelem != extra->nelem) { fprintf(stderr,"%s: Length of extra %s does not agree with previous .anno block files\n", Prog_Name,name); goto error; } if (accum != extra->accum) { fprintf(stderr,"%s: Reduction indicator of extra %s does not agree with",Prog_Name,name); fprintf(stderr," previos .anno block files\n"); goto error; } if (strcmp(name,extra->name) != 0) { fprintf(stderr,"%s: Expecting extra %s in .anno block file, not %s\n", Prog_Name,extra->name,name); goto error; } if (vtype == DB_INT) { int64 *ival = (int64 *) value; int64 *eval = (int64 *) (extra->value); int j; if (accum == DB_EXACT) { for (j = 0; j < nelem; j++) if (eval[j] != ival[j]) { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name); fprintf(stderr," with previous .anno block files\n"); goto error; } } else { for (j = 0; j < nelem; j++) eval[j] += ival[j]; } } else { double *ival = (double *) value; double *eval = (double *) (extra->value); int j; if (accum == DB_EXACT) { for (j = 0; j < nelem; j++) if (eval[j] != ival[j]) { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name); fprintf(stderr," with previous .anoo block files\n"); goto error; } } else { for (j = 0; j < nelem; j++) eval[j] += ival[j]; } } free(value); free(name); return (0); error: free(value); free(name); EXIT(1); } // Write extra record to end of file afile and advance write pointer // If interactive, then return non-zero on error, if bash, then print // and halt if an error int Write_Extra(FILE *afile, DAZZ_EXTRA *extra) { int slen; #define EWRITE(v,s,n,file) \ { if (fwrite(v,s,n,file) != (size_t) n) \ { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ EXIT(1); \ } \ } EWRITE(&(extra->vtype),sizeof(int),1,afile) FWRITE(&(extra->nelem),sizeof(int),1,afile) FWRITE(&(extra->accum),sizeof(int),1,afile) slen = strlen(extra->name); FWRITE(&slen,sizeof(int),1,afile) FWRITE(extra->name,1,slen,afile) FWRITE(extra->value,8,extra->nelem,afile) return (0); } void Close_Track(DAZZ_DB *db, char *track) { DAZZ_TRACK *record, *prev; prev = NULL; for (record = db->tracks; 
record != NULL; record = record->next) { if (strcmp(record->name,track) == 0) { free(record->anno); free(record->data); free(record->name); if (prev == NULL) db->tracks = record->next; else prev->next = record->next; free(record); return; } prev = record; } return; } /******************************************************************************************* * * READ BUFFER ALLOCATION AND READ ACCESS * ********************************************************************************************/ // Allocate and return a buffer big enough for the largest read in 'db', leaving room // for an initial delimiter character char *New_Read_Buffer(DAZZ_DB *db) { char *read; read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer"); if (read == NULL) EXIT(NULL); return (read+1); } // Load into 'read' the i'th read in 'db'. As an upper case ASCII string if ascii is 2, as a // lower-case ASCII string is ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and // 3(T) otherwise. // // **NB**, the byte before read will be set to a delimiter character! int Load_Read(DAZZ_DB *db, int i, char *read, int ascii) { FILE *bases = (FILE *) db->bases; int64 off; int len, clen; DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); EXIT(1); } if (bases == NULL) { bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(1); db->bases = (void *) bases; } off = r[i].boff; len = r[i].rlen; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(read,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); EXIT(1); } } Uncompress_Read(len,read); if (ascii == 1) { Lower_Read(read); read[-1] = '\0'; } else if (ascii == 2) { Upper_Read(read); read[-1] = '\0'; } else read[-1] = 4; return (0); } // Load into 'read' the i'th arrow in 'db'. As an ASCII string if ascii is 1, // and as a numeric string otherwise. 
// DAZZ_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow" FILE *Arrow_File = NULL; // Becomes invalid after closing int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii) { FILE *arrow; int64 off; int len, clen; DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Arrow)\n",Prog_Name); EXIT(1); } if (Arrow_DB != db) { if (Arrow_File != NULL) fclose(Arrow_File); arrow = Fopen(Catenate(db->path,"","",".arw"),"r"); if (arrow == NULL) EXIT(1); Arrow_File = arrow; Arrow_DB = db; } else arrow = Arrow_File; off = r[i].boff; len = r[i].rlen; if (ftello(arrow) != off) fseeko(arrow,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(read,clen,1,arrow) != 1) { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Arrow)\n",Prog_Name); EXIT(1); } } Uncompress_Read(len,read); if (ascii == 1) { Letter_Arrow(read); read[-1] = '\0'; } else read[-1] = 4; return (0); } char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii) { FILE *bases = (FILE *) db->bases; int64 off; int len, clen; int bbeg, bend; DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); EXIT(NULL); } if (bases == NULL) { bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(NULL); db->bases = (void *) bases; } bbeg = beg/4; bend = (end-1)/4+1; off = r[i].boff + bbeg; len = end - beg; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = bend-bbeg; if (clen > 0) { if (fread(read,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); EXIT(NULL); } } Uncompress_Read(4*clen,read); read += beg%4; read[len] = 4; if (ascii == 1) { Lower_Read(read); read[-1] = '\0'; } else if (ascii == 2) { Upper_Read(read); read[-1] = '\0'; } else read[-1] = 4; return (read); } /******************************************************************************************* * * QV BUFFER ALLOCATION QV READ ACCESS * ********************************************************************************************/ // Allocate and return a buffer of 5 vectors big enough for the largest read in 'db' char **New_QV_Buffer(DAZZ_DB *db) { char **entry; char *qvs; int i; qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer"); entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer"); if (qvs == NULL || entry == NULL) EXIT(NULL); for (i = 0; i < 5; i++) entry[i] = qvs + i*db->maxlen; return (entry); } // Load into entry the QV streams for the i'th read from db. The parameter ascii applies to // the DELTAG stream as described for Load_Read. 
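//
//    A hypothetical usage sketch (illustrative only, error handling elided; assumes an
//    open, untrimmed DAZZ_DB *db and a valid read index i are already in scope):
//
//        char **entry;
//
//        if (Load_QVs(db) == 0)                //  attach the .@qvs pseudo-track
//          { entry = New_QV_Buffer(db);        //  5 vectors, one per QV stream
//            Load_QVentry(db,i,entry,1);       //  entry[0..4] = the QV streams of read i
//            ...
//            Close_QVs(db);
//          }
//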
int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii) { DAZZ_READ *reads; FILE *quiva; int rlen; if (db != Active_DB) { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0) { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name); EXIT(1); } Active_QV = (DAZZ_QV *) db->tracks; Active_DB = db; } if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); EXIT(1); } reads = db->reads; quiva = Active_QV->quiva; rlen = reads[i].rlen; fseeko(quiva,reads[i].coff,SEEK_SET); if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen)) EXIT(1); if (ascii != 1) { char *deltag = entry[1]; if (ascii != 2) { char x = deltag[rlen]; deltag[rlen] = '\0'; Number_Read(deltag); deltag[rlen] = x; } else { int j; int u = 'A'-'a'; for (j = 0; j < rlen; j++) deltag[j] = (char) (deltag[j]+u); } } return (0); } /******************************************************************************************* * * BLOCK LOAD OF ALL READS (PRIMARILY FOR DALIGNER) * ********************************************************************************************/ // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. If ascii is // non-zero then the reads are converted to ACGT ascii, otherwise the reads are left // as numeric strings over 0(A), 1(C), 2(G), and 3(T). int Read_All_Sequences(DAZZ_DB *db, int ascii) { FILE *bases; int nreads = db->nreads; DAZZ_READ *reads = db->reads; void (*translate)(char *s); char *seq; int64 o, off; int i, len, clen; bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(1); seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); if (seq == NULL) { fclose(bases); EXIT(1); } *seq++ = 4; if (ascii == 1) translate = Lower_Read; else translate = Upper_Read; o = 0; for (i = 0; i < nreads; i++) { len = reads[i].rlen; off = reads[i].boff; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(seq+o,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name); free(seq); fclose(bases); EXIT(1); } } Uncompress_Read(len,seq+o); if (ascii) translate(seq+o); reads[i].boff = o; o += (len+1); } reads[nreads].boff = o; fclose(bases); db->bases = (void *) seq; db->loaded = 1; return (0); } // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all // those of the form "prefix/[.]root.part" and call actor with the complete path to each file // pointed at by path, and the suffix of the path by extension. The . proceeds the root // name if the defined constant HIDE_FILES is set. Always the first call is with the // path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and // so this routine gives one a way to know all the tracks associated with a given DB. // -1 is returned if the path could not be found, and 1 is returned if an error (reported // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. 
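//
//    A hypothetical example of an actor (illustrative only, not part of this module):
//    it just prints each file of the DB together with its suffix.  The name list_actor
//    and the path "project.db" are made up for the example.
//
//        static void list_actor(char *path, char *extension)
//        { printf("%s  (.%s)\n",path,extension); }
//
//        ...
//        if (List_DB_Files("project.db",list_actor) < 0)
//          fprintf(stderr,"%s: could not find the DB\n",Prog_Name);
//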
int List_DB_Files(char *path, void actor(char *path, char *extension)) { int status, plen, rlen, dlen; char *root, *pwd, *name; int isdam; DIR *dirp; struct dirent *dp; status = 0; pwd = PathTo(path); plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); rlen = strlen(root); if (root == NULL || pwd == NULL) { free(pwd); free(root); EXIT(1); } if ((dirp = opendir(pwd)) == NULL) { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; goto error; } isdam = 0; while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary) { name = dp->d_name; if (strcmp(name,Catenate("","",root,".db")) == 0) break; if (strcmp(name,Catenate("","",root,".dam")) == 0) { isdam = 1; break; } } if (dp == NULL) { status = -1; closedir(dirp); goto error; } if (isdam) actor(Catenate(pwd,"/",root,".dam"),"dam"); else actor(Catenate(pwd,"/",root,".db"),"db"); rewinddir(dirp); // Report each auxiliary file while ((dp = readdir(dirp)) != NULL) { name = dp->d_name; dlen = strlen(name); #ifdef HIDE_FILES if (name[0] != '.') continue; dlen -= 1; name += 1; #endif if (dlen < rlen+1) continue; if (name[rlen] != '.') continue; if (strncmp(name,root,rlen) != 0) continue; actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); } closedir(dirp); error: free(pwd); free(root); return (status); } void Print_Read(char *s, int width) { int i; if (s[0] < 4) { for (i = 0; s[i] != 4; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%d",s[i]); } printf("\n"); } else { for (i = 0; s[i] != '\0'; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%c",s[i]); } printf("\n"); } } DAZZ_DB-master/DB.h000066400000000000000000000614371322703422500141350ustar00rootroot00000000000000/******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. Eventually will also * support compressed quality information. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #ifndef _DAZZ_DB #define _DAZZ_DB #include #include "QV.h" #define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden" // Undefine if you don't want this // For interactive applications where it is inappropriate to simply exit with an error // message to standard error, define the constant INTERACTIVE. If set, then error // messages are put in the global variable Ebuffer and the caller of a DB routine // can decide how to deal with the error. // // DB, QV, or alignment routines that can encounter errors function as before in // non-INTERACTIVE mode by exiting after printing an error message to stderr. In // INTERACTIVE mode the routines place a message at EPLACE and return an error // value. For such routines that were previously void, they are now int, and // return 1 if an error occured, 0 otherwise. 
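//   A minimal sketch of the calling convention when INTERACTIVE is defined (hypothetical
//   caller code, not part of this header): routines such as Open_DB place their message in
//   Ebuffer and return an error value instead of exiting, so the caller decides how to
//   recover.  The path "project.db" below is made up for the example.
//
//       DAZZ_DB db;
//
//       if (Open_DB("project.db",&db) < 0)     //  failure: message was sprintf'd to Ebuffer
//         fprintf(stderr,"%s",Ebuffer);        //  report it and carry on
//       else
//         { ...                                //  work with the database
//           Close_DB(&db);
//         }
//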
#ifdef INTERACTIVE #define EPRINTF sprintf #define EPLACE Ebuffer #define EXIT(x) return (x) #else // BATCH #define EPRINTF fprintf #define EPLACE stderr #define EXIT(x) exit (1) #endif typedef unsigned char uint8; typedef unsigned short uint16; typedef unsigned int uint32; typedef unsigned long long uint64; typedef signed char int8; typedef signed short int16; typedef signed int int32; typedef signed long long int64; typedef float float32; typedef double float64; /******************************************************************************************* * * COMMAND LINE INTERPRETATION MACROS * ********************************************************************************************/ extern char *Prog_Name; // Name of program #ifdef INTERACTIVE extern char Ebuffer[]; #endif #define ARG_INIT(name) \ Prog_Name = Strdup(name,""); \ for (i = 0; i < 128; i++) \ flags[i] = 0; #define ARG_FLAGS(set) \ for (k = 1; argv[i][k] != '\0'; k++) \ { if (index(set,argv[i][k]) == NULL) \ { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \ exit (1); \ } \ flags[(int) argv[i][k]] = 1; \ } #define ARG_POSITIVE(var,name) \ var = strtol(argv[i]+2,&eptr,10); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \ Prog_Name,argv[i][1],argv[i]+2); \ exit (1); \ } \ if (var <= 0) \ { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_NON_NEGATIVE(var,name) \ var = strtol(argv[i]+2,&eptr,10); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \ Prog_Name,argv[i][1],argv[i]+2); \ exit (1); \ } \ if (var < 0) \ { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_REAL(var) \ var = strtod(argv[i]+2,&eptr); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c '%s' argument is not a real number\n", \ Prog_Name,argv[i][1],argv[i]+2); \ exit (1); \ } /******************************************************************************************* * * GUARDED BATCH IO MACROS * ********************************************************************************************/ // Utilitieis int Count_Args(char *arg); #define SYSTEM_READ_ERROR \ { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ exit (2); \ } #define SYSTEM_WRITE_ERROR \ { fprintf(stderr,"%s: System error, write failed!\n",Prog_Name); \ exit (2); \ } #define SYSTEM_CLOSE_ERROR \ { fprintf(stderr,"%s: System error, file close failed!\n",Prog_Name); \ exit (2); \ } // Output #define FWRITE(v,s,n,file) \ { if (fwrite(v,s,n,file) != (size_t) n) \ SYSTEM_WRITE_ERROR \ } #define FPRINTF(file,...) \ { if (fprintf(file,__VA_ARGS__) < 0) \ SYSTEM_WRITE_ERROR \ } #define PRINTF(...) \ { if (printf(__VA_ARGS__) < 0) \ SYSTEM_WRITE_ERROR \ } #define FPUTS(x,file) \ { if (fputs(x,file) == EOF) \ SYSTEM_WRITE_ERROR \ } // Close #define FCLOSE(file) \ { if (fclose(file) != 0) \ SYSTEM_CLOSE_ERROR \ } // Input #define FREAD(v,s,n,file) \ { if (fread(v,s,n,file) != (size_t) n) \ { if (ferror(file)) \ SYSTEM_READ_ERROR \ else \ { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ exit (1); \ } \ } \ } #define FSCANF(file,...) 
\ { if (fscanf(file,__VA_ARGS__) != Count_Args(#__VA_ARGS__)-1) \ { if (ferror(file)) \ SYSTEM_READ_ERROR \ else \ { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ exit (1); \ } \ } \ } #define FGETS(v,n,file) \ { if (fgets(v,n,file) == NULL) \ { if (ferror(file)) \ SYSTEM_READ_ERROR \ else \ { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ exit (1); \ } \ } \ } #define FSEEKO(file,p,d) \ { if (fseeko(file,p,d) < 0) \ SYSTEM_READ_ERROR \ } #define FTELLO(file) \ ( { int x = ftello(file); \ if (x < 0) \ SYSTEM_READ_ERROR \ ; x; \ } ) /******************************************************************************************* * * UTILITIES * ********************************************************************************************/ // The following general utilities return NULL if any of their input pointers are NULL, or if they // could not perform their function (in which case they also print an error to stderr). void *Malloc(int64 size, char *mesg); // Guarded versions of malloc, realloc void *Realloc(void *object, int64 size, char *mesg); // and strdup, that output "mesg" to char *Strdup(char *string, char *mesg); // stderr if out of memory FILE *Fopen(char *path, char *mode); // Open file path for "mode" char *PathTo(char *path); // Return path portion of file name "path" char *Root(char *path, char *suffix); // Return the root name, excluding suffix, of "path" // Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer // Numbered_Suffix returns concatenation of left..right in a *temporary* buffer char *Catenate(char *path, char *sep, char *root, char *suffix); char *Numbered_Suffix(char *left, int num, char *right); // DB-related utilities void Print_Number(int64 num, int width, FILE *out); // Print readable big integer int Number_Digits(int64 num); // Return # of digits in printed number #define COMPRESSED_LEN(len) (((len)+3) >> 2) void Compress_Read(int len, char *s); // Compress read in-place into 2-bit form void Uncompress_Read(int len, char *s); // Uncompress read in-place into numeric form void Print_Read(char *s, int width); void Lower_Read(char *s); // Convert read from numbers to lowercase letters (0-3 to acgt) void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT) void Number_Read(char *s); // Convert read from letters to numbers void Letter_Arrow(char *s); // Convert arrow pw's from numbers to uppercase letters (0-3 to 1234) void Number_Arrow(char *s); // Convert arrow pw string from letters to numbers /******************************************************************************************* * * DB IN-CORE DATA STRUCTURES * ********************************************************************************************/ #define DB_QV 0x03ff // Mask for 3-digit quality value #define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert #define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1) #define DB_ARROW 0x2 // DB is an arrow DB #define DB_ALL 0x1 // all wells are in the trimmed DB // Fields have different interpretations if a .db versus a .dam typedef struct { int origin; // Well # (DB), Contig # (DAM) int rlen; // Length of the sequence (Last pulse = fpulse + rlen) int fpulse; // First pulse (DB), left index of contig in scaffold (DAM) int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of 
compressed quiva streams in '.qvs' file (DB), // Offset (in bytes) of scaffold header string in '.hdr' file (DAM) // 4 compressed shorts containing snr info if an arrow DB. int flags; // QV of read + flags above (DB only) } DAZZ_READ; // A track can be of 3 types: // data == NULL: there are nreads 'anno' records of size 'size'. // data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1]) // contains the variable length data // data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1]) // contains the variable length data typedef struct _track { struct _track *next; // Link to next track char *name; // Symbolic name of track int size; // Size in bytes of anno records void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records void *data; // data[anno[i] .. anno[i+1]-1] is data if data != NULL } DAZZ_TRACK; // The trailing part of a .anno track file can contain meta-information produced by the // command that produced the track. For example, the coverage, or good/bad parameters // for trimming, or even say a histogram of QV values. Each item is an array of 'nelem' // 64-bit ints or floats ('vtype' = DB_INT or DB_REAL), has a 'name' string that // describes it, and an indicator as to whether the values should be equal across all // block tracks, or summed across all block tracks (by Catrack). 'value' points at the // array of values #define DB_INT 0 #define DB_REAL 1 #define DB_EXACT 0 #define DB_SUM 1 typedef struct { int vtype; // INT64 or FLOAT64 int nelem; // >= 1 int accum; // EXACT, SUM char *name; void *value; } DAZZ_EXTRA; // The information for accessing QV streams is in a DAZZ_QV record that is a "pseudo-track" // named ".@qvs" and is always the first track record in the list (if present). Since normal // track names cannot begin with a . (this is enforced), this pseudo-track is never confused // with a normal track. typedef struct { struct _track *next; char *name; int ncodes; // # of coding tables QVcoding *coding; // array [0..ncodes-1] of coding schemes (see QV.h) uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with // scheme coding[table[i]] FILE *quiva; // the open file pointer to the .qvs file } DAZZ_QV; // The DB record holds all information about the current state of an active DB including an // array of DAZZ_READS, one per read, and a linked list of DAZZ_TRACKs the first of which // is always a DAZZ_QV pseudo-track (if the QVs have been loaded).
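// As an illustrative sketch of the track conventions above (not part of this header):
// for a mask track such as the one DBdust produces, anno holds nreads+1 byte offsets into
// the int array data, and the data between consecutive offsets is a list of (begin,end)
// interval pairs.  The intervals of read i in a loaded "dust" track could therefore be
// walked as follows (mirroring what DBdump does below):
//
//   DAZZ_TRACK *dust = Load_Track(db,"dust");
//   int64      *anno = (int64 *) dust->anno;
//   int        *data = (int *) dust->data;
//   int64       j, s, f;
//
//   s = (anno[i] >> 2);                        //  byte offsets over the 4-byte ints of data
//   f = (anno[i+1] >> 2);
//   for (j = s; j < f; j += 2)
//     printf(" [%d,%d]\n",data[j],data[j+1]);  //  one masked interval per pair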
typedef struct { int ureads; // Total number of reads in untrimmed DB int treads; // Total number of reads in trimmed DB int cutoff; // Minimum read length in block (-1 if not yet set) int allarr; // DB_ALL | DB_ARROW float freq[4]; // frequency of A, C, G, T, respectively // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed) int maxlen; // length of maximum read (initially over all DB) int64 totlen; // total # of bases (initially over all DB) int nreads; // # of reads in actively loaded portion of DB int trimmed; // DB has been trimmed by cutoff/all int part; // DB block (if > 0), total DB (if == 0) int ufirst; // Index of first read in block (without trimming) int tfirst; // Index of first read in block (with trimming) // In order to avoid forcing users to have to rebuild all thier DBs to accommodate // the addition of fields for the size of the actively loaded trimmed and untrimmed // blocks, an additional read record is allocated in "reads" when a DB is loaded into // memory (reads[-1]) and the two desired fields are crammed into the first two // integer spaces of the record. char *path; // Root name of DB for .bps, .qvs, and tracks int loaded; // Are reads loaded in memory? void *bases; // file pointer for bases file (to fetch reads from), // or memory pointer to uncompressed block of all sequences. DAZZ_READ *reads; // Array [-1..nreads] of DAZZ_READ DAZZ_TRACK *tracks; // Linked list of loaded tracks } DAZZ_DB; /******************************************************************************************* * * DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock * ********************************************************************************************/ #define MAX_NAME 10000 // Longest file name or fasta header line #define DB_NFILE "files = %9d\n" // number of files #define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name #define DB_NBLOCK "blocks = %9d\n" // number of blocks #define DB_PARAMS "size = %10lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well #define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed) /******************************************************************************************* * * DB ROUTINES * ********************************************************************************************/ // Suppose DB is the name of an original database. Then there will be files .DB.idx, .DB.bps, // .DB.qvs, and files .DB..anno and DB..data where is a track name // (not containing a . !). // A DAM is basically a DB except that: // 1. there are no QV's, instead .coff points the '\0' terminated fasta header of the read // in the file ..hdr file // 2. .origin contains the contig # of the read within a fasta entry (assembly sequences // contain N-separated contigs), and .fpulse the first base of the contig in the // fasta entry // Open the given database or dam, "path" into the supplied DAZZ_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. // Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char *path, DAZZ_DB *db); // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings // of the current DB partition. 
Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(DAZZ_DB *db); // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). void Close_DB(DAZZ_DB *db); // Return the size in bytes of the given DB int64 sizeof_DB(DAZZ_DB *db); // If QV pseudo track is not already in db's track list, then load it and set it up. // The database must not have been trimmed yet. -1 is returned if a .qvs file is not // present, and 1 is returned if an error (reported to EPLACE) occurred and INTERACTIVE // is defined. Otherwise a 0 is returned. int Load_QVs(DAZZ_DB *db); // Remove the QV pseudo track, all space associated with it, and close the .qvs file. void Close_QVs(DAZZ_DB *db); // Look up the file and header in the file of the indicated track. Return: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track // In addition, if opened (0 or 1 returned), then kind points at an integer indicating // the type of track as follows: // CUSTOM 0 => a custom track // MASK 1 => a mask track #define CUSTOM_TRACK 0 #define MASK_TRACK 1 int Check_Track(DAZZ_DB *db, char *track, int *kind); // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created DAZZ_TRACK record. If the track does not exist or cannot be // opened for some reason, then NULL is returned if INTERACTIVE is defined. Otherwise // the routine prints an error message to stderr and exits if an error occurs, and returns // with NULL only if the track does not exist. DAZZ_TRACK *Load_Track(DAZZ_DB *db, char *track); // Assuming the file pointer for afile is correctly positioned at the start of an extra item, // and aname is the name of the .anno file, decode the value present and place it in // extra if extra->nelem == 0, otherwise reduce the value just read into extra // according to the directive given by 'accum'. Leave the read pointer at the next // extra or end-of-file. // Returns: // 1 if at the end of file, // 0 if item was read and folded correctly, // -1 if there was a system IO or allocation error (if interactive), and // -2 if the new value could not be reduced into the current value of extra (interactive) int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra); // Write extra record to end of file afile and advance write pointer // If interactive, then return non-zero on error, if batch, then print // and halt if an error int Write_Extra(FILE *afile, DAZZ_EXTRA *extra); // If track is on the db's track list, then it is removed and all storage associated with it // is freed. void Close_Track(DAZZ_DB *db, char *track); // Allocate and return a buffer big enough for the largest read in 'db'. // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte // are needed by the alignment algorithms. If cannot allocate memory then return NULL // if INTERACTIVE is defined, or print error to stderr and exit otherwise. char *New_Read_Buffer(DAZZ_DB *db); // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T) // otherwise.
A '\0' (or 4) is prepended and appended to the string so it has a delimiter // for traversals in either direction. A non-zero value is returned if an error occurred // and INTERACTIVE is defined. int Load_Read(DAZZ_DB *db, int i, char *read, int ascii); // Exactly the same as Load_Read, save the arrow information is loaded, not the DNA sequence, // and there is only a choice between numeric (0) or ascii (1); int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii); // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string // over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to // the string holding the substring so it has a delimiter for traversals in either direction. // A NULL pointer is returned if an error occurred and INTERACTIVE is defined. char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii); // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur // in the database. If cannot allocate memory then return NULL if INTERACTIVE is defined, // or print error to stderr and exit otherwise. #define DEL_QV 0 // The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer #define DEL_TAG 1 // The deleted characters #define INS_QV 2 // The insertion QVs #define SUB_QV 3 // The substitution QVs #define MRG_QV 4 // The merge QVs char **New_QV_Buffer(DAZZ_DB *db); // Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters // are converted to a numeric or upper/lower case ascii string as per ascii. Return with // a zero, except when an error occurs and INTERACTIVE is defined in which case return with 1. int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii); // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. If ascii is // 1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and // otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T). // Return with a zero, except when an error occurs and INTERACTIVE is defined in which // case return with 1. int Read_All_Sequences(DAZZ_DB *db, int ascii); // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all // those of the form "prefix/[.]root.part" and call actor with the complete path to each file // pointed at by path, and the suffix of the path by extension. The . precedes the root // name if the defined constant HIDE_FILES is set. Always the first call is with the // path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and // so this routine gives one a way to know all the tracks associated with a given DB. // -1 is returned if the path could not be found, and 1 is returned if an error (reported // to EPLACE) occurred and INTERACTIVE is defined. Otherwise a 0 is returned.
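// For example (an illustrative sketch only, not part of the library), an actor that just
// reports every file of a DB, and hence every track, could be supplied as follows
// (list_actor is a hypothetical name):
//
//   static void list_actor(char *path, char *extension)
//   { printf("%s (.%s)\n",path,extension); }
//
//   ...
//
//   if (List_DB_Files("project/reads.db",list_actor) < 0)
//     fprintf(stderr,"Could not find the database\n");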
int List_DB_Files(char *path, void actor(char *path, char *extension)); #endif // _DAZZ_DB DAZZ_DB-master/DB2arrow.c000066400000000000000000000112721322703422500152550ustar00rootroot00000000000000/******************************************************************************************** * * Recreate all the .arrow files that have been loaded into a specified database. * * Author: Gene Myers * Date : October 2016 * ********************************************************************************************/ #include #include #include #include "DB.h" static char *Usage = "[-v] [-w] "; int main(int argc, char *argv[]) { DAZZ_DB _db, *db = &_db; FILE *dbfile; char *dbfile_name; int VERBOSE, WIDTH; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DB2arrow") WIDTH = 80; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vU") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -w: Print -w bp per line (default is 80).\n"); exit (1); } } // Open db, and db stub file { int status; char *pwd, *root; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (db->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } if ((db->allarr & DB_ARROW) == 0) { fprintf(stderr,"%s: There is no Arrow information in the DB: %s\n",Prog_Name,argv[1]); exit (1); } pwd = PathTo(argv[1]); root = Root(argv[1],".db"); dbfile_name = Strdup(Catenate(pwd,"/",root,".db"),"Allocting db file name"); dbfile = Fopen(dbfile_name,"r"); free(pwd); free(root); if (dbfile_name == NULL || dbfile == NULL) exit (1); } // For each cell do: { DAZZ_READ *reads; char lname[MAX_NAME]; FILE *ofile = NULL; int f, first, last, ofirst, nfiles; char *read; FSCANF(dbfile,DB_NFILE,&nfiles) reads = db->reads; read = New_Read_Buffer(db); first = ofirst = 0; for (f = 0; f < nfiles; f++) { int i; char prolog[MAX_NAME], fname[MAX_NAME]; // Scan db image file line, create .arrow file for writing FSCANF(dbfile,DB_FDATA,&last,fname,prolog) if (f == 0 || strcmp(fname,lname) != 0) { if (f > 0) { if (ofile == stdout) { fprintf(stderr," %d reads\n",first-ofirst); fflush(stderr); } else FCLOSE(ofile) } if (strcmp(fname,"stdout") == 0) { ofile = stdout; ofirst = first; if (VERBOSE) { fprintf(stderr,"Sending to stdout ..."); fflush(stdout); } } else { if ((ofile = Fopen(Catenate(".","/",fname,".arrow"),"w")) == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Creating %s.arrow ...\n",fname); fflush(stdout); } } strcpy(lname,fname); } // For the relevant range of reads, write each to the file // recreating the original headers with the index meta-data about each read for (i = first; i < last; i++) { int j, len; uint64 big; float snr[4]; DAZZ_READ *r; r = reads + i; len = r->rlen; big = *((uint64 *) &(r->coff)); for (j = 0; j < 4; j++) { snr[3-j] = (big & 0xffff) / 100.; big >>= 16; } FPRINTF(ofile,">%s",prolog) FPRINTF(ofile," SN=%.2f,%.2f,%.2f,%.2f",snr[0],snr[1],snr[2],snr[3]) FPRINTF(ofile,"\n") Load_Arrow(db,i,read,1); for (j = 0; j+WIDTH < len; j += WIDTH) FPRINTF(ofile,"%.*s\n",WIDTH,read+j) if (j < len) FPRINTF(ofile,"%s\n",read+j) } first = last; } if (f > 0) { if (ofile == stdout) { fprintf(stderr," %d reads\n",first-ofirst); fflush(stderr); } else 
FCLOSE(ofile) } } fclose(dbfile); Close_DB(db); exit (0); } DAZZ_DB-master/DB2fasta.c000066400000000000000000000110541322703422500152170ustar00rootroot00000000000000/******************************************************************************************** * * Recreate all the .fasta files that have been loaded into a specified database. * * Author: Gene Myers * Date : May 2014 * ********************************************************************************************/ #include #include #include #include "DB.h" static char *Usage = "[-vU] [-w] "; int main(int argc, char *argv[]) { DAZZ_DB _db, *db = &_db; FILE *dbfile; char *dbfile_name; int VERBOSE, UPPER, WIDTH; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DB2fasta") WIDTH = 80; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vU") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; } else argv[j++] = argv[i]; argc = j; UPPER = 1 + flags['U']; VERBOSE = flags['v']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -U: Use upper case for DNA (default is lower case).\n"); fprintf(stderr," -w: Print -w bp per line (default is 80).\n"); exit (1); } } // Open db, and db stub file { int status; char *pwd, *root; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (db->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } pwd = PathTo(argv[1]); root = Root(argv[1],".db"); dbfile_name = Strdup(Catenate(pwd,"/",root,".db"),"Allocating db file name"); dbfile = Fopen(dbfile_name,"r"); free(pwd); free(root); if (dbfile_name == NULL || dbfile == NULL) exit (1); } // For each cell do: { DAZZ_READ *reads; char lname[MAX_NAME]; FILE *ofile = NULL; int f, first, last, ofirst, nfiles; char *read; FSCANF(dbfile,DB_NFILE,&nfiles) reads = db->reads; read = New_Read_Buffer(db); first = ofirst = 0; for (f = 0; f < nfiles; f++) { int i; char prolog[MAX_NAME], fname[MAX_NAME]; // Scan db image file line, create .fasta file for writing FSCANF(dbfile,DB_FDATA,&last,fname,prolog) if (f == 0 || strcmp(fname,lname) != 0) { if (f > 0) { if (ofile == stdout) { fprintf(stderr," %d reads\n",first-ofirst); fflush(stderr); } else FCLOSE(ofile) } if (strcmp(fname,"stdout") == 0) { ofile = stdout; ofirst = first; if (VERBOSE) { fprintf(stderr,"Sending to stdout ..."); fflush(stdout); } } else { if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Creating %s.fasta ...\n",fname); fflush(stdout); } } strcpy(lname,fname); } // For the relevant range of reads, write each to the file // recreating the original headers with the index meta-data about each read for (i = first; i < last; i++) { int j, len; int flags, qv; DAZZ_READ *r; r = reads + i; len = r->rlen; flags = r->flags; qv = (flags & DB_QV); FPRINTF(ofile,">%s/%d/%d_%d",prolog,r->origin,r->fpulse,r->fpulse+len) if (qv > 0) FPRINTF(ofile," RQ=0.%3d",qv) FPRINTF(ofile,"\n") Load_Read(db,i,read,UPPER); for (j = 0; j+WIDTH < len; j += WIDTH) FPRINTF(ofile,"%.*s\n",WIDTH,read+j) if (j < len) FPRINTF(ofile,"%s\n",read+j) } first = last; } if (f > 0) { if (ofile == stdout) { fprintf(stderr," %d reads\n",first-ofirst); fflush(stderr); } else FCLOSE(ofile) } } fclose(dbfile); Close_DB(db); exit (0); } 
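/**********************************************************************************************
 *
 *  The export loop of DB2fasta above reduces, in essence, to the core DB calls Open_DB,
 *  Trim_DB, New_Read_Buffer, Load_Read, and Close_DB.  The routine below is a stripped-down
 *  sketch of that loop and is not part of the distribution: the stub-file bookkeeping that
 *  recovers the original file names and prologs is omitted, and the header format is made
 *  up.  "export_example" and the ">read/#" headers are hypothetical.
 *
 **********************************************************************************************/

#include <stdio.h>
#include <stdlib.h>

#include "DB.h"

int export_example(char *path)
{ DAZZ_DB _db, *db = &_db;
  char   *read;
  int     i, j, len;

  if (Open_DB(path,db) < 0)        //  -1 => could not be opened (error reported to EPLACE)
    return (1);
  Trim_DB(db);                     //  honor the DB's cutoff/all settings

  read = New_Read_Buffer(db);      //  big enough for the longest read (see note in DB.h)

  for (i = 0; i < db->nreads; i++)
    { len = db->reads[i].rlen;
      Load_Read(db,i,read,1);      //  1 => lower-case ascii

      printf(">read/%d\n",i+1);
      for (j = 0; j < len; j += 80)
        printf("%.*s\n",(len-j < 80 ? len-j : 80),read+j);
    }

  free(read-1);                    //  per the New_Read_Buffer note in DB.h
  Close_DB(db);
  return (0);
}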
DAZZ_DB-master/DB2quiva.c000066400000000000000000000115311322703422500152460ustar00rootroot00000000000000/******************************************************************************************** * * Recreate all the .quiva files that have been loaded into a specified database. * * Author: Gene Myers * Date : May 2014 * ********************************************************************************************/ #include #include #include #include "DB.h" #include "QV.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-vU] "; int main(int argc, char *argv[]) { DAZZ_DB _db, *db = &_db; FILE *dbfile, *quiva; char *dbfile_name; int VERBOSE, UPPER; // Process arguments { int i, j, k; int flags[128]; ARG_INIT("DB2quiva") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("vU") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; UPPER = flags['U']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -U: Use upper case for DNA (default is lower case).\n"); exit (1); } } // Open db, db stub file, and .qvs file { char *pwd, *root; int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (db->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } if (db->reads[0].coff < 0 || (db->allarr & DB_ARROW) != 0) { fprintf(stderr,"%s: There is no Quiver information in the DB: %s\n",Prog_Name,argv[1]); exit (1); } pwd = PathTo(argv[1]); root = Root(argv[1],".db"); dbfile_name = Strdup(Catenate(pwd,"/",root,".db"),"Allocating db file name"); dbfile = Fopen(dbfile_name,"r"); quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r"); free(pwd); free(root); if (dbfile_name == NULL || dbfile == NULL || quiva == NULL) exit (1); } // For each cell do: { DAZZ_READ *reads; char lname[MAX_NAME]; FILE *ofile = NULL; int f, first, last, ofirst, nfiles; QVcoding *coding; char **entry; FSCANF(dbfile,DB_NFILE,&nfiles) reads = db->reads; entry = New_QV_Buffer(db); first = ofirst = 0; for (f = 0; f < nfiles; f++) { int i; char prolog[MAX_NAME], fname[MAX_NAME]; // Scan db image file line, create .quiva file for writing if (reads[first].coff < 0) break; FSCANF(dbfile,DB_FDATA,&last,fname,prolog) if (f == 0 || strcmp(fname,lname) != 0) { if (f > 0) { if (ofile == stdout) { fprintf(stderr," %d quivas\n",first-ofirst); fflush(stderr); } else FCLOSE(ofile) } if (strcmp(fname,"stdout") == 0) { ofile = stdout; ofirst = first; if (VERBOSE) { fprintf(stderr,"Sending to stdout ..."); fflush(stdout); } } else { if ((ofile = Fopen(Catenate(".","/",fname,".quiva"),"w")) == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Creating %s.quiva ...\n",fname); fflush(stderr); } } strcpy(lname,fname); } // For the relevant range of reads, write the header for each to the file // and then uncompress and write the quiva entry for each coding = Read_QVcoding(quiva); for (i = first; i < last; i++) { int e, flags, qv, rlen; DAZZ_READ *r; r = reads + i; flags = r->flags; rlen = r->rlen; qv = (flags & DB_QV); FPRINTF(ofile,"@%s/%d/%d_%d",prolog,r->origin,r->fpulse,r->fpulse+rlen) if (qv > 0) FPRINTF(ofile," RQ=0.%3d",qv) FPRINTF(ofile,"\n") Uncompress_Next_QVentry(quiva,entry,coding,rlen); if (UPPER) { char *deltag = entry[1]; int j; for (j = 0; j < rlen; j++) deltag[j] -= 32; } for (e = 0; e < 5; e++) FPRINTF(ofile,"%.*s\n",rlen,entry[e]) } first = last; } if (f > 0) 
{ if (ofile == stdout) { fprintf(stderr," %d quivas\n",first-ofirst); fflush(stderr); } else FCLOSE(ofile) } } fclose(quiva); fclose(dbfile); Close_DB(db); exit (0); } DAZZ_DB-master/DBdump.c000066400000000000000000000563661322703422500150230ustar00rootroot00000000000000/******************************************************************************************* * * Display a portion of the data-base and selected information in 1-code format. * * Author: Gene Myers * Date : November 2015 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage[] = { "[-rhsaiqp] [-uU] [-m]+", " [ | ... ]" }; #define LAST_READ_SYMBOL '$' #define MAX_BUFFER 10001 typedef struct { FILE *input; int lineno; int read; int beg; int end; } File_Iterator; File_Iterator *init_file_iterator(FILE *input) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->input = input; it->lineno = 1; rewind(input); return (it); } int next_read(File_Iterator *it) { static char nbuffer[MAX_BUFFER]; char *eol; int x; if (fgets(nbuffer,MAX_BUFFER,it->input) == NULL) { if (feof(it->input)) return (1); SYSTEM_READ_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in read list is longer than %d chars!\n", Prog_Name,it->lineno,MAX_BUFFER-1); return (1); } *eol = '\0'; x = sscanf(nbuffer," %d %d %d",&(it->read),&(it->beg),&(it->end)); if (x == 1) it->beg = -1; else if (x != 3) { fprintf(stderr,"%s: Line %d of read list is improperly formatted\n",Prog_Name,it->lineno); return (1); } it->lineno += 1; return (0); } static int qv_map[51] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y' }; static int prof_map[41] = { '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', }; int main(int argc, char *argv[]) { DAZZ_DB _db, *db = &_db; int Quiva_DB, Arrow_DB; FILE *hdrs = NULL; char *hdrs_name = NULL; int64 *qv_idx = NULL; uint8 *qv_val = NULL; int64 *pf_idx = NULL; uint8 *pf_val = NULL; int nfiles; char **flist = NULL; int *findx = NULL; int input_pts; int reps = 0; int *pts = NULL; File_Iterator *iter = NULL; FILE *input = NULL; int TRIM, UPPER; int DORED, DOSEQ, DOARW, DOQVS, DOHDR, DOIQV, DOPRF, DAM; int MMAX, MTOP; char **MASK; DAZZ_TRACK **MTRACK; // Process arguments { int i, j, k; int flags[128]; ARG_INIT("DBdump") MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); if (MASK == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("hpqrsaiuU") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; } else argv[j++] = argv[i]; argc = j; DAM = 0; TRIM = 1-flags['u']; UPPER = 1+flags['U']; DOQVS = flags['q']; DORED = flags['r']; DOSEQ = flags['s']; DOARW = flags['a']; DOHDR = flags['h']; DOIQV = flags['i']; DOPRF = flags['p']; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); 
fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); fprintf(stderr,"\n"); fprintf(stderr," -r: R # - read number\n"); fprintf(stderr," -h: H # string - original file name string (header)\n"); fprintf(stderr," L # # # - location: well, pulse start, pulse end\n"); fprintf(stderr," Q # - quality of read (#/1000)\n"); fprintf(stderr," -s: S # string - sequence string\n"); fprintf(stderr," -a: N # # # # - SNR of ACGT channels (#/100)\n"); fprintf(stderr," A # string - arrow pulse-width string\n"); fprintf(stderr," -i: I # string "); fprintf(stderr,"- intrinsic quality vector (as an ASCII string)\n"); fprintf(stderr," -q: d # string - Quiva deletion values (as an ASCII string)\n"); fprintf(stderr," c # string - Quiva deletion character string\n"); fprintf(stderr," i # string - Quiva insertion value string\n"); fprintf(stderr," m # string - Quiva merge value string\n"); fprintf(stderr," s # string - Quiva substitution value string\n"); fprintf(stderr," -p: P # string - repeat profile vector (as an ASCII string)\n"); fprintf(stderr," -m: Tx #n (#b #e)^#n "); fprintf(stderr,"- x'th track on command line, #n intervals all on same line\n"); fprintf(stderr,"\n"); fprintf(stderr," -u: Dump entire untrimmed database.\n"); fprintf(stderr," -U: Output base pairs in upper case letters\n"); exit (1); } if ( ! TRIM && DOIQV) { fprintf(stderr,"%s: -i and -u are incompatible\n",Prog_Name); exit (1); } if ( ! TRIM && DOPRF) { fprintf(stderr,"%s: -p and -u are incompatible\n",Prog_Name); exit (1); } } // Open DB or DAM, and if a DAM open also .hdr file { char *pwd, *root; int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { root = Root(argv[1],".dam"); pwd = PathTo(argv[1]); if (db->part > 0) *rindex(root,'.') = '\0'; hdrs_name = Strdup(Catenate(pwd,PATHSEP,root,".hdr"),"Allocating header file name"); hdrs = Fopen(hdrs_name,"r"); if (hdrs == NULL) exit (1); DAM = 1; if (DOQVS) { fprintf(stderr,"%s: -q option is not compatible with a .dam DB\n",Prog_Name); exit (1); } if (DOARW) { fprintf(stderr,"%s: -a option is not compatible with a .dam DB\n",Prog_Name); exit (1); } free(root); free(pwd); } Arrow_DB = ((db->allarr & DB_ARROW) != 0); Quiva_DB = (db->reads[0].coff >= 0 && (db->allarr & DB_ARROW) == 0); if (DOARW) { if (!Arrow_DB) { fprintf(stderr,"%s: -a option set but no Arrow data in DB\n",Prog_Name); exit (1); } } if (DOQVS) { if (!Quiva_DB) { fprintf(stderr,"%s: -q option set but no Quiver data in DB\n",Prog_Name); exit (1); } } } // Load QVs if requested if (DOQVS) { if (Load_QVs(db) < 0) { fprintf(stderr,"%s: QVs requested, but no .qvs for data base\n",Prog_Name); exit (1); } } // Check tracks and load tracks for untrimmed DB { int i, status, kind; MTRACK = Malloc(sizeof(DAZZ_TRACK *)*MTOP,"Allocation of track pointer vector"); if (MTRACK == NULL) exit (1); for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i],&kind); if (status == -2) { fprintf(stderr,"%s: Warning: -m%s option given but no track found.\n", Prog_Name,MASK[i]); exit (1); } else if (status == -1) { fprintf(stderr,"%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]); exit (1); } else if (kind != MASK_TRACK) { fprintf(stderr,"%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]); exit (1); } else if (status == 0) MTRACK[i] = Load_Track(db,MASK[i]); else if (status == 1 && !TRIM) { fprintf(stderr,"%s: Warning: %s track is for a trimmed db but -u is set.\n", Prog_Name,MASK[i]); exit (1); } } } // If not a DAM then get prolog names and index ranges from the .db file if 
(!DAM) { char *pwd, *root; FILE *dstub; char *dstub_name; int i; root = Root(argv[1],".db"); pwd = PathTo(argv[1]); if (db->part > 0) *rindex(root,'.') = '\0'; dstub_name = Strdup(Catenate(pwd,"/",root,".db"),"Allocating db file name"); dstub = Fopen(dstub_name,"r"); if (dstub_name == NULL || dstub == NULL) exit (1); free(pwd); free(root); FSCANF(dstub,DB_NFILE,&nfiles) flist = (char **) Malloc(sizeof(char *)*nfiles,"Allocating file list"); findx = (int *) Malloc(sizeof(int *)*(nfiles+1),"Allocating file index"); if (flist == NULL || findx == NULL) exit (1); findx += 1; findx[-1] = 0; for (i = 0; i < nfiles; i++) { char prolog[MAX_NAME], fname[MAX_NAME]; FSCANF(dstub,DB_FDATA,findx+i,fname,prolog) if ((flist[i] = Strdup(prolog,"Adding to file list")) == NULL) exit (1); } free(dstub_name); fclose(dstub); // If TRIM (the default) then "trim" prolog ranges and the DB if (TRIM) { int nid, oid, lid; int cutoff, allflag; DAZZ_READ *reads; reads = db->reads - db->ufirst; cutoff = db->cutoff; if ((db->allarr & DB_ALL) != 0) allflag = 0; else allflag = DB_BEST; nid = 0; oid = db->ufirst; lid = oid + db->nreads; for (i = 0; i < nfiles; i++) { while (oid < findx[i] && oid < lid) { if ((reads[oid].flags & DB_BEST) >= allflag && reads[oid].rlen >= cutoff) nid++; oid += 1; } findx[i] = nid; } } else if (db->part > 0) { for (i = 0; i < nfiles; i++) findx[i] -= db->ufirst; } } if (TRIM) { int i, status, kind; Trim_DB(db); // Load tracks for trimmed DB for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i],&kind); if (status < 0) continue; else if (status == 1) MTRACK[i] = Load_Track(db,MASK[i]); } } if (DOIQV) { int status, kind; DAZZ_TRACK *track; status = Check_Track(db,"qual",&kind); if (status == -2) { fprintf(stderr,"%s: .qual-track does not exist for this db.\n",Prog_Name); exit (1); } if (status == -1) { fprintf(stderr,"%s: .qual-track not sync'd with db.\n",Prog_Name); exit (1); } track = Load_Track(db,"qual"); qv_idx = (int64 *) track->anno; qv_val = (uint8 *) track->data; } if (DOPRF) { int status, kind; DAZZ_TRACK *track; status = Check_Track(db,"prof",&kind); if (status == -2) { fprintf(stderr,"%s: .prof-track does not exist for this db.\n",Prog_Name); exit (1); } if (status == -1) { fprintf(stderr,"%s: .prof-track not sync'd with db.\n",Prog_Name); exit (1); } track = Load_Track(db,"prof"); pf_idx = (int64 *) track->anno; pf_val = (uint8 *) track->data; } // Process read index arguments into a list of read ranges input_pts = 0; if (argc == 3) { if (argv[2][0] != LAST_READ_SYMBOL || argv[2][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[2],&eptr,10); if (eptr > argv[2] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if (input_pts) { input = Fopen(argv[2],"r"); if (input == NULL) exit (1); iter = init_file_iterator(input); } else { pts = (int *) Malloc(sizeof(int)*2*(argc-1),"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 2) { int c, b, e; char *eptr, *fptr; for (c = 2; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == 0) { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = db->nreads; fptr = eptr+2; } else e 
= strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } } else { pts[reps++] = 1; pts[reps++] = db->nreads; } } // Scan to count the size of things { DAZZ_READ *reads; int c, b, e, i, m; int map, substr; int64 noreads; int64 seqmax, seqtot; int64 iqvmax, iqvtot; int64 prfmax, prftot; int64 hdrmax, hdrtot; int64 trkmax[MTOP], trktot[MTOP]; map = 0; reads = db->reads; substr = 0; noreads = 0; seqmax = 0; seqtot = 0; iqvmax = 0; iqvtot = 0; prfmax = 0; prftot = 0; hdrmax = 0; hdrmax = 0; hdrtot = 0; for (m = 0; m < MTOP; m++) { trkmax[m] = 0; trktot[m] = 0; } c = 0; while (1) { if (input_pts) { if (next_read(iter)) break; e = iter->read; b = e-1; substr = (iter->beg >= 0); } else { if (c >= reps) break; b = pts[c]-1; e = pts[c+1]; if (e > db->nreads) e = db->nreads; c += 2; } for (i = b; i < e; i++) { int len, ten; int fst, lst; DAZZ_READ *r; r = reads + i; len = r->rlen; noreads += 1; if (DOHDR) { int ten; if (DAM) { char header[MAX_NAME]; FSEEKO(hdrs,r->coff,SEEK_SET) FGETS(header,MAX_NAME,hdrs) header[strlen(header)-1] = '\0'; ten = strlen(header); } else { while (i < findx[map-1]) map -= 1; while (i >= findx[map]) map += 1; ten = strlen(flist[map]); } if (hdrmax < ten) hdrmax = ten; hdrtot += ten; } for (m = 0; m < MTOP; m++) { int64 *anno; anno = (int64 *) MTRACK[m]->anno; ten = ((anno[i+1]-anno[i]) >> 3); if (ten > trkmax[m]) trkmax[m] = ten; trktot[m] += ten; } if (substr) { fst = iter->beg; lst = iter->end; if (DOIQV) { fprintf(stderr,"%s: Cannot select subreads when -i is requested\n",Prog_Name); exit (1); } if (DOPRF) { fprintf(stderr,"%s: Cannot select subreads when -p is requested\n",Prog_Name); exit (1); } } else { fst = 0; lst = len; } if (DOSEQ | DOQVS | DOARW) { int ten = lst-fst; if (ten > seqmax) seqmax = ten; seqtot += ten; } if (DOIQV) { int ten = qv_idx[i+1] - qv_idx[i]; if (ten > iqvmax) iqvmax = ten; iqvtot += ten; } if (DOPRF) { int ten = pf_idx[i+1] - pf_idx[i]; if (ten > prfmax) prfmax = ten; prftot += ten; } } } PRINTF("+ R %lld\n",noreads) PRINTF("+ M %d\n",MTOP) if (DOHDR) { PRINTF("+ H %lld\n",hdrtot) PRINTF("@ H %lld\n",hdrmax) } for (m = 0; m < MTOP; m++) { PRINTF("+ T%d %lld\n",m,trktot[m]) PRINTF("@ T%d %lld\n",m,trkmax[m]) } if (DOSEQ | DOQVS | DOARW) { PRINTF("+ S %lld\n",seqtot) PRINTF("@ S %lld\n",seqmax) } if (DOIQV) { PRINTF("+ I %lld\n",iqvtot) PRINTF("@ I %lld\n",iqvmax) } if (DOPRF) { PRINTF("+ P %lld\n",prftot) PRINTF("@ P %lld\n",prfmax) } } // Display each read (and/or QV streams) in the active DB according to the // range pairs in pts[0..reps) and according to the display options. 
{ DAZZ_READ *reads; char *read, *arrow, **entry; int c, b, e, i, m; int substr; int map; char qvname[5] = { 'd', 'c', 'i', 'm', 's' }; read = New_Read_Buffer(db); if (DOQVS) entry = New_QV_Buffer(db); else entry = NULL; if (DOARW) arrow = New_Read_Buffer(db); else arrow = NULL; map = 0; reads = db->reads; substr = 0; if (input_pts) iter = init_file_iterator(input); else iter = NULL; c = 0; while (1) { if (input_pts) { if (next_read(iter)) break; e = iter->read; b = e-1; substr = (iter->beg >= 0); } else { if (c >= reps) break; b = pts[c]-1; e = pts[c+1]; if (e > db->nreads) e = db->nreads; c += 2; } for (i = b; i < e; i++) { int len; int fst, lst; int flags, qv; DAZZ_READ *r; r = reads + i; len = r->rlen; if (DORED) printf("R %d\n",i+1); flags = r->flags; qv = (flags & DB_QV); if (DOHDR) { if (DAM) { char header[MAX_NAME]; FSEEKO(hdrs,r->coff,SEEK_SET) FGETS(header,MAX_NAME,hdrs) header[strlen(header)-1] = '\0'; PRINTF("H %ld %s\n",strlen(header),header) PRINTF("L %d %d %d\n",r->origin,r->fpulse,r->fpulse+len) } else { while (i < findx[map-1]) map -= 1; while (i >= findx[map]) map += 1; PRINTF("H %ld %s\n",strlen(flist[map]),flist[map]) PRINTF("L %d %d %d\n",r->origin,r->fpulse,r->fpulse+len) if (Quiva_DB && qv > 0) PRINTF("Q %d\n",qv) else if (Arrow_DB) { int j, snr[4]; int64 big; big = *((uint64 *) &(r->coff)); for (j = 0; j < 4; j++) { snr[3-j] = (big & 0xffff); big >>= 16; } PRINTF("N %d %d %d %d\n",snr[0],snr[1],snr[2],snr[3]) } } } if (DOQVS) Load_QVentry(db,i,entry,UPPER); if (DOSEQ) Load_Read(db,i,read,UPPER); if (DOARW) Load_Arrow(db,i,arrow,1); for (m = 0; m < MTOP; m++) { int64 *anno; int *data; int64 s, f, j; anno = (int64 *) MTRACK[m]->anno; data = (int *) MTRACK[m]->data; s = (anno[i] >> 2); f = (anno[i+1] >> 2); PRINTF("T%d %lld ",m,(f-s)/2) if (s < f) { for (j = s; j < f; j += 2) PRINTF(" %d %d",data[j],data[j+1]) } PRINTF("\n") } if (substr) { fst = iter->beg; lst = iter->end; } else { fst = 0; lst = len; } if (DOSEQ) { PRINTF("S %d ",lst-fst) PRINTF("%.*s\n",lst-fst,read+fst) } if (DOARW) { PRINTF("A %d ",lst-fst) PRINTF("%.*s\n",lst-fst,arrow+fst) } if (DOIQV) { int64 k, e; k = qv_idx[i]; e = qv_idx[i+1]; PRINTF("I %lld ",e-k) while (k < e) { if (putchar(qv_map[qv_val[k++]]) == EOF) SYSTEM_WRITE_ERROR } PRINTF("\n") } if (DOPRF) { int64 k, e; k = pf_idx[i]; e = pf_idx[i+1]; PRINTF("P %lld ",e-k) while (k < e) { if (putchar(prof_map[pf_val[k++]]) == EOF) SYSTEM_WRITE_ERROR } PRINTF("\n") } if (DOQVS) { int k; for (k = 0; k < 5; k++) { PRINTF("%c %d ",qvname[k],lst-fst) PRINTF("%.*s\n",lst-fst,entry[k]+fst) } } } } } FCLOSE(stdout) if (input_pts) { fclose(input); free(iter); } else free(pts); if (DAM) fclose(hdrs); else { int i; for (i = 0; i < nfiles; i++) free(flist[i]); free(flist); free(findx-1); } Close_DB(db); exit (0); } DAZZ_DB-master/DBdust.c000066400000000000000000000317011322703422500150170ustar00rootroot00000000000000/******************************************************************************************* * * My implementation of the SDUST algorithm (Morgulis et al., JCB 13, 5 (2006), 1028-1040) * * Author: Gene Myers * Date : September 2013 * Mod : Is now incremental * Date : April 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif #undef DEBUG #ifdef DEBUG static int Caps[4] = { 'A', 'C', 'G', 'T' }; static int Lowr[4] = { 'a', 'c', 'g', 't' }; #endif static char *Usage = "[-b] [-w] [-t] [-m] "; typedef struct _cand { struct _cand *next; struct _cand *prev; int beg; int end; double score; } Candidate; int main(int argc, char *argv[]) { DAZZ_DB _db, *db = &_db; FILE *afile, *dfile; int64 indx; int nreads; int *mask; Candidate *cptr; int WINDOW; double THRESH; int MINLEN; int BIASED; { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DBdust") WINDOW = 64; THRESH = 2.; MINLEN = 9; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("b") break; case 'w': ARG_POSITIVE(WINDOW,"Window size") break; case 't': ARG_REAL(THRESH) if (THRESH <= 0.) { fprintf(stderr,"%s: Threshold must be positive (%g)\n",Prog_Name,THRESH); exit (1); } break; case 'm': ARG_NON_NEGATIVE(MINLEN,"Minimum hit") MINLEN -= 1; break; } else argv[j++] = argv[i]; argc = j; BIASED = flags['b']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -w: DUST algorithm window size.\n"); fprintf(stderr," -t: DUST algorithm threshold.\n"); fprintf(stderr," -m: Record only low-complexity intervals >= this size.\n"); fprintf(stderr," -b: Take into account base composition bias.\n"); exit (1); } } // Open .db or .dam { int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); } mask = (int *) Malloc((db->maxlen+1)*sizeof(int),"Allocating mask vector"); cptr = (Candidate *) Malloc((WINDOW+1)*sizeof(Candidate),"Allocating candidate vector"); if (mask == NULL || cptr == NULL) exit (1); { char *pwd, *root, *fname; int size; pwd = PathTo(argv[1]); root = Root(argv[1],".db"); size = 0; fname = Catenate(pwd,PATHSEP,root,".dust.anno"); if ((afile = fopen(fname,"r+")) == NULL || db->part > 0) { if (afile != NULL) fclose(afile); afile = Fopen(fname,"w"); dfile = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"w"); if (dfile == NULL || afile == NULL) exit (1); FWRITE(&(db->nreads),sizeof(int),1,afile) FWRITE(&size,sizeof(int),1,afile) nreads = 0; indx = 0; FWRITE(&indx,sizeof(int64),1,afile) } else { dfile = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"r+"); if (dfile == NULL) exit (1); if (fread(&nreads,sizeof(int),1,afile) != 1) SYSTEM_READ_ERROR if (nreads >= db->nreads) { fclose(afile); fclose(dfile); exit(0); } FSEEKO(afile,0,SEEK_SET) FWRITE(&(db->nreads),sizeof(int),1,afile) FWRITE(&size,sizeof(int),1,afile) FSEEKO(afile,0,SEEK_END) FSEEKO(dfile,0,SEEK_END) indx = FTELLO(dfile); } free(pwd); free(root); } { int *mask1; char *read, *lag2; int wcount[64], lcount[64]; Candidate *aptr; double skew[64], thresh2r; int thresh2i; int i; read = New_Read_Buffer(db); lag2 = read-2; mask1 = mask+1; *mask = -2; aptr = cptr+1; for (i = 1; i < WINDOW; i++) cptr[i].next = aptr+i; cptr[WINDOW].next = NULL; cptr->next = cptr->prev = cptr; cptr->beg = -2; thresh2r = 2.*THRESH; thresh2i = (int) ceil(thresh2r); if (BIASED) { int a, b, c, p; p = 0; for (a = 0; a < 4; a++) for (b = 0; b < 4; b++) for (c = 0; c < 4; c++) skew[p++] = .015625 / (db->freq[a]*db->freq[b]*db->freq[c]); } for (i = nreads; i < db->nreads; i++) { Candidate *lptr, *jptr; int *mtop; double mscore; int len; int wb, lb; int j, c, d; len = db->reads[i].rlen; // Fetch read Load_Read(db,i,read,0); c = (read[0] << 2) | read[1]; // Convert to triple codes for (j = 2; j < len; j++) { c = ((c << 2) & 0x3f) | read[j]; lag2[j] = (char) c; } len -= 2; for (j = 0; j < 64; j++) // Setup 
counter arrays wcount[j] = lcount[j] = 0; mtop = mask; // The dust algorithm lb = wb = -1; if (BIASED) { double lsqr, wsqr, trun; // Modification for high-compositional bias wsqr = lsqr = 0.; for (j = 0; j < len; j++) { c = read[j]; #define ADDR(e,cnt,sqr) sqr += (cnt[e]++) * skew[e]; #define DELR(e,cnt,sqr) sqr -= (--cnt[e]) * skew[e]; #define WADDR(e) ADDR(e,wcount,wsqr) #define WDELR(e) DELR(e,wcount,wsqr) #define LADDR(e) ADDR(e,lcount,lsqr) #define LDELR(e) DELR(e,lcount,lsqr) if (j > WINDOW-3) { d = read[++wb]; WDELR(d) } WADDR(c) if (lb < wb) { d = read[++lb]; LDELR(d) } trun = (lcount[c]++) * skew[c]; lsqr += trun; if (trun >= thresh2r) { while (lb < j) { d = read[++lb]; LDELR(d) if (d == c) break; } } jptr = cptr->prev; if (jptr != cptr && jptr->beg <= wb) { c = jptr->end + 2; if (*mtop+1 >= jptr->beg) { if (*mtop < c) *mtop = c; } else { *++mtop = jptr->beg; *++mtop = c; } lptr = jptr->prev; cptr->prev = lptr; lptr->next = cptr; jptr->next = aptr; aptr = jptr; } if (wsqr <= lsqr*THRESH) continue; jptr = cptr->next; lptr = cptr; mscore = 0.; for (c = lb; c > wb; c--) { d = read[c]; LADDR(d) if (lsqr >= THRESH * (j-c)) { for ( ; jptr->beg >= c; jptr = (lptr = jptr)->next) if (jptr->score > mscore) mscore = jptr->score; if (lsqr >= mscore * (j-c)) { mscore = lsqr / (j-c); if (lptr->beg == c) { lptr->end = j; lptr->score = mscore; } else { aptr->beg = c; aptr->end = j; aptr->score = mscore; aptr->prev = lptr; lptr = lptr->next = aptr; aptr = aptr->next; jptr->prev = lptr; lptr->next = jptr; } } } } for (c++; c <= lb; c++) { d = read[c]; LDELR(d) } } } else { int lsqr, wsqr, trun; // Algorithm for GC-balanced sequences wsqr = lsqr = 0; for (j = 0; j < len; j++) { c = read[j]; #define ADDI(e,cnt,sqr) sqr += (cnt[e]++); #define DELI(e,cnt,sqr) sqr -= (--cnt[e]); #define WADDI(e) ADDI(e,wcount,wsqr) #define WDELI(e) DELI(e,wcount,wsqr) #define LADDI(e) ADDI(e,lcount,lsqr) #define LDELI(e) DELI(e,lcount,lsqr) if (j > WINDOW-3) { d = read[++wb]; WDELI(d) } WADDI(c) if (lb < wb) { d = read[++lb]; LDELI(d) } trun = lcount[c]++; lsqr += trun; if (trun >= thresh2i) { while (lb < j) { d = read[++lb]; LDELI(d) if (d == c) break; } } jptr = cptr->prev; if (jptr != cptr && jptr->beg <= wb) { c = jptr->end + 2; if (*mtop+1 >= jptr->beg) { if (*mtop < c) *mtop = c; } else { *++mtop = jptr->beg; *++mtop = c; } lptr = jptr->prev; cptr->prev = lptr; lptr->next = cptr; jptr->next = aptr; aptr = jptr; } if (wsqr <= lsqr*THRESH) continue; jptr = cptr->next; lptr = cptr; mscore = 0.; for (c = lb; c > wb; c--) { d = read[c]; LADDI(d) if (lsqr >= THRESH * (j-c)) { for ( ; jptr->beg >= c; jptr = (lptr = jptr)->next) if (jptr->score > mscore) mscore = jptr->score; if (lsqr >= mscore * (j-c)) { mscore = (1. 
* lsqr) / (j-c); if (lptr->beg == c) { lptr->end = j; lptr->score = mscore; } else { aptr->beg = c; aptr->end = j; aptr->score = mscore; aptr->prev = lptr; lptr = lptr->next = aptr; aptr = aptr->next; jptr->prev = lptr; lptr->next = jptr; } } } } for (c++; c <= lb; c++) { d = read[c]; LDELI(d) } } } while ((jptr = cptr->prev) != cptr) { c = jptr->end + 2; if (*mtop+1 >= jptr->beg) { if (*mtop < c) *mtop = c; } else { *++mtop = jptr->beg; *++mtop = c; } cptr->prev = jptr->prev; jptr->prev->next = cptr; jptr->next = aptr; aptr = jptr; } { int *jtop, ntop; ntop = 0; for (jtop = mask1; jtop < mtop; jtop += 2) if (jtop[1] - jtop[0] >= MINLEN) { mask[++ntop] = jtop[0]; mask[++ntop] = jtop[1]+1; } mtop = mask + ntop; indx += ntop*sizeof(int); FWRITE(&indx,sizeof(int64),1,afile) FWRITE(mask1,sizeof(int),ntop,dfile) } #ifdef DEBUG { int *jtop; printf("\nREAD %d\n",i); for (jtop = mask1; jtop < mtop; jtop += 2) printf(" [%5d,%5d]\n",jtop[0],jtop[1]); Load_Read(db,i,read,0); jtop = mask1; for (c = 0; c < len; c++) { while (jtop < mtop && c > jtop[1]) jtop += 2; if (jtop < mtop && c >= *jtop) printf("%c",Caps[(int) read[c]]); else printf("%c",Lowr[(int) read[c]]); if ((c%80) == 79) printf("\n"); } printf("\n"); } #endif } } FCLOSE(afile) FCLOSE(dfile) Close_DB(db); exit (0); } DAZZ_DB-master/DBmv.c000066400000000000000000000044471322703422500144710ustar00rootroot00000000000000/******************************************************************************************** * * Remove a list of .db databases * Delete all the files for the given data bases .db ... (there are a couple * of hidden . files for each DB, and these are removed too.) Do not use "rm" to * remove a database. * * Author: Gene Myers * Date : July 2013 * ********************************************************************************************/ #include #include #include #include #include "DB.h" static char *Usage = "[-v] "; static int VERBOSE; static char *nroot; static char *npath; // We assume this program uses so little code that memory allocation checks are unecessary static char *Catenate5(char *path, char *sep1, char *root, char *sep2, char *suffix) { static char *cat = NULL; static int max = -1; int len; len = strlen(path) + strlen(sep1) + strlen(root) + strlen(sep2) + strlen(suffix); if (len > max) { max = ((int) (1.2*len)) + 100; cat = (char *) realloc(cat,max+1); } sprintf(cat,"%s%s%s%s%s",path,sep1,root,sep2,suffix); return (cat); } static void HANDLER(char *path, char *exten) { char *r, *n; r = Root(path,""); if (*r == '.') n = Catenate5(npath,"/.",nroot,".",exten); else n = Catenate5(npath,"/",nroot,".",exten); if (rename(path,n) != 0) fprintf(stderr,"%s: [WARNING] Couldn't rename file %s\n",Prog_Name,r); else if (VERBOSE) fprintf(stderr," Moving %s to %s\n",path,n); free(r); } int main(int argc, char *argv[]) { // Process arguments { int i, j, k; int flags[128]; ARG_INIT("DBmv") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("v") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc != 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0) nroot = Root(argv[2],".dam"); else nroot = Root(argv[2],".db"); npath = PathTo(argv[2]); printf(" From = '%s'\n",argv[1]); if (List_DB_Files(argv[1],HANDLER) < 0) { fprintf(stderr,"%s: Could not find database %s\n",Prog_Name,argv[1]); exit (1); } exit (0); } 
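/**********************************************************************************************
 *
 *  The export tools above (DB2arrow, DB2fasta, DB2quiva) and DBdump all recover the original
 *  file names and fasta prologs the same way: by scanning the textual stub file with the
 *  DB_NFILE and DB_FDATA formats under the guarded FSCANF macro.  The routine below is a
 *  standalone sketch of that scan and is not part of the distribution; "scan_stub_example"
 *  is a hypothetical name, and the parameter must be called dbfile_name because FSCANF's
 *  error message refers to it.
 *
 **********************************************************************************************/

#include <stdio.h>

#include "DB.h"

void scan_stub_example(char *dbfile_name)
{ FILE *dbfile;
  int   f, last, nfiles;
  char  fname[MAX_NAME], prolog[MAX_NAME];

  dbfile = Fopen(dbfile_name,"r");      //  assumes Prog_Name has been set (e.g. by ARG_INIT)
  if (dbfile == NULL)
    return;

  FSCANF(dbfile,DB_NFILE,&nfiles)       //  "files = %9d\n"
  for (f = 0; f < nfiles; f++)
    { FSCANF(dbfile,DB_FDATA,&last,fname,prolog)   //  same argument order as the tools above
      printf("%9d reads through %s (prolog %s)\n",last,fname,prolog);
    }

  fclose(dbfile);
}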
DAZZ_DB-master/DBrm.c000066400000000000000000000027141322703422500144600ustar00rootroot00000000000000/******************************************************************************************** * * Remove a list of .db databases * Delete all the files for the given data bases .db ... (there are a couple * of hidden . files for each DB, and these are removed too.) Do not use "rm" to * remove a database. * * Author: Gene Myers * Date : July 2013 * ********************************************************************************************/ #include #include #include #include #include "DB.h" static char *Usage = "[-v] ... "; static int VERBOSE; static void HANDLER(char *path, char *exten) { (void) exten; if (unlink(path) != 0) fprintf(stderr,"%s: [WARNING] Couldn't delete file %s\n",Prog_Name,path); else if (VERBOSE) fprintf(stderr," Deleting %s\n",path); } int main(int argc, char *argv[]) { // Process arguments { int i, j, k; int flags[128]; ARG_INIT("DBrm") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("v") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } { int i; for (i = 1; i < argc; i++) if (List_DB_Files(argv[i],HANDLER) < 0) fprintf(stderr,"%s: [WARNING] Could not find database %s\n",Prog_Name,argv[i]); } exit (0); } DAZZ_DB-master/DBshow.c000066400000000000000000000442121322703422500150210ustar00rootroot00000000000000/******************************************************************************************* * * Display a specified set of reads of a database in fasta format. * * Author: Gene Myers * Date : September 2013 * Mod : With DB overhaul, made this a routine strictly for printing a selected subset * and created DB2fasta for recreating all the fasta files of a DB * Date : April 2014 * Mod : Added options to display QV streams * Date : July 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage[] = { "[-unqaUQA] [-w] [-m]+", " [ | ... 
]" }; #define LAST_READ_SYMBOL '$' #define MAX_BUFFER 10001 typedef struct { FILE *input; int lineno; int read; int beg; int end; } File_Iterator; File_Iterator *init_file_iterator(FILE *input) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->input = input; it->lineno = 1; rewind(input); return (it); } int next_read(File_Iterator *it) { static char nbuffer[MAX_BUFFER]; char *eol; int x; if (fgets(nbuffer,MAX_BUFFER,it->input) == NULL) { if (feof(it->input)) return (1); SYSTEM_READ_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in read list is longer than %d chars!\n", Prog_Name,it->lineno,MAX_BUFFER-1); return (1); } *eol = '\0'; x = sscanf(nbuffer," %d %d %d",&(it->read),&(it->beg),&(it->end)); if (x == 1) it->beg = -1; else if (x != 3) { fprintf(stderr,"%s: Line %d of read list is improperly formatted\n",Prog_Name,it->lineno); return (1); } it->lineno += 1; return (0); } int main(int argc, char *argv[]) { DAZZ_DB _db, *db = &_db; FILE *hdrs = NULL; char *hdrs_name = NULL; int nfiles; char **flist = NULL; int *findx = NULL; int reps, *pts; int input_pts; File_Iterator *iter = NULL; FILE *input; int TRIM, UPPER; int DOSEQ, DOQVS, DOARR, QUIVA, ARROW, DAM; int WIDTH; int MMAX, MTOP; char **MASK; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DBshow") WIDTH = 80; MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); if (MASK == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("unqaUQA") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; } else argv[j++] = argv[i]; argc = j; DAM = 0; TRIM = 1-flags['u']; UPPER = 1+flags['U']; DOQVS = flags['q']; DOARR = flags['a']; DOSEQ = 1-flags['n']; QUIVA = flags['Q']; ARROW = flags['A']; if ((QUIVA || DOQVS) && (ARROW || DOARR)) { fprintf(stderr,"%s: Cannot request both Quiver (-Q,-q) and Arrow (-A,a) information\n", Prog_Name); exit (1); } if (QUIVA) { DOQVS = 1; DOSEQ = 0; MTOP = 0; } if (ARROW) { DOARR = 1; DOSEQ = 0; MTOP = 0; } if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); fprintf(stderr,"\n"); fprintf(stderr," -u: Show the untrimmed database.\n"); fprintf(stderr,"\n"); fprintf(stderr," -q: Show also the .quiva streams.\n"); fprintf(stderr," -a: Show also the .arrow pulse sequences.\n"); fprintf(stderr," -n: Do not show the default read DNA sequences.\n"); fprintf(stderr," -m: Show mask intervals and highlight in sequence.\n"); fprintf(stderr,"\n"); fprintf(stderr," -Q: Produce a .quiva file (ignore all other options but -uU.\n"); fprintf(stderr," -A: Produce a .arrow file (ignore all other options but -uw.\n"); fprintf(stderr,"\n"); fprintf(stderr," -U: Use upper case for DNA (default is lower case).\n"); fprintf(stderr," -w: Print -w bp per line (default is 80).\n"); exit (1); } } // Open DB or DAM, and if a DAM open also .hdr file { char *pwd, *root; int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { root = Root(argv[1],".dam"); pwd = PathTo(argv[1]); if (db->part > 0) *rindex(root,'.') = '\0'; hdrs_name = Strdup(Catenate(pwd,PATHSEP,root,".hdr"),"Allocating header file name"); hdrs = Fopen(hdrs_name,"r"); if 
(hdrs_name == NULL || hdrs == NULL) exit (1); DAM = 1; if (DOQVS || DOARR) { fprintf(stderr,"%s: -q, a, Q and A options not compatible with a .dam DB\n",Prog_Name); exit (1); } free(root); free(pwd); } if (DOQVS) { if (db->reads[0].coff < 0 || (db->allarr & DB_ARROW) != 0) { fprintf(stderr,"%s: -q or Q option but no Quiver data in DB!\n",Prog_Name); exit (1); } } if (DOARR) { if ((db->allarr & DB_ARROW) == 0) { fprintf(stderr,"%s: -a or A option but no Arrow data in DB!\n",Prog_Name); exit (1); } } } // Load QVs if requested if (DOQVS) { if (Load_QVs(db) < 0) { fprintf(stderr,"%s: QVs requested, but no .qvs for data base\n",Prog_Name); exit (1); } } // Check tracks and load tracks for untrimmed DB { int i, status, kind; for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i],&kind); if (status == -2) printf("%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]); else if (status == -1) printf("%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]); else if (kind != MASK_TRACK) printf("%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]); else if (status == 0) Load_Track(db,MASK[i]); else if (status == 1 && !TRIM) printf("%s: Warning: %s track is for a trimmed db but -u is set.\n",Prog_Name,MASK[i]); } } // If not a DAM then get prolog names and index ranges from the .db file if (!DAM) { char *pwd, *root; FILE *dstub; char *dstub_name; int i; root = Root(argv[1],".db"); pwd = PathTo(argv[1]); if (db->part > 0) *rindex(root,'.') = '\0'; dstub_name = Strdup(Catenate(pwd,"/",root,".db"),"Allocating db file name"); dstub = Fopen(dstub_name,"r"); if (dstub_name == NULL || dstub == NULL) exit (1); free(pwd); free(root); FSCANF(dstub,DB_NFILE,&nfiles) flist = (char **) Malloc(sizeof(char *)*nfiles,"Allocating file list"); findx = (int *) Malloc(sizeof(int *)*(nfiles+1),"Allocating file index"); if (flist == NULL || findx == NULL) exit (1); findx += 1; findx[-1] = 0; for (i = 0; i < nfiles; i++) { char prolog[MAX_NAME], fname[MAX_NAME]; FSCANF(dstub,DB_FDATA,findx+i,fname,prolog) if ((flist[i] = Strdup(prolog,"Adding to file list")) == NULL) exit (1); } fclose(dstub); free(dstub_name); // If TRIM (the default) then "trim" prolog ranges and the DB if (TRIM) { int nid, oid, lid; int cutoff, allflag; DAZZ_READ *reads; reads = db->reads - db->ufirst; cutoff = db->cutoff; if ((db->allarr & DB_ALL) != 0) allflag = 0; else allflag = DB_BEST; nid = 0; oid = db->ufirst; lid = oid + db->nreads; for (i = 0; i < nfiles; i++) { while (oid < findx[i] && oid < lid) { if ((reads[oid].flags & DB_BEST) >= allflag && reads[oid].rlen >= cutoff) nid++; oid += 1; } findx[i] = nid; } } else if (db->part > 0) { for (i = 0; i < nfiles; i++) findx[i] -= db->ufirst; } } if (TRIM) { int i, status, kind; Trim_DB(db); // Load tracks for trimmed DB for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i],&kind); if (status < 0) continue; else if (status == 1 && kind == MASK_TRACK) Load_Track(db,MASK[i]); } } // Process read index arguments into a list of read ranges input_pts = 0; if (argc == 3) { if (argv[2][0] != LAST_READ_SYMBOL || argv[2][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[2],&eptr,10); if (eptr > argv[2] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if (input_pts) { input = Fopen(argv[2],"r"); if (input == NULL) exit (1); iter = init_file_iterator(input); } else { 
pts = (int *) Malloc(sizeof(int)*2*(argc-1),"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 2) { int c, b, e; char *eptr, *fptr; for (c = 2; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == 0) { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = db->nreads; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } } else { pts[reps++] = 1; pts[reps++] = db->nreads; } } // Display each read (and/or QV streams) in the active DB according to the // range pairs in pts[0..reps) and according to the display options. { DAZZ_READ *reads; DAZZ_TRACK *first; char *read, *arrow, **entry; int c, b, e, i; int hilight, substr; int map; int (*iscase)(int); read = New_Read_Buffer(db); if (DOQVS) { entry = New_QV_Buffer(db); first = db->tracks->next; } else { entry = NULL; first = db->tracks; } if (DOARR) arrow = New_Read_Buffer(db); else arrow = NULL; if (UPPER == 1) { hilight = 'A'-'a'; iscase = islower; } else { hilight = 'a'-'A'; iscase = isupper; } map = 0; reads = db->reads; substr = 0; c = 0; while (1) { if (input_pts) { if (next_read(iter)) break; e = iter->read; b = e-1; substr = (iter->beg >= 0); } else { if (c >= reps) break; b = pts[c]-1; e = pts[c+1]; if (e > db->nreads) e = db->nreads; c += 2; } for (i = b; i < e; i++) { int len; int fst, lst; int flags, qv; float snr[4]; DAZZ_READ *r; DAZZ_TRACK *track; r = reads + i; len = r->rlen; if (substr) { fst = iter->beg; lst = iter->end; } else { fst = 0; lst = len; } flags = r->flags; qv = (flags & DB_QV); if (DOARR) { uint64 big; int j; big = *((uint64 *) &(r->coff)); for (j = 0; j < 4; j++) { snr[3-j] = (big & 0xffff) / 100.; big >>= 16; } } if (DAM) { char header[MAX_NAME]; FSEEKO(hdrs,r->coff,SEEK_SET) FGETS(header,MAX_NAME,hdrs) header[strlen(header)-1] = '\0'; PRINTF("%s :: Contig %d[%d,%d]",header,r->origin,r->fpulse+fst,r->fpulse+lst) } else { while (i < findx[map-1]) map -= 1; while (i >= findx[map]) map += 1; if (QUIVA) PRINTF("@%s/%d/%d_%d",flist[map],r->origin,r->fpulse+fst,r->fpulse+lst) else if (ARROW) PRINTF(">%s",flist[map]) else PRINTF(">%s/%d/%d_%d",flist[map],r->origin,r->fpulse+fst,r->fpulse+lst) if (qv > 0) PRINTF(" RQ=0.%3d",qv) if (DOARR) PRINTF(" SN=%.2f,%.2f,%.2f,%.2f",snr[0],snr[1],snr[2],snr[3]) } PRINTF("\n") if (DOQVS) Load_QVentry(db,i,entry,UPPER); if (DOSEQ) Load_Read(db,i,read,UPPER); if (DOARR) Load_Arrow(db,i,arrow,1); for (track = first; track != NULL; track = track->next) { int64 *anno; int *data; int64 s, f, j; int bd, ed, m; anno = (int64 *) track->anno; data = (int *) track->data; s = (anno[i] >> 2); f = (anno[i+1] >> 2); if (s < f) { for (j = s; j < f; j += 2) { bd = data[j]; ed = data[j+1]; if (DOSEQ) for (m = bd; m < ed; m++) if (iscase(read[m])) read[m] = (char) (read[m] + hilight); if (j == s) PRINTF("> %s:",track->name) PRINTF(" [%d,%d]",bd,ed) } PRINTF("\n") } } if (QUIVA) { int k; for (k = 0; k < 5; k++) PRINTF("%.*s\n",lst-fst,entry[k]+fst) } else if (ARROW) { int k; for (k = fst; k+WIDTH < lst; k += WIDTH) PRINTF("%.*s\n",WIDTH,arrow+k) if (k < lst) 
PRINTF("%.*s\n",lst-k,arrow+k) } else { if (DOQVS) { int j, k; PRINTF("\n") for (j = fst; j+WIDTH < lst; j += WIDTH) { if (DOSEQ) PRINTF("%.*s\n",WIDTH,read+j) for (k = 0; k < 5; k++) PRINTF("%.*s\n",WIDTH,entry[k]+j) PRINTF("\n") } if (j < lst) { if (DOSEQ) PRINTF("%.*s\n",lst-j,read+j) for (k = 0; k < 5; k++) PRINTF("%.*s\n",lst-j,entry[k]+j) PRINTF("\n") } } else if (DOARR) { int j; PRINTF("\n") for (j = fst; j+WIDTH < lst; j += WIDTH) { if (DOSEQ) PRINTF("%.*s\n",WIDTH,read+j) PRINTF("%.*s\n\n",WIDTH,arrow+j) } if (j < lst) { if (DOSEQ) PRINTF("%.*s\n",lst-j,read+j) PRINTF("%.*s\n\n",lst-j,arrow+j) } } else if (DOSEQ) { int j; for (j = fst; j+WIDTH < lst; j += WIDTH) PRINTF("%.*s\n",WIDTH,read+j) if (j < lst) PRINTF("%.*s\n",lst-j,read+j) } } } } } FCLOSE(stdout) if (input_pts) { fclose(input); free(iter); } else free(pts); if (DAM) fclose(hdrs); else { int i; for (i = 0; i < nfiles; i++) free(flist[i]); free(flist); free(findx-1); } Close_DB(db); exit (0); } DAZZ_DB-master/DBsplit.c000066400000000000000000000145451322703422500152020ustar00rootroot00000000000000/******************************************************************************************* * * Split a .db into a set of sub-database blocks for use by the Dazzler: * Divide the database .db conceptually into a series of blocks referable to on the * command line as .1.db, .2.db, ... If the -x option is set then all reads * less than the given length are ignored, and if the -a option is not set then secondary * reads from a given well are also ignored. The remaining reads are split amongst the * blocks so that each block is of size -s * 1Mbp except for the last which necessarily * contains a smaller residual. The default value for -s is 400Mbp because blocks of this * size can be compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks * are very space efficient in that their sub-index of the master .idx is computed on the * fly when loaded, and the .bps file of base pairs is shared with the master DB. Any * tracks associated with the DB are also computed on the fly when loading a database block. * * Author: Gene Myers * Date : September 2013 * Mod : New splitting definition to support incrementality, and new stub file format * Date : April 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-af] [-x] [-s] "; int main(int argc, char *argv[]) { DAZZ_DB db, dbs; int64 dbpos; FILE *dbfile, *ixfile; char *dbfile_name, *ixfile_name; int status; int FORCE; int ALL; int CUTOFF; int64 SIZE; { int i, j, k; int flags[128]; char *eptr; float size; ARG_INIT("DBsplit") CUTOFF = 0; size = 200; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("af") break; case 'x': ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff") break; case 's': ARG_REAL(size) if (size <= 0.) 
{ fprintf(stderr,"%s: Block size must be a positive number\n",Prog_Name); exit (1); } break; } else argv[j++] = argv[i]; argc = j; SIZE = size*1000000ll; ALL = flags['a']; FORCE = flags['f']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -s: Target size of blocks (in Mbp).\n"); fprintf(stderr," -x: Trimmed DB has reads >= this threshold.\n"); fprintf(stderr," -a: Trimmed DB contains all reads from a well (not just longest).\n"); fprintf(stderr," -f: Force the split to occur even if already split.\n"); exit (1); } } // Open db status = Open_DB(argv[1],&db); if (status < 0) exit (1); if (db.part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } { char *pwd, *root; char buffer[2*MAX_NAME+100]; int nfiles; int i; pwd = PathTo(argv[1]); if (status) { root = Root(argv[1],".dam"); dbfile_name = Strdup(Catenate(pwd,"/",root,".dam"),"Allocating db file name"); } else { root = Root(argv[1],".db"); dbfile_name = Strdup(Catenate(pwd,"/",root,".db"),"Allocating db file name"); } ixfile_name = Strdup(Catenate(pwd,PATHSEP,root,".idx"),"Allocating index file name"); dbfile = Fopen(dbfile_name,"r+"); ixfile = Fopen(ixfile_name,"r+"); if (dbfile_name == NULL || ixfile_name == NULL || dbfile == NULL || ixfile == NULL) exit (1); free(pwd); free(root); FSCANF(dbfile,DB_NFILE,&nfiles) for (i = 0; i < nfiles; i++) FGETS(buffer,2*MAX_NAME+100,dbfile) FREAD(&dbs,sizeof(DAZZ_DB),1,ixfile) if (dbs.cutoff >= 0 && !FORCE) { printf("You are about to overwrite the current partition settings. This\n"); printf("will invalidate any tracks, overlaps, and other derivative files.\n"); printf("Are you sure you want to proceed? [Y/N] "); fflush(stdout); fgets(buffer,100,stdin); if (index(buffer,'n') != NULL || index(buffer,'N') != NULL) { printf("Aborted\n"); fflush(stdout); fclose(dbfile); fclose(ixfile); exit (1); } } dbpos = FTELLO(dbfile); FSEEKO(dbfile,dbpos,SEEK_SET) FPRINTF(dbfile,DB_NBLOCK,0) FPRINTF(dbfile,DB_PARAMS,SIZE,CUTOFF,ALL) } { DAZZ_READ *reads = db.reads; int nreads = db.ureads; int64 totlen; int nblock, ireads, treads, rlen, fno; int i; nblock = 0; totlen = 0; ireads = 0; treads = 0; FPRINTF(dbfile,DB_BDATA,0,0) if (ALL) for (i = 0; i < nreads; i++) { rlen = reads[i].rlen; if (rlen >= CUTOFF) { ireads += 1; treads += 1; totlen += rlen; if (totlen >= SIZE) { FPRINTF(dbfile,DB_BDATA,i+1,treads) totlen = 0; ireads = 0; nblock += 1; } } } else for (i = 0; i < nreads; i++) { rlen = reads[i].rlen; if (rlen >= CUTOFF && (reads[i].flags & DB_BEST) != 0) { ireads += 1; treads += 1; totlen += rlen; if (totlen >= SIZE) { FPRINTF(dbfile,DB_BDATA,i+1,treads) totlen = 0; ireads = 0; nblock += 1; } } } if (ireads > 0) { FPRINTF(dbfile,DB_BDATA,nreads,treads) nblock += 1; } fno = fileno(dbfile); if (ftruncate(fno,FTELLO(dbfile)) < 0) SYSTEM_WRITE_ERROR FSEEKO(dbfile,dbpos,SEEK_SET) FPRINTF(dbfile,DB_NBLOCK,nblock) dbs.cutoff = CUTOFF; if (ALL) dbs.allarr |= DB_ALL; dbs.treads = treads; FSEEKO(ixfile,0,SEEK_SET) FWRITE(&dbs,sizeof(DAZZ_DB),1,ixfile) } FCLOSE(ixfile) FCLOSE(dbfile) Close_DB(&db); exit (0); } DAZZ_DB-master/DBstats.c000066400000000000000000000226101322703422500151750ustar00rootroot00000000000000/******************************************************************************************* * * Display statistics about the contents of a .db and a histogram of its read lengths. 
* * Author: Gene Myers * Date : July 2013 * Mod : April 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" static char *Usage = " [-nu] [-b] [-m]+ "; int main(int argc, char *argv[]) { DAZZ_DB _db, *db = &_db; int dam; int64 ototal; int oreads; int nbin, *hist; int64 *bsum; int NONE; int TRIM; int BIN; int MMAX, MTOP; char **MASK; { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DBstats") BIN = 1000; MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); if (MASK == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("nu") break; case 'b': ARG_POSITIVE(BIN,"Bin size") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; } else argv[j++] = argv[i]; argc = j; NONE = flags['n']; TRIM = 1-flags['u']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -u: Give stats for the untrimmed database.\n"); fprintf(stderr,"\n"); fprintf(stderr," -n: Do not show histogram of read lengths.\n"); fprintf(stderr," -m: Show histogram of mask intervals.\n"); fprintf(stderr," -b: Use histogram buckets of this size (default 1Kbp).\n"); exit (1); } } { int i, status, kind; // Open .db or .dam status = Open_DB(argv[1],db); if (status < 0) exit (1); dam = status; // Check tracks and load tracks for untrimmed DB for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i],&kind); if (status == -2) fprintf(stderr,"%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]); else if (status == -1) fprintf(stderr,"%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]); else if (kind != MASK_TRACK) fprintf(stderr,"%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]); else if (status == 0) Load_Track(db,MASK[i]); else if (status == 1 && !TRIM) fprintf(stderr,"%s: Warning: %s track is for a trimmed db but -u is set.\n", Prog_Name,MASK[i]); } oreads = db->nreads; ototal = db->totlen; if (TRIM) { Trim_DB(db); // Load tracks for trimmed DB for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i],&kind); if (status < 0) continue; else if (status == 1) Load_Track(db,MASK[i]); } } } { int i; int64 totlen; int nreads, maxlen; DAZZ_READ *reads; nreads = db->nreads; totlen = db->totlen; maxlen = db->maxlen; reads = db->reads; nbin = maxlen/BIN + 1; hist = (int *) Malloc(sizeof(int)*nbin,"Allocating histograms"); bsum = (int64 *) Malloc(sizeof(int64)*nbin,"Allocating histograms"); if (hist == NULL || bsum == NULL) exit (1); for (i = 0; i < nbin; i++) { hist[i] = 0; bsum[i] = 0; } for (i = 0; i < nreads; i++) { int rlen = reads[i].rlen; hist[rlen/BIN] += 1; bsum[rlen/BIN] += rlen; } if (dam) printf("\nStatistics for all contigs"); else if ((db->allarr & DB_ALL) != 0 || !TRIM) printf("\nStatistics for all wells"); else printf("\nStatistics for all reads"); if (TRIM && db->cutoff > 0) { printf(" of length "); Print_Number(db->cutoff,0,stdout); printf(" bases or more\n\n"); } else if (dam) printf(" in the map index\n\n"); else printf(" in the data set\n\n"); Print_Number((int64) nreads,15,stdout); if (dam) printf(" contigs"); else printf(" reads "); if (TRIM) { printf(" out of "); Print_Number((int64 ) oreads,15,stdout); if (oreads <= 0) printf(" (100.0%%)"); else printf(" 
(%5.1f%%)",(100.*nreads)/oreads); } printf("\n"); Print_Number(totlen,15,stdout); printf(" base pairs"); if (TRIM) { printf(" out of "); Print_Number(ototal,15,stdout); if (ototal <= 0) printf(" (100.0%%)"); else printf(" (%5.1f%%)",(100.*totlen)/ototal); } printf("\n\n"); if (nreads > 0) { int64 ave, dev; ave = totlen/nreads; Print_Number(ave,15,stdout); if (dam) printf(" average contig length\n"); else { printf(" average read length\n"); dev = 0; for (i = 0; i < nreads; i++) { int rlen = reads[i].rlen; dev += (rlen-ave)*(rlen-ave); } dev = (int64) sqrt((1.*dev)/nreads); Print_Number(dev,15,stdout); printf(" standard deviation\n"); } } if (totlen <= 0) { free(hist); free(bsum); Close_DB(db); exit (0); } printf("\n Base composition: %.3f(A) %.3f(C) %.3f(G) %.3f(T)\n", db->freq[0],db->freq[1],db->freq[2],db->freq[3]); if (!NONE) { int64 btot; int cum, skip, avg; printf("\n Distribution of Read Lengths (Bin size = "); Print_Number((int64) BIN,0,stdout); printf(")\n\n Bin: Count %% Reads %% Bases Average\n"); if (dam) skip = 0; else skip = -1; cum = 0; btot = 0; for (i = nbin-1; i >= 0; i--) { cum += hist[i]; btot += bsum[i]; if (hist[i] != skip) { Print_Number((int64) (i*BIN),11,stdout); printf(":"); Print_Number((int64) hist[i],11,stdout); if (cum > 0) avg = btot/cum; else avg = 0; printf(" %5.1f %5.1f %9d\n",(100.*cum)/nreads,(100.*btot)/totlen,avg); } if (cum == nreads) break; } } } { int64 totlen; int numint, maxlen; DAZZ_TRACK *track; for (track = db->tracks; track != NULL; track = track->next) { char *data = track->data; int64 *anno = (int64 *) track->anno; int *idata, *edata; int64 ave, dev, btot; int k, rlen, cum; totlen = 0; numint = 0; maxlen = 0; for (k = 0; k < db->nreads; k++) { edata = (int *) (data + anno[k+1]); for (idata = (int *) (data + anno[k]); idata < edata; idata += 2) { rlen = idata[1] - *idata; numint += 1; totlen += rlen; if (rlen > maxlen) maxlen = rlen; } } printf("\n\nStatistics for %s-track\n",track->name); printf("\n There are "); Print_Number(numint,0,stdout); printf(" intervals totaling "); Print_Number(totlen,0,stdout); printf(" bases (%.1f%% of all data)\n",(100.*totlen)/db->totlen); if (numint <= 0) continue; nbin = maxlen/BIN + 1; for (k = 0; k < nbin; k++) { hist[k] = 0; bsum[k] = 0; } ave = totlen/numint; dev = 0; for (k = 0; k < db->nreads; k++) { edata = (int *) (data + anno[k+1]); for (idata = (int *) (data + anno[k]); idata < edata; idata += 2) { rlen = idata[1] - *idata; dev += (rlen-ave)*(rlen-ave); hist[rlen/BIN] += 1; bsum[rlen/BIN] += rlen; } } dev = (int64) sqrt((1.*dev)/numint); printf("\n"); Print_Number(ave,15,stdout); printf(" average interval length\n"); Print_Number(dev,15,stdout); printf(" standard deviation\n"); printf("\n Distribution of %s intervals (Bin size = ",track->name); Print_Number((int64) BIN,0,stdout); printf(")\n\n Bin: Count %% Intervals %% Bases Average\n"); cum = 0; btot = 0; for (k = nbin-1; k >= 0; k--) { cum += hist[k]; btot += bsum[k]; if (hist[k] > 0) { Print_Number((int64) (k*BIN),11,stdout); printf(":"); Print_Number((int64) hist[k],11,stdout); printf(" %5.1f %5.1f %9lld\n",(100.*cum)/numint, (100.*btot)/totlen,btot/cum); if (cum == numint) break; } } printf("\n"); } } free(hist); free(bsum); Close_DB(db); exit (0); } DAZZ_DB-master/DBtrim.c000066400000000000000000000112441322703422500150130ustar00rootroot00000000000000/******************************************************************************************* * * Reset the trimming parameters for a .db: * Rewrite the .db or .dam file with the new thresholds and 
the new read counts for * each trimmed block. * * Author: Gene Myers * Date : September 2017 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-af] [-x] "; int main(int argc, char *argv[]) { DAZZ_DB db, dbs; int64 dbpos; FILE *dbfile, *ixfile; char *dbfile_name, *ixfile_name; int nblocks; int status; int FORCE; int ALL; int CUTOFF; { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DBtrim") CUTOFF = 0; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("af") break; case 'x': ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff") break; } else argv[j++] = argv[i]; argc = j; ALL = flags['a']; FORCE = flags['f']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -x: Trimmed DB has reads >= this threshold.\n"); fprintf(stderr," -a: Trimmed DB contains all reads from a well (not just longest).\n"); fprintf(stderr," -f: Force the new trim setting even if already set.\n"); exit (1); } } // Open db status = Open_DB(argv[1],&db); if (status < 0) exit (1); if (db.part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } { char *pwd, *root; char buffer[2*MAX_NAME+100]; int nfiles; int all, cutoff; int64 size; int i; pwd = PathTo(argv[1]); if (status) { root = Root(argv[1],".dam"); dbfile_name = Strdup(Catenate(pwd,"/",root,".dam"),"Allocating db file name"); } else { root = Root(argv[1],".db"); dbfile_name = Strdup(Catenate(pwd,"/",root,".db"),"Allocating db file name"); } ixfile_name = Strdup(Catenate(pwd,PATHSEP,root,".idx"),"Allocating index file name"); dbfile = Fopen(dbfile_name,"r+"); ixfile = Fopen(ixfile_name,"r+"); if (dbfile_name == NULL || ixfile_name == NULL || dbfile == NULL || ixfile == NULL) exit (1); free(pwd); free(root); FSCANF(dbfile,DB_NFILE,&nfiles) for (i = 0; i < nfiles; i++) FGETS(buffer,2*MAX_NAME+100,dbfile) FREAD(&dbs,sizeof(DAZZ_DB),1,ixfile) if (dbs.cutoff >= 0) { if (!FORCE) { printf("You are about to reset the thresholds for the trimmed DB.\n"); printf("This will invalidate any .las files produced by daligner\n"); printf("Are you sure you want to proceed? 
[Y/N] "); fflush(stdout); fgets(buffer,100,stdin); if (index(buffer,'n') != NULL || index(buffer,'N') != NULL) { printf("Aborted\n"); fflush(stdout); fclose(ixfile); fclose(dbfile); exit (1); } } } else { fprintf(stderr,"%s: DB has not yet been split, use DBsplit\n",Prog_Name); exit (1); } FSCANF(dbfile,DB_NBLOCK,&nblocks) dbpos = FTELLO(dbfile); FSCANF(dbfile,DB_PARAMS,&size,&cutoff,&all) FSEEKO(dbfile,dbpos,SEEK_SET) FPRINTF(dbfile,DB_PARAMS,size,CUTOFF,ALL) } { DAZZ_READ *reads = db.reads; int uread, tread; int rlen; int b, u, t; u = 0; t = 0; fprintf(dbfile,DB_BDATA,0,0); for (b = 0; b < nblocks; b++) { dbpos = FTELLO(dbfile); FSCANF(dbfile,DB_BDATA,&uread,&tread) if (ALL) while (u < uread) { rlen = reads[u++].rlen; if (rlen >= CUTOFF) t += 1; } else while (u < uread) { rlen = reads[u].rlen; if (rlen >= CUTOFF && (reads[u].flags & DB_BEST) != 0) t += 1; u += 1; } FSEEKO(dbfile,dbpos,SEEK_SET) FPRINTF(dbfile,DB_BDATA,uread,t) } dbs.cutoff = CUTOFF; if (ALL) dbs.allarr |= DB_ALL; dbs.treads = t; FSEEKO(ixfile,0,SEEK_SET) FWRITE(&dbs,sizeof(DAZZ_DB),1,ixfile) } FCLOSE(ixfile) FCLOSE(dbfile) Close_DB(&db); exit (0); } DAZZ_DB-master/DBwipe.c000066400000000000000000000061201322703422500150010ustar00rootroot00000000000000/******************************************************************************************* * * Split a .db into a set of sub-database blocks for use by the Dazzler: * Divide the database .db conceptually into a series of blocks referable to on the * command line as .1.db, .2.db, ... If the -x option is set then all reads * less than the given length are ignored, and if the -a option is not set then secondary * reads from a given well are also ignored. The remaining reads are split amongst the * blocks so that each block is of size -s * 1Mbp except for the last which necessarily * contains a smaller residual. The default value for -s is 400Mbp because blocks of this * size can be compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks * are very space efficient in that their sub-index of the master .idx is computed on the * fly when loaded, and the .bps file of base pairs is shared with the master DB. Any * tracks associated with the DB are also computed on the fly when loading a database block. * * Author: Gene Myers * Date : September 2013 * Mod : New splitting definition to support incrementality, and new stub file format * Date : April 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif static char *Usage = ""; int main(int argc, char *argv[]) { DAZZ_DB db; int status; Prog_Name = Strdup("DBwipe","Allocating Program Name"); if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } // Open db status = Open_DB(argv[1],&db); if (status < 0) exit (1); if (db.part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } if (status) { fprintf(stderr,"%s: Cannot be called on a .dam: %s\n",Prog_Name,argv[1]); exit (1); } { char *pwd, *root; FILE *index; char *index_name; int i; pwd = PathTo(argv[1]); root = Root(argv[1],".db"); if (unlink(Catenate(pwd,PATHSEP,root,".arw")) < 0) { if (errno != ENOENT) { fprintf(stderr,"%s: [WARNING] Could not delete %s.arw\n",Prog_Name,root); exit (1); } } if (unlink(Catenate(pwd,PATHSEP,root,".qvs")) < 0) { if (errno != ENOENT) { fprintf(stderr,"%s: [WARNING] Could not delete %s.qvs\n",Prog_Name,root); exit (1); } } for (i = 0; i < db.nreads; i++) db.reads[i].coff = -1; db.allarr &= ~DB_ARROW; index_name = Strdup(Catenate(pwd,PATHSEP,root,".idx"),"Allocating index file name"); index = Fopen(index_name,"w"); if (index_name == NULL || index == NULL) exit (1); FWRITE(&db,sizeof(DAZZ_DB),1,index) FWRITE(db.reads,sizeof(DAZZ_READ),db.nreads,index) FCLOSE(index); } Close_DB(&db); exit (0); } DAZZ_DB-master/LICENSE000066400000000000000000000053111322703422500144710ustar00rootroot00000000000000 Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: · Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. · Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. · The name of EWM may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. For any issues regarding this software and its use, contact EWM at: Eugene W. Myers Jr. Bautzner Str. 
122e 01099 Dresden GERMANY Email: gene.myers@gmail.com DAZZ_DB-master/Makefile000066400000000000000000000042541322703422500151310ustar00rootroot00000000000000DEST_DIR = ~/bin CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing ALL = fasta2DB DB2fasta quiva2DB DB2quiva DBsplit DBdust Catrack DBshow DBstats DBrm DBmv \ simulator fasta2DAM DAM2fasta DBdump rangen arrow2DB DB2arrow DBwipe DBtrim all: $(ALL) fasta2DB: fasta2DB.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o fasta2DB fasta2DB.c DB.c QV.c -lm DB2fasta: DB2fasta.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DB2fasta DB2fasta.c DB.c QV.c -lm quiva2DB: quiva2DB.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -DINTERACTIVE -o quiva2DB quiva2DB.c DB.c QV.c -lm DB2quiva: DB2quiva.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DB2quiva DB2quiva.c DB.c QV.c -lm DB2arrow: DB2arrow.c DB.c QV.c DB.h QV.h gcc $(CFLAGS) -o DB2arrow DB2arrow.c DB.c QV.c -lz arrow2DB: arrow2DB.c DB.c QV.c DB.h QV.h gcc $(CFLAGS) -o arrow2DB arrow2DB.c DB.c QV.c -lz DBsplit: DBsplit.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBsplit DBsplit.c DB.c QV.c -lm DBtrim: DBtrim.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBtrim DBtrim.c DB.c QV.c -lm DBdust: DBdust.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBdust DBdust.c DB.c QV.c -lm Catrack: Catrack.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o Catrack Catrack.c DB.c QV.c -lm DBshow: DBshow.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBshow DBshow.c DB.c QV.c -lm DBdump: DBdump.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBdump DBdump.c DB.c QV.c -lm DBstats: DBstats.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBstats DBstats.c DB.c QV.c -lm DBrm: DBrm.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBrm DBrm.c DB.c QV.c -lm DBmv: DBmv.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBmv DBmv.c DB.c QV.c -lm simulator: simulator.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o simulator simulator.c DB.c QV.c -lm rangen: rangen.c gcc $(CFLAGS) -o rangen rangen.c fasta2DAM: fasta2DAM.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o fasta2DAM fasta2DAM.c DB.c QV.c -lm DAM2fasta: DAM2fasta.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DAM2fasta DAM2fasta.c DB.c QV.c -lm DBwipe: DBwipe.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBwipe DBwipe.c DB.c QV.c -lm clean: rm -f $(ALL) rm -fr *.dSYM rm -f dazz.db.tar.gz install: cp $(ALL) $(DEST_DIR) package: make clean tar -zcf dazz.db.tar.gz README.md Makefile *.h *.c DAZZ_DB-master/QV.c000066400000000000000000001132131322703422500141570ustar00rootroot00000000000000/******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" #undef DEBUG #define MIN_BUFFER 1000 #define HUFF_CUTOFF 16 // This cannot be larger than 16 ! /******************************************************************************************* * * Endian flipping routines * ********************************************************************************************/ static int LittleEndian; // Little-endian machine ? 
// Referred by: Decode & Decode_Run static int Flip; // Flip endian of all coded shorts and ints // Referred by: Decode & Decode_Run & Read_Scheme static void Set_Endian(int flip) { uint32 x = 3; uint8 *b = (uint8 *) (&x); Flip = flip; LittleEndian = (b[0] == 3); } static void Flip_Long(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[3]; v[3] = x; x = v[1]; v[1] = v[2]; v[2] = x; } static void Flip_Short(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[1]; v[1] = x; } /******************************************************************************************* * * Routines for computing a Huffman Encoding Scheme * ********************************************************************************************/ typedef struct { int type; // 0 => normal, 1 => normal but has long codes, 2 => truncated uint32 codebits[256]; // If type = 2, then code 255 is the special code for int codelens[256]; // non-Huffman exceptions int lookup[0x10000]; // Lookup table (just for decoding) } HScheme; typedef struct _HTree { struct _HTree *lft, *rgt; uint64 count; } HTree; // Establish heap property from node s down (1 is root, siblings of n are 2n and 2n+1) // assuming s is the only perturbation in the tree. static void Reheap(int s, HTree **heap, int hsize) { int c, l, r; HTree *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; hr = heap[r]; if (r > hsize || hr->count > hl->count) { if (hs->count > hl->count) { heap[c] = hl; c = l; } else break; } else { if (hs->count > hr->count) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } // Given Huffman tree build a table of codes from it, the low-order codelens[s] bits // of codebits[s] contain the code for symbol s. static void Build_Table(HTree *node, int code, int len, uint32 *codebits, int *codelens) { if (node->rgt == NULL) { uint64 symbol = (uint64) (node->lft); codebits[symbol] = code; codelens[symbol] = len; } else { code <<= 1; len += 1; Build_Table(node->lft,code,len,codebits,codelens); Build_Table(node->rgt,code+1,len,codebits,codelens); } } // For the non-zero symbols in hist, compute a huffman tree over them, and then // build a table of the codes. If inscheme is not NULL, then place all symbols // with code 255 or with more than HUFF_CUTOFF bits in the encoding by inscheme // as a single united entity, whose code signals that the value of these symbols // occur explicitly in 8 (values) or 16 (run lengths) bits following the code. // All the symbols in this class will have the same entry in the code table and // 255 is always in this class. 
static HScheme *Huffman(uint64 *hist, HScheme *inscheme) { HScheme *scheme; HTree *heap[259]; HTree node[512]; int hsize; HTree *lft, *rgt; int value, range; int i; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); hsize = 0; // Load heap value = 0; if (inscheme != NULL) { node[0].count = 0; node[0].lft = (HTree *) (uint64) 255; node[0].rgt = NULL; heap[++hsize] = node+(value++); } for (i = 0; i < 256; i++) if (hist[i] > 0) { if (inscheme != NULL && (inscheme->codelens[i] > HUFF_CUTOFF || i == 255)) node[0].count += hist[i]; else { node[value].count = hist[i]; node[value].lft = (HTree *) (uint64) i; node[value].rgt = NULL; heap[++hsize] = node+(value++); } } for (i = hsize/2; i >= 1; i--) // Establish heap property Reheap(i,heap,hsize); range = value; // Merge pairs with smallest count until have a tree for (i = 1; i < value; i++) { lft = heap[1]; heap[1] = heap[hsize--]; Reheap(1,heap,hsize); rgt = heap[1]; node[range].lft = lft; node[range].rgt = rgt; node[range].count = lft->count + rgt->count; heap[1] = node+(range++); Reheap(1,heap,hsize); } for (i = 0; i < 256; i++) // Build the code table { scheme->codebits[i] = 0; scheme->codelens[i] = 0; } Build_Table(node+(range-1),0,0,scheme->codebits,scheme->codelens); if (inscheme != NULL) // Set scheme type and if truncated (2), map truncated codes { scheme->type = 2; // to code and length for 255 for (i = 0; i < 255; i++) if (inscheme->codelens[i] > HUFF_CUTOFF || scheme->codelens[i] > HUFF_CUTOFF) { scheme->codelens[i] = scheme->codelens[255]; scheme->codebits[i] = scheme->codebits[255]; } } else { scheme->type = 0; for (i = 0; i < 256; i++) { if (scheme->codelens[i] > HUFF_CUTOFF) scheme->type = 1; } } return (scheme); } #ifdef DEBUG // For debug, show the coding table static void Print_Table(HScheme *scheme, uint64 *hist, int infosize) { uint64 total_bits; uint32 specval, mask, code, *bits; int speclen, clen, *lens; int i, k; total_bits = 0; bits = scheme->codebits; lens = scheme->codelens; if (scheme->type == 2) { specval = bits[255]; speclen = lens[255]; } else specval = speclen = 0x7fffffff; printf("\nCode Table:\n"); for (i = 0; i < 256; i++) if (lens[i] > 0) { clen = lens[i]; mask = (1 << clen); code = bits[i]; printf(" %3d: %2d ",i,clen); for (k = 0; k < clen; k++) { mask >>= 1; if (code & mask) printf("1"); else printf("0"); } if (code == specval && clen == speclen) { printf(" ***"); if (hist != NULL) total_bits += (clen+infosize)*hist[i]; } else if (hist != NULL) total_bits += clen*hist[i]; printf("\n"); } if (hist != NULL) printf("\nTotal Bytes = %lld\n",(total_bits-1)/8+1); } // For debug, show the histogram static void Print_Histogram(uint64 *hist) { int i, low, hgh; uint64 count; for (hgh = 255; hgh >= 0; hgh--) if (hist[hgh] != 0) break; for (low = 0; low < 256; low++) if (hist[low] != 0) break; count = 0; for (i = low; i <= hgh; i++) count += hist[i]; for (i = hgh; i >= low; i--) printf(" %3d: %8llu %5.1f%%\n",i,hist[i],(hist[i]*100.)/count); } #endif /******************************************************************************************* * * Read and Write Huffman Schemes * ********************************************************************************************/ // Write the code table to out. 
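// On-disk layout produced here (and consumed by Read_Scheme below):
//   1 byte    scheme type (0 = normal, 1 = has long codes, 2 = truncated)
//   then, for each of the 256 possible symbols:
//     1 byte         code length in bits (0 if the symbol was given no code)
//     4-byte uint32  code bits, present only when the length byte is nonzero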
static void Write_Scheme(HScheme *scheme, FILE *out) { int i; uint8 x; uint32 *bits; int *lens; lens = scheme->codelens; bits = scheme->codebits; x = (uint8) (scheme->type); fwrite(&x,1,1,out); for (i = 0; i < 256; i++) { x = (uint8) (lens[i]); fwrite(&x,1,1,out); if (x > 0) fwrite(bits+i,sizeof(uint32),1,out); } } // Allocate and read a code table from in, and return a pointer to it. static HScheme *Read_Scheme(FILE *in) { HScheme *scheme; int *look, *lens; uint32 *bits, base; int i, j, powr; uint8 x; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); lens = scheme->codelens; bits = scheme->codebits; look = scheme->lookup; if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n"); free(scheme); return (NULL); } scheme->type = x; for (i = 0; i < 256; i++) { if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i); return (NULL); } lens[i] = x; if (x > 0) { if (fread(bits+i,sizeof(uint32),1,in) != 1) { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i); free(scheme); return (NULL); } } else bits[i] = 0; } if (Flip) { for (i = 0; i < 256; i++) Flip_Long(bits+i); } for (i = 0; i < 256; i++) { if (lens[i] > 0) { base = (bits[i] << (16-lens[i])); powr = (1 << (16-lens[i])); for (j = 0; j < powr; j++) look[base+j] = i; } } return (scheme); } /******************************************************************************************* * * Encoders and Decoders * ********************************************************************************************/ // Encode read[0..rlen-1] according to scheme and write to out static void Encode(HScheme *scheme, FILE *out, uint8 *read, int rlen) { uint32 x, c, ocode; int n, k, olen, llen; int *nlens; uint32 *nbits; uint32 nspec; int nslen; nlens = scheme->codelens; nbits = scheme->codebits; if (scheme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; #define OCODE(L,C) \ { int len = olen + (L); \ uint32 code = (C); \ \ llen = olen; \ if (len >= 32) \ { olen = len-32; \ ocode |= (code >> olen); \ fwrite(&ocode,sizeof(uint32),1,out); \ if (olen > 0) \ ocode = (code << (32-olen)); \ else \ ocode = 0; \ } \ else \ { olen = len; \ ocode |= (code << (32-olen));; \ } \ } llen = 0; olen = 0; ocode = 0; for (k = 0; k < rlen; k++) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); } if (olen > 0) // Tricky: must pad so decoder does not read past { fwrite(&ocode,sizeof(uint32),1,out); // last integer int the coded output. if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Encode read[0..rlen-1] according to non-rchar table neme, and run-length table reme for // runs of rchar characters. Write to out. 
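// Sketch of the run handling in the loop below: each run of rchar is reduced
// to its length, lengths of 255 or more collapse to the single symbol 255,
// and whenever the emitted run code is the escape code for 255 the true
// length follows as a 16-bit literal.  The non-run character that ends a run
// is then coded with the normal scheme (plus an 8-bit literal if it, too,
// falls in that scheme's escape class).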
static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int rlen, int rchar) { uint32 x, c, ocode; int n, h, k, olen, llen; int *nlens, *rlens; uint32 *nbits, *rbits; uint32 nspec, rspec; int nslen, rslen; nlens = neme->codelens; nbits = neme->codebits; rlens = reme->codelens; rbits = reme->codebits; if (neme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; rspec = rbits[255]; rslen = rlens[255]; llen = 0; olen = 0; ocode = 0; k = 0; while (k < rlen) { h = k; while (k < rlen && read[k] == rchar) k += 1; if (k-h >= 255) x = 255; else x = k-h; n = rlens[x]; c = rbits[x]; OCODE(n,c); if (c == rspec && n == rslen) OCODE(16,k-h); if (k < rlen) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); k += 1; } } if (olen > 0) { fwrite(&ocode,sizeof(uint32),1,out); if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Read and decode from in, the next rlen symbols into read according to scheme static int Decode(HScheme *scheme, FILE *in, char *read, int rlen) { int *look, *lens; int signal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (scheme->type == 2) signal = 255; else signal = 256; lens = scheme->codelens; look = scheme->lookup; #define GET \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ return (1); \ } \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } #define GETFLIP \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ return (1); \ } \ Flip_Long(ipart); \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = look[*xpart]; n = lens[c]; if (c == signal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } else for (j = 0; j < rlen; j++) { GET c = look[*xpart]; n = lens[c]; if (c == signal) { GET c = *cpart; n = 8; } read[j] = (char) c; } return (0); } // Read and decode from in, the next rlen symbols into read according to non-rchar scheme // neme, and the rchar runlength shceme reme static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, int rlen, int rchar) { int *nlook, *nlens; int *rlook, *rlens; int nsignal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c, k; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (neme->type == 2) nsignal = 255; else nsignal = 256; nlens = neme->codelens; nlook = neme->lookup; rlens = reme->codelens; rlook = reme->lookup; n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GETFLIP c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GETFLIP c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) 
c; } } else for (j = 0; j < rlen; j++) { GET c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GET c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GET c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GET c = *cpart; n = 8; } read[j] = (char) c; } } return (0); } /******************************************************************************************* * * Histogrammers * ********************************************************************************************/ // Histogram runlengths of symbol runChar in stream[0..rlen-1] into run. static void Histogram_Seqs(uint64 *hist, uint8 *stream, int rlen) { int k; for (k = 0; k < rlen; k++) hist[stream[k]] += 1; } static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar) { int k, h; k = 0; while (k < rlen) { h = k; while (k < rlen && stream[k] == runChar) k += 1; if (k-h >= 256) run[255] += 1; else run[k-h] += 1; if (k < rlen) k += 1; } } /******************************************************************************************* * * Reader * ********************************************************************************************/ static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan, static int Rmax = -1; // Compress_Next_QVentry static int Nline; // Referred by: QVcoding_Scan char *QVentry() { return (Read); } void Set_QV_Line(int line) { Nline = line; } int Get_QV_Line() { return (Nline); } // If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines // for a sequence. Place line j at Read+j*Rmax and the length of every line is returned // unless eof occurs in which case return -1. If any error occurs return -2. int Read_Lines(FILE *input, int nlines) { int i, rlen; int tmax; char *tread; char *other; if (Read == NULL) { tmax = MIN_BUFFER; tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; } Nline += 1; if (fgets(Read,Rmax,input) == NULL) return (-1); rlen = strlen(Read); while (Read[rlen-1] != '\n') { tmax = ((int) 1.4*Rmax) + MIN_BUFFER; tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; if (fgets(Read+rlen,Rmax-rlen,input) == NULL) { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline); EXIT(-2); } rlen += strlen(Read+rlen); } other = Read; for (i = 1; i < nlines; i++) { other += Rmax; Nline += 1; if (fgets(other,Rmax,input) == NULL) { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT(-2); } if (rlen != (int) strlen(other)) { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline); EXIT(-2); } } return (rlen-1); } /******************************************************************************************* * * Tag compression and decompression routines * ********************************************************************************************/ // Keep only the symbols in tags[0..rlen-1] for which qvs[k] != rchar and // return the # of symbols kept. 
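// Tiny illustrative round trip (the run char is drawn as '*'):
//   qvs  = a b * c *      tags = T A G N N
//   Pack_Tag keeps the tag chars at the non-'*' positions  ->  "TAN" (3 kept)
//   Unpack_Tag scatters "TAN" back onto those positions and
//   writes 'n' wherever qvs held the run char              ->  T A n N n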
static int Pack_Tag(char *tags, char *qvs, int rlen, int rchar) { int j, k; j = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) tags[j++] = tags[k]; tags[j] = '\0'; return (j); } // Count the # of non-rchar symbols in qvs[0..rlen-1] static int Packed_Length(char *qvs, int rlen, int rchar) { int k, clen; clen = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) clen += 1; return (clen); } // Unpack tags by moving its i'th char to position k where qvs[k] is the i'th non-rchar // symbol in qvs. All other chars are set to rchar. rlen is the length of qvs and // the unpacked result, clen is the initial length of tags. static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar) { int j, k; j = clen-1; for (k = rlen-1; k >= 0; k--) { if (qvs[k] == rchar) tags[k] = 'n'; else tags[k] = tags[j--]; } } /******************************************************************************************* * * Statistics Scan and Scheme creation and write * ********************************************************************************************/ // Read up to the next num entries or until eof from the .quiva file on input and record // frequency statistics. Copy these entries to the temporary file temp if != NULL. // If there is an error then -1 is returned, otherwise the number of entries read. static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256]; static uint64 totChar; static int delChar, subChar; // Referred by: QVcoding_Scan, Create_QVcoding void QVcoding_Scan1(int rlen, char *delQV, char *delTag, char *insQV, char *mergeQV, char *subQV) { if (rlen == 0) // Initialization call { int i; // Zero histograms bzero(delHist,sizeof(uint64)*256); bzero(mrgHist,sizeof(uint64)*256); bzero(insHist,sizeof(uint64)*256); bzero(subHist,sizeof(uint64)*256); for (i = 0; i < 256; i++) delRun[i] = subRun[i] = 1; totChar = 0; delChar = -1; subChar = -1; return; } // Add streams to accumulating histograms and figure out the run chars // for the deletion and substition streams Histogram_Seqs(delHist,(uint8 *) delQV,rlen); Histogram_Seqs(insHist,(uint8 *) insQV,rlen); Histogram_Seqs(mrgHist,(uint8 *) mergeQV,rlen); Histogram_Seqs(subHist,(uint8 *) subQV,rlen); if (delChar < 0) { int k; for (k = 0; k < rlen; k++) if (delTag[k] == 'n' || delTag[k] == 'N') { delChar = delQV[k]; break; } } if (delChar >= 0) Histogram_Runs( delRun,(uint8 *) delQV,rlen,delChar); totChar += rlen; if (subChar < 0) { if (totChar >= 100000) { int k; subChar = 0; for (k = 1; k < 256; k++) if (subHist[k] > subHist[subChar]) subChar = k; } } if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) subQV,rlen,subChar); return; } int QVcoding_Scan(FILE *input, int num, FILE *temp) { char *slash; int rlen; int i, r; // Zero histograms bzero(delHist,sizeof(uint64)*256); bzero(mrgHist,sizeof(uint64)*256); bzero(insHist,sizeof(uint64)*256); bzero(subHist,sizeof(uint64)*256); for (i = 0; i < 256; i++) delRun[i] = subRun[i] = 1; totChar = 0; delChar = -1; subChar = -1; // Make a sweep through the .quiva entries, histogramming the relevant things // and figuring out the run chars for the deletion and substition streams r = 0; for (i = 0; i < num; i++) { int well, beg, end, qv; rlen = Read_Lines(input,1); if (rlen == -2) EXIT(-1); if (rlen < 0) break; if (rlen == 0 || Read[0] != '@') { EPRINTF(EPLACE,"Line %d: Header in quiva file is missing\n",Nline); EXIT(-1); } slash = index(Read+1,'/'); if (slash == NULL) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(-1); } if 
(sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(-1); } if (temp != NULL) fputs(Read,temp); rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT(-1); } if (temp != NULL) { fputs(Read,temp); fputs(Read+Rmax,temp); fputs(Read+2*Rmax,temp); fputs(Read+3*Rmax,temp); fputs(Read+4*Rmax,temp); } Histogram_Seqs(delHist,(uint8 *) (Read),rlen); Histogram_Seqs(insHist,(uint8 *) (Read+2*Rmax),rlen); Histogram_Seqs(mrgHist,(uint8 *) (Read+3*Rmax),rlen); Histogram_Seqs(subHist,(uint8 *) (Read+4*Rmax),rlen); if (delChar < 0) { int k; char *del = Read+Rmax; for (k = 0; k < rlen; k++) if (del[k] == 'n' || del[k] == 'N') { delChar = Read[k]; break; } } if (delChar >= 0) Histogram_Runs( delRun,(uint8 *) (Read),rlen,delChar); totChar += rlen; if (subChar < 0) { if (totChar >= 100000) { int k; subChar = 0; for (k = 1; k < 256; k++) if (subHist[k] > subHist[subChar]) subChar = k; } } if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar); r += 1; } return (r); } // Using the statistics in the global stat tables, create the Huffman schemes and write // them to output. If lossy is set, then create a lossy table for the insertion and merge // QVs. QVcoding *Create_QVcoding(int lossy) { static QVcoding coding; HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; HScheme *dRunScheme, *sRunScheme; delScheme = NULL; dRunScheme = NULL; insScheme = NULL; mrgScheme = NULL; subScheme = NULL; sRunScheme = NULL; // Check whether using a subtitution run char is a win if (totChar < 200000 || subHist[subChar] < .5*totChar) subChar = -1; // If lossy encryption is enabled then scale insertions and merge QVs. 
if (lossy) { int k; for (k = 0; k < 256; k += 2) { insHist[k] += insHist[k+1]; insHist[k+1] = 0; } for (k = 0; k < 256; k += 4) { mrgHist[k] += mrgHist[k+1]; mrgHist[k] += mrgHist[k+2]; mrgHist[k] += mrgHist[k+3]; mrgHist[k+1] = 0; mrgHist[k+2] = 0; mrgHist[k+3] = 0; } } // Build a Huffman scheme for each stream entity from the histograms #define SCHEME_MACRO(meme,hist,label,bits) \ scheme = Huffman( (hist), NULL); \ if (scheme == NULL) \ goto error; \ if (scheme->type) \ { (meme) = Huffman( (hist), scheme); \ free(scheme); \ } \ else \ (meme) = scheme; #ifdef DEBUG #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) \ printf("\n%s\n", (label) ); \ Print_Histogram( (hist)); \ Print_Table( (meme), (hist), (bits)); #else #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) #endif { HScheme *scheme; if (delChar < 0) { MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs", 8); dRunScheme = NULL; } else { delHist[delChar] = 0; MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs less run char", 8); MAKE_SCHEME(dRunScheme,delRun, "Histogram of Deletion Runs QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",delChar); #endif } #ifdef DEBUG { int k; uint64 count; count = 0; for (k = 0; k < 256; k++) count += delHist[k]; printf("\nDelTag will require %lld bytes\n",count/4); } #endif MAKE_SCHEME(insScheme,insHist, "Hisotgram of Insertion QVs", 8); MAKE_SCHEME(mrgScheme,mrgHist, "Hisotgram of Merge QVs", 8); if (subChar < 0) { MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs", 8); sRunScheme = NULL; } else { subHist[subChar] = 0; MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs less run char", 8); MAKE_SCHEME(sRunScheme,subRun, "Histogram of Substitution Run QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",subChar); #endif } } // Setup endian handling Set_Endian(0); coding.delScheme = delScheme; coding.insScheme = insScheme; coding.mrgScheme = mrgScheme; coding.subScheme = subScheme; coding.dRunScheme = dRunScheme; coding.sRunScheme = sRunScheme; coding.delChar = delChar; coding.subChar = subChar; coding.prefix = NULL; coding.flip = 0; return (&coding); error: if (delScheme != NULL) free(delScheme); if (dRunScheme != NULL) free(dRunScheme); if (insScheme != NULL) free(insScheme); if (mrgScheme != NULL) free(mrgScheme); if (subScheme != NULL) free(subScheme); if (sRunScheme != NULL) free(sRunScheme); EXIT(NULL); } // Write the encoding scheme 'coding' to 'output' void Write_QVcoding(FILE *output, QVcoding *coding) { // Write out the endian key, run chars, and prefix (if not NULL) { uint16 half; int len; half = 0x33cc; fwrite(&half,sizeof(uint16),1,output); if (coding->delChar < 0) half = 256; else half = (uint16) (coding->delChar); fwrite(&half,sizeof(uint16),1,output); if (coding->subChar < 0) half = 256; else half = (uint16) (coding->subChar); fwrite(&half,sizeof(uint16),1,output); len = strlen(coding->prefix); fwrite(&len,sizeof(int),1,output); fwrite(coding->prefix,1,len,output); } // Write out the scheme tables Write_Scheme(coding->delScheme,output); if (coding->delChar >= 0) Write_Scheme(coding->dRunScheme,output); Write_Scheme(coding->insScheme,output); Write_Scheme(coding->mrgScheme,output); Write_Scheme(coding->subScheme,output); if (coding->subChar >= 0) Write_Scheme(coding->sRunScheme,output); } // Read the encoding scheme 'coding' to 'output' QVcoding *Read_QVcoding(FILE *input) { static QVcoding coding; // Read endian key, run chars, and short name common to all headers { uint16 half; int len; 
if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n"); EXIT(NULL); } coding.flip = (half != 0x33cc); if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.delChar = half; if (coding.delChar >= 256) coding.delChar = -1; if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.subChar = half; if (coding.subChar >= 256) coding.subChar = -1; // Read the short name common to all headers if (fread(&len,sizeof(int),1,input) != 1) { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Long(&len); coding.prefix = (char *) Malloc(len+1,"Allocating header prefix"); if (coding.prefix == NULL) EXIT(NULL); if (len > 0) { if (fread(coding.prefix,len,1,input) != 1) { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n"); EXIT(NULL); } } coding.prefix[len] = '\0'; } // Setup endian handling Set_Endian(coding.flip); // Read the Huffman schemes used to compress the data coding.delScheme = NULL; coding.dRunScheme = NULL; coding.insScheme = NULL; coding.mrgScheme = NULL; coding.subScheme = NULL; coding.sRunScheme = NULL; coding.delScheme = Read_Scheme(input); if (coding.delScheme == NULL) goto error; if (coding.delChar >= 0) { coding.dRunScheme = Read_Scheme(input); if (coding.dRunScheme == NULL) goto error; } coding.insScheme = Read_Scheme(input); if (coding.insScheme == NULL) goto error; coding.mrgScheme = Read_Scheme(input); if (coding.mrgScheme == NULL) goto error; coding.subScheme = Read_Scheme(input); if (coding.subScheme == NULL) goto error; if (coding.subChar >= 0) { coding.sRunScheme = Read_Scheme(input); if (coding.sRunScheme == NULL) goto error; } return (&coding); error: if (coding.delScheme != NULL) free(coding.delScheme); if (coding.dRunScheme != NULL) free(coding.dRunScheme); if (coding.insScheme != NULL) free(coding.insScheme); if (coding.mrgScheme != NULL) free(coding.mrgScheme); if (coding.subScheme != NULL) free(coding.subScheme); if (coding.sRunScheme != NULL) free(coding.sRunScheme); EXIT(NULL); } // Free all the auxilliary storage associated with the encoding argument void Free_QVcoding(QVcoding *coding) { if (coding->subChar >= 0) free(coding->sRunScheme); free(coding->subScheme); free(coding->mrgScheme); free(coding->insScheme); if (coding->delChar >= 0) free(coding->dRunScheme); free(coding->delScheme); free(coding->prefix); } /******************************************************************************************* * * Encode/Decode (w.r.t. 
coding) next entry from input and write to output * ********************************************************************************************/ void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub, FILE *output, QVcoding *coding, int lossy) { int clen; if (coding->delChar < 0) { Encode(coding->delScheme, output, (uint8 *) del, rlen); clen = rlen; } else { Encode_Run(coding->delScheme, coding->dRunScheme, output, (uint8 *) del, rlen, coding->delChar); clen = Pack_Tag(tag,del,rlen,coding->delChar); } Number_Read(tag); Compress_Read(clen,tag); fwrite(tag,1,COMPRESSED_LEN(clen),output); if (lossy) { uint8 *insert = (uint8 *) ins; uint8 *merge = (uint8 *) mrg; int k; for (k = 0; k < rlen; k++) { insert[k] = (uint8) ((insert[k] >> 1) << 1); merge[k] = (uint8) (( merge[k] >> 2) << 2); } } Encode(coding->insScheme, output, (uint8 *) ins, rlen); Encode(coding->mrgScheme, output, (uint8 *) mrg, rlen); if (coding->subChar < 0) Encode(coding->subScheme, output, (uint8 *) sub, rlen); else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) sub, rlen, coding->subChar); return; } int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) { int rlen, clen; // Get all 5 streams, compress each with its scheme, and output rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT (-1); } if (coding->delChar < 0) { Encode(coding->delScheme, output, (uint8 *) Read, rlen); clen = rlen; } else { Encode_Run(coding->delScheme, coding->dRunScheme, output, (uint8 *) Read, rlen, coding->delChar); clen = Pack_Tag(Read+Rmax,Read,rlen,coding->delChar); } Number_Read(Read+Rmax); Compress_Read(clen,Read+Rmax); fwrite(Read+Rmax,1,COMPRESSED_LEN(clen),output); if (lossy) { uint8 *insert = (uint8 *) (Read+2*Rmax); uint8 *merge = (uint8 *) (Read+3*Rmax); int k; for (k = 0; k < rlen; k++) { insert[k] = (uint8) ((insert[k] >> 1) << 1); merge[k] = (uint8) (( merge[k] >> 2) << 2); } } Encode(coding->insScheme, output, (uint8 *) (Read+2*Rmax), rlen); Encode(coding->mrgScheme, output, (uint8 *) (Read+3*Rmax), rlen); if (coding->subChar < 0) Encode(coding->subScheme, output, (uint8 *) (Read+4*Rmax), rlen); else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) (Read+4*Rmax), rlen, coding->subChar); return (rlen); } int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) { int clen, tlen; // Decode each stream and write to output if (coding->delChar < 0) { if (Decode(coding->delScheme, input, entry[0], rlen)) EXIT(1); clen = rlen; tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); } else { if (Decode_Run(coding->delScheme, coding->dRunScheme, input, entry[0], rlen, coding->delChar)) EXIT(1); clen = Packed_Length(entry[0],rlen,coding->delChar); tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar); } if (Decode(coding->insScheme, input, entry[2], rlen)) EXIT(1); if (Decode(coding->mrgScheme, input, entry[3], rlen)) EXIT(1); if (coding->subChar < 0) { if (Decode(coding->subScheme, input, entry[4], rlen)) EXIT(1); } else { if 
(Decode_Run(coding->subScheme, coding->sRunScheme, input, entry[4], rlen, coding->subChar)) EXIT(1); } return (0); } DAZZ_DB-master/QV.h000066400000000000000000000115151322703422500141660ustar00rootroot00000000000000/******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #ifndef _QV_COMPRESSOR #include #define _QV_COMPRESSOR // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or // batch version of the routines in this library are compiled. In batch mode, routines // print an error message and exit. In interactive mode, the routines place the error // message in EPLACE (also defined in DB.h) and return an error value, typically NULL // if the routine returns a pointer, and an unusual integer value if the routine returns // an integer. // Below when an error return is described, one should understand that this value is returned // only if the routine was compiled in INTERACTIVE mode. // A PacBio compression scheme typedef struct { void *delScheme; // Huffman scheme for deletion QVs void *insScheme; // Huffman scheme for insertion QVs void *mrgScheme; // Huffman scheme for merge QVs void *subScheme; // Huffman scheme for substitution QVs void *dRunScheme; // Huffman scheme for deletion run lengths (if delChar > 0) void *sRunScheme; // Huffman scheme for substitution run lengths (if subChar > 0) int delChar; // If > 0, run-encoded deletion value int subChar; // If > 0, run-encoded substitution value int flip; // Need to flip multi-byte integers char *prefix; // Header line prefix } QVcoding; // Read the next nlines of input, and QVentry returns a pointer to the first line if needed. // If end-of-input is encountered before any further input, -1 is returned. If there is // an error than -2 is returned. Otherwise the length of the line(s) read is returned. int Read_Lines(FILE *input, int nlines); char *QVentry(); // Get and set the line counter for error reporting void Set_QV_Line(int line); int Get_QV_Line(); // Read up to the next num entries or until eof from the .quiva file on input and record // frequency statistics. Copy these entries to the temporary file temp if != NULL. // If there is an error then -1 is returned, otherwise the number of entries read. int QVcoding_Scan(FILE *input, int num, FILE *temp); void QVcoding_Scan1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub); // Given QVcoding_Scan has been called at least once, create an encoding scheme based on // the accumulated statistics and return a pointer to it. The returned encoding object // is *statically allocated within the routine. If lossy is set then use a lossy scaling // for the insertion and merge streams. If there is an error, then NULL is returned. QVcoding *Create_QVcoding(int lossy); // Read/write a coding scheme to input/output. The encoding object returned by the reader // is *statically* allocated within the routine. If an error occurs while reading then // NULL is returned. 
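//   For orientation, a minimal sketch of the intended compression-side call sequence
//   (hedged: error checks, header-line handling, and the replay of the temp file that a
//   real converter such as quiva2DB performs are omitted; "in", "out", and "temp" are
//   hypothetical streams and "@movie" a hypothetical header prefix):
//
//        Set_QV_Line(0);
//        while (QVcoding_Scan(in,1000,temp) > 0)     //  pass 1: accumulate value statistics
//          ;
//        coding = Create_QVcoding(0);                //  build the Huffman schemes (lossless)
//        coding->prefix = Strdup("@movie","Allocating prefix");
//        Write_QVcoding(out,coding);                 //  emit the scheme tables, then
//        ... rewind temp and Compress_Next_QVentry each entry ...
//        Free_QVcoding(coding);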
QVcoding *Read_QVcoding(FILE *input); void Write_QVcoding(FILE *output, QVcoding *coding); // Free all the auxiliary storage associated with coding (but not the object itself!) void Free_QVcoding(QVcoding *coding); // Assuming the file pointer is positioned just beyond an entry header line, read the // next set of 5 QV lines, compress them according to 'coding', and output. If lossy // is set then the scheme is a lossy one. A negative value is returned if an error // occurred, and the sequence length otherwise. int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub, FILE *output, QVcoding *coding, int lossy); // Assuming the input is position just beyond the compressed encoding of an entry header, // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them, // and place their decompressed values into entry which is a 5 element array of character // pointers. The parameter rlen computed from the preceeding header line, critically // provides the length of each of the 5 vectors. A non-zero value is return only if an // error occured. int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen); #endif // _QV_COMPRESSOR DAZZ_DB-master/README.md000066400000000000000000000765161322703422500147620ustar00rootroot00000000000000# The Dazzler Database Library ## _Author: Gene Myers_ ## _First: July 17, 2013_ For typeset documentation, examples of use, and design philosophy please go to my [blog](https://dazzlerblog.wordpress.com/command-guides/dazz_db-command-guide). To facilitate the multiple phases of the dazzler assembler, we organize all the read data into what is effectively a "database" of the reads and their meta-information. The design goals for this data base are as follows: 1. The database stores the source Pacbio read information in such a way that it can recreate the original input data, thus permitting a user to remove the (effectively redundant) source files. This avoids duplicating the same data, once in the source file and once in the database. 2. The data base can be built up incrementally, that is new sequence data can be added to the data base over time. 3. The data base flexibly allows one to store any meta-data desired for reads. This is accomplished with the concept of *tracks* that implementors can add as they need them. 4. The data is held in a compressed form equivalent to the .dexta and .dexqv/.dexar files of the data extraction module. 5. Quiver or Arrow information can be added separately from the sequence information and later on if desired, but a database can only hold either Quiver or Arrow information, but not both. The Arrow or Quiver information can be removed from the database at any time leaving a database just containing sequence information. 6. To facilitate job parallel, cluster operation of the phases of our assembler, the data base has a concept of a *current partitioning* in which all the reads that are over a given length and optionally unique to a well, are divided up into *blocks* containing roughly a given number of bases, except possibly the last block which may have a short count. Often programs con be run on blocks or pairs of blocks and each such job is reasonably well balanced as the blocks are all the same size. One must be careful about changing the partition during an assembly as doing so can void the structural validity of any interim block-based results. 
A DB con contain the information needed by Quiver, or by Arrow, or neither, but not both. A DB containing neither Quiver or Arrow information is termed a Sequence-DB (S-DB). A DB with Quiver information is a Quiver-DB (Q-DB) and a DB with Arrow information is an Arrow-DB (A-DB). All commands are aware of the state of a DB and respond to options according to their type. A Dazzler DB consists of one named, *visible* file, e.g. FOO.db, and several *invisible* secondary files encoding various elements of the DB. The secondary files are "invisible" to the UNIX OS in the sense that they begin with a "." and hence are not listed by "ls" unless one specifies the -a flag. We chose to do this so that when a user lists the contents of a directory they just see a single name, e.g. FOO.db, that is used to refer to the DB in commands. The files associated with a database named, say FOO, are as follows: * "FOO.db": a text file containing 1. the list of input files added to the database so far, and 2. how to partition the database into blocks (if the partition parameters have been set). * ".FOO.idx": a binary "index" of all the meta-data about each read allowing, for example, one to randomly access a read's sequence (in the store ".FOO.bps"). It is 28N + 88 bytes in size where N is the number of reads in the database. * ".FOO.bps": a binary compressed "store" of all the DNA sequences. It is M/4 bytes in size where M is the total number of base pairs in the database. * ".FOO.qvs": a binary compressed "store" of the 5 Pacbio quality value streams for the reads. Its size is roughly 5/3M bytes depending on the compression acheived. This file only exists if Quiver information has been added to the database. * ".FOO.arw": a binary compressed "store" of the clipped pulse width stream for the reads. Its size is roughly M/4 bytes. This file only exists if Arrow information has been added to the database. * ".FOO.\.[anno,data]": a *track* containing customized meta-data for each read. For example, the DBdust command annotates low complexity intervals of reads and records the intervals for each read in two files .FOO.dust.anno & .FOO.dust.data. Any kind of information about a read can be recorded, such as micro-sats, repeat intervals, corrected sequence, etc. Specific tracks will be described as modules that produce them are released. If one does not like the convention of the secondary files being invisible, then un-defining the constant HIDE_FILES in DB.h before compiling the library, creates commands that do not place a prefixing "." before secondary file names, e.g. FOO.idx instead of .FOO.idx. One then sees all the files realizing a DB when listing the contents of a directory with ls. While a Dazzler DB holds a collection of Pacbio reads, a Dazzler map DB or DAM holds a collection of contigs from a reference genome assembly. This special type of DB has been introduced in order to facilitate the mapping of reads to an assembly and has been given the suffix .dam to distinguish it from an ordinary DB. It is structurally identical to a .db except: * there is no concept of quality values, and hence no .FOO.qvs or .FOO.arw file. * every .fasta scaffold (a sequence with runs of N's between contigs estimating the length of the gap) is broken into a separate contig sequence in the DB and the header for each scaffold is retained in a new .FOO.hdr file. 
* the original and first and last pulse fields in the meta-data records held in .FOO.idx, hold instead the contig number and the interval of the contig within its original scaffold sequence. A map DB can equally well be the argument of many of the commands below that operate on normal DBs. In general, a .dam can be an argument anywhere a .db can, with the exception of routines or optioned calls to routines that involve quality values, or the special routines fasta2DAM and DAM2fasta that create a DAM and reverse said, just like the pair fasta2DB and DB2fasta do for a normal DB. So in general when we refer to a database we are referring to either a DB or a DAM. The command DBsplit sets or resets the current partition for a database which is determined by 3 parameters: (i) the total number of basepairs to place in each block, (ii) the minimum read length of reads to include within a block, and (iii) whether or not to only include the longest read from a given well or all reads from a well (NB: several reads of the same insert in a given well can be produced by the Pacbio instrument). Note that the length and uniqueness parameters effectively select a subset of the reads that contribute to the size of a block. We call this subset the *trimmed* data base. Some commands operate on the entire database, others on the trimmed database, and yet others have an option flag that permits them to operate on either at the users discretion. Therefore, one should note carefully to which version of the database a command refers to. This is especially important for any command that identifies reads by their index (ordinal position) in the database. Once the database has been split into blocks, the commands DBshow, DBstats, and DBdust below and commands yet to come, such as the local alignment finder dalign, can take a block or blocks as arguments. On the command line this is indicated by supplying the name of the DB followed by a period and then a block number, e.g. FOO.3.db or simply FOO.3, refers to the 3'rd block of DB FOO (assuming of course it has a current partition and said partition has a 3rd block). One should note carefully that a block is a contiguous range of reads such that once it is trimmed has a given size in base pairs (as set by DBsplit). Thus like an entire database, a block can be either untrimmed or trimmed and one needs to again be careful when giving a read index to a command such as DBshow. All programs add suffixes (e.g. .db) as needed. The commands of the database library are currently as follows: ``` 1. fasta2DB [-v] ( -f | -i[] | ... ) ``` Builds an initial data base, or adds to an existing database, either (a) the list of .fasta files following the database name argument, or (b) the list of .fasta files in \ if the -f option is used, or (c) entries piped from the standard input if the -i option is used. If the DB is being created it is established as a Sequence-DB (S-DB) otherwise its type is unchanged. If a faux file name, \, follows the -i option then all the input received is considered to have come from a file by the name of \.fasta by DB2fasta, otherwise it will be sent to the standard output by DB2fasta. The SMRT cells in a given named input (i.e. all sources other than -i without a name) can only be added consecutively to the DB (this is checked by the command). The .fasta headers must be in the "Pacbio" format (i.e. 
the output of the Pacbio tools or our dextract program) and the well, pulse interval, and read quality are extracted from the header and kept with each read record. If the files are being added to an existing database, and the partition settings of the DB have already been set (see DBsplit below), then the partitioning of the database is updated to include the new data. A file may contain the data from multiple SMRT cells provided the reads for each SMRT cell are consecutive in the file. ``` 2. DB2fasta [-vU] [-w] ``` The set of .fasta files for the given DB are recreated from the DB exactly as they were input. That is, this is a perfect inversion, including the reconstitution of the proper .fasta headers. Because of this property, one can, if desired, delete the .fasta source files once they are in the DB as they can always be recreated from it. Entries imported from the standard input will be place in the faux file name given on import, or to the standard output if no name was given. By default the output sequences are in lower case and 80 chars per line. The -U option specifies upper case should be used, and the characters per line, or line width, can be set to any positive value with the -w option. ``` 3. quiva2DB [-vl] ( -f | -i | ... ) ``` Adds .quiva streams to an existing DB "path". The DB must either be an S-DB or a Q-DB and upon completion the DB is a Q-DB. The data comes from (a) the given .quiva files on the command line, or (b) those in the file specified by the -f option, or (c) the standard input if the -i option is given. The input files can be added incrementally but must be added in the same order as the .fasta files were and have the same root names, e.g. FOO.fasta and FOO.quiva. This is enforced by the program. With the -l option set the compression scheme is a bit lossy to get more compression (see the description of dexqv in the DEXTRACTOR module here). ``` 4. DB2quiva [-vU] ``` The set of .quiva files within the given Q-DB are recreated from the DB exactly as they were input. That is, this is a perfect inversion, including the reconstitution of the proper .quiva headers. Because of this property, one can, if desired, delete the .quiva source files once they are in the DB as they can always be recreated from it. Entries imported from the standard input will be placed in the faux file name given on import, or to the standard output if no name was given. By .fastq convention each QV vector is output as a line without new-lines, and by default the Deletion Tag entry is in lower case letters. The -U option specifies upper case letters should be used instead. ``` 5. arrow2DB [-v] ( -f | -i | ... ) ``` Adds .arrow streams to an existing DB "path". The DB must either be an S-DB or an A-DB and upon completion the DB is an A-DB. The data comes from (a) the given .arrow files on the command line, or (b) those in the file specified by the -f option, or (c) the standard input if the -i option is given. The input files can be added incrementally but must be added in the same order as the .fasta files were and have the same root names, e.g. FOO.fasta and FOO.quiva. This is enforced by the program. ``` 6. DB2arrow [-v] [-w] ``` The set of .arrow files within the given A-DB are recreated from the DB exactly as they were input. That is, this is a perfect inversion, including the reconstitution of the proper .arrow headers. Because of this property, one can, if desired, delete the .arrow source files once they are in the DB as they can always be recreated from it. 
Entries imported from the standard input will be placed in the faux file name given on import, or to the standard output if no name was given. By default the output sequences are formatted 80 chars per line, but the characters per line, or line width, can be set to any positive value with the -w option. ``` 7. fasta2DAM [-v] ( -f | -i[] | ... ) ``` Builds an initial map DB or DAM, or adds to an existing DAM, either (a) the list of .fasta files following the database name argument, or (b) the list of .fasta files in \ if the -f option is used, or (c) entries piped from the standard input if the -i option is used. If a faux file name, \, follows the -i option then all the input received is considered to have come from a file by the name of \.fasta by DAM2fasta, otherwise it will be sent to the standard output by DAM2fasta. Any .fasta entry that has a run of N's in it will be split into separate "contig" entries and the interval of the contig in the original entry recorded. The header for each .fasta entry is saved with the contigs created from it. ``` 8. DAM2fasta [-vU] [-w] ``` The set of .fasta files for the given map DB or DAM are recreated from the DAM exactly as they were input. That is, this is a perfect inversion, including the reconstitution of the proper .fasta headers and the concatenation of contigs with the proper number of N's between them to recreate scaffolds. Entries imported from the standard input will be place in the faux file name given on import, or to the standard output if no name was given. By default the output sequences are in lower case and 80 chars per line. The -U option specifies upper case should be used, and the characters per line, or line width, can be set to any positive value with the -w option. ``` 9. DBsplit [-af] [-x] [-s] ``` Divide the database \.db or \.dam conceptually into a series of blocks referable to on the command line as \.1, \.2, ... If the -x option is set then all reads less than the given length are ignored, and if the -a option is not set then secondary reads from a given well are also ignored. The remaining reads, constituting what we call the trimmed DB, are split amongst the blocks so that each block is of size -s * 1Mbp except for the last which necessarily contains a smaller residual. The default value for -s is 200Mbp because blocks of this size can be compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks are very space efficient in that their sub-index of the master .idx is computed on the fly when loaded, and the .bps and .qvs files (if a .db) of base pairs and quality values, respectively, is shared with the master DB. Any relevant portions of tracks associated with the DB are also computed on the fly when loading a database block. If the -f option is set, the split is forced regardless of whether or not the DB in question has previously bin split, i.e. one is not interactively asked if they wish to proceed. ``` 10. DBtrim [-af] [-x] ``` Exactly like DBsplit except that it only resets the trimming parameters (and not the split partition itself). ``` 11. DBdust [-b] [-w] [-t] [-m] ``` Runs the symmetric DUST algorithm over the reads in the untrimmed DB \.db or \.dam producing a track .\.dust[.anno,.data] that marks all intervals of low complexity sequence, where the scan window is of size -w, the threshold for being a low-complexity interval is -t, and only low-complexity intervals of size greater than -m are recorded. 
If the -b option is set then the definition of low complexity takes into account the frequency of a given base. It is important to set this flag for genomes with a strong AT/GC bias, albeit the code is a tad slower. The command is incremental: if given a DB to which new data has been added since it was last run on the DB, then it will extend the track to include the new reads. The dust track, if present, is understood and used by DBshow, DBstats, and dalign. DBdust can also be run over an untrimmed DB block in which case it outputs a track encoding where the track file names contain the block number, e.g. .FOO.3.dust.anno and .FOO.3.dust.data, given FOO.3 on the command line. We call this a *block track*. This permits job parallelism in block-sized chunks, and the resulting sequence of block tracks can then be merged into a track for the entire untrimmed DB with Catrack. ``` 12. Catrack [-vfd] ``` Find all block tracks of the form .\.#.\... and concatenate them into a single track, .\.\..., for the given DB or DAM. The block track files must all encode the same kind of track data (this is checked), and the files must exist for block 1, 2, 3, ... up to the last block number. If the -f option is set, then the concatenation takes place regardless of whether or not the single, combined track already exists. If the -d option is set then every block track is removed after the successful construction of the combined track. ``` 13. DBshow [-unqaUQA] [-w] [-m]+ [ | ... ] ``` Displays the requested reads in the database \.db or \.dam. By default the command applies to the trimmed database, but if -u is set then the entire DB is used. If no read arguments are given then every read in the database or database block is displayed. Otherwise the input file or the list of supplied integer ranges give the ordinal positions in the actively loaded portion of the db. In the case of a file, it should simply contain a read index, one per line. In the other case, a read range is either a lone integer or the symbol $, in which case the read range consists of just that read (the last read in the database if $). One may also give two positive integers separated by a dash to indicate a range of integers, where again a $ represents the index of the last read in the actively loaded db. For example, 1 3-5 $ displays reads 1, 3, 4, 5, and the last read in the active db. As another example, 1-$ displays every read in the active db (the default). By default a .fasta file of the read sequences is displayed. If the -q option is set and the DB is a Q-DB, then the QV streams are also displayed in a non-standard modification of the fasta format. Similarly, if the -a option is set and the DB is an A-DB, then the pulse width stream is also displayed in a non-standard format. If the -n option is set then the DNA sequence is *not* displayed. If the -Q option is set then a .quiva file of the selected reads is displayed and all other options except -u and -U are ignored. If the -A option is set then a .arrow file of the selected reads is displayed and all other options except -u and -w are ignored. If one or more masks are set with the -m option then the track intervals are also displayed in an additional header line and the bases within an interval are displayed in the case opposite that used for all the other bases. By default the output sequences are in lower case and 80 chars per line.
The -U option specifies upper case should be used, and the characters per line, or line width, can be set to any positive value with the -w option. The .fasta, .quiva, and .arrow files that are output can be used to build a new DB with fasta2DB, quiva2DB, and arrow2DB, giving one a simple way to make a DB of a subset of the reads for testing purposes. ``` 14. DBdump [-rhsaqip] [-uU] [-m]+ [ | ... ] ``` Like DBshow, DBdump allows one to display a subset of the reads in the DB and select which information to show about them including any mask tracks. The difference is that the information is written in a very simple "1-code" ASCII format that makes it easy for one to read and parse the information for further use. The option flags determine which items of information are output as follows: * -r requests that each read number be displayed in an R-line (see below, useful if only a subset of reads is requested). * -h requests the header information be output as the source file name on an H-line, the well # and pulse range on an L-line, and optionally the quality of the read if given on a Q-line. * -s requests the sequence be output on an S-line. * -a requests the Arrow information be output as a pulse-width string on an A-line and the 4 SNR channel values on an N-line. * -q requests that the 5 Quiver quality streams be output on d-, c-, i-, m-, and s-lines. * -i requests that the intrinsic quality values be output on an I-line. * -p requests the repeat profile be output (if available) on a P-line. * -m\ requests that mask \ be output on a T-line. Set -u if you want data from the untrimmed database (the default is trimmed) and set -U if you'd like upper-case letters used in the DNA sequence strings. The format is very simple. A requested unit of information occurs on a line. The first character of every line is a "1-code" character that tells you what information to expect on the line. The rest of the line contains the information where each item is separated by a single blank space. Strings are output as first an integer giving the length of the string, a blank space, and then the string terminated by a new-line. Intrinsic quality values are between 0 and 50, inclusive, and a vector of said are displayed as an alphabetic string where 'a' is 0, 'b' is 1, ... 'z' is 25, 'A' is 26, 'B' is 27, ... and 'Y' is 50. Repeat profiles are also displayed as a string where '_' denotes 0 repetitions, and then 'a' through 'N' denote the values 1 through 40, respectively. The set of all possible lines is as follows: ``` R # - read number H # string - original file name string (header) L # # # - location: well, pulse start, pulse end Q # - quality of read (#/1000) N # # # # - SNR of ACGT channels (#/100) Tx #n (#b #e)^#n - x'th track on command line, #n intervals all on same line S # string - sequence string A # string - arrow pulse-width string I # string - intrinsic quality vector (as an ASCII string) P # string - repeat profile vector (as an ASCII string) d # string - Quiva deletion values (as an ASCII string) c # string - Quiva deletion character string i # string - Quiva insertion value string m # string - Quiva merge value string s # string - Quiva substitution value string + X # - Total amount of X (X = H or S or I or P or R or M or T#) @ X # - Maximum amount of X (X = H or S or I or P or T#) ``` 1-code lines that begin with + or @ are always the first lines in the output.
They give size information about what is contained in the output. That is '+ X #' gives the number of reads (X=R), the number of masks (X=M), or the total number of characters in all headers (X=H), sequences (X=S), intrinsic quality vectors (X=I), read profile vector (X=P), or track (X=T#). And '@ X #' gives the maximum number of characters in any header (X=H), sequence (X=S), intrincic quality vector (X=I), read profile vector (X=P), or track (X=T#). The size numbers for the Quiva strings and Arrow pulse width strings are identical to that for the sequence as they are all of the same length for any given entry. ``` 15. DBstats [-nu] [-b]+ ``` Show overview statistics for all the reads in the trimmed data base \.db or \.dam, including a histogram of read lengths where the bucket size is set with the -b option (default 1000). If the -u option is given then the untrimmed database is summarized. If the -n option is given then the histogran of read lengths is not displayed. Any track such as a "dust" track that gives a series of intervals along the read can be specified with the -m option in which case a summary and a histogram of the interval lengths is displayed. ``` 16. DBrm [-v] ... ``` Delete all the files for the given data bases. Do not use rm to remove a database, as there are at least two and often several secondary files for each DB including track files, and all of these are removed by DBrm. If the -v option is set then every file deleted is listed. ``` 17. DBmv [-v] ``` Rename all the files for the data base old to use the new root. If the -v option is set then every file move is displayed. ``` 18. DBwipe ... ``` Delete any Arrow or Quiver data from the given databases. This removes the .arw or .qvs file and resets information in the .idx file containing information for Arrow or Quiver. Basically, converts an A-DB or Q-DB back to a simple S-DB. ``` 19. simulator [-CU] [-m] [-s] [-e] [-f] [-x] [-w] [-r] [-M] ``` In addition to the DB commands we include here, somewhat tangentially, a simple simulator that generates synthetic reads over a given genome reference contained in a supplied .dam DB. The simulator first reconstitutes the scaffolds of the reference genome and fills in their gaps (a run of N's in .fasta format indicating the estimate gap length) with a random sequence that follows the base distribution of the contigs. It will then sample reads from these scaffold sequences. The simulator generates sample reads of mean length -m from a log-normal length distribution with standard deviation -s, but ignores reads of length less than -x. It collects enough reads to cover the genome -c times and Introduces -e fraction errors into each read where the ratio of insertions, deletions, and substitutions are set by defined constants INS_RATE (default 73%) and DEL_RATE (default 20%) within generate.c. One can control the rate at which reads are picked from the forward and reverse strands with the -f option. The -r option seeds the random number generator for the generation process so that one can reproducibly generate the same dataset. If this parameter is missing, then the job id of the invocation seeds the random number generator effectively guaranteeing a different sampling with each invocation. The output is sent to the standard output (i.e. it is a UNIX pipe). The output is in Pacbio .fasta format suitable as input to fasta2DB. Uppercase letters are used if the -U option is given, and the width of each line can be controlled with the -w option. 
Finally, the -M option requests that the scaffold and coordinates within said scaffold from which each read has been sampled are written to the indicated file, one line per read, ASCII encoded. This "map" file essential tells one where every read belongs in an assembly and is very useful for debugging and testing purposes. If the map line for a read is say 's b e' then if b \< e the read is a perturbed copy of s[b,e] in the forward direction, and a perturbed copy s[e,b] in the reverse direction otherwise. ``` 20. rangen [-U] [-b] [-w] [-r] ``` Generate a random DNA sequence of length genlen*1Mbp that has an AT-bias of -b. Output the sequence to the standard output in .fasta format. Use uppercase letters if -U is set and -w base pairs per line (default 80). The result can then be converted into a .dam DB and given to the simulator to create a read database over a random synthetic sequence. The -r option seeds the random number generator for the generation process so that one can reproducibly generate the same sequence. If this parameter is missing, then the job id of the invocation seeds the random number generator effectively guaranteeing a different sequence with each invocation. Example: A small complete example of most of the commands above. ``` > rangen 1.0 >R.fasta // Generate a randome 1Mbp sequence R.fasta > fasta2DAM R R.fasta // Load it into a .dam DB R.dam > simulator R -c20. >G.fasta // Sample a 20x data sets of the random geneome R > fasta2DB G G.fasta // Create a compressed data base of the reads, G.db > rm G.fasta // Redundant, recreate any time with "DB2fasta G" > DBsplit -s11 G // Split G into 2 parts of size ~ 11MB each > DBdust G.1 // Produce a "dust" track on each part > DBdust G.2 > Catrack G dust // Create one track for all of the DB > rm .G.*.dust.* // Clean up the sub-tracks > DBstats -mdust G // Take a look at the statistics for the database Statistics for all reads in the data set 1,836 reads out of 1,836 (100.0%) 20,007,090 base pairs out of 20,007,090 (100.0%) 10,897 average read length 2,192 standard deviation Base composition: 0.250(A) 0.250(C) 0.250(G) 0.250(T) Distribution of Read Lengths (Bin size = 1,000) Bin: Count % Reads % Bases Average 22,000: 1 0.1 0.1 22654 21,000: 0 0.1 0.1 22654 20,000: 1 0.1 0.2 21355 19,000: 0 0.1 0.2 21355 18,000: 4 0.3 0.6 19489 17,000: 8 0.8 1.3 18374 16,000: 19 1.8 2.8 17231 15,000: 43 4.1 6.2 16253 14,000: 81 8.6 12.0 15341 13,000: 146 16.5 21.9 14428 12,000: 200 27.4 34.4 13664 11,000: 315 44.6 52.4 12824 10,000: 357 64.0 71.2 12126 9,000: 306 80.7 85.8 11586 8,000: 211 92.2 94.8 11208 7,000: 95 97.3 98.4 11017 6,000: 43 99.7 99.8 10914 5,000: 6 100.0 100.0 10897 Statistics for dust-track There are 158 intervals totaling 1,820 bases (0.0% of all data) Distribution of dust intervals (Bin size = 1,000) Bin: Count % Intervals % Bases Average 0: 158 100.0 100.0 11 > ls -al total 66518744 drwxr-xr-x+ 177 myersg staff 6018 Mar 2 13:28 . drwxr-xr-x+ 20 myersg staff 680 Feb 26 19:52 .. 
-rw-r--r--+ 1 myersg staff 5002464 Mar 2 13:28 .G.bps -rw-r--r--+ 1 myersg staff 14704 Mar 2 13:28 .G.dust.anno -rw-r--r--+ 1 myersg staff 1264 Mar 2 13:28 .G.dust.data -rw-r--r--+ 1 myersg staff 73552 Mar 2 13:28 .G.idx -rw-r--r--+ 1 myersg staff 162 Mar 2 13:28 G.db > cat G.db files = 1 1836 G Sim blocks = 2 size = 11 cutoff = 0 all = 0 0 0 1011 1011 1836 1836 ``` DAZZ_DB-master/arrow2DB.c000066400000000000000000000406731322703422500152640ustar00rootroot00000000000000/******************************************************************************************* * * Adds the given .arrow files to an existing DB "path". The input files must be added in * the same order as the .fasta files were and have the same root names, e.g. FOO.fasta * and FOO.arrow. The files can be added incrementally but must be added in the same order * as the .fasta files. This is enforced by the program. With the -l option set the * compression scheme is a bit lossy to get more compression (see the description of dexqv * in the DEXTRACTOR module). * * Author: Gene Myers * Date : July 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #include "QV.h" // Compiled in INTERACTIVE mode as all routines must return with an error // so that cleanup and restore is possible. #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-v] ( -f | -i | ... )"; typedef struct { int argc; char **argv; FILE *input; int count; char *name; } File_Iterator; File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); if (it == NULL) return (NULL); it->argc = argc; it->argv = argv; it->input = input; if (input == NULL) it->count = first; else { it->count = 1; rewind(input); } return (it); } int next_file(File_Iterator *it) { static char nbuffer[MAX_NAME+8]; if (it->input == NULL) { if (it->count >= it->argc) return (0); it->name = it->argv[it->count++]; } else { char *eol; if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL) { if (feof(it->input)) return (0); fprintf(stderr,"%s: IO error reading line %d of -f file of names\n",Prog_Name,it->count); it->name = NULL; return (1); } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n", Prog_Name,it->count,MAX_NAME+7); it->name = NULL; return (1); } *eol = '\0'; it->count += 1; it->name = nbuffer; } return (1); } int main(int argc, char *argv[]) { FILE *istub; char *root, *pwd; FILE *arrow, *indx; int64 boff; DAZZ_DB db; DAZZ_READ *reads; int nfiles; int VERBOSE; int PIPE; FILE *INFILE; // Process command line { int i, j, k; int flags[128]; ARG_INIT("arrow2DB") INFILE = NULL; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vli") break; case 'f': INFILE = fopen(argv[i]+2,"r"); if (INFILE == NULL) { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2); exit (1); } break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; PIPE = flags['i']; if (INFILE != NULL && PIPE) { fprintf(stderr,"%s: Cannot use both -f and -i together\n",Prog_Name); exit (1); } if ( (INFILE == NULL && ! 
PIPE && argc <= 2) || ((INFILE != NULL || PIPE) && argc != 2)) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr," -f: import files listed 1/line in given file.\n"); fprintf(stderr," -i: import data from stdin.\n"); fprintf(stderr," : otherwise, import sequence of specified files.\n"); exit (1); } } // Open DB stub file, index, and .arw file for appending. Load db and read records, // get number of cells from stub file, and note current offset to end of .arw root = Root(argv[1],".db"); pwd = PathTo(argv[1]); istub = Fopen(Catenate(pwd,"/",root,".db"),"r"); if (istub == NULL) exit (1); if (fscanf(istub,DB_NFILE,&nfiles) != 1) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root); exit (1); } indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+"); if (indx == NULL) exit (1); if (fread(&db,sizeof(DAZZ_DB),1,indx) != 1) { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root); exit (1); } reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*db.ureads,"Allocating DB index"); if (reads == NULL) exit (1); if (fread(reads,sizeof(DAZZ_READ),db.ureads,indx) != (size_t) (db.ureads)) { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root); exit (1); } if (reads[0].coff >= 0 && (db.allarr & DB_ARROW) == 0) { fprintf(stderr,"%s: Database %s has Quiver data!\n",Prog_Name,root); exit (1); } arrow = NULL; boff = 0; if (reads[0].coff < 0) arrow = Fopen(Catenate(pwd,PATHSEP,root,".arw"),"w"); else arrow = Fopen(Catenate(pwd,PATHSEP,root,".arw"),"r+"); if (arrow == NULL) goto error; fseeko(arrow,0,SEEK_END); boff = ftello(arrow); // Do a merged traversal of cell lines in .db stub file and .arrow files to be // imported, driving the loop with the cell line # { FILE *input = NULL; char *path = NULL; char *core = NULL; char *read; int rmax, rlen, eof; File_Iterator *ng = NULL; char lname[MAX_NAME]; int first, last, cline; int cell; // Buffer for accumulating .arrow sequence over multiple lines rmax = MAX_NAME + 60000; read = (char *) Malloc(rmax+1,"Allocating line buffer"); if (read == NULL) goto error; if (!PIPE) { ng = init_file_iterator(argc,argv,INFILE,2); if (ng == NULL) goto error; } eof = 0; for (cell = 0; cell < nfiles; cell++) { char prolog[MAX_NAME], fname[MAX_NAME]; if (cell == 0) // First addition, a pipe: find the first cell that does not have .arrow's yet // (error if none) and set input source to stdin. if (PIPE) { first = 0; while (cell < nfiles) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",core,Prog_Name); goto error; } if (reads[first].coff < 0) break; first = last; cell += 1; } if (cell >= nfiles) { fprintf(stderr,"%s: All .arrows's have already been added !?\n",Prog_Name); goto error; } input = stdin; if (VERBOSE) { fprintf(stderr,"Adding arrows's from stdin ...\n"); fflush(stderr); } cline = 0; } // First addition, not a pipe: then get first .arrow file name (error if not one) to // add, find the first cell name whose file name matches (error if none), check that // the previous .arrow's have been added and this is the next slot. Then open // the .arrow file for compression else { if (! 
next_file(ng)) { fprintf(stderr,"%s: file list is empty!\n",Prog_Name); goto error; } if (ng->name == NULL) goto error; core = Root(ng->name,".arrow"); path = PathTo(ng->name); if ((input = Fopen(Catenate(path,"/",core,".arrow"),"r")) == NULL) goto error; first = 0; while (cell < nfiles) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",core,Prog_Name); goto error; } if (strcmp(core,fname) == 0) break; first = last; cell += 1; } if (cell >= nfiles) { fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core); goto error; } if (first > 0 && reads[first-1].coff < 0) { fprintf(stderr,"%s: Predecessor of %s.arrow has not been added yet\n", Prog_Name,core); goto error; } if (reads[first].coff >= 0) { fprintf(stderr,"%s: %s.arrow has already been added\n",Prog_Name,core); goto error; } if (VERBOSE) { fprintf(stderr,"Adding '%s.arrow' ...\n",core); fflush(stderr); } cline = 0; } // Not the first addition: get next cell line. If not a pipe and the file name is new, // then close the current .arrow, open the next one and after ensuring the names // match, open it for incorporation else { first = last; strcpy(lname,fname); if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",core,Prog_Name); goto error; } if (PIPE) { int c; if ((c = fgetc(input)) == EOF) break; ungetc(c,input); } else if (strcmp(lname,fname) != 0) { if ( ! eof) { fprintf(stderr,"%s: Too many reads in %s.arrow while handling %s.fasta\n", Prog_Name,core,fname); goto error; } fclose(input); free(path); free(core); if ( ! next_file(ng)) break; if (ng->name == NULL) goto error; path = PathTo(ng->name); core = Root(ng->name,".arrow"); if ((input = Fopen(Catenate(path,"/",core,".arrow"),"r")) == NULL) goto error; if (strcmp(core,fname) != 0) { fprintf(stderr,"%s: Files not being added in order (expect %s, given %s)\n", Prog_Name,fname,core); goto error; } if (VERBOSE) { fprintf(stderr,"Adding '%s.arrow' ...\n",core); fflush(stderr); } cline = 0; } } // If first cell or source is a new file, then start IO if (cline == 0) { // Read in first line and make sure it is a header in PACBIO format. 
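          //   An .arrow header is expected to begin with '>', to repeat, up to the first
          //   blank, the prolog string recorded for this cell in the stub file, and to
          //   carry an " SN=#,#,#,#" field giving the four channel SNRs; these are the
          //   only parts of the header that are verified and parsed below.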
rlen = 0; eof = (fgets(read,MAX_NAME,input) == NULL); if (read[strlen(read)-1] != '\n') { fprintf(stderr,"File %s.arrow, Line 1: Fasta line is too long (> %d chars)\n", core,MAX_NAME-2); goto error; } if (!eof && read[0] != '>') { fprintf(stderr,"File %s.arrow, Line 1: First header in arrow file is missing\n", core); goto error; } } // Compress reads [first..last) from open .arrow appending to .arw and record // snr in .coff field of reads (offset is the same as for the DNA sequence, .boff) { int i, x; for (i = first; i < last; i++) { char *find; int clen; float snr[4]; uint16 cnr[4]; if (eof) { if (PIPE) fprintf(stderr,"%s: Insufficient # of reads on input while handling %s.arrow\n", Prog_Name,fname); else { fprintf(stderr,"%s: Insufficient # of reads in %s.arrow while handling", Prog_Name,core); fprintf(stderr," %s.arrow\n",fname); } goto error; } find = index(read+(rlen+1),' '); if (find == NULL) { fprintf(stderr,"File %s.arrow, Line %d: Pacbio header line format error\n", core,cline); goto error; } *find = '\0'; if (strcmp(read+(rlen+1),prolog) != 0) { fprintf(stderr,"File %s.arrow, Line %d: Pacbio prolog doesn't match DB entry\n", core,cline); goto error; } *find = ' '; x = sscanf(find+1," SN=%f,%f,%f,%f\n",snr,snr+1,snr+2,snr+3); if (x != 4) { fprintf(stderr,"File %s.arrow, Line %d: Pacbio header line format error\n", core,cline); goto error; } rlen = 0; while (1) { eof = (fgets(read+rlen,MAX_NAME,input) == NULL); cline += 1; x = strlen(read+rlen)-1; if (read[rlen+x] != '\n') { if (read[rlen] == '>') { fprintf(stderr,"File %s.arrow, Line %d:",core,cline); fprintf(stderr," Fasta header line is too long (> %d chars)\n", MAX_NAME-2); goto error; } else x += 1; } if (eof || read[rlen] == '>') break; rlen += x; if (rlen + MAX_NAME > rmax) { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_NAME; read = (char *) realloc(read,rmax+1); if (read == NULL) { fprintf(stderr,"File %s.arrow, Line %d:",core,cline); fprintf(stderr," Out of memory (Allocating line buffer)\n"); goto error; } } } read[rlen] = '\0'; for (x = 0; x < 4; x++) cnr[x] = (uint32) (snr[x] * 100.); *((uint64 *) &(reads[i].coff)) = ((uint64) cnr[0]) << 48 | ((uint64) cnr[1]) << 32 | ((uint64) cnr[2]) << 16 | ((uint64) cnr[3]); Number_Arrow(read); Compress_Read(rlen,read); clen = COMPRESSED_LEN(rlen); fwrite(read,1,clen,arrow); } } } if (!eof) { if (PIPE) fprintf(stderr,"%s: Too many reads on input while handling %s.fasta\n", Prog_Name,lname); else fprintf(stderr,"%s: Too many reads in %s.arrow while handling %s.fasta\n", Prog_Name,core,lname); goto error; } if ( ! PIPE && cell >= nfiles) { fclose(input); free(core); free(path); if (next_file(ng)) { if (ng->name == NULL) goto error; core = Root(ng->name,".arrow"); fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core); goto error; } } } // Write the db record and read index into .idx and clean up db.allarr |= DB_ARROW; rewind(indx); fwrite(&db,sizeof(DAZZ_DB),1,indx); fwrite(reads,sizeof(DAZZ_READ),db.ureads,indx); fclose(istub); fclose(indx); fclose(arrow); exit (0); // Error exit: Either truncate or remove the .arw file as appropriate. 
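  //   boff holds the length of .arw before this run (0 if the file was just created), so on
  //   failure the file is either truncated back to its prior length or, if it did not exist
  //   before, removed altogether, leaving the DB in its original state.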
error: if (boff != 0) { fseeko(arrow,0,SEEK_SET); if (ftruncate(fileno(arrow),boff) < 0) fprintf(stderr,"%s: Fatal: could not restore %s.arw after error, truncate failed\n", Prog_Name,root); } if (arrow != NULL) { fclose(arrow); if (boff == 0) unlink(Catenate(pwd,PATHSEP,root,".arw")); } fclose(istub); fclose(indx); exit (1); } DAZZ_DB-master/fasta2DAM.c000066400000000000000000000506511322703422500153410ustar00rootroot00000000000000/******************************************************************************************* * * Add .fasta files to a DB: * Adds the given fasta files in the given order to .db. If the db does not exist * then it is created. All .fasta files added to a given data base must have the same * header format and follow Pacbio's convention. A file cannot be added twice and this * is enforced. The command either builds or appends to the ..idx and ..bps * files, where the index file (.idx) contains information about each read and their offsets * in the base-pair file (.bps) that holds the sequences where each base is compessed * into 2-bits. The two files are hidden by virtue of their names beginning with a '.'. * .db is effectively a stub file with given name that contains an ASCII listing * of the files added to the DB and possibly the block partitioning for the DB if DBsplit * has been called upon it. * * Author: Gene Myers * Date : May 2013 * Modify: DB upgrade: now *add to* or create a DB depending on whether it exists, read * multiple .fasta files (no longer a stdin pipe). * Date : April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-v] ( -f | -i[] | ... 
)"; static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; typedef struct { int argc; char **argv; FILE *input; int count; char *name; } File_Iterator; File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); if (it == NULL) return (NULL); it->argc = argc; it->argv = argv; it->input = input; if (input == NULL) it->count = first; else { it->count = 1; rewind(input); } return (it); } int next_file(File_Iterator *it) { static char nbuffer[MAX_NAME+8]; if (it->input == NULL) { if (it->count >= it->argc) return (0); it->name = it->argv[it->count++]; } else { char *eol; if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL) { if (feof(it->input)) return (0); fprintf(stderr,"%s: IO error reading line %d of -f file of names\n",Prog_Name,it->count); it->name = NULL; return (1); } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n", Prog_Name,it->count,MAX_NAME+7); it->name = NULL; return (1); } *eol = '\0'; it->count += 1; it->name = nbuffer; } return (1); } int main(int argc, char *argv[]) { FILE *istub, *ostub; char *dbname; char *root, *pwd; FILE *bases, *indx, *hdrs; int64 boff, ioff, hoff, noff; int ifiles, ofiles; char **flist; DAZZ_DB db; int ureads; int64 offset, hdrset; char *PIPE; FILE *IFILE; int VERBOSE; // Process command line { int i, j, k; int flags[128]; ARG_INIT("fasta2DAM") IFILE = NULL; PIPE = NULL; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("v") break; case 'f': IFILE = fopen(argv[i]+2,"r"); if (IFILE == NULL) { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2); exit (1); } break; case 'i': PIPE = argv[i]+2; if (PIPE[0] != '\0') { FILE *temp; temp = fopen(PIPE,"w"); if (temp == NULL) { fprintf(stderr,"%s: Cannot create -i name '%s'\n",Prog_Name,argv[i]+2); exit (1); } fclose(temp); unlink(PIPE); } break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (IFILE != NULL && PIPE != NULL) { fprintf(stderr,"%s: Cannot use both -f and -i together\n",Prog_Name); exit (1); } if ( (IFILE == NULL && PIPE == NULL && argc <= 2) || ((IFILE != NULL || PIPE != NULL) && argc != 2)) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -f: import files listed 1/line in given file.\n"); fprintf(stderr," -i: import data from stdin, use optiona name as data source.\n"); fprintf(stderr," : otherwise, import sequence of specified files.\n"); exit (1); } } // Try to open DAM file, if present then adding to DAM, otherwise creating new DAM. 
Set up // variables as follows: // dbname = full name of map index = /.dam // istub = open db file (if adding) or NULL (if creating) // ostub = new image of db file (will overwrite old image at end) // bases = .bps file positioned for appending // indx = .idx file positioned for appending // hdrs = .hdr file positioned for appending // ureads = # of reads currently in db // offset = offset in .bps at which to place next sequence // hdrset = offset in .hdr at which to place next header // ioff = offset in .idx file to truncate to if command fails // boff = offset in .bps file to truncate to if command fails // hoff = offset in .hdr file to truncate to if command fails // ifiles = # of .fasta files to add // ofiles = # of .fasta files added so far // flist = [0..ifiles+ofiles] list of file names (root only) added to dam so far { int i; root = Root(argv[1],".dam"); pwd = PathTo(argv[1]); dbname = Strdup(Catenate(pwd,"/",root,".dam"),"Allocating map index name"); if (dbname == NULL) exit (1); if (PIPE != NULL) ifiles = 1; else if (IFILE == NULL) ifiles = argc-2; else { File_Iterator *ng; ifiles = 0; ng = init_file_iterator(argc,argv,IFILE,2); if (ng == NULL) exit (1); while (next_file(ng)) { if (ng->name == NULL) exit (1); ifiles += 1; } free(ng); } bases = NULL; indx = NULL; hdrs = NULL; ostub = NULL; ioff = 0; boff = 0; hoff = 0; istub = fopen(dbname,"r"); if (istub == NULL) { ofiles = 0; bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w+"); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w+"); hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"w+"); if (bases == NULL || indx == NULL || hdrs == NULL) goto error; fwrite(&db,sizeof(DAZZ_DB),1,indx); ureads = 0; offset = 0; hdrset = 0; boff = 0; ioff = 0; hoff = 0; } else { if (fscanf(istub,DB_NFILE,&ofiles) != 1) { fprintf(stderr,"%s: %s.dam is corrupted, read failed 1\n",Prog_Name,root); goto error; } bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"r+"); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+"); hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"r+"); if (bases == NULL || indx == NULL || hdrs == NULL) goto error; if (fread(&db,sizeof(DAZZ_DB),1,indx) != 1) { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root); goto error; } fseeko(bases,0,SEEK_END); fseeko(indx, 0,SEEK_END); fseeko(hdrs, 0,SEEK_END); ureads = db.ureads; offset = ftello(bases); hdrset = ftello(hdrs); boff = offset; ioff = ftello(indx); hoff = hdrset; } flist = (char **) Malloc(sizeof(char *)*(ofiles+ifiles),"Allocating file list"); ostub = Fopen(Catenate(pwd,"/",root,".dbx"),"w+"); if (ostub == NULL || flist == NULL) goto error; fprintf(ostub,DB_NFILE,ofiles+ifiles); noff = 0; for (i = 0; i < ofiles; i++) { int last; char prolog[MAX_NAME], fname[MAX_NAME]; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { fprintf(stderr,"%s: %s.dam is corrupted, read failed 5(%d)\n",Prog_Name,root,i); goto error; } if ((flist[i] = Strdup(fname,"Adding to file list")) == NULL) goto error; noff = ftello(ostub); fprintf(ostub,DB_FDATA,last,fname,prolog); } } { int maxlen; int64 totlen, count[4]; int rmax; DAZZ_READ prec; char *read; int append; int c; File_Iterator *ng = NULL; // Buffer for accumulating .fasta sequence over multiple lines rmax = MAX_NAME + 60000; read = (char *) Malloc(rmax+1,"Allocating line buffer"); if (read == NULL) goto error; totlen = 0; // total # of bases in new .fasta files maxlen = 0; // longest read in new .fasta files for (c = 0; c < 4; c++) // count of acgt in new .fasta files count[c] = 0; // For each .fasta file do: if (PIPE == NULL) 
{ ng = init_file_iterator(argc,argv,IFILE,2); if (ng == NULL) goto error; } while (PIPE != NULL || next_file(ng)) { FILE *input; char *path, *core; int nline, eof, rlen; // Open it: /.fasta if file, stdin otherwise with core = PIPE or "stdout" if (PIPE == NULL) { if (ng->name == NULL) goto error; path = PathTo(ng->name); core = Root(ng->name,".fasta"); if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL) goto error; free(path); } else { if (PIPE[0] == '\0') core = Strdup("stdout","Allocating file name"); else core = Strdup(PIPE,"Allocating file name"); if (core == NULL) goto error; input = stdin; } // Check that core is not too long and name is unique or last source if PIPE'd // If PIPE'd and last source, then overwrite last file line of new stub file. if (strlen(core) >= MAX_NAME) { fprintf(stderr,"%s: File name over %d chars: '%.200s'\n", Prog_Name,MAX_NAME,core); goto error; } { int j; append = 0; if (PIPE == NULL || (strcmp(core,"stdout") != 0 && (ofiles == 0 || strcmp(core,flist[ofiles-1]) != 0))) { for (j = 0; j < ofiles; j++) if (strcmp(core,flist[j]) == 0) { fprintf(stderr,"%s: File %s.fasta is already in database %s.dam\n", Prog_Name,core,Root(argv[1],".dam")); goto error; } } else if (ofiles > 0 && strcmp(core,flist[ofiles-1]) == 0) { fseeko(ostub,noff,SEEK_SET); append = 1; } } // Get the header of the first line. If the file is empty skip. rlen = 0; nline = 1; eof = (fgets(read,MAX_NAME,input) == NULL); if (eof || strlen(read) < 1) { fprintf(stderr,"Skipping '%s', file is empty!\n",core); fclose(input); free(core); continue; } // Add the file name to flist if (VERBOSE) { if (PIPE != NULL && PIPE[0] == '\0') fprintf(stderr,"Adding scaffolds from stdio ...\n"); else fprintf(stderr,"Adding '%s.fasta' ...\n",core); fflush(stderr); } if (!append) flist[ofiles++] = core; // Check that the first line is a header line if (read[strlen(read)-1] != '\n') { fprintf(stderr,"File %s.fasta, Line 1: Fasta line is too long (> %d chars)\n", core,MAX_NAME-2); goto error; } if (!eof && read[0] != '>') { fprintf(stderr,"File %s.fasta, Line 1: First header in fasta file is missing\n",core); goto error; } // Read in all the sequences until end-of-file { int i, x, n; while (!eof) { int hlen; read[rlen] = '>'; hlen = strlen(read+rlen); fwrite(read+rlen,1,hlen,hdrs); rlen = 0; while (1) { eof = (fgets(read+rlen,MAX_NAME,input) == NULL); nline += 1; x = strlen(read+rlen)-1; if (read[rlen+x] != '\n') { fprintf(stderr,"File %s.fasta, Line %d:",core,nline); fprintf(stderr," Fasta line is too long (> %d chars)\n",MAX_NAME-2); goto error; } if (eof || read[rlen] == '>') break; rlen += x; if (rlen + MAX_NAME > rmax) { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_NAME; read = (char *) realloc(read,rmax+1); if (read == NULL) { fprintf(stderr,"File %s.fasta, Line %d:",core,nline); fprintf(stderr," Out of memory (Allocating line buffer)\n"); goto error; } } } read[rlen] = '\0'; n = 0; i = -1; while (i < rlen) { int pbeg, plen, clen; while (i < rlen) if (number[(int) read[++i]] < 4) break; if (i >= rlen) break; pbeg = i; prec.fpulse = pbeg; prec.origin = n++; prec.boff = offset; prec.coff = hdrset; prec.flags = DB_BEST; while (i < rlen) { x = number[(int) read[i]]; if (x >= 4) break; count[x] += 1; read[i++] = (char) x; } prec.rlen = plen = i-pbeg; ureads += 1; totlen += plen; if (plen > maxlen) maxlen = plen; Compress_Read(plen,read+pbeg); clen = COMPRESSED_LEN(plen); fwrite(read+pbeg,1,clen,bases); offset += clen; fwrite(&prec,sizeof(DAZZ_READ),1,indx); } hdrset += hlen; } } 
fprintf(ostub,DB_FDATA,ureads,core,core); if (PIPE == NULL) fclose(input); else break; } // Update relevant fields in db record db.ureads = ureads; if (istub == NULL) { for (c = 0; c < 4; c++) db.freq[c] = (float) ((1.*count[c])/totlen); db.totlen = totlen; db.maxlen = maxlen; db.cutoff = -1; db.allarr = 0; } else { for (c = 0; c < 4; c++) db.freq[c] = (float) ((db.freq[c]*db.totlen + (1.*count[c]))/(db.totlen + totlen)); db.totlen += totlen; if (maxlen > db.maxlen) db.maxlen = maxlen; } } // If db has been previously partitioned then calculate additional partition points and // write to new db file image if (db.cutoff >= 0) { int64 totlen, dbpos, size; int nblock, ireads, tfirst, rlen; int ufirst, cutoff, allflag; DAZZ_READ record; int i; if (VERBOSE) { fprintf(stderr,"Updating block partition ...\n"); fflush(stderr); } // Read the block portion of the existing db image getting the indices of the first // read in the last block of the exisiting db as well as the partition parameters. // Copy the old image block information to the new block information (except for // the indices of the last partial block) if (fscanf(istub,DB_NBLOCK,&nblock) != 1) { fprintf(stderr,"%s: %s.dam is corrupted, read failed 2\n",Prog_Name,root); goto error; } dbpos = ftello(ostub); fprintf(ostub,DB_NBLOCK,0); if (fscanf(istub,DB_PARAMS,&size,&cutoff,&allflag) != 3) { fprintf(stderr,"%s: %s.dam is corrupted, read failed 3\n",Prog_Name,root); goto error; } fprintf(ostub,DB_PARAMS,size,cutoff,allflag); if (allflag) allflag = 0; else allflag = DB_BEST; nblock -= 1; for (i = 0; i <= nblock; i++) { if (fscanf(istub,DB_BDATA,&ufirst,&tfirst) != 2) { fprintf(stderr,"%s: %s.dam is corrupted, read failed 4\n",Prog_Name,root); goto error; } fprintf(ostub,DB_BDATA,ufirst,tfirst); } // Seek the first record of the last block of the existing db in .idx, and then // compute and record partition indices for the rest of the db from this point // forward. fseeko(indx,sizeof(DAZZ_DB)+sizeof(DAZZ_READ)*ufirst,SEEK_SET); totlen = 0; ireads = 0; for (i = ufirst; i < ureads; i++) { if (fread(&record,sizeof(DAZZ_READ),1,indx) != 1) { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root); goto error; } rlen = record.rlen; if (rlen >= cutoff) { ireads += 1; tfirst += 1; totlen += rlen; if (totlen >= size) { fprintf(ostub," %9d %9d\n",i+1,tfirst); totlen = 0; ireads = 0; nblock += 1; } } } if (ireads > 0) { fprintf(ostub,DB_BDATA,ureads,tfirst); nblock += 1; } db.treads = tfirst; fseeko(ostub,dbpos,SEEK_SET); fprintf(ostub,DB_NBLOCK,nblock); // Rewind and record the new number of blocks } else db.treads = ureads; rewind(ostub); fprintf(ostub,DB_NFILE,ofiles); rewind(indx); fwrite(&db,sizeof(DAZZ_DB),1,indx); // Write the finalized db record into .idx if (istub != NULL) fclose(istub); fclose(ostub); fclose(indx); fclose(bases); fclose(hdrs); rename(Catenate(pwd,"/",root,".dbx"),dbname); // New image replaces old image exit (0); // Error exit: Either truncate or remove the .idx, .bps, and .hdr files as appropriate. 
// Remove the new image file /.dbx error: if (ioff != 0) { fseeko(indx,0,SEEK_SET); if (ftruncate(fileno(indx),ioff) < 0) fprintf(stderr,"%s: Fatal: could not restore %s.idx after error, truncate failed\n", Prog_Name,root); } if (boff != 0) { fseeko(bases,0,SEEK_SET); if (ftruncate(fileno(bases),boff) < 0) fprintf(stderr,"%s: Fatal: could not restore %s.bps after error, truncate failed\n", Prog_Name,root); } if (hoff != 0) { fseeko(hdrs,0,SEEK_SET); if (ftruncate(fileno(hdrs),hoff) < 0) fprintf(stderr,"%s: Fatal: could not restore %s.hdr after error, truncate failed\n", Prog_Name,root); } if (indx != NULL) { fclose(indx); if (ioff == 0) unlink(Catenate(pwd,PATHSEP,root,".idx")); } if (bases != NULL) { fclose(bases); if (boff == 0) unlink(Catenate(pwd,PATHSEP,root,".bps")); } if (hdrs != NULL) { fclose(hdrs); if (hoff == 0) unlink(Catenate(pwd,PATHSEP,root,".hdr")); } if (ostub != NULL) { fclose(ostub); unlink(Catenate(pwd,"/",root,".dbx")); } if (istub != NULL) fclose(istub); exit (1); } DAZZ_DB-master/fasta2DB.c000066400000000000000000000560201322703422500152210ustar00rootroot00000000000000/******************************************************************************************* * * Add .fasta files to a DB: * Adds the given fasta files in the given order to .db. If the db does not exist * then it is created. All .fasta files added to a given data base must have the same * header format and follow Pacbio's convention. A file cannot be added twice and this * is enforced. The command either builds or appends to the ..idx and ..bps * files, where the index file (.idx) contains information about each read and their offsets * in the base-pair file (.bps) that holds the sequences where each base is compessed * into 2-bits. The two files are hidden by virtue of their names beginning with a '.'. * .db is effectively a stub file with given name that contains an ASCII listing * of the files added to the DB and possibly the block partitioning for the DB if DBsplit * has been called upon it. * * Author: Gene Myers * Date : May 2013 * Modify: DB upgrade: now *add to* or create a DB depending on whether it exists, read * multiple .fasta files (no longer a stdin pipe). * Date : April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-v] ( -f | -i[] | ... 
)"; static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; typedef struct { int argc; char **argv; FILE *input; int count; char *name; } File_Iterator; File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); if (it == NULL) return (NULL); it->argc = argc; it->argv = argv; it->input = input; if (input == NULL) it->count = first; else { it->count = 1; rewind(input); } return (it); } int next_file(File_Iterator *it) { static char nbuffer[MAX_NAME+8]; if (it->input == NULL) { if (it->count >= it->argc) return (0); it->name = it->argv[it->count++]; } else { char *eol; if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL) { if (feof(it->input)) return (0); fprintf(stderr,"%s: IO error reading line %d of -f file of names\n",Prog_Name,it->count); it->name = NULL; return (1); } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n", Prog_Name,it->count,MAX_NAME+7); it->name = NULL; return (1); } *eol = '\0'; it->count += 1; it->name = nbuffer; } return (1); } int main(int argc, char *argv[]) { FILE *istub, *ostub; char *dbname; char *root, *pwd; FILE *bases, *indx; int64 boff, ioff; int ifiles, ofiles, ocells; char **flist; DAZZ_DB db; int ureads; int64 offset; char *PIPE; FILE *IFILE; int VERBOSE; // Process command line { int i, j, k; int flags[128]; ARG_INIT("fasta2DB") IFILE = NULL; PIPE = NULL; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("v") break; case 'f': IFILE = fopen(argv[i]+2,"r"); if (IFILE == NULL) { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2); exit (1); } break; case 'i': PIPE = argv[i]+2; if (PIPE[0] != '\0') { FILE *temp; temp = fopen(PIPE,"w"); if (temp == NULL) { fprintf(stderr,"%s: Cannot create -i name '%s'\n",Prog_Name,argv[i]+2); exit (1); } fclose(temp); if (unlink(PIPE) != 0) fprintf(stderr,"%s: [WARNING] Could not delete temporary file %s\n", Prog_Name,PIPE); } break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (IFILE != NULL && PIPE != NULL) { fprintf(stderr,"%s: Cannot use both -f and -i together\n",Prog_Name); exit (1); } if ( (IFILE == NULL && PIPE == NULL && argc <= 2) || ((IFILE != NULL || PIPE != NULL) && argc != 2)) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -f: import files listed 1/line in given file.\n"); fprintf(stderr," -i: import data from stdin, use optiona name as data source.\n"); fprintf(stderr," : otherwise, import sequence of specified files.\n"); exit (1); } } // Try to open DB file, if present then adding to DB, otherwise creating new DB. 
Set up // variables as follows: // dbname = full name of db = /.db // istub = open db file (if adding) or NULL (if creating) // ostub = new image of db file (will overwrite old image at end) // bases = .bps file positioned for appending // indx = .idx file positioned for appending // ureads = # of reads currently in db // offset = offset in .bps at which to place next sequence // ioff = offset in .idx file to truncate to if command fails // boff = offset in .bps file to truncate to if command fails // ifiles = # of .fasta files to add // ofiles = # of .fasta files added so far // ocells = # of SMRT cells already in db // flist = [0..ifiles+ocells] list of file names (root only) added to db so far { int i; root = Root(argv[1],".db"); pwd = PathTo(argv[1]); dbname = Strdup(Catenate(pwd,"/",root,".db"),"Allocating db name"); if (dbname == NULL) exit (1); if (PIPE != NULL) ifiles = 1; else if (IFILE == NULL) ifiles = argc-2; else { File_Iterator *ng; ifiles = 0; ng = init_file_iterator(argc,argv,IFILE,2); if (ng == NULL) exit (1); while (next_file(ng)) { if (ng->name == NULL) exit (1); ifiles += 1; } free(ng); } bases = NULL; indx = NULL; ostub = NULL; ioff = 0; boff = 0; istub = fopen(dbname,"r"); if (istub == NULL) { ocells = 0; bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w+"); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w+"); if (bases == NULL || indx == NULL) goto error; fwrite(&db,sizeof(DAZZ_DB),1,indx); ureads = 0; offset = 0; } else { if (fscanf(istub,DB_NFILE,&ocells) != 1) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root); exit (1); } bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"r+"); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+"); if (bases == NULL || indx == NULL) exit (1); if (fread(&db,sizeof(DAZZ_DB),1,indx) != 1) { if (ferror(indx)) fprintf(stderr,"%s: System error, read failed\n",Prog_Name); else fprintf(stderr,"%s: File %s.idx is corrupted\n",Prog_Name,root); exit (1); } if (fseeko(bases,0,SEEK_END) < 0) SYSTEM_READ_ERROR if (fseeko(indx, 0,SEEK_END) < 0) SYSTEM_READ_ERROR ureads = db.ureads; offset = ftello(bases); boff = offset; ioff = ftello(indx); if (boff < 0 || ioff < 0) SYSTEM_READ_ERROR } flist = (char **) Malloc(sizeof(char *)*(ocells+ifiles),"Allocating file list"); ostub = Fopen(Catenate(pwd,"/",root,".dbx"),"w+"); if (ostub == NULL || flist == NULL) goto error; if (fprintf(ostub,DB_NFILE,ocells+ifiles) < 0) // Will write again with correct value at end { fprintf(stderr,"%s: System error, write failed\n",Prog_Name); goto error; } ofiles = 0; for (i = 0; i < ocells; i++) { int last; char prolog[MAX_NAME], fname[MAX_NAME]; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { if (ferror(istub)) fprintf(stderr,"%s: System error, read failed\n",Prog_Name); else fprintf(stderr,"%s: File %s.db is corrupted\n",Prog_Name,root); goto error; } if (ofiles == 0 || strcmp(flist[ofiles-1],fname) != 0) if ((flist[ofiles++] = Strdup(fname,"Adding to file list")) == NULL) goto error; if (fprintf(ostub,DB_FDATA,last,fname,prolog) < 0) { fprintf(stderr,"%s: System error, write failed\n",Prog_Name); goto error; } } } { int maxlen; int64 totlen, count[4]; int pmax, rmax; DAZZ_READ *prec; char *read; int c; File_Iterator *ng = NULL; // Buffer for reads all in the same well pmax = 100; prec = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*pmax,"Allocating record buffer"); if (prec == NULL) goto error; // Buffer for accumulating .fasta sequence over multiple lines rmax = MAX_NAME + 60000; read = (char *) Malloc(rmax+1,"Allocating line buffer"); if (read 
== NULL) goto error; totlen = 0; // total # of bases in new .fasta files maxlen = 0; // longest read in new .fasta files for (c = 0; c < 4; c++) // count of acgt in new .fasta files count[c] = 0; // For each new input source do if (PIPE == NULL) { ng = init_file_iterator(argc,argv,IFILE,2); // Setup to read .fasta's if (ng == NULL) // from command line or file goto error; } while (PIPE != NULL || next_file(ng)) { FILE *input; char prolog[MAX_NAME]; char *path, *core; int eof; // Open it: /.fasta if file, stdin otherwise with core = PIPE or "stdout" if (PIPE == NULL) { if (ng->name == NULL) goto error; path = PathTo(ng->name); core = Root(ng->name,".fasta"); if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL) goto error; free(path); } else { if (PIPE[0] == '\0') core = Strdup("stdout","Allocating file name"); else core = Strdup(PIPE,"Allocating file name"); if (core == NULL) goto error; input = stdin; } // Get the header of the first line. If the file is empty skip. eof = (fgets(read,MAX_NAME,input) == NULL); if (eof || strlen(read) < 1) { free(core); fclose(input); if (PIPE != NULL) { fprintf(stderr,"Standard input is empty, terminating!\n"); break; } else { fprintf(stderr,"Skipping '%s', file is empty!\n",core); continue; } } // Check that core is not too long and name is unique or last source if PIPE'd if (strlen(core) >= MAX_NAME) { fprintf(stderr,"%s: File name over %d chars: '%.200s'\n", Prog_Name,MAX_NAME,core); goto error; } { int j; if (PIPE == NULL || (strcmp(core,"stdout") != 0 && (ofiles == 0 || strcmp(core,flist[ofiles-1]) != 0))) for (j = 0; j < ofiles; j++) if (strcmp(core,flist[j]) == 0) { fprintf(stderr,"%s: File %s.fasta is already in database %s.db\n", Prog_Name,core,Root(argv[1],".db")); goto error; } } // Add the file name to flist if (VERBOSE) { if (PIPE != NULL && PIPE[0] == '\0') fprintf(stderr,"Adding reads from stdio ...\n"); else fprintf(stderr,"Adding '%s.fasta' ...\n",core); fflush(stderr); } flist[ofiles++] = core; // Check that the first line is a header and has PACBIO format. 
if (read[strlen(read)-1] != '\n') { fprintf(stderr,"File %s.fasta, Line 1: Fasta line is too long (> %d chars)\n", core,MAX_NAME-2); goto error; } if (!eof && read[0] != '>') { fprintf(stderr,"File %s.fasta, Line 1: First header in fasta file is missing\n",core); goto error; } { char *find; int well, beg, end, qv; find = index(read+1,'/'); if (find != NULL && sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) >= 3) { *find = '\0'; strcpy(prolog,read+1); *find = '/'; } else { fprintf(stderr,"File %s.fasta, Line 1: Pacbio header line format error\n",core); goto error; } } // Read in all the sequences until end-of-file { int i, x; int nline, pwell, rlen, pcnt; pcnt = 0; rlen = 0; nline = 1; pwell = -1; while (!eof) { int beg, end, clen; int well, qv; char *find; find = index(read+(rlen+1),'/'); if (find == NULL) { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n", core,nline); goto error; } *find = '\0'; if (strcmp(read+(rlen+1),prolog) != 0) { fprintf(ostub,DB_FDATA,ureads,core,prolog); ocells += 1; strcpy(prolog,read+(rlen+1)); } *find = '/'; x = sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv); if (x < 3) { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n", core,nline); goto error; } else if (x == 3) qv = 0; rlen = 0; while (1) { eof = (fgets(read+rlen,MAX_NAME,input) == NULL); nline += 1; x = strlen(read+rlen)-1; if (read[rlen+x] != '\n') { if (read[rlen] == '>') { fprintf(stderr,"File %s.fasta, Line %d:",core,nline); fprintf(stderr," Fasta header line is too long (> %d chars)\n", MAX_NAME-2); goto error; } else x += 1; } if (eof || read[rlen] == '>') break; rlen += x; if (rlen + MAX_NAME > rmax) { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_NAME; read = (char *) realloc(read,rmax+1); if (read == NULL) { fprintf(stderr,"File %s.fasta, Line %d:",core,nline); fprintf(stderr," Out of memory (Allocating line buffer)\n"); goto error; } } } read[rlen] = '\0'; for (i = 0; i < rlen; i++) { x = number[(int) read[i]]; count[x] += 1; read[i] = (char) x; } ureads += 1; totlen += rlen; if (rlen > maxlen) maxlen = rlen; prec[pcnt].origin = well; prec[pcnt].fpulse = beg; prec[pcnt].rlen = rlen; prec[pcnt].boff = offset; prec[pcnt].coff = -1; prec[pcnt].flags = qv; Compress_Read(rlen,read); clen = COMPRESSED_LEN(rlen); fwrite(read,1,clen,bases); offset += clen; if (pwell == well) { prec[pcnt].flags |= DB_CSS; pcnt += 1; if (pcnt >= pmax) { pmax = ((int) (pcnt*1.2)) + 100; prec = (DAZZ_READ *) realloc(prec,sizeof(DAZZ_READ)*pmax); if (prec == NULL) { fprintf(stderr,"File %s.fasta, Line %d: Out of memory",core,nline); fprintf(stderr," (Allocating read records)\n"); goto error; } } } else if (pcnt == 0) pcnt += 1; else { x = 0; for (i = 1; i < pcnt; i++) if (prec[i].rlen > prec[x].rlen) x = i; prec[x].flags |= DB_BEST; fwrite(prec,sizeof(DAZZ_READ),pcnt,indx); prec[0] = prec[pcnt]; pcnt = 1; } pwell = well; } // Complete processing of .fasta file: flush last well group, write file line // in db image, and close file x = 0; for (i = 1; i < pcnt; i++) if (prec[i].rlen > prec[x].rlen) x = i; prec[x].flags |= DB_BEST; fwrite(prec,sizeof(DAZZ_READ),pcnt,indx); } fprintf(ostub,DB_FDATA,ureads,core,prolog); ocells += 1; if (input != stdin) fclose(input); else break; } // Finished loading all sequences: update relevant fields in db record db.ureads = ureads; if (istub == NULL) { for (c = 0; c < 4; c++) db.freq[c] = (float) ((1.*count[c])/totlen); db.totlen = totlen; db.maxlen = maxlen; db.cutoff = -1; db.allarr = 0; } else { for (c = 0; c < 4; c++) 
db.freq[c] = (float) ((db.freq[c]*db.totlen + (1.*count[c]))/(db.totlen + totlen)); db.totlen += totlen; if (maxlen > db.maxlen) db.maxlen = maxlen; } } // If db has been previously partitioned then calculate additional partition points and // write to new db file image if (db.cutoff >= 0) { int64 totlen, dbpos, size; int nblock, ireads, tfirst, rlen; int ufirst, cutoff, allflag; DAZZ_READ record; int i; if (VERBOSE) { fprintf(stderr,"Updating block partition ...\n"); fflush(stderr); } // Read the block portion of the existing db image getting the indices of the first // read in the last block of the exisiting db as well as the partition parameters. // Copy the old image block information to the new block information (except for // the indices of the last partial block) if (fscanf(istub,DB_NBLOCK,&nblock) != 1) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root); goto error; } dbpos = ftello(ostub); fprintf(ostub,DB_NBLOCK,0); if (fscanf(istub,DB_PARAMS,&size,&cutoff,&allflag) != 3) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root); goto error; } fprintf(ostub,DB_PARAMS,size,cutoff,allflag); if (allflag) allflag = 0; else allflag = DB_BEST; nblock -= 1; for (i = 0; i <= nblock; i++) { if (fscanf(istub,DB_BDATA,&ufirst,&tfirst) != 2) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root); goto error; } fprintf(ostub,DB_BDATA,ufirst,tfirst); } // Seek the first record of the last block of the existing db in .idx, and then // compute and record partition indices for the rest of the db from this point // forward. fseeko(indx,sizeof(DAZZ_DB)+sizeof(DAZZ_READ)*ufirst,SEEK_SET); totlen = 0; ireads = 0; for (i = ufirst; i < ureads; i++) { if (fread(&record,sizeof(DAZZ_READ),1,indx) != 1) { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root); goto error; } rlen = record.rlen; if (rlen >= cutoff && (record.flags & DB_BEST) >= allflag) { ireads += 1; tfirst += 1; totlen += rlen; if (totlen >= size) { fprintf(ostub," %9d %9d\n",i+1,tfirst); totlen = 0; ireads = 0; nblock += 1; } } } if (ireads > 0) { fprintf(ostub,DB_BDATA,ureads,tfirst); nblock += 1; } db.treads = tfirst; fseeko(ostub,dbpos,SEEK_SET); fprintf(ostub,DB_NBLOCK,nblock); // Rewind and record the new number of blocks } else db.treads = ureads; rewind(indx); fwrite(&db,sizeof(DAZZ_DB),1,indx); // Write the finalized db record into .idx rewind(ostub); // Rewrite the number of files actually added fprintf(ostub,DB_NFILE,ocells); if (istub != NULL) fclose(istub); fclose(ostub); fclose(indx); fclose(bases); rename(Catenate(pwd,"/",root,".dbx"),dbname); // New image replaces old image exit (0); // Error exit: Either truncate or remove the .idx and .bps files as appropriate. 
// Remove the new image file /.dbx error: if (ioff != 0) { fseeko(indx,0,SEEK_SET); if (ftruncate(fileno(indx),ioff) < 0) fprintf(stderr,"%s: Fatal: could not restore %s.idx after error, truncate failed\n", Prog_Name,root); } if (boff != 0) { fseeko(bases,0,SEEK_SET); if (ftruncate(fileno(bases),boff) < 0) fprintf(stderr,"%s: Fatal: could not restore %s.bps after error, truncate failed\n", Prog_Name,root); } if (indx != NULL) { fclose(indx); if (ioff == 0) unlink(Catenate(pwd,PATHSEP,root,".idx")); } if (bases != NULL) { fclose(bases); if (boff == 0) unlink(Catenate(pwd,PATHSEP,root,".bps")); } if (ostub != NULL) { fclose(ostub); unlink(Catenate(pwd,"/",root,".dbx")); } if (istub != NULL) fclose(istub); exit (1); } DAZZ_DB-master/quiva2DB.c000066400000000000000000000363511322703422500152550ustar00rootroot00000000000000/******************************************************************************************* * * Adds the given .quiva files to an existing DB "path". The input files must be added in * the same order as the .fasta files were and have the same root names, e.g. FOO.fasta * and FOO.quiva. The files can be added incrementally but must be added in the same order * as the .fasta files. This is enforced by the program. With the -l option set the * compression scheme is a bit lossy to get more compression (see the description of dexqv * in the DEXTRACTOR module). * * Author: Gene Myers * Date : July 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #include "QV.h" // Compiled in INTERACTIVE mode as all routines must return with an error // so that cleanup and restore is possible. #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-v] ( -f | -i | ... 
)"; typedef struct { int argc; char **argv; FILE *input; int count; char *name; } File_Iterator; File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); if (it == NULL) return (NULL); it->argc = argc; it->argv = argv; it->input = input; if (input == NULL) it->count = first; else { it->count = 1; rewind(input); } return (it); } int next_file(File_Iterator *it) { static char nbuffer[MAX_NAME+8]; if (it->input == NULL) { if (it->count >= it->argc) return (0); it->name = it->argv[it->count++]; } else { char *eol; if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL) { if (feof(it->input)) return (0); fprintf(stderr,"%s: IO error reading line %d of -f file of names\n",Prog_Name,it->count); it->name = NULL; return (1); } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n", Prog_Name,it->count,MAX_NAME+7); it->name = NULL; return (1); } *eol = '\0'; it->count += 1; it->name = nbuffer; } return (1); } int main(int argc, char *argv[]) { FILE *istub; char *root, *pwd; FILE *quiva, *indx; int64 coff; DAZZ_DB db; DAZZ_READ *reads; int nfiles; FILE *temp; char *tname; int VERBOSE; int PIPE; FILE *INFILE; // Process command line { int i, j, k; int flags[128]; ARG_INIT("quiva2DB") INFILE = NULL; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vli") break; case 'f': INFILE = fopen(argv[i]+2,"r"); if (INFILE == NULL) { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2); exit (1); } break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; PIPE = flags['i']; if (INFILE != NULL && PIPE) { fprintf(stderr,"%s: Cannot use both -f and -i together\n",Prog_Name); exit (1); } if ( (INFILE == NULL && ! PIPE && argc <= 2) || ((INFILE != NULL || PIPE) && argc != 2)) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -f: import files listed 1/line in given file.\n"); fprintf(stderr," -i: import data from stdin.\n"); fprintf(stderr," : otherwise, import sequence of specified files.\n"); exit (1); } } // Open DB stub file, index, and .qvs file for appending. 
Load db and read records, // get number of cells from stub file, and note current offset to end of .qvs root = Root(argv[1],".db"); pwd = PathTo(argv[1]); istub = Fopen(Catenate(pwd,"/",root,".db"),"r"); if (istub == NULL) { fprintf(stderr,"%s",Ebuffer); exit (1); } if (fscanf(istub,DB_NFILE,&nfiles) != 1) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root); exit (1); } indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+"); if (indx == NULL) { fprintf(stderr,"%s",Ebuffer); exit (1); } if (fread(&db,sizeof(DAZZ_DB),1,indx) != 1) { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root); exit (1); } if ((db.allarr & DB_ARROW) != 0) { fprintf(stderr,"%s: Database %s has Arrow data!\n",Prog_Name,root); exit (1); } reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*db.ureads,"Allocating DB index"); if (reads == NULL) { fprintf(stderr,"%s",Ebuffer); exit (1); } if (fread(reads,sizeof(DAZZ_READ),db.ureads,indx) != (size_t) (db.ureads)) { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root); exit (1); } quiva = NULL; temp = NULL; coff = 0; if (reads[0].coff < 0) quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"w"); else quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r+"); tname = Strdup(Catenate(".",PATHSEP,root,Numbered_Suffix("",getpid(),".tmp")), "Allocating temporary name"); temp = Fopen(tname,"w+"); if (quiva == NULL || temp == NULL) { fprintf(stderr,"%s",Ebuffer); goto error; } fseeko(quiva,0,SEEK_END); coff = ftello(quiva); // Do a merged traversal of cell lines in .db stub file and .quiva files to be // imported, driving the loop with the cell line # { FILE *input = NULL; char *path = NULL; char *core = NULL; File_Iterator *ng = NULL; char lname[MAX_NAME]; int first, last, cline; int cell; if (!PIPE) { ng = init_file_iterator(argc,argv,INFILE,2); if (ng == NULL) { fprintf(stderr,"%s",Ebuffer); goto error; } } for (cell = 0; cell < nfiles; cell++) { char prolog[MAX_NAME], fname[MAX_NAME]; if (cell == 0) // First addition, a pipe: find the first cell that does not have .quiva's yet // (error if none) and set input source to stdin. if (PIPE) { first = 0; while (cell < nfiles) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",core,Prog_Name); goto error; } if (reads[first].coff < 0) break; first = last; cell += 1; } if (cell >= nfiles) { fprintf(stderr,"%s: All .quiva's have already been added !?\n",Prog_Name); goto error; } input = stdin; if (VERBOSE) { fprintf(stderr,"Adding quiva's from stdin ...\n"); fflush(stderr); } cline = 0; } // First addition, not a pipe: then get first .quiva file name (error if not one) to // add, find the first cell name whose file name matches (error if none), check that // the previous .quiva's have been added and this is the next slot. Then open // the .quiva file for compression else { if (! 
next_file(ng)) { fprintf(stderr,"%s: file list is empty!\n",Prog_Name); goto error; } if (ng->name == NULL) { fprintf(stderr,"%s",Ebuffer); goto error; } core = Root(ng->name,".quiva"); path = PathTo(ng->name); if ((input = Fopen(Catenate(path,"/",core,".quiva"),"r")) == NULL) { fprintf(stderr,"%s",Ebuffer); goto error; } first = 0; while (cell < nfiles) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",core,Prog_Name); goto error; } if (strcmp(core,fname) == 0) break; first = last; cell += 1; } if (cell >= nfiles) { fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core); goto error; } if (first > 0 && reads[first-1].coff < 0) { fprintf(stderr,"%s: Predecessor of %s.quiva has not been added yet\n", Prog_Name,core); goto error; } if (reads[first].coff >= 0) { fprintf(stderr,"%s: %s.quiva has already been added\n",Prog_Name,core); goto error; } if (VERBOSE) { fprintf(stderr,"Adding '%s.quiva' ...\n",core); fflush(stderr); } cline = 0; } // Not the first addition: get next cell line. If not a pipe and the file name is new, // then close the current .quiva, open the next one and after ensuring the names // match, open it for compression else { first = last; strcpy(lname,fname); if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",core,Prog_Name); goto error; } if (PIPE) { int c; if ((c = fgetc(input)) == EOF) break; ungetc(c,input); } else if (strcmp(lname,fname) != 0) { if (fgetc(input) != EOF) { fprintf(stderr,"%s: Too many reads in %s.quiva while handling %s.fasta\n", Prog_Name,core,fname); goto error; } fclose(input); free(path); free(core); if ( ! next_file(ng)) break; if (ng->name == NULL) { fprintf(stderr,"%s",Ebuffer); goto error; } path = PathTo(ng->name); core = Root(ng->name,".quiva"); if ((input = Fopen(Catenate(path,"/",core,".quiva"),"r")) == NULL) { fprintf(stderr,"%s",Ebuffer); goto error; } if (strcmp(core,fname) != 0) { fprintf(stderr,"%s: Files not being added in order (expect %s, given %s)\n", Prog_Name,fname,core); goto error; } if (VERBOSE) { fprintf(stderr,"Adding '%s.quiva' ...\n",core); fflush(stderr); } cline = 0; } } // Compress reads [first..last) from open .quiva appending to .qvs and record // offset in .coff field of reads (offset of first in a cell is to the compression // table). 
{ int64 qpos; QVcoding *coding; int i, s; rewind(temp); if (ftruncate(fileno(temp),0) < 0) { fprintf(stderr,"%s: System error: could not truncate temporary file\n",Prog_Name); goto error; } Set_QV_Line(cline); s = QVcoding_Scan(input,last-first,temp); if (s < 0) { fprintf(stderr,"%s",Ebuffer); goto error; } if (s != last-first) { if (PIPE) fprintf(stderr,"%s: Insufficient # of reads on input while handling %s.fasta\n", Prog_Name,fname); else fprintf(stderr,"%s: Insufficient # of reads in %s.quiva while handling %s.fasta\n", Prog_Name,core,fname); goto error; } coding = Create_QVcoding(0); if (coding == NULL) { fprintf(stderr,"%s",Ebuffer); goto error; } coding->prefix = Strdup(".qvs","Allocating header prefix"); if (coding->prefix == NULL) { fprintf(stderr,"%s",Ebuffer); goto error; } qpos = ftello(quiva); Write_QVcoding(quiva,coding); // Then compress and append to the .qvs each compressed QV entry rewind(temp); Set_QV_Line(cline); for (i = first; i < last; i++) { s = Read_Lines(temp,1); if (s < -1) { fprintf(stderr,"%s",Ebuffer); goto error; } reads[i].coff = qpos; s = Compress_Next_QVentry(temp,quiva,coding,0); if (s < 0) { fprintf(stderr,"%s",Ebuffer); goto error; } if (s != reads[i].rlen) { fprintf(stderr,"%s: Length of quiva %d is different than fasta in DB\n", Prog_Name,i+1); goto error; } qpos = ftello(quiva); } cline = Get_QV_Line(); Free_QVcoding(coding); } } if (fgetc(input) != EOF) { if (PIPE) fprintf(stderr,"%s: Too many reads on input while handling %s.fasta\n", Prog_Name,lname); else fprintf(stderr,"%s: Too many reads in %s.quiva while handling %s.fasta\n", Prog_Name,core,lname); goto error; } if ( ! PIPE && cell >= nfiles) { fclose(input); free(core); free(path); if (next_file(ng)) { if (ng->name == NULL) { fprintf(stderr,"%s",Ebuffer); goto error; } core = Root(ng->name,".quiva"); fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core); goto error; } } } // Write the db record and read index into .idx and clean up rewind(indx); fwrite(&db,sizeof(DAZZ_DB),1,indx); fwrite(reads,sizeof(DAZZ_READ),db.ureads,indx); fclose(istub); fclose(indx); fclose(quiva); fclose(temp); unlink(tname); exit (0); // Error exit: Either truncate or remove the .qvs file as appropriate. error: if (coff != 0) { fseeko(quiva,0,SEEK_SET); if (ftruncate(fileno(quiva),coff) < 0) fprintf(stderr,"%s: Fatal: could not restore %s.qvs after error, truncate failed\n", Prog_Name,root); } if (quiva != NULL) { fclose(quiva); if (coff == 0) unlink(Catenate(pwd,PATHSEP,root,".qvs")); } if (temp != NULL) { fclose(temp); unlink(tname); } fclose(istub); fclose(indx); exit (1); } DAZZ_DB-master/rangen.c000066400000000000000000000116111322703422500151020ustar00rootroot00000000000000/******************************************************************************************* * * Synthetic DNA shotgun sequence generator * Generate a fake genome of size genlen*1Mb long, that has an AT-bias of -b. * The -r parameter seeds the random number generator for the generation of the genome * so that one can reproducbile produce the same underlying genome to sample from. If * missing, then the job id of the invocation seeds the generator. The sequence is * sent to the standard output in .fasta format. 
* * Author: Gene Myers * Date : April 2016 * ********************************************************************************************/ #include #include #include #include #include static char *Usage = " [-U] [-b] [-w] [-r]"; static int GENOME; // -g option * 1Mbp static double BIAS; // -b option static int HASR = 0; // -r option is set? static int SEED; // -r option static int WIDTH; // -w option static int UPPER; // -U option static char *Prog_Name; // Generate a random DNA sequence of length *len* with an AT-bias of BIAS. // Uppercase letters if UPPER is set, lowercase otherwise. static char *random_genome(int len) { static char *seq = NULL; static double x, PRA, PRC, PRG; int i; if (seq == NULL) { PRA = BIAS/2.; PRC = (1.-BIAS)/2. + PRA; PRG = (1.-BIAS)/2. + PRC; if ((seq = (char *) malloc(WIDTH+1)) == NULL) { fprintf(stderr,"%s: Allocating genome sequence\n",Prog_Name); exit (1); } } if (UPPER) for (i = 0; i < len; i++) { x = drand48(); if (x < PRC) if (x < PRA) seq[i] = 'A'; else seq[i] = 'C'; else if (x < PRG) seq[i] = 'G'; else seq[i] = 'T'; } else for (i = 0; i < len; i++) { x = drand48(); if (x < PRC) if (x < PRA) seq[i] = 'a'; else seq[i] = 'c'; else if (x < PRG) seq[i] = 'g'; else seq[i] = 't'; } seq[len] = '\0'; return (seq); } int main(int argc, char *argv[]) { int i, j; char *eptr; double glen; // Process command arguments // // Usage: [-b] [-r] Prog_Name = strdup("rangen"); WIDTH = 80; BIAS = .5; HASR = 0; UPPER = 0; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: fprintf(stderr,"%s: %s is an illegal option\n",Prog_Name,argv[i]); exit (1); case 'U': if (argv[i][2] != '\0') { fprintf(stderr,"%s: %s is an illegal option\n",Prog_Name,argv[i]); exit (1); } UPPER = 1; break; case 'b': BIAS = strtod(argv[i]+2,&eptr); if (*eptr != '\0' || argv[i][2] == '\0') { fprintf(stderr,"%s: -%c '%s' argument is not a real number\n", Prog_Name,argv[i][1],argv[i]+2); exit (1); } if (BIAS < 0. || BIAS > 1.) { fprintf(stderr,"%s: AT-bias must be in [0,1] (%g)\n",Prog_Name,BIAS); exit (1); } break; case 'r': SEED = strtol(argv[i]+2,&eptr,10); HASR = 1; if (*eptr != '\0' || argv[i][2] == '\0') { fprintf(stderr,"%s: -r argument is not an integer\n",Prog_Name); exit (1); } break; case 'w': WIDTH = strtol(argv[i]+2,&eptr,10); if (*eptr != '\0' || argv[i][2] == '\0') { fprintf(stderr,"%s: -w '%s' argument is not an integer\n",Prog_Name,argv[i]+2); exit (1); } if (WIDTH < 0) { fprintf(stderr,"%s: Line width must be non-negative (%d)\n",Prog_Name,WIDTH); exit (1); } break; } else argv[j++] = argv[i]; argc = j; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } glen = strtod(argv[1],&eptr); if (*eptr != '\0') { fprintf(stderr,"%s: genome length is not a real number\n",Prog_Name); exit (1); } if (glen < 0.) { fprintf(stderr,"%s: Genome length must be positive (%g)\n",Prog_Name,glen); exit (1); } GENOME = (int) (glen*1000000.); // Set up random number generator if (HASR) srand48(SEED); else srand48(getpid()); // Generate the sequence line at a time where all lines have width WDITH, save the last. 
fprintf(stdout,">random len=%d bias=%g\n",GENOME,BIAS); for (j = 0; j+WIDTH < GENOME; j += WIDTH) fprintf(stdout,"%s\n",random_genome(WIDTH)); if (j < GENOME) fprintf(stdout,"%s\n",random_genome(GENOME-j)); exit (0); } DAZZ_DB-master/simulator.c000066400000000000000000000414621322703422500156560ustar00rootroot00000000000000/******************************************************************************************* * * Synthetic DNA shotgun dataset simulator * From a supplied reference genome in the form of a Dazzler .dam, sample reads of * mean length -m from a log-normal length distribution with standard deviation -s, * but ignore reads of length less than -x. Collect enough reads to cover the genome * -c times. Introduce -e fraction errors into each read where the ratio of insertions, * deletions, and substitutions are set by defined constants INS_RATE and DEL_RATE * within generate.c. The fraction -f controls the rate at which reads are picked from * the forward and reverse strands which defaults to 50%. If -C is set then assume the * scaffolds are circular. * * The -r parameter seeds the random number generator for the generation of the genome * so that one can reproducbile produce the same underlying genome to sample from. If * missing, then the job id of the invocation seeds the generator. The output is sent * to the standard output (i.e. it is a pipe). The output is in fasta format (i.e. it is * a UNIX pipe). The output is in Pacbio .fasta format suitable as input to fasta2DB. * * The genome is considered a sequence of *scaffolds* (these are reconstituted from the * Dazzler's internal encoding of a .dam), where the gaps are filled with a random * sequence that follows the base distribution of the contigs of the genome. The program * then samples these filled in scaffolds for reads. If the -C optioin is set then the * program assumes each scaffold is a circular sequence. * * The -M option requests that the scaffold and coordinates from which each read has * been sampled are written to the indicated file, one line per read, ASCII encoded. * This "map" file essentially tells one where every read belongs in an assembly and * is very useful for debugging and testing purposes. If a read pair is say b,e then * if b < e the read was sampled from [b,e] in the forward direction, and from [e,b] * in the reverse direction otherwise. * * Author: Gene Myers * Date : July 2013 * Mod : April 2016 (generates reads w.r.t. a reference genome) * ********************************************************************************************/ #include #include #include #include #include #define PACBIO #include "DB.h" static char *Usage[] = { " [-CU] [-m] [-s] [-e]", " [-c] [-f] [-x]", " [-w] [-r] [-M]", }; static int CIRCULAR; // -C option static int UPPER; // -U option static int RMEAN; // -m option static int RSDEV; // -s option static double ERROR; // -e option static double COVERAGE; // -c option static double FLIP_RATE; // -f option static int RSHORT; // -x option static int WIDTH; // -w option static int HASR; // -r option is set? 
static int SEED; // -r option static FILE *MAP; // -M option #ifdef PACBIO #define INS_RATE .73333 // insert rate (for PB data) #define DEL_RATE .20000 // deletion rate #define IDL_RATE .93333 // insert + delete rate #elif ILLUMINA #define INS_RATE .1 // insert rate (for Illumina data) #define DEL_RATE .1 // deletion rate #define IDL_RATE .2 // insert + delete rate #else #define INS_RATE .33333 // insert rate (equal weighting) #define DEL_RATE .33333 // deletion rate #define IDL_RATE .66666 // insert + delete rate #endif // Complement (in the DNA sense) string *s*. static void complement(int elen, char *s) { char *t; int c; t = s + (elen-1); while (s <= t) { c = *s; *s = (char) (3-*t); *t = (char) (3-c); s += 1; t -= 1; } } // A unit normal distribution random number generator #define UNORM_LEN 60000 #define UNORM_MAX 6.0 static double unorm_table[UNORM_LEN+1]; // Upper half of cdf of N(0,1) static double unorm_scale; static void init_unorm() { double del, sum, x; int i; unorm_scale = del = UNORM_MAX / UNORM_LEN; sum = 0; // Integrate pdf, x >= 0 half only. for (i = 0; i < UNORM_LEN; i++) { x = i * del; unorm_table[i] = sum; sum += exp(-.5*x*x) * del; } unorm_table[UNORM_LEN] = sum; /* Normalize cdf */ sum *= 2.; for (i = 0; i < UNORM_LEN; i++) unorm_table[i] /= sum; unorm_table[UNORM_LEN] = 1.; #ifdef DEBUG printf("Truncated tail is < %g\n", exp(-.5*UNORM_MAX*UNORM_MAX)/(sum*(1.-exp(-UNORM_MAX))) ); printf("Diff between last two entries is %g\n",.5-unorm_table[UNORM_LEN-1]); printf("\n CDF:\n"); for (i = 0; i <= UNORM_LEN; i += 100) printf("%6.2f: %10.9f\n",i*del,unorm_table[i]); #endif } static int bin_search(int len, double *tab, double y) { int l, m, r; // Searches tab[0..len] for min { r : y < tab[r] }. // Assumes y < 1, tab[0] = 0 and tab[len] = 1. // So returned index is in [1,len]. l = 0; r = len; while (l < r) { m = (l+r) >> 1; if (y < tab[m]) r = m; else l = m+1; } return (r); } static double sample_unorm(double x) { double y; int f; if (x >= .5) // Map [0,1) random var to upper-half of cdf */ y = x-.5; else y = .5-x; f = bin_search(UNORM_LEN,unorm_table,y); // Bin. search upper-half cdf #ifdef DEBUG printf("Normal search %g -> %g -> %d",x,y,f); #endif // Linear interpolate between table points y = (f - (unorm_table[f]-y) / (unorm_table[f] - unorm_table[f-1]) ) * unorm_scale; if (x < .5) y = -y; // Map upper-half var back to full range #ifdef DEBUG printf(" -> %g\n",y); #endif return (y); } // Open and trim the reference genome *name*. Determine the number of scaffolds and sizes // of each scaffold (in nscaffs and the .coff field of the read records) in the dam. Then // create a sequence for each scaffold (index in the .boff field of the read records), that // consists of its contigs with a random sequence filling the gaps (generated according to // the bp frequency in db.freq[4]). 
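/* Illustrative only: load_and_fill() below fills scaffold gaps with random bases
   drawn according to the cumulative base frequencies in db.freq[].  rand_base()
   is a hypothetical stand-alone helper showing that draw (it relies on drand48()
   as used elsewhere in this file); the real function inlines the same logic
   rather than calling a helper.                                                  */

static int rand_base(double freq[4])
{ double pa, pc, pg, x;

  pa = freq[0];                 //  cumulative thresholds for a, c, g (t is the remainder)
  pc = pa + freq[1];
  pg = pc + freq[2];
  x  = drand48();
  if (x < pa)
    return (0);                 //  0..3 encode a,c,g,t as elsewhere in this code
  else if (x < pc)
    return (1);
  else if (x < pg)
    return (2);
  else
    return (3);
}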
DAZZ_DB *load_and_fill(char *name, int *pscaffs) { static DAZZ_DB db; DAZZ_READ *reads; FILE *bases; char *bases_name; char *seq; int nreads, nscaffs; int i, c; int64 ctot; int64 o, u; double PRA, PRC, PRG; if (Open_DB(name,&db) != 1) { fprintf(stderr,"%s: %s is not a Dazzler .dam\n",Prog_Name,name); exit (1); } Trim_DB(&db); PRA = db.freq[0]; PRC = PRA + db.freq[1]; PRG = PRC + db.freq[2]; nreads = db.nreads; reads = db.reads; nscaffs = 0; for (i = 0; i < nreads; i++) if (reads[i].origin == 0) nscaffs += 1; for (i = 0; i < nscaffs; i++) reads[i].coff = 0; c = -1; for (i = 0; i < nreads; i++) { if (reads[i].origin == 0) c += 1; reads[c].coff = reads[i].fpulse+reads[i].rlen; } ctot = 0; for (i = 0; i < nscaffs; i++) ctot += reads[i].coff+1; bases_name = Strdup(Catenate(db.path,"","",".bps"),"Allocating base-pair file name"); bases = Fopen(bases_name,"r"); if (bases_name == NULL || bases == NULL) exit (1); seq = (char *) Malloc(ctot+4,"Allocating space for genome"); if (seq == NULL) exit (1); *seq++ = 4; c = -1; o = u = 0; for (i = 0; i < nreads; i++) { int len, clen; int64 off; if (reads[i].origin == 0) { if (c >= 0) o += reads[c].coff + 1; c += 1; u = o; } else { int64 p; double x; p = u + reads[i-1].rlen; u = o + reads[i].fpulse; while (p < u) { x = drand48(); if (x < PRC) if (x < PRA) seq[p++] = 0; else seq[p++] = 1; else if (x < PRG) seq[p++] = 2; else seq[p++] = 3; } } len = reads[i].rlen; off = reads[i].boff; if (ftello(bases) != off) FSEEKO(bases,off,SEEK_SET) clen = COMPRESSED_LEN(len); if (clen > 0) FREAD(seq+u,clen,1,bases) Uncompress_Read(len,seq+u); if (reads[i].origin == 0) reads[c].boff = o; } reads[nscaffs].boff = ctot; db.bases = (void *) seq; db.loaded = 1; *pscaffs = nscaffs; return (&db); } // Generate reads (a) whose lengths are exponentially distributed with mean *mean* and // standard deviation *stdev*, and (b) that are never shorter than *shortest*. Each // read is a randomly sampled interval of one of the filled scaffolds of *source* // (each interval is equally likely) that has insertion, deletion, and/or substitution // errors introduced into it and which is oriented in either the forward or reverse // strand direction with probability FLIP_RATE. The number of errors introduced is the // length of the string times *erate*, and the probability of an insertion, delection, // or substitution is controlled by the defined constants INS_RATE and DEL_RATE. // If the -C option is set then each scaffold is assumed to be circular and reads can // be sampled that span the origin. Reads are generated until the sum of the lengths of // the reads is greater thant coverage times the sum of the lengths of the scaffolds in // the reference (i.e. including filled scaffold gaps in the genome size). The reads are // output as fasta entries with the PacBio-specific header format that contains the // sampling interval, read length, and a read id. 
static void shotgun(DAZZ_DB *source, int nscaffs) { DAZZ_READ *reads; int gleng; int maxlen, nreads, qv; int64 totlen, totbp; char *rbuffer, *bases; double nmean, nsdev; double *weights; int scf; nsdev = (1.*RSDEV)/RMEAN; nsdev = log(1.+nsdev*nsdev); nmean = log(1.*RMEAN) - .5*nsdev; nsdev = sqrt(nsdev); bases = source->bases; reads = source->reads; gleng = reads[nscaffs].boff - nscaffs; if (gleng <= RSHORT) { fprintf(stderr,"Genome length is less than shortest read length !\n"); exit (1); } init_unorm(); weights = (double *) Malloc(sizeof(double)*(nscaffs+1),"Allocating contig weights"); if (weights == NULL) exit (1); { double r; r = 0.; for (scf = 0; scf < nscaffs; scf++) { weights[scf] = r/gleng; r += reads[scf].coff; } weights[nscaffs] = 1.; } qv = (int) (1000 * (1.-ERROR)); rbuffer = NULL; maxlen = 0; totlen = 0; totbp = COVERAGE*gleng; nreads = 0; while (totlen < totbp) { int len, sdl, ins, del, elen, slen, rbeg, rend; int j; double uni; char *s, *t; scf = bin_search(nscaffs,weights,drand48()) - 1; // Pick a scaffold with probabilitye // proportional to its length uni = drand48(); len = (int) exp(nmean + nsdev*sample_unorm(uni)); // Pick a read length if (len <= RSHORT) continue; // New sampler: slen = reads[scf].coff; rbeg = (int) (drand48()*slen); // Pick a spot for read start if (CIRCULAR) rend = (rbeg + len) % slen; // Wrap if circular else { if (drand48() < .5) // Pick direction and trim if necessary { rend = rbeg + len; // if not circular if (rend > slen) { rend = slen; len = rend - rbeg; } } else { rend = rbeg; rbeg = rbeg - len; if (rbeg < 0) { rbeg = 0; len = rend; } } if (len <= RSHORT) continue; } // Old sampler: // // rbeg = (int) (drand48()*((reads[scf].coff-len)+.9999999)); // rend = rbeg + len; sdl = (int) (len*ERROR); // Determine number of inserts *ins*, deletions *del, ins = del = 0; // and substitions+deletions *sdl*. for (j = 0; j < sdl; j++) { double x = drand48(); if (x < INS_RATE) ins += 1; else if (x < IDL_RATE) del += 1; } sdl -= ins; elen = len + (ins-del); if (elen > maxlen) { maxlen = ((int) (1.2*elen)) + 1000; rbuffer = (char *) Realloc(rbuffer,maxlen+3,"Allocating read buffer"); if (rbuffer == NULL) exit (1); } t = rbuffer; s = bases + (reads[scf].boff + rbeg); // Generate the string with errors. NB that inserts occur randomly between source // characters, while deletions and substitutions occur on source characters. while ((len+1) * drand48() < ins) { *t++ = (char) (4.*drand48()); ins -= 1; } for ( ; len > 0; len--) { if (len * drand48() >= sdl) *t++ = *s; else if (sdl * drand48() >= del) { double x = 3.*drand48(); if (x >= *s) x += 1.; *t++ = (char) x; sdl -= 1; } else { del -= 1; sdl -= 1; } s += 1; if (*s == 4) s = bases + reads[scf].boff; while (len * drand48() < ins) { *t++ = (char) (4.*drand48()); ins -= 1; } } *t = 4; if (drand48() >= FLIP_RATE) // Complement the string with probability FLIP_RATE. 
{ complement(elen,rbuffer); j = rend; rend = rbeg; rbeg = j; } PRINTF(">Sim/%d/%d_%d RQ=0.%d\n",nreads+1,0,elen,qv) if (UPPER) Upper_Read(rbuffer); else Lower_Read(rbuffer); for (j = 0; j+WIDTH < elen; j += WIDTH) PRINTF("%.*s\n",WIDTH,rbuffer+j) if (j < elen) PRINTF("%s\n",rbuffer+j) if (MAP != NULL) FPRINTF(MAP," %6d %9d %9d\n",scf,rbeg,rend) totlen += elen; nreads += 1; } } int main(int argc, char *argv[]) { DAZZ_DB *source; int nscaffs; // Process command line { int i, j, k; int flags[128]; char *eptr; ARG_INIT("simulator"); RMEAN = 10000; RSDEV = 2000; ERROR = .15; COVERAGE = 50.; FLIP_RATE = .5; RSHORT = 4000; HASR = 0; MAP = NULL; WIDTH = 80; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("CU"); break; case 'c': ARG_REAL(COVERAGE) if (COVERAGE < 0.) { fprintf(stderr,"%s: Coverage must be non-negative (%g)\n",Prog_Name,COVERAGE); exit (1); } break; case 'e': ARG_REAL(ERROR) if (ERROR < 0. || ERROR > .5) { fprintf(stderr,"%s: Error rate must be in [0,.5] (%g)\n",Prog_Name,ERROR); exit (1); } break; case 'f': ARG_REAL(FLIP_RATE) if (FLIP_RATE < 0. || FLIP_RATE > 1.) { fprintf(stderr,"%s: Error rate must be in [0,1] (%g)\n",Prog_Name,FLIP_RATE); exit (1); } break; case 'm': ARG_POSITIVE(RMEAN,"Mean read length") break; case 'r': SEED = strtol(argv[i]+2,&eptr,10); HASR = 1; if (*eptr != '\0' || argv[i][2] == '\0') { fprintf(stderr,"%s: -r argument is not an integer\n",Prog_Name); exit (1); } break; case 's': ARG_NON_NEGATIVE(RSDEV,"Read length standard deviation") break; case 'x': ARG_NON_NEGATIVE(RSHORT,"Read length minimum") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; case 'M': MAP = Fopen(argv[i]+2,"w"); if (MAP == NULL) exit (1); break; } else argv[j++] = argv[i]; argc = j; CIRCULAR = flags['C']; UPPER = flags['U']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); exit (1); } } if (HASR) srand48(SEED); else srand48(getpid()); // Read and generate source = load_and_fill(argv[1],&nscaffs); shotgun(source,nscaffs); if (MAP != NULL) fclose(MAP); exit (0); }
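/* Illustrative only: shotgun() above draws read lengths from a log-normal
   distribution with target mean RMEAN and standard deviation RSDEV by setting

        sigma^2 = ln(1 + (RSDEV/RMEAN)^2)        mu = ln(RMEAN) - sigma^2/2

   and returning exp(mu + sigma*z) for a standard normal z.  The stand-alone
   sketch below reproduces that computation, using a Box-Muller normal sample
   in place of the table-driven sample_unorm() of simulator.c; it is a
   hypothetical illustration, not part of the original program.                */

#include <math.h>
#include <stdlib.h>

static double lognormal_length(double mean, double sdev)
{ double var, mu, sigma, u1, u2, z;

  var   = log(1. + (sdev/mean)*(sdev/mean));    //  variance of the underlying normal
  mu    = log(mean) - .5*var;                   //  mean of the underlying normal
  sigma = sqrt(var);

  u1 = 1. - drand48();                          //  in (0,1], avoids log(0)
  u2 = drand48();
  z  = sqrt(-2.*log(u1)) * cos(2.*3.14159265358979323846*u2);   //  Box-Muller N(0,1)

  return (exp(mu + sigma*z));                   //  log-normal sample with the desired mean/sdev
}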