pax_global_header00006660000000000000000000000064125375246460014530gustar00rootroot0000000000000052 comment=40bb7e4b2041cdfd3c50b22201301b8df06342fa DAZZ_DB-1.0/000077500000000000000000000000001253752464600125055ustar00rootroot00000000000000DAZZ_DB-1.0/Catrack.c000066400000000000000000000237171253752464600142330ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. 
* * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************** * * Concate in block order all "block tracks" ..# into a single track * . * * Author: Gene Myers * Date : June 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-v] "; int main(int argc, char *argv[]) { char *prefix; FILE *aout, *dout; int VERBOSE; // Process arguments { int i, j, k; int flags[128]; ARG_INIT("Catrack") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("v") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc != 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } { char *pwd, *root; int plen; plen = strlen(argv[1]); if (strcmp(argv[1]+(plen-3),".dam") == 0) root = Root(argv[1],".dam"); else root = Root(argv[1],".db"); pwd = PathTo(argv[1]); prefix = Strdup(Catenate(pwd,PATHSEP,root,"."),"Allocating track name"); free(pwd); free(root); aout = fopen(Catenate(prefix,argv[2],".","anno"),"r"); if (aout != NULL) { fprintf(stderr,"%s: Track file %s%s.anno already exists!\n",Prog_Name,prefix,argv[2]); fclose(aout); exit (1); } dout = fopen(Catenate(prefix,argv[2],".","data"),"r"); if (dout != NULL) { fprintf(stderr,"%s: Track file %s%s.data already exists!\n",Prog_Name,prefix,argv[2]); fclose(dout); exit (1); } aout = Fopen(Catenate(prefix,argv[2],".","anno"),"w"); if (aout == NULL) exit (1); dout = NULL; } { int tracktot, tracksiz; int64 trackoff; int nfiles; char data[1024]; void *anno; anno = NULL; trackoff = 0; tracktot = tracksiz = 0; fwrite(&tracktot,sizeof(int),1,aout); fwrite(&tracksiz,sizeof(int),1,aout); nfiles = 0; while (1) { FILE 
*afile, *dfile; int i, size, tracklen; afile = fopen(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","anno")),"r"); if (afile == NULL) break; dfile = fopen(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","data")),"r"); if (VERBOSE) { fprintf(stderr,"Concatenating %s%d.%s ...\n",prefix,nfiles+1,argv[2]); fflush(stderr); } if (fread(&tracklen,sizeof(int),1,afile) != 1) SYSTEM_ERROR if (fread(&size,sizeof(int),1,afile) != 1) SYSTEM_ERROR if (nfiles == 0) { tracksiz = size; if (dfile != NULL) { dout = Fopen(Catenate(prefix,argv[2],".","data"),"w"); if (dout == NULL) { fclose(afile); fclose(dfile); goto error; } } else { anno = Malloc(size,"Allocating annotation record"); if (anno == NULL) { fclose(afile); goto error; } } } else { int escape = 1; if (tracksiz != size) { fprintf(stderr,"%s: Track block %d does not have the same annotation size (%d)", Prog_Name,nfiles+1,size); fprintf(stderr," as previous blocks (%d)\n",tracksiz); } else if (dfile == NULL && dout != NULL) fprintf(stderr,"%s: Track block %d does not have data but previous blocks do\n", Prog_Name,nfiles+1); else if (dfile != NULL && dout == NULL) fprintf(stderr,"%s: Track block %d has data but previous blocks do not\n", Prog_Name,nfiles+1); else escape = 0; if (escape) { fclose(afile); if (dfile != NULL) fclose(dfile); if (anno != NULL) free(anno); goto error; } } if (dfile != NULL) { int64 dlen; if (size == 4) { int anno4; for (i = 0; i < tracklen; i++) { if (fread(&anno4,sizeof(int),1,afile) != 1) SYSTEM_ERROR anno4 += trackoff; fwrite(&anno4,sizeof(int),1,aout); } if (fread(&anno4,sizeof(int),1,afile) != 1) SYSTEM_ERROR dlen = anno4; } else { int64 anno8; for (i = 0; i < tracklen; i++) { if (fread(&anno8,sizeof(int64),1,afile) != 1) SYSTEM_ERROR anno8 += trackoff; fwrite(&anno8,sizeof(int64),1,aout); } if (fread(&anno8,sizeof(int64),1,afile) != 1) SYSTEM_ERROR dlen = anno8; } trackoff += dlen; for (i = 1024; i < dlen; i += 1024) { if (fread(data,1024,1,dfile) != 1) SYSTEM_ERROR 
fwrite(data,1024,1,dout); } i -= 1024; if (i < dlen) { if (fread(data,dlen-i,1,dfile) != 1) SYSTEM_ERROR fwrite(data,dlen-i,1,dout); } } else { for (i = 0; i < tracklen; i++) { if (fread(anno,size,1,afile) != 1) SYSTEM_ERROR fwrite(anno,size,1,aout); } } tracktot += tracklen; nfiles += 1; if (dfile != NULL) fclose(dfile); fclose(afile); } if (nfiles == 0) { fprintf(stderr,"%s: Couldn't find first track block %s1.%s.anno\n", Prog_Name,prefix,argv[2]); goto error; } else { if (dout != NULL) { if (tracksiz == 4) { int anno4 = trackoff; fwrite(&anno4,sizeof(int),1,aout); } else { int64 anno8 = trackoff; fwrite(&anno8,sizeof(int64),1,aout); } } else { fwrite(anno,tracksiz,1,aout); free(anno); } rewind(aout); fwrite(&tracktot,sizeof(int),1,aout); fwrite(&tracksiz,sizeof(int),1,aout); } } fclose(aout); if (dout != NULL) fclose(dout); free(prefix); exit (0); error: fclose(aout); unlink(Catenate(prefix,argv[2],".","anno")); if (dout != NULL) { fclose(dout); unlink(Catenate(prefix,argv[2],".","data")); } free(prefix); exit (1); } DAZZ_DB-1.0/DAM2fasta.c000066400000000000000000000173611253752464600143630ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. 
* * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************** * * Recreate all the .fasta files that are in a specified DAM. * * Author: Gene Myers * Date : May 2014 * ********************************************************************************************/ #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif static char *Usage = "[-vU] [-w] "; int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; FILE *dbfile, *hdrs; int nfiles; int VERBOSE, UPPER, WIDTH; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DAM2fasta") WIDTH = 80; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vU") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; } else argv[j++] = argv[i]; argc = j; UPPER = 1 + flags['U']; VERBOSE = flags['v']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open db { int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 0) { fprintf(stderr,"%s: Cannot be called on a .db: %s\n",Prog_Name,argv[1]); exit (1); } if (db->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } } { char *pwd, *root; pwd = PathTo(argv[1]); root = Root(argv[1],".dam"); dbfile = Fopen(Catenate(pwd,"/",root,".dam"),"r"); hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"r"); free(pwd); free(root); if (dbfile == NULL || hdrs == NULL) exit (1); } // nfiles = # of files in data base if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR // For each file do: { HITS_READ *reads; char *read; int f, first; char nstring[WIDTH+1]; if (UPPER == 2) for (f = 0; f < WIDTH; f++) nstring[f] = 'N'; else for (f = 0; f < WIDTH; f++) nstring[f] = 'n'; nstring[WIDTH] = '\0'; reads = db->reads; read = New_Read_Buffer(db); first = 0; for (f = 0; f < nfiles; f++) { int i, last, wpos; FILE *ofile; char prolog[MAX_NAME], fname[MAX_NAME], header[MAX_NAME]; // Scan db image file line, create .fasta file for writing if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3) SYSTEM_ERROR if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Creating %s.fasta ...\n",fname); fflush(stdout); } // For the relevant range of reads, write each to the file // 
recreating the original headers with the index meta-data about each read wpos = 0; for (i = first; i < last; i++) { int j, len, nlen, w; HITS_READ *r; r = reads + i; len = r->rlen; if (r->origin == 0) { if (i != first && wpos != 0) { fprintf(ofile,"\n"); wpos = 0; } fseeko(hdrs,r->coff,SEEK_SET); fgets(header,MAX_NAME,hdrs); fputs(header,ofile); } if (r->fpulse != 0) { if (r->origin != 0) nlen = r->fpulse - (reads[i-1].fpulse + reads[i-1].rlen); else nlen = r->fpulse; for (j = 0; j+(w = WIDTH-wpos) <= nlen; j += w) { fprintf(ofile,"%.*s\n",w,nstring); wpos = 0; } if (j < nlen) { fprintf(ofile,"%.*s",nlen-j,nstring); if (j == 0) wpos += nlen; else wpos = nlen-j; } } Load_Read(db,i,read,UPPER); for (j = 0; j+(w = WIDTH-wpos) <= len; j += w) { fprintf(ofile,"%.*s\n",w,read+j); wpos = 0; } if (j < len) { fprintf(ofile,"%s",read+j); if (j == 0) wpos += len; else wpos = len-j; } } if (wpos > 0) fprintf(ofile,"\n"); first = last; } } fclose(hdrs); fclose(dbfile); Close_DB(db); exit (0); } DAZZ_DB-1.0/DB.c000066400000000000000000001264201253752464600131430ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. 
* * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. Eventually will also * support compressed quality information. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif /******************************************************************************************* * * GENERAL UTILITIES * ********************************************************************************************/ char *Prog_Name; #ifdef INTERACTIVE char Ebuffer[1000]; #endif void *Malloc(int64 size, char *mesg) { void *p; if ((p = malloc(size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } void *Realloc(void *p, int64 size, char *mesg) { if ((p = realloc(p,size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } char *Strdup(char *name, char *mesg) { char *s; if (name == NULL) return (NULL); if ((s = strdup(name)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (s); } FILE *Fopen(char *name, char *mode) { FILE *f; if (name == NULL || mode == NULL) return (NULL); if ((f = fopen(name,mode)) == NULL) EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); return (f); } char *PathTo(char *name) { char *path, *find; if (name == NULL) return (NULL); if ((find = rindex(name,'/')) != NULL) { *find = '\0'; path = Strdup(name,"Extracting path from"); *find = '/'; } else path = Strdup(".","Allocating default path"); return (path); } char *Root(char *name, char *suffix) { char *path, *find, *dot; int epos; if (name == NULL) return (NULL); find = rindex(name,'/'); if (find == NULL) find = name; else find += 1; if (suffix == NULL) { dot = strchr(find,'.'); if (dot != NULL) *dot = '\0'; path = Strdup(find,"Extracting root from"); if (dot != NULL) *dot = '.'; } else { epos = strlen(find); epos -= strlen(suffix); if (epos > 0 && strcasecmp(find+epos,suffix) == 0) { find[epos] = '\0'; path = Strdup(find,"Extracting root 
from"); find[epos] = suffix[0]; } else path = Strdup(find,"Allocating root"); } return (path); } char *Catenate(char *path, char *sep, char *root, char *suffix) { static char *cat = NULL; static int max = -1; int len; if (path == NULL || root == NULL || sep == NULL || suffix == NULL) return (NULL); len = strlen(path); len += strlen(sep); len += strlen(root); len += strlen(suffix); if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); return (NULL); } } sprintf(cat,"%s%s%s%s",path,sep,root,suffix); return (cat); } char *Numbered_Suffix(char *left, int num, char *right) { static char *suffix = NULL; static int max = -1; int len; if (left == NULL || right == NULL) return (NULL); len = strlen(left); len += strlen(right) + 40; if (len > max) { max = ((int) (1.2*len)) + 100; if ((suffix = (char *) realloc(suffix,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); return (NULL); } } sprintf(suffix,"%s%d%s",left,num,right); return (suffix); } #define COMMA ',' // Print big integers with commas/periods for better readability void Print_Number(int64 num, int width, FILE *out) { if (width == 0) { if (num < 1000ll) fprintf(out,"%lld",num); else if (num < 1000000ll) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else if (num < 1000000000ll) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll, COMMA,(num%1000000000ll)/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); } else { if (num < 1000ll) fprintf(out,"%*lld",width,num); else if (num < 1000000ll) { if (width <= 4) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll); } else if (num < 1000000000ll) { if (width <= 8) 
fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); } else { if (width <= 12) fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); } } } // Return the number of digits, base 10, of num int Number_Digits(int64 num) { int digit; digit = 0; while (num >= 1) { num /= 10; digit += 1; } return (digit); } /******************************************************************************************* * * READ COMPRESSION/DECOMPRESSION UTILITIES * ********************************************************************************************/ // Compress read into 2-bits per base (from [0-3] per byte representation void Compress_Read(int len, char *s) { int i; char c, d; char *s0, *s1, *s2, *s3; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; c = s1[len]; d = s2[len]; s0[len] = s1[len] = s2[len] = 0; for (i = 0; i < len; i += 4) *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]); s1[len] = c; s2[len] = d; } // Uncompress read form 2-bits per base into [0-3] per byte representation void Uncompress_Read(int len, char *s) { int i, tlen, byte; char *s0, *s1, *s2, *s3; char *t; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; tlen = (len-1)/4; t = s+tlen; for (i = tlen*4; i >= 0; i -= 4) { byte = *t--; s0[i] = (char) ((byte >> 6) & 0x3); s1[i] = (char) ((byte >> 4) & 0x3); s2[i] = (char) ((byte >> 2) & 0x3); s3[i] = (char) (byte & 0x3); } s[len] = 4; } // Convert read in [0-3] representation to ascii representation (end with '\n') void Lower_Read(char *s) { static char letter[4] = { 'a', 'c', 'g', 't' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } void 
Upper_Read(char *s) { static char letter[4] = { 'A', 'C', 'G', 'T' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } // Convert read in ascii representation to [0-3] representation (end with 4) void Number_Read(char *s) { static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; for ( ; *s != '\0'; s++) *s = number[(int) *s]; *s = 4; } /******************************************************************************************* * * DB OPEN, TRIM & CLOSE ROUTINES * ********************************************************************************************/ // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. 
// Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char* path, HITS_DB *db) { HITS_DB dbcopy; char *root, *pwd, *bptr, *fptr, *cat; int nreads; FILE *index, *dbvis; int status, plen, isdam; int part, cutoff, all; int ufirst, tfirst, ulast, tlast; status = -1; dbcopy = *db; plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); pwd = PathTo(path); bptr = rindex(root,'.'); if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-') { part = strtol(bptr+1,&fptr,10); if (*fptr != '\0' || part == 0) part = 0; else *bptr = '\0'; } else part = 0; isdam = 0; cat = Catenate(pwd,"/",root,".db"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { cat = Catenate(pwd,"/",root,".dam"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path); goto error; } isdam = 1; } if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) goto error1; if (fread(db,sizeof(HITS_DB),1,index) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); goto error2; } { int p, nblocks, nfiles; int64 size; char fname[MAX_NAME], prolog[MAX_NAME]; nblocks = 0; if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } for (p = 0; p < nfiles; p++) if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) if (part == 0) { cutoff = 0; all = 1; } else { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", Prog_Name,root); goto error2; } else { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is 
junk\n",Prog_Name,root); goto error2; } if (part > nblocks) { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); goto error2; } } if (part > 0) { for (p = 1; p <= part; p++) if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } } else { ufirst = tfirst = 0; ulast = db->ureads; tlast = db->treads; } } db->trimmed = 0; db->tracks = NULL; db->part = part; db->cutoff = cutoff; db->all = all; db->ufirst = ufirst; db->tfirst = tfirst; nreads = ulast-ufirst; if (part <= 0) { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); db->reads += 1; if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(db->reads); goto error2; } } else { HITS_READ *reads; int i, r, maxlen; int64 totlen; reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); reads += 1; fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR); if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(reads); goto error2; } totlen = 0; maxlen = 0; for (i = 0; i < nreads; i++) { r = reads[i].rlen; totlen += r; if (r > maxlen) maxlen = r; } db->maxlen = maxlen; db->totlen = totlen; db->reads = reads; } ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part ((int *) (db->reads))[-2] = tlast - tfirst; db->nreads = nreads; db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); if (db->path == NULL) goto error2; db->bases = NULL; db->loaded = 0; status = isdam; error2: fclose(index); error1: fclose(dbvis); error: if (bptr != NULL) *bptr = '.'; free(pwd); free(root); if (status < 0) *db = dbcopy; 
return (status); } // Trim the DB or part thereof and all loaded tracks according to the cuttof and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(HITS_DB *db) { int i, j, r; int allflag, cutoff; int64 totlen; int maxlen, nreads; HITS_TRACK *record; HITS_READ *reads; if (db->trimmed) return; if (db->cutoff <= 0 && db->all) return; cutoff = db->cutoff; if (db->all) allflag = 0; else allflag = DB_BEST; reads = db->reads; nreads = db->nreads; for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,".@qvs") == 0) { uint16 *table = ((HITS_QV *) record)->table; j = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) table[j++] = table[i]; } else { int *anno4, size; int64 *anno8; char *anno, *data; size = record->size; data = (char *) record->data; if (data == NULL) { anno = (char *) record->anno; j = 0; for (i = r = 0; i < db->nreads; i++, r += size) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (record->anno); j = anno4[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (record->anno); j = anno8[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno8[j],NULL); } record->anno = Realloc(record->anno,record->size*(j+1),NULL); } totlen = maxlen = 0; for (j = i = 0; i < 
nreads; i++) { r = reads[i].rlen; if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff) { totlen += r; if (r > maxlen) maxlen = r; reads[j++] = reads[i]; } } db->totlen = totlen; db->maxlen = maxlen; db->nreads = j; db->trimmed = 1; if (j < nreads) { db->reads = Realloc(reads-1,sizeof(HITS_READ)*(j+2),NULL); db->reads += 1; } } // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). void Close_DB(HITS_DB *db) { HITS_TRACK *t, *p; if (db->loaded) free(((char *) (db->bases)) - 1); else if (db->bases != NULL) fclose((FILE *) db->bases); free(db->reads-1); free(db->path); Close_QVs(db); for (t = db->tracks; t != NULL; t = p) { p = t->next; free(t->anno); free(t->data); free(t); } } /******************************************************************************************* * * QV LOAD & CLOSE ROUTINES * ********************************************************************************************/ HITS_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" HITS_QV *Active_QV; // Becomes invalid after closing int Load_QVs(HITS_DB *db) { FILE *quiva, *istub, *indx; char *root; uint16 *table; HITS_QV *qvtrk; QVcoding *coding, *nx; int ncodes; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) return (0); if (db->trimmed) { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); EXIT(1); } if (db->reads[db->nreads-1].coff < 0) { EPRINTF(EPLACE,"%s: The requested QVs have not been added to the DB!\n",Prog_Name); EXIT(1); } // Open .qvs, .idx, and .db files quiva = Fopen(Catenate(db->path,"","",".qvs"),"r"); if (quiva == NULL) return (-1); istub = NULL; indx = NULL; table = NULL; coding = NULL; qvtrk = NULL; root = rindex(db->path,'/') + 2; istub = Fopen(Catenate(db->path,"/",root,".db"),"r"); if (istub == NULL) goto error; { int first, last, nfiles; char prolog[MAX_NAME], 
fname[MAX_NAME]; int i, j; if (fscanf(istub,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (db->part > 0) { int pfirst, plast; int fbeg, fend; int n, k; FILE *indx; // Determine first how many and which files span the block (fbeg to fend) pfirst = db->ufirst; plast = pfirst + db->nreads; first = 0; for (fbeg = 0; fbeg < nfiles; fbeg++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (last > pfirst) break; first = last; } for (fend = fbeg+1; fend <= nfiles; fend++) { if (last >= plast) break; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } first = last; } indx = Fopen(Catenate(db->path,"","",".idx"),"r"); ncodes = fend-fbeg; coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (indx == NULL || coding == NULL || table == NULL) { ncodes = 0; goto error; } // Carefully get the first coding scheme (its offset is most likely in a HITS_RECORD // in .idx that is *not* in memory). Get all the other coding schemes normally and // assign the tables # for each read in the block in "tables". 
rewind(istub); fscanf(istub,DB_NFILE,&nfiles); first = 0; for (n = 0; n < fbeg; n++) { fscanf(istub,DB_FDATA,&last,fname,prolog); first = last; } for (n = fbeg; n < fend; n++) { fscanf(istub,DB_FDATA,&last,fname,prolog); i = n-fbeg; if (first < pfirst) { HITS_READ read; fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*first,SEEK_SET); if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); ncodes = i; goto error; } fseeko(quiva,read.coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; } else { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first-pfirst].coff = ftello(quiva); } j = first-pfirst; if (j < 0) j = 0; k = last-pfirst; if (k > db->nreads) k = db->nreads; while (j < k) table[j++] = (uint16) i; first = last; } fclose(indx); indx = NULL; } else { // Load in coding scheme for each file, adjust .coff of first read in the file, and // record which table each read uses ncodes = nfiles; coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (coding == NULL || table == NULL) goto error; first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } fseeko(quiva,db->reads[first].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first].coff = ftello(quiva); for (j = first; j < last; j++) table[j] = (uint16) i; first = last; } } // Allocate and fill in the HITS_QV record and add it to the front of the // track list qvtrk = (HITS_QV *) Malloc(sizeof(HITS_QV),"Allocating QV pseudo-track"); if (qvtrk == NULL) goto error; qvtrk->name = Strdup(".@qvs","Allocating QV 
pseudo-track name"); if (qvtrk->name == NULL) goto error; qvtrk->next = db->tracks; db->tracks = (HITS_TRACK *) qvtrk; qvtrk->ncodes = ncodes; qvtrk->table = table; qvtrk->coding = coding; qvtrk->quiva = quiva; } fclose(istub); return (0); error: if (qvtrk != NULL) free(qvtrk); if (table != NULL) free(table); if (coding != NULL) { int i; for (i = 0; i < ncodes; i++) Free_QVcoding(coding+i); free(coding); } if (indx != NULL) fclose(indx); if (istub != NULL) fclose(istub); fclose(quiva); EXIT(1); } // Close the QV stream, free the QV pseudo track and all associated memory void Close_QVs(HITS_DB *db) { HITS_TRACK *track; HITS_QV *qvtrk; int i; Active_DB = NULL; track = db->tracks; if (track != NULL && strcmp(track->name,".@qvs") == 0) { qvtrk = (HITS_QV *) track; for (i = 0; i < qvtrk->ncodes; i++) Free_QVcoding(qvtrk->coding+i); free(qvtrk->coding); free(qvtrk->table); fclose(qvtrk->quiva); db->tracks = track->next; free(track); } return; } /******************************************************************************************* * * TRACK LOAD & CLOSE ROUTINES * ********************************************************************************************/ // Return status of track: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track int Check_Track(HITS_DB *db, char *track) { FILE *afile; int tracklen, ispart; int ureads, treads; afile = NULL; if (db->part > 0) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) return (-2); if (fread(&tracklen,sizeof(int),1,afile) != 1) return (-1); fclose(afile); if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (tracklen == treads) return (1); else if (tracklen == 
ureads) return (0); else return (-1); } // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created HITS_TRACK record. If the track does not exist or cannot be // opened for some reason, then NULL is returned. HITS_TRACK *Load_Track(HITS_DB *db, char *track) { FILE *afile, *dfile; int tracklen, size; int nreads, ispart; int treads, ureads; void *anno; void *data; char *name; HITS_TRACK *record; if (track[0] == '.') { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track); EXIT(NULL); } for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,track) == 0) return (record); afile = NULL; if (db->part) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track); return (NULL); } dfile = NULL; anno = NULL; data = NULL; record = NULL; if (ispart) name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data"); else name = Catenate(db->path,".",track,".data"); if (name == NULL) goto error; dfile = fopen(name,"r"); if (fread(&tracklen,sizeof(int),1,afile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (fread(&size,sizeof(int),1,afile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (size <= 0) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (db->trimmed) { if (tracklen != treads) { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); 
goto error; } if ( ! ispart && db->part > 0) fseeko(afile,size*db->tfirst,SEEK_CUR); } else { if (tracklen != ureads) { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); goto error; } if ( ! ispart && db->part > 0) fseeko(afile,size*db->ufirst,SEEK_CUR); } nreads = db->nreads; anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector"); if (anno == NULL) goto error; if (dfile != NULL) { int64 *anno8, off8, dlen; int *anno4, off4; int i; if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1)) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (size == 4) { anno4 = (int *) anno; off4 = anno4[0]; if (off4 != 0) { for (i = 0; i <= nreads; i++) anno4[i] -= off4; fseeko(dfile,off4,SEEK_SET); } dlen = anno4[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } else { anno8 = (int64 *) anno; off8 = anno8[0]; if (off8 != 0) { for (i = 0; i <= nreads; i++) anno8[i] -= off8; fseeko(dfile,off8,SEEK_SET); } dlen = anno8[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } if (data == NULL) goto error; if (dlen > 0) { if (fread(data,dlen,1,dfile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track); goto error; } } fclose(dfile); dfile = NULL; } else { if (fread(anno,size,nreads,afile) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } data = NULL; } fclose(afile); record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record"); if (record == NULL) goto error; record->name = Strdup(track,"Allocating Track Name"); if (record->name == NULL) goto error; record->data = data; record->anno = anno; record->size = size; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) { record->next = db->tracks->next; db->tracks->next = record; } else { record->next = db->tracks; db->tracks = record; } return (record); error: if (record == NULL) free(record); if 
(data != NULL) free(data); if (anno != NULL) free(anno); if (dfile != NULL) fclose(dfile); fclose(afile); EXIT (NULL); } void Close_Track(HITS_DB *db, char *track) { HITS_TRACK *record, *prev; prev = NULL; for (record = db->tracks; record != NULL; record = record->next) { if (strcmp(record->name,track) == 0) { free(record->anno); free(record->data); free(record->name); if (prev == NULL) db->tracks = record->next; else prev->next = record->next; free(record); return; } prev = record; } return; } /******************************************************************************************* * * READ BUFFER ALLOCATION AND READ ACCESS * ********************************************************************************************/ // Allocate and return a buffer big enough for the largest read in 'db', leaving room // for an initial delimiter character char *New_Read_Buffer(HITS_DB *db) { char *read; read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer"); if (read == NULL) EXIT(NULL); return (read+1); } // Load into 'read' the i'th read in 'db'. As an upper case ASCII string if ascii is 2, as a // lower-case ASCII string is ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and // 3(T) otherwise. // // **NB**, the byte before read will be set to a delimiter character! 
int Load_Read(HITS_DB *db, int i, char *read, int ascii)
{ FILE      *bases = (FILE *) db->bases;
  int64      off;
  int        len, clen;
  HITS_READ *r = db->reads;

  if (i < 0 || i >= db->nreads)
    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
      EXIT(1);
    }

  //  Open the .bps file on first use and remember the stream in the db record

  if (bases == NULL)
    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
      if (bases == NULL)
        EXIT(1);
      db->bases = (void *) bases;
    }

  off = r[i].boff;
  len = r[i].rlen;

  if (ftello(bases) != off)     //  Seek only if the stream is not already positioned there
    fseeko(bases,off,SEEK_SET);
  clen = COMPRESSED_LEN(len);
  if (clen > 0)
    { if (fread(read,clen,1,bases) != 1)
        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name);
          EXIT(1);
        }
    }
  Uncompress_Read(len,read);

  //  Convert to the requested alphabet and set the leading delimiter to match it

  if (ascii == 1)
    { Lower_Read(read);
      read[-1] = '\0';
    }
  else if (ascii == 2)
    { Upper_Read(read);
      read[-1] = '\0';
    }
  else
    read[-1] = 4;
  return (0);
}

// Load into 'read' the bases [beg,end) of the i'th read in 'db' and return a pointer to the
//   first base of that interval, which lies beg%4 bytes into 'read' because whole
//   compressed bytes are fetched.  The interpretation of ascii is as for Load_Read.  The
//   byte before the returned pointer and the byte after the interval are set to delimiters.

char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
{ FILE      *bases = (FILE *) db->bases;
  int64      off;
  int        len, clen;
  int        bbeg, bend;
  HITS_READ *r = db->reads;

  if (i < 0 || i >= db->nreads)
    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Subread)\n",Prog_Name);
      EXIT(NULL);
    }

  if (bases == NULL)
    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
      if (bases == NULL)
        EXIT(NULL);
      db->bases = (void *) bases;
    }

  //  4 bases per compressed byte: [bbeg,bend) is the byte interval covering [beg,end)

  bbeg = beg/4;
  bend = (end-1)/4+1;

  off = r[i].boff + bbeg;
  len = end - beg;

  if (ftello(bases) != off)
    fseeko(bases,off,SEEK_SET);
  clen = bend-bbeg;
  if (clen > 0)
    { if (fread(read,clen,1,bases) != 1)
        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Subread)\n",Prog_Name);
          EXIT(NULL);
        }
    }
  Uncompress_Read(4*clen,read);

  //  Skip the bases preceding beg in the first byte, and terminate the interval at end

  read += beg%4;
  read[len] = 4;
  if (ascii == 1)
    { Lower_Read(read);
      read[-1] = '\0';
    }
  else if (ascii == 2)
    { Upper_Read(read);
      read[-1] = '\0';
    }
  else
    read[-1] = 4;

  return (read);
}


/*******************************************************************************************
 *
 *  QV BUFFER ALLOCATION QV READ ACCESS
 *
 ********************************************************************************************/

// Allocate and return a buffer of 5
vectors big enough for the largest read in 'db' char **New_QV_Buffer(HITS_DB *db) { char **entry; char *qvs; int i; qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer"); entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer"); if (qvs == NULL || entry == NULL) EXIT(NULL); for (i = 0; i < 5; i++) entry[i] = qvs + i*db->maxlen; return (entry); } // Load into entry the QV streams for the i'th read from db. The parameter ascii applies to // the DELTAG stream as described for Load_Read. int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) { HITS_READ *reads; FILE *quiva; int rlen; if (db != Active_DB) { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0) { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name); EXIT(1); } Active_QV = (HITS_QV *) db->tracks; Active_DB = db; } if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); EXIT(1); } reads = db->reads; quiva = Active_QV->quiva; rlen = reads[i].rlen; fseeko(quiva,reads[i].coff,SEEK_SET); if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen)) EXIT(1); if (ascii != 1) { char *deltag = entry[1]; if (ascii != 2) { char x = deltag[rlen]; deltag[rlen] = '\0'; Number_Read(deltag); deltag[rlen] = x; } else { int j; int u = 'A'-'a'; for (j = 0; j < rlen; j++) deltag[j] = (char) (deltag[j]+u); } } return (0); } /******************************************************************************************* * * BLOCK LOAD OF ALL READS (PRIMARILY FOR DALIGNER) * ********************************************************************************************/ // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. 
If ascii is // non-zero then the reads are converted to ACGT ascii, otherwise the reads are left // as numeric strings over 0(A), 1(C), 2(G), and 3(T). int Read_All_Sequences(HITS_DB *db, int ascii) { FILE *bases; int nreads = db->nreads; HITS_READ *reads = db->reads; void (*translate)(char *s); char *seq; int64 o, off; int i, len, clen; bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(1); seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); if (seq == NULL) { fclose(bases); EXIT(1); } *seq++ = 4; if (ascii == 1) translate = Lower_Read; else translate = Upper_Read; o = 0; for (i = 0; i < nreads; i++) { len = reads[i].rlen; off = reads[i].boff; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(seq+o,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name); free(seq); fclose(bases); EXIT(1); } } Uncompress_Read(len,seq+o); if (ascii) translate(seq+o); reads[i].boff = o; o += (len+1); } reads[nreads].boff = o; fclose(bases); db->bases = (void *) seq; db->loaded = 1; return (0); } int List_DB_Files(char *path, void actor(char *path, char *extension)) { int status, plen, rlen, dlen; char *root, *pwd, *name; int isdam; DIR *dirp; struct dirent *dp; status = 0; pwd = PathTo(path); plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); rlen = strlen(root); if (root == NULL || pwd == NULL) { free(pwd); free(root); EXIT(1); } if ((dirp = opendir(pwd)) == NULL) { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; goto error; } isdam = 0; while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary) { name = dp->d_name; if (strcmp(name,Catenate("","",root,".db")) == 0) break; if (strcmp(name,Catenate("","",root,".dam")) == 0) { isdam = 1; break; } if (strcasecmp(name,Catenate("","",root,".db")) == 0) { 
strncpy(root,name,rlen); break; } if (strcasecmp(name,Catenate("","",root,".dam")) == 0) { strncpy(root,name,rlen); isdam = 1; break; } } if (dp == NULL) { EPRINTF(EPLACE,"%s: Cannot find %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; closedir(dirp); goto error; } if (isdam) actor(Catenate(pwd,"/",root,".dam"),"dam"); else actor(Catenate(pwd,"/",root,".db"),"db"); rewinddir(dirp); // Report each auxiliary file while ((dp = readdir(dirp)) != NULL) { name = dp->d_name; dlen = strlen(name); #ifdef HIDE_FILES if (name[0] != '.') continue; dlen -= 1; name += 1; #endif if (dlen < rlen+1) continue; if (name[rlen] != '.') continue; if (strncmp(name,root,rlen) != 0) continue; actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); } closedir(dirp); error: free(pwd); free(root); return (status); } void Print_Read(char *s, int width) { int i; if (s[0] < 4) { for (i = 0; s[i] != 4; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%d",s[i]); } printf("\n"); } else { for (i = 0; s[i] != '\0'; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%c",s[i]); } printf("\n"); } } DAZZ_DB-1.0/DB.h000066400000000000000000000551471253752464600131570ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. 
* * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. Eventually will also * support compressed quality information. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #ifndef _HITS_DB #define _HITS_DB #include #include "QV.h" #define HIDE_FILES // Auxiliary DB files start with a . 
so they are "hidden" // Undefine if you don't want this // For interactive applications where it is inappropriate to simply exit with an error // message to standard error, define the constant INTERACTIVE. If set, then error // messages are put in the global variable Ebuffer and the caller of a DB routine // can decide how to deal with the error. // // DB, QV, or alignment routines that can encounter errors function as before in // non-INTERACTIVE mode by exiting after printing an error message to stderr. In // INTERACTIVE mode the routines place a message at EPLACE and return an error // value. For such routines that were previously void, they are now int, and // return 1 if an error occured, 0 otherwise. #undef INTERACTIVE #ifdef INTERACTIVE #define EPRINTF sprintf #define EPLACE Ebuffer #define EXIT(x) return (x) #else // BATCH #define EPRINTF fprintf #define EPLACE stderr #define EXIT(x) exit (1) #endif typedef unsigned char uint8; typedef unsigned short uint16; typedef unsigned int uint32; typedef unsigned long long uint64; typedef signed char int8; typedef signed short int16; typedef signed int int32; typedef signed long long int64; typedef float float32; typedef double float64; /******************************************************************************************* * * COMMAND LINE INTERPRETATION MACROS * ********************************************************************************************/ extern char *Prog_Name; // Name of program #ifdef INTERACTIVE extern char Ebuffer[]; #endif #define SYSTEM_ERROR \ { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \ exit (2); \ } #define ARG_INIT(name) \ Prog_Name = Strdup(name,""); \ for (i = 0; i < 128; i++) \ flags[i] = 0; #define ARG_FLAGS(set) \ for (k = 1; argv[i][k] != '\0'; k++) \ { if (index(set,argv[i][k]) == NULL) \ { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \ exit (1); \ } \ flags[(int) argv[i][k]] = 1; \ } #define ARG_POSITIVE(var,name) \ var = 
strtol(argv[i]+2,&eptr,10); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \ exit (1); \ } \ if (var <= 0) \ { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_NON_NEGATIVE(var,name) \ var = strtol(argv[i]+2,&eptr,10); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \ exit (1); \ } \ if (var < 0) \ { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_REAL(var) \ var = strtod(argv[i]+2,&eptr); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c argument is not a real number\n",Prog_Name,argv[i][1]); \ exit (1); \ } /******************************************************************************************* * * UTILITIES * ********************************************************************************************/ // The following general utilities return NULL if any of their input pointers are NULL, or if they // could not perform their function (in which case they also print an error to stderr). 
void *Malloc(int64 size, char *mesg); // Guarded versions of malloc, realloc void *Realloc(void *object, int64 size, char *mesg); // and strdup, that output "mesg" to char *Strdup(char *string, char *mesg); // stderr if out of memory FILE *Fopen(char *path, char *mode); // Open file path for "mode" char *PathTo(char *path); // Return path portion of file name "path" char *Root(char *path, char *suffix); // Return the root name, excluding suffix, of "path" // Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer // Numbered_Suffix returns concatenation of left..right in a *temporary* buffer char *Catenate(char *path, char *sep, char *root, char *suffix); char *Numbered_Suffix(char *left, int num, char *right); // DB-related utilities void Print_Number(int64 num, int width, FILE *out); // Print readable big integer int Number_Digits(int64 num); // Return # of digits in printed number #define COMPRESSED_LEN(len) (((len)+3) >> 2) void Compress_Read(int len, char *s); // Compress read in-place into 2-bit form void Uncompress_Read(int len, char *s); // Uncompress read in-place into numeric form void Print_Read(char *s, int width); void Lower_Read(char *s); // Convert read from numbers to lowercase letters (0-3 to acgt) void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT) void Number_Read(char *s); // Convert read from letters to numbers /******************************************************************************************* * * DB IN-CORE DATA STRUCTURES * ********************************************************************************************/ #define DB_QV 0x03ff // Mask for 3-digit quality value #define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert #define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1) typedef struct { int origin; // Well # int rlen; // Length of the sequence (Last pulse = fpulse + rlen) int fpulse; // First 
pulse int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file int flags; // QV of read + flags above } HITS_READ; // A track can be of 3 types: // data == NULL: there are nreads 'anno' records of size 'size'. // data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1]) // contains the variable length data // data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1]) // contains the variable length data typedef struct _track { struct _track *next; // Link to next track char *name; // Symbolic name of track int size; // Size in bytes of anno records void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records void *data; // data[anno[i] .. anno[i+1]-1] is data if data != NULL } HITS_TRACK; // The information for accessing QV streams is in a HITS_QV record that is a "pseudo-track" // named ".@qvs" and is always the first track record in the list (if present). Since normal // track names cannot begin with a . (this is enforced), this pseudo-track is never confused // with a normal track. typedef struct { struct _track *next; char *name; int ncodes; // # of coding tables QVcoding *coding; // array [0..ncodes-1] of coding schemes (see QV.h) uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with // scheme coding[table[i]] FILE *quiva; // the open file pointer to the .qvs file } HITS_QV; // The DB record holds all information about the current state of an active DB including an // array of HITS_READS, one per read, and a linked list of HITS_TRACKs the first of which // is always a HITS_QV pseudo-track (if the QVs have been loaded). 
typedef struct { int ureads; // Total number of reads in untrimmed DB int treads; // Total number of reads in trimmed DB int cutoff; // Minimum read length in block (-1 if not yet set) int all; // Consider multiple reads from a given well float freq[4]; // frequency of A, C, G, T, respectively // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed) int maxlen; // length of maximum read (initially over all DB) int64 totlen; // total # of bases (initially over all DB) int nreads; // # of reads in actively loaded portion of DB int trimmed; // DB has been trimmed by cutoff/all int part; // DB block (if > 0), total DB (if == 0) int ufirst; // Index of first read in block (without trimming) int tfirst; // Index of first read in block (with trimming) // In order to avoid forcing users to have to rebuild all thier DBs to accommodate // the addition of fields for the size of the actively loaded trimmed and untrimmed // blocks, an additional read record is allocated in "reads" when a DB is loaded into // memory (reads[-1]) and the two desired fields are crammed into the first two // integer spaces of the record. char *path; // Root name of DB for .bps, .qvs, and tracks int loaded; // Are reads loaded in memory? void *bases; // file pointer for bases file (to fetch reads from), // or memory pointer to uncompressed block of all sequences. 
HITS_READ *reads; // Array [-1..nreads] of HITS_READ HITS_TRACK *tracks; // Linked list of loaded tracks } HITS_DB; /******************************************************************************************* * * DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock * ********************************************************************************************/ #define MAX_NAME 10000 // Longest file name or fasta header line #define DB_NFILE "files = %9d\n" // number of files #define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name #define DB_NBLOCK "blocks = %9d\n" // number of blocks #define DB_PARAMS "size = %9lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well #define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed) /******************************************************************************************* * * DB ROUTINES * ********************************************************************************************/ // Suppose DB is the name of an original database. Then there will be files .DB.idx, .DB.bps, // .DB.qvs, and files .DB..anno and DB..data where is a track name // (not containing a . !). // A DAM is basically a DB except that: // 1. there are no QV's, instead .coff points the '\0' terminated fasta header of the read // in the file ..hdr file // 2. .origin contains the contig # of the read within a fasta entry (assembly sequences // contain N-separated contigs), and .fpulse the first base of the contig in the // fasta entry // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. 
// Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char *path, HITS_DB *db); // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(HITS_DB *db); // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). void Close_DB(HITS_DB *db); // If QV pseudo track is not already in db's track list, then load it and set it up. // The database must not have been trimmed yet. -1 is returned if a .qvs file is not // present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE // is defined. Otherwise a 0 is returned. int Load_QVs(HITS_DB *db); // Remove the QV pseudo track, all space associated with it, and close the .qvs file. void Close_QVs(HITS_DB *db); // Look up the file and header in the file of the indicated track. Return: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track int Check_Track(HITS_DB *db, char *track); // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created HITS_TRACK record. If the track does not exist or cannot be // opened for some reason, then NULL is returned if INTERACTIVE is defined. Otherwise // the routine prints an error message to stderr and exits if an error occurs, and returns // with NULL only if the track does not exist. 
HITS_TRACK *Load_Track(HITS_DB *db, char *track); // If track is on the db's track list, then it is removed and all storage associated with it // is freed. void Close_Track(HITS_DB *db, char *track); // Allocate and return a buffer big enough for the largest read in 'db'. // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte // are needed by the alignment algorithms. If cannot allocate memory then return NULL // if INTERACTIVE is defined, or print error to stderr and exit otherwise. char *New_Read_Buffer(HITS_DB *db); // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T) // otherwise. A '\0' (or 4) is prepended and appended to the string so it has a delimeter // for traversals in either direction. A non-zero value is returned if an error occured // and INTERACTIVE is defined. int Load_Read(HITS_DB *db, int i, char *read, int ascii); // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string // over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to // the string holding the substring so it has a delimeter for traversals in either direction. // A NULL pointer is returned if an error occured and INTERACTIVE is defined. char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii); // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur // in the database. If cannot allocate memory then return NULL if INTERACTIVE is defined, // or print error to stderr and exit otherwise. 
#define DEL_QV 0 // The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer #define DEL_TAG 1 // The deleted characters #define INS_QV 2 // The insertion QVs #define SUB_QV 3 // The substitution QVs #define MRG_QV 4 // The merge QVs char **New_QV_Buffer(HITS_DB *db); // Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters // are converted to a numeric or upper/lower case ascii string as per ascii. Return with // a zero, except when an error occurs and INTERACTIVE is defined in which case return wtih 1. int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii); // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. If ascii is // 1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and // otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T). // Return with a zero, except when an error occurs and INTERACTIVE is defined in which // case return wtih 1. int Read_All_Sequences(HITS_DB *db, int ascii); // For the DB or DAM "path" = "prefix/root[.db|.dam]", find all the files for that DB, i.e. all // those of the form "prefix/[.]root.part" and call actor with the complete path to each file // pointed at by path, and the suffix of the path by extension. The . proceeds the root // name if the defined constant HIDE_FILES is set. Always the first call is with the // path "prefix/root.db" and extension "db". There will always be calls for // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and // so this routine gives one a way to know all the tracks associated with a given DB. // -1 is returned if the path could not be found, and 1 is returned if an error (reported // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. 
int List_DB_Files(char *path, void actor(char *path, char *extension)); #endif // _HITS_DB DAZZ_DB-1.0/DB2fasta.c000066400000000000000000000147451253752464600142520ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 
122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************** * * Recreate all the .fasta files that have been loaded into a specified database. * * Author: Gene Myers * Date : May 2014 * ********************************************************************************************/ #include #include #include #include "DB.h" static char *Usage = "[-vU] [-w] "; int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; FILE *dbfile; int nfiles; int VERBOSE, UPPER, WIDTH; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DB2fasta") WIDTH = 80; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vU") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; } else argv[j++] = argv[i]; argc = j; UPPER = 1 + flags['U']; VERBOSE = flags['v']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open db { int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (db->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } } { char *pwd, *root; pwd = PathTo(argv[1]); root = Root(argv[1],".db"); dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r"); free(pwd); free(root); if (dbfile == NULL) exit (1); } // nfiles = # of files in data base if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR // For each file do: { HITS_READ *reads; char *read; int f, first; reads = db->reads; read = New_Read_Buffer(db); first = 0; for (f = 0; f < nfiles; f++) { int i, last; FILE *ofile; char prolog[MAX_NAME], fname[MAX_NAME]; // Scan db image file line, create .fasta file for writing if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3) 
SYSTEM_ERROR if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Creating %s.fasta ...\n",fname); fflush(stdout); } // For the relevant range of reads, write each to the file // recreating the original headers with the index meta-data about each read for (i = first; i < last; i++) { int j, len; int flags, qv; HITS_READ *r; r = reads + i; len = r->rlen; flags = r->flags; qv = (flags & DB_QV); fprintf(ofile,">%s/%d/%d_%d",prolog,r->origin,r->fpulse,r->fpulse+len); if (qv > 0) fprintf(ofile," RQ=0.%3d",qv); fprintf(ofile,"\n"); Load_Read(db,i,read,UPPER); for (j = 0; j+WIDTH < len; j += WIDTH) fprintf(ofile,"%.*s\n",WIDTH,read+j); if (j < len) fprintf(ofile,"%s\n",read+j); } first = last; } } fclose(dbfile); Close_DB(db); exit (0); } DAZZ_DB-1.0/DB2quiva.c000066400000000000000000000151711253752464600142730ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************** * * Recreate all the .quiva files that have been loaded into a specified database. * * Author: Gene Myers * Date : May 2014 * ********************************************************************************************/ #include #include #include #include "DB.h" #include "QV.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif static char *Usage = "[-vU] "; int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; FILE *dbfile, *quiva; int VERBOSE, UPPER; // Process arguments { int i, j, k; int flags[128]; ARG_INIT("DB2quiva") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("vU") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; UPPER = flags['U']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open db, db stub file, and .qvs file { char *pwd, *root; int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (db->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } pwd = PathTo(argv[1]); root = Root(argv[1],".db"); dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r"); quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r"); free(pwd); free(root); if (dbfile == NULL || quiva == NULL) exit (1); } // For each file do: { HITS_READ *reads; int f, first, nfiles; QVcoding *coding; char **entry; if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR entry = New_QV_Buffer(db); reads = db->reads; first = 0; for (f = 0; f < nfiles; f++) { int i, last; FILE *ofile; char prolog[MAX_NAME], fname[MAX_NAME]; // Scan db image file line, create .quiva file for writing if (reads[first].coff < 0) break; if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3) SYSTEM_ERROR if ((ofile = Fopen(Catenate(".","/",fname,".quiva"),"w")) == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Creating %s.quiva ...\n",fname); fflush(stderr); } coding = Read_QVcoding(quiva); // For the relevant range of reads, write the header for each to the file // and then uncompress and write the quiva entry for each for (i = first; i < last; i++) { int e, flags, qv, rlen; HITS_READ *r; r = reads + i; flags = r->flags; rlen = r->rlen; qv = (flags & DB_QV); 
fprintf(ofile,"@%s/%d/%d_%d",prolog,r->origin,r->fpulse,r->fpulse+rlen); if (qv > 0) fprintf(ofile," RQ=0.%3d",qv); fprintf(ofile,"\n"); Uncompress_Next_QVentry(quiva,entry,coding,rlen); if (UPPER) { char *deltag = entry[1]; int j; for (j = 0; j < rlen; j++) deltag[j] -= 32; } for (e = 0; e < 5; e++) fprintf(ofile,"%.*s\n",rlen,entry[e]); } first = last; } } fclose(quiva); fclose(dbfile); Close_DB(db); exit (0); } DAZZ_DB-1.0/DBdust.c000066400000000000000000000373001253752464600140410ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * My implementation of the SDUST algorithm (Morgulis et al., JCB 13, 5 (2006), 1028-1040) * * Author: Gene Myers * Date : September 2013 * Mod : Is now incremental * Date : April 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif #undef DEBUG #ifdef DEBUG static int Caps[4] = { 'A', 'C', 'G', 'T' }; static int Lowr[4] = { 'a', 'c', 'g', 't' }; #endif static char *Usage = "[-b] [-w] [-t] [-m] "; typedef struct _cand { struct _cand *next; struct _cand *prev; int beg; int end; double score; } Candidate; int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; FILE *afile, *dfile; int64 indx; int nreads; int *mask; Candidate *cptr; int WINDOW; double THRESH; int MINLEN; int BIASED; { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DBdust") WINDOW = 64; THRESH = 2.; MINLEN = 9; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("b") break; case 'w': ARG_POSITIVE(WINDOW,"Window size") break; case 't': ARG_REAL(THRESH) if (THRESH <= 0.) { fprintf(stderr,"%s: Threshold must be positive (%g)\n",Prog_Name,THRESH); exit (1); } break; case 'm': ARG_NON_NEGATIVE(MINLEN,"Minimum hit") MINLEN -= 1; break; } else argv[j++] = argv[i]; argc = j; BIASED = flags['b']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open .db or .dam { int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); } mask = (int *) Malloc((db->maxlen+1)*sizeof(int),"Allocating mask vector"); cptr = (Candidate *) Malloc((WINDOW+1)*sizeof(Candidate),"Allocating candidate vector"); if (mask == NULL || cptr == NULL) exit (1); { char *pwd, *root, *fname; int size; pwd = PathTo(argv[1]); root = Root(argv[1],".db"); size = 8; fname = Catenate(pwd,PATHSEP,root,".dust.anno"); if ((afile = fopen(fname,"r+")) == NULL || db->part > 0) { if (afile != NULL) fclose(afile); afile = Fopen(fname,"w"); dfile = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"w"); if (dfile == NULL || afile == NULL) exit (1); fwrite(&(db->nreads),sizeof(int),1,afile); fwrite(&size,sizeof(int),1,afile); nreads = 0; indx = 0; fwrite(&indx,sizeof(int64),1,afile); } else { dfile = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"r+"); if (dfile == 
NULL) exit (1); if (fread(&nreads,sizeof(int),1,afile) != 1) SYSTEM_ERROR if (nreads >= db->nreads) { fclose(afile); fclose(dfile); exit(0); } fseeko(afile,0,SEEK_SET); fwrite(&(db->nreads),sizeof(int),1,afile); fwrite(&size,sizeof(int),1,afile); fseeko(afile,0,SEEK_END); fseeko(dfile,0,SEEK_END); indx = ftello(dfile); } free(pwd); free(root); } { int *mask1; char *read, *lag2; int wcount[64], lcount[64]; Candidate *aptr; double skew[64], thresh2r; int thresh2i; int i; read = New_Read_Buffer(db); lag2 = read-2; mask1 = mask+1; *mask = -2; aptr = cptr+1; for (i = 1; i < WINDOW; i++) cptr[i].next = aptr+i; cptr[WINDOW].next = NULL; cptr->next = cptr->prev = cptr; cptr->beg = -2; thresh2r = 2.*THRESH; thresh2i = (int) ceil(thresh2r); if (BIASED) { int a, b, c, p; p = 0; for (a = 0; a < 4; a++) for (b = 0; b < 4; b++) for (c = 0; c < 4; c++) skew[p++] = .015625 / (db->freq[a]*db->freq[b]*db->freq[c]); } for (i = nreads; i < db->nreads; i++) { Candidate *lptr, *jptr; int *mtop; double mscore; int len; int wb, lb; int j, c, d; len = db->reads[i].rlen; // Fetch read Load_Read(db,i,read,0); c = (read[0] << 2) | read[1]; // Convert to triple codes for (j = 2; j < len; j++) { c = ((c << 2) & 0x3f) | read[j]; lag2[j] = (char) c; } len -= 2; for (j = 0; j < 64; j++) // Setup counter arrays wcount[j] = lcount[j] = 0; mtop = mask; // The dust algorithm lb = wb = -1; if (BIASED) { double lsqr, wsqr, trun; // Modification for high-compositional bias wsqr = lsqr = 0.; for (j = 0; j < len; j++) { c = read[j]; #define ADDR(e,cnt,sqr) sqr += (cnt[e]++) * skew[e]; #define DELR(e,cnt,sqr) sqr -= (--cnt[e]) * skew[e]; #define WADDR(e) ADDR(e,wcount,wsqr) #define WDELR(e) DELR(e,wcount,wsqr) #define LADDR(e) ADDR(e,lcount,lsqr) #define LDELR(e) DELR(e,lcount,lsqr) if (j > WINDOW-3) { d = read[++wb]; WDELR(d) } WADDR(c) if (lb < wb) { d = read[++lb]; LDELR(d) } trun = (lcount[c]++) * skew[c]; lsqr += trun; if (trun >= thresh2r) { while (lb < j) { d = read[++lb]; LDELR(d) if (d == c) break; 
} } jptr = cptr->prev; if (jptr != cptr && jptr->beg <= wb) { c = jptr->end + 2; if (*mtop+1 >= jptr->beg) { if (*mtop < c) *mtop = c; } else { *++mtop = jptr->beg; *++mtop = c; } lptr = jptr->prev; cptr->prev = lptr; lptr->next = cptr; jptr->next = aptr; aptr = jptr; } if (wsqr <= lsqr*THRESH) continue; jptr = cptr->next; lptr = cptr; mscore = 0.; for (c = lb; c > wb; c--) { d = read[c]; LADDR(d) if (lsqr >= THRESH * (j-c)) { for ( ; jptr->beg >= c; jptr = (lptr = jptr)->next) if (jptr->score > mscore) mscore = jptr->score; if (lsqr >= mscore * (j-c)) { mscore = lsqr / (j-c); if (lptr->beg == c) { lptr->end = j; lptr->score = mscore; } else { aptr->beg = c; aptr->end = j; aptr->score = mscore; aptr->prev = lptr; lptr = lptr->next = aptr; aptr = aptr->next; jptr->prev = lptr; lptr->next = jptr; } } } } for (c++; c <= lb; c++) { d = read[c]; LDELR(d) } } } else { int lsqr, wsqr, trun; // Algorithm for GC-balanced sequences wsqr = lsqr = 0; for (j = 0; j < len; j++) { c = read[j]; #define ADDI(e,cnt,sqr) sqr += (cnt[e]++); #define DELI(e,cnt,sqr) sqr -= (--cnt[e]); #define WADDI(e) ADDI(e,wcount,wsqr) #define WDELI(e) DELI(e,wcount,wsqr) #define LADDI(e) ADDI(e,lcount,lsqr) #define LDELI(e) DELI(e,lcount,lsqr) if (j > WINDOW-3) { d = read[++wb]; WDELI(d) } WADDI(c) if (lb < wb) { d = read[++lb]; LDELI(d) } trun = lcount[c]++; lsqr += trun; if (trun >= thresh2i) { while (lb < j) { d = read[++lb]; LDELI(d) if (d == c) break; } } jptr = cptr->prev; if (jptr != cptr && jptr->beg <= wb) { c = jptr->end + 2; if (*mtop+1 >= jptr->beg) { if (*mtop < c) *mtop = c; } else { *++mtop = jptr->beg; *++mtop = c; } lptr = jptr->prev; cptr->prev = lptr; lptr->next = cptr; jptr->next = aptr; aptr = jptr; } if (wsqr <= lsqr*THRESH) continue; jptr = cptr->next; lptr = cptr; mscore = 0.; for (c = lb; c > wb; c--) { d = read[c]; LADDI(d) if (lsqr >= THRESH * (j-c)) { for ( ; jptr->beg >= c; jptr = (lptr = jptr)->next) if (jptr->score > mscore) mscore = jptr->score; if (lsqr >= mscore * 
(j-c)) { mscore = (1. * lsqr) / (j-c); if (lptr->beg == c) { lptr->end = j; lptr->score = mscore; } else { aptr->beg = c; aptr->end = j; aptr->score = mscore; aptr->prev = lptr; lptr = lptr->next = aptr; aptr = aptr->next; jptr->prev = lptr; lptr->next = jptr; } } } } for (c++; c <= lb; c++) { d = read[c]; LDELI(d) } } } while ((jptr = cptr->prev) != cptr) { c = jptr->end + 2; if (*mtop+1 >= jptr->beg) { if (*mtop < c) *mtop = c; } else { *++mtop = jptr->beg; *++mtop = c; } cptr->prev = jptr->prev; jptr->prev->next = cptr; jptr->next = aptr; aptr = jptr; } { int *jtop, ntop; ntop = 0; for (jtop = mask1; jtop < mtop; jtop += 2) if (jtop[1] - jtop[0] >= MINLEN) { mask[++ntop] = jtop[0]; mask[++ntop] = jtop[1]+1; } mtop = mask + ntop; indx += ntop*sizeof(int); fwrite(&indx,sizeof(int64),1,afile); fwrite(mask1,sizeof(int),ntop,dfile); } #ifdef DEBUG { int *jtop; printf("\nREAD %d\n",i); for (jtop = mask1; jtop < mtop; jtop += 2) printf(" [%5d,%5d]\n",jtop[0],jtop[1]); Load_Read(db,i,read,0); jtop = mask1; for (c = 0; c < len; c++) { while (jtop < mtop && c > jtop[1]) jtop += 2; if (jtop < mtop && c >= *jtop) printf("%c",Caps[(int) read[c]]); else printf("%c",Lowr[(int) read[c]]); if ((c%80) == 79) printf("\n"); } printf("\n"); } #endif } } fclose(afile); fclose(dfile); Close_DB(db); exit (0); } DAZZ_DB-1.0/DBrm.c000066400000000000000000000100721253752464600134750ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. 
* * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************** * * Remove a list of .db databases * Delete all the files for the given data bases .db ... (there are a couple * of hidden . files for each DB, and these are removed too.) Do not use "rm" to * remove a database. * * Author: Gene Myers * Date : July 2013 * ********************************************************************************************/ #include #include #include #include #include "DB.h" static char *Usage = " ... 
"; static void HANDLER(char *path, char *name) { (void) name; unlink(path); } int main(int argc, char *argv[]) { int i; Prog_Name = Strdup("DBrm",""); if (argc <= 1) fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); for (i = 1; i < argc; i++) if (List_DB_Files(argv[i],HANDLER) < 0) fprintf(stderr,"%s: Could not list database %s\n",Prog_Name,argv[i]); exit (0); } DAZZ_DB-1.0/DBshow.c000066400000000000000000000436201253752464600140440ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Display a specified set of reads of a database in fasta format. * * Author: Gene Myers * Date : September 2013 * Mod : With DB overhaul, made this a routine strictly for printing a selected subset * and created DB2fasta for recreating all the fasta files of a DB * Date : April 2014 * Mod : Added options to display QV streams * Date : July 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage[] = { "[-unqUQ] [-w] [-m]+", " [ | ... 
]" }; #define LAST_READ_SYMBOL '$' #define MAX_BUFFER 10001 typedef struct { FILE *input; int lineno; int read; int beg; int end; } File_Iterator; File_Iterator *init_file_iterator(FILE *input) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->input = input; it->lineno = 1; rewind(input); return (it); } int next_read(File_Iterator *it) { static char nbuffer[MAX_BUFFER]; char *eol; int x; if (fgets(nbuffer,MAX_BUFFER,it->input) == NULL) { if (feof(it->input)) return (1); SYSTEM_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in read list is longer than %d chars!\n", Prog_Name,it->lineno,MAX_BUFFER-1); return (1); } *eol = '\0'; x = sscanf(nbuffer," %d %d %d",&(it->read),&(it->beg),&(it->end)); if (x == 1) it->beg = -1; else if (x != 3) { fprintf(stderr,"%s: Line %d of read list is improperly formatted\n",Prog_Name,it->lineno); return (1); } it->lineno += 1; return (0); } int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; FILE *hdrs = NULL; int nfiles; char **flist = NULL; int *findx = NULL; int reps, *pts; int input_pts; File_Iterator *iter; FILE *input; int TRIM, UPPER; int DOSEQ, DOQVS, QUIVA, DAM; int WIDTH; int MMAX, MTOP; char **MASK; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DBshow") WIDTH = 80; MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); if (MASK == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("unqUQ") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; } else argv[j++] = argv[i]; argc = j; DAM = 0; TRIM = 1-flags['u']; UPPER = 1+flags['U']; DOQVS = flags['q']; DOSEQ = 1-flags['n']; QUIVA = flags['Q']; if (QUIVA && (!DOSEQ || 
MTOP > 0)) { fprintf(stderr,"%s: -Q (quiva) format request inconsistent with -n and -m options\n", Prog_Name); exit (1); } if (QUIVA) DOQVS = 1; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); exit (1); } } // Open DB or DAM, and if a DAM open also .hdr file { char *pwd, *root; int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { root = Root(argv[1],".dam"); pwd = PathTo(argv[1]); hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"r"); if (hdrs == NULL) exit (1); DAM = 1; if (QUIVA || DOQVS) { fprintf(stderr,"%s: -Q and -q options not compatible with a .dam DB\n",Prog_Name); exit (1); } free(root); free(pwd); } } // Load QVs if requested if (DOQVS) { if (Load_QVs(db) < 0) { fprintf(stderr,"%s: QVs requested, but no .qvs for data base\n",Prog_Name); exit (1); } } // Check tracks and load tracks for untrimmed DB { int i, status; for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i]); if (status == -2) printf("%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]); else if (status == -1) printf("%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]); else if (status == 0) Load_Track(db,MASK[i]); else if (status == 1 && !TRIM) printf("%s: Warning: %s track is for a trimmed db but -u is set.\n",Prog_Name,MASK[i]); } } // If not a DAM then get prolog names and index ranges from the .db file if (!DAM) { char *pwd, *root; FILE *dstub; int i; root = Root(argv[1],".db"); pwd = PathTo(argv[1]); if (db->part > 0) *rindex(root,'.') = '\0'; dstub = Fopen(Catenate(pwd,"/",root,".db"),"r"); if (dstub == NULL) exit (1); free(pwd); free(root); if (fscanf(dstub,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR flist = (char **) Malloc(sizeof(char *)*nfiles,"Allocating file list"); findx = (int *) Malloc(sizeof(int *)*(nfiles+1),"Allocating file index"); if (flist == NULL || findx == NULL) exit (1); findx += 1; findx[-1] = 0; for (i = 0; i < 
nfiles; i++) { char prolog[MAX_NAME], fname[MAX_NAME]; if (fscanf(dstub,DB_FDATA,findx+i,fname,prolog) != 3) SYSTEM_ERROR if ((flist[i] = Strdup(prolog,"Adding to file list")) == NULL) exit (1); } fclose(dstub); // If TRIM (the default) then "trim" prolog ranges and the DB if (TRIM) { int nid, oid, lid; int cutoff, allflag; HITS_READ *reads; reads = db->reads - db->ufirst; cutoff = db->cutoff; if (db->all) allflag = 0; else allflag = DB_BEST; nid = 0; oid = db->ufirst; lid = oid + db->nreads; for (i = 0; i < nfiles; i++) { while (oid < findx[i] && oid < lid) { if ((reads[oid].flags & DB_BEST) >= allflag && reads[oid].rlen >= cutoff) nid++; oid += 1; } findx[i] = nid; } } else if (db->part > 0) { for (i = 0; i < nfiles; i++) findx[i] -= db->ufirst; } } if (TRIM) { int i, status; Trim_DB(db); // Load tracks for trimmed DB for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i]); if (status < 0) continue; else if (status == 1) Load_Track(db,MASK[i]); } } // Process read index arguments into a list of read ranges input_pts = 0; if (argc == 3) { if (argv[2][0] != LAST_READ_SYMBOL || argv[2][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[2],&eptr,10); if (eptr > argv[2] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if (input_pts) { input = Fopen(argv[2],"r"); if (input == NULL) exit (1); iter = init_file_iterator(input); } else { pts = (int *) Malloc(sizeof(int)*2*(argc-1),"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 2) { int c, b, e; char *eptr, *fptr; for (c = 2; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == 0) { pts[reps++] = b; 
pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = db->nreads; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } } else { pts[reps++] = 1; pts[reps++] = db->nreads; } } // Display each read (and/or QV streams) in the active DB according to the // range pairs in pts[0..reps) and according to the display options. { HITS_READ *reads; HITS_TRACK *first; char *read, **entry; int c, b, e, i; int hilight, substr; int map; int (*iscase)(int); read = New_Read_Buffer(db); if (DOQVS) { entry = New_QV_Buffer(db); first = db->tracks->next; } else { entry = NULL; first = db->tracks; } if (UPPER == 1) { hilight = 'A'-'a'; iscase = islower; } else { hilight = 'a'-'A'; iscase = isupper; } map = 0; reads = db->reads; substr = 0; c = 0; while (1) { if (input_pts) { if (next_read(iter)) break; e = iter->read; b = e-1; substr = (iter->beg >= 0); } else { if (c >= reps) break; b = pts[c]-1; e = pts[c+1]; if (e > db->nreads) e = db->nreads; c += 2; } for (i = b; i < e; i++) { int len; int fst, lst; int flags, qv; HITS_READ *r; HITS_TRACK *track; r = reads + i; len = r->rlen; flags = r->flags; qv = (flags & DB_QV); if (DAM) { char header[MAX_NAME]; fseeko(hdrs,r->coff,SEEK_SET); fgets(header,MAX_NAME,hdrs); header[strlen(header)-1] = '\0'; printf("%s :: Contig %d[%d,%d]",header,r->origin,r->fpulse,r->fpulse+len); } else { while (i < findx[map-1]) map -= 1; while (i >= findx[map]) map += 1; if (QUIVA) printf("@%s/%d/%d_%d",flist[map],r->origin,r->fpulse,r->fpulse+len); else printf(">%s/%d/%d_%d",flist[map],r->origin,r->fpulse,r->fpulse+len); if (qv > 0) printf(" RQ=0.%3d",qv); } printf("\n"); if (DOQVS) Load_QVentry(db,i,entry,UPPER); if (DOSEQ) Load_Read(db,i,read,UPPER); for 
(track = first; track != NULL; track = track->next) { int64 *anno; int *data; int64 s, f, j; int bd, ed, m; anno = (int64 *) track->anno; data = (int *) track->data; s = (anno[i] >> 2); f = (anno[i+1] >> 2); if (s < f) { for (j = s; j < f; j += 2) { bd = data[j]; ed = data[j+1]; if (DOSEQ) for (m = bd; m < ed; m++) if (iscase(read[m])) read[m] = (char) (read[m] + hilight); if (j == s) printf("> %s:",track->name); printf(" [%d,%d]",bd,ed); } printf("\n"); } } if (substr) { fst = iter->beg; lst = iter->end; } else { fst = 0; lst = len; } if (QUIVA) { int k; for (k = 0; k < 5; k++) printf("%.*s\n",lst-fst,entry[k]+fst); } else { if (DOQVS) { int j, k; printf("\n"); for (j = fst; j+WIDTH < lst; j += WIDTH) { if (DOSEQ) printf("%.*s\n",WIDTH,read+j); for (k = 0; k < 5; k++) printf("%.*s\n",WIDTH,entry[k]+j); printf("\n"); } if (j < lst) { if (DOSEQ) printf("%.*s\n",lst-j,read+j); for (k = 0; k < 5; k++) printf("%.*s\n",lst-j,entry[k]+j); printf("\n"); } } else if (DOSEQ) { int j; for (j = fst; j+WIDTH < lst; j += WIDTH) printf("%.*s\n",WIDTH,read+j); if (j < lst) printf("%.*s\n",lst-j,read+j); } } } } } if (input_pts) { fclose(input); free(iter); } else free(pts); if (DAM) fclose(hdrs); else { int i; for (i = 0; i < nfiles; i++) free(flist[i]); free(flist); free(findx-1); } Close_DB(db); exit (0); } DAZZ_DB-1.0/DBsplit.c000066400000000000000000000213021253752464600142100ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. 
* * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Split a .db into a set of sub-database blocks for use by the Dazzler: * Divide the database .db conceptually into a series of blocks referable to on the * command line as .1.db, .2.db, ... If the -x option is set then all reads * less than the given length are ignored, and if the -a option is not set then secondary * reads from a given well are also ignored. The remaining reads are split amongst the * blocks so that each block is of size -s * 1Mbp except for the last which necessarily * contains a smaller residual. 
The default value for -s is 400Mbp because blocks of this * size can be compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks * are very space efficient in that their sub-index of the master .idx is computed on the * fly when loaded, and the .bps file of base pairs is shared with the master DB. Any * tracks associated with the DB are also computed on the fly when loading a database block. * * Author: Gene Myers * Date : September 2013 * Mod : New splitting definition to support incrementality, and new stub file format * Date : April 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-a] [-x] [-s] "; int main(int argc, char *argv[]) { HITS_DB db, dbs; int64 dbpos; FILE *dbfile, *ixfile; int status; int ALL; int CUTOFF; int SIZE; { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DBsplit") CUTOFF = 0; SIZE = 200; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("a") break; case 'x': ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff") break; case 's': ARG_POSITIVE(SIZE,"Block size") break; } else argv[j++] = argv[i]; argc = j; ALL = flags['a']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open db status = Open_DB(argv[1],&db); if (status < 0) exit (1); if (db.part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } { char *pwd, *root; char buffer[2*MAX_NAME+100]; int nfiles; int i; pwd = PathTo(argv[1]); if (status) { root = Root(argv[1],".dam"); dbfile = Fopen(Catenate(pwd,"/",root,".dam"),"r+"); } else { root = Root(argv[1],".db"); dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r+"); } ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+"); if (dbfile == NULL || ixfile == NULL) exit (1); free(pwd); free(root); if 
(fscanf(dbfile,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR for (i = 0; i < nfiles; i++) if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL) SYSTEM_ERROR if (fread(&dbs,sizeof(HITS_DB),1,ixfile) != 1) SYSTEM_ERROR if (dbs.cutoff >= 0) { printf("You are about to overwrite the current partition settings. This\n"); printf("will invalidate any tracks, overlaps, and other derivative files.\n"); printf("Are you sure you want to proceed? [Y/N] "); fflush(stdout); if (fgets(buffer,100,stdin) == NULL) SYSTEM_ERROR if (index(buffer,'n') != NULL || index(buffer,'N') != NULL) { printf("Aborted\n"); fflush(stdout); fclose(dbfile); exit (1); } } dbpos = ftello(dbfile); fseeko(dbfile,dbpos,SEEK_SET); fprintf(dbfile,DB_NBLOCK,0); fprintf(dbfile,DB_PARAMS,(int64) SIZE,CUTOFF,ALL); } { HITS_READ *reads = db.reads; int nreads = db.ureads; int64 size, totlen; int nblock, ireads, treads, rlen, fno; int i; size = SIZE*1000000ll; nblock = 0; totlen = 0; ireads = 0; treads = 0; fprintf(dbfile,DB_BDATA,0,0); if (ALL) for (i = 0; i < nreads; i++) { rlen = reads[i].rlen; if (rlen >= CUTOFF) { ireads += 1; treads += 1; totlen += rlen; if (totlen >= size) { fprintf(dbfile,DB_BDATA,i+1,treads); totlen = 0; ireads = 0; nblock += 1; } } } else for (i = 0; i < nreads; i++) { rlen = reads[i].rlen; if (rlen >= CUTOFF && (reads[i].flags & DB_BEST) != 0) { ireads += 1; treads += 1; totlen += rlen; if (totlen >= size) { fprintf(dbfile,DB_BDATA,i+1,treads); totlen = 0; ireads = 0; nblock += 1; } } } if (ireads > 0) { fprintf(dbfile,DB_BDATA,nreads,treads); nblock += 1; } fno = fileno(dbfile); if (ftruncate(fno,ftello(dbfile)) < 0) SYSTEM_ERROR fseeko(dbfile,dbpos,SEEK_SET); fprintf(dbfile,DB_NBLOCK,nblock); dbs.cutoff = CUTOFF; dbs.all = ALL; dbs.treads = treads; rewind(ixfile); fwrite(&dbs,sizeof(HITS_DB),1,ixfile); } fclose(ixfile); fclose(dbfile); Close_DB(&db); exit (0); } 
DAZZ_DB-1.0/DBstats.c000066400000000000000000000266411253752464600142260ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 
122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Display statistics about the contents of a .db and a histogram of its read lengths. * * Author: Gene Myers * Date : July 2013 * Mod : April 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" static char *Usage = " [-nu] [-b] [-m]+ "; int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; int dam; int64 ototal; int oreads; int nbin, *hist; int64 *bsum; int NONE; int TRIM; int BIN; int MMAX, MTOP; char **MASK; { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DBstats") BIN = 1000; MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); if (MASK == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("nu") break; case 'b': ARG_POSITIVE(BIN,"Bin size") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; } else argv[j++] = argv[i]; argc = j; NONE = flags['n']; TRIM = 1-flags['u']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } { int i, status; // Open .db or .dam status = Open_DB(argv[1],db); if (status < 0) exit (1); dam = status; // Check tracks and load tracks for untrimmed DB for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i]); if (status == -2) fprintf(stderr,"%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]); else if (status == -1) fprintf(stderr,"%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]); else if (status == 0) Load_Track(db,MASK[i]); else if (status == 1 && !TRIM) 
fprintf(stderr,"%s: Warning: %s track is for a trimmed db but -u is set.\n", Prog_Name,MASK[i]); } oreads = db->nreads; ototal = db->totlen; if (TRIM) { Trim_DB(db); // Load tracks for trimmed DB for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i]); if (status < 0) continue; else if (status == 1) Load_Track(db,MASK[i]); } } } { int i; int64 totlen; int nreads, maxlen; int64 ave, dev; HITS_READ *reads; nreads = db->nreads; totlen = db->totlen; maxlen = db->maxlen; reads = db->reads; nbin = (maxlen-1)/BIN + 1; hist = (int *) Malloc(sizeof(int)*nbin,"Allocating histograms"); bsum = (int64 *) Malloc(sizeof(int64)*nbin,"Allocating histograms"); if (hist == NULL || bsum == NULL) exit (1); for (i = 0; i < nbin; i++) { hist[i] = 0; bsum[i] = 0; } for (i = 0; i < nreads; i++) { int rlen = reads[i].rlen; hist[rlen/BIN] += 1; bsum[rlen/BIN] += rlen; } nbin = (maxlen-1)/BIN + 1; ave = totlen/nreads; dev = 0; for (i = 0; i < nreads; i++) { int rlen = reads[i].rlen; dev += (rlen-ave)*(rlen-ave); } dev = (int64) sqrt((1.*dev)/nreads); if (dam) printf("\nStatistics for all contigs"); else if (db->all || !TRIM) printf("\nStatistics for all wells"); else printf("\nStatistics for all reads"); if (TRIM && db->cutoff > 0) { printf(" of length "); Print_Number(db->cutoff,0,stdout); printf(" bases or more\n\n"); } else if (dam) printf(" in the map index\n\n"); else printf(" in the data set\n\n"); Print_Number((int64) nreads,15,stdout); if (dam) printf(" contigs"); else printf(" reads "); if (TRIM) { printf(" out of "); Print_Number((int64 ) oreads,15,stdout); printf(" (%5.1f%%)",(100.*nreads)/oreads); } printf("\n"); Print_Number(totlen,15,stdout); printf(" base pairs"); if (TRIM) { printf(" out of "); Print_Number(ototal,15,stdout); printf(" (%5.1f%%)",(100.*totlen)/ototal); } printf("\n\n"); Print_Number(ave,15,stdout); if (dam) printf(" average contig length\n"); else { printf(" average read length\n"); Print_Number(dev,15,stdout); printf(" standard deviation\n"); } 
printf("\n Base composition: %.3f(A) %.3f(C) %.3f(G) %.3f(T)\n", db->freq[0],db->freq[1],db->freq[2],db->freq[3]); if (!NONE) { int64 btot; int cum, skip; printf("\n Distribution of Read Lengths (Bin size = "); Print_Number((int64) BIN,0,stdout); printf(")\n\n Bin: Count %% Reads %% Bases Average\n"); if (dam) skip = 0; else skip = -1; cum = 0; btot = 0; for (i = nbin-1; i >= 0; i--) { cum += hist[i]; btot += bsum[i]; if (hist[i] != skip) { Print_Number((int64) (i*BIN),11,stdout); printf(":"); Print_Number((int64) hist[i],11,stdout); printf(" %5.1f %5.1f %9lld\n",(100.*cum)/nreads, (100.*btot)/totlen,btot/cum); } if (cum == nreads) break; } } } { int64 totlen; int numint, maxlen; int64 ave, dev; HITS_TRACK *track; for (track = db->tracks; track != NULL; track = track->next) { char *data = track->data; int64 *anno = (int64 *) track->anno; int k, rlen; int *idata, *edata; totlen = 0; numint = 0; maxlen = 0; for (k = 0; k < db->nreads; k++) { edata = (int *) (data + anno[k+1]); for (idata = (int *) (data + anno[k]); idata < edata; idata += 2) { rlen = idata[1] - *idata; numint += 1; totlen += rlen; if (rlen > maxlen) maxlen = rlen; } } nbin = (maxlen-1)/BIN + 1; for (k = 0; k < nbin; k++) { hist[k] = 0; bsum[k] = 0; } ave = totlen/numint; dev = 0; for (k = 0; k < db->nreads; k++) { edata = (int *) (data + anno[k+1]); for (idata = (int *) (data + anno[k]); idata < edata; idata += 2) { rlen = idata[1] - *idata; dev += (rlen-ave)*(rlen-ave); hist[rlen/BIN] += 1; bsum[rlen/BIN] += rlen; } } dev = (int64) sqrt((1.*dev)/numint); printf("\n\nStatistics for %s-track\n",track->name); printf("\n There are "); Print_Number(numint,0,stdout); printf(" intervals totaling "); Print_Number(totlen,0,stdout); printf(" bases (%.1f%% of all data)\n",(100.*totlen)/db->totlen); { int64 btot; int cum; printf("\n Distribution of %s intervals (Bin size = ",track->name); Print_Number((int64) BIN,0,stdout); printf(")\n\n Bin: Count %% Intervals %% Bases Average\n"); cum = 0; btot = 0; for (k = 
nbin-1; k >= 0; k--) { cum += hist[k]; btot += bsum[k]; if (hist[k] > 0) { Print_Number((int64) (k*BIN),11,stdout); printf(":"); Print_Number((int64) hist[k],11,stdout); printf(" %5.1f %5.1f %9lld\n",(100.*cum)/numint, (100.*btot)/totlen,btot/cum); if (cum == numint) break; } } printf("\n"); } } } free(hist); free(bsum); Close_DB(db); exit (0); } DAZZ_DB-1.0/DBupgrade.Dec.31.2014.c000066400000000000000000000117311253752464600160120ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Interim code: upgrade previous db to have fpulse,rlen fields * * Author: Gene Myers * Date : December 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif typedef struct { int origin; // Well # int beg; // First pulse int end; // Last pulse int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file int flags; // QV of read + flags above } HITS_OLD; int main(int argc, char *argv[]) { HITS_DB db; FILE *nxfile, *ixfile; char *pwd, *root; int i; if (argc != 2) { fprintf(stderr,"Usage: %s \n",argv[0]); exit (1); } pwd = PathTo(argv[1]); root = Root(argv[1],".db"); ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r"); nxfile = Fopen(Catenate(pwd,PATHSEP,root,".ndx"),"w"); if (ixfile == NULL || nxfile == NULL) exit (1); free(pwd); free(root); if (fread(&db,sizeof(HITS_DB),1,ixfile) != 1) SYSTEM_ERROR fwrite(&db,sizeof(HITS_DB),1,nxfile); for (i = 0; i < db.oreads; i++) { HITS_OLD orec; HITS_READ nrec; if (fread(&orec,sizeof(HITS_OLD),1,ixfile) != 1) SYSTEM_ERROR nrec.origin = orec.origin; nrec.fpulse = orec.beg; nrec.rlen = orec.end-orec.beg; nrec.boff = orec.boff; nrec.coff = orec.coff; nrec.flags = orec.flags; fwrite(&nrec,sizeof(HITS_READ),1,nxfile); } fclose(ixfile); fclose(nxfile); exit (0); } DAZZ_DB-1.0/DBupgrade.Sep.25.2014.c000066400000000000000000000125731253752464600160560ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. 
* * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Interim code: upgrade previous db to have int's for pulse positions. * * Author: Gene Myers * Date : September 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif typedef struct { int origin; // Well # uint16 beg; // First pulse uint16 end; // Last pulse int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file int flags; // QV of read + flags above } HITS_OLD; typedef struct { int origin; // Well # int beg; // First pulse int end; // Last pulse int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file int flags; // QV of read + flags above } HITS_NEW; int main(int argc, char *argv[]) { HITS_DB db; FILE *nxfile, *ixfile; char *pwd, *root; int i; if (argc != 2) { fprintf(stderr,"Usage: %s \n",argv[0]); exit (1); } pwd = PathTo(argv[1]); root = Root(argv[1],".db"); ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r"); nxfile = Fopen(Catenate(pwd,PATHSEP,root,".ndx"),"w"); if (ixfile == NULL || nxfile == NULL) exit (1); free(pwd); free(root); if (fread(&db,sizeof(HITS_DB),1,ixfile) != 1) SYSTEM_ERROR fwrite(&db,sizeof(HITS_DB),1,nxfile); for (i = 0; i < db.oreads; i++) { HITS_OLD orec; HITS_NEW nrec; if (fread(&orec,sizeof(HITS_OLD),1,ixfile) != 1) SYSTEM_ERROR nrec.origin = orec.origin; nrec.beg = orec.beg; nrec.end = orec.end; nrec.boff = orec.boff; nrec.coff = orec.coff; nrec.flags = orec.flags; fwrite(&nrec,sizeof(HITS_NEW),1,nxfile); } fclose(ixfile); fclose(nxfile); exit (0); } DAZZ_DB-1.0/DUSTupgrade.Jan.1.2015.c000066400000000000000000000117761253752464600162500ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. 
* * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 
122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Interim code: upgrade previous db to have fpulse,rlen fields * * Author: Gene Myers * Date : December 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif int main(int argc, char *argv[]) { FILE *afile, *dfile; FILE *nafile, *ndfile; char *pwd, *root; int size, tracklen; int i, vint, dint; int64 vlong; if (argc != 2) { fprintf(stderr,"Usage: %s \n",argv[0]); exit (1); } pwd = PathTo(argv[1]); root = Root(argv[1],".db"); afile = Fopen(Catenate(pwd,PATHSEP,root,".dust.anno"),"r"); dfile = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"r"); nafile = Fopen(Catenate(pwd,PATHSEP,root,".next.anno"),"w"); ndfile = Fopen(Catenate(pwd,PATHSEP,root,".next.data"),"w"); if (afile == NULL || dfile == NULL || nafile == NULL || ndfile == NULL) exit (1); free(pwd); free(root); if (fread(&tracklen,sizeof(int),1,afile) != 1) SYSTEM_ERROR fwrite(&tracklen,sizeof(int),1,nafile); if (fread(&size,sizeof(int),1,afile) != 1) SYSTEM_ERROR size = 8; fwrite(&size,sizeof(int),1,nafile); for (i = 0; i <= tracklen; i++) { if (fread(&vint,sizeof(int),1,afile) != 1) SYSTEM_ERROR vlong = vint; fwrite(&vlong,sizeof(int64),1,nafile); } vint >>= 2; for (i = 0; i < vint; i += 2) { if (fread(&dint,sizeof(int),1,dfile) != 1) SYSTEM_ERROR fwrite(&dint,sizeof(int),1,ndfile); if (fread(&dint,sizeof(int),1,dfile) != 1) SYSTEM_ERROR dint += 1; fwrite(&dint,sizeof(int),1,ndfile); } fclose(nafile); fclose(ndfile); fclose(afile); fclose(dfile); exit (0); } DAZZ_DB-1.0/Makefile000066400000000000000000000040001253752464600141370ustar00rootroot00000000000000CFLAGS = -O3 -Wall -Wextra 
-fno-strict-aliasing

# Programs built by "make" / "make all".  The three interim upgrade tools further down
# are only built when named explicitly.
ALL = fasta2DB DB2fasta quiva2DB DB2quiva DBsplit DBdust Catrack DBshow DBstats DBrm simulator \
      fasta2DAM DAM2fasta

# These targets name actions, not files.
.PHONY: all clean install package

all: $(ALL)

fasta2DB: fasta2DB.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o fasta2DB fasta2DB.c DB.c QV.c -lm

DB2fasta: DB2fasta.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o DB2fasta DB2fasta.c DB.c QV.c -lm

quiva2DB: quiva2DB.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o quiva2DB quiva2DB.c DB.c QV.c -lm

DB2quiva: DB2quiva.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o DB2quiva DB2quiva.c DB.c QV.c -lm

DBsplit: DBsplit.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o DBsplit DBsplit.c DB.c QV.c -lm

DBdust: DBdust.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o DBdust DBdust.c DB.c QV.c -lm

Catrack: Catrack.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o Catrack Catrack.c DB.c QV.c -lm

DBshow: DBshow.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o DBshow DBshow.c DB.c QV.c -lm

DBstats: DBstats.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o DBstats DBstats.c DB.c QV.c -lm

DBrm: DBrm.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o DBrm DBrm.c DB.c QV.c -lm

simulator: simulator.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o simulator simulator.c DB.c QV.c -lm

fasta2DAM: fasta2DAM.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o fasta2DAM fasta2DAM.c DB.c QV.c -lm

DAM2fasta: DAM2fasta.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o DAM2fasta DAM2fasta.c DB.c QV.c -lm

# Interim format-upgrade tools (not part of ALL).
DBupgrade.Sep.25.2014: DBupgrade.Sep.25.2014.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o DBupgrade.Sep.25.2014 DBupgrade.Sep.25.2014.c DB.c QV.c -lm

DBupgrade.Dec.31.2014: DBupgrade.Dec.31.2014.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o DBupgrade.Dec.31.2014 DBupgrade.Dec.31.2014.c DB.c QV.c -lm

DUSTupgrade.Jan.1.2015: DUSTupgrade.Jan.1.2015.c DB.c DB.h QV.c QV.h
	gcc $(CFLAGS) -o DUSTupgrade.Jan.1.2015 DUSTupgrade.Jan.1.2015.c DB.c QV.c -lm

clean:
	rm -f $(ALL)
	rm -fr *.dSYM
	rm -f DBupgrade.Sep.25.2014 DBupgrade.Dec.31.2014 DUSTupgrade.Jan.1.2015
	rm -f dazz.db.tar.gz

install:
	cp $(ALL) ~/bin

# $(MAKE) rather than "make" so the sub-make is the same program and inherits its flags.
package:
	$(MAKE) clean
	tar -zcf dazz.db.tar.gz README Makefile *.h *.c
DAZZ_DB-1.0/QV.c000066400000000000000000001132511253752464600132020ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 
122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" #undef DEBUG #define MIN_BUFFER 1000 #define HUFF_CUTOFF 16 // This cannot be larger than 16 ! /******************************************************************************************* * * Endian flipping routines * ********************************************************************************************/ static int LittleEndian; // Little-endian machine ? 
// Referred by: Decode & Decode_Run static int Flip; // Flip endian of all coded shorts and ints // Referred by: Decode & Decode_Run & Read_Scheme static void Set_Endian(int flip) { uint32 x = 3; uint8 *b = (uint8 *) (&x); Flip = flip; LittleEndian = (b[0] == 3); } static void Flip_Long(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[3]; v[3] = x; x = v[1]; v[1] = v[2]; v[2] = x; } static void Flip_Short(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[1]; v[1] = x; } /******************************************************************************************* * * Routines for computing a Huffman Encoding Scheme * ********************************************************************************************/ typedef struct { int type; // 0 => normal, 1 => normal but has long codes, 2 => truncated uint32 codebits[256]; // If type = 2, then code 255 is the special code for int codelens[256]; // non-Huffman exceptions int lookup[0x10000]; // Lookup table (just for decoding) } HScheme; typedef struct _HTree { struct _HTree *lft, *rgt; uint64 count; } HTree; // Establish heap property from node s down (1 is root, siblings of n are 2n and 2n+1) // assuming s is the only perturbation in the tree. static void Reheap(int s, HTree **heap, int hsize) { int c, l, r; HTree *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; hr = heap[r]; if (r > hsize || hr->count > hl->count) { if (hs->count > hl->count) { heap[c] = hl; c = l; } else break; } else { if (hs->count > hr->count) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } // Given Huffman tree build a table of codes from it, the low-order codelens[s] bits // of codebits[s] contain the code for symbol s. 
static void Build_Table(HTree *node, int code, int len, uint32 *codebits, int *codelens) { if (node->rgt == NULL) { uint64 symbol = (uint64) (node->lft); codebits[symbol] = code; codelens[symbol] = len; } else { code <<= 1; len += 1; Build_Table(node->lft,code,len,codebits,codelens); Build_Table(node->rgt,code+1,len,codebits,codelens); } } // For the non-zero symbols in hist, compute a huffman tree over them, and then // build a table of the codes. If inscheme is not NULL, then place all symbols // with code 255 or with more than HUFF_CUTOFF bits in the encoding by inscheme // as a single united entity, whose code signals that the value of these symbols // occur explicitly in 8 (values) or 16 (run lengths) bits following the code. // All the symbols in this class will have the same entry in the code table and // 255 is always in this class. static HScheme *Huffman(uint64 *hist, HScheme *inscheme) { HScheme *scheme; HTree *heap[259]; HTree node[512]; int hsize; HTree *lft, *rgt; int value, range; int i; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); hsize = 0; // Load heap value = 0; if (inscheme != NULL) { node[0].count = 0; node[0].lft = (HTree *) (uint64) 255; node[0].rgt = NULL; heap[++hsize] = node+(value++); } for (i = 0; i < 256; i++) if (hist[i] > 0) { if (inscheme != NULL && (inscheme->codelens[i] > HUFF_CUTOFF || i == 255)) node[0].count += hist[i]; else { node[value].count = hist[i]; node[value].lft = (HTree *) (uint64) i; node[value].rgt = NULL; heap[++hsize] = node+(value++); } } for (i = hsize/2; i >= 1; i--) // Establish heap property Reheap(i,heap,hsize); range = value; // Merge pairs with smallest count until have a tree for (i = 1; i < value; i++) { lft = heap[1]; heap[1] = heap[hsize--]; Reheap(1,heap,hsize); rgt = heap[1]; node[range].lft = lft; node[range].rgt = rgt; node[range].count = lft->count + rgt->count; heap[1] = node+(range++); Reheap(1,heap,hsize); } for (i = 0; i < 
256; i++) // Build the code table { scheme->codebits[i] = 0; scheme->codelens[i] = 0; } Build_Table(node+(range-1),0,0,scheme->codebits,scheme->codelens); if (inscheme != NULL) // Set scheme type and if truncated (2), map truncated codes { scheme->type = 2; // to code and length for 255 for (i = 0; i < 255; i++) if (inscheme->codelens[i] > HUFF_CUTOFF || scheme->codelens[i] > HUFF_CUTOFF) { scheme->codelens[i] = scheme->codelens[255]; scheme->codebits[i] = scheme->codebits[255]; } } else { scheme->type = 0; for (i = 0; i < 256; i++) { if (scheme->codelens[i] > HUFF_CUTOFF) scheme->type = 1; } } return (scheme); } #ifdef DEBUG // For debug, show the coding table static void Print_Table(HScheme *scheme, uint64 *hist, int infosize) { uint64 total_bits; uint32 specval, mask, code, *bits; int speclen, clen, *lens; int i, k; total_bits = 0; bits = scheme->codebits; lens = scheme->codelens; if (scheme->type == 2) { specval = bits[255]; speclen = lens[255]; } else specval = speclen = 0x7fffffff; printf("\nCode Table:\n"); for (i = 0; i < 256; i++) if (lens[i] > 0) { clen = lens[i]; mask = (1 << clen); code = bits[i]; printf(" %3d: %2d ",i,clen); for (k = 0; k < clen; k++) { mask >>= 1; if (code & mask) printf("1"); else printf("0"); } if (code == specval && clen == speclen) { printf(" ***"); if (hist != NULL) total_bits += (clen+infosize)*hist[i]; } else if (hist != NULL) total_bits += clen*hist[i]; printf("\n"); } if (hist != NULL) printf("\nTotal Bytes = %lld\n",(total_bits-1)/8+1); } // For debug, show the histogram static void Print_Histogram(uint64 *hist) { int i, low, hgh; uint64 count; for (hgh = 255; hgh >= 0; hgh--) if (hist[hgh] != 0) break; for (low = 0; low < 256; low++) if (hist[low] != 0) break; count = 0; for (i = low; i <= hgh; i++) count += hist[i]; for (i = hgh; i >= low; i--) printf(" %3d: %8llu %5.1f%%\n",i,hist[i],(hist[i]*100.)/count); } #endif /******************************************************************************************* * * Read and 
Write Huffman Schemes * ********************************************************************************************/ // Write the code table to out. static void Write_Scheme(HScheme *scheme, FILE *out) { int i; uint8 x; uint32 *bits; int *lens; lens = scheme->codelens; bits = scheme->codebits; x = (uint8) (scheme->type); fwrite(&x,1,1,out); for (i = 0; i < 256; i++) { x = (uint8) (lens[i]); fwrite(&x,1,1,out); if (x > 0) fwrite(bits+i,sizeof(uint32),1,out); } } // Allocate and read a code table from in, and return a pointer to it. static HScheme *Read_Scheme(FILE *in) { HScheme *scheme; int *look, *lens; uint32 *bits, base; int i, j, powr; uint8 x; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); lens = scheme->codelens; bits = scheme->codebits; look = scheme->lookup; if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n"); free(scheme); return (NULL); } scheme->type = x; for (i = 0; i < 256; i++) { if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i); return (NULL); } lens[i] = x; if (x > 0) { if (fread(bits+i,sizeof(uint32),1,in) != 1) { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i); free(scheme); return (NULL); } } else bits[i] = 0; } if (Flip) { for (i = 0; i < 256; i++) Flip_Long(bits+i); } for (i = 0; i < 256; i++) { if (lens[i] > 0) { base = (bits[i] << (16-lens[i])); powr = (1 << (16-lens[i])); for (j = 0; j < powr; j++) look[base+j] = i; } } return (scheme); } /******************************************************************************************* * * Encoders and Decoders * ********************************************************************************************/ // Encode read[0..rlen-1] according to scheme and write to out static void Encode(HScheme *scheme, FILE *out, uint8 *read, int rlen) { uint32 x, c, ocode; int n, k, olen, llen; int *nlens; uint32 *nbits; 
uint32 nspec; int nslen; nlens = scheme->codelens; nbits = scheme->codebits; if (scheme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; #define OCODE(L,C) \ { int len = olen + (L); \ uint32 code = (C); \ \ llen = olen; \ if (len >= 32) \ { olen = len-32; \ ocode |= (code >> olen); \ fwrite(&ocode,sizeof(uint32),1,out); \ if (olen > 0) \ ocode = (code << (32-olen)); \ else \ ocode = 0; \ } \ else \ { olen = len; \ ocode |= (code << (32-olen));; \ } \ } llen = 0; olen = 0; ocode = 0; for (k = 0; k < rlen; k++) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); } if (olen > 0) // Tricky: must pad so decoder does not read past { fwrite(&ocode,sizeof(uint32),1,out); // last integer int the coded output. if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Encode read[0..rlen-1] according to non-rchar table neme, and run-length table reme for // runs of rchar characters. Write to out. 
  //  The stream is coded as alternating items: the length of a (possibly empty) run of
  //    rchar coded with reme, then one non-rchar symbol coded with neme, and so on.
  //    Run lengths >= 255 all use reme's code for 255; whenever the code emitted for a
  //    run equals the code of 255 the true length follows explicitly in 16 bits.  For a
  //    type 2 neme, a symbol with the special code of 255 is followed by its 8-bit value.
  //    Relies on the OCODE macro defined in Encode.

static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int rlen, int rchar)
{ uint32  x, c, ocode;
  int     n, h, k, olen, llen;
  int    *nlens, *rlens;
  uint32 *nbits, *rbits;
  uint32  nspec, rspec;
  int     nslen, rslen;

  nlens = neme->codelens;
  nbits = neme->codebits;
  rlens = reme->codelens;
  rbits = reme->codebits;

  if (neme->type == 2)
    { nspec = nbits[255];
      nslen = nlens[255];
    }
  else
    nspec = nslen = 0x7fffffff;   //  sentinel that matches no real code/length pair

  rspec = rbits[255];
  rslen = rlens[255];

  llen  = 0;
  olen  = 0;
  ocode = 0;
  k = 0;
  while (k < rlen)
    { h = k;                      //  h = start of the run, k-h = its length (may be 0)
      while (k < rlen && read[k] == rchar)
        k += 1;
      if (k-h >= 255)
        x = 255;
      else
        x = k-h;
      n = rlens[x];
      c = rbits[x];
      OCODE(n,c);
      if (c == rspec && n == rslen)
        OCODE(16,k-h);
      if (k < rlen)               //  the symbol that ended the run
        { x = read[k];
          n = nlens[x];
          c = nbits[x];
          OCODE(n,c);
          if (c == nspec && n == nslen)
            OCODE(8,x);
          k += 1;
        }
    }

  //  Same padding rule as at the end of Encode

  if (olen > 0)
    { fwrite(&ocode,sizeof(uint32),1,out);
      if (llen > 16 && olen > llen)
        fwrite(&ocode,sizeof(uint32),1,out);
    }
  else if (llen > 16)
    fwrite(&ocode,sizeof(uint32),1,out);
}

  //  Read and decode from in, the next rlen symbols into read according to scheme.
  //    Returns 0 on success and 1 if the input runs out of 32-bit words.
  //
  //  icode is a 64-bit shift register.  New 32-bit words are read into its low half
  //    (via ipart); xpart views bits 32..47, the next 16 undecoded bits, which index the
  //    scheme's lookup table, and cpart views bits 40..47, an explicit 8-bit value.  The
  //    pointer offsets are chosen according to LittleEndian so the views are the same
  //    on either byte order.

static int Decode(HScheme *scheme, FILE *in, char *read, int rlen)

{ int    *look, *lens;
  int     signal, ilen;
  uint64  icode;
  uint32 *ipart;
  uint16 *xpart;
  uint8  *cpart;
  int     j, n, c;

  if (LittleEndian)
    { ipart = ((uint32 *) (&icode));
      xpart = ((uint16 *) (&icode)) + 2;
      cpart = ((uint8  *) (&icode)) + 5;
    }
  else
    { ipart = ((uint32 *) (&icode)) + 1;
      xpart = ((uint16 *) (&icode)) + 1;
      cpart = ((uint8  *) (&icode)) + 2;
    }

  if (scheme->type == 2)
    signal  = 255;
  else
    signal  = 256;        //  no decoded symbol can equal 256, so nothing is "special"
  lens = scheme->codelens;
  look = scheme->lookup;

  /* GET: discard the n bits consumed by the previous item by shifting icode left,
     refilling the low 32 bits from in when fewer than n unread bits (ilen) remain.
     Makes the enclosing function return 1 if the read fails.                        */

#define GET                                                             \
  if (n > ilen)                                                         \
    { icode <<= ilen;                                                   \
      if (fread(ipart,sizeof(uint32),1,in) != 1)                        \
        { EPRINTF(EPLACE,"Could not read more bits (Decode)\n");        \
          return (1);                                                   \
        }                                                               \
      ilen    = n-ilen;                                                 \
      icode <<= ilen;                                                   \
      ilen    = 32-ilen;                                                \
    }                                                                   \
  else                                                                  \
    { icode <<= n;                                                      \
      ilen   -= n;                                                      \
    }

  /* GETFLIP: as GET, but byte-flips each 32-bit word after reading it. */

#define GETFLIP                                                         \
  if (n > ilen)                                                         \
    { icode <<= ilen;                                                   \
      if (fread(ipart,sizeof(uint32),1,in) != 1)                        \
        { EPRINTF(EPLACE,"Could not read more bits (Decode)\n");        \
          return (1);                                                   \
        }                                                               \
      Flip_Long(ipart);                                                 \
      ilen    = n-ilen;                                                 \
      icode <<= ilen;                                                   \
      ilen    = 32-ilen;                                                \
    }                                                                   \
  else                                                                  \
    { icode <<= n;                                                      \
      ilen   -= n;                                                      \
    }

  n     = 16;             //  first GET loads a word and exposes its top 16 bits in *xpart
  ilen  = 0;
  icode = 0;
  if (Flip)
    for (j = 0; j < rlen; j++)
      { GETFLIP
        c = look[*xpart];
        n = lens[c];
        if (c == signal)  //  special code: the real value is in the following 8 bits
          { GETFLIP
            c = *cpart;
            n = 8;
          }
        read[j] = (char) c;
      }
  else
    for (j = 0; j < rlen; j++)
      { GET
        c = look[*xpart];
        n = lens[c];
        if (c == signal)
          { GET
            c = *cpart;
            n = 8;
          }
        read[j] = (char) c;
      }

  return (0);
}

  //  Read and decode from in, the next rlen symbols into read according to non-rchar scheme
  //    neme, and the rchar runlength scheme reme.  Returns 0 on success and 1 if the input
  //    runs out of 32-bit words.  Uses the GET/GETFLIP macros and the icode layout of Decode.
  //    NOTE(review): the decoded run length is not checked against rlen before read[] is
  //    filled, so the caller's buffer is only safe for well-formed input.

static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read,
                      int rlen, int rchar)

{ int    *nlook, *nlens;
  int    *rlook, *rlens;
  int     nsignal, ilen;
  uint64  icode;
  uint32 *ipart;
  uint16 *xpart;
  uint8  *cpart;
  int     j, n, c, k;

  if (LittleEndian)
    { ipart = ((uint32 *) (&icode));
      xpart = ((uint16 *) (&icode)) + 2;
      cpart = ((uint8  *) (&icode)) + 5;
    }
  else
    { ipart = ((uint32 *) (&icode)) + 1;
      xpart = ((uint16 *) (&icode)) + 1;
      cpart = ((uint8  *) (&icode)) + 2;
    }

  if (neme->type == 2)
    nsignal = 255;
  else
    nsignal = 256;
  nlens = neme->codelens;
  nlook = neme->lookup;

  rlens = reme->codelens;
  rlook = reme->lookup;

  n     = 16;
  ilen  = 0;
  icode = 0;
  if (Flip)
    for (j = 0; j < rlen; j++)
      { GETFLIP
        c = rlook[*xpart];
        n = rlens[c];
        if (c == 255)     //  run length given explicitly in the next 16 bits
          { GETFLIP
            c = *xpart;
            n = 16;
          }
        for (k = 0; k < c; k++)
          read[j++] = (char) rchar;

        if (j < rlen)     //  the non-rchar symbol that follows the run
          { GETFLIP
            c = nlook[*xpart];
            n = nlens[c];
            if (c == nsignal)
              { GETFLIP
                c = *cpart;
                n = 8;
              }
            read[j] = (char) c;
          }
      }
  else
    for (j = 0; j < rlen; j++)
      { GET
        c = rlook[*xpart];
        n = rlens[c];
        if (c == 255)
          { GET
            c = *xpart;
            n = 16;
          }
        for (k = 0; k < c; k++)
          read[j++] = (char) rchar;

        if (j < rlen)
          { GET
            c = nlook[*xpart];
            n = nlens[c];
            if (c == nsignal)
              { GET
                c = *cpart;
                n = 8;
              }
            read[j] = (char) c;
          }
      }

  return (0);
}

/*******************************************************************************************
 *
 *  Histogrammers
 *
********************************************************************************************/ // Histogram runlengths of symbol runChar in stream[0..rlen-1] into run. static void Histogram_Seqs(uint64 *hist, uint8 *stream, int rlen) { int k; for (k = 0; k < rlen; k++) hist[stream[k]] += 1; } static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar) { int k, h; k = 0; while (k < rlen) { h = k; while (k < rlen && stream[k] == runChar) k += 1; if (k-h >= 256) run[255] += 1; else run[k-h] += 1; if (k < rlen) k += 1; } } /******************************************************************************************* * * Reader * ********************************************************************************************/ static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan, static int Rmax = -1; // Compress_Next_QVentry static int Nline; // Referred by: QVcoding_Scan char *QVentry() { return (Read); } // If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines // for a sequence. Place line j at Read+j*Rmax and the length of every line is returned // unless eof occurs in which case return -1. If any error occurs return -2. 
int Read_Lines(FILE *input, int nlines) { int i, rlen; int tmax; char *tread; char *other; if (Read == NULL) { tmax = MIN_BUFFER; tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; } Nline += 1; if (fgets(Read,Rmax,input) == NULL) return (-1); rlen = strlen(Read); while (Read[rlen-1] != '\n') { tmax = ((int) 1.4*Rmax) + MIN_BUFFER; tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; if (fgets(Read+rlen,Rmax-rlen,input) == NULL) { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline); EXIT(-2); } rlen += strlen(Read+rlen); } other = Read; for (i = 1; i < nlines; i++) { other += Rmax; Nline += 1; if (fgets(other,Rmax,input) == NULL) { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT(-2); } if (rlen != (int) strlen(other)) { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline); EXIT(-2); } } return (rlen-1); } /******************************************************************************************* * * Tag compression and decompression routines * ********************************************************************************************/ // Keep only the symbols in tags[0..rlen-1] for which qvs[k] != rchar and // return the # of symbols kept. static int Pack_Tag(char *tags, char *qvs, int rlen, int rchar) { int j, k; j = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) tags[j++] = tags[k]; tags[j] = '\0'; return (j); } // Count the # of non-rchar symbols in qvs[0..rlen-1] static int Packed_Length(char *qvs, int rlen, int rchar) { int k, clen; clen = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) clen += 1; return (clen); } // Unpack tags by moving its i'th char to position k where qvs[k] is the i'th non-rchar // symbol in qvs. All other chars are set to rchar. 
rlen is the length of qvs and // the unpacked result, clen is the initial length of tags. static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar) { int j, k; j = clen-1; for (k = rlen-1; k >= 0; k--) { if (qvs[k] == rchar) tags[k] = 'n'; else tags[k] = tags[j--]; } } /******************************************************************************************* * * Statistics Scan and Scheme creation and write * ********************************************************************************************/ // Read .quiva file from input, recording stats in the histograms. If zero is set then // start the stats anew with this file. static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256]; static uint64 totChar; static int delChar, subChar; // Referred by: QVcoding_Scan, Create_QVcoding int QVcoding_Scan(FILE *input) { char *slash; int rlen; // Zero histograms bzero(delHist,sizeof(uint64)*256); bzero(mrgHist,sizeof(uint64)*256); bzero(insHist,sizeof(uint64)*256); bzero(subHist,sizeof(uint64)*256); { int i; for (i = 0; i < 256; i++) delRun[i] = subRun[i] = 1; } totChar = 0; delChar = -1; subChar = -1; // Make a sweep through the .quiva entries, histogramming the relevant things // and figuring out the run chars for the deletion and substition streams Nline = 0; while (1) { int well, beg, end, qv; rlen = Read_Lines(input,1); if (rlen == -2) EXIT(1); if (rlen < 0) break; if (rlen == 0 || Read[0] != '@') { EPRINTF(EPLACE,"Line %d: Header in quiv file is missing\n",Nline); EXIT(1); } slash = index(Read+1,'/'); if (slash == NULL) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(1); } if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(1); } rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv 
file\n",Nline); EXIT(1); } Histogram_Seqs(delHist,(uint8 *) (Read),rlen); Histogram_Seqs(insHist,(uint8 *) (Read+2*Rmax),rlen); Histogram_Seqs(mrgHist,(uint8 *) (Read+3*Rmax),rlen); Histogram_Seqs(subHist,(uint8 *) (Read+4*Rmax),rlen); if (delChar < 0) { int k; char *del = Read+Rmax; for (k = 0; k < rlen; k++) if (del[k] == 'n' || del[k] == 'N') { delChar = Read[k]; break; } } if (delChar >= 0) Histogram_Runs( delRun,(uint8 *) (Read),rlen,delChar); totChar += rlen; if (subChar < 0) { if (totChar >= 100000) { int k; subChar = 0; for (k = 1; k < 256; k++) if (subHist[k] > subHist[subChar]) subChar = k; } } if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar); } return (0); } // Using the statistics in the global stat tables, create the Huffman schemes and write // them to output. If lossy is set, then create a lossy table for the insertion and merge // QVs. QVcoding *Create_QVcoding(int lossy) { static QVcoding coding; HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; HScheme *dRunScheme, *sRunScheme; delScheme = NULL; dRunScheme = NULL; insScheme = NULL; mrgScheme = NULL; subScheme = NULL; sRunScheme = NULL; // Check whether using a subtitution run char is a win if (totChar < 200000 || subHist[subChar] < .5*totChar) subChar = -1; // If lossy encryption is enabled then scale insertions and merge QVs. 
if (lossy) { int k; for (k = 0; k < 256; k += 2) { insHist[k] += insHist[k+1]; insHist[k+1] = 0; } for (k = 0; k < 256; k += 4) { mrgHist[k] += mrgHist[k+1]; mrgHist[k] += mrgHist[k+2]; mrgHist[k] += mrgHist[k+3]; mrgHist[k+1] = 0; mrgHist[k+2] = 0; mrgHist[k+3] = 0; } } // Build a Huffman scheme for each stream entity from the histograms #define SCHEME_MACRO(meme,hist,label,bits) \ scheme = Huffman( (hist), NULL); \ if (scheme == NULL) \ goto error; \ if (scheme->type) \ { (meme) = Huffman( (hist), scheme); \ free(scheme); \ } \ else \ (meme) = scheme; #ifdef DEBUG #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) \ printf("\n%s\n", (label) ); \ Print_Histogram( (hist)); \ Print_Table( (meme), (hist), (bits)); #else #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) #endif { HScheme *scheme; if (delChar < 0) { MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs", 8); dRunScheme = NULL; } else { delHist[delChar] = 0; MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs less run char", 8); MAKE_SCHEME(dRunScheme,delRun, "Histogram of Deletion Runs QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",delChar); #endif } #ifdef DEBUG { int k; uint64 count; count = 0; for (k = 0; k < 256; k++) count += delHist[k]; printf("\nDelTag will require %lld bytes\n",count/4); } #endif MAKE_SCHEME(insScheme,insHist, "Hisotgram of Insertion QVs", 8); MAKE_SCHEME(mrgScheme,mrgHist, "Hisotgram of Merge QVs", 8); if (subChar < 0) { MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs", 8); sRunScheme = NULL; } else { subHist[subChar] = 0; MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs less run char", 8); MAKE_SCHEME(sRunScheme,subRun, "Histogram of Substitution Run QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",subChar); #endif } } // Setup endian handling Set_Endian(0); coding.delScheme = delScheme; coding.insScheme = insScheme; coding.mrgScheme = mrgScheme; coding.subScheme = 
subScheme; coding.dRunScheme = dRunScheme; coding.sRunScheme = sRunScheme; coding.delChar = delChar; coding.subChar = subChar; coding.prefix = NULL; coding.flip = 0; return (&coding); error: if (delScheme != NULL) free(delScheme); if (dRunScheme != NULL) free(dRunScheme); if (insScheme != NULL) free(insScheme); if (mrgScheme != NULL) free(mrgScheme); if (subScheme != NULL) free(subScheme); if (sRunScheme != NULL) free(sRunScheme); EXIT(NULL); } // Write the encoding scheme 'coding' to 'output' void Write_QVcoding(FILE *output, QVcoding *coding) { // Write out the endian key, run chars, and prefix (if not NULL) { uint16 half; int len; half = 0x33cc; fwrite(&half,sizeof(uint16),1,output); if (coding->delChar < 0) half = 256; else half = (uint16) (coding->delChar); fwrite(&half,sizeof(uint16),1,output); if (coding->subChar < 0) half = 256; else half = (uint16) (coding->subChar); fwrite(&half,sizeof(uint16),1,output); len = strlen(coding->prefix); fwrite(&len,sizeof(int),1,output); fwrite(coding->prefix,1,len,output); } // Write out the scheme tables Write_Scheme(coding->delScheme,output); if (coding->delChar >= 0) Write_Scheme(coding->dRunScheme,output); Write_Scheme(coding->insScheme,output); Write_Scheme(coding->mrgScheme,output); Write_Scheme(coding->subScheme,output); if (coding->subChar >= 0) Write_Scheme(coding->sRunScheme,output); } // Read the encoding scheme 'coding' to 'output' QVcoding *Read_QVcoding(FILE *input) { static QVcoding coding; // Read endian key, run chars, and short name common to all headers { uint16 half; int len; if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n"); EXIT(NULL); } coding.flip = (half != 0x33cc); if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.delChar = half; if (coding.delChar >= 256) coding.delChar = -1; if (fread(&half,sizeof(uint16),1,input) != 1) 
{ EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.subChar = half; if (coding.subChar >= 256) coding.subChar = -1; // Read the short name common to all headers if (fread(&len,sizeof(int),1,input) != 1) { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Long(&len); coding.prefix = (char *) Malloc(len+1,"Allocating header prefix"); if (coding.prefix == NULL) EXIT(NULL); if (len > 0) { if (fread(coding.prefix,len,1,input) != 1) { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n"); EXIT(NULL); } } coding.prefix[len] = '\0'; } // Setup endian handling Set_Endian(coding.flip); // Read the Huffman schemes used to compress the data coding.delScheme = NULL; coding.dRunScheme = NULL; coding.insScheme = NULL; coding.mrgScheme = NULL; coding.subScheme = NULL; coding.sRunScheme = NULL; coding.delScheme = Read_Scheme(input); if (coding.delScheme == NULL) goto error; if (coding.delChar >= 0) { coding.dRunScheme = Read_Scheme(input); if (coding.dRunScheme == NULL) goto error; } coding.insScheme = Read_Scheme(input); if (coding.insScheme == NULL) goto error; coding.mrgScheme = Read_Scheme(input); if (coding.mrgScheme == NULL) goto error; coding.subScheme = Read_Scheme(input); if (coding.subScheme == NULL) goto error; if (coding.subChar >= 0) { coding.sRunScheme = Read_Scheme(input); if (coding.sRunScheme == NULL) goto error; } return (&coding); error: if (coding.delScheme != NULL) free(coding.delScheme); if (coding.dRunScheme != NULL) free(coding.dRunScheme); if (coding.insScheme != NULL) free(coding.insScheme); if (coding.mrgScheme != NULL) free(coding.mrgScheme); if (coding.subScheme != NULL) free(coding.subScheme); if (coding.sRunScheme != NULL) free(coding.sRunScheme); EXIT(NULL); } // Free all the auxilliary storage associated with the encoding argument void Free_QVcoding(QVcoding *coding) { if (coding->subChar >= 0) 
free(coding->sRunScheme); free(coding->subScheme); free(coding->mrgScheme); free(coding->insScheme); if (coding->delChar >= 0) free(coding->dRunScheme); free(coding->delScheme); free(coding->prefix); } /******************************************************************************************* * * Encode/Decode (w.r.t. coding) next entry from input and write to output * ********************************************************************************************/ int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) { int rlen, clen; // Get all 5 streams, compress each with its scheme, and output rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT (1); } if (coding->delChar < 0) { Encode(coding->delScheme, output, (uint8 *) Read, rlen); clen = rlen; } else { Encode_Run(coding->delScheme, coding->dRunScheme, output, (uint8 *) Read, rlen, coding->delChar); clen = Pack_Tag(Read+Rmax,Read,rlen,coding->delChar); } Number_Read(Read+Rmax); Compress_Read(clen,Read+Rmax); fwrite(Read+Rmax,1,COMPRESSED_LEN(clen),output); if (lossy) { uint8 *insert = (uint8 *) (Read+2*Rmax); uint8 *merge = (uint8 *) (Read+3*Rmax); int k; for (k = 0; k < rlen; k++) { insert[k] = (uint8) ((insert[k] >> 1) << 1); merge[k] = (uint8) (( merge[k] >> 2) << 2); } } Encode(coding->insScheme, output, (uint8 *) (Read+2*Rmax), rlen); Encode(coding->mrgScheme, output, (uint8 *) (Read+3*Rmax), rlen); if (coding->subChar < 0) Encode(coding->subScheme, output, (uint8 *) (Read+4*Rmax), rlen); else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) (Read+4*Rmax), rlen, coding->subChar); return (0); } int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) { int clen, tlen; // Decode each stream and write to output if (coding->delChar < 0) { if (Decode(coding->delScheme, input, entry[0], rlen)) EXIT(1); clen = rlen; tlen = COMPRESSED_LEN(clen); if (tlen > 0) 
{ if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); } else { if (Decode_Run(coding->delScheme, coding->dRunScheme, input, entry[0], rlen, coding->delChar)) EXIT(1); clen = Packed_Length(entry[0],rlen,coding->delChar); tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar); } if (Decode(coding->insScheme, input, entry[2], rlen)) EXIT(1); if (Decode(coding->mrgScheme, input, entry[3], rlen)) EXIT(1); if (coding->subChar < 0) { if (Decode(coding->subScheme, input, entry[4], rlen)) EXIT(1); } else { if (Decode_Run(coding->subScheme, coding->sRunScheme, input, entry[4], rlen, coding->subChar)) EXIT(1); } return (0); } DAZZ_DB-1.0/QV.h000066400000000000000000000165041253752464600132120ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. 
* * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #ifndef _QV_COMPRESSOR #define _QV_COMPRESSOR // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or // batch version of the routines in this library are compiled. In batch mode, routines // print an error message and exit. 
//    In interactive mode, the routines place the error
//    message in EPLACE (also defined in DB.h) and return an error value, typically NULL
//    if the routine returns a pointer, and an unusual integer value if the routine returns
//    an integer.
//  Below when an error return is described, one should understand that this value is returned
//    only if the routine was compiled in INTERACTIVE mode.

//  A PacBio compression scheme.  A run char of -1 means the stream is not run-encoded
//    (QV.c tests delChar >= 0 and subChar >= 0 before using the run schemes).

typedef struct
  { void    *delScheme;   //  Huffman scheme for deletion QVs
    void    *insScheme;   //  Huffman scheme for insertion QVs
    void    *mrgScheme;   //  Huffman scheme for merge QVs
    void    *subScheme;   //  Huffman scheme for substitution QVs
    void    *dRunScheme;  //  Huffman scheme for deletion run lengths (if delChar >= 0)
    void    *sRunScheme;  //  Huffman scheme for substitution run lengths (if subChar >= 0)
    int      delChar;     //  If >= 0, run-encoded deletion value
    int      subChar;     //  If >= 0, run-encoded substitution value
    int      flip;        //  Need to flip multi-byte integers
    char    *prefix;      //  Header line prefix
  } QVcoding;

//  Read the next nlines of input, and QVentry returns a pointer to the first line if needed.
//    If end-of-input is encountered before any further input, -1 is returned.  If there is
//    an error then -2 is returned.  Otherwise the length of the line(s) read, not counting
//    the newline, is returned.

int       Read_Lines(FILE *input, int nlines);
char     *QVentry();

//  Read the .quiva file on input and record frequency statistics.  The statistics are
//    reset at the start of every call.  If there is an error then 1 is returned,
//    otherwise 0.

int       QVcoding_Scan(FILE *input);

//  Given QVcoding_Scan has been called, create an encoding scheme based on the
//    statistics of the most recent scan and return a pointer to it.  The returned encoding
//    object is *statically* allocated within the routine.  If lossy is set then use a lossy
//    scaling for the insertion and merge streams.  If there is an error, then NULL is
//    returned.

QVcoding *Create_QVcoding(int lossy);

//  Read/write a coding scheme to input/output.
The encoding object returned by the reader // is *statically* allocated within the routine. If an error occurs while reading then // NULL is returned. QVcoding *Read_QVcoding(FILE *input); void Write_QVcoding(FILE *output, QVcoding *coding); // Free all the auxiliary storage associated with coding (but not the object itself!) void Free_QVcoding(QVcoding *coding); // Assuming the file pointer is positioned just beyond an entry header line, read the // next set of 5 QV lines, compress them according to 'coding', and output. If lossy // is set then the scheme is a lossy one. A non-zero value is return only if an // error occured. int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); // Assuming the input is position just beyond the compressed encoding of an entry header, // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them, // and place their decompressed values into entry which is a 5 element array of character // pointers. The parameter rlen computed from the preceeding header line, critically // provides the length of each of the 5 vectors. A non-zero value is return only if an // error occured. int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen); #endif // _QV_COMPRESSOR DAZZ_DB-1.0/README000066400000000000000000000641601253752464600133740ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. 
* * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /************************************************************************************\ UPGRADE & DEVELOPER NOTES ! ! ! If you have already built a big database and don't want to rebuild it, but do want to use a more recent version of the software that entails a change to the data structures (currently the updates on Sept 25, 2014 and December 31, 2014), please note the routines DBupgrade.Sep.25.2014 and DBupgrade.Dec.31.2014. These take a DB, say X, as an argument, and produce a file .X.ndx which you should then replace .X.idx with. To update a very old DB to today's version you will need to run both in sequence. 
Both of the upgrade programs can be made with "make" but are not by default created when make is called without an argument. For those interested in the details, on September 25, the "beg" and "end" fields went from shorts to ints, and on December 31, the "beg" and "end" fields became "fpulse" and "rlen", respectively, where fpulse = beg and rlen = end-beg. Unfortunately, the .dust track formats also changed on Dec.31.2014 and Jan.1.2015. To upgrade such tracks, use DUSTupgrade.Jan.1.2015. This program takes a DB, say X, as an argument and produces .X.next.anno and .X.next.data which you should then replace .X.dust.* with. Of course, if the DB is not too big, it may be easier simply to rerun DBdust. Developers should also note carefully that the calling conventions to Open_DB have changed and there are new utility routines Number_Digits and Check_Track. \************************************************************************************/ The Dazzler Database Library Author: Gene Myers First: July 17, 2013 Current: December 31, 2014 To facilitate the multiple phases of the dazzler assembler, we organize all the read data into what is effectively a "database" of the reads and their meta-information. The design goals for this data base are as follows: (1) The database stores the source Pacbio read information in such a way that it can recreate the original input data, thus permitting a user to remove the (effectively redundant) source files. This avoids duplicating the same data, once in the source file and once in the database. (2) The data base can be built up incrementally, that is new sequence data can be added to the data base over time. (3) The data base flexibly allows one to store any meta-data desired for reads. This is accomplished with the concept of *tracks* that implementors can add as they need them. (4) The data is held in a compressed form equivalent to the .dexta and .dexqv files of the data extraction module.
Both the .fasta and .quiva information for each read is held in the data base and can be recreated from it. The .quiva information can be added separately and later on if desired. (5) To facilitate job-parallel, cluster operation of the phases of our assembler, the data base has a concept of a *current partitioning* in which all the reads that are over a given length and optionally unique to a well, are divided up into *blocks* containing roughly a given number of bases, except possibly the last block which may have a short count. Often programs can be run on blocks or pairs of blocks and each such job is reasonably well balanced as the blocks are all the same size. One must be careful about changing the partition during an assembly as doing so can void the structural validity of any interim block-based results. A Dazzler DB consists of one named, *visible* file, e.g. FOO.db, and several *invisible* secondary files encoding various elements of the DB. The secondary files are "invisible" to the UNIX OS in the sense that they begin with a "." and hence are not listed by "ls" unless one specifies the -a flag. We chose to do this so that when a user lists the contents of a directory they just see a single name, e.g. FOO.db, that is the one used to refer to the DB in commands. The files associated with a database named, say FOO, are as follows: (a) "FOO.db": a text file containing (i) the list of input files added to the database so far, and (ii) how to partition the database into blocks (if the partition parameters have been set). (b) ".FOO.idx": a binary "index" of all the meta-data about each read allowing, for example, one to randomly access a read's sequence (in the store ".FOO.bps"). It is 28N + 88 bytes in size where N is the number of reads in the database. (c) ".FOO.bps": a binary compressed "store" of all the DNA sequences. It is M/4 bytes in size where M is the total number of base pairs in the database.
(d) ".FOO.qvs": a binary compressed "store" of the 5 Pacbio quality value streams for the reads. Its size is roughly 5/3M bytes depending on the compression achieved. This file only exists if .quiva files have been added to the database. (e) ".FOO.<track>.anno" and ".FOO.<track>.data": a *track* containing customized meta-data for each read. For example, the DBdust command annotates low complexity intervals of reads and records the intervals for each read in two files .FOO.dust.anno & .FOO.dust.data. Any kind of information about a read can be recorded, such as micro-sats, repeat intervals, corrected sequence, etc. Specific tracks will be described as modules that produce them are released. If one does not like the convention of the secondary files being invisible, then un-defining the constant HIDE_FILES in DB.h before compiling the library, creates commands that do not place a prefixing "." before secondary file names, e.g. FOO.idx instead of .FOO.idx. One then sees all the files realizing a DB when listing the contents of a directory with ls. While a Dazzler DB holds a collection of Pacbio reads, a Dazzler map DB or DAM holds a collection of contigs from a reference genome assembly. This special type of DB has been introduced in order to facilitate the mapping of reads to an assembly and has been given the suffix .dam to distinguish it from an ordinary DB. It is structurally identical to a .db except: (a) there is no concept of quality values, and hence no .FOO.qvs file. (b) every .fasta scaffold (a sequence with runs of N's between contigs estimating the length of the gap) is broken into a separate contig sequence in the DB and the header for each scaffold is retained in a new .FOO.hdr file. (c) the original and first and last pulse fields in the meta-data records held in .FOO.idx, hold instead the contig number and the interval of the contig within its original scaffold sequence. A map DB can equally well be the argument of many of the commands below that operate on normal DBs.
In general, a .dam can be an argument anywhere a .db can, with the exception of routines or optioned calls to routines that involve quality values, or the special routines fasta2DAM and DAM2fasta that create a DAM and reverse said, just like the pair fasta2DB and DB2fasta do for a normal DB. So in general when we refer to a database we are referring to either a DB or a DAM. The command DBsplit sets or resets the current partition for a database which is determined by 3 parameters: (i) the total number of basepairs to place in each block, (ii) the minimum read length of reads to include within a block, and (iii) whether or not to only include the longest read from a given well or all reads from a well (NB: several reads of the same insert in a given well can be produced by the Pacbio instrument). Note that the length and uniqueness parameters effectively select a subset of the reads that contribute to the size of a block. We call this subset the *trimmed* data base. Some commands operate on the entire database, others on the trimmed database, and yet others have an option flag that permits them to operate on either at the users discretion. Therefore, one should note carefully to which version of the database a command refers to. This is especially important for any command that identifies reads by their index (ordinal position) in the database. Once the database has been split into blocks, the commands DBshow, DBstats, and DBdust below and commands yet to come, such as the local alignment finder dalign, can take a block or blocks as arguments. On the command line this is indicated by supplying the name of the DB followed by a period and then a block number, e.g. FOO.3.db or simply FOO.3, refers to the 3'rd block of DB FOO (assuming of course it has a current partition and said partition has a 3rd block). One should note carefully that a block is a contiguous range of reads such that once it is trimmed has a given size in base pairs (as set by DBsplit). 
Thus like an entire database, a block can be either untrimmed or trimmed and one needs to again be careful when giving a read index to a command such as DBshow. All programs add suffixes (e.g. .db) as needed. The commands of the database library are currently as follows: 1. fasta2DB [-v] ( -f | ... ) Builds an initial data base, or adds to an existing database, the list of .fasta files following the database name argument, or if the -f option is used, the list of .fasta files in . A given .fasta file can only be added once to the DB (this is checked by the command). The .fasta headers must be in the "Pacbio" format (i.e. the output of the Pacbio tools or our dextract program) and the well, pulse interval, and read quality are extracted from the header and kept with each read record. If the files are being added to an existing database, and the partition settings of the DB have already been set (see DBsplit below), then the partitioning of the database is updated to include the new data. 2. DB2fasta [-vU] [-w] The set of .fasta files for the given DB are recreated from the DB exactly as they were input. That is, this is a perfect inversion, including the reconstitution of the proper .fasta headers. Because of this property, one can, if desired, delete the .fasta source files once they are in the DB as they can always be recreated from it. By default the output sequences are in lower case and 80 chars per line. The -U option specifies upper case should be used, and the characters per line, or line width, can be set to any positive value with the -w option. 3. quiva2DB [-vl] ( -f | ... ) Adds the given .quiva files on the command line or in the file specified by the -f option to an existing DB "path". The input files must be added in the same order as the .fasta files were and have the same root names, e.g. FOO.fasta and FOO.quiva. The files can be added incrementally but must be added in the same order as the .fasta files. This is enforced by the program. 
With the -l option set the compression scheme is a bit lossy to get more compression (see the description of dexqv in the DEXTRACTOR module). 4. DB2quiva [-vU] The set of .quiva files within the given DB are recreated from the DB exactly as they were input. That is, this is a perfect inversion, including the reconstitution of the proper .quiva headers. Because of this property, one can, if desired, delete the .quiva source files once they are in the DB as they can always be recreated from it. By .fastq convention each QV vector is output as a line without new-lines, and by default the Deletion Tag entry is in lower case letters. The -U option specifies upper case letters should be used instead. 5. fasta2DAM [-v] ( -f | ... ) Builds a map DB or DAM from the list of .fasta files following the map database name argument, or if the -f option is used, the list of .fasta files in . Any .fasta entry that has a run of N's in it will be split into separate "contig" entries and the interval of the contig in the original entry recorded. The header for each .fasta entry is saved with the contigs created from it. 6. DAM2fasta [-vU] [-w] The set of .fasta files for the given map DB or DAM are recreated from the DAM exactly as they were input. That is, this is a perfect inversion, including the reconstitution of the proper .fasta headers and the concatenation of contigs with the proper number of N's between them. By default the output sequences are in lower case and 80 chars per line. The -U option specifies upper case should be used, and the characters per line, or line width, can be set to any positive value with the -w option. 7. DBsplit [-a] [-x] [-s] Divide the database .db or .dam conceptually into a series of blocks referable to on the command line as .1, .2, ... If the -x option is set then all reads less than the given length are ignored, and if the -a option is not set then secondary reads from a given well are also ignored. 
The remaining reads, constituting what we call the trimmed DB, are split amongst the blocks so that each block is of size -s * 1Mbp except for the last which necessarily contains a smaller residual. The default value for -s is 200Mbp because blocks of this size can be compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks are very space efficient in that their sub-index of the master .idx is computed on the fly when loaded, and the .bps and .qvs files (if a .db) of base pairs and quality values, respectively, are shared with the master DB. Any relevant portions of tracks associated with the DB are also computed on the fly when loading a database block. 8. DBdust [-b] [-w] [-t] [-m] Runs the symmetric DUST algorithm over the reads in the untrimmed DB .db or .dam producing a track .<path>.dust[.anno,.data] that marks all intervals of low complexity sequence, where the scan window is of size -w, the threshold for being a low-complexity interval is -t, and only perfect intervals of size greater than -m are recorded. If the -b option is set then the definition of low complexity takes into account the frequency of a given base. The command is incremental: if given a DB to which new data has been added since it was last run on the DB, then it will extend the track to include the new reads. It is important to set the -b flag for genomes with a strong AT/GC bias, albeit the code is a tad slower. The dust track, if present, is understood and used by DBshow, DBstats, and dalign. DBdust can also be run over an untrimmed DB block in which case it outputs a track encoding where the track file names contain the block number, e.g. .FOO.3.dust.anno and .FOO.3.dust.data, given FOO.3 on the command line. We call this a *block track*. This permits job parallelism in block-sized chunks, and the resulting sequence of block tracks can then be merged into a track for the entire untrimmed DB with Catrack. 9. Catrack [-v] Find all block tracks of the form .<path>.#.<track>...
and merge them into a single track, .<path>.<track>..., for the given DB or DAM. The block track files must all encode the same kind of track data (this is checked), and the files must exist for block 1, 2, 3, ... up to the last block number. 10. DBshow [-unqUQ] [-w] [-m]+ [ | ... ] Displays the requested reads in the database .db or .dam. By default the command applies to the trimmed database, but if -u is set then the entire DB is used. If no read arguments are given then every read in the database or database block is displayed. Otherwise the input file or the list of supplied integer ranges give the ordinal positions in the actively loaded portion of the db. In the case of a file, it should simply contain a read index, one per line. In the other case, a read range is either a lone integer or the symbol $, in which case the read range consists of just that read (the last read in the database if $). One may also give two positive integers separated by a dash to indicate a range of integers, where again a $ represents the index of the last read in the actively loaded db. For example, 1 3-5 $ displays reads 1, 3, 4, 5, and the last read in the active db. As another example, 1-$ displays every read in the active db (the default). By default a .fasta file of the read sequences is displayed. If the -q option is set, then the QV streams are also displayed in a non-standard modification of the fasta format. If the -n option is set then the DNA sequence is *not* displayed. If the -Q option is set then a .quiva file is displayed and in this case the -n and -m options may not be set (and the -q and -w options have no effect). If one or more masks are set with the -m option then the track intervals are also displayed in an additional header line and the bases within an interval are displayed in the case opposite that used for all the other bases. By default the output sequences are in lower case and 80 chars per line.
The -U option specifies upper case should be used, and the characters per line, or line width, can be set to any positive value with the -w option. The .fasta or .quiva files that are output can be converted into a DB by fasta2DB and quiva2DB (if the -q and -n options are not set and no -m options are set), giving one a simple way to make a DB of a subset of the reads for testing purposes. 11. DBstats [-nu] [-b]+ Show overview statistics for all the reads in the trimmed data base .db or .dam, including a histogram of read lengths where the bucket size is set with the -b option (default 1000). If the -u option is given then the untrimmed database is summarized. If the -n option is given then the histogram of read lengths is not displayed. Any track such as a "dust" track that gives a series of intervals along the read can be specified with the -m option in which case a summary and a histogram of the interval lengths is displayed. 12. DBrm ... Delete all the files for the given data bases. Do not use rm to remove a database, as there are at least two and often several secondary files for each DB including track files, and all of these are removed by DBrm. 13. simulator [-c] [-b] [-m] [-s] [-x] [-e] [-M] In addition to the DB commands we include here, somewhat tangentially, a simple simulator that generates synthetic reads for a random genome. simulator first generates a fake genome of size genlen*1Mb long, that has an AT-bias of -b. It then generates sample reads of mean length -m from a log-normal length distribution with standard deviation -s, but ignores reads of length less than -x. It collects enough reads to cover the genome -c times and introduces -e fraction errors into each read where the ratio of insertions, deletions, and substitutions are set by defined constants INS_RATE (default 73%) and DEL_RATE (default 20%) within generate.c.
One can also control the rate at which reads are picked from the forward and reverse strands by setting the defined constant FLIP_RATE (default 50/50). The -r option seeds the random number generator for the generation of the genome so that one can reproducibly generate the same underlying genome to sample from. If this parameter is missing, then the job id of the invocation seeds the random number generator. The output is sent to the standard output (i.e. it is a UNIX pipe). The output is in Pacbio .fasta format suitable as input to fasta2DB. Finally, the -M option requests that the coordinates from which each read has been sampled are written to the indicated file, one line per read, ASCII encoded. This "map" file essentially tells one where every read belongs in an assembly and is very useful for debugging and testing purposes. If a read pair is say b,e then if b < e the read was sampled from [b,e] in the forward direction, and if b > e from [e,b] in the reverse direction. Example: A small complete example of most of the commands above. > simulator 1.0 -c20. 
>G.fasta // Generate a 20x data sets of a 1Mb genome > fasta2DB G G.fasta // Create a compressed data base of the reads, G.db > rm G.fasta // Redundant, recreate any time with "DB2fasta G" > DBsplit -s11 G // Split G into 2 parts of size ~ 11MB each > DBdust G.1 // Produce a "dust" track on each part > DBdust G.2 > Catrack G dust // Create one track for all of the DB > rm .G.*.dust.* // Clean up the sub-tracks > DBstats -mdust G // Take a look at the statistics for the database Statistics for all reads in the data set 1,836 reads out of 1,836 (100.0%) 20,007,090 base pairs out of 20,007,090 (100.0%) 10,897 average read length 2,192 standard deviation Base composition: 0.250(A) 0.250(C) 0.250(G) 0.250(T) Distribution of Read Lengths (Bin size = 1,000) Bin: Count % Reads % Bases Average 22,000: 1 0.1 0.1 22654 21,000: 0 0.1 0.1 22654 20,000: 1 0.1 0.2 21355 19,000: 0 0.1 0.2 21355 18,000: 4 0.3 0.6 19489 17,000: 8 0.8 1.3 18374 16,000: 19 1.8 2.8 17231 15,000: 43 4.1 6.2 16253 14,000: 81 8.6 12.0 15341 13,000: 146 16.5 21.9 14428 12,000: 200 27.4 34.4 13664 11,000: 315 44.6 52.4 12824 10,000: 357 64.0 71.2 12126 9,000: 306 80.7 85.8 11586 8,000: 211 92.2 94.8 11208 7,000: 95 97.3 98.4 11017 6,000: 43 99.7 99.8 10914 5,000: 6 100.0 100.0 10897 Statistics for dust-track There are 158 intervals totaling 1,820 bases (0.0% of all data) Distribution of dust intervals (Bin size = 1,000) Bin: Count % Intervals % Bases Average 0: 158 100.0 100.0 11 > ls -al total 66518744 drwxr-xr-x+ 177 myersg staff 6018 Mar 2 13:28 . drwxr-xr-x+ 20 myersg staff 680 Feb 26 19:52 .. 
-rw-r--r--+ 1 myersg staff 5002464 Mar 2 13:28 .G.bps -rw-r--r--+ 1 myersg staff 14704 Mar 2 13:28 .G.dust.anno -rw-r--r--+ 1 myersg staff 1264 Mar 2 13:28 .G.dust.data -rw-r--r--+ 1 myersg staff 73552 Mar 2 13:28 .G.idx -rw-r--r--+ 1 myersg staff 162 Mar 2 13:28 G.db > cat G.db files = 1 1836 G Sim blocks = 2 size = 11 cutoff = 0 all = 0 0 0 1011 1011 1836 1836 DAZZ_DB-1.0/fasta2DAM.c000066400000000000000000000351571253752464600143660ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Add .fasta files to a DB: * Adds the given fasta files in the given order to <path>.db. If the db does not exist * then it is created. All .fasta files added to a given data base must have the same * header format and follow Pacbio's convention. A file cannot be added twice and this * is enforced. The command either builds or appends to the .<path>.idx and .<path>.bps * files, where the index file (.idx) contains information about each read and their offsets * in the base-pair file (.bps) that holds the sequences where each base is compressed * into 2-bits. The two files are hidden by virtue of their names beginning with a '.'. * <path>.db is effectively a stub file with given name that contains an ASCII listing * of the files added to the DB and possibly the block partitioning for the DB if DBsplit * has been called upon it. * * Author: Gene Myers * Date : May 2013 * Modify: DB upgrade: now *add to* or create a DB depending on whether it exists, read * multiple .fasta files (no longer a stdin pipe).
* Date : April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-v] ( -f | ... )"; static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; typedef struct { int argc; char **argv; FILE *input; int count; char *name; } File_Iterator; File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->argc = argc; it->argv = argv; it->input = input; if (input == NULL) it->count = first; else { it->count = 1; rewind(input); } return (it); } int next_file(File_Iterator *it) { static char nbuffer[MAX_NAME+8]; if (it->input == NULL) { if (it->count >= it->argc) return (0); it->name = it->argv[it->count++]; } else { char *eol; if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL) { if (feof(it->input)) return (0); SYSTEM_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n", Prog_Name,it->count,MAX_NAME+7); it->name = NULL; } *eol = '\0'; it->count += 1; it->name = nbuffer; } return (1); } int main(int argc, char *argv[]) { FILE *ostub; char *dbname; char *root, *pwd; FILE *bases, *indx, *hdrs; int64 boff, hoff; int ifiles, ofiles; char **flist; HITS_DB db; int ureads; int VERBOSE; FILE *IFILE; // Process command line { int i, j, k; int flags[128]; ARG_INIT("fasta2DAM") IFILE = NULL; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch 
(argv[i][1]) { default: ARG_FLAGS("v") break; case 'f': IFILE = fopen(argv[i]+2,"r"); if (IFILE == NULL) { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2); exit (1); } break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2)) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Try to open DB file, if present then adding to DB, otherwise creating new DB. Set up // variables as follows: // dbname = full name of map index = /.dam // ostub = new image of db file (will overwrite old image at end) // bases = .bps file positioned for appending // indx = .idx file positioned for appending // ureads = # of reads currently in db // boff = offset in .bps at which to place next sequence // hoff = offset in .hdr at which to place next header prefix // ifiles = # of .fasta files to add // ofiles = # of .fasta files added so far // flist = [0..ifiles] list of file names (root only) added to db so far root = Root(argv[1],".dam"); pwd = PathTo(argv[1]); dbname = Strdup(Catenate(pwd,"/",root,".dam"),"Allocating map index name"); if (dbname == NULL) exit (1); if (IFILE == NULL) ifiles = argc-2; else { File_Iterator *ng; ifiles = 0; ng = init_file_iterator(argc,argv,IFILE,2); while (next_file(ng)) ifiles += 1; free(ng); } ofiles = 0; bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w"); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w"); hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"w"); if (bases == NULL || indx == NULL || hdrs == NULL) exit (1); flist = (char **) Malloc(sizeof(char *)*ifiles,"Allocating file list"); fwrite(&db,sizeof(HITS_DB),1,indx); ureads = 0; boff = 0; hoff = 0; ostub = Fopen(dbname,"w+"); if (ostub == NULL) exit (1); fprintf(ostub,DB_NFILE,argc-2); { int maxlen; int64 totlen, count[4]; int rmax; HITS_READ prec; char *read; int c; File_Iterator *ng; // Buffer for accumulating .fasta sequence over multiple lines rmax = MAX_NAME + 60000; read 
= (char *) Malloc(rmax+1,"Allocating line buffer"); if (read == NULL) goto error; totlen = 0; // total # of bases in new .fasta files maxlen = 0; // longest read in new .fasta files for (c = 0; c < 4; c++) // count of acgt in new .fasta files count[c] = 0; // For each .fasta file do: ng = init_file_iterator(argc,argv,IFILE,2); while (next_file(ng)) { FILE *input; char *path, *core; int nline, eof, rlen; if (ng->name == NULL) goto error; // Open it: /.fasta, check that core is not too long, // and checking that it is not already in flist. path = PathTo(ng->name); core = Root(ng->name,".fasta"); if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL) goto error; free(path); { int j; for (j = 0; j < ofiles; j++) if (strcmp(core,flist[j]) == 0) { fprintf(stderr,"%s: File %s.fasta is already in database %s.db\n", Prog_Name,core,Root(argv[1],".db")); goto error; } } // Get the header of the first line. If the file is empty skip. rlen = 0; nline = 1; eof = (fgets(read,MAX_NAME,input) == NULL); if (eof || strlen(read) < 1) { fprintf(stderr,"Skipping '%s', file is empty!\n",core); fclose(input); free(core); continue; } // Add the file name to flist if (VERBOSE) { fprintf(stderr,"Adding '%s' ...\n",core); fflush(stderr); } flist[ofiles++] = core; // Check that the first line has PACBIO format, and record prolog in 'prolog'. 
if (read[strlen(read)-1] != '\n') { fprintf(stderr,"File %s.fasta, Line 1: Fasta line is too long (> %d chars)\n", core,MAX_NAME-2); goto error; } if (!eof && read[0] != '>') { fprintf(stderr,"File %s.fasta, Line 1: First header in fasta file is missing\n",core); goto error; } // Read in all the sequences until end-of-file { int i, x, n; while (!eof) { int hlen, hline; read[rlen] = '>'; hlen = strlen(read+rlen); fwrite(read+rlen,1,hlen,hdrs); hline = nline; rlen = 0; while (1) { eof = (fgets(read+rlen,MAX_NAME,input) == NULL); nline += 1; x = strlen(read+rlen)-1; if (read[rlen+x] != '\n') { fprintf(stderr,"File %s.fasta, Line %d:",core,nline); fprintf(stderr," Fasta line is too long (> %d chars)\n",MAX_NAME-2); goto error; } if (eof || read[rlen] == '>') break; rlen += x; if (rlen + MAX_NAME > rmax) { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_NAME; read = (char *) realloc(read,rmax+1); if (read == NULL) { fprintf(stderr,"File %s.fasta, Line %d:",core,nline); fprintf(stderr," Out of memory (Allocating line buffer)\n"); goto error; } } } read[rlen] = '\0'; n = 0; i = -1; while (i < rlen) { int pbeg, plen, clen; while (i < rlen) if (number[(int) read[++i]] < 4) break; if (i >= rlen) break; pbeg = i; prec.fpulse = pbeg; prec.origin = n++; prec.boff = boff; prec.coff = hoff; prec.flags = DB_BEST; while (i < rlen) { x = number[(int) read[i]]; if (x >= 4) break; count[x] += 1; read[i++] = (char) x; } prec.rlen = plen = i-pbeg; ureads += 1; totlen += plen; if (plen > maxlen) maxlen = plen; Compress_Read(plen,read+pbeg); clen = COMPRESSED_LEN(plen); fwrite(read+pbeg,1,clen,bases); boff += clen; fwrite(&prec,sizeof(HITS_READ),1,indx); } hoff += hlen; } fprintf(ostub,DB_FDATA,ureads,core,core); fclose(input); } } // Update relevant fields in db record db.ureads = ureads; db.treads = ureads; for (c = 0; c < 4; c++) db.freq[c] = (float) ((1.*count[c])/totlen); db.totlen = totlen; db.maxlen = maxlen; db.cutoff = -1; } rewind(indx); fwrite(&db,sizeof(HITS_DB),1,indx); // Write the 
finalized db record into .idx fclose(ostub); fclose(indx); fclose(bases); fclose(hdrs); exit (0); // Error exit: Remove the .idx, .bps, and .dam files error: fclose(ostub); fclose(indx); fclose(hdrs); fclose(bases); unlink(Catenate(pwd,PATHSEP,root,".idx")); unlink(Catenate(pwd,PATHSEP,root,".bps")); unlink(Catenate(pwd,PATHSEP,root,".hdr")); unlink(Catenate(pwd,"/",root,".dam")); exit (1); } DAZZ_DB-1.0/fasta2DB.c000066400000000000000000000535171253752464600142520ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Add .fasta files to a DB: * Adds the given fasta files in the given order to <path>.db. If the db does not exist * then it is created. All .fasta files added to a given data base must have the same * header format and follow Pacbio's convention. A file cannot be added twice and this * is enforced. The command either builds or appends to the .<path>.idx and .<path>.bps * files, where the index file (.idx) contains information about each read and their offsets * in the base-pair file (.bps) that holds the sequences where each base is compressed * into 2-bits. The two files are hidden by virtue of their names beginning with a '.'. * <path>.db is effectively a stub file with given name that contains an ASCII listing * of the files added to the DB and possibly the block partitioning for the DB if DBsplit * has been called upon it. * * Author: Gene Myers * Date : May 2013 * Modify: DB upgrade: now *add to* or create a DB depending on whether it exists, read * multiple .fasta files (no longer a stdin pipe).
* Date : April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-v] ( -f | ... )"; static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; typedef struct { int argc; char **argv; FILE *input; int count; char *name; } File_Iterator; File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->argc = argc; it->argv = argv; it->input = input; if (input == NULL) it->count = first; else { it->count = 1; rewind(input); } return (it); } int next_file(File_Iterator *it) { static char nbuffer[MAX_NAME+8]; if (it->input == NULL) { if (it->count >= it->argc) return (0); it->name = it->argv[it->count++]; } else { char *eol; if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL) { if (feof(it->input)) return (0); SYSTEM_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n", Prog_Name,it->count,MAX_NAME+7); it->name = NULL; } *eol = '\0'; it->count += 1; it->name = nbuffer; } return (1); } int main(int argc, char *argv[]) { FILE *istub, *ostub; char *dbname; char *root, *pwd; FILE *bases, *indx; int64 boff, ioff; int ifiles, ofiles; char **flist; HITS_DB db; int ureads; int64 offset; FILE *IFILE; int VERBOSE; // Usage: [-v] ( -f | ... 
) { int i, j, k; int flags[128]; ARG_INIT("fasta2DB") IFILE = NULL; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("v") break; case 'f': IFILE = fopen(argv[i]+2,"r"); if (IFILE == NULL) { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2); exit (1); } break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2)) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Try to open DB file, if present then adding to DB, otherwise creating new DB. Set up // variables as follows: // dbname = full name of db = /.db // istub = open db file (if adding) or NULL (if creating) // ostub = new image of db file (will overwrite old image at end) // bases = .bps file positioned for appending // indx = .idx file positioned for appending // ureads = # of reads currently in db // offset = offset in .bps at which to place next sequence // ioff = offset in .idx file to truncate to if command fails // boff = offset in .bps file to truncate to if command fails // ifiles = # of .fasta files to add // ofiles = # of .fasta files already in db // flist = [0..ifiles+ofiles] list of file names (root only) added to db so far { int i; root = Root(argv[1],".db"); pwd = PathTo(argv[1]); dbname = Strdup(Catenate(pwd,"/",root,".db"),"Allocating db name"); if (dbname == NULL) exit (1); if (IFILE == NULL) ifiles = argc-2; else { File_Iterator *ng; ifiles = 0; ng = init_file_iterator(argc,argv,IFILE,2); while (next_file(ng)) ifiles += 1; free(ng); } istub = fopen(dbname,"r"); if (istub == NULL) { ofiles = 0; bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w+"); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w+"); if (bases == NULL || indx == NULL) exit (1); fwrite(&db,sizeof(HITS_DB),1,indx); ureads = 0; offset = 0; boff = 0; ioff = 0; } else { if (fscanf(istub,DB_NFILE,&ofiles) != 1) SYSTEM_ERROR bases = 
Fopen(Catenate(pwd,PATHSEP,root,".bps"),"r+"); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+"); if (bases == NULL || indx == NULL) exit (1); if (fread(&db,sizeof(HITS_DB),1,indx) != 1) SYSTEM_ERROR fseeko(bases,0,SEEK_END); fseeko(indx, 0,SEEK_END); ureads = db.ureads; offset = ftello(bases); boff = offset; ioff = ftello(indx); } flist = (char **) Malloc(sizeof(char *)*(ofiles+ifiles),"Allocating file list"); ostub = Fopen(Catenate(pwd,"/",root,".dbx"),"w+"); if (ostub == NULL || flist == NULL) exit (1); fprintf(ostub,DB_NFILE,ofiles+ifiles); for (i = 0; i < ofiles; i++) { int last; char prolog[MAX_NAME], fname[MAX_NAME]; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) SYSTEM_ERROR if ((flist[i] = Strdup(fname,"Adding to file list")) == NULL) goto error; fprintf(ostub,DB_FDATA,last,fname,prolog); } } { int maxlen; int64 totlen, count[4]; int pmax, rmax; HITS_READ *prec; char *read; int c; File_Iterator *ng; // Buffer for reads all in the same well pmax = 100; prec = (HITS_READ *) Malloc(sizeof(HITS_READ)*pmax,"Allocating record buffer"); if (prec == NULL) goto error; // Buffer for accumulating .fasta sequence over multiple lines rmax = MAX_NAME + 60000; read = (char *) Malloc(rmax+1,"Allocating line buffer"); if (read == NULL) goto error; totlen = 0; // total # of bases in new .fasta files maxlen = 0; // longest read in new .fasta files for (c = 0; c < 4; c++) // count of acgt in new .fasta files count[c] = 0; // For each new .fasta file do: ng = init_file_iterator(argc,argv,IFILE,2); while (next_file(ng)) { FILE *input; char *path, *core, *prolog; int nline, eof, rlen, pcnt; int pwell; if (ng->name == NULL) goto error; // Open it: /.fasta, check that core is not too long, // and checking that it is not already in flist. 
path = PathTo(ng->name); core = Root(ng->name,".fasta"); if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL) goto error; free(path); if (strlen(core) >= MAX_NAME) { fprintf(stderr,"%s: File name over %d chars: '%.200s'\n", Prog_Name,MAX_NAME,core); goto error; } { int j; for (j = 0; j < ofiles; j++) if (strcmp(core,flist[j]) == 0) { fprintf(stderr,"%s: File %s.fasta is already in database %s.db\n", Prog_Name,core,Root(argv[1],".db")); goto error; } } // Get the header of the first line. If the file is empty skip. pcnt = 0; rlen = 0; nline = 1; eof = (fgets(read,MAX_NAME,input) == NULL); if (eof || strlen(read) < 1) { fprintf(stderr,"Skipping '%s', file is empty!\n",core); fclose(input); free(core); continue; } // Add the file name to flist if (VERBOSE) { fprintf(stderr,"Adding '%s' ...\n",core); fflush(stderr); } flist[ofiles++] = core; // Check that the first line has PACBIO format, and record prolog in 'prolog'. if (read[strlen(read)-1] != '\n') { fprintf(stderr,"File %s.fasta, Line 1: Fasta line is too long (> %d chars)\n", core,MAX_NAME-2); goto error; } if (!eof && read[0] != '>') { fprintf(stderr,"File %s.fasta, Line 1: First header in fasta file is missing\n",core); goto error; } { char *find; int well, beg, end, qv; find = index(read+1,'/'); if (find != NULL && sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) >= 3) { *find = '\0'; prolog = Strdup(read+1,"Extracting prolog"); *find = '/'; if (prolog == NULL) goto error; } else { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n", core,nline); goto error; } } // Read in all the sequences until end-of-file { int i, x; pwell = -1; while (!eof) { int beg, end, clen, hline; int well, qv; char *find; find = index(read+(rlen+1),'/'); if (find == NULL) { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n", core,nline); goto error; } *find = '\0'; if (strcmp(read+(rlen+1),prolog) != 0) { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line 
name inconsisten\n", core,nline); goto error; } *find = '/'; x = sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv); if (x < 3) { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n", core,nline); goto error; } else if (x == 3) qv = 0; hline = nline; rlen = 0; while (1) { eof = (fgets(read+rlen,MAX_NAME,input) == NULL); nline += 1; x = strlen(read+rlen)-1; if (read[rlen+x] != '\n') { fprintf(stderr,"File %s.fasta, Line %d:",core,nline); fprintf(stderr," Fasta line is too long (> %d chars)\n",MAX_NAME-2); goto error; } if (eof || read[rlen] == '>') break; rlen += x; if (rlen + MAX_NAME > rmax) { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_NAME; read = (char *) realloc(read,rmax+1); if (read == NULL) { fprintf(stderr,"File %s.fasta, Line %d:",core,nline); fprintf(stderr," Out of memory (Allocating line buffer)\n"); goto error; } } } read[rlen] = '\0'; for (i = 0; i < rlen; i++) { x = number[(int) read[i]]; count[x] += 1; read[i] = (char) x; } ureads += 1; totlen += rlen; if (rlen > maxlen) maxlen = rlen; prec[pcnt].origin = well; prec[pcnt].fpulse = beg; prec[pcnt].rlen = rlen; prec[pcnt].boff = offset; prec[pcnt].coff = -1; prec[pcnt].flags = qv; Compress_Read(rlen,read); clen = COMPRESSED_LEN(rlen); fwrite(read,1,clen,bases); offset += clen; if (pwell == well) { prec[pcnt].flags |= DB_CSS; pcnt += 1; if (pcnt >= pmax) { pmax = ((int) (pcnt*1.2)) + 100; prec = (HITS_READ *) realloc(prec,sizeof(HITS_READ)*pmax); if (prec == NULL) { fprintf(stderr,"File %s.fasta, Line %d: Out of memory",core,nline); fprintf(stderr," (Allocating read records)\n"); goto error; } } } else if (pcnt == 0) pcnt += 1; else { x = 0; for (i = 1; i < pcnt; i++) if (prec[i].rlen > prec[x].rlen) x = i; prec[x].flags |= DB_BEST; fwrite(prec,sizeof(HITS_READ),pcnt,indx); prec[0] = prec[pcnt]; pcnt = 1; } pwell = well; } // Complete processing of .fasta file: flush last well group, write file line // in db image, free prolog, and close file x = 0; for (i = 1; i < pcnt; i++) 
if (prec[i].rlen > prec[x].rlen) x = i; prec[x].flags |= DB_BEST; fwrite(prec,sizeof(HITS_READ),pcnt,indx); fprintf(ostub,DB_FDATA,ureads,core,prolog); } free(prolog); fclose(input); } // Finished loading all sequences: update relevant fields in db record db.ureads = ureads; if (istub == NULL) { for (c = 0; c < 4; c++) db.freq[c] = (float) ((1.*count[c])/totlen); db.totlen = totlen; db.maxlen = maxlen; db.cutoff = -1; } else { for (c = 0; c < 4; c++) db.freq[c] = (float) ((db.freq[c]*db.totlen + (1.*count[c]))/(db.totlen + totlen)); db.totlen += totlen; if (maxlen > db.maxlen) db.maxlen = maxlen; } } // If db has been previously partitioned then calculate additional partition points and // write to new db file image if (db.cutoff >= 0) { int64 totlen, dbpos, size; int nblock, ireads, tfirst, rlen; int ufirst, cutoff, allflag; HITS_READ record; int i; if (VERBOSE) { fprintf(stderr,"Updating block partition ...\n"); fflush(stderr); } // Read the block portion of the existing db image getting the indices of the first // read in the last block of the exisiting db as well as the partition parameters. // Copy the old image block information to the new block information (except for // the indices of the last partial block) if (fscanf(istub,DB_NBLOCK,&nblock) != 1) SYSTEM_ERROR dbpos = ftello(ostub); fprintf(ostub,DB_NBLOCK,0); if (fscanf(istub,DB_PARAMS,&size,&cutoff,&allflag) != 3) SYSTEM_ERROR fprintf(ostub,DB_PARAMS,size,cutoff,allflag); if (allflag) allflag = 0; else allflag = DB_BEST; size *= 1000000ll; nblock -= 1; for (i = 0; i <= nblock; i++) { if (fscanf(istub,DB_BDATA,&ufirst,&tfirst) != 2) SYSTEM_ERROR fprintf(ostub,DB_BDATA,ufirst,tfirst); } // Seek the first record of the last block of the existing db in .idx, and then // compute and record partition indices for the rest of the db from this point // forward. 
fseeko(indx,sizeof(HITS_DB)+sizeof(HITS_READ)*ufirst,SEEK_SET); totlen = 0; ireads = 0; for (i = ufirst; i < ureads; i++) { if (fread(&record,sizeof(HITS_READ),1,indx) != 1) SYSTEM_ERROR rlen = record.rlen; if (rlen >= cutoff && (record.flags & DB_BEST) >= allflag) { ireads += 1; tfirst += 1; totlen += rlen; if (totlen >= size) { fprintf(ostub," %9d %9d\n",i+1,tfirst); totlen = 0; ireads = 0; nblock += 1; } } } if (ireads > 0) { fprintf(ostub,DB_BDATA,ureads,tfirst); nblock += 1; } db.treads = tfirst; fseeko(ostub,dbpos,SEEK_SET); fprintf(ostub,DB_NBLOCK,nblock); // Rewind and record the new number of blocks } else db.treads = ureads; rewind(indx); fwrite(&db,sizeof(HITS_DB),1,indx); // Write the finalized db record into .idx rewind(ostub); // Rewrite the number of files actually added fprintf(ostub,DB_NFILE,ofiles); if (istub != NULL) fclose(istub); fclose(ostub); fclose(indx); fclose(bases); rename(Catenate(pwd,"/",root,".dbx"),dbname); // New image replaces old image exit (0); // Error exit: Either truncate or remove the .idx and .bps files as appropriate. // Remove the new image file /.dbx error: if (ioff != 0) { fseeko(indx,0,SEEK_SET); if (ftruncate(fileno(indx),ioff) < 0) SYSTEM_ERROR } if (boff != 0) { fseeko(bases,0,SEEK_SET); if (ftruncate(fileno(bases),boff) < 0) SYSTEM_ERROR } fclose(indx); fclose(bases); if (ioff == 0) unlink(Catenate(pwd,PATHSEP,root,".idx")); if (boff == 0) unlink(Catenate(pwd,PATHSEP,root,".bps")); if (istub != NULL) fclose(istub); fclose(ostub); unlink(Catenate(pwd,"/",root,".dbx")); exit (1); } DAZZ_DB-1.0/quiva2DB.c000066400000000000000000000303621253752464600142720ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. 
* * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Adds the given .quiva files to an existing DB "path". The input files must be added in * the same order as the .fasta files were and have the same root names, e.g. FOO.fasta * and FOO.quiva. 
The files can be added incrementally but must be added in the same order * as the .fasta files. This is enforced by the program. With the -l option set the * compression scheme is a bit lossy to get more compression (see the description of dexqv * in the DEXTRACTOR module). * * Author: Gene Myers * Date : July 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #include "QV.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-vl] ( -f | ... )"; typedef struct { int argc; char **argv; FILE *input; int count; char *name; } File_Iterator; File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->argc = argc; it->argv = argv; it->input = input; if (input == NULL) it->count = first; else { it->count = 1; rewind(input); } return (it); } int next_file(File_Iterator *it) { static char nbuffer[MAX_NAME+8]; if (it->input == NULL) { if (it->count >= it->argc) return (0); it->name = it->argv[it->count++]; } else { char *eol; if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL) { if (feof(it->input)) return (0); SYSTEM_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n", Prog_Name,it->count,MAX_NAME+7); it->name = NULL; } *eol = '\0'; it->count += 1; it->name = nbuffer; } return (1); } int main(int argc, char *argv[]) { FILE *istub, *quiva, *indx; int64 coff; int ofile; HITS_DB db; HITS_READ *reads; int VERBOSE; int LOSSY; FILE *IFILE; // Process command line { int i, j, k; int flags[128]; ARG_INIT("quiva2DB") IFILE = NULL; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vl") break; case 'f': IFILE = fopen(argv[i]+2,"r"); if (IFILE == NULL) { fprintf(stderr,"%s: Cannot open file of inputs 
'%s'\n",Prog_Name,argv[i]+2); exit (1); } break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; LOSSY = flags['l']; if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2)) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open DB stub file and index, load db and read records. Confirm that the .fasta files // corresponding to the command line .quiva files are in the DB and in order where the // index of the first file is ofile and the index of the first read to be added is ofirst. // Record in coff the current size of the .qvs file in case an error occurs and it needs // to be truncated back to its size at the start. { int i; char *pwd, *root; int nfiles; File_Iterator *ng; root = Root(argv[1],".db"); pwd = PathTo(argv[1]); istub = Fopen(Catenate(pwd,"/",root,".db"),"r"); if (istub == NULL) exit (1); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+"); if (indx == NULL) exit (1); if (fread(&db,sizeof(HITS_DB),1,indx) != 1) SYSTEM_ERROR reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*db.ureads,"Allocating DB index"); if (reads == NULL) exit (1); if (fread(reads,sizeof(HITS_READ),db.ureads,indx) != (size_t) (db.ureads)) SYSTEM_ERROR { int first, last; char prolog[MAX_NAME], fname[MAX_NAME]; char *core; ng = init_file_iterator(argc,argv,IFILE,2); if ( ! 
next_file(ng)) { fprintf(stderr,"%s: file list is empty!\n",Prog_Name); exit (1); } if (ng->name == NULL) exit (1); core = Root(ng->name,".quiva"); if (fscanf(istub,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) SYSTEM_ERROR if (strcmp(core,fname) == 0) break; first = last; } if (i >= nfiles) { fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core); exit (1); } ofile = i; if (first > 0 && reads[first-1].coff < 0) { fprintf(stderr,"%s: Predecessor of %s.quiva has not been added yet\n",Prog_Name,core); exit (1); } if (reads[first].coff >= 0) { fprintf(stderr,"%s: %s.quiva has already been added\n",Prog_Name,core); exit (1); } while (next_file(ng)) { if (ng->name == NULL) exit (1); core = Root(ng->name,".quiva"); if (++i >= nfiles) { fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core); exit (1); } if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) SYSTEM_ERROR if (strcmp(core,fname) != 0) { fprintf(stderr,"%s: Files not being added in order (expect %s, given %s)", Prog_Name,fname,core); exit (1); } } if (ofile == 0) quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"w"); else quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r+"); if (quiva == NULL) exit (1); fseeko(quiva,0,SEEK_END); coff = ftello(quiva); free(core); free(ng); } free(root); free(pwd); } // For each .quiva file, determine its compression scheme in a fast scan and append it to // the .qvs file Then compress every .quiva entry in the file, appending its compressed // form to the .qvs file as you go and recording the offset in the .qvs in the .coff field // of each read record (*except* the first, that points at the compression scheme immediately // preceding it). Ensure that the # of .quiva entries matches the # of .fasta entries // in each added file. 
{ int i; int last, cur; File_Iterator *ng; // For each .quiva file do: rewind(istub); if (fscanf(istub,"files = %*d\n") != 0) SYSTEM_ERROR last = 0; for (i = 0; i < ofile; i++) if (fscanf(istub," %9d %*s %*s\n",&last) != 1) SYSTEM_ERROR ng = init_file_iterator(argc,argv,IFILE,2); cur = last; while (next_file(ng)) { FILE *input; int64 qpos; char *pwd, *root; QVcoding *coding; // Open next .quiva file and create its compression scheme pwd = PathTo(ng->name); root = Root(ng->name,".quiva"); if ((input = Fopen(Catenate(pwd,"/",root,".quiva"),"r")) == NULL) goto error; if (VERBOSE) { fprintf(stderr,"Analyzing '%s' ...\n",root); fflush(stderr); } QVcoding_Scan(input); coding = Create_QVcoding(LOSSY); coding->prefix = Strdup(".qvs","Allocating header prefix"); qpos = ftello(quiva); Write_QVcoding(quiva,coding); // Then compress and append to the .qvs each compressed QV entry if (VERBOSE) { fprintf(stderr,"Compressing '%s' ...\n",root); fflush(stderr); } rewind(input); while (Read_Lines(input,1) > 0) { reads[cur++].coff = qpos; Compress_Next_QVentry(input,quiva,coding,LOSSY); qpos = ftello(quiva); } if (fscanf(istub," %9d %*s %*s\n",&last) != 1) SYSTEM_ERROR if (last != cur) { fprintf(stderr,"%s: Number of reads in %s.quiva doesn't match number in %s.fasta\n", Prog_Name,root,root); goto error; } Free_QVcoding(coding); free(root); free(pwd); } free(ng); } // Write the db record and read index into .idx and clean up rewind(indx); fwrite(&db,sizeof(HITS_DB),1,indx); fwrite(reads,sizeof(HITS_READ),db.ureads,indx); fclose(istub); fclose(indx); fclose(quiva); exit (0); // Error exit: Either truncate or remove the .qvs file as appropriate. 
error: if (coff != 0) { fseeko(quiva,0,SEEK_SET); if (ftruncate(fileno(quiva),coff) < 0) SYSTEM_ERROR } fclose(istub); fclose(indx); fclose(quiva); if (coff == 0) { char *root = Root(argv[1],".db"); char *pwd = PathTo(argv[1]); unlink(Catenate(pwd,PATHSEP,root,".qvs")); free(pwd); free(root); } exit (1); } DAZZ_DB-1.0/simulator.c000066400000000000000000000365621253752464600147040ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Synthetic DNA shotgun dataset simulator * Generate a fake genome of size genlen*1Mb long, that has an AT-bias of -b. Then * sample reads of mean length -m from a log-normal length distribution with * standard deviation -s, but ignore reads of length less than -x. Collect enough * reads to cover the genome -c times. Introduce -e fraction errors into each * read where the ratio of insertions, deletions, and substitutions are set by * defined constants INS_RATE and DEL_RATE within generate.c. One can also control * the rate at which reads are picked from the forward and reverse strands by setting * the defined constant FLIP_RATE. * * The -r parameter seeds the random number generator for the generation of the genome * so that one can reproducbile produce the same underlying genome to sample from. If * missing, then the job id of the invocation seeds the generator. The output is sent * to the standard output (i.e. it is a pipe). The output is in fasta format (i.e. it is * a UNIX pipe). The output is in Pacbio .fasta format suitable as input to fasta2DB. * * The -M option requests that the coordinates from which each read has been sampled are * written to the indicated file, one line per read, ASCII encoded. This "map" file * essentially tells one where every read belongs in an assembly and is very useful for * debugging and testing purposes. If a read pair is say b,e then if b < e the read was * sampled from [b,e] in the forward direction, and from [e,b] in the reverse direction * otherwise. 
* * Author: Gene Myers * Date : July 2013 * Mod : April 2014 (made independent of "mylib") * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" static char *Usage[] = { " [-c] [-b] [-r]", " [-m] [-s] [-x]", " [-e] [-M]" }; static int GENOME; // -g option * 1Mbp static double COVERAGE; // -c option static double BIAS; // -b option static int HASR = 0; // -r option is set? static int SEED; // -r option static int RMEAN; // -m option static int RSDEV; // -s option static int RSHORT; // -x option static double ERROR; // -e option static FILE *MAP; // -M option #define INS_RATE .73333 // insert rate #define DEL_RATE .20000 // deletion rate #define IDL_RATE .93333 // insert + delete rate #define FLIP_RATE .5 // orientation rate (equal) // Generate a random 4 letter string of length *len* with every letter having equal probability. static char *random_genome() { char *seq; int i; double x, PRA, PRC, PRG; PRA = BIAS/2.; PRC = (1.-BIAS)/2. + PRA; PRG = (1.-BIAS)/2. + PRC; if (HASR) srand48(SEED); else srand48(getpid()); if ((seq = (char *) Malloc(GENOME+1,"Allocating genome sequence")) == NULL) exit (1); for (i = 0; i < GENOME; i++) { x = drand48(); if (x < PRA) seq[i] = 0; else if (x < PRC) seq[i] = 1; else if (x < PRG) seq[i] = 2; else seq[i] = 3; } seq[GENOME] = 4; return (seq); } // Complement (in the DNA sense) string *s*. static void complement(int elen, char *s) { char *t; int c; t = s + (elen-1); while (s <= t) { c = *s; *s = (char) (3-*t); *t = (char) (3-c); s += 1; t -= 1; } } #define UNORM_LEN 60000 #define UNORM_MAX 6.0 static double unorm_table[UNORM_LEN+1]; // Upper half of cdf of N(0,1) static double unorm_scale; static void init_unorm() { double del, sum, x; int i; unorm_scale = del = UNORM_MAX / UNORM_LEN; sum = 0; // Integrate pdf, x >= 0 half only. 
for (i = 0; i < UNORM_LEN; i++) { x = i * del; unorm_table[i] = sum; sum += exp(-.5*x*x) * del; } unorm_table[UNORM_LEN] = sum; /* Normalize cdf */ sum *= 2.; for (i = 0; i < UNORM_LEN; i++) unorm_table[i] /= sum; unorm_table[UNORM_LEN] = 1.; #ifdef DEBUG printf("Truncated tail is < %g\n", exp(-.5*UNORM_MAX*UNORM_MAX)/(sum*(1.-exp(-UNORM_MAX))) ); printf("Diff between last two entries is %g\n",.5-unorm_table[UNORM_LEN-1]); printf("\n CDF:\n"); for (i = 0; i <= UNORM_LEN; i += 100) printf("%6.2f: %10.9f\n",i*del,unorm_table[i]); #endif } static int bin_search(int len, double *tab, double y) { int l, m, r; // Searches tab[0..len] for min { r : y < tab[r] }. // Assumes y < 1, tab[0] = 0 and tab[len] = 1. // So returned index is in [1,len]. l = 0; r = len; while (l < r) { m = (l+r) >> 1; if (y < tab[m]) r = m; else l = m+1; } return (r); } static double sample_unorm(double x) { double y; int f; if (x >= .5) // Map [0,1) random var to upper-half of cdf */ y = x-.5; else y = .5-x; f = bin_search(UNORM_LEN,unorm_table,y); // Bin. search upper-half cdf #ifdef DEBUG printf("Normal search %g -> %g -> %d",x,y,f); #endif // Linear interpolate between table points y = (f - (unorm_table[f]-y) / (unorm_table[f] - unorm_table[f-1]) ) * unorm_scale; if (x < .5) y = -y; // Map upper-half var back to full range #ifdef DEBUG printf(" -> %g\n",y); #endif return (y); } // Generate reads (a) whose lengths are exponentially distributed with mean *mean* and // standard deviation *stdev*, (b) that are never shorter than *shortest* and never // longer than the string *source*. Each read is a randomly sampled interval of // *source* (each interval is equally likely) that has insertion, deletion, and/or // substitution errors introduced into it and which is oriented in either the forward // or reverse strand direction with probability FLIP_RATE. 
//    The number of errors introduced is the length of the string times *erate*, and the
//    probability of an insertion, deletion, or substitution is controlled by the defined
//    constants INS_RATE and DEL_RATE.  Generate reads until the sum of the lengths of the
//    reads is greater than slen*coverage.  The reads are output as fasta entries with a
//    specific header format that contains the sampling interval, read length, and a read id.
//
//  In the code below *mean*, *stdev*, *shortest*, *erate*, *coverage* and *slen* are the
//    globals RMEAN, RSDEV, RSHORT, ERROR, COVERAGE and GENOME.  Reads go to stdout, 80
//    symbols per line, and the sampled interval of each read is written to MAP when MAP
//    is not NULL.
//  NOTE(review): the printf below emits ">Sim/<id>/0_<read length> RQ=0.<qv>", i.e. the
//    header carries no sampling interval -- that only goes to MAP.

static void shotgun(char *source)
{ int       maxlen, nreads, qv;
  int64     totlen, totbp;
  char     *rbuffer;
  double    nmean, nsdev;

  //  Convert the requested mean and standard deviation of the read lengths into the mean
  //    and standard deviation of the log of the read lengths.

  nsdev = (1.*RSDEV)/RMEAN;
  nsdev = log(1.+nsdev*nsdev);
  nmean = log(1.*RMEAN) - .5*nsdev;
  nsdev = sqrt(nsdev);

  if (GENOME < RSHORT)
    { fprintf(stderr,"Genome length is less than shortest read length !\n");
      exit (1);
    }

  init_unorm();

  qv = (int) (1000 * (1.-ERROR));   //  Digits printed after "RQ=0." in every header

  rbuffer = NULL;
  maxlen  = 0;
  totlen  = 0;
  totbp   = COVERAGE*GENOME;        //  Number of bases to generate
  nreads  = 0;
  while (totlen < totbp)
    { int   len, sdl, ins, del, elen, rbeg, rend;
      int   j;
      char *s, *t;

      //  NOTE(review): the double is cast to int before it is clamped to GENOME; a very
      //    large sample would be out of int range -- consider clamping first.

      len = (int) exp(nmean + nsdev*sample_unorm(drand48()));    //  Determine length of read.
      if (len > GENOME) len = GENOME;
      if (len < RSHORT) continue;   //  Too short: draw another length

      sdl = (int) (len*ERROR);      //  Determine number of inserts *ins*, deletions *del*,
      ins = del = 0;                //    and substitutions+deletions *sdl*.
      for (j = 0; j < sdl; j++)
        { double x = drand48();
          if (x < INS_RATE)
            ins += 1;
          else if (x < IDL_RATE)
            del += 1;
        }
      sdl -= ins;
      elen = len + (ins-del);       //  Length of the read once the errors are applied
      rbeg = (int) (drand48()*((GENOME-len)+.9999999));   //  Start of interval in [0,GENOME-len]
      rend = rbeg + len;

      if (elen > maxlen)            //  Grow the read buffer, with slack, when needed
        { maxlen  = ((int) (1.2*elen)) + 1000;
          rbuffer = (char *) Realloc(rbuffer,maxlen+3,"Allocating read buffer");
          if (rbuffer == NULL)
            exit (1);
        }

      t = rbuffer;
      s = source + rbeg;

      //   Generate the string with errors.  NB that inserts occur randomly between source
      //     characters, while deletions and substitutions occur on source characters.

      while ((len+1) * drand48() < ins)      //  Inserts ahead of the first source character
        { *t++ = (char) (4.*drand48());
          ins -= 1;
        }
      for ( ; len > 0; len--)
        { if (len * drand48() >= sdl)        //  Copy the source character unchanged
            *t++ = *s;
          else if (sdl * drand48() >= del)   //  Substitute a code different from *s
            { double x = 3.*drand48();
              if (x >= *s)
                x += 1.;
              *t++ = (char) x;
              sdl -= 1;
            }
          else                               //  Delete the source character
            { del -= 1;
              sdl -= 1;
            }
          s += 1;
          while (len * drand48() < ins)      //  Inserts after this source character
            { *t++ = (char) (4.*drand48());
              ins -= 1;
            }
        }
      *t = 4;                                //  End-of-read marker

      if (drand48() >= FLIP_RATE)    //  Complement the string with probability FLIP_RATE.
        { printf(">Sim/%d/%d_%d RQ=0.%d\n",nreads+1,0,elen,qv);
          complement(elen,rbuffer);
          j = rend;                  //  A flipped read is recorded with begin > end
          rend = rbeg;
          rbeg = j;
        }
      else
        printf(">Sim/%d/%d_%d RQ=0.%d\n",nreads+1,0,elen,qv);

      Lower_Read(rbuffer);           //  Presumably turns the 0..3 codes into letters (defined elsewhere)
      for (j = 0; j+80 < elen; j += 80)
        printf("%.80s\n",rbuffer+j);
      if (j < elen)
        printf("%s\n",rbuffer+j);

      if (MAP != NULL)
        fprintf(MAP," %9d %9d\n",rbeg,rend);

      totlen += elen;
      nreads += 1;
    }

  //  NOTE(review): rbuffer is never freed; the only visible caller exits right after
  //    this returns.
}

//  Parse the command line, build a random genome, and shotgun-sample reads from it.

int main(int argc, char *argv[])
{ char  *source;

  //  Usage: [-c] [-b] [-r]
  //         [-m] [-s] [-x]
  //         [-e] [-M

  //  NOTE(review): the source text is damaged from here up to the token "1.)" below.
  //    The code that follows uses i, j, eptr and glen and consists of case labels, yet
  //    no declarations, option loop, or switch header are visible, and there is no
  //    handling of -c or the start of -b.  Restore the missing text from the upstream
  //    file; nothing has been reconstructed here.

  1.)
              { fprintf(stderr,"%s: AT-bias must be in [0,1] (%g)\n",Prog_Name,BIAS);
                exit (1);
              }
            break;
          case 'r':                  //  Seed must be a complete, non-empty integer
            SEED = strtol(argv[i]+2,&eptr,10);
            HASR = 1;
            if (*eptr != '\0' || argv[i][2] == '\0')
              { fprintf(stderr,"%s: -r argument is not an integer\n",Prog_Name);
                exit (1);
              }
            break;
          case 'M':                  //  Open the map file named right after the flag
            MAP = Fopen(argv[i]+2,"w");
            if (MAP == NULL)
              exit (1);
            break;
          case 'm':
            ARG_POSITIVE(RMEAN,"Mean read length")
            break;
          case 's':
            ARG_POSITIVE(RSDEV,"Read length standard deviation")
            break;
          case 'x':
            ARG_NON_NEGATIVE(RSHORT,"Read length minimum")
            break;
          case 'e':
            ARG_REAL(ERROR)
            if (ERROR < 0. || ERROR > .5)
              { fprintf(stderr,"%s: Error rate must be in [0,.5] (%g)\n",Prog_Name,ERROR);
                exit (1);
              }
            break;
        }
      else
        argv[j++] = argv[i];         //  Keep non-option arguments, packed to the front
    argc = j;

    if (argc != 2)                   //  Exactly one positional argument is required
      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
        fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
        fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]);
        exit (1);
      }

    glen = strtod(argv[1],&eptr);    //  Genome length in Mbp
    if (*eptr != '\0')
      { fprintf(stderr,"%s: genome length is not a real number\n",Prog_Name);
        exit (1);
      }
    if (glen < 0.)                   //  NOTE(review): the message says "positive" but 0 is accepted
      { fprintf(stderr,"%s: Genome length must be positive (%g)\n",Prog_Name,glen);
        exit (1);
      }
    GENOME = (int) (glen*1000000.);  //  NOTE(review): no upper bound check before the cast to int
  }

  source = random_genome();

  shotgun(source);

  if (MAP != NULL)
    fclose(MAP);

  exit (0);
}