pax_global_header00006660000000000000000000000064125375246740014531gustar00rootroot0000000000000052 comment=d767683b3f37798c34e964d62aeb17ed2831cc60 DEXTRACTOR-1.0/000077500000000000000000000000001253752467400130505ustar00rootroot00000000000000DEXTRACTOR-1.0/DB.c000066400000000000000000001264201253752467400135060ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. 
* * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. Eventually will also * support compressed quality information. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif /******************************************************************************************* * * GENERAL UTILITIES * ********************************************************************************************/ char *Prog_Name; #ifdef INTERACTIVE char Ebuffer[1000]; #endif void *Malloc(int64 size, char *mesg) { void *p; if ((p = malloc(size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } void *Realloc(void *p, int64 size, char *mesg) { if ((p = realloc(p,size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } char *Strdup(char *name, char *mesg) { char *s; if (name == NULL) return (NULL); if ((s = strdup(name)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (s); } FILE *Fopen(char *name, char *mode) 
{ FILE *f; if (name == NULL || mode == NULL) return (NULL); if ((f = fopen(name,mode)) == NULL) EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); return (f); } char *PathTo(char *name) { char *path, *find; if (name == NULL) return (NULL); if ((find = rindex(name,'/')) != NULL) { *find = '\0'; path = Strdup(name,"Extracting path from"); *find = '/'; } else path = Strdup(".","Allocating default path"); return (path); } char *Root(char *name, char *suffix) { char *path, *find, *dot; int epos; if (name == NULL) return (NULL); find = rindex(name,'/'); if (find == NULL) find = name; else find += 1; if (suffix == NULL) { dot = strchr(find,'.'); if (dot != NULL) *dot = '\0'; path = Strdup(find,"Extracting root from"); if (dot != NULL) *dot = '.'; } else { epos = strlen(find); epos -= strlen(suffix); if (epos > 0 && strcasecmp(find+epos,suffix) == 0) { find[epos] = '\0'; path = Strdup(find,"Extracting root from"); find[epos] = suffix[0]; } else path = Strdup(find,"Allocating root"); } return (path); } char *Catenate(char *path, char *sep, char *root, char *suffix) { static char *cat = NULL; static int max = -1; int len; if (path == NULL || root == NULL || sep == NULL || suffix == NULL) return (NULL); len = strlen(path); len += strlen(sep); len += strlen(root); len += strlen(suffix); if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); return (NULL); } } sprintf(cat,"%s%s%s%s",path,sep,root,suffix); return (cat); } char *Numbered_Suffix(char *left, int num, char *right) { static char *suffix = NULL; static int max = -1; int len; if (left == NULL || right == NULL) return (NULL); len = strlen(left); len += strlen(right) + 40; if (len > max) { max = ((int) (1.2*len)) + 100; if ((suffix = (char *) realloc(suffix,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); return (NULL); } } 
sprintf(suffix,"%s%d%s",left,num,right); return (suffix); } #define COMMA ',' // Print big integers with commas/periods for better readability void Print_Number(int64 num, int width, FILE *out) { if (width == 0) { if (num < 1000ll) fprintf(out,"%lld",num); else if (num < 1000000ll) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else if (num < 1000000000ll) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll, COMMA,(num%1000000000ll)/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); } else { if (num < 1000ll) fprintf(out,"%*lld",width,num); else if (num < 1000000ll) { if (width <= 4) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll); } else if (num < 1000000000ll) { if (width <= 8) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); } else { if (width <= 12) fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); } } } // Return the number of digits, base 10, of num int Number_Digits(int64 num) { int digit; digit = 0; while (num >= 1) { num /= 10; digit += 1; } return (digit); } /******************************************************************************************* * * READ COMPRESSION/DECOMPRESSION UTILITIES * ********************************************************************************************/ // Compress read into 2-bits per base (from [0-3] per byte representation void Compress_Read(int len, char *s) { int i; char c, d; char *s0, *s1, *s2, *s3; s0 
= s; s1 = s0+1; s2 = s1+1; s3 = s2+1; c = s1[len]; d = s2[len]; s0[len] = s1[len] = s2[len] = 0; for (i = 0; i < len; i += 4) *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]); s1[len] = c; s2[len] = d; } // Uncompress read form 2-bits per base into [0-3] per byte representation void Uncompress_Read(int len, char *s) { int i, tlen, byte; char *s0, *s1, *s2, *s3; char *t; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; tlen = (len-1)/4; t = s+tlen; for (i = tlen*4; i >= 0; i -= 4) { byte = *t--; s0[i] = (char) ((byte >> 6) & 0x3); s1[i] = (char) ((byte >> 4) & 0x3); s2[i] = (char) ((byte >> 2) & 0x3); s3[i] = (char) (byte & 0x3); } s[len] = 4; } // Convert read in [0-3] representation to ascii representation (end with '\n') void Lower_Read(char *s) { static char letter[4] = { 'a', 'c', 'g', 't' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } void Upper_Read(char *s) { static char letter[4] = { 'A', 'C', 'G', 'T' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } // Convert read in ascii representation to [0-3] representation (end with 4) void Number_Read(char *s) { static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; for ( ; *s != '\0'; s++) *s = number[(int) *s]; *s = 4; } /******************************************************************************************* * * DB OPEN, TRIM & CLOSE ROUTINES * ********************************************************************************************/ // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has // a part # in it then just the part is opened. 
The index array is allocated (for all or // just the part) and read in. // Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char* path, HITS_DB *db) { HITS_DB dbcopy; char *root, *pwd, *bptr, *fptr, *cat; int nreads; FILE *index, *dbvis; int status, plen, isdam; int part, cutoff, all; int ufirst, tfirst, ulast, tlast; status = -1; dbcopy = *db; plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); pwd = PathTo(path); bptr = rindex(root,'.'); if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-') { part = strtol(bptr+1,&fptr,10); if (*fptr != '\0' || part == 0) part = 0; else *bptr = '\0'; } else part = 0; isdam = 0; cat = Catenate(pwd,"/",root,".db"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { cat = Catenate(pwd,"/",root,".dam"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path); goto error; } isdam = 1; } if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) goto error1; if (fread(db,sizeof(HITS_DB),1,index) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); goto error2; } { int p, nblocks, nfiles; int64 size; char fname[MAX_NAME], prolog[MAX_NAME]; nblocks = 0; if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } for (p = 0; p < nfiles; p++) if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) if (part == 0) { cutoff = 0; all = 1; } else { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", Prog_Name,root); goto error2; } else { if 
(fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (part > nblocks) { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); goto error2; } } if (part > 0) { for (p = 1; p <= part; p++) if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } } else { ufirst = tfirst = 0; ulast = db->ureads; tlast = db->treads; } } db->trimmed = 0; db->tracks = NULL; db->part = part; db->cutoff = cutoff; db->all = all; db->ufirst = ufirst; db->tfirst = tfirst; nreads = ulast-ufirst; if (part <= 0) { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); db->reads += 1; if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(db->reads); goto error2; } } else { HITS_READ *reads; int i, r, maxlen; int64 totlen; reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); reads += 1; fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR); if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(reads); goto error2; } totlen = 0; maxlen = 0; for (i = 0; i < nreads; i++) { r = reads[i].rlen; totlen += r; if (r > maxlen) maxlen = r; } db->maxlen = maxlen; db->totlen = totlen; db->reads = reads; } ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part ((int *) (db->reads))[-2] = tlast - tfirst; db->nreads = nreads; db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); if (db->path == NULL) goto error2; db->bases = NULL; db->loaded = 0; status = isdam; error2: fclose(index); error1: fclose(dbvis); 
error: if (bptr != NULL) *bptr = '.'; free(pwd); free(root); if (status < 0) *db = dbcopy; return (status); } // Trim the DB or part thereof and all loaded tracks according to the cuttof and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(HITS_DB *db) { int i, j, r; int allflag, cutoff; int64 totlen; int maxlen, nreads; HITS_TRACK *record; HITS_READ *reads; if (db->trimmed) return; if (db->cutoff <= 0 && db->all) return; cutoff = db->cutoff; if (db->all) allflag = 0; else allflag = DB_BEST; reads = db->reads; nreads = db->nreads; for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,".@qvs") == 0) { uint16 *table = ((HITS_QV *) record)->table; j = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) table[j++] = table[i]; } else { int *anno4, size; int64 *anno8; char *anno, *data; size = record->size; data = (char *) record->data; if (data == NULL) { anno = (char *) record->anno; j = 0; for (i = r = 0; i < db->nreads; i++, r += size) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (record->anno); j = anno4[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (record->anno); j = anno8[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno8[j],NULL); } record->anno = 
Realloc(record->anno,record->size*(j+1),NULL); } totlen = maxlen = 0; for (j = i = 0; i < nreads; i++) { r = reads[i].rlen; if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff) { totlen += r; if (r > maxlen) maxlen = r; reads[j++] = reads[i]; } } db->totlen = totlen; db->maxlen = maxlen; db->nreads = j; db->trimmed = 1; if (j < nreads) { db->reads = Realloc(reads-1,sizeof(HITS_READ)*(j+2),NULL); db->reads += 1; } } // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). void Close_DB(HITS_DB *db) { HITS_TRACK *t, *p; if (db->loaded) free(((char *) (db->bases)) - 1); else if (db->bases != NULL) fclose((FILE *) db->bases); free(db->reads-1); free(db->path); Close_QVs(db); for (t = db->tracks; t != NULL; t = p) { p = t->next; free(t->anno); free(t->data); free(t); } } /******************************************************************************************* * * QV LOAD & CLOSE ROUTINES * ********************************************************************************************/ HITS_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" HITS_QV *Active_QV; // Becomes invalid after closing int Load_QVs(HITS_DB *db) { FILE *quiva, *istub, *indx; char *root; uint16 *table; HITS_QV *qvtrk; QVcoding *coding, *nx; int ncodes; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) return (0); if (db->trimmed) { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); EXIT(1); } if (db->reads[db->nreads-1].coff < 0) { EPRINTF(EPLACE,"%s: The requested QVs have not been added to the DB!\n",Prog_Name); EXIT(1); } // Open .qvs, .idx, and .db files quiva = Fopen(Catenate(db->path,"","",".qvs"),"r"); if (quiva == NULL) return (-1); istub = NULL; indx = NULL; table = NULL; coding = NULL; qvtrk = NULL; root = rindex(db->path,'/') + 2; istub = 
Fopen(Catenate(db->path,"/",root,".db"),"r"); if (istub == NULL) goto error; { int first, last, nfiles; char prolog[MAX_NAME], fname[MAX_NAME]; int i, j; if (fscanf(istub,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (db->part > 0) { int pfirst, plast; int fbeg, fend; int n, k; FILE *indx; // Determine first how many and which files span the block (fbeg to fend) pfirst = db->ufirst; plast = pfirst + db->nreads; first = 0; for (fbeg = 0; fbeg < nfiles; fbeg++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (last > pfirst) break; first = last; } for (fend = fbeg+1; fend <= nfiles; fend++) { if (last >= plast) break; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } first = last; } indx = Fopen(Catenate(db->path,"","",".idx"),"r"); ncodes = fend-fbeg; coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (indx == NULL || coding == NULL || table == NULL) { ncodes = 0; goto error; } // Carefully get the first coding scheme (its offset is most likely in a HITS_RECORD // in .idx that is *not* in memory). Get all the other coding schemes normally and // assign the tables # for each read in the block in "tables". 
rewind(istub); fscanf(istub,DB_NFILE,&nfiles); first = 0; for (n = 0; n < fbeg; n++) { fscanf(istub,DB_FDATA,&last,fname,prolog); first = last; } for (n = fbeg; n < fend; n++) { fscanf(istub,DB_FDATA,&last,fname,prolog); i = n-fbeg; if (first < pfirst) { HITS_READ read; fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*first,SEEK_SET); if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); ncodes = i; goto error; } fseeko(quiva,read.coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; } else { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first-pfirst].coff = ftello(quiva); } j = first-pfirst; if (j < 0) j = 0; k = last-pfirst; if (k > db->nreads) k = db->nreads; while (j < k) table[j++] = (uint16) i; first = last; } fclose(indx); indx = NULL; } else { // Load in coding scheme for each file, adjust .coff of first read in the file, and // record which table each read uses ncodes = nfiles; coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (coding == NULL || table == NULL) goto error; first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } fseeko(quiva,db->reads[first].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first].coff = ftello(quiva); for (j = first; j < last; j++) table[j] = (uint16) i; first = last; } } // Allocate and fill in the HITS_QV record and add it to the front of the // track list qvtrk = (HITS_QV *) Malloc(sizeof(HITS_QV),"Allocating QV pseudo-track"); if (qvtrk == NULL) goto error; qvtrk->name = Strdup(".@qvs","Allocating QV 
pseudo-track name"); if (qvtrk->name == NULL) goto error; qvtrk->next = db->tracks; db->tracks = (HITS_TRACK *) qvtrk; qvtrk->ncodes = ncodes; qvtrk->table = table; qvtrk->coding = coding; qvtrk->quiva = quiva; } fclose(istub); return (0); error: if (qvtrk != NULL) free(qvtrk); if (table != NULL) free(table); if (coding != NULL) { int i; for (i = 0; i < ncodes; i++) Free_QVcoding(coding+i); free(coding); } if (indx != NULL) fclose(indx); if (istub != NULL) fclose(istub); fclose(quiva); EXIT(1); } // Close the QV stream, free the QV pseudo track and all associated memory void Close_QVs(HITS_DB *db) { HITS_TRACK *track; HITS_QV *qvtrk; int i; Active_DB = NULL; track = db->tracks; if (track != NULL && strcmp(track->name,".@qvs") == 0) { qvtrk = (HITS_QV *) track; for (i = 0; i < qvtrk->ncodes; i++) Free_QVcoding(qvtrk->coding+i); free(qvtrk->coding); free(qvtrk->table); fclose(qvtrk->quiva); db->tracks = track->next; free(track); } return; } /******************************************************************************************* * * TRACK LOAD & CLOSE ROUTINES * ********************************************************************************************/ // Return status of track: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track int Check_Track(HITS_DB *db, char *track) { FILE *afile; int tracklen, ispart; int ureads, treads; afile = NULL; if (db->part > 0) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) return (-2); if (fread(&tracklen,sizeof(int),1,afile) != 1) return (-1); fclose(afile); if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (tracklen == treads) return (1); else if (tracklen == 
ureads) return (0); else return (-1); } // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created HITS_TRACK record. If the track does not exist or cannot be // opened for some reason, then NULL is returned. HITS_TRACK *Load_Track(HITS_DB *db, char *track) { FILE *afile, *dfile; int tracklen, size; int nreads, ispart; int treads, ureads; void *anno; void *data; char *name; HITS_TRACK *record; if (track[0] == '.') { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track); EXIT(NULL); } for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,track) == 0) return (record); afile = NULL; if (db->part) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track); return (NULL); } dfile = NULL; anno = NULL; data = NULL; record = NULL; if (ispart) name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data"); else name = Catenate(db->path,".",track,".data"); if (name == NULL) goto error; dfile = fopen(name,"r"); if (fread(&tracklen,sizeof(int),1,afile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (fread(&size,sizeof(int),1,afile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (size <= 0) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (db->trimmed) { if (tracklen != treads) { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); 
goto error; } if ( ! ispart && db->part > 0) fseeko(afile,size*db->tfirst,SEEK_CUR); } else { if (tracklen != ureads) { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); goto error; } if ( ! ispart && db->part > 0) fseeko(afile,size*db->ufirst,SEEK_CUR); } nreads = db->nreads; anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector"); if (anno == NULL) goto error; if (dfile != NULL) { int64 *anno8, off8, dlen; int *anno4, off4; int i; if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1)) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (size == 4) { anno4 = (int *) anno; off4 = anno4[0]; if (off4 != 0) { for (i = 0; i <= nreads; i++) anno4[i] -= off4; fseeko(dfile,off4,SEEK_SET); } dlen = anno4[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } else { anno8 = (int64 *) anno; off8 = anno8[0]; if (off8 != 0) { for (i = 0; i <= nreads; i++) anno8[i] -= off8; fseeko(dfile,off8,SEEK_SET); } dlen = anno8[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } if (data == NULL) goto error; if (dlen > 0) { if (fread(data,dlen,1,dfile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track); goto error; } } fclose(dfile); dfile = NULL; } else { if (fread(anno,size,nreads,afile) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } data = NULL; } fclose(afile); record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record"); if (record == NULL) goto error; record->name = Strdup(track,"Allocating Track Name"); if (record->name == NULL) goto error; record->data = data; record->anno = anno; record->size = size; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) { record->next = db->tracks->next; db->tracks->next = record; } else { record->next = db->tracks; db->tracks = record; } return (record); error: if (record == NULL) free(record); if 
(data != NULL) free(data); if (anno != NULL) free(anno); if (dfile != NULL) fclose(dfile); fclose(afile); EXIT (NULL); } void Close_Track(HITS_DB *db, char *track) { HITS_TRACK *record, *prev; prev = NULL; for (record = db->tracks; record != NULL; record = record->next) { if (strcmp(record->name,track) == 0) { free(record->anno); free(record->data); free(record->name); if (prev == NULL) db->tracks = record->next; else prev->next = record->next; free(record); return; } prev = record; } return; } /******************************************************************************************* * * READ BUFFER ALLOCATION AND READ ACCESS * ********************************************************************************************/ // Allocate and return a buffer big enough for the largest read in 'db', leaving room // for an initial delimiter character char *New_Read_Buffer(HITS_DB *db) { char *read; read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer"); if (read == NULL) EXIT(NULL); return (read+1); } // Load into 'read' the i'th read in 'db'. As an upper case ASCII string if ascii is 2, as a // lower-case ASCII string is ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and // 3(T) otherwise. // // **NB**, the byte before read will be set to a delimiter character! 
// Load_Read: fetch read i of 'db' into 'read'.
//   db    : open database; its .bps file is opened on first use and cached in db->bases.
//   i     : read index, must satisfy 0 <= i < db->nreads.
//   read  : caller's buffer; read[-1] is written, so it must have a byte in front of it
//             (as the buffers returned by New_Read_Buffer do).
//   ascii : 1 => lower-case letters, 2 => upper-case letters, otherwise numeric 0..3.
//   Returns 0 on success.  On a bad index, an unopenable .bps file, or a failed read the
//   routine reports to EPLACE and leaves through EXIT(1).

int Load_Read(HITS_DB *db, int i, char *read, int ascii)
{ FILE      *bases = (FILE *) db->bases;
  int64      off;
  int        len, clen;
  HITS_READ *r = db->reads;

  if (i < 0 || i >= db->nreads)     //  a negative index would address memory before r[]
    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
      EXIT(1);
    }
  if (bases == NULL)                //  lazily open the bases file and remember it
    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
      if (bases == NULL)
        EXIT(1);
      db->bases = (void *) bases;
    }

  off = r[i].boff;
  len = r[i].rlen;

  if (ftello(bases) != off)         //  only seek when not already positioned at the read
    fseeko(bases,off,SEEK_SET);
  clen = COMPRESSED_LEN(len);
  if (clen > 0)
    { if (fread(read,clen,1,bases) != 1)
        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name);
          EXIT(1);
        }
    }
  Uncompress_Read(len,read);

  //  The leading delimiter matches the alphabet: '\0' for letters, 4 for numeric form

  if (ascii == 1)
    { Lower_Read(read);
      read[-1] = '\0';
    }
  else if (ascii == 2)
    { Upper_Read(read);
      read[-1] = '\0';
    }
  else
    read[-1] = 4;
  return (0);
}

// Load_Subread: fetch bases beg .. end-1 of read i of 'db' (end - beg bases) into 'read'.
//   Whole compressed bytes are fetched (4 bases each), so the requested interval begins
//   beg%4 characters into the buffer; the pointer returned addresses that position and
//   is in general NOT equal to 'read'.  The byte before the returned pointer and the
//   byte after the last requested base are overwritten with delimiters.
//   i must satisfy 0 <= i < db->nreads and the range 0 <= beg <= end <= length of read i.
//   ascii is interpreted as for Load_Read.
//   On a bad index or range, an unopenable .bps file, or a failed read the routine
//   reports to EPLACE and leaves through EXIT(NULL).

char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
{ FILE      *bases = (FILE *) db->bases;
  int64      off;
  int        len, clen;
  int        bbeg, bend;
  HITS_READ *r = db->reads;

  if (i < 0 || i >= db->nreads)
    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Subread)\n",Prog_Name);
      EXIT(NULL);
    }
  if (beg < 0 || end < beg || end > r[i].rlen)    //  else 'len' below could be negative
    { EPRINTF(EPLACE,"%s: Subread range out of bounds (Load_Subread)\n",Prog_Name);
      EXIT(NULL);
    }
  if (bases == NULL)
    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
      if (bases == NULL)
        EXIT(NULL);
      db->bases = (void *) bases;
    }

  bbeg = beg/4;              //  first compressed byte holding a requested base
  bend = (end-1)/4+1;        //  one past the last such byte

  off = r[i].boff + bbeg;
  len = end - beg;

  if (ftello(bases) != off)
    fseeko(bases,off,SEEK_SET);
  clen = bend-bbeg;
  if (clen > 0)
    { if (fread(read,clen,1,bases) != 1)
        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Subread)\n",Prog_Name);
          EXIT(NULL);
        }
    }
  Uncompress_Read(4*clen,read);

  read += beg%4;             //  skip the bases of the first byte that precede 'beg'
  read[len] = 4;             //  terminate right after the last requested base

  if (ascii == 1)
    { Lower_Read(read);
      read[-1] = '\0';
    }
  else if (ascii == 2)
    { Upper_Read(read);
      read[-1] = '\0';
    }
  else
    read[-1] = 4;

  return (read);
}


/*******************************************************************************************
 *
 *  QV BUFFER ALLOCATION QV READ ACCESS
 *
 ********************************************************************************************/

// Allocate and return a buffer of 5
vectors big enough for the largest read in 'db' char **New_QV_Buffer(HITS_DB *db) { char **entry; char *qvs; int i; qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer"); entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer"); if (qvs == NULL || entry == NULL) EXIT(NULL); for (i = 0; i < 5; i++) entry[i] = qvs + i*db->maxlen; return (entry); } // Load into entry the QV streams for the i'th read from db. The parameter ascii applies to // the DELTAG stream as described for Load_Read. int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) { HITS_READ *reads; FILE *quiva; int rlen; if (db != Active_DB) { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0) { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name); EXIT(1); } Active_QV = (HITS_QV *) db->tracks; Active_DB = db; } if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); EXIT(1); } reads = db->reads; quiva = Active_QV->quiva; rlen = reads[i].rlen; fseeko(quiva,reads[i].coff,SEEK_SET); if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen)) EXIT(1); if (ascii != 1) { char *deltag = entry[1]; if (ascii != 2) { char x = deltag[rlen]; deltag[rlen] = '\0'; Number_Read(deltag); deltag[rlen] = x; } else { int j; int u = 'A'-'a'; for (j = 0; j < rlen; j++) deltag[j] = (char) (deltag[j]+u); } } return (0); } /******************************************************************************************* * * BLOCK LOAD OF ALL READS (PRIMARILY FOR DALIGNER) * ********************************************************************************************/ // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. 
If ascii is // non-zero then the reads are converted to ACGT ascii, otherwise the reads are left // as numeric strings over 0(A), 1(C), 2(G), and 3(T). int Read_All_Sequences(HITS_DB *db, int ascii) { FILE *bases; int nreads = db->nreads; HITS_READ *reads = db->reads; void (*translate)(char *s); char *seq; int64 o, off; int i, len, clen; bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(1); seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); if (seq == NULL) { fclose(bases); EXIT(1); } *seq++ = 4; if (ascii == 1) translate = Lower_Read; else translate = Upper_Read; o = 0; for (i = 0; i < nreads; i++) { len = reads[i].rlen; off = reads[i].boff; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(seq+o,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name); free(seq); fclose(bases); EXIT(1); } } Uncompress_Read(len,seq+o); if (ascii) translate(seq+o); reads[i].boff = o; o += (len+1); } reads[nreads].boff = o; fclose(bases); db->bases = (void *) seq; db->loaded = 1; return (0); } int List_DB_Files(char *path, void actor(char *path, char *extension)) { int status, plen, rlen, dlen; char *root, *pwd, *name; int isdam; DIR *dirp; struct dirent *dp; status = 0; pwd = PathTo(path); plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); rlen = strlen(root); if (root == NULL || pwd == NULL) { free(pwd); free(root); EXIT(1); } if ((dirp = opendir(pwd)) == NULL) { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; goto error; } isdam = 0; while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary) { name = dp->d_name; if (strcmp(name,Catenate("","",root,".db")) == 0) break; if (strcmp(name,Catenate("","",root,".dam")) == 0) { isdam = 1; break; } if (strcasecmp(name,Catenate("","",root,".db")) == 0) { 
strncpy(root,name,rlen); break; } if (strcasecmp(name,Catenate("","",root,".dam")) == 0) { strncpy(root,name,rlen); isdam = 1; break; } } if (dp == NULL) { EPRINTF(EPLACE,"%s: Cannot find %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; closedir(dirp); goto error; } if (isdam) actor(Catenate(pwd,"/",root,".dam"),"dam"); else actor(Catenate(pwd,"/",root,".db"),"db"); rewinddir(dirp); // Report each auxiliary file while ((dp = readdir(dirp)) != NULL) { name = dp->d_name; dlen = strlen(name); #ifdef HIDE_FILES if (name[0] != '.') continue; dlen -= 1; name += 1; #endif if (dlen < rlen+1) continue; if (name[rlen] != '.') continue; if (strncmp(name,root,rlen) != 0) continue; actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); } closedir(dirp); error: free(pwd); free(root); return (status); } void Print_Read(char *s, int width) { int i; if (s[0] < 4) { for (i = 0; s[i] != 4; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%d",s[i]); } printf("\n"); } else { for (i = 0; s[i] != '\0'; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%c",s[i]); } printf("\n"); } } DEXTRACTOR-1.0/DB.h000066400000000000000000000551471253752467400135220ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. 
* * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. Eventually will also * support compressed quality information. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #ifndef _HITS_DB #define _HITS_DB #include #include "QV.h" #define HIDE_FILES // Auxiliary DB files start with a . 
so they are "hidden" // Undefine if you don't want this // For interactive applications where it is inappropriate to simply exit with an error // message to standard error, define the constant INTERACTIVE. If set, then error // messages are put in the global variable Ebuffer and the caller of a DB routine // can decide how to deal with the error. // // DB, QV, or alignment routines that can encounter errors function as before in // non-INTERACTIVE mode by exiting after printing an error message to stderr. In // INTERACTIVE mode the routines place a message at EPLACE and return an error // value. For such routines that were previously void, they are now int, and // return 1 if an error occured, 0 otherwise. #undef INTERACTIVE #ifdef INTERACTIVE #define EPRINTF sprintf #define EPLACE Ebuffer #define EXIT(x) return (x) #else // BATCH #define EPRINTF fprintf #define EPLACE stderr #define EXIT(x) exit (1) #endif typedef unsigned char uint8; typedef unsigned short uint16; typedef unsigned int uint32; typedef unsigned long long uint64; typedef signed char int8; typedef signed short int16; typedef signed int int32; typedef signed long long int64; typedef float float32; typedef double float64; /******************************************************************************************* * * COMMAND LINE INTERPRETATION MACROS * ********************************************************************************************/ extern char *Prog_Name; // Name of program #ifdef INTERACTIVE extern char Ebuffer[]; #endif #define SYSTEM_ERROR \ { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \ exit (2); \ } #define ARG_INIT(name) \ Prog_Name = Strdup(name,""); \ for (i = 0; i < 128; i++) \ flags[i] = 0; #define ARG_FLAGS(set) \ for (k = 1; argv[i][k] != '\0'; k++) \ { if (index(set,argv[i][k]) == NULL) \ { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \ exit (1); \ } \ flags[(int) argv[i][k]] = 1; \ } #define ARG_POSITIVE(var,name) \ var = 
strtol(argv[i]+2,&eptr,10); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \ exit (1); \ } \ if (var <= 0) \ { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_NON_NEGATIVE(var,name) \ var = strtol(argv[i]+2,&eptr,10); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \ exit (1); \ } \ if (var < 0) \ { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_REAL(var) \ var = strtod(argv[i]+2,&eptr); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c argument is not a real number\n",Prog_Name,argv[i][1]); \ exit (1); \ } /******************************************************************************************* * * UTILITIES * ********************************************************************************************/ // The following general utilities return NULL if any of their input pointers are NULL, or if they // could not perform their function (in which case they also print an error to stderr). 
void *Malloc(int64 size, char *mesg); // Guarded versions of malloc, realloc void *Realloc(void *object, int64 size, char *mesg); // and strdup, that output "mesg" to char *Strdup(char *string, char *mesg); // stderr if out of memory FILE *Fopen(char *path, char *mode); // Open file path for "mode" char *PathTo(char *path); // Return path portion of file name "path" char *Root(char *path, char *suffix); // Return the root name, excluding suffix, of "path" // Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer // Numbered_Suffix returns concatenation of left..right in a *temporary* buffer char *Catenate(char *path, char *sep, char *root, char *suffix); char *Numbered_Suffix(char *left, int num, char *right); // DB-related utilities void Print_Number(int64 num, int width, FILE *out); // Print readable big integer int Number_Digits(int64 num); // Return # of digits in printed number #define COMPRESSED_LEN(len) (((len)+3) >> 2) void Compress_Read(int len, char *s); // Compress read in-place into 2-bit form void Uncompress_Read(int len, char *s); // Uncompress read in-place into numeric form void Print_Read(char *s, int width); void Lower_Read(char *s); // Convert read from numbers to lowercase letters (0-3 to acgt) void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT) void Number_Read(char *s); // Convert read from letters to numbers /******************************************************************************************* * * DB IN-CORE DATA STRUCTURES * ********************************************************************************************/ #define DB_QV 0x03ff // Mask for 3-digit quality value #define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert #define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1) typedef struct { int origin; // Well # int rlen; // Length of the sequence (Last pulse = fpulse + rlen) int fpulse; // First 
pulse int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file int flags; // QV of read + flags above } HITS_READ; // A track can be of 3 types: // data == NULL: there are nreads 'anno' records of size 'size'. // data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1]) // contains the variable length data // data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1]) // contains the variable length data typedef struct _track { struct _track *next; // Link to next track char *name; // Symbolic name of track int size; // Size in bytes of anno records void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records void *data; // data[anno[i] .. anno[i+1]-1] is data if data != NULL } HITS_TRACK; // The information for accessing QV streams is in a HITS_QV record that is a "pseudo-track" // named ".@qvs" and is always the first track record in the list (if present). Since normal // track names cannot begin with a . (this is enforced), this pseudo-track is never confused // with a normal track. typedef struct { struct _track *next; char *name; int ncodes; // # of coding tables QVcoding *coding; // array [0..ncodes-1] of coding schemes (see QV.h) uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with // scheme coding[table[i]] FILE *quiva; // the open file pointer to the .qvs file } HITS_QV; // The DB record holds all information about the current state of an active DB including an // array of HITS_READS, one per read, and a linked list of HITS_TRACKs the first of which // is always a HITS_QV pseudo-track (if the QVs have been loaded). 
typedef struct { int ureads; // Total number of reads in untrimmed DB int treads; // Total number of reads in trimmed DB int cutoff; // Minimum read length in block (-1 if not yet set) int all; // Consider multiple reads from a given well float freq[4]; // frequency of A, C, G, T, respectively // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed) int maxlen; // length of maximum read (initially over all DB) int64 totlen; // total # of bases (initially over all DB) int nreads; // # of reads in actively loaded portion of DB int trimmed; // DB has been trimmed by cutoff/all int part; // DB block (if > 0), total DB (if == 0) int ufirst; // Index of first read in block (without trimming) int tfirst; // Index of first read in block (with trimming) // In order to avoid forcing users to have to rebuild all thier DBs to accommodate // the addition of fields for the size of the actively loaded trimmed and untrimmed // blocks, an additional read record is allocated in "reads" when a DB is loaded into // memory (reads[-1]) and the two desired fields are crammed into the first two // integer spaces of the record. char *path; // Root name of DB for .bps, .qvs, and tracks int loaded; // Are reads loaded in memory? void *bases; // file pointer for bases file (to fetch reads from), // or memory pointer to uncompressed block of all sequences. 
HITS_READ *reads; // Array [-1..nreads] of HITS_READ HITS_TRACK *tracks; // Linked list of loaded tracks } HITS_DB; /******************************************************************************************* * * DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock * ********************************************************************************************/ #define MAX_NAME 10000 // Longest file name or fasta header line #define DB_NFILE "files = %9d\n" // number of files #define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name #define DB_NBLOCK "blocks = %9d\n" // number of blocks #define DB_PARAMS "size = %9lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well #define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed) /******************************************************************************************* * * DB ROUTINES * ********************************************************************************************/ // Suppose DB is the name of an original database. Then there will be files .DB.idx, .DB.bps, // .DB.qvs, and files .DB..anno and DB..data where is a track name // (not containing a . !). // A DAM is basically a DB except that: // 1. there are no QV's, instead .coff points the '\0' terminated fasta header of the read // in the file ..hdr file // 2. .origin contains the contig # of the read within a fasta entry (assembly sequences // contain N-separated contigs), and .fpulse the first base of the contig in the // fasta entry // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. 
//   Return status of routine:
//      -1: The DB could not be opened for a reason reported by the routine to EPLACE
//       0: Open of DB proceeded without mishap
//       1: Open of DAM proceeded without mishap

int Open_DB(char *path, HITS_DB *db);

// Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings
//   of the current DB partition.  Reallocate smaller memory blocks for the information kept
//   for the retained reads.

void Trim_DB(HITS_DB *db);

// Shut down an open 'db' by freeing all associated space, including tracks and QV structures,
//   and any open file pointers.  The record pointed at by db however remains (the user
//   supplied it and so should free it).

void Close_DB(HITS_DB *db);

// If QV pseudo track is not already in db's track list, then load it and set it up.
//   The database must not have been trimmed yet.  -1 is returned if a .qvs file is not
//   present, and 1 is returned if an error (reported to EPLACE) occurred and INTERACTIVE
//   is defined.  Otherwise a 0 is returned.

int Load_QVs(HITS_DB *db);

// Remove the QV pseudo track, all space associated with it, and close the .qvs file.

void Close_QVs(HITS_DB *db);

// Look up the file and header in the file of the indicated track.  Return:
//     1: Track is for trimmed DB
//     0: Track is for untrimmed DB
//    -1: Track is not the right size of DB either trimmed or untrimmed
//    -2: Could not find the track

int Check_Track(HITS_DB *db, char *track);

// If track is not already in the db's track list, then allocate all the storage for it,
//   read it in from the appropriate file, add it to the track list, and return a pointer
//   to the newly created HITS_TRACK record.  If the track does not exist or cannot be
//   opened for some reason, then NULL is returned if INTERACTIVE is defined.  Otherwise
//   the routine prints an error message to stderr and exits if an error occurs, and returns
//   with NULL only if the track does not exist.
HITS_TRACK *Load_Track(HITS_DB *db, char *track);

// If track is on the db's track list, then it is removed and all storage associated with it
//   is freed.

void Close_Track(HITS_DB *db, char *track);

// Allocate and return a buffer big enough for the largest read in 'db'.
// **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte
// are needed by the alignment algorithms.  If cannot allocate memory then return NULL
// if INTERACTIVE is defined, or print error to stderr and exit otherwise.

char *New_Read_Buffer(HITS_DB *db);

// Load into 'read' the i'th read in 'db'.  As a lower case ascii string if ascii is 1, an
//   upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T)
//   otherwise.  A '\0' (or 4) is prepended and appended to the string so it has a delimiter
//   for traversals in either direction.  A non-zero value is returned if an error occurred
//   and INTERACTIVE is defined.

int  Load_Read(HITS_DB *db, int i, char *read, int ascii);

// Load into 'read' the subread [beg,end) of the i'th read in 'db' and return a pointer to
//   the start of the subinterval (not necessarily = to read !!! ).  As a lower case ascii
//   string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string
//   over 0(A), 1(C), 2(G), and 3(T) otherwise.  A '\0' (or 4) is prepended and appended to
//   the string holding the substring so it has a delimiter for traversals in either direction.
//   A NULL pointer is returned if an error occurred and INTERACTIVE is defined.

char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii);

// Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur
//   in the database.  If cannot allocate memory then return NULL if INTERACTIVE is defined,
//   or print error to stderr and exit otherwise.
#define DEL_QV  0   //  The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer
#define DEL_TAG 1   //  The deleted characters
#define INS_QV  2   //  The insertion QVs
#define SUB_QV  3   //  The substitution QVs
#define MRG_QV  4   //  The merge QVs

char **New_QV_Buffer(HITS_DB *db);

// Load into 'entry' the 5 QV vectors for i'th read in 'db'.  The deletion tag or characters
//   are converted to a numeric or upper/lower case ascii string as per ascii.  Return with
//   a zero, except when an error occurs and INTERACTIVE is defined in which case return with 1.

int   Load_QVentry(HITS_DB *db, int i, char **entry, int ascii);

// Allocate a block big enough for all the uncompressed sequences, read them into it,
//   reset the 'off' in each read record to be its in-memory offset, and set the
//   bases pointer to point at the block after closing the bases file.  If ascii is
//   1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and
//   otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T).
//   Return with a zero, except when an error occurs and INTERACTIVE is defined in which
//   case return with 1.

int Read_All_Sequences(HITS_DB *db, int ascii);

// For the DB or DAM "path" = "prefix/root[.db|.dam]", find all the files for that DB, i.e. all
//   those of the form "prefix/[.]root.part" and call actor with the complete path to each file
//   pointed at by path, and the suffix of the path by extension.  The . precedes the root
//   name if the defined constant HIDE_FILES is set.  Always the first call is with the
//   path "prefix/root.db" and extension "db".  There will always be calls for
//   "prefix/[.]root.idx" and "prefix/[.]root.bps".  All other calls are for *tracks* and
//   so this routine gives one a way to know all the tracks associated with a given DB.
//   -1 is returned if the path could not be found, and 1 is returned if an error (reported
//   to EPLACE) occurred and INTERACTIVE is defined.  Otherwise a 0 is returned.
int List_DB_Files(char *path, void actor(char *path, char *extension)); #endif // _HITS_DB DEXTRACTOR-1.0/Makefile000066400000000000000000000015271253752467400145150ustar00rootroot00000000000000PATH_HDF5 = /sw/apps/hdf5/current PATH_HDF5 = /usr/local/hdf5 CFLAGS = -O3 -Wall -Wextra -fno-strict-aliasing all: dextract dexta undexta dexqv undexqv dextract: dextract.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -I$(PATH_HDF5)/include -L$(PATH_HDF5)/lib -o dextract dextract.c DB.c QV.c -lhdf5 dexta: dexta.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o dexta dexta.c DB.c QV.c undexta: undexta.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o undexta undexta.c DB.c QV.c dexqv: dexqv.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o dexqv dexqv.c DB.c QV.c undexqv: undexqv.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o undexqv undexqv.c DB.c QV.c clean: rm -f dextract dexta undexta dexqv undexqv dextract.tar.gz rm -fr *.dSYM rm -f dextract.tar.gz install: cp dextract dexta undexta dexqv undexqv ~/bin package: make clean tar -zcf dextract.tar.gz README Makefile *.c *.h DEXTRACTOR-1.0/QV.c000066400000000000000000001132511253752467400135450ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. 
* * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" #undef DEBUG #define MIN_BUFFER 1000 #define HUFF_CUTOFF 16 // This cannot be larger than 16 ! /******************************************************************************************* * * Endian flipping routines * ********************************************************************************************/ static int LittleEndian; // Little-endian machine ? 
// Referred by: Decode & Decode_Run static int Flip; // Flip endian of all coded shorts and ints // Referred by: Decode & Decode_Run & Read_Scheme static void Set_Endian(int flip) { uint32 x = 3; uint8 *b = (uint8 *) (&x); Flip = flip; LittleEndian = (b[0] == 3); } static void Flip_Long(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[3]; v[3] = x; x = v[1]; v[1] = v[2]; v[2] = x; } static void Flip_Short(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[1]; v[1] = x; } /******************************************************************************************* * * Routines for computing a Huffman Encoding Scheme * ********************************************************************************************/ typedef struct { int type; // 0 => normal, 1 => normal but has long codes, 2 => truncated uint32 codebits[256]; // If type = 2, then code 255 is the special code for int codelens[256]; // non-Huffman exceptions int lookup[0x10000]; // Lookup table (just for decoding) } HScheme; typedef struct _HTree { struct _HTree *lft, *rgt; uint64 count; } HTree; // Establish heap property from node s down (1 is root, siblings of n are 2n and 2n+1) // assuming s is the only perturbation in the tree. static void Reheap(int s, HTree **heap, int hsize) { int c, l, r; HTree *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; hr = heap[r]; if (r > hsize || hr->count > hl->count) { if (hs->count > hl->count) { heap[c] = hl; c = l; } else break; } else { if (hs->count > hr->count) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } // Given Huffman tree build a table of codes from it, the low-order codelens[s] bits // of codebits[s] contain the code for symbol s. 
static void Build_Table(HTree *node, int code, int len, uint32 *codebits, int *codelens) { if (node->rgt == NULL) { uint64 symbol = (uint64) (node->lft); codebits[symbol] = code; codelens[symbol] = len; } else { code <<= 1; len += 1; Build_Table(node->lft,code,len,codebits,codelens); Build_Table(node->rgt,code+1,len,codebits,codelens); } } // For the non-zero symbols in hist, compute a huffman tree over them, and then // build a table of the codes. If inscheme is not NULL, then place all symbols // with code 255 or with more than HUFF_CUTOFF bits in the encoding by inscheme // as a single united entity, whose code signals that the value of these symbols // occur explicitly in 8 (values) or 16 (run lengths) bits following the code. // All the symbols in this class will have the same entry in the code table and // 255 is always in this class. static HScheme *Huffman(uint64 *hist, HScheme *inscheme) { HScheme *scheme; HTree *heap[259]; HTree node[512]; int hsize; HTree *lft, *rgt; int value, range; int i; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); hsize = 0; // Load heap value = 0; if (inscheme != NULL) { node[0].count = 0; node[0].lft = (HTree *) (uint64) 255; node[0].rgt = NULL; heap[++hsize] = node+(value++); } for (i = 0; i < 256; i++) if (hist[i] > 0) { if (inscheme != NULL && (inscheme->codelens[i] > HUFF_CUTOFF || i == 255)) node[0].count += hist[i]; else { node[value].count = hist[i]; node[value].lft = (HTree *) (uint64) i; node[value].rgt = NULL; heap[++hsize] = node+(value++); } } for (i = hsize/2; i >= 1; i--) // Establish heap property Reheap(i,heap,hsize); range = value; // Merge pairs with smallest count until have a tree for (i = 1; i < value; i++) { lft = heap[1]; heap[1] = heap[hsize--]; Reheap(1,heap,hsize); rgt = heap[1]; node[range].lft = lft; node[range].rgt = rgt; node[range].count = lft->count + rgt->count; heap[1] = node+(range++); Reheap(1,heap,hsize); } for (i = 0; i < 
256; i++) // Build the code table { scheme->codebits[i] = 0; scheme->codelens[i] = 0; } Build_Table(node+(range-1),0,0,scheme->codebits,scheme->codelens); if (inscheme != NULL) // Set scheme type and if truncated (2), map truncated codes { scheme->type = 2; // to code and length for 255 for (i = 0; i < 255; i++) if (inscheme->codelens[i] > HUFF_CUTOFF || scheme->codelens[i] > HUFF_CUTOFF) { scheme->codelens[i] = scheme->codelens[255]; scheme->codebits[i] = scheme->codebits[255]; } } else { scheme->type = 0; for (i = 0; i < 256; i++) { if (scheme->codelens[i] > HUFF_CUTOFF) scheme->type = 1; } } return (scheme); } #ifdef DEBUG // For debug, show the coding table static void Print_Table(HScheme *scheme, uint64 *hist, int infosize) { uint64 total_bits; uint32 specval, mask, code, *bits; int speclen, clen, *lens; int i, k; total_bits = 0; bits = scheme->codebits; lens = scheme->codelens; if (scheme->type == 2) { specval = bits[255]; speclen = lens[255]; } else specval = speclen = 0x7fffffff; printf("\nCode Table:\n"); for (i = 0; i < 256; i++) if (lens[i] > 0) { clen = lens[i]; mask = (1 << clen); code = bits[i]; printf(" %3d: %2d ",i,clen); for (k = 0; k < clen; k++) { mask >>= 1; if (code & mask) printf("1"); else printf("0"); } if (code == specval && clen == speclen) { printf(" ***"); if (hist != NULL) total_bits += (clen+infosize)*hist[i]; } else if (hist != NULL) total_bits += clen*hist[i]; printf("\n"); } if (hist != NULL) printf("\nTotal Bytes = %lld\n",(total_bits-1)/8+1); } // For debug, show the histogram static void Print_Histogram(uint64 *hist) { int i, low, hgh; uint64 count; for (hgh = 255; hgh >= 0; hgh--) if (hist[hgh] != 0) break; for (low = 0; low < 256; low++) if (hist[low] != 0) break; count = 0; for (i = low; i <= hgh; i++) count += hist[i]; for (i = hgh; i >= low; i--) printf(" %3d: %8llu %5.1f%%\n",i,hist[i],(hist[i]*100.)/count); } #endif /******************************************************************************************* * * Read and 
Write Huffman Schemes * ********************************************************************************************/ // Write the code table to out. static void Write_Scheme(HScheme *scheme, FILE *out) { int i; uint8 x; uint32 *bits; int *lens; lens = scheme->codelens; bits = scheme->codebits; x = (uint8) (scheme->type); fwrite(&x,1,1,out); for (i = 0; i < 256; i++) { x = (uint8) (lens[i]); fwrite(&x,1,1,out); if (x > 0) fwrite(bits+i,sizeof(uint32),1,out); } } // Allocate and read a code table from in, and return a pointer to it. static HScheme *Read_Scheme(FILE *in) { HScheme *scheme; int *look, *lens; uint32 *bits, base; int i, j, powr; uint8 x; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); lens = scheme->codelens; bits = scheme->codebits; look = scheme->lookup; if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n"); free(scheme); return (NULL); } scheme->type = x; for (i = 0; i < 256; i++) { if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i); return (NULL); } lens[i] = x; if (x > 0) { if (fread(bits+i,sizeof(uint32),1,in) != 1) { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i); free(scheme); return (NULL); } } else bits[i] = 0; } if (Flip) { for (i = 0; i < 256; i++) Flip_Long(bits+i); } for (i = 0; i < 256; i++) { if (lens[i] > 0) { base = (bits[i] << (16-lens[i])); powr = (1 << (16-lens[i])); for (j = 0; j < powr; j++) look[base+j] = i; } } return (scheme); } /******************************************************************************************* * * Encoders and Decoders * ********************************************************************************************/ // Encode read[0..rlen-1] according to scheme and write to out static void Encode(HScheme *scheme, FILE *out, uint8 *read, int rlen) { uint32 x, c, ocode; int n, k, olen, llen; int *nlens; uint32 *nbits; 
uint32 nspec; int nslen; nlens = scheme->codelens; nbits = scheme->codebits; if (scheme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; #define OCODE(L,C) \ { int len = olen + (L); \ uint32 code = (C); \ \ llen = olen; \ if (len >= 32) \ { olen = len-32; \ ocode |= (code >> olen); \ fwrite(&ocode,sizeof(uint32),1,out); \ if (olen > 0) \ ocode = (code << (32-olen)); \ else \ ocode = 0; \ } \ else \ { olen = len; \ ocode |= (code << (32-olen));; \ } \ } llen = 0; olen = 0; ocode = 0; for (k = 0; k < rlen; k++) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); } if (olen > 0) // Tricky: must pad so decoder does not read past { fwrite(&ocode,sizeof(uint32),1,out); // last integer int the coded output. if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Encode read[0..rlen-1] according to non-rchar table neme, and run-length table reme for // runs of rchar characters. Write to out. 
static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int rlen, int rchar) { uint32 x, c, ocode; int n, h, k, olen, llen; int *nlens, *rlens; uint32 *nbits, *rbits; uint32 nspec, rspec; int nslen, rslen; nlens = neme->codelens; nbits = neme->codebits; rlens = reme->codelens; rbits = reme->codebits; if (neme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; rspec = rbits[255]; rslen = rlens[255]; llen = 0; olen = 0; ocode = 0; k = 0; while (k < rlen) { h = k; while (k < rlen && read[k] == rchar) k += 1; if (k-h >= 255) x = 255; else x = k-h; n = rlens[x]; c = rbits[x]; OCODE(n,c); if (c == rspec && n == rslen) OCODE(16,k-h); if (k < rlen) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); k += 1; } } if (olen > 0) { fwrite(&ocode,sizeof(uint32),1,out); if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Read and decode from in, the next rlen symbols into read according to scheme static int Decode(HScheme *scheme, FILE *in, char *read, int rlen) { int *look, *lens; int signal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (scheme->type == 2) signal = 255; else signal = 256; lens = scheme->codelens; look = scheme->lookup; #define GET \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ return (1); \ } \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } #define GETFLIP \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more 
bits (Decode)\n"); \ return (1); \ } \ Flip_Long(ipart); \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = look[*xpart]; n = lens[c]; if (c == signal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } else for (j = 0; j < rlen; j++) { GET c = look[*xpart]; n = lens[c]; if (c == signal) { GET c = *cpart; n = 8; } read[j] = (char) c; } return (0); } // Read and decode from in, the next rlen symbols into read according to non-rchar scheme // neme, and the rchar runlength shceme reme static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, int rlen, int rchar) { int *nlook, *nlens; int *rlook, *rlens; int nsignal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c, k; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (neme->type == 2) nsignal = 255; else nsignal = 256; nlens = neme->codelens; nlook = neme->lookup; rlens = reme->codelens; rlook = reme->lookup; n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GETFLIP c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GETFLIP c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } } else for (j = 0; j < rlen; j++) { GET c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GET c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GET c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GET c = *cpart; n = 8; } read[j] = (char) c; } } return (0); } /******************************************************************************************* * * Histogrammers * 
********************************************************************************************/ // Histogram runlengths of symbol runChar in stream[0..rlen-1] into run. static void Histogram_Seqs(uint64 *hist, uint8 *stream, int rlen) { int k; for (k = 0; k < rlen; k++) hist[stream[k]] += 1; } static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar) { int k, h; k = 0; while (k < rlen) { h = k; while (k < rlen && stream[k] == runChar) k += 1; if (k-h >= 256) run[255] += 1; else run[k-h] += 1; if (k < rlen) k += 1; } } /******************************************************************************************* * * Reader * ********************************************************************************************/ static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan, static int Rmax = -1; // Compress_Next_QVentry static int Nline; // Referred by: QVcoding_Scan char *QVentry() { return (Read); } // If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines // for a sequence. Place line j at Read+j*Rmax and the length of every line is returned // unless eof occurs in which case return -1. If any error occurs return -2. 
int Read_Lines(FILE *input, int nlines) { int i, rlen; int tmax; char *tread; char *other; if (Read == NULL) { tmax = MIN_BUFFER; tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; } Nline += 1; if (fgets(Read,Rmax,input) == NULL) return (-1); rlen = strlen(Read); while (Read[rlen-1] != '\n') { tmax = ((int) 1.4*Rmax) + MIN_BUFFER; tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; if (fgets(Read+rlen,Rmax-rlen,input) == NULL) { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline); EXIT(-2); } rlen += strlen(Read+rlen); } other = Read; for (i = 1; i < nlines; i++) { other += Rmax; Nline += 1; if (fgets(other,Rmax,input) == NULL) { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT(-2); } if (rlen != (int) strlen(other)) { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline); EXIT(-2); } } return (rlen-1); } /******************************************************************************************* * * Tag compression and decompression routines * ********************************************************************************************/ // Keep only the symbols in tags[0..rlen-1] for which qvs[k] != rchar and // return the # of symbols kept. static int Pack_Tag(char *tags, char *qvs, int rlen, int rchar) { int j, k; j = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) tags[j++] = tags[k]; tags[j] = '\0'; return (j); } // Count the # of non-rchar symbols in qvs[0..rlen-1] static int Packed_Length(char *qvs, int rlen, int rchar) { int k, clen; clen = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) clen += 1; return (clen); } // Unpack tags by moving its i'th char to position k where qvs[k] is the i'th non-rchar // symbol in qvs. All other chars are set to rchar. 
rlen is the length of qvs and // the unpacked result, clen is the initial length of tags. static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar) { int j, k; j = clen-1; for (k = rlen-1; k >= 0; k--) { if (qvs[k] == rchar) tags[k] = 'n'; else tags[k] = tags[j--]; } } /******************************************************************************************* * * Statistics Scan and Scheme creation and write * ********************************************************************************************/ // Read .quiva file from input, recording stats in the histograms. If zero is set then // start the stats anew with this file. static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256]; static uint64 totChar; static int delChar, subChar; // Referred by: QVcoding_Scan, Create_QVcoding int QVcoding_Scan(FILE *input) { char *slash; int rlen; // Zero histograms bzero(delHist,sizeof(uint64)*256); bzero(mrgHist,sizeof(uint64)*256); bzero(insHist,sizeof(uint64)*256); bzero(subHist,sizeof(uint64)*256); { int i; for (i = 0; i < 256; i++) delRun[i] = subRun[i] = 1; } totChar = 0; delChar = -1; subChar = -1; // Make a sweep through the .quiva entries, histogramming the relevant things // and figuring out the run chars for the deletion and substition streams Nline = 0; while (1) { int well, beg, end, qv; rlen = Read_Lines(input,1); if (rlen == -2) EXIT(1); if (rlen < 0) break; if (rlen == 0 || Read[0] != '@') { EPRINTF(EPLACE,"Line %d: Header in quiv file is missing\n",Nline); EXIT(1); } slash = index(Read+1,'/'); if (slash == NULL) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(1); } if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(1); } rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv 
file\n",Nline); EXIT(1); } Histogram_Seqs(delHist,(uint8 *) (Read),rlen); Histogram_Seqs(insHist,(uint8 *) (Read+2*Rmax),rlen); Histogram_Seqs(mrgHist,(uint8 *) (Read+3*Rmax),rlen); Histogram_Seqs(subHist,(uint8 *) (Read+4*Rmax),rlen); if (delChar < 0) { int k; char *del = Read+Rmax; for (k = 0; k < rlen; k++) if (del[k] == 'n' || del[k] == 'N') { delChar = Read[k]; break; } } if (delChar >= 0) Histogram_Runs( delRun,(uint8 *) (Read),rlen,delChar); totChar += rlen; if (subChar < 0) { if (totChar >= 100000) { int k; subChar = 0; for (k = 1; k < 256; k++) if (subHist[k] > subHist[subChar]) subChar = k; } } if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar); } return (0); } // Using the statistics in the global stat tables, create the Huffman schemes and write // them to output. If lossy is set, then create a lossy table for the insertion and merge // QVs. QVcoding *Create_QVcoding(int lossy) { static QVcoding coding; HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; HScheme *dRunScheme, *sRunScheme; delScheme = NULL; dRunScheme = NULL; insScheme = NULL; mrgScheme = NULL; subScheme = NULL; sRunScheme = NULL; // Check whether using a subtitution run char is a win if (totChar < 200000 || subHist[subChar] < .5*totChar) subChar = -1; // If lossy encryption is enabled then scale insertions and merge QVs. 
if (lossy) { int k; for (k = 0; k < 256; k += 2) { insHist[k] += insHist[k+1]; insHist[k+1] = 0; } for (k = 0; k < 256; k += 4) { mrgHist[k] += mrgHist[k+1]; mrgHist[k] += mrgHist[k+2]; mrgHist[k] += mrgHist[k+3]; mrgHist[k+1] = 0; mrgHist[k+2] = 0; mrgHist[k+3] = 0; } } // Build a Huffman scheme for each stream entity from the histograms #define SCHEME_MACRO(meme,hist,label,bits) \ scheme = Huffman( (hist), NULL); \ if (scheme == NULL) \ goto error; \ if (scheme->type) \ { (meme) = Huffman( (hist), scheme); \ free(scheme); \ } \ else \ (meme) = scheme; #ifdef DEBUG #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) \ printf("\n%s\n", (label) ); \ Print_Histogram( (hist)); \ Print_Table( (meme), (hist), (bits)); #else #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) #endif { HScheme *scheme; if (delChar < 0) { MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs", 8); dRunScheme = NULL; } else { delHist[delChar] = 0; MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs less run char", 8); MAKE_SCHEME(dRunScheme,delRun, "Histogram of Deletion Runs QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",delChar); #endif } #ifdef DEBUG { int k; uint64 count; count = 0; for (k = 0; k < 256; k++) count += delHist[k]; printf("\nDelTag will require %lld bytes\n",count/4); } #endif MAKE_SCHEME(insScheme,insHist, "Hisotgram of Insertion QVs", 8); MAKE_SCHEME(mrgScheme,mrgHist, "Hisotgram of Merge QVs", 8); if (subChar < 0) { MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs", 8); sRunScheme = NULL; } else { subHist[subChar] = 0; MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs less run char", 8); MAKE_SCHEME(sRunScheme,subRun, "Histogram of Substitution Run QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",subChar); #endif } } // Setup endian handling Set_Endian(0); coding.delScheme = delScheme; coding.insScheme = insScheme; coding.mrgScheme = mrgScheme; coding.subScheme = 
subScheme; coding.dRunScheme = dRunScheme; coding.sRunScheme = sRunScheme; coding.delChar = delChar; coding.subChar = subChar; coding.prefix = NULL; coding.flip = 0; return (&coding); error: if (delScheme != NULL) free(delScheme); if (dRunScheme != NULL) free(dRunScheme); if (insScheme != NULL) free(insScheme); if (mrgScheme != NULL) free(mrgScheme); if (subScheme != NULL) free(subScheme); if (sRunScheme != NULL) free(sRunScheme); EXIT(NULL); } // Write the encoding scheme 'coding' to 'output' void Write_QVcoding(FILE *output, QVcoding *coding) { // Write out the endian key, run chars, and prefix (if not NULL) { uint16 half; int len; half = 0x33cc; fwrite(&half,sizeof(uint16),1,output); if (coding->delChar < 0) half = 256; else half = (uint16) (coding->delChar); fwrite(&half,sizeof(uint16),1,output); if (coding->subChar < 0) half = 256; else half = (uint16) (coding->subChar); fwrite(&half,sizeof(uint16),1,output); len = strlen(coding->prefix); fwrite(&len,sizeof(int),1,output); fwrite(coding->prefix,1,len,output); } // Write out the scheme tables Write_Scheme(coding->delScheme,output); if (coding->delChar >= 0) Write_Scheme(coding->dRunScheme,output); Write_Scheme(coding->insScheme,output); Write_Scheme(coding->mrgScheme,output); Write_Scheme(coding->subScheme,output); if (coding->subChar >= 0) Write_Scheme(coding->sRunScheme,output); } // Read the encoding scheme 'coding' to 'output' QVcoding *Read_QVcoding(FILE *input) { static QVcoding coding; // Read endian key, run chars, and short name common to all headers { uint16 half; int len; if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n"); EXIT(NULL); } coding.flip = (half != 0x33cc); if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.delChar = half; if (coding.delChar >= 256) coding.delChar = -1; if (fread(&half,sizeof(uint16),1,input) != 1) 
{ EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.subChar = half; if (coding.subChar >= 256) coding.subChar = -1; // Read the short name common to all headers if (fread(&len,sizeof(int),1,input) != 1) { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Long(&len); coding.prefix = (char *) Malloc(len+1,"Allocating header prefix"); if (coding.prefix == NULL) EXIT(NULL); if (len > 0) { if (fread(coding.prefix,len,1,input) != 1) { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n"); EXIT(NULL); } } coding.prefix[len] = '\0'; } // Setup endian handling Set_Endian(coding.flip); // Read the Huffman schemes used to compress the data coding.delScheme = NULL; coding.dRunScheme = NULL; coding.insScheme = NULL; coding.mrgScheme = NULL; coding.subScheme = NULL; coding.sRunScheme = NULL; coding.delScheme = Read_Scheme(input); if (coding.delScheme == NULL) goto error; if (coding.delChar >= 0) { coding.dRunScheme = Read_Scheme(input); if (coding.dRunScheme == NULL) goto error; } coding.insScheme = Read_Scheme(input); if (coding.insScheme == NULL) goto error; coding.mrgScheme = Read_Scheme(input); if (coding.mrgScheme == NULL) goto error; coding.subScheme = Read_Scheme(input); if (coding.subScheme == NULL) goto error; if (coding.subChar >= 0) { coding.sRunScheme = Read_Scheme(input); if (coding.sRunScheme == NULL) goto error; } return (&coding); error: if (coding.delScheme != NULL) free(coding.delScheme); if (coding.dRunScheme != NULL) free(coding.dRunScheme); if (coding.insScheme != NULL) free(coding.insScheme); if (coding.mrgScheme != NULL) free(coding.mrgScheme); if (coding.subScheme != NULL) free(coding.subScheme); if (coding.sRunScheme != NULL) free(coding.sRunScheme); EXIT(NULL); } // Free all the auxilliary storage associated with the encoding argument void Free_QVcoding(QVcoding *coding) { if (coding->subChar >= 0) 
free(coding->sRunScheme); free(coding->subScheme); free(coding->mrgScheme); free(coding->insScheme); if (coding->delChar >= 0) free(coding->dRunScheme); free(coding->delScheme); free(coding->prefix); } /******************************************************************************************* * * Encode/Decode (w.r.t. coding) next entry from input and write to output * ********************************************************************************************/ int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) { int rlen, clen; // Get all 5 streams, compress each with its scheme, and output rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT (1); } if (coding->delChar < 0) { Encode(coding->delScheme, output, (uint8 *) Read, rlen); clen = rlen; } else { Encode_Run(coding->delScheme, coding->dRunScheme, output, (uint8 *) Read, rlen, coding->delChar); clen = Pack_Tag(Read+Rmax,Read,rlen,coding->delChar); } Number_Read(Read+Rmax); Compress_Read(clen,Read+Rmax); fwrite(Read+Rmax,1,COMPRESSED_LEN(clen),output); if (lossy) { uint8 *insert = (uint8 *) (Read+2*Rmax); uint8 *merge = (uint8 *) (Read+3*Rmax); int k; for (k = 0; k < rlen; k++) { insert[k] = (uint8) ((insert[k] >> 1) << 1); merge[k] = (uint8) (( merge[k] >> 2) << 2); } } Encode(coding->insScheme, output, (uint8 *) (Read+2*Rmax), rlen); Encode(coding->mrgScheme, output, (uint8 *) (Read+3*Rmax), rlen); if (coding->subChar < 0) Encode(coding->subScheme, output, (uint8 *) (Read+4*Rmax), rlen); else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) (Read+4*Rmax), rlen, coding->subChar); return (0); } int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) { int clen, tlen; // Decode each stream and write to output if (coding->delChar < 0) { if (Decode(coding->delScheme, input, entry[0], rlen)) EXIT(1); clen = rlen; tlen = COMPRESSED_LEN(clen); if (tlen > 0) 
{ if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); } else { if (Decode_Run(coding->delScheme, coding->dRunScheme, input, entry[0], rlen, coding->delChar)) EXIT(1); clen = Packed_Length(entry[0],rlen,coding->delChar); tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar); } if (Decode(coding->insScheme, input, entry[2], rlen)) EXIT(1); if (Decode(coding->mrgScheme, input, entry[3], rlen)) EXIT(1); if (coding->subChar < 0) { if (Decode(coding->subScheme, input, entry[4], rlen)) EXIT(1); } else { if (Decode_Run(coding->subScheme, coding->sRunScheme, input, entry[4], rlen, coding->subChar)) EXIT(1); } return (0); } DEXTRACTOR-1.0/QV.h000066400000000000000000000165041253752467400135550ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. 
* * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #ifndef _QV_COMPRESSOR #define _QV_COMPRESSOR // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or // batch version of the routines in this library are compiled. In batch mode, routines // print an error message and exit. 
//   In interactive mode, the routines place the error
//   message in EPLACE (also defined in DB.h) and return an error value, typically NULL
//   if the routine returns a pointer, and an unusual integer value if the routine returns
//   an integer.
// Below when an error return is described, one should understand that this value is returned
//   only if the routine was compiled in INTERACTIVE mode.

//  A PacBio compression scheme.  A run char value of -1 means that the stream has no
//    run-encoded value, in which case the corresponding run length scheme is unused.

typedef struct
  { void    *delScheme;   //  Huffman scheme for deletion QVs
    void    *insScheme;   //  Huffman scheme for insertion QVs
    void    *mrgScheme;   //  Huffman scheme for merge QVs
    void    *subScheme;   //  Huffman scheme for substitution QVs
    void    *dRunScheme;  //  Huffman scheme for deletion run lengths (if delChar >= 0)
    void    *sRunScheme;  //  Huffman scheme for substitution run lengths (if subChar >= 0)
    int      delChar;     //  If >= 0, run-encoded deletion value
    int      subChar;     //  If >= 0, run-encoded substitution value
    int      flip;        //  Need to flip multi-byte integers
    char    *prefix;      //  Header line prefix
  } QVcoding;

//  Read the next nlines of input, and QVentry returns a pointer to the first line if needed.
//    If end-of-input is encountered before any further input, -1 is returned.  If there is
//    an error then -2 is returned.  Otherwise the length of the line(s) read is returned.

int       Read_Lines(FILE *input, int nlines);
char     *QVentry();

//  Read the .quiva file on input and record frequency statistics.  If there is an error
//    then 1 is returned, otherwise 0.

int       QVcoding_Scan(FILE *input);

//  Given QVcoding_Scan has been called at least once, create an encoding scheme based on
//    the accumulated statistics and return a pointer to it.  The returned encoding object
//    is *statically* allocated within the routine.  If lossy is set then use a lossy scaling
//    for the insertion and merge streams.  If there is an error, then NULL is returned.

QVcoding *Create_QVcoding(int lossy);

//  Read/write a coding scheme to input/output.
//    The encoding object returned by the reader
//    is *statically* allocated within the routine.  If an error occurs while reading then
//    NULL is returned.

QVcoding *Read_QVcoding(FILE *input);
void      Write_QVcoding(FILE *output, QVcoding *coding);

//  Free all the auxiliary storage associated with coding (but not the object itself!)

void      Free_QVcoding(QVcoding *coding);

//  Assuming the file pointer is positioned just beyond an entry header line, read the
//    next set of 5 QV lines, compress them according to 'coding', and output.  If lossy
//    is set then the scheme is a lossy one.  A non-zero value is returned only if an
//    error occurred.

int      Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy);

//  Assuming the input is positioned just beyond the compressed encoding of an entry header,
//    read the set of compressed encodings for the ensuing 5 QV vectors, decompress them,
//    and place their decompressed values into entry which is a 5 element array of character
//    pointers.  The parameter rlen, computed from the preceding header line, critically
//    provides the length of each of the 5 vectors.  A non-zero value is returned only if an
//    error occurred.

int      Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen);

#endif // _QV_COMPRESSOR
DEXTRACTOR-1.0/README000066400000000000000000000201621253752467400137310ustar00rootroot00000000000000/************************************************************************************\
*                                                                                    *
* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved.                *
*                                                                                    *
* Redistribution and use in source and binary forms, with or without modification,  *
* are permitted provided that the following conditions are met:                      *
*                                                                                    *
*  · Redistributions of source code must retain the above copyright notice, this     *
*    list of conditions and the following disclaimer.
* * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ The Dextractor and Compression Command Library Authors: Gene Myers, Martin Pippel First: December 21, 2013 Current: February 26, 2014 The Dextractor commands allow one to pull exactly and only the information needed for assembly and reconstruction from the source .bax.h5 HDF5 files produced by the PacBio RS II sequencer. Generally speaking, this information is the sequence of all the reads coded in the .bax.h5 file and a number of quality value (QV) streams needed by Quiver to produce a highly accurate consensus sequence as the last step in the assembly process. 
The Dextractor therefore produces a .fasta file of the sequence of all the reads, and a .quiva file containing the QV stream information in a .fastq readable format. For each of these two file types the library contains commands to compress the given file type, and to decompress it, which is a reversible process delivering the original uncompressed file. In this way, users of a PacBio can keep the data needed for assembly spooled up on disk in 1/14th the space occupied by the .bax.h5 files which can be archived to a cheap backup medium such as tape, should the raw data ever need to be consulted again (we expect never unless the spooled up data is compromised or lost in some way). The compressor/decompressor pairs are endian-aware so moving compressed files between machines is possible. 1. dextract [-vq] [-o[<path>]] [-l<int(500)>] [-s<int(750)>] <input:bax_h5> ... The dextract'or takes the .bax.h5 files produced for a given SMRT cell as input and: (a) if the -o option is set, then the information needed for Quiver is extracted and put in a file named <path>.quiva. If the -q option is not set, then the sequence of each read is placed in a file named <path>.fasta, otherwise a .fastq file of the sequence and the imputed "quality values" for each base in the sequence is placed in a file named <path>.fastq. We personally do not find these values useful and so never set -q but we give you the option in case your downstream processes use such values. If <path> is missing, then the path of the first .bax.h5 file is used for the output file name, less any suffixes which are replaced by .fasta and .quiva. E.g., the call "dextract -o EColi.1.bax.h5 EColi.2.bax.h5 EColi.3.bax.h5" will result in the files EColi.fasta and EColi.quiva. (b) if the -o option is not set, then if the -q option is also not set, then a .fasta file of the sequence of each read is written to the standard output. Otherwise a .fastq file is written to the standard output.
If the -v option is set then the program reports the processing of each .bax.h5 file, otherwise it runs silently. The parameter -l determines the shortest read length to be extracted (default 500) and the -s parameter determines the minimum quality/score of reads to be extracted (default 750 = 75%). 2. dexta [-vk] <path:fasta> ... undexta [-vk] <path:dexta> ... Dexta compresses a set of .fasta files (produced by either PacBio's software or dextract) and replaces them with new files with a .dexta extension. That is, submitting G.fasta will result in a compressed image G.dexta, and G.fasta will no longer exist. With the -k option the .fasta source is *not* removed. If -v is set, then the program reports its progress on each file. Otherwise it runs completely silently (good for batch jobs to an HPC cluster). The compression factor is always slightly better than 4.0. Undexta reverses the compression of dexta, replacing G.dexta with its uncompressed image G.fasta. 3. dexqv [-vkl] <path:quiva> ... undexqv [-vk] <path:dexqv> ... Dexqv compresses a set of .quiva files (produced by dextract) into new files with a .dexqv extension. That is, submitting G.quiva will result in a compressed image G.dexqv, and G.quiva will no longer exist. The -k flag prevents the removal of G.quiva. With -v set progress is reported, otherwise the command runs silently. If slightly more compression is desired at the expense of being a bit "lossy" then set the -l option. This option is experimental in that it remains to be seen if Quiver gives the same results with the scaled values responsible for the loss. Undexqv reverses the compression of dexqv, replacing G.dexqv with its uncompressed image G.quiva. The flags are analogous to the v & k flags for dexqv. The compression factor is typically 3.4 or so (4.0 or so with -l set). To compile the programs you must have the HDF5 library installed on your system and the library and include files for it must be on the appropriate search paths.
The HDF5 library in turn depends on the presence of zlib, so make sure it is also installed on your system. The most recent version of the source for the HDF5 library can be obtained at: http://www.hdfgroup.org/HDF5/release/obtainsrc.html DEXTRACTOR-1.0/dexqv.c000066400000000000000000000157771253752467400143640ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr.
* * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressor for .quiv files, customized Huffman codes for each stream based on the * histogram of values occuring in the given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" static char *Usage = "[-vkl] ..."; int main(int argc, char* argv[]) { int VERBOSE; int KEEP; int LOSSY; { int i, j, k; int flags[128]; ARG_INIT("dexqv") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("vkl") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; KEEP = flags['k']; LOSSY = flags['l']; if (argc == 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // For each .quiva file to be compressed: { int i; for (i = 1; i < argc; i++) { char *pwd, *root; FILE *input, *output; QVcoding *coding; pwd = PathTo(argv[i]); root = Root(argv[i],".quiva"); input = Fopen(Catenate(pwd,"/",root,".quiva"),"r"); if (input == NULL) exit (1); output = Fopen(Catenate(pwd,"/",root,".dexqv"),"w"); if (output == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Processing '%s' ...\n",root); fflush(stderr); } // Scan the file collecting statistics for Huffman schemes QVcoding_Scan(input); // Create and output the encoding schemes coding = Create_QVcoding(LOSSY); { char *slash, *read; // Get header line prefix from first line rewind (input); Read_Lines(input,1); read = QVentry(); slash = index(read+1,'/'); coding->prefix = (char *) malloc((slash-read)+1); if (coding->prefix == NULL) { fprintf(stderr,"%s: Out of 
memory (Allocating header prefix)\n",Prog_Name); exit (1); } *slash = '\0'; strcpy(coding->prefix,read); *slash = '/'; } Write_QVcoding(output,coding); // For each entry do { int lwell; rewind (input); lwell = 0; while (Read_Lines(input,1) > 0) { int well, beg, end, qv; char *slash; uint16 half; uint8 byte; // Interpret the header, encode and write out the fields slash = index(QVentry(),'/'); sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv); while (well - lwell >= 255) { byte = 0xff; fwrite(&byte,1,1,output); lwell += 255; } byte = (uint8) (well-lwell); fwrite(&byte,1,1,output); lwell = well; half = (uint16) beg; fwrite(&half,sizeof(uint16),1,output); half = (uint16) end; fwrite(&half,sizeof(uint16),1,output); half = (uint16) qv; fwrite(&half,sizeof(uint16),1,output); Compress_Next_QVentry(input,output,coding,LOSSY); } } // Clean up for the next file Free_QVcoding(coding); fclose(input); fclose(output); if (!KEEP) unlink(Catenate(pwd,"/",root,".quiva")); free(root); free(pwd); if (VERBOSE) { fprintf(stderr,"Done\n"); fflush(stderr); } } } free(QVentry()); exit (0); } DEXTRACTOR-1.0/dexta.c000066400000000000000000000210741253752467400143250ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. 
* * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compresses a .fasta file into a 2-bit per base .dexta file * * Author: Gene Myers * Date : January 12, 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" static char *Usage = "[-vk] ..."; #define MAX_BUFFER 100000 // Compress read into 2-bits per base (from [0-3] per byte representation int main(int argc, char *argv[]) { int VERBOSE; int KEEP; { int i, j, k; int flags[128]; ARG_INIT("dexta") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("vk") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; KEEP = flags['k']; if (argc == 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); 
} } // For each fasta file do: { char *read; int rmax; int i; rmax = MAX_BUFFER + 30000; read = (char *) Malloc(rmax+1,"Allocating read buffer"); if (read == NULL) exit (1); for (i = 1; i < argc; i++) { char *pwd, *root; FILE *input, *output; int eof; // Open fasta file pwd = PathTo(argv[i]); root = Root(argv[i],".fasta"); input = Fopen(Catenate(pwd,"/",root,".fasta"),"r"); if (input == NULL) exit (1); output = Fopen(Catenate(pwd,"/",root,".dexta"),"w"); if (output == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Processing '%s' ...\n",root); fflush(stderr); } // Read the first header and output the endian key and short name { char *slash; uint16 half; int x; eof = (fgets(read,MAX_BUFFER,input) == NULL); if (read[strlen(read)-1] != '\n') { fprintf(stderr,"Line 1: Fasta line is too long (> %d chars)\n",MAX_BUFFER-2); exit (1); } if (!eof && read[0] != '>') { fprintf(stderr,"Line 1: First header in fasta file is missing\n"); exit (1); } slash = index(read,'/'); if (slash == NULL) { fprintf(stderr,"%s: Header line incorrectly formatted ?\n",Prog_Name); exit (1); } half = 0x33cc; fwrite(&half,sizeof(uint16),1,output); x = slash-read; fwrite(&x,sizeof(int),1,output); fwrite(read,1,slash-read,output); } // For each read do { int nline, rlen, lwell; nline = 1; rlen = 0; lwell = 0; while (!eof) { int well, beg, end, qv; char *slash; uint16 half; uint8 byte; // Next header is always at read+(rlen+1). 
Interpret its fields slash = index(read+(rlen+1),'/'); if (slash == NULL) { fprintf(stderr,"%s: Header line incorrectly formatted ?\n",Prog_Name); exit (1); } if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) { fprintf(stderr,"%s: Header line incorrectly formatted ?\n",Prog_Name); exit (1); } // Read fasta sequence (@read) and stop at eof or after having read next header rlen = 0; while (1) { int x; eof = (fgets(read+rlen,MAX_BUFFER,input) == NULL); nline += 1; x = strlen(read+rlen)-1; if (read[rlen+x] != '\n') { fprintf(stderr,"Line %d: Fasta line is too long (> %d chars)\n", nline,MAX_BUFFER-2); exit (1); } if (eof || read[rlen] == '>') break; rlen += x; if (rlen + MAX_BUFFER > rmax) { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_BUFFER; read = (char *) Realloc(read,rmax+1,"Reallocaing read buffer"); if (read == NULL) exit (1); } } read[rlen] = '\0'; // Compress the header fields and output (except for short name, only output once) while (well - lwell >= 255) { byte = 0xff; fwrite(&byte,1,1,output); lwell += 255; } byte = (uint8) (well-lwell); fwrite(&byte,1,1,output); lwell = well; half = (uint16) beg; fwrite(&half,sizeof(uint16),1,output); half = (uint16) end; fwrite(&half,sizeof(uint16),1,output); half = (uint16) qv; fwrite(&half,sizeof(uint16),1,output); // Compress read and output Number_Read(read); Compress_Read(rlen,read); fwrite(read,1,COMPRESSED_LEN(rlen),output); } } if (!KEEP) unlink(Catenate(pwd,"/",root,".fasta")); free(root); free(pwd); if (VERBOSE) { fprintf(stderr,"Done\n"); fflush(stderr); } } free(read); } exit (0); } DEXTRACTOR-1.0/dextract.c000066400000000000000000000466521253752467400150470ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. 
* * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 
122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Dextractor: pullls requested info out of .bax.h5 files produced by Pacbio * * * Author: Martin Pippel * Date : Dec 12, 2013 * * Author: Gene Myers * Date: Jan 8, 2014, redesign of the modes of operation and flags, and also the * logic for extraction in writeBaxReads * ********************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #define LOWER_OFFSET 32 #define PHRED_OFFSET 33 static char *Usage = "[-vq] [-o[]] [-l] [-s] ..."; #define DEXTRACT // Exception codes #define CANNOT_OPEN_BAX_FILE 1 #define BAX_BASECALL_ERR 2 #define BAX_DEL_ERR 3 #define BAX_TAG_ERR 4 #define BAX_INS_ERR 5 #define BAX_MRG_ERR 6 #define BAX_SUB_ERR 7 #define BAX_QV_ERR 8 #define BAX_NR_EVENTS_ERR 9 #define BAX_REGION_ERR 10 #define BAX_HOLESTATUS_ERR 11 typedef struct { char *fullName; // full file path char *shortName; // without path and file extension (used in header line) int fastq; // if non-zero produce a fastq file instead of a fasta file int quivqv; // if non-zero produce a quiv file hsize_t numBP; // sum of all raw read lengths char *baseCall; // 7 streams that may be extracted dependent on flag settings char *delQV; char *delTag; char *insQV; char *mergeQV; char *subQV; char *fastQV; hsize_t numZMW; // number of wells/holes int *readLen; // length of each read in events char *holeType; // Hole type, only SEQUENCING holes are extracted hsize_t numHQR; // number of regions int *regions; // region information (5 ints per entry) int delLimit; // The Del QV associated with N's in the Del Tag } BaxData; // Initialize *the* BaxData structure static void initBaxData(BaxData *b, int fastq, int quivqv) { b->fullName = 
NULL; b->shortName = NULL; b->fastq = fastq; b->quivqv = quivqv; b->baseCall = NULL; b->delQV = NULL; b->delTag = NULL; b->insQV = NULL; b->mergeQV = NULL; b->subQV = NULL; b->fastQV = NULL; b->readLen = NULL; b->holeType = NULL; b->regions = NULL; } // Record the names of the next bax file and reset the memory buffer high-water mark static void initBaxNames(BaxData *b, char *fname, char *hname) { b->fullName = fname; b->shortName = hname; b->numBP = 0; b->numZMW = 0; b->numHQR = 0; } // Check if memory needed is above highwater mark, and if so allocate static void ensureBases(BaxData *b, hsize_t len) { static hsize_t smax = 0; b->numBP = len; if (smax < len) { smax = 1.2*len + 10000; b->baseCall = (char *) Realloc(b->baseCall, smax, "Allocating basecall vector"); if (b->fastq) b->fastQV = (char *) Realloc(b->fastQV, smax, "Allocating fastq vector"); if (b->quivqv) { b->delQV = (char *) Realloc(b->delQV, 5ll*smax, "Allocating 5 QV vectors"); b->delTag = b->delQV + smax; b->insQV = b->delTag + smax; b->mergeQV = b->insQV + smax; b->subQV = b->mergeQV + smax; } } } static void ensureZMW(BaxData *b, hsize_t len) { static hsize_t smax = 0; b->numZMW = len; if (smax < len) { smax = 1.2*len + 10000; b->holeType = (char *) Realloc(b->holeType, smax, "Allocating hole vector"); b->readLen = (int *) Realloc(b->readLen , smax * sizeof(int), "Allocating event vector"); } } static void ensureHQR(BaxData *b, hsize_t len) { static hsize_t smax = 0; b->numHQR = len; if (smax < len) { smax = 1.2*len + 10000; b->regions = (int *) Realloc(b->regions, (5ll*smax+1)*sizeof(int), "Allocating region vector"); } } // Fetch the relevant contents of the current bax.h5 file and return the H5 file id. 
static int getBaxData(BaxData *b) { hid_t field_space; hid_t field_set; hsize_t field_len[2]; hid_t file_id; herr_t stat; int ecode; H5Eset_auto(H5E_DEFAULT,0,0); // silence hdf5 error stack file_id = H5Fopen(b->fullName, H5F_ACC_RDONLY, H5P_DEFAULT); if (file_id < 0) return (CANNOT_OPEN_BAX_FILE); #ifdef DEBUG printf("PROCESSING %s, file_id: %d\n", baxFileName, file_id); #endif #define GET_SIZE(path,error) \ { ecode = error; \ if ((field_set = H5Dopen2(file_id, path, H5P_DEFAULT)) < 0) goto exit0; \ if ((field_space = H5Dget_space(field_set)) < 0) goto exit1; \ H5Sget_simple_extent_dims(field_space, field_len, NULL); \ } #define FETCH(field,type) \ { stat = H5Dread(field_set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, b->field); \ H5Sclose(field_space); \ H5Dclose(field_set); \ if (stat < 0) goto exit0; \ } #define CHECK_FETCH(path,error,field,type,cntr) \ { GET_SIZE(path,error) \ if (b->cntr != field_len[0]) goto exit2; \ FETCH(field,type) \ } GET_SIZE("/PulseData/BaseCalls/Basecall",BAX_BASECALL_ERR) ensureBases(b,field_len[0]); FETCH(baseCall,H5T_NATIVE_UCHAR) if (b->fastq) CHECK_FETCH("/PulseData/BaseCalls/QualityValue",BAX_QV_ERR,fastQV,H5T_NATIVE_UCHAR,numBP) if (b->quivqv) { CHECK_FETCH("/PulseData/BaseCalls/DeletionQV", BAX_DEL_ERR,delQV, H5T_NATIVE_UCHAR,numBP) CHECK_FETCH("/PulseData/BaseCalls/DeletionTag", BAX_TAG_ERR,delTag, H5T_NATIVE_UCHAR,numBP) CHECK_FETCH("/PulseData/BaseCalls/InsertionQV", BAX_INS_ERR,insQV, H5T_NATIVE_UCHAR,numBP) CHECK_FETCH("/PulseData/BaseCalls/MergeQV", BAX_MRG_ERR,mergeQV,H5T_NATIVE_UCHAR,numBP) CHECK_FETCH("/PulseData/BaseCalls/SubstitutionQV",BAX_SUB_ERR,subQV, H5T_NATIVE_UCHAR,numBP) } GET_SIZE("/PulseData/BaseCalls/ZMW/HoleStatus",BAX_HOLESTATUS_ERR) ensureZMW(b,field_len[0]); FETCH(holeType,H5T_NATIVE_UCHAR) CHECK_FETCH("/PulseData/BaseCalls/ZMW/NumEvent",BAX_NR_EVENTS_ERR,readLen,H5T_NATIVE_INT,numZMW) GET_SIZE("/PulseData/Regions",BAX_REGION_ERR) ensureHQR(b,field_len[0]); FETCH(regions,H5T_NATIVE_INT) // Find the Del QV 
associated with N's in the Del Tag if (b->quivqv) { hsize_t i; for (i = 0; i < b->numBP; i++) if (b->delTag[i] == 'N') { b->delLimit = b->delQV[i]; break; } } return (0); exit2: H5Sclose(field_space); exit1: H5Dclose(field_set); exit0: H5Fclose(file_id); return (ecode); } // Find the good read invervals of the baxfile b(FileID), output the reads of length >= minLen and // score >= minScore to output (for the fasta or fastq part) and qvquiv (if b->quivqv is set) static char *fasta_header = ">%s/%d/%d_%d RQ=0.%d\n"; static char *fastq_header = "@%s/%d/%d_%d RQ=0.%d\n"; static void writeBaxReads(BaxData *b, int minLen, int minScore, FILE *output, FILE* qvquiv) { int nreads, *rlen; int roff, *hlen, *cur, h, w; int tolower; char *header; char *baseCall; char *delQV; char *delTag; char *insQV; char *mergeQV; char *subQV; char *fastQV; baseCall = b->baseCall; delQV = b->delQV; delTag = b->delTag; insQV = b->insQV; mergeQV = b->mergeQV; subQV = b->subQV; fastQV = b->fastQV; #ifdef DEBUG printf("printSubreadFields\n"); #endif #define HOLE 0 #define TYPE 1 #define ADAPTER_REGION 0 #define INSERT_REGION 1 #define HQV_REGION 2 #define START 2 #define FINISH 3 #define SCORE 4 // Find the HQV regions and output as reads according to the various output options tolower = isupper(b->baseCall[0]); if (b->fastq) header = fastq_header; else header = fasta_header; rlen = b->readLen; roff = 0; cur = b->regions; nreads = b->numZMW + cur[HOLE]; hlen = rlen - cur[HOLE]; cur[5*b->numHQR] = nreads; for (h = cur[HOLE], w = 0; h < nreads; h++, w++) { int *bot, *top, *hqv, *r; int hbeg, hend, qv; int ibeg, iend; if (hlen[h] >= minLen) { while (cur[HOLE] < h) cur += 5; bot = hqv = cur; while (cur[HOLE] <= h) { if (cur[TYPE] == HQV_REGION) hqv = cur; cur += 5; } top = cur-5; qv = hqv[SCORE]; if (qv >= minScore) { hbeg = hqv[START]; hend = hqv[FINISH]; for (r = bot; r <= top; r += 5) { if (r[TYPE] != INSERT_REGION) continue; ibeg = r[START]; iend = r[FINISH]; if (ibeg < hbeg) ibeg = hbeg; if (iend 
> hend) iend = hend; if (iend - ibeg < minLen || b->holeType[w] > 0) continue; fprintf(output,header,b->shortName,h,ibeg,iend,qv); ibeg += roff; iend += roff; if (tolower) { int a; for (a = ibeg; a < iend; a++) baseCall[a] += LOWER_OFFSET; if (b->quivqv) for (a = ibeg; a < iend; a++) delTag[a] += LOWER_OFFSET; } if (b->fastq) { int a; fprintf(output,"%.*s\n", iend-ibeg, baseCall + ibeg); fprintf(output,"+\n"); for (a = ibeg; a < iend; a++) fputc(fastQV[a]+PHRED_OFFSET,output); fputc('\n',output); } else { int a; for (a = ibeg; a < iend; a += 80) if (a+80 > iend) fprintf(output,"%.*s\n", iend-a, baseCall + a); else fprintf(output,"%.80s\n", baseCall + a); } if (b->quivqv) { int a, d; fprintf(qvquiv,"@%s/%d/%d_%d RQ=0.%d\n", b->shortName,h,ibeg-roff,iend-roff,qv); d = b->delLimit; for (a = ibeg; a < iend; a++) { if (delQV[a] == d) delTag[a] = 'n'; delQV[a] += PHRED_OFFSET; insQV[a] += PHRED_OFFSET; mergeQV[a] += PHRED_OFFSET; subQV[a] += PHRED_OFFSET; } iend -= ibeg; fprintf (qvquiv, "%.*s\n", iend, delQV + ibeg); fprintf (qvquiv, "%.*s\n", iend, delTag + ibeg); fprintf (qvquiv, "%.*s\n", iend, insQV + ibeg); fprintf (qvquiv, "%.*s\n", iend, mergeQV + ibeg); fprintf (qvquiv, "%.*s\n", iend, subQV + ibeg); } } } } roff += hlen[h]; } } // Print an error message static void printBaxError(int ecode) { fprintf(stderr," *** Warning ***: "); switch (ecode) { case CANNOT_OPEN_BAX_FILE: fprintf(stderr,"Cannot open bax file:\n"); break; case BAX_BASECALL_ERR: fprintf(stderr,"Cannot parse /PulseData/BaseCalls/Basecall from file:\n"); break; case BAX_DEL_ERR: fprintf(stderr,"Cannot parse /PulseData/BaseCalls/DeletionQV from file:\n"); break; case BAX_TAG_ERR: fprintf(stderr,"Cannot parse /PulseData/BaseCalls/DeletionTag from file:\n"); break; case BAX_INS_ERR: fprintf(stderr,"Cannot parse /PulseData/BaseCalls/InsertionQV from file:\n"); break; case BAX_MRG_ERR: fprintf(stderr,"Cannot parse /PulseData/BaseCalls/MergeQV from file:\n"); break; case BAX_SUB_ERR: 
fprintf(stderr,"Cannot parse /PulseData/BaseCalls/SubstitutionQV from file:\n"); break; case BAX_QV_ERR: fprintf(stderr,"Cannot parse /PulseData/BaseCalls/QualityValue from file:\n"); break; case BAX_NR_EVENTS_ERR: fprintf(stderr,"Cannot parse /PulseData/BaseCalls/ZMW/NumEvent from file:\n"); break; case BAX_REGION_ERR: fprintf(stderr,"Cannot parse /PulseData/Regions from file:\n"); break; case BAX_HOLESTATUS_ERR: fprintf(stderr,"Cannot parse /PulseData/BaseCalls/ZMW/HoleStatus from file:\n"); break; default: fprintf(stderr,"Cannot parse bax file:\n"); break; } fflush(stderr); } // Free *the* bax data structure static void freeBaxData(BaxData *b) { free(b->baseCall); free(b->delQV); free(b->fastQV); free(b->holeType); free(b->readLen); free(b->regions); } int main(int argc, char* argv[]) { char *output; FILE *fileOut; FILE *fileQuiv; int FASTQ; int QUIVQV; int MIN_LEN; int MIN_SCORE; int VERBOSE; BaxData b; // Check that zlib library is present if ( ! H5Zfilter_avail(H5Z_FILTER_DEFLATE)) { fprintf(stderr,"%s: zlib library is not present, check build/installation\n",Prog_Name); exit (1); } { int i, j, k; int flags[128]; char *eptr; ARG_INIT("dextract") MIN_LEN = 500; MIN_SCORE = 750; QUIVQV = 0; output = NULL; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("qv") break; case 's': ARG_NON_NEGATIVE(MIN_SCORE,"Subread score threshold") break; case 'l': ARG_NON_NEGATIVE(MIN_LEN,"Minimum length threshold") break; case 'o': QUIVQV = 1; output = argv[i]+2; break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; FASTQ = flags['q']; if (argc == 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } fileQuiv = NULL; if (QUIVQV) { int explicit; explicit = (*output != '\0'); if ( ! 
explicit) output = Root(argv[1],NULL); if (FASTQ) fileOut = Fopen(Catenate("","",output,".fastq"), "w"); else fileOut = Fopen(Catenate("","",output,".fasta"), "w"); fileQuiv = Fopen(Catenate("","",output,".quiva"), "w"); if (fileOut == NULL || fileQuiv == NULL) exit (1); if (explicit) output = Root(output,NULL); } else fileOut = stdout; if (VERBOSE) { fprintf(stderr, "Minimum length: %d\n", MIN_LEN); fprintf(stderr, "Minimum score : %d\n", MIN_SCORE); } initBaxData(&b,FASTQ,QUIVQV); { int i; for (i = 1; i < argc; i++) { char *root, *full, *input; int ecode; { char *pwd; FILE *in; pwd = PathTo(argv[i]); root = Root(argv[i],".bax.h5"); full = Strdup(Catenate(pwd,"/",root,".bax.h5"),"Allocating full name"); input = Root(argv[i],NULL); free(pwd); if ((in = fopen(full,"r")) == NULL) { fprintf(stderr,"%s: Cannot find %s !\n",Prog_Name,input); exit (1); } else fclose(in); } if (QUIVQV) initBaxNames(&b,full,output); else initBaxNames(&b,full,input); if (VERBOSE) { fprintf(stderr, "Fetching file : %s ...\n", root); fflush(stderr); } if ((ecode = getBaxData(&b)) == 0) { if (VERBOSE) { fprintf(stderr, "Extracting subreads ...\n"); fflush(stderr); } writeBaxReads(&b, MIN_LEN, MIN_SCORE, fileOut, fileQuiv); if (VERBOSE) { fprintf(stderr, "Done\n"); fflush(stdout); } } else { if (VERBOSE) fprintf(stderr, "Skipping due to failure\n"); else fprintf(stderr, "Skipping %s due to failure\n",root); printBaxError(ecode); } free(root); free(full); free(input); } } freeBaxData(&b); if (fileOut != stdout) fclose(fileOut); if (fileQuiv != NULL) fclose(fileQuiv); if (QUIVQV) free(output); exit (0); } DEXTRACTOR-1.0/undexqv.c000066400000000000000000000165541253752467400147210ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. 
* * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 
122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Uncompressor for .dexqv files * * Author: Gene Myers * Date: Jan 18, 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" static char *Usage = "[-vk] ..."; static void flip_short(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[1]; v[1] = x; } int main(int argc, char* argv[]) { int VERBOSE; int KEEP; { int i, j, k; int flags[128]; ARG_INIT("undexqv") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("vk") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; KEEP = flags['k']; if (argc == 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // For each .dexqv file to be decompressed { int i; char *entry[5] = { NULL, NULL, NULL, NULL, NULL }; int emax = -1; for (i = 1; i < argc; i++) { char *pwd, *root; FILE *input, *output; QVcoding *coding; // Open it and the appropriately named .quiva file pwd = PathTo(argv[i]); root = Root(argv[i],".dexqv"); input = Fopen(Catenate(pwd,"/",root,".dexqv"),"r"); if (input == NULL) exit (1); output = Fopen(Catenate(pwd,"/",root,".quiva"),"w"); if (output == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Processing '%s' ...\n",root); fflush(stderr); } // Read in compression scheme coding = Read_QVcoding(input); // For each compressed entry do { int well; well = 0; while (1) { int beg, end, qv, rlen; uint16 half; uint8 byte; int e; // Decode the compressed header and write it out if (fread(&byte,1,1,input) < 1) break; while (byte == 255) { well += 255; if (fread(&byte,1,1,input) != 1) SYSTEM_ERROR } well += byte; if (coding->flip) { if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR flip_short(&half); beg = half; 
if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR flip_short(&half); end = half; if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR flip_short(&half); qv = half; } else { if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR beg = half; if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR end = half; if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR qv = half; } fprintf(output,"%s/%d/%d_%d RQ=0.%d\n",coding->prefix,well,beg,end,qv); // Decode the QV entry and write it out rlen = end-beg; if (rlen > emax) { emax = ((int) (1.2*rlen)) + 1000; entry[0] = (char *) Realloc(entry[0],5*emax,"Reallocating QV entry buffer"); if (entry[0] == NULL) exit (1); for (e = 1; e < 5; e++) entry[e] = entry[e-1] + emax; } Uncompress_Next_QVentry(input,entry,coding,rlen); for (e = 0; e < 5; e++) fprintf(output,"%.*s\n",rlen,entry[e]); } } // Clean up for the next file Free_QVcoding(coding); fclose(input); fclose(output); if (!KEEP) unlink(Catenate(pwd,"/",root,".dexqv")); free(root); free(pwd); if (VERBOSE) { fprintf(stderr,"Done\n"); fflush(stderr); } } } free(QVentry()); exit (0); } DEXTRACTOR-1.0/undexta.c000066400000000000000000000206051253752467400146670ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. 
* * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Uncompresses a .dexta file (2-bit per base compression) back to a .fasta file * * Author: Gene Myers * Date : January 12, 2014 ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" static char *Usage = "[-vk] ..."; #define MAX_BUFFER 100000 // Uncompress read from 2-bits per base into [0-3] per byte representation static void flip_long(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[3]; v[3] = x; x = v[1]; v[1] = v[2]; v[2] = x; } static void flip_short(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[1]; v[1] = x; } int main(int argc, char *argv[]) { int VERBOSE; int KEEP; { int i, j, k; int 
flags[128]; ARG_INIT("undexta") VERBOSE = 1; KEEP = 0; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("vk") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; KEEP = flags['k']; if (argc == 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // For each .dexta file do { char *read; int rmax; int i; rmax = MAX_BUFFER + 30000; read = (char *) Malloc(rmax+1,"Allocating read buffer"); for (i = 1; i < argc; i++) { char *pwd, *root; FILE *input, *output; // Open dexta file pwd = PathTo(argv[i]); root = Root(argv[i],".dexta"); input = Fopen(Catenate(pwd,"/",root,".dexta"),"r"); if (input == NULL) exit (1); output = Fopen(Catenate(pwd,"/",root,".fasta"),"w"); if (output == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Processing '%s' ...\n",root); fflush(stderr); } { char *name; int well, flip; // Read endian key and short name common to all headers { uint16 half; if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR flip = (half != 0x33cc); if (fread(&well,sizeof(int),1,input) != 1) SYSTEM_ERROR if (flip) flip_long(&well); name = (char *) Malloc(well+1,"Allocating header prefix"); if (well > 0) { if (fread(name,well,1,input) != 1) SYSTEM_ERROR } name[well] = '\0'; } // For each encoded entry do well = 0; while (1) { int rlen, beg, end, qv; int clen; uint16 half; uint8 byte; // Read and decompress header and output if (fread(&byte,1,1,input) < 1) break; while (byte == 255) { well += 255; if (fread(&byte,1,1,input) != 1) SYSTEM_ERROR } well += byte; if (flip) { if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR flip_short(&half); beg = half; if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR flip_short(&half); end = half; if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR flip_short(&half); qv = half; } else { if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR beg = half; if (fread(&half,sizeof(uint16),1,input) != 1) SYSTEM_ERROR end = half; if (fread(&half,sizeof(uint16),1,input) != 1) 
SYSTEM_ERROR qv = half; } fprintf(output,"%s/%d/%d_%d RQ=0.%d\n",name,well,beg,end,qv); // Read compressed sequence (into buffer big enough for uncompressed sequence) // Uncompress and output 80 symbols to a line rlen = end-beg; if (rlen > rmax) { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_BUFFER; read = (char *) Realloc(read,rmax+1,"Allocating read buffer"); } clen = COMPRESSED_LEN(rlen); if (clen > 0) { if (fread(read,clen,1,input) != 1) SYSTEM_ERROR } Uncompress_Read(rlen,read); Lower_Read(read); { int j; for (j = 0; j < rlen; j += 80) if (j+80 > rlen) fprintf(output,"%.*s\n", rlen-j, read+j); else fprintf(output,"%.80s\n", read+j); } } free(name); } if (!KEEP) unlink(Catenate(pwd,"/",root,".dexta")); free(root); free(pwd); if (VERBOSE) { fprintf(stderr,"Done\n"); fflush(stderr); } } free(read); } exit (0); }