pax_global_header00006660000000000000000000000064132757420640014524gustar00rootroot0000000000000052 comment=d3c3d3f4a32d82dfc61f139b66332ffc776b6559 DASCRUBBER-1.1/000077500000000000000000000000001327574206400130015ustar00rootroot00000000000000DASCRUBBER-1.1/DAScover.c000066400000000000000000000414541327574206400146230ustar00rootroot00000000000000/******************************************************************************************* * * Using overlap pile for each read compute estimated coverage of the underlying genome * generating a .covr track containing a histogram of the coverage of every unmasked * trace-point tile. * * Author: Gene Myers * Date : January 2018 * *******************************************************************************************/ #include #include #include #include #include "DB.h" #include "align.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif #undef COVER_DEBUG static char *Usage = "[-v] [-H] [-m]+ ..."; #define MAX_COVER 1000 static int VERBOSE; static int HGAP_MIN; // Under this length do not process for HGAP typedef struct { char *name; int64 *idx; int *data; } Mask; static int NUM_MASK; static Mask *MASKS; static int64 Cov_Hist[MAX_COVER+1]; // [0..MAX_COVER] counts of trace-interval coverages static char *Cov_Name = "Coverage Histogram"; static char *Hgap_Name = "Hgap threshold"; static int TRACE_SPACING; // Trace spacing (from .las file) static int TBYTES; // Bytes per trace segment (from .las file) static DAZZ_DB _DB, *DB = &_DB; // Data base static int DB_FIRST; // First read of DB to process static int DB_LAST; // Last read of DB to process (+1) static int DB_PART; // 0 if all, otherwise block # static DAZZ_READ *Reads; // Data base reads array static FILE *CV_AFILE; // .covr.anno static char *CV_ANAME; // ".covr.anno" // Statistics static int64 nreads, totlen; // For each pile, calculate QV scores of the aread at tick spacing TRACE_SPACING static void HISTOGRAM_COVER(int aread, Overlap *ovls, 
int novl) { static int nmax = 0; static int *local; static int *cover; static int *maskd; int alen, atick; int bread, cssr; int t2, t1; int i, j, a, e; alen = DB->reads[aread].rlen; atick = (alen + (TRACE_SPACING-1))/TRACE_SPACING; if (alen < HGAP_MIN) return; #if defined(COVER_DEBUG) printf("AREAD %d",aread); if (novl == 0) printf(" EMPTY"); printf("\n"); #endif // COVERAGE // Allocate or expand data structures for cover calculation as needed if (nmax == 0) { nmax = (DB->maxlen + (TRACE_SPACING-1))/TRACE_SPACING; local = (int *) Malloc(nmax*sizeof(int),"Allocating bread cover"); cover = (int *) Malloc(nmax*sizeof(int),"Allocating aread cover"); maskd = (int *) Malloc(nmax*sizeof(int),"Allocating aread cover"); if (local == NULL || cover == NULL || maskd == NULL) exit (1); for (i = nmax-1; i >= 0; i--) local[i] = cover[i] = maskd[i] = 0; } // For every segment, fill histogram of match diffs for every one of the // atick intervals, building separate histograms, hist & cist, for forward // and reverse B-hits t2 = TRACE_SPACING/2; t1 = t2-1; for (i = 0; i < novl; i = j) { bread = ovls[i].bread; for (j = bread+1; j < DB_LAST; j++) if ((Reads[j].flags & DB_CSS) == 0) break; cssr = j; for (j = i; j < novl; j++) if (ovls[j].bread < cssr) { e = (ovls[j].path.aepos + t2) / TRACE_SPACING; for (a = (ovls[j].path.abpos + t1) / TRACE_SPACING; a < e; a++) local[a] = 1; } else break; for (j = i; j < novl; j++) if (ovls[j].bread < cssr) { e = (ovls[j].path.aepos + t2) / TRACE_SPACING; for (a = (ovls[j].path.abpos + t1) / TRACE_SPACING; a < e; a++) if (local[a] > 0) { local[a] = 0; cover[a] += 1; } } else break; } for (i = 0; i < NUM_MASK; i++) { Mask *m = (Mask *) (MASKS+i); int k, u, e; #ifdef COVER_DEBUG printf(" %s:",m->name); #endif for (k = m->idx[aread]; k < m->idx[aread+1]; k += 2) { e = m->data[k+1]; #ifdef COVER_DEBUG printf(" [%d..%d]",m->data[k],e); #endif e = (e + (TRACE_SPACING-1))/TRACE_SPACING; for (u = m->data[k]/TRACE_SPACING; u < e; u++) maskd[u] = 1; } #ifdef 
COVER_DEBUG printf("\n"); #endif } #ifdef COVER_DEBUG printf("Mask: "); for (a = 0; a < atick; a++) printf("%d",maskd[a]); printf("\n"); #endif for (a = 0; a < atick; a++) { if (maskd[a] == 0) { e = cover[a]; if (e >= MAX_COVER) Cov_Hist[MAX_COVER] += 1; else Cov_Hist[e] += 1; } cover[a] = 0; } for (i = 0; i < NUM_MASK; i++) { Mask *m = (Mask *) (MASKS+i); int k, u, e; for (k = m->idx[aread]; k < m->idx[aread+1]; k += 2) { e = m->data[k+1]; e = (e + (TRACE_SPACING-1))/TRACE_SPACING; for (u = m->data[k]/TRACE_SPACING; u < e; u++) maskd[u] = 0; } } if (VERBOSE) { nreads += 1; totlen += alen; } } // Read in each successive pile and call ACTION on it. Read in the traces only if // "trace" is nonzero static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int trace) { static Overlap *ovls = NULL; static int omax = 500; static uint16 *paths = NULL; static int pmax = 100000; int64 i, j, novl; int n, a; int pcur; int max; if (ovls == NULL) { ovls = (Overlap *) Malloc(sizeof(Overlap)*omax,"Allocating overlap buffer"); if (ovls == NULL) exit (1); } if (trace && paths == NULL) { paths = (uint16 *) Malloc(sizeof(uint16)*pmax,"Allocating path buffer"); if (paths == NULL) exit (1); } rewind(input); fread(&novl,sizeof(int64),1,input); fread(&TRACE_SPACING,sizeof(int),1,input); if (TRACE_SPACING <= TRACE_XOVR) TBYTES = sizeof(uint8); else TBYTES = sizeof(uint16); if (Read_Overlap(input,ovls) != 0) ovls[0].aread = INT32_MAX; else if (trace) { if (ovls[0].path.tlen > pmax) { pmax = 1.2*(ovls[0].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); if (paths == NULL) exit (1); } fread(paths,TBYTES,ovls[0].path.tlen,input); if (TBYTES == 1) { ovls[0].path.trace = paths; Decompress_TraceTo16(ovls); } } else fseek(input,TBYTES*ovls[0].path.tlen,SEEK_CUR); if (ovls[0].aread < DB_FIRST) { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n", Prog_Name,DB_PART); exit (1); } pcur = 0; n = max = 0; 
for (j = DB_FIRST; j < DB_LAST; j++) { ovls[0] = ovls[n]; a = ovls[0].aread; if (a != j) n = 0; else { if (trace) memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen); n = 1; pcur = ovls[0].path.tlen; while (1) { if (Read_Overlap(input,ovls+n) != 0) { ovls[n].aread = INT32_MAX; break; } if (trace) { if (pcur + ovls[n].path.tlen > pmax) { pmax = 1.2*(pcur+ovls[n].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); if (paths == NULL) exit (1); } fread(paths+pcur,TBYTES,ovls[n].path.tlen,input); if (TBYTES == 1) { ovls[n].path.trace = paths+pcur; Decompress_TraceTo16(ovls+n); } } else fseek(input,TBYTES*ovls[n].path.tlen,SEEK_CUR); if (ovls[n].aread != a) break; pcur += ovls[n].path.tlen; n += 1; if (n >= omax) { omax = 1.2*n + 100; ovls = (Overlap *) Realloc(ovls,sizeof(Overlap)*omax,"Expanding overlap buffer"); if (ovls == NULL) exit (1); } } if (n >= max) max = n; pcur = 0; for (i = 0; i < n; i++) { ovls[i].path.trace = paths+pcur; pcur += ovls[i].path.tlen; } } ACTION(j,ovls,n); } if (ovls[n].aread < INT32_MAX) { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n", Prog_Name,DB_PART); exit (1); } return (max); } int main(int argc, char *argv[]) { char *root, *dpwd; int64 novl, hgap64; int c; DAZZ_EXTRA ex_covr, ex_hgap; // Process arguments { int i, j, k; int flags[128]; char *eptr; int mmax; ARG_INIT("DAScover") HGAP_MIN = 0; NUM_MASK = 0; mmax = 10; MASKS = (Mask *) Malloc(mmax*sizeof(Mask),"Allocating mask array"); if (MASKS == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("v") break; case 'm': if (NUM_MASK >= mmax) { mmax = 1.2*NUM_MASK + 10; MASKS = (Mask *) Realloc(MASKS,mmax*sizeof(Mask),"Reallocating mask array"); if (MASKS == NULL) exit (1); } MASKS[NUM_MASK++].name = argv[i]+2; break; case 'H': ARG_POSITIVE(HGAP_MIN,"HGAP threshold (in bp.s)") break; } else argv[j++] = argv[i]; argc = j; VERBOSE = 
flags['v']; if (argc < 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n"); fprintf(stderr," -H: HGAP minimum length threshold.\n"); fprintf(stderr," -m: repeat masks, stats not collected over these intervals\n"); exit (1); } } // Open trimmed DB and any mask tracks { DAZZ_TRACK *track; int64 *anno; int status, kind; int i, j, k; status = Open_DB(argv[1],DB); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (DB->part) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } Trim_DB(DB); Reads = DB->reads; k = 0; for (i = 0; i < NUM_MASK; i++) { status = Check_Track(DB,MASKS[i].name,&kind); switch (status) { case 0: fprintf(stderr,"%s: [WARNING] %s track is for the *un*trimmed DB?\n", Prog_Name,MASKS[i].name); continue; case -1: fprintf(stderr,"%s: [WARNING] %s track size not correct for trimmed DB.\n", Prog_Name,MASKS[i].name); continue; case -2: fprintf(stderr,"%s: [WARNING] -m%s option given but no track found.\n", Prog_Name,MASKS[i].name); continue; default: if (kind != MASK_TRACK) { fprintf(stderr,"%s: [WARNING] %s track is not a mask track.\n", Prog_Name,MASKS[i].name); continue; } break; } track = Load_Track(DB,MASKS[i].name); anno = (int64 *) (track->anno); for (j = 0; j <= DB->nreads; j++) anno[j] /= sizeof(int); MASKS[k].name = MASKS[i].name; MASKS[k].idx = anno; MASKS[k].data = (int *) (track->data); k += 1; } NUM_MASK = k; } // Determine if overlap block is being processed and if so get first and last read // from .db file ex_covr.vtype = DB_INT; ex_covr.nelem = MAX_COVER+1; ex_covr.accum = DB_SUM; ex_covr.name = Cov_Name; ex_covr.value = Cov_Hist; ex_hgap.vtype = DB_INT; ex_hgap.nelem = 1; ex_hgap.accum = DB_EXACT; ex_hgap.name = Hgap_Name; hgap64 = HGAP_MIN; ex_hgap.value = &hgap64; dpwd = PathTo(argv[1]); root = Root(argv[1],".db"); for 
(c = 2; c < argc; c++) { Block_Looper *parse; FILE *input; parse = Parse_Block_Arg(argv[c]); while ((input = Next_Block_Arg(parse)) != NULL) { DB_PART = 0; DB_FIRST = 0; DB_LAST = DB->nreads; { FILE *dbfile; char buffer[2*MAX_NAME+100]; char *p, *eptr; int i, part, nfiles, nblocks, cutoff, all, oindx; int64 size; p = rindex(Block_Arg_Root(parse),'.'); if (p != NULL) { part = strtol(p+1,&eptr,10); if (*eptr == '\0' && eptr != p+1) { dbfile = Fopen(Catenate(dpwd,"/",root,".db"),"r"); if (dbfile == NULL) exit (1); if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) SYSTEM_READ_ERROR for (i = 0; i < nfiles; i++) if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3) SYSTEM_READ_ERROR for (i = 1; i <= part; i++) if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2) SYSTEM_READ_ERROR fclose(dbfile); DB_PART = part; } } } // Set up cover extra's track if (DB_PART > 0) CV_ANAME = Strdup(Catenate(dpwd,PATHSEP,root, Numbered_Suffix(".",DB_PART,".covr.anno")),"Allocating cover anno name"); else CV_ANAME = Strdup(Catenate(dpwd,PATHSEP,root,".covr.anno"),"Allocating cover anno name"); CV_AFILE = Fopen(CV_ANAME,"w"); if (CV_ANAME == NULL || CV_AFILE == NULL) exit (1); { int size, length; length = 0; size = 1; fwrite(&length,sizeof(int),1,CV_AFILE); fwrite(&size,sizeof(int),1,CV_AFILE); } // Get trace point spacing information fread(&novl,sizeof(int64),1,input); fread(&TRACE_SPACING,sizeof(int),1,input); // Initialize statistics gathering if (VERBOSE) { nreads = 0; totlen = 0; printf("\nDAScover %s %s\n",argv[1],argv[c]); } { int i; for (i = 0; i <= MAX_COVER; i++) Cov_Hist[i] = 0; } // Process each read pile make_a_pass(input,HISTOGRAM_COVER,0); // If verbose output statistics summary to stdout if (VERBOSE) { int i, cover; int64 ssum, stotal; printf("\nInput: "); Print_Number(nreads,7,stdout); 
printf("reads, "); Print_Number(totlen,12,stdout); printf(" bases"); if (HGAP_MIN > 0) { printf(" (another "); Print_Number((DB_LAST-DB_FIRST) - nreads,0,stdout); printf(" were < H-length)"); } printf("\n"); stotal = 0; for (i = 0; i <= MAX_COVER; i++) stotal += Cov_Hist[i]; printf("\nCoverage Histogram\n\n"); ssum = Cov_Hist[MAX_COVER]; if (ssum > 0) printf(" %4d: %9lld %5.1f%%\n\n", MAX_COVER,Cov_Hist[MAX_COVER],(100.*ssum)/stotal); stotal -= ssum; ssum = 0; for (i = MAX_COVER-1; i >= 0; i--) if (Cov_Hist[i] > 0) { ssum += Cov_Hist[i]; printf(" %4d: %9lld %5.1f%%\n", i,Cov_Hist[i],(100.*ssum)/stotal); } i = 0; while (Cov_Hist[i+1] < Cov_Hist[i]) i += 1; for (cover = i++; i < MAX_COVER; i++) if (Cov_Hist[cover] < Cov_Hist[i]) cover = i; printf("\n Coverage is estimated at %d\n\n",cover); } // Output coverage histogram Write_Extra(CV_AFILE,&ex_covr); Write_Extra(CV_AFILE,&ex_hgap); fclose(CV_AFILE); free(CV_ANAME); fclose(input); } Free_Block_Arg(parse); } free(dpwd); free(root); Close_DB(DB); free(Prog_Name); exit (0); } DASCRUBBER-1.1/DASedit.c000066400000000000000000000557231327574206400144360ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2015, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. 
* * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Given the "trim" track patching directions, produce a new database .E.db from * .db containing all the patched and cut images of the original reads. * * Author: Gene Myers * Date : August 2016 * *******************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif #undef DEBUG_PATCHING #undef SHOW_PIECES #undef DEBUG_MAPPING #undef DEBUG_INDEX static char *Usage = "[-v] [-x] "; // Gap states #define LOWQ 0 // Gap is spanned by many LAs and patchable #define SPAN 1 // Gap has many paired LAs and patchable #define SPLIT 2 // Gap is a chimer or an unpatchable gap // Global Variables (must exist across the processing of each pile) // Read-only static DAZZ_DB _DB, *DB = &_DB; // Data base static int64 *TRIM_IDX; static int *TRIM; static int64 *PATCH_IDX; static int *PATCH; void Print_Seq(char *target, int tlen) { static int letter[4] = { 'a', 'c', 'g', 't' }; int i; for (i = 0; i < tlen; i++) { if (i%100 == 0 && i != 0) printf("\n"); printf("%c",letter[(int) target[i]]); } printf("\n"); } #define STACK_SIZE 50 static char *BSTACK[STACK_SIZE]; static int Load_Model(int *patch, char *target, int depth) { int bread, bbeg, bend; int lend, rend; int gb, ge, pb; int tlen, plen; char *bseq; if (BSTACK[depth] == NULL) BSTACK[depth] = New_Read_Buffer(DB); bread = patch[0]; if (bread < 0) bread = - (bread+1); else bread -= 1; bbeg = patch[1]; bend = patch[2]; #ifdef DEBUG_PATCHING printf("%*sPatching %d%c[%d,%d]->[%d,%d]\n", 2*depth,"",bread,(patch[0]<0?'c':'n'),patch[1],patch[2],bbeg,bend); #endif bseq = Load_Subread(DB,bread,bbeg,bend,BSTACK[depth],0) - bbeg; gb = TRIM_IDX[bread]; ge = TRIM_IDX[bread+1]; pb = PATCH_IDX[bread]; tlen = 0; rend = -1; for (; gb < ge; gb += 3) { lend = TRIM[gb]; if (lend > bbeg) { if (rend < bbeg || lend > bend || PATCH[pb] == 0) return (-1); plen = Load_Model(PATCH+pb,target+tlen,depth+1); if (plen < 0) return (-1); tlen += plen; rend = TRIM[gb+1]; if (rend > bend) rend = bend; memmove(target+tlen,bseq+lend,rend-lend); tlen += rend-lend; #ifdef DEBUG_PATCHING printf("%*s Piece %d,%d\n",2*depth,"",lend,rend); #endif #ifdef SHOW_PIECES Print_Seq(bseq+lend,rend-lend); #endif pb += 3; } else // lend <= bbeg { rend = TRIM[gb+1]; if (rend > bbeg) { if (rend > bend) rend = bend; 
memmove(target+tlen,bseq+bbeg,rend-bbeg); tlen += rend-bbeg; #ifdef DEBUG_PATCHING printf("%*s Piece %d,%d\n",2*depth,"",bbeg,rend); #endif #ifdef SHOW_PIECES Print_Seq(bseq+bbeg,rend-bbeg); #endif } else pb += 3; } if (rend >= bend) break; } if (gb >= ge) fprintf(stderr,"Fatal: Should not happen\n"); if (patch[0] < 0) Complement_Seq(target,tlen); return (tlen); } int main(int argc, char *argv[]) { DAZZ_READ *reads; int nreads; int64 boff; int nnew, nmax; int64 ntot; int *segfate; int nsegs; char *pwd1, *root1; // inputs char *pwd2, *root2; int VERBOSE; int CUTOFF; int HGAP_MIN; int nfiles; // contents of source .db file int nblocks; int64 bsize; char **flist = NULL; char **plist = NULL; int *findx = NULL; int *bindx = NULL; FILE *NB_FILE; // files for writing target FILE *IDX_FILE; FILE *BPS_FILE; FILE *MP_AFILE; FILE *MP_DFILE; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DASedit") CUTOFF = 0; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("v") break; case 'x': ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff") break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc != 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n"); fprintf(stderr," -x: minimum length for edited reads\n"); exit (1); } } // Open source and make target has a different pwd/name { int status; status = Open_DB(argv[1],DB); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (DB->part) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } pwd1 = PathTo(argv[1]); root1 = Root(argv[1],".db"); pwd2 = PathTo(argv[2]); root2 = Root(argv[2],".db"); if (strcmp(root1,root2) == 0 && strcmp(pwd1,pwd2) == 0) { fprintf(stderr,"%s: source and target are the same !\n",Prog_Name); exit (1); } } // Load 
the file and block structure of the source database { int i; int nid, oid; int cutoff, allflag, ufirst; FILE *dstub; dstub = Fopen(Catenate(pwd1,"/",root1,".db"),"r"); if (dstub == NULL) exit (1); if (fscanf(dstub,DB_NFILE,&nfiles) != 1) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root1); exit (1); } flist = (char **) Malloc(sizeof(char *)*nfiles,"Allocating file list"); plist = (char **) Malloc(sizeof(char *)*nfiles,"Allocating prolog list"); findx = (int *) Malloc(sizeof(int *)*nfiles,"Allocating file index"); if (flist == NULL || plist == NULL || findx == NULL) exit (1); for (i = 0; i < nfiles; i++) { char prolog[MAX_NAME], fname[MAX_NAME]; if (fscanf(dstub,DB_FDATA,findx+i,fname,prolog) != 3) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root1); exit (1); } if ((flist[i] = Strdup(fname,"Adding to file list")) == NULL) exit (1); if ((plist[i] = Strdup(prolog,"Adding to prolog list")) == NULL) exit (1); } if (fscanf(dstub,DB_NBLOCK,&nblocks) != 1) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root1); exit (1); } if (fscanf(dstub,DB_PARAMS,&bsize,&cutoff,&allflag) != 3) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root1); exit (1); } bindx = (int *) Malloc(sizeof(int *)*(nblocks+1),"Allocating block indices"); if (bindx == NULL) exit (1); for (i = 0; i <= nblocks; i++) if (fscanf(dstub,DB_BDATA,&ufirst,bindx+i) != 2) { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root1); exit (1); } fclose(dstub); // map read counts for each file into trimmed read counts for each file if (allflag) allflag = 0; else allflag = DB_BEST; reads = DB->reads; nid = 0; oid = 0; for (i = 0; i < nfiles; i++) { while (oid < findx[i]) { if ((reads[oid].flags & DB_BEST) >= allflag && reads[oid].rlen >= cutoff) nid++; oid += 1; } findx[i] = nid; } } // Can now trim source DB. 
Also load .trim and .patch tracks Trim_DB(DB); reads = DB->reads; nreads = DB->nreads; { DAZZ_TRACK *track; int i; track = Load_Track(DB,"trim"); if (track != NULL) { FILE *afile; char *aname; int extra, tracklen, size; DAZZ_EXTRA ex_hgap; TRIM_IDX = (int64 *) track->anno; TRIM = (int *) track->data; for (i = 0; i <= nreads; i++) TRIM_IDX[i] /= sizeof(int); // Get HGAP minimum from .trim extras aname = Strdup(Catenate(DB->path,".","trim",".anno"),"Allocating anno file"); if (aname == NULL) exit (1); afile = fopen(aname,"r"); fread(&tracklen,sizeof(int),1,afile); fread(&size,sizeof(int),1,afile); fseeko(afile,0,SEEK_END); extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int)); fseeko(afile,-extra,SEEK_END); ex_hgap.nelem = 0; if (Read_Extra(afile,aname,&ex_hgap) != 0) { fprintf(stderr,"%s: Hgap threshold extra missing from .trim track?\n",Prog_Name); exit (1); } fclose(afile); HGAP_MIN = (int) ((int64 *) (ex_hgap.value))[0]; } else { fprintf(stderr,"%s: Must have a 'trim' track, run DAStrim\n",Prog_Name); exit (1); } track = Load_Track(DB,"patch"); if (track != NULL) { PATCH_IDX = (int64 *) track->anno; PATCH = (int *) track->data; for (i = 0; i <= nreads; i++) PATCH_IDX[i] /= sizeof(int); } else { fprintf(stderr,"%s: Must have a 'patch' track, run DASfix\n",Prog_Name); exit (1); } } // Allocate segment status: will have value of defined constants below or if > 0 // the the segment is followed by a patch of given size. // Also open new db files here, to be certain they can be so manipulated before // doing anything that would need to be reversed. 
#define SHORT_LAST -4 #define GOOD_LAST -3 #define TRIMMED -2 #define SHORT -1 #define GOOD 0 nsegs = nreads + PATCH_IDX[nreads]/3; segfate = Malloc(sizeof(int)*nsegs,"Allocating block status vector"); NB_FILE = Fopen(Catenate(pwd2,"/",root2,".db"),"w"); IDX_FILE = Fopen(Catenate(pwd2,PATHSEP,root2,".idx"),"w"); BPS_FILE = Fopen(Catenate(pwd2,PATHSEP,root2,".bps"),"w"); MP_AFILE = Fopen(Catenate(pwd2,PATHSEP,root2,".map.anno"),"w"); MP_DFILE = Fopen(Catenate(pwd2,PATHSEP,root2,".map.data"),"w"); if (NB_FILE == NULL || IDX_FILE == NULL || BPS_FILE == NULL || MP_AFILE == NULL || MP_DFILE == NULL) exit (1); // Patch all the reads creating a new compressed .bps file of said for the new // DB. Further tally the total bytes and number of cuts, and also produce // the .map track. { int i, ni, bi; int gb, ge, pb; char *target, *aseq; int64 MP_INDEX; int64 ntrim, nshort; int64 nttot, nstot; int64 htrim, httot; ni = 0; fwrite(&ni,sizeof(int),1,MP_AFILE); ni = 8; fwrite(&ni,sizeof(int),1,MP_AFILE); MP_INDEX = 0; fwrite(&MP_INDEX,sizeof(int64),1,MP_AFILE); for (i = 0; i < STACK_SIZE; i++) BSTACK[i] = NULL; { int ml = DB->maxlen; DB->maxlen = 1.5*ml + 10000; target = New_Read_Buffer(DB); DB->maxlen = ml; } aseq = BSTACK[0] = New_Read_Buffer(DB); segfate[0] = GOOD_LAST; ntrim = nshort = 0; nstot = nttot = 0; htrim = httot = 0; ntot = 0; nnew = nmax = 0; boff = 0; ni = 0; bi = 0; pb = 0; ge = 0; for (i = 0; i < nreads; i++) { int tlen, clen; int lend, rend, blen; int gb1, bi1; gb = ge; ge = TRIM_IDX[i+1]; #ifdef DEBUG_PATCHING printf("Doing %d\n",i); #endif if (reads[i].rlen < HGAP_MIN) { segfate[bi++] = TRIMMED; htrim += 1; httot += reads[i].rlen; continue; } if (ge <= gb) { segfate[bi++] = TRIMMED; ntrim += 1; nttot += reads[i].rlen; continue; } Load_Read(DB,i,aseq,0); nttot += TRIM[gb]; gb1 = gb; bi1 = bi; tlen = 0; for ( ; gb < ge; gb += 3) { lend = TRIM[gb]; rend = TRIM[gb+1]; blen = rend - lend; memmove(target+tlen,aseq+lend,blen); tlen += blen; #ifdef DEBUG_PATCHING 
printf(" Piece %d,%d (%d)\n",lend,rend,bi); #endif #ifdef SHOW_PIECES Print_Seq(aseq+lend,blen); #endif if (gb+3 < ge) { if (PATCH[pb] != 0) clen = Load_Model(PATCH+pb,target+tlen,1); else clen = -1; pb += 3; if (clen >= 0) { tlen += clen; segfate[bi++] = clen; continue; } } if (tlen >= CUTOFF) { Compress_Read(tlen,target); clen = COMPRESSED_LEN(tlen); fwrite(target,1,clen,BPS_FILE); boff += clen; #ifdef DEBUG_PATCHING printf(" Output %d(%d)\n",ni,tlen); #endif ni += 1; fwrite(&i,sizeof(int),1,MP_DFILE); fwrite(&(reads[i].rlen),sizeof(int),1,MP_DFILE); MP_INDEX += 2*sizeof(int); while (gb1 < gb) { fwrite(TRIM+gb1,sizeof(int),1,MP_DFILE); fwrite(TRIM+(gb1+1),sizeof(int),1,MP_DFILE); fwrite(segfate+bi1,sizeof(int),1,MP_DFILE); MP_INDEX += 3*sizeof(int); gb1 += 3; bi1 += 1; } fwrite(TRIM+gb1,sizeof(int),1,MP_DFILE); fwrite(TRIM+(gb1+1),sizeof(int),1,MP_DFILE); MP_INDEX += 2*sizeof(int); fwrite(&MP_INDEX,sizeof(int64),1,MP_AFILE); gb1 = gb+3; if (gb1 <= ge) segfate[bi++] = GOOD; else segfate[bi++] = GOOD_LAST; bi1 = bi; nnew += 1; ntot += tlen; if (tlen > nmax) nmax = tlen; } else { gb1 = gb+3; if (gb1 <= ge) segfate[bi++] = SHORT; else segfate[bi++] = SHORT_LAST; bi1 = bi; nshort += 1; nstot += tlen; #ifdef DEBUG_PATCHING printf(" Remove %d(%d)\n",ni,tlen); #endif } tlen = 0; if (gb1 <= ge) { nttot += TRIM[gb1]-rend; #ifdef DEBUG_PATCHING printf(" Cutting %d,%d\n",rend,TRIM[gb1]); #endif } else nttot += reads[i].rlen-rend; } } nsegs = bi; for (i = 0; i < STACK_SIZE; i++) if (BSTACK[i] == NULL) break; else free(BSTACK[i]-1); free(target-1); rewind(MP_AFILE); fwrite(&ni,sizeof(int),1,MP_AFILE); if (VERBOSE) { printf("\n "); if (htrim > 0) { Print_Number(htrim,0,stdout); printf(" reads and "); Print_Number(httot,0,stdout); printf(" bases in reads < H-length (%d)\n\n ",HGAP_MIN); } Print_Number(DB->nreads-htrim,0,stdout); printf(" reads and "); Print_Number(DB->totlen-httot,0,stdout); if (htrim > 0) printf(" bases in reads >= H-length in source DB \n\n "); else printf(" 
bases in source DB\n\n "); Print_Number(ntrim,0,stdout); printf(" reads and "); Print_Number(nttot,0,stdout); printf(" bases were trimmmed by scrubbing (DAStrim)\n\n "); if (CUTOFF > 0) { Print_Number(nshort,0,stdout); printf(" edited reads < %d bases, totaling ",CUTOFF); Print_Number(nstot,0,stdout); printf(" bases were cut (-x option)\n\n "); } printf("The edited DB has "); Print_Number((int64) nnew,0,stdout); printf(" edited reads and "); Print_Number(ntot,0,stdout); printf(" bases\n"); } fclose(BPS_FILE); fclose(MP_AFILE); fclose(MP_DFILE); } // Output the file structure for the new database, adjusting the number // of reads in each file according to how reads are split, and also adjust // block indices similarly. Further compress all read records that are trimmed // or produce no edited reads. { int i, s; int oi, ni; int nf, nb; #ifdef DEBUG_MAPPING printf("\nMAPPING\n"); #endif nf = 0; nb = 1; oi = 0; ni = 0; for (i = 0; i < nsegs; i++) { s = segfate[i]; if (s <= 0) { if (s == GOOD || s == GOOD_LAST) ni += 1; if (s <= TRIMMED) { oi += 1; if (oi == findx[nf]) { findx[nf++] = ni; #ifdef DEBUG_MAPPING printf(" %2d: %d->%d\n",nf-1,oi,ni); #endif } if (oi == bindx[nb]) { bindx[nb++] = ni; #ifdef DEBUG_MAPPING printf(" %2d: %d->%d\n",nb-1,oi,ni); #endif } } } } #ifdef DEBUG_MAPPING printf(" Total reads = %d Trimmed Reads = %d\n",ni,oi); if (nf != nfiles) printf(" File count not correct %d %d\n",nf,nfiles); if (nb != nblocks+1) printf(" Block count not correct %d %d\n",nb,nblocks+1); fflush(stdout); #endif fprintf(NB_FILE,DB_NFILE,nfiles); for (i = 0; i < nfiles; i++) fprintf(NB_FILE,DB_FDATA,findx[i],flist[i],plist[i]); fprintf(NB_FILE,DB_NBLOCK,nblocks); fprintf(NB_FILE,DB_PARAMS,bsize,CUTOFF,1); for (i = 0; i <= nblocks; i++) fprintf(NB_FILE,DB_BDATA,bindx[i],bindx[i]); fclose(NB_FILE); } { int i, s; int bi, gb; int tlen, first; DAZZ_DB NB; DAZZ_READ newrec; #ifdef DEBUG_INDEX int ni; #endif // Write an index of the patched reads NB = *DB; NB.ureads = nnew; 
NB.treads = nnew; NB.cutoff = CUTOFF; NB.allarr = DB->allarr | DB_ALL; NB.totlen = ntot; NB.maxlen = nmax; fwrite(&NB,sizeof(DAZZ_DB),1,IDX_FILE); #ifdef DEBUG_INDEX printf("\nINDEXING\n"); ni = 0; #endif boff = 0; i = 0; gb = 0; bi = 0; while (bi < nsegs) { tlen = 0; first = TRIM[gb]; while ((s = segfate[bi++]) > 0) { tlen += (TRIM[gb+1] - TRIM[gb]) + s; #ifdef DEBUG_INDEX printf(" [%d,%d] %d",TRIM[gb],TRIM[gb+1],s); #endif gb += 3; } if (s == GOOD || s == GOOD_LAST) { tlen += (TRIM[gb+1] - TRIM[gb]); #ifdef DEBUG_INDEX printf(" [%d,%d]",TRIM[gb],TRIM[gb+1]); printf(" GOOD %d: (%d,%d)\n",i,ni++,bi); #endif newrec.origin = reads[i].origin; newrec.fpulse = reads[i].fpulse + first; newrec.rlen = tlen; newrec.boff = boff; newrec.coff = i; newrec.flags = (reads[i].flags & DB_QV) | DB_BEST; if (segfate[bi] > TRIMMED) newrec.flags |= DB_CSS; fwrite(&newrec,sizeof(DAZZ_READ),1,IDX_FILE); boff += COMPRESSED_LEN(tlen); if (s == GOOD) gb += 3; else { gb += 2; i += 1; } } else if (s == TRIMMED) { #ifdef DEBUG_INDEX printf(" TRIM %d: (%d)\n",i,bi); #endif i += 1; } else // s == SHORT || s == SHORT_LAST { #ifdef DEBUG_INDEX printf(" [%d,%d]",TRIM[gb],TRIM[gb+1]); printf(" SHRT %d: (%d)\n",i,bi); #endif if (s == SHORT) gb += 3; else { gb += 2; i += 1; } } } #ifdef DEBUG_INDEX printf("Finish %d %d %lld\n",ni,i,boff); #endif fclose(IDX_FILE); } free(pwd2); free(root2); free(pwd1); free(root1); free(Prog_Name); exit (0); } DASCRUBBER-1.1/DASmap.c000066400000000000000000000167701327574206400142650ustar00rootroot00000000000000/******************************************************************************************* * * Display a specified set of reads of a database in fasta format. 
* * Author: Gene Myers * Date : September 2013 * Mod : With DB overhaul, made this a routine strictly for printing a selected subset * and created DB2fasta for recreating all the fasta files of a DB * Date : April 2014 * Mod : Added options to display QV streams * Date : July 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char Usage[] = "[-p] [ | ... ]"; #define LAST_READ_SYMBOL '$' #define MAX_BUFFER 10001 typedef struct { FILE *input; int lineno; int read; int beg; int end; } File_Iterator; File_Iterator *init_file_iterator(FILE *input) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->input = input; it->lineno = 1; rewind(input); return (it); } int next_read(File_Iterator *it) { static char nbuffer[MAX_BUFFER]; char *eol; int x; if (fgets(nbuffer,MAX_BUFFER,it->input) == NULL) { if (feof(it->input)) return (1); SYSTEM_READ_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in read list is longer than %d chars!\n", Prog_Name,it->lineno,MAX_BUFFER-1); return (1); } *eol = '\0'; x = sscanf(nbuffer," %d %d %d",&(it->read),&(it->beg),&(it->end)); if (x == 1) it->beg = -1; else if (x != 3) { fprintf(stderr,"%s: Line %d of read list is improperly formatted\n",Prog_Name,it->lineno); return (1); } it->lineno += 1; return (0); } int main(int argc, char *argv[]) { DAZZ_DB _db, *db = &_db; DAZZ_TRACK *map; int reps, *pts; int input_pts; File_Iterator *iter = NULL; FILE *input; int PRETTY; // Process arguments { int i, j, k; int flags[128]; ARG_INIT("DASmap") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') ARG_FLAGS("p") else argv[j++] = argv[i]; argc = j; PRETTY = flags['p']; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -p: Pretty print (vs easy to 
parse).\n"); exit (1); } } // Open DB or DAM, and if a DAM open also .hdr file { int status, kind; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (db->part) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } status = Check_Track(db,"map",&kind); if (status == -2) { fprintf(stderr,"%s: 'map' track not found.\n",Prog_Name); exit (1); } else if (status == -1) { fprintf(stderr,"%s: Warning: 'map' track not sync'd with db.\n",Prog_Name); exit (1); } map = Load_Track(db,"map"); } // Process read index arguments into a list of read ranges input_pts = 0; if (argc == 3) { if (argv[2][0] != LAST_READ_SYMBOL || argv[2][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[2],&eptr,10); if (eptr > argv[2] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if (input_pts) { input = Fopen(argv[2],"r"); if (input == NULL) exit (1); iter = init_file_iterator(input); } else { pts = (int *) Malloc(sizeof(int)*2*(argc-1),"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 2) { int c, b, e; char *eptr, *fptr; for (c = 2; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == 0) { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = db->nreads; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } 
fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } } else { pts[reps++] = 1; pts[reps++] = db->nreads; } } // Display each read (and/or QV streams) in the active DB according to the // range pairs in pts[0..reps) and according to the display options. { int c, b, e, i; int64 *anno; int *data; int64 s, f, j; anno = (int64 *) map->anno; data = (int *) map->data; c = 0; while (1) { if (input_pts) { if (next_read(iter)) break; e = iter->read; b = e-1; } else { if (c >= reps) break; b = pts[c]-1; e = pts[c+1]; if (e > db->nreads) e = db->nreads; c += 2; } if (PRETTY) for (i = b; i < e; i++) { s = (anno[i] >> 2); f = (anno[i+1] >> 2); printf(" %d -> %d(%d)",i+1,data[s]+1,data[s+1]); for (j = s+2; j < f; j += 3) { printf(" [%d,%d]",data[j],data[j+1]); if (j+2 < f) printf(" %d",data[j+2]); } printf("\n"); } else for (i = b; i < e; i++) { s = (anno[i] >> 2); f = (anno[i+1] >> 2); printf(" %d %d %d %lld",i+1,data[s]+1,data[s+1],(f-s)-2); for (j = s+2; j < f; j += 3) { printf(" %d %d",data[j],data[j+1]); if (j+2 < f) printf(" %d",data[j+2]); } printf("\n"); } } } if (input_pts) { fclose(input); free(iter); } else free(pts); Close_DB(db); exit (0); } DASCRUBBER-1.1/DASpatch.c000066400000000000000000000576311327574206400146100ustar00rootroot00000000000000/******************************************************************************************* * * Using overlap pile for each read,intrinisic quality values, and trimmed hq-intervals * for each read, determine the B-read and segment thereof to use to patch each low * quality segment between hq-intervals. * * Author: Gene Myers * Date : June 2016 * *******************************************************************************************/ #include #include #include #include #include "DB.h" #include "align.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif #undef DEBUG_GAP_FILL #undef SHOW_PAIRS #undef DEBUG_SUMMARY // Command format and global parameter variables static char *Usage = " [-v] ..."; static int BAD_QV; // qv >= and you are "bad" static int GOOD_QV; // qv <= and you are "good" static int HGAP_MIN; // less than this length do not process for HGAP static int VERBOSE; // Gap states #define LOWQ 0 // Gap is spanned by many LAs and patchable #define SPAN 1 // Gap has many paired LAs and patchable #define SPLIT 2 // Gap is a chimer or an unpatchable gap #define NOPAT 3 // Gap could not be patched (internal only) #define COVER_LEN 400 // An overlap covers a point if it extends COVER_LEN to either side. #define ANCHOR_MATCH .25 // Delta in trace interval at both ends of patch must be < this %. static int ANCHOR_THRESH; // Global Variables (must exist across the processing of each pile) // Read-only static int TRACE_SPACING; // Trace spacing (from .las file) static DAZZ_DB _DB, *DB = &_DB; // Data base static int DB_FIRST; // First read of DB to process static int DB_LAST; // Last read of DB to process (+1) static int DB_PART; // 0 if all, otherwise block # static int64 *QV_IDX; // qual track index static uint8 *QV; // qual track values static int64 *TRIM_IDX; // trim track index static int *TRIM; // trim track values // Write-only static FILE *PR_AFILE; // .trim.anno static FILE *PR_DFILE; // .trim.data static int64 PR_INDEX; // Current index into .trim.data file as it is being written // Statistics static int fpatch, npatch; // Data Structures typedef struct // General read interval [beg..end] { int beg; int end; } Interval; /******************************************************************************************* * * FIND ANY UNREMOVED ADAPTER (OR POLYMERASE SWITCHES) AND TRIM SMALLER PARTS * ********************************************************************************************/ typedef struct { int bread; // bread^comp[beg..end] is the patch sequence int comp; int beg; 
int end; int anc; // maximum anchor interval match int bad; // number of segments that are bad int avg; // average QV of the patch } Patch; // Evaluate the quality of lov->bread = rov->bread spaning [lcv,rcv] as a patch static Patch *compute_patch(int lft, int rgt, Overlap *lov, Overlap *rov) { static Patch ans; uint16 *tr; int bread, bcomp, blen; int bb, be; int t, te; int bl, br; uint8 *qb; int avg, anc, bad; bread = lov->bread; bcomp = COMP(lov->flags); blen = DB->reads[bread].rlen; if (blen < HGAP_MIN) return (NULL); if (lft > lov->path.aepos || rgt < rov->path.abpos) // Cannot anchor return (NULL); if (lov->path.abpos > lft-TRACE_SPACING || rgt+TRACE_SPACING > rov->path.aepos) return (NULL); // Get max of left and right anchors as anchor score tr = (uint16 *) lov->path.trace; te = 2 * (((lft + (TRACE_SPACING-1)) - lov->path.abpos)/TRACE_SPACING); if (te == 0) return (NULL); anc = tr[te-2]; bb = lov->path.bbpos; for (t = 1; t < te; t += 2) bb += tr[t]; tr = (uint16 *) rov->path.trace; te = 2 * (((rgt + (TRACE_SPACING-1)) - rov->path.abpos)/TRACE_SPACING); if (te >= rov->path.tlen) return (NULL); if (tr[te] > anc) anc = tr[te]; be = rov->path.bepos; for (t = rov->path.tlen-1; t > te; t -= 2) be -= tr[t]; if (bb >= be) return (NULL); // Compute metrics for b-read patch if (bcomp) { t = blen - be; be = blen - bb; bb = t; } bl = bb/TRACE_SPACING; br = (be+(TRACE_SPACING-1))/TRACE_SPACING; qb = QV + QV_IDX[bread]; if (bl >= br) { avg = qb[bl]; if (avg >= BAD_QV) bad = 1; else bad = 0; } else { avg = 0; bad = 0; for (t = bl; t < br; t++) { avg += qb[t]; if (qb[t] >= BAD_QV) bad += 1; } avg /= (br-bl); } ans.bread = bread; ans.comp = bcomp; ans.beg = bb; ans.end = be; ans.anc = anc; ans.bad = bad; ans.avg = avg; return (&ans); } static int unsuitable(int bread, int lft, int rgt) { int tb, te; tb = TRIM_IDX[bread]; te = TRIM_IDX[bread+1]; for ( ; tb < te; tb += 3) if (TRIM[tb+1] >= lft) break; if (tb >= te || TRIM[tb] > lft) return (1); for ( ; tb < te ; tb += 3) { if 
(TRIM[tb+1] >= rgt) break; if (TRIM[tb+2] == SPLIT) return (1); } if (tb >= te || TRIM[tb] > rgt) return (1); return (0); } // Categorize each gap and if appropriate return the best patch for each static Patch *lowq_patch(Overlap *ovls, int novl, Interval *lblock, Interval *rblock) { static Patch patch; int j; int lft, rgt; int lcv, rcv; lft = lblock->end; rgt = rblock->beg; lcv = lft - COVER_LEN; rcv = rgt + COVER_LEN; if (lcv < lblock->beg) lcv = lblock->beg; if (rcv > rblock->end) rcv = rblock->end; patch.bread = -1; patch.anc = TRACE_SPACING; patch.avg = 100; for (j = 0; j < novl; j++) if (ovls[j].path.abpos <= lcv && ovls[j].path.aepos >= rcv) { Patch *can; can = compute_patch(lft,rgt,ovls+j,ovls+j); if (can == NULL) continue; if (unsuitable(can->bread,can->beg,can->end)) continue; if (can->anc <= ANCHOR_THRESH && can->avg <= GOOD_QV && can->bad == 0 && can->avg + can->anc < patch.anc + patch.avg) patch = *can; } #ifdef DEBUG_GAP_FILL if (patch.bread >= 0) printf(" LOWQ PATCH = %d%c[%d..%d] %d (%d)\n", patch.bread,patch.comp?'c':'n',patch.beg,patch.end,patch.anc,patch.avg); else printf(" LOWQ PATCH FAIL\n"); #endif return (&patch); } static Patch *span_patch(Overlap *ovls, int novl, Interval *lblock, Interval *rblock) { static Patch patch; int j, k; int lft, rgt; int lcv, rcv; int bread, bcomp, blen; int ab, ae; int lidx, ridx, sidx, cidx; Patch *can; lft = lblock->end; rgt = rblock->beg; lcv = lft - COVER_LEN; rcv = rgt + COVER_LEN; if (lcv < lblock->beg) lcv = lblock->beg; if (rcv > rblock->end) rcv = rblock->end; // Find LA pairs or LAs spanning the gap flank [lcv,rcv] patch.bread = -1; patch.bad = DB->maxlen; patch.avg = 100; for (j = 0; j < novl; j = k) { bread = ovls[j].bread; blen = DB->reads[bread].rlen; bcomp = COMP(ovls[j].flags); if (bcomp) cidx = j; lidx = ridx = sidx = -1; // For all LA's with same b-read for (k = j; k < novl; k++) { if (ovls[k].bread != bread) break; if (COMP(ovls[k].flags) != (uint32) bcomp) // Note when b switches orientation { 
cidx = k; bcomp = COMP(ovls[k].flags); } ab = ovls[k].path.abpos; ae = ovls[k].path.aepos; #ifdef SHOW_PAIRS printf("\n %5d [%5d,%5d] %c",bread,ab,ae,COMP(ovls[k].flags)?'c':'n'); if (ab <= lcv && ae >= rcv) printf("s"); else printf(" "); #endif // Is LA a spanner, left-partner, or right partner if (ab <= lcv && ae >= rcv) { sidx = k; lidx = ridx = -1; continue; } #ifdef SHOW_PAIRS if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - COVER_LEN) printf("r"); else printf(" "); if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + COVER_LEN) printf("l"); else printf(" "); #endif if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - COVER_LEN) ridx = k; if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + COVER_LEN) lidx = k; } if (! bcomp) cidx = k; #ifdef SHOW_PAIRS printf(" ="); if (sidx >= 0) printf(" S"); if (lidx >= 0) printf(" L"); if (ridx >= 0) printf(" R"); if (0 <= lidx && lidx < ridx && (ridx < cidx || lidx >= cidx)) printf(" G"); if ((0<=ridx && ridx= 0) lidx = ridx = sidx; else if (0 > lidx || lidx >= ridx || (ridx >= cidx && cidx > lidx)) continue; // Otherwise consider the gap linkable and try to patch it, declaring a split // iff all patch attemtps fail #ifdef DEBUG_GAP_FILL if (lidx != ridx) printf(" %5d [%5d,%5d] [%5d,%5d]", ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos, ovls[ridx].path.abpos,ovls[ridx].path.aepos); else printf(" %5d [%5d,%5d] SSS", ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos); #endif can = compute_patch(lft,rgt,ovls+lidx,ovls+ridx); if (can != NULL) { #ifdef DEBUG_GAP_FILL printf(" %d",can->end - can->beg); #endif if ( ! 
unsuitable(can->bread,can->beg,can->end) && can->anc <= ANCHOR_THRESH) { if (can->bad < patch.bad) patch = *can; else if (can->bad == patch.bad) { if (can->avg < patch.avg) patch = *can; } #ifdef DEBUG_GAP_FILL printf(" AA %d %d(%d)",can->anc,can->bad,can->avg); #endif } } #ifdef DEBUG_GAP_FILL printf("\n"); #endif } #ifdef DEBUG_GAP_FILL if (patch.bread >= 0) printf(" SPAN %5d: PATCH = %d%c[%d..%d] %d %d(%d)\n",rgt-lft, patch.bread,patch.comp?'c':'n',patch.beg, patch.end,patch.anc,patch.bad,patch.avg); else printf(" SPAN PATCH FAIL\n"); #endif return (&patch); } /******************************************************************************************* * * SCRUB EACH PILE: * Trim low-quality tips of reads and patch low quality intervals within a sequence * Trim adapter (and associated redundant prefix or suffix) * Break chimers or all unscaffoldable no-coverage gaps of reads * ********************************************************************************************/ // Analyze all the gaps between the good patches found in the first pass. // Consider a hole between two good intervals [lb,le] and [rb,re]. 
An overlap // is anchored to the left of the whole if abpos <= le-COVER_LEN and aepos >= rb+COVER_LEN static void PATCH_GAPS(int aread, Overlap *ovls, int novl) { static Patch dummy = { 0, 0, 0, 0, 0, 0, 0 }; #ifdef DEBUG_SUMMARY static char *status_string[4] = { "LOWQ", "SPAN", "SPLIT", "NOPAT" }; #endif int alen; Interval lblock, rblock; Patch *patch = NULL; int status; int tb, te; int val; alen = DB->reads[aread].rlen; if (alen < HGAP_MIN) { fwrite(&PR_INDEX,sizeof(int64),1,PR_AFILE); return; } #if defined(DEBUG_GAP_FILL) || defined(DEBUG_SUMMARY) printf("\n"); printf("AREAD %d\n",aread); #endif // Determine patch for every LOWQ and SPAN gap and output dummy 0-patch // for all SPLIT decisions tb = TRIM_IDX[aread]; te = TRIM_IDX[aread+1]; if (tb+2 < te) { lblock.beg = TRIM[tb]; lblock.end = TRIM[tb+1]; for (tb += 3; tb < te; tb += 3) { status = TRIM[tb-1]; rblock.beg = TRIM[tb]; rblock.end = TRIM[tb+1]; if (status == LOWQ) { patch = lowq_patch(ovls,novl,&lblock,&rblock); if (patch->bread < 0) status = SPAN; } if (status == SPAN) patch = span_patch(ovls,novl,&lblock,&rblock); if (status == SPLIT) { val = 0; patch = &dummy; } else { if (patch->bread < 0) { val = 0; fpatch += 1; #ifdef DEBUG_SUMMARY TRIM[tb-1] = NOPAT; #endif } else if (patch->comp) val = -(patch->bread+1); else val = patch->bread+1; npatch += 1; } fwrite(&val,sizeof(int),1,PR_DFILE); fwrite(&(patch->beg),sizeof(int),1,PR_DFILE); fwrite(&(patch->end),sizeof(int),1,PR_DFILE); PR_INDEX += 3*sizeof(int); lblock = rblock; } } fwrite(&PR_INDEX,sizeof(int64),1,PR_AFILE); #ifdef DEBUG_SUMMARY tb = TRIM_IDX[aread]; te = TRIM_IDX[aread+1]; #ifdef DEBUG_GAP_FILL if (tb+2 < te) printf(" FINAL:\n"); #endif if (tb < te) { printf(" [%d,%d]",TRIM[tb],TRIM[tb+1]); for (tb += 3; tb < te; tb += 3) printf(" %s [%d,%d]",status_string[TRIM[tb-1]],TRIM[tb],TRIM[tb+1]); printf("\n"); } #endif } // Read in each successive pile and call ACTION on it. 
Read in the traces only if // "trace" is nonzero static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int trace) { static Overlap *ovls = NULL; static int omax = 500; static uint16 *paths = NULL; static int pmax = 100000; int64 i, j, novl; int n, a; int pcur; int max; int tbytes; if (ovls == NULL) { ovls = (Overlap *) Malloc(sizeof(Overlap)*omax,"Allocating overlap buffer"); if (ovls == NULL) exit (1); } if (trace && paths == NULL) { paths = (uint16 *) Malloc(sizeof(uint16)*pmax,"Allocating path buffer"); if (paths == NULL) exit (1); } rewind(input); fread(&novl,sizeof(int64),1,input); fread(&TRACE_SPACING,sizeof(int),1,input); if (TRACE_SPACING <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); if (Read_Overlap(input,ovls) != 0) ovls[0].aread = INT32_MAX; else if (trace) { if (ovls[0].path.tlen > pmax) { pmax = 1.2*(ovls[0].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); if (paths == NULL) exit (1); } fread(paths,tbytes,ovls[0].path.tlen,input); if (tbytes == 1) { ovls[0].path.trace = paths; Decompress_TraceTo16(ovls); } } else fseek(input,tbytes*ovls[0].path.tlen,SEEK_CUR); if (ovls[0].aread < DB_FIRST) { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n", Prog_Name,DB_PART); exit (1); } pcur = 0; n = max = 0; for (j = DB_FIRST; j < DB_LAST; j++) { ovls[0] = ovls[n]; a = ovls[0].aread; if (a != j) n = 0; else { if (trace) memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen); n = 1; pcur = ovls[0].path.tlen; while (1) { if (Read_Overlap(input,ovls+n) != 0) { ovls[n].aread = INT32_MAX; break; } if (trace) { if (pcur + ovls[n].path.tlen > pmax) { pmax = 1.2*(pcur+ovls[n].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); if (paths == NULL) exit (1); } fread(paths+pcur,tbytes,ovls[n].path.tlen,input); if (tbytes == 1) { ovls[n].path.trace = paths+pcur; Decompress_TraceTo16(ovls+n); } } else 
fseek(input,tbytes*ovls[n].path.tlen,SEEK_CUR); if (ovls[n].aread != a) break; pcur += ovls[n].path.tlen; n += 1; if (n >= omax) { omax = 1.2*n + 100; ovls = (Overlap *) Realloc(ovls,sizeof(Overlap)*omax,"Expanding overlap buffer"); if (ovls == NULL) exit (1); } } if (n >= max) max = n; pcur = 0; for (i = 0; i < n; i++) { ovls[i].path.trace = paths+pcur; pcur += ovls[i].path.tlen; } } ACTION(j,ovls,n); } if (ovls[n].aread < INT32_MAX) { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n", Prog_Name,DB_PART); exit (1); } return (max); } int main(int argc, char *argv[]) { char *root, *dpwd; int64 novl; DAZZ_TRACK *track; int c; // Process arguments { int i, j, k; int flags[128]; ARG_INIT("DASpatch") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("v") break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc < 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n"); exit (1); } } // Open trimmed DB and .qual and .trim tracks { int i, status; status = Open_DB(argv[1],DB); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (DB->part) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } Trim_DB(DB); track = Load_Track(DB,"qual"); if (track != NULL) { QV_IDX = (int64 *) track->anno; QV = (uint8 *) track->data; } else { fprintf(stderr,"%s: Must have a 'qual' track, run DASqv\n",Prog_Name); exit (1); } track = Load_Track(DB,"trim"); if (track != NULL) { FILE *afile; char *aname; int extra, tracklen, size; DAZZ_EXTRA ex_hgap, ex_cest, ex_good, ex_bad; TRIM_IDX = (int64 *) track->anno; TRIM = (int *) track->data; for (i = 0; i <= DB->nreads; i++) TRIM_IDX[i] /= sizeof(int); // Get HGAP minimum, and good and bad qv thresholds from .trim extras aname = 
Strdup(Catenate(DB->path,".","trim",".anno"),"Allocating anno file"); if (aname == NULL) exit (1); afile = fopen(aname,"r"); fread(&tracklen,sizeof(int),1,afile); fread(&size,sizeof(int),1,afile); fseeko(afile,0,SEEK_END); extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int)); fseeko(afile,-extra,SEEK_END); ex_hgap.nelem = 0; if (Read_Extra(afile,aname,&ex_hgap) != 0) { fprintf(stderr,"%s: Hgap threshold extra missing from .trim track?\n",Prog_Name); exit (1); } ex_cest.nelem = 0; if (Read_Extra(afile,aname,&ex_cest) != 0) { fprintf(stderr,"%s: Coverage estimate extra missing from .trim track?\n",Prog_Name); exit (1); } ex_good.nelem = 0; if (Read_Extra(afile,aname,&ex_good) != 0) { fprintf(stderr,"%s: Good QV threshold extra missing from .trim track?\n",Prog_Name); exit (1); } ex_bad.nelem = 0; if (Read_Extra(afile,aname,&ex_bad) != 0) { fprintf(stderr,"%s: Bad QV threshold extra missing from .trim track?\n",Prog_Name); exit (1); } fclose(afile); HGAP_MIN = (int) ((int64 *) (ex_hgap.value))[0]; GOOD_QV = (int) ((int64 *) (ex_good.value))[0]; BAD_QV = (int) ((int64 *) (ex_bad.value))[0]; } else { fprintf(stderr,"%s: Must have a 'trim' track, run DAStrim\n",Prog_Name); exit (1); } } // For each .las block/file dpwd = PathTo(argv[1]); root = Root(argv[1],".db"); for (c = 2; c < argc; c++) { Block_Looper *parse; FILE *input; parse = Parse_Block_Arg(argv[c]); while ((input = Next_Block_Arg(parse)) != NULL) { DB_PART = 0; DB_FIRST = 0; DB_LAST = DB->nreads; // Determine if a .las block is being processed and if so get first and last read // from .db file { FILE *dbfile; char buffer[2*MAX_NAME+100]; char *p, *eptr; int i, part, nfiles, nblocks, cutoff, all, oindx; int64 size; p = rindex(Block_Arg_Root(parse),'.'); if (p != NULL) { part = strtol(p+1,&eptr,10); if (*eptr == '\0' && eptr != p+1) { dbfile = Fopen(Catenate(dpwd,"/",root,".db"),"r"); if (dbfile == NULL) exit (1); if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) SYSTEM_READ_ERROR for (i = 0; i < nfiles; i++) if 
(fgets(buffer,2*MAX_NAME+100,dbfile) == NULL) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3) SYSTEM_READ_ERROR for (i = 1; i <= part; i++) if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2) SYSTEM_READ_ERROR fclose(dbfile); DB_PART = part; } } } // Set up patch track { int len, size; if (DB_PART > 0) { PR_AFILE = Fopen(Catenate(dpwd,PATHSEP,root, Numbered_Suffix(".",DB_PART,".patch.anno")),"w"); PR_DFILE = Fopen(Catenate(dpwd,PATHSEP,root, Numbered_Suffix(".",DB_PART,".patch.data")),"w"); } else { PR_AFILE = Fopen(Catenate(dpwd,PATHSEP,root,".patch.anno"),"w"); PR_DFILE = Fopen(Catenate(dpwd,PATHSEP,root,".patch.data"),"w"); } if (PR_AFILE == NULL || PR_DFILE == NULL) exit (1); len = DB_LAST - DB_FIRST; size = 8; fwrite(&len,sizeof(int),1,PR_AFILE); fwrite(&size,sizeof(int),1,PR_AFILE); PR_INDEX = 0; fwrite(&PR_INDEX,sizeof(int64),1,PR_AFILE); } // Get trace point spacing information fread(&novl,sizeof(int64),1,input); fread(&TRACE_SPACING,sizeof(int),1,input); ANCHOR_THRESH = ANCHOR_MATCH * TRACE_SPACING; // Initialize statistics gathering if (VERBOSE) { npatch = 0; fpatch = 0; printf("\nDASpatch -g%d -b%d %s %s\n",GOOD_QV,BAD_QV,argv[1],argv[c]); } // Process each read pile make_a_pass(input,PATCH_GAPS,1); // If verbose output statistics summary to stdout if (VERBOSE) { if (fpatch == 0) printf(" All %d patches were successful\n",npatch); else printf(" %d out of %d total patches failed\n",fpatch,npatch); } fclose(PR_AFILE); fclose(PR_DFILE); fclose(input); } Free_Block_Arg(parse); } free(dpwd); free(root); Close_DB(DB); free(Prog_Name); exit (0); } DASCRUBBER-1.1/DASqv.c000066400000000000000000000470601327574206400141320ustar00rootroot00000000000000/******************************************************************************************* * * Using overlap pile for each read compute estimated intrinisic quality 
values * * Author: Gene Myers * Date : September 2015 * *******************************************************************************************/ #include #include #include #include #include "DB.h" #include "align.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif #undef QV_DEBUG static char *Usage = "[-v] [-c] ..."; #define MAXQV 50 // Max QV score is 50 #define MAXQV1 51 #define MINCOV 2 // To have a score must be covered >= MINCOV in each direction (must be >0) #define PARTIAL .20 // Partial terminal segments covering this percentage are scored static int VERBOSE; static int COVERAGE; // Estimated coverage of genome static int QV_DEEP; // # of best diffs to average for QV score static int HGAP_MIN; // Under this length do not process for HGAP static int TRACE_SPACING; // Trace spacing (from .las file) static int TBYTES; // Bytes per trace segment (from .las file) static DAZZ_DB _DB, *DB = &_DB; // Data base static int DB_FIRST; // First read of DB to process static int DB_LAST; // Last read of DB to process (+1) static int DB_PART; // 0 if all, otherwise block # static FILE *QV_AFILE; // .qual.anno static FILE *QV_DFILE; // .qual.data static int64 QV_INDEX; // Current index into .qual.data file // Statistics static int64 nreads, totlen; static int64 qgram[MAXQV1], sgram[MAXQV1]; // For each pile, calculate QV scores of the aread at tick spacing TRACE_SPACING static void CALCULATE_QVS(int aread, Overlap *ovls, int novl) { static int nmax = 0; static int *hist = NULL; static int *cist = NULL; static uint8 *qvec = NULL; static int partial; int alen, atick; int *tick, *cick; int i; alen = DB->reads[aread].rlen; atick = (alen + (TRACE_SPACING-1))/TRACE_SPACING; if (alen < HGAP_MIN) { fwrite(&QV_INDEX,sizeof(int64),1,QV_AFILE); return; } #if defined(QV_DEBUG) printf("AREAD %d",aread); if (novl == 0) printf(" EMPTY"); printf("\n"); #endif // QV SCORES // Allocate or expand data structures for qv calculation as needed if (atick > nmax) { nmax 
= atick*1.2 + 100; hist = (int *) Realloc(hist,nmax*MAXQV1*sizeof(int),"Allocating histograms"); cist = (int *) Realloc(cist,nmax*MAXQV1*sizeof(int),"Allocating histograms"); qvec = (uint8 *) Realloc(qvec,nmax*sizeof(uint8),"Allocating QV vector"); if (hist == NULL || cist == NULL || qvec == NULL) exit (1); for (i = MAXQV1*nmax-1; i >= 0; i--) hist[i] = cist[i] = 0; partial = PARTIAL*TRACE_SPACING; } // For every segment, fill histogram of match diffs for every one of the // atick intervals, building separate histograms, hist & cist, for forward // and reverse B-hits for (i = 0; i < novl; i++) { Path *path; uint16 *trace; int *ht; int tlen, abit; int a, b, x; path = &(ovls[i].path); trace = (uint16 *) path->trace; tlen = path->tlen; if (COMP(ovls[i].flags)) ht = cist; else ht = hist; b = 0; a = (path->abpos/TRACE_SPACING)*MAXQV1; abit = (path->abpos % TRACE_SPACING); if (abit != 0) { a += MAXQV1; b += 2; } abit = (path->aepos % TRACE_SPACING); if (abit != 0) tlen -= 2; while (b < tlen) { x = (int) ((200.*trace[b]) / (TRACE_SPACING + trace[b+1])); if (x > MAXQV) x = MAXQV; ht[a + x] += 1; a += MAXQV1; b += 2; } if (path->aepos == alen && abit >= partial) { x = (int) ((200.*trace[tlen]) / (abit + trace[tlen+1])); if (x > MAXQV) x = MAXQV; ht[a + x] += 1; } } // For every segment, qv score is the maximum of the averages of the QV_DEEP lowest // in the forward and reverse directions (if each is QV_DEEP), or the average // of overlap scores (if between MINCOV and QV_DEEP-1), or MAXQV if no overlaps at all. // Reset histogram for segment to zeros. 
tick = hist; cick = cist; for (i = 0; i < atick; i++) { int v, y; int qvn, qvc; int cntn, cntc; int sumn, sumc; #ifdef QV_DEBUG { int min, max; printf(" [%5d,%5d]:",i*TRACE_SPACING,(i+1)*TRACE_SPACING); for (v = 0; v <= MAXQV; v++) if (tick[v] > 0) break; min = v; for (v = MAXQV; v >= 0; v--) if (tick[v] > 0) break; max = v; for (v = min; v <= max; v++) if (tick[v] == 1) printf(" %2d",v); else if (tick[v] > 1) printf(" %2d(%d)",v,tick[v]); printf("\n :"); for (v = 0; v <= MAXQV; v++) if (cick[v] > 0) break; min = v; for (v = MAXQV; v >= 0; v--) if (cick[v] > 0) break; max = v; for (v = min; v <= max; v++) if (cick[v] == 1) printf(" %2d",v); else if (cick[v] > 1) printf(" %2d(%d)",v,cick[v]); } #endif for (v = 0; v <= MAXQV; v++) sgram[v] += tick[v] + cick[v]; cntn = sumn = 0; for (v = 0; v <= MAXQV; v++) { y = tick[v]; tick[v] = 0; cntn += y; sumn += y*v; if (cntn >= QV_DEEP) { sumn -= (cntn-QV_DEEP)*v; cntn = QV_DEEP; break; } } for (v++; v <= MAXQV; v++) tick[v] = 0; cntc = sumc = 0; for (v = 0; v <= MAXQV; v++) { y = cick[v]; cick[v] = 0; cntc += y; sumc += y*v; if (cntc >= QV_DEEP) { sumc -= (cntc-QV_DEEP)*v; cntc = QV_DEEP; break; } } for (v++; v <= MAXQV; v++) cick[v] = 0; if (cntn >= MINCOV) qvn = sumn/cntn; else qvn = MAXQV; if (cntc >= MINCOV) qvc = sumc/cntc; else qvc = MAXQV; if (qvn > qvc) qvec[i] = (uint8) qvn; else qvec[i] = (uint8) qvc; tick += MAXQV1; cick += MAXQV1; #ifdef QV_DEBUG printf(" >> %2d(%d) %2d(%d) = %2d <<\n",qvn,cntn,qvc,cntc,qvec[i]); #endif } // Accumulate qv histogram and append qv's to .qual file for (i = 0; i < atick; i++) qgram[qvec[i]] += 1; nreads += 1; totlen += alen; fwrite(qvec,sizeof(uint8),atick,QV_DFILE); QV_INDEX += atick; fwrite(&QV_INDEX,sizeof(int64),1,QV_AFILE); } // Read in each successive pile and call ACTION on it. 
Read in the traces only if // "trace" is nonzero static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int trace) { static Overlap *ovls = NULL; static int omax = 500; static uint16 *paths = NULL; static int pmax = 100000; int64 i, j, novl; int n, a; int pcur; int max; if (ovls == NULL) { ovls = (Overlap *) Malloc(sizeof(Overlap)*omax,"Allocating overlap buffer"); if (ovls == NULL) exit (1); } if (trace && paths == NULL) { paths = (uint16 *) Malloc(sizeof(uint16)*pmax,"Allocating path buffer"); if (paths == NULL) exit (1); } rewind(input); fread(&novl,sizeof(int64),1,input); fread(&TRACE_SPACING,sizeof(int),1,input); if (TRACE_SPACING <= TRACE_XOVR) TBYTES = sizeof(uint8); else TBYTES = sizeof(uint16); if (Read_Overlap(input,ovls) != 0) ovls[0].aread = INT32_MAX; else if (trace) { if (ovls[0].path.tlen > pmax) { pmax = 1.2*(ovls[0].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); if (paths == NULL) exit (1); } fread(paths,TBYTES,ovls[0].path.tlen,input); if (TBYTES == 1) { ovls[0].path.trace = paths; Decompress_TraceTo16(ovls); } } else fseek(input,TBYTES*ovls[0].path.tlen,SEEK_CUR); if (ovls[0].aread < DB_FIRST) { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n", Prog_Name,DB_PART); exit (1); } pcur = 0; n = max = 0; for (j = DB_FIRST; j < DB_LAST; j++) { ovls[0] = ovls[n]; a = ovls[0].aread; if (a != j) n = 0; else { if (trace) memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen); n = 1; pcur = ovls[0].path.tlen; while (1) { if (Read_Overlap(input,ovls+n) != 0) { ovls[n].aread = INT32_MAX; break; } if (trace) { if (pcur + ovls[n].path.tlen > pmax) { pmax = 1.2*(pcur+ovls[n].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); if (paths == NULL) exit (1); } fread(paths+pcur,TBYTES,ovls[n].path.tlen,input); if (TBYTES == 1) { ovls[n].path.trace = paths+pcur; Decompress_TraceTo16(ovls+n); } } else 
fseek(input,TBYTES*ovls[n].path.tlen,SEEK_CUR); if (ovls[n].aread != a) break; pcur += ovls[n].path.tlen; n += 1; if (n >= omax) { omax = 1.2*n + 100; ovls = (Overlap *) Realloc(ovls,sizeof(Overlap)*omax,"Expanding overlap buffer"); if (ovls == NULL) exit (1); } } if (n >= max) max = n; pcur = 0; for (i = 0; i < n; i++) { ovls[i].path.trace = paths+pcur; pcur += ovls[i].path.tlen; } } ACTION(j,ovls,n); } if (ovls[n].aread < INT32_MAX) { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n", Prog_Name,DB_PART); exit (1); } return (max); } int main(int argc, char *argv[]) { char *root, *dpwd; int64 novl; int c; DAZZ_EXTRA ex_hgap, ex_covr; DAZZ_EXTRA ex_cest, ex_qvs, ex_dif; char *cest_name = "Coverage Estimate"; char *qvs_name = "Histogram of QVs"; char *dif_name = "Histogram of Tile Differences"; int64 cover64; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DASqv") COVERAGE = -1; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("v") break; case 'c': ARG_POSITIVE(COVERAGE,"Voting depth") break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc < 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n"); fprintf(stderr," -c: Use this as the average coverage (not DAScover estimate).\n"); exit (1); } } // Open trimmed DB { int status; status = Open_DB(argv[1],DB); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (DB->part) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } Trim_DB(DB); } // Get .covr track information { FILE *afile; char *aname; int extra, cmax; int64 *cgram; aname = Strdup(Catenate(DB->path,".","covr",".anno"),"Allocating anno file"); if (aname == NULL) exit (1); afile = fopen(aname,"r"); if (afile == NULL) { 
fprintf(stderr,"%s: Must have a 'covr' track, run DAScover\n",Prog_Name); exit (1); } fseeko(afile,0,SEEK_END); extra = ftell(afile) - sizeof(int)*2; fseeko(afile,-extra,SEEK_END); ex_covr.nelem = 0; if (Read_Extra(afile,aname,&ex_covr) != 0) { fprintf(stderr,"%s: Histogram extra missing from .covr track?\n",Prog_Name); exit (1); } ex_hgap.nelem = 0; if (Read_Extra(afile,aname,&ex_hgap) != 0) { fprintf(stderr,"%s: Hgap threshold extra missing from .covr track?\n",Prog_Name); exit (1); } fclose(afile); HGAP_MIN = (int) ((int64 *) (ex_hgap.value))[0]; cgram = (int64 *) (ex_covr.value); cmax = ex_covr.nelem - 1; if (COVERAGE < 0) { int i; i = 0; while (cgram[i+1] < cgram[i]) i += 1; for (COVERAGE = i++; i < cmax; i++) if (cgram[COVERAGE] < cgram[i]) COVERAGE = i; } if (COVERAGE >= 40) QV_DEEP = COVERAGE/8; else if (COVERAGE >= 20) QV_DEEP = 5; else if (COVERAGE >= 4) QV_DEEP = COVERAGE/4; else { fprintf(stderr,"%s: Average coverage is too low (< 4X), cannot infer qv's\n",Prog_Name); exit (1); } } // Setup extras ex_cest.vtype = DB_INT; // Estimated coverage (same for every .las) ex_cest.nelem = 1; ex_cest.accum = DB_EXACT; ex_cest.name = cest_name; cover64 = COVERAGE; ex_cest.value = &cover64; ex_qvs.vtype = DB_INT; // Histogram of MAXQV1 trace-point diff counts ex_qvs.nelem = MAXQV1; ex_qvs.accum = DB_SUM; ex_qvs.name = qvs_name; ex_qvs.value = &qgram; ex_dif.vtype = DB_INT; // Histogram of MAXQV1 intrinisic qv counts ex_dif.nelem = MAXQV1; ex_dif.accum = DB_SUM; ex_dif.name = dif_name; ex_dif.value = &sgram; // For each .las file do dpwd = PathTo(argv[1]); root = Root(argv[1],".db"); for (c = 2; c < argc; c++) { Block_Looper *parse; FILE *input; parse = Parse_Block_Arg(argv[c]); while ((input = Next_Block_Arg(parse)) != NULL) { DB_PART = 0; DB_FIRST = 0; DB_LAST = DB->nreads; // Determine if overlap block is being processed and if so get first and last read // from .db file { FILE *dbfile; char buffer[2*MAX_NAME+100]; char *p, *eptr; int i, part, nfiles, nblocks, 
cutoff, all, oindx; int64 size; p = rindex(Block_Arg_Root(parse),'.'); if (p != NULL) { part = strtol(p+1,&eptr,10); if (*eptr == '\0' && eptr != p+1) { dbfile = Fopen(Catenate(dpwd,"/",root,".db"),"r"); if (dbfile == NULL) exit (1); if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) SYSTEM_READ_ERROR for (i = 0; i < nfiles; i++) if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3) SYSTEM_READ_ERROR for (i = 1; i <= part; i++) if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2) SYSTEM_READ_ERROR fclose(dbfile); DB_PART = part; } } } // Set up preliminary trimming track if (DB_PART > 0) { QV_AFILE = Fopen(Catenate(dpwd,PATHSEP,root, Numbered_Suffix(".",DB_PART,".qual.anno")),"w"); QV_DFILE = Fopen(Catenate(dpwd,PATHSEP,root, Numbered_Suffix(".",DB_PART,".qual.data")),"w"); } else { QV_AFILE = Fopen(Catenate(dpwd,PATHSEP,root,".qual.anno"),"w"); QV_DFILE = Fopen(Catenate(dpwd,PATHSEP,root,".qual.data"),"w"); } if (QV_AFILE == NULL || QV_DFILE == NULL) exit (1); { int size, length; length = DB_LAST - DB_FIRST; size = sizeof(int64); fwrite(&length,sizeof(int),1,QV_AFILE); fwrite(&size,sizeof(int),1,QV_AFILE); QV_INDEX = 0; fwrite(&QV_INDEX,sizeof(int64),1,QV_AFILE); } // Get trace point spacing information fread(&novl,sizeof(int64),1,input); fread(&TRACE_SPACING,sizeof(int),1,input); // Initialize statistics gathering { int i; nreads = 0; totlen = 0; for (i = 0; i <= MAXQV; i++) qgram[i] = sgram[i] = 0; } if (VERBOSE) { printf("\n\nDASqv"); if (HGAP_MIN > 0) printf(" -H%d",HGAP_MIN); printf(" -c%d %s %s\n\n",COVERAGE,argv[1],argv[c]); fflush(stdout); } // Process each read pile make_a_pass(input,CALCULATE_QVS,1); // Write out extras and close .qual track Write_Extra(QV_AFILE,&ex_hgap); Write_Extra(QV_AFILE,&ex_cest); Write_Extra(QV_AFILE,&ex_qvs); Write_Extra(QV_AFILE,&ex_dif); 
fclose(QV_AFILE); fclose(QV_DFILE); fclose(input); // If verbose output statistics summary to stdout if (VERBOSE) { int i; int64 ssum, qsum; int64 stotal, qtotal; int gval, bval; printf("\n Input: "); Print_Number(nreads,7,stdout); printf("reads, "); Print_Number(totlen,12,stdout); printf(" bases"); if (HGAP_MIN > 0) { printf(" (another "); Print_Number((DB_LAST-DB_FIRST) - nreads,0,stdout); printf(" were < H-length)"); } printf("\n"); stotal = qtotal = 0; for (i = 0; i <= MAXQV; i++) { stotal += sgram[i]; qtotal += qgram[i]; } printf("\n Histogram of q-values (average %d best)\n",2*QV_DEEP); printf("\n Input QV\n"); qsum = qgram[MAXQV]; ssum = sgram[MAXQV]; printf("\n %2d: %9lld %5.1f%% %9lld %5.1f%%\n\n", MAXQV,sgram[MAXQV],(100.*ssum)/stotal,qgram[MAXQV],(100.*qsum)/qtotal); bval = gval = -1; qtotal -= qsum; stotal -= ssum; ssum = qsum = 0; for (i = MAXQV-1; i >= 0; i--) if (qgram[i] > 0) { ssum += sgram[i]; qsum += qgram[i]; printf(" %2d: %9lld %5.1f%% %9lld %5.1f%%\n", i,sgram[i],(100.*ssum)/stotal, qgram[i],(100.*qsum)/qtotal); if ((100.*qsum)/qtotal > 7. && bval < 0) bval = i; if ((100.*qsum)/qtotal > 20. && gval < 0) gval = i; } printf("\n Recommend \'DAStrim -g%d -b%d'\n\n",gval,bval); } } Free_Block_Arg(parse); } // Clean up free(dpwd); free(root); Close_DB(DB); free(Prog_Name); exit (0); } DASCRUBBER-1.1/DASrealign.c000066400000000000000000001103541327574206400151220ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2015, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. 
* * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Map and extend every overlap to the patched read framework. 
* * Author: Gene Myers * Date : March 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #undef OUTLINE #undef SHOW_MAP #undef TRACE #undef SHOW_FINAL #undef SHOW_ALIGNMENTS static char *Usage = " [-v] [-l] "; static int TRACE_SPACING; // Trace spacing (from .las file) static int TBYTES; // Bytes per trace segment (from .las file) static int SMALL; // Trace points can fit in a byte static int MIN_LEN; // Minimum piece length static DAZZ_DB _ADB, *ADB = &_ADB; // A-read database static DAZZ_DB _BDB, *BDB = &_BDB; // B-read database static int ADB_ofirst, ADB_olast; static int BDB_ofirst, BDB_olast; static int AFIRST, BFIRST; static int64 *AMAP_IDX; // Map to originals for A-reads static int *AMAP; static int64 *BMAP_IDX; // Map to orignals for B-reads static int *BMAP; static int *IAMAP, *IBMAP; // Inverse map, old x -> new IMAP[x]..IMAP[x+1]-1 static FILE *OUTPUT; // The new set of overlaps static int64 WOVLS; static int VERBOSE; int lowTK(int a) { return (a/TRACE_SPACING); } int hghTK(int a) { return ((a+(TRACE_SPACING-1))/TRACE_SPACING); } int lowTP(int a) { return ((a/TRACE_SPACING)*TRACE_SPACING); } /******************************************************************************************* * * Finger iterator: allows one to map the next trace point of a read to its * patched read as the trace points are examined in order. * *******************************************************************************************/ typedef struct { int cidx; int lidx; int dist; int last; int blen; int *map; } Finger; // A finger is initialized with init_finger where cur is suppled by the user, and the // patch sequence is in GRIM[gb..ge]. 
//  Point finger f at the start of the map slice map[mb..me-1].  As read by good() and
//    where(), the slice is a run of (beg,end,patch) triples with the last patch omitted:
//    [beg,end] is an unpatched segment in original-read coordinates and patch is the
//    amount the patched read advances between that segment and the next.
//  blen == 0 walks the slice left to right.  blen != 0 walks it right to left and
//    reports every coordinate as blen - x, i.e. in the complemented read.

static inline void init_finger(Finger *f, int *map, int mb, int me, int blen)
{
  const int forward = (blen == 0);

  f->map  = map;
  f->blen = blen;
  f->dist = 0;                        //  patched-read position of f->last

  if (forward) {
    f->last = map[mb];                //  start of the first segment
    f->lidx = me;                     //  one past the slice
    f->cidx = mb + 3;                 //  start of the second segment
  } else {
    f->last = blen - map[me - 1];     //  complemented end of the last segment
    f->lidx = mb - 1;                 //  one before the slice
    f->cidx = me - 4;                 //  end of the second to last segment
  }
}

//  Advance finger to position pos and return position in patched read, if known, -1 otherwise
//    Positions must be presented in increasing order: the finger only ever moves forward.

static inline int good(Finger *cur, int pos)
{
  int *const map  = cur->map;
  const int  blen = cur->blen;
  int        seg_end;                 //  last position of the segment the finger is in

  if (blen == 0) {
    for (;;) {
      const int c = cur->cidx;

      if (c >= cur->lidx || pos < map[c])
        break;
      //  Step over the current segment and the patch that follows it
      cur->dist += (map[c - 2] - cur->last) + map[c - 1];
      cur->last  = map[c];
      cur->cidx  = c + 3;
    }
    seg_end = map[cur->cidx - 2];
  } else {
    for (;;) {
      const int c = cur->cidx;

      if (c <= cur->lidx || pos < blen - map[c])
        break;
      cur->dist += ((blen - map[c + 2]) - cur->last) + map[c + 1];
      cur->last  = blen - map[c];
      cur->cidx  = c - 3;
    }
    seg_end = blen - map[cur->cidx + 2];
  }

  //  pos is known only if it lies inside the segment [cur->last, seg_end]
  if (pos > seg_end || pos < cur->last)
    return (-1);
  return (cur->dist + (pos - cur->last));
}

//  Advance finger to position pos, and return best estimate of position in patched read,
//    or -1 if outside the bounds of the patched read.  acc points at the distance the
//    estimate is from a non-patched segment (0 if mapped).
//  Like good(), but a position that falls in the gap between two unpatched segments is
//    interpolated linearly across the patch instead of being rejected.  *acc is written
//    only when the return value is >= 0.

static inline int where(Finger *cur, int pos, int *acc)
{
  int *const map  = cur->map;
  const int  blen = cur->blen;
  int        seg_end;      //  end of the segment the finger is in
  int        gap_end;      //  start of the following segment
  int        patch;        //  patched-read length of the gap between them
  int        into, left;

  //  Move the finger up to the segment at or before pos

  if (blen == 0) {
    for (;;) {
      const int c = cur->cidx;

      if (c >= cur->lidx || pos < map[c])
        break;
      cur->dist += (map[c - 2] - cur->last) + map[c - 1];
      cur->last  = map[c];
      cur->cidx  = c + 3;
    }
    seg_end = map[cur->cidx - 2];
  } else {
    for (;;) {
      const int c = cur->cidx;

      if (c <= cur->lidx || pos < blen - map[c])
        break;
      cur->dist += ((blen - map[c + 2]) - cur->last) + map[c + 1];
      cur->last  = blen - map[c];
      cur->cidx  = c - 3;
    }
    seg_end = blen - map[cur->cidx + 2];
  }

  //  Inside (or before) the current segment: exact answer or nothing

  if (pos <= seg_end) {
    if (pos < cur->last)
      return (-1);
    *acc = 0;
    return (cur->dist + (pos - cur->last));
  }

  //  Beyond the segment: either past the last segment, or in the gap before the next one.
  //    The next-segment entries are only read once the finger is known not to be exhausted.

  if (blen == 0) {
    if (cur->cidx >= cur->lidx)
      return (-1);
    gap_end = map[cur->cidx];
    patch   = map[cur->cidx - 1];
  } else {
    if (cur->cidx <= cur->lidx)
      return (-1);
    gap_end = blen - map[cur->cidx];
    patch   = map[cur->cidx + 1];
  }

  into = pos - seg_end;
  left = gap_end - pos;
  *acc = (into < left) ? into : left;        //  distance to the nearer segment
  return (cur->dist + (seg_end - cur->last) + ((1.*into)/(gap_end - seg_end)) * patch);
}

/*******************************************************************************************
 *
 *  Trace point mapping:
 *     recon makes a trace with all the mappable pairs
 *     when there are no mappable pairs, estimate finds the estimated point position closest
 *       to a mappable region.
* *******************************************************************************************/ static int recon(Path *image, Path *path, Finger *afinger, Finger *bfinger) { static int tmax = -1; static uint16 *itrace = NULL; int ae, be; int al, bl; int an, bn; int t, tl, df; uint16 *strace = ((uint16 *) (path->trace)); if (path->tlen > tmax) { tmax = 1.2*path->tlen + 100; itrace = (uint16 *) Realloc(itrace,sizeof(uint16)*tmax,"Reallocating image trace"); } image->trace = itrace; #ifdef TRACE printf(" Backbone:\n"); fflush(stdout); #endif df = 0; tl = -1; al = bl = 0; ae = lowTP(path->abpos); be = path->bbpos; if ((an = good(afinger,path->abpos)) >= 0 && (bn = good(bfinger,be)) >= 0) { image->abpos = al = an; image->bbpos = bl = bn; tl = 0; #ifdef TRACE printf(" %5d,%5d -> %5d,%5d\n",path->abpos,be,an,bn); fflush(stdout); #endif } for (t = 1; t < path->tlen; t += 2) { ae += TRACE_SPACING; be += strace[t]; if (ae > path->aepos) ae = path->aepos; if (tl >= 0) df += strace[t-1]; if ((an = good(afinger,ae)) >= 0 && (bn = good(bfinger,be)) >= 0) { if (tl < 0) { image->abpos = an; image->bbpos = bn; tl = 0; } else { itrace[tl] = an-al; itrace[tl+1] = bn-bl; tl += 2; } image->aepos = al = an; image->bepos = bl = bn; image->diffs = df; #ifdef TRACE printf(" %5d,%5d -> %5d,%5d\n",ae,be,an,bn); fflush(stdout); #endif } } image->tlen = tl; if (tl <= 0) return (0); else return (1); } static int estimate(Path *path, Finger *afinger, Finger *bfinger, int *bsta, int *bstb, int *acc) { int ae, be; int an, bn; int best, adst, bdst; int t; uint16 *strace = ((uint16 *) (path->trace)); *bsta = *bstb = -1; best = INT32_MAX; #ifdef TRACE printf(" Point Estimate:\n"); fflush(stdout); #endif ae = lowTP(path->abpos); be = path->bbpos; if ((an = where(afinger,path->abpos,&adst)) >= 0 && (bn = where(bfinger,be,&bdst)) >= 0) { best = adst + bdst; *bsta = an; *bstb = bn; #ifdef TRACE printf(" %5d,%5d -> %5d(%d),%5d(%d)\n",path->abpos,be,an,adst,bn,bdst); fflush(stdout); #endif } for (t = 1; t < 
path->tlen; t += 2) { ae += TRACE_SPACING; be += strace[t]; if (ae > path->aepos) ae = path->aepos; if ((an = where(afinger,ae,&adst)) >= 0 && (bn = where(bfinger,be,&bdst)) >= 0) { if (adst + bdst < best) { best = adst + bdst; *bsta = an; *bstb = bn; } #ifdef TRACE printf(" %5d,%5d -> %5d(%d),%5d(%d)\n",ae,be,an,adst,bn,bdst); fflush(stdout); #endif } } *acc = best; return (*bsta >= 0); } #ifdef SHOW_MAP static void print_map(int *map, int mb, int me, int clen) { int b, dist; if (clen == 0) { printf(" n"); for (b = mb; b < me; b += 3) { printf(" [%5d,%5d]",map[b],map[b+1]); if (b+2 < me) printf(" %5d",map[b+2]); } printf("\n"); printf(" "); dist = 0; for (b = mb; b < me; b += 3) { printf(" [%5d,%5d]",dist,dist+(map[b+1]-map[b])); dist += map[b+1]-map[b]; if (b+2 < me) { printf(" %5d",map[b+2]); dist += map[b+2]; } } printf("\n"); } else { printf(" c"); for (b = me; b >= mb; b -= 3) { printf(" [%5d,%5d]",clen-map[b-1],clen-map[b-2]); if (b-3 > mb) printf(" %5d",map[b-3]); } printf("\n"); printf(" "); dist = 0; for (b = me; b >= mb; b -= 3) { printf(" [%5d,%5d]",dist,dist+(map[b-1]-map[b-2])); dist += map[b-1]-map[b-2]; if (b-3 > mb) { printf(" %5d",map[b-3]); dist += map[b-3]; } } printf("\n"); } } #endif #ifdef SHOW_FINAL static void show_overlap(Overlap *ovl) { int i, a, b; uint16 *t; t = (uint16 *) (ovl->path.trace); a = ovl->path.abpos; b = ovl->path.bbpos; for (i = 0; i < ovl->path.tlen; i += 2) { a += t[i]; b += t[i+1]; printf(" %5d %5d :: %5d %5d\n",t[i],t[i+1],a,b); fflush(stdout); } } #endif static void convert_trace(Path *path) { int ab, ae; int t; uint16 *trace = ((uint16 *) (path->trace)); ae = lowTP(path->abpos); ab = path->abpos; for (t = 0; t < path->tlen; t += 2) { ae += TRACE_SPACING; if (ae > path->aepos) ae = path->aepos; #ifdef TRACE printf(" %5d,%5d -> %5d,%5d\n",trace[t],trace[t+1],ae-ab,trace[t+1]); fflush(stdout); #endif trace[t] = ae-ab; ab = ae; } } // Produce the concatentation of path1 and path2 where they are known to meet at // the 
trace point with coordinate ap. Place this result in a big growing buffer, // that gets reset when fusion is called with path1 = NULL static void fusion(Path *path1, Path *path2, int wch) { static uint16 *paths = NULL; static int pmax = 0; static int ptop = 0; int k; int len; uint16 *trace; if (path1 == NULL) { ptop = 0; return; } len = path1->tlen + path2->tlen; if (ptop + len >= pmax) { pmax = 1.2*(ptop+len) + 1000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Allocating paths"); if (paths == NULL) exit (1); } trace = paths+ptop; ptop += len; len = 0; if (path1->tlen > 0) { uint16 *t = (uint16 *) (path1->trace); for (k = 0; k < path1->tlen; k += 2) { trace[len++] = t[k]; trace[len++] = t[k+1]; } } if (path2->tlen > 0) { uint16 *t = (uint16 *) (path2->trace); for (k = 0; k < path2->tlen; k += 2) { trace[len++] = t[k]; trace[len++] = t[k+1]; } } if (wch == 1) { path1->aepos = path2->aepos; path1->bepos = path2->bepos; path1->diffs += path2->diffs; path1->trace = trace; path1->tlen = len; } else { path2->abpos = path1->abpos; path2->bbpos = path1->bbpos; path2->diffs += path1->diffs; path2->trace = trace; path2->tlen = len; } } static void EXTENDER(int aread, Overlap *ovls, int novl) { Finger _afinger, *afinger = &_afinger; Finger _bfinger, *bfinger = &_bfinger; static Overlap _ovla, *ovla = &_ovla; static Path *ipath = &_ovla.path; static Path rpath, fpath; static Alignment _ralign, *ralign = &_ralign; static Alignment _falign, *falign = &_falign; static Work_Data *work = NULL; static Align_Spec *spec; int ap, alast; if (aread < ADB_ofirst || aread >= ADB_olast) return; if (work == NULL) { spec = New_Align_Spec(.70,100,ADB->freq,1); work = New_Work_Data(); ralign->path = &rpath; falign->path = &fpath; } alast = IAMAP[aread+1]; for (ap = IAMAP[aread]; ap < alast; ap++) { int mb, me; int aend, abeg, alen; mb = AMAP_IDX[ap]+2; me = AMAP_IDX[ap+1]; abeg = AMAP[mb]; aend = AMAP[me-1]; if (aend - abeg < MIN_LEN) continue; ralign->aseq = falign->aseq = ((char *) 
ADB->bases) + ADB->reads[ap].boff; ralign->alen = falign->alen = alen = ADB->reads[ap].rlen; #ifdef OUTLINE printf("AREAD %d -> %d [%d,%d]\n",aread,ap,abeg,aend); fflush(stdout); #endif #ifdef SHOW_MAP print_map(AMAP,mb,me,0); #endif { int o, ob, oe; Path *path; int bread; int bp, blast; for (ob = 0; ob < novl; ob = oe) { bread = ovls[ob].bread; for (oe = ob+1; oe < novl && ovls[oe].bread == bread; oe += 1) ; if (bread < BDB_ofirst || bread >= BDB_olast) continue; blast = IBMAP[bread+1]; for (bp = IBMAP[bread]; bp < blast; bp++) { int hb, he; int bend, bbeg, blen; int alpos, clen; hb = BMAP_IDX[bp]+2; he = BMAP_IDX[bp+1]; bbeg = BMAP[hb]; bend = BMAP[he-1]; if (bend - bbeg < MIN_LEN) continue; #ifdef OUTLINE printf(" BREAD %d->%d [%d,%d]\n",bread,bp,bbeg,bend); fflush(stdout); #endif #ifdef SHOW_MAP print_map(BMAP,hb,he,0); #endif alpos = -1; for (o = ob; o < oe; o++) { int bbreal, bereal; path = &(ovls[o].path); if (COMP(ovls[o].flags)) { clen = BMAP[hb-1]; bbreal = clen-path->bepos; bereal = clen-path->bbpos; } else { clen = 0; bbreal = path->bbpos; bereal = path->bepos; } #ifdef OUTLINE printf(" OVL %d: [%d,%d] %c [%d,%d]\n",o, path->abpos,path->aepos,(clen==0)?'n':'c',bbreal,bereal); fflush(stdout); #endif if (path->abpos <= aend-MIN_LEN && path->aepos >= abeg+MIN_LEN && bbreal <= bend-MIN_LEN && bereal >= bbeg+MIN_LEN) { ralign->bseq = falign->bseq = ((char *) BDB->bases) + BDB->reads[bp].boff; ralign->blen = falign->blen = blen = BDB->reads[bp].rlen; ralign->flags = falign->flags = ovls[o].flags; if (COMP(ralign->flags)) Complement_Seq(ralign->bseq,blen); #ifdef SHOW_MAP print_map(BMAP,hb,he,clen); #endif init_finger(afinger,AMAP,mb,me,0); init_finger(bfinger,BMAP,hb,he,clen); if ( ! 
recon(ipath,path,afinger,bfinger)) { int apos, bpos, acc, len, diag; init_finger(afinger,AMAP,mb,me,0); init_finger(bfinger,BMAP,hb,he,clen); if (estimate(path,afinger,bfinger,&apos,&bpos,&acc)) if (apos > alpos) { diag = apos-bpos; acc /= 2; if (apos + acc > alen) acc = alen-apos; if (bpos + acc > blen) acc = blen-bpos; if (apos < acc) acc = apos; if (bpos < acc) acc = bpos; if (acc > 500) acc = 500; acc *= 2; #ifdef OUTLINE printf(" Trying: %d,%d + %d\n",apos,bpos,acc); fflush(stdout); #endif Local_Alignment(ralign,work,spec, diag-acc,diag+acc,apos+bpos,-1,-1); #ifdef OUTLINE printf(" Local: <%d,%d> -> <%d,%d>\n", ralign->path->abpos,ralign->path->bbpos, ralign->path->aepos,ralign->path->bepos); fflush(stdout); #endif len = ralign->path->aepos - ralign->path->abpos; if (len >= MIN_LEN && ralign->path->diffs <= .35*len) { ovla->aread = ap; ovla->bread = bp; ovla->flags = ralign->flags; _ovla.path = *(ralign->path); convert_trace(ralign->path); #ifdef OUTLINE printf(" Final: %d[%d..%d] vs %d[%d..%d] %c d=%d\n", ovla->aread,ovla->path.abpos,ovla->path.aepos, ovla->bread,ovla->path.bbpos,ovla->path.bepos, (COMP(ovla->flags) ? 
'c' : 'n'), ovla->path.diffs); fflush(stdout); #endif #ifdef SHOW_FINAL show_overlap(ovla); #endif #ifdef SHOW_ALIGNMENTS Compute_Trace_IRR(ralign,work,GREEDIEST); Print_Alignment(stdout,ralign,work,4,80,10,0,6); #else Write_Overlap(OUTPUT,ovla,sizeof(uint16)); WOVLS += 1; #endif alpos = ralign->path->aepos; #ifdef OUTLINE printf(" ACCEPT\n"); #endif } #ifdef OUTLINE else printf(" REJECT\n"); #endif } #ifdef OUTLINE else printf(" SKIP\n"); else printf(" NO OVERLAP\n"); fflush(stdout); #endif } else if (ipath->aepos > alpos) { int ab, bb; int ae, be; int ar, br; int af, bf; ab = ipath->abpos; bb = ipath->bbpos; ae = ipath->aepos; be = ipath->bepos; ar = ab; br = bb; if (ab > 0 && bb > 0) { Find_Extension(ralign,work,spec,ab-bb,ab+bb,-1,-1,1); ar = ralign->path->abpos; br = ralign->path->bbpos; #ifdef OUTLINE printf(" Rev: (%d,%d)",ab,bb); printf(" -> (%d,%d)",ar,br); printf(" %d",ralign->path->diffs); fflush(stdout); #endif if (ar == 0 || br == 0 || ralign->path->diffs <= .35*(ab-ar)) { #ifdef OUTLINE printf(" OK\n"); fflush(stdout); #endif if (ab - 10 < ar) { uint16 *trace = (uint16 *) ipath->trace; int tlen = ipath->tlen; if (tlen > 0) { trace[0] += ab - ar; trace[1] += bb - br; } ipath->abpos = ar; ipath->bbpos = br; } else { convert_trace(ralign->path); fusion(ralign->path,ipath,2); } } else { ar = ab; br = bb; #ifdef OUTLINE printf(" NOTOK\n"); fflush(stdout); #endif } } af = ae; bf = be; if (ae < alen && be < blen) { Find_Extension(falign,work,spec,ae-be,ae+be,-1,-1,0); af = falign->path->aepos; bf = falign->path->bepos; #ifdef OUTLINE printf(" Fow: (%d,%d)",ae,be); printf(" -> (%d,%d)",af,bf); printf(" %d",falign->path->diffs); fflush(stdout); #endif if (af == alen || bf == blen || falign->path->diffs <= .35*(af-ae)) { #ifdef OUTLINE printf(" OK\n"); fflush(stdout); #endif if (ae + 10 > af) { uint16 *trace = (uint16 *) ipath->trace; int tlen = ipath->tlen; if (tlen > 0) { trace[tlen-2] += af-ae; trace[tlen-1] += bf-be; } ipath->aepos = af; ipath->bepos = bf; 
} else { convert_trace(falign->path); fusion(ipath,falign->path,1); } } else { af = ae; bf = be; #ifdef OUTLINE printf(" NOTOK\n"); fflush(stdout); #endif } } alpos = af; if (af-ar >= MIN_LEN) { ovla->aread = AFIRST + ap; ovla->bread = BFIRST + bp; ovla->flags = ralign->flags; #ifdef OUTLINE printf(" Final: %d[%d..%d] vs %d[%d..%d] %c d=%d\n", ovla->aread,ovla->path.abpos,ovla->path.aepos, ovla->bread,ovla->path.bbpos,ovla->path.bepos, (COMP(ovla->flags) ? 'c' : 'n'), ovla->path.diffs); fflush(stdout); #endif #ifdef SHOW_FINAL show_overlap(ovla); #endif #ifdef SHOW_ALIGNMENTS fpath = *ipath; Compute_Trace_IRR(falign,work,GREEDIEST); Print_Alignment(stdout,falign,work,4,80,10,0,6); #else Write_Overlap(OUTPUT,ovla,sizeof(uint16)); WOVLS += 1; #endif } #ifdef OUTLINE else printf(" NO OVERLAP\n"); #endif fusion(NULL,NULL,0); } #ifdef OUTLINE else printf(" SKIP\n"); fflush(stdout); #endif if (COMP(ralign->flags)) Complement_Seq(ralign->bseq,blen); } } } } } } } static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int trace) { static Overlap *ovls = NULL; static int omax = 500; static uint16 *paths = NULL; static int pmax = 100000; int64 i, j, novl; int n, a; int pcur; int max; if (ovls == NULL) { ovls = (Overlap *) Malloc(sizeof(Overlap)*omax,"Allocating overlap buffer"); if (ovls == NULL) exit (1); } if (trace && paths == NULL) { paths = (uint16 *) Malloc(sizeof(uint16)*pmax,"Allocating path buffer"); if (paths == NULL) exit (1); } rewind(input); fread(&novl,sizeof(int64),1,input); fread(&TRACE_SPACING,sizeof(int),1,input); if (TRACE_SPACING <= TRACE_XOVR) { TBYTES = sizeof(uint8); SMALL = 1; } else { TBYTES = sizeof(uint16); SMALL = 0; } if (Read_Overlap(input,ovls) != 0) ovls[0].aread = INT32_MAX; else if (trace) { if (ovls[0].path.tlen > pmax) { pmax = 1.2*(ovls[0].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); if (paths == NULL) exit (1); } fread(paths,TBYTES,ovls[0].path.tlen,input); if (TBYTES 
== 1) { ovls[0].path.trace = paths; Decompress_TraceTo16(ovls); } } else fseek(input,TBYTES*ovls[0].path.tlen,SEEK_CUR); pcur = 0; n = max = 0; for (j = ovls[0].aread; j < ADB_olast; j++) { ovls[0] = ovls[n]; a = ovls[0].aread; if (a != j) n = 0; else { if (trace) memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen); n = 1; pcur = ovls[0].path.tlen; while (1) { if (Read_Overlap(input,ovls+n) != 0) { ovls[n].aread = INT32_MAX; break; } if (trace) { if (pcur + ovls[n].path.tlen > pmax) { pmax = 1.2*(pcur+ovls[n].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); if (paths == NULL) exit (1); } fread(paths+pcur,TBYTES,ovls[n].path.tlen,input); if (TBYTES == 1) { ovls[n].path.trace = paths+pcur; Decompress_TraceTo16(ovls+n); } } else fseek(input,TBYTES*ovls[n].path.tlen,SEEK_CUR); if (ovls[n].aread != a) break; pcur += ovls[n].path.tlen; n += 1; if (n >= omax) { omax = 1.2*n + 100; ovls = (Overlap *) Realloc(ovls,sizeof(Overlap)*omax,"Expanding overlap buffer"); if (ovls == NULL) exit (1); } } if (n >= max) max = n; pcur = 0; for (i = 0; i < n; i++) { ovls[i].path.trace = paths+pcur; pcur += ovls[i].path.tlen; } } if (j >= ADB_ofirst) ACTION(j,ovls,n); } if (ovls[n].aread < INT32_MAX) { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n", Prog_Name,ADB->part); exit (1); } return (max); } int main(int argc, char *argv[]) { DAZZ_TRACK *map1, *map2; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DASrealign") MIN_LEN = 800; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("v") break; case 'l': ARG_POSITIVE(MIN_LEN,"Minimum piece length to recompute") break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc != 5) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n"); fprintf(stderr," -l: minimum length 
alignment length.\n"); exit (1); } } { int status; status = Open_DB(argv[1],ADB); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if ( ! ADB->part) { fprintf(stderr,"%s: Must be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } Trim_DB(ADB); Read_All_Sequences(ADB,0); } if (strcmp(argv[1],argv[2]) == 0) BDB = ADB; else { int status; status = Open_DB(argv[2],BDB); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if ( ! BDB->part) { fprintf(stderr,"%s: Must be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } Trim_DB(BDB); Read_All_Sequences(BDB,0); } AFIRST = ADB->tfirst; BFIRST = BDB->tfirst; map1 = Load_Track(ADB,"map"); if (map1 != NULL) { int i, o, q, n; AMAP_IDX = (int64 *) map1->anno; AMAP = (int *) map1->data; for (i = 0; i <= ADB->nreads; i++) AMAP_IDX[i] /= sizeof(int); ADB_ofirst = AMAP[AMAP_IDX[0]]; ADB_olast = AMAP[AMAP_IDX[ADB->nreads-1]]+1; IAMAP = (int *) Malloc(sizeof(int)*((ADB_olast-ADB_ofirst)+1),"Inverse map") - ADB_ofirst; IAMAP[q = ADB_olast] = n = ADB->nreads; for (i = ADB->nreads-1; i >= 0; i--) { o = AMAP[AMAP_IDX[i]]; if (q > o) while (--q > o) IAMAP[q] = n; IAMAP[o] = n = i; } } else { fprintf(stderr,"%s: Must have a 'map' track, run DASedit\n",Prog_Name); exit (1); } if (BDB == ADB) { map2 = map1; BMAP_IDX = AMAP_IDX; BMAP = AMAP; BDB_ofirst = ADB_ofirst; BDB_olast = ADB_olast ; IBMAP = IAMAP; } else { map2 = Load_Track(BDB,"map"); if (map2 != NULL) { int i, o, q, n; BMAP_IDX = (int64 *) map2->anno; BMAP = (int *) map2->data; for (i = 0; i <= BDB->nreads; i++) BMAP_IDX[i] /= sizeof(int); BDB_ofirst = BMAP[BMAP_IDX[0]]; BDB_olast = BMAP[BMAP_IDX[BDB->nreads-1]]+1; IBMAP = (int *) Malloc(sizeof(int)*((BDB_olast-BDB_ofirst)+1),"Inverse map") - BDB_ofirst; IBMAP[q = BDB_olast] = n = BDB->nreads; for (i = BDB->nreads-1; i >= 0; i--) { o = 
BMAP[BMAP_IDX[i]]; if (q > o) while (--q > o) IBMAP[q] = n; IBMAP[o] = n = i; } } else { fprintf(stderr,"%s: Must have a 'map' track, run DASedit\n",Prog_Name); exit (1); } } ADB_ofirst = AMAP[AMAP_IDX[0]]; BDB_ofirst = BMAP[BMAP_IDX[0]]; ADB_olast = AMAP[AMAP_IDX[ADB->nreads-1]]+1; BDB_olast = BMAP[BMAP_IDX[BDB->nreads-1]]+1; // Open .las and process piles therein output new piles to F.las { FILE *input; char *las, *pwd; char *lasT, *pwdT; int64 novl; las = Root(argv[3],".las"); pwd = PathTo(argv[3]); lasT = Root(argv[4],".las"); pwdT = PathTo(argv[4]); if (strcmp(las,lasT) == 0 && strcmp(pwd,pwdT) == 0) { fprintf(stderr,"%s: source and target are the same !\n",Prog_Name); exit (1); } input = Fopen(Catenate(pwd,"/",las,".las"),"r"); OUTPUT = Fopen(Catenate(pwdT,"/",lasT,".las"),"w"); if (input == NULL || OUTPUT == NULL) exit (1); free(pwd); free(las); WOVLS = 0; TRACE_SPACING = 0; fwrite(&WOVLS,sizeof(int64),1,OUTPUT); fwrite(&TRACE_SPACING,sizeof(int),1,OUTPUT); fread(&novl,sizeof(int64),1,input); fread(&TRACE_SPACING,sizeof(int),1,input); make_a_pass(input,EXTENDER,1); rewind(OUTPUT); fwrite(&WOVLS,sizeof(int64),1,OUTPUT); fclose(OUTPUT); } exit (0); } DASCRUBBER-1.1/DAStrim.c000066400000000000000000002411461327574206400144600ustar00rootroot00000000000000/******************************************************************************************* * * Using overlap pile for each read and intrinisic quality values, determine the * high quality segments with interspersed gaps. Any unremoved * adaptemer sequences are dectected and the shorter side trimmed. * Every gap is analyzed and either patched or splits the read. * * Author: Gene Myers * Date : June 2016 * *******************************************************************************************/ #include #include #include #include #include "DB.h" #include "align.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else
#define PATHSEP "/"
#endif

//  Compile-time debug switches (normally all off).  Each one, when #define'd instead of
//    #undef'd, enables the correspondingly guarded diagnostic printf's in this file.

#undef  DEBUG_HQ_BLOCKS     //  Various DEBUG flags (normally all off)
#undef  SHOW_EVENTS
#undef  DEBUG_HOLE_FINDER
#undef  DEBUG_GAP_STATUS
#undef  SHOW_PAIRS
#undef  DEBUG_PATCHING
#undef  DEBUG_SUMMARY
#undef  DEBUG_CLASS

#define  ANNOTATE   //  Output annotation tracks for DaViewer

//  Command format and global parameter variables

static char *Usage = " [-v] [-g] [-b] ...";

static int     COVERAGE;     //  estimated coverage
static int     BAD_QV;       //  a trace interval with qv >= BAD_QV is "bad"
static int     GOOD_QV;      //  a trace interval with qv <= GOOD_QV is "good"
static int     HGAP_MIN;     //  less than this length do not process for HGAP
static int     VERBOSE;

//  Gap states

#define LOWQ  0   //  Gap is spanned by many LAs and patchable
#define SPAN  1   //  Gap has many paired LAs and patchable
#define SPLIT 2   //  Gap is a chimer or an unpatchable gap
#define ADAPT 3   //  Gap is due to adaptemer (internal only)

//  Good patch constants

#define MIN_BLOCK  500    //  Minimum length of a good patch

//  Gap constants

#define  MIN_COVER       3  //  A coverage gap occurs at or below this level
#define  COVER_LEN     400  //  An overlap covers a point if it extends COVER_LEN to either side.
#define  ANCHOR_MATCH  .25  //  Delta in trace interval at both ends of patch must be < this %.

#define  MIN_OVERLAP   900  //  Was COVER_LEN, too small?

//  Wall Constants

#define  MIN_PNT    5    //  Minimum # of events in a wall
#define  MAX_SEP   25    //  Maximum separation between two events in a wall
#define  AVE_SEP    5.   //  Maximum average separation between two events in a wall

//  Global Variables (must exist across the processing of each pile)

  //  Input

static int     TRACE_SPACING;  //  Trace spacing (from .las file)

static DAZZ_DB _DB, *DB  = &_DB;   //  Data base
static int     DB_FIRST;           //  First read of DB to process
static int     DB_LAST;            //  Last read of DB to process (+1)
static int     DB_PART;            //  0 if all, otherwise block #

static int64  *QV_IDX;     //  qual track index
static uint8  *QV;         //  qual track values

  //  Output

static FILE   *TR_AFILE;   //  .trim.anno
static FILE   *TR_DFILE;   //  .trim.data
static int64   TR_INDEX;   //  Current index into .trim.data file as it is being written

#ifdef ANNOTATE

static FILE   *HQ_AFILE;   //  .hq.anno
static FILE   *HQ_DFILE;   //  .hq.data
static int64   HQ_INDEX;   //  Current index into .hq.data file as it is being written

static FILE   *HL_AFILE;   //  .hole.anno
static FILE   *HL_DFILE;   //  .hole.data
static int64   HL_INDEX;   //  Current index into .hole.data file as it is being written

static FILE   *SN_AFILE;   //  .span.anno
static FILE   *SN_DFILE;   //  .span.data
static int64   SN_INDEX;   //  Current index into .span.data file as it is being written

static FILE   *SP_AFILE;   //  .split.anno
static FILE   *SP_DFILE;   //  .split.data
static int64   SP_INDEX;   //  Current index into .split.data file as it is being written

static FILE   *AD_AFILE;   //  .adapt.anno
static FILE   *AD_DFILE;   //  .adapt.data
static int64   AD_INDEX;   //  Current index into .adapt.data file as it is being written

static FILE   *KP_AFILE;   //  .keep.anno
static FILE   *KP_DFILE;   //  .keep.data
static int64   KP_INDEX;   //  Current index into .keep.data file as it is being written

#endif

  //  Statistics (each pair is a count and, judging by the "bp" suffix, presumably the
  //    matching number of bases -- confirm against the code that updates them)

static int64 nreads, totlen;
static int64 nelim, nelimbp;
static int64 n5trm, n5trmbp;
static int64 n3trm, n3trmbp;
static int64 natrm, natrmbp;
static int64 ngaps, ngapsbp;
static int64 nlowq, nlowqbp;
static int64 nspan, nspanbp;
static int64 nchim, nchimbp;

//  Data Structures

typedef struct   //  General read interval [beg..end]
  { int beg;
    int end;
  } Interval;

  //  Coverage events, type (one of 7 below) and position.
  //    The numeric values matter: ESORT orders events at the same position by type.

#define ADD 0  //  leftmost A-position of LA
#define LFT 1  //  ADD position + COVER_LEN of LA (>= 2*COVER_LEN long)
#define LGP 2  //  left end of an HQ-block
#define CTR 3  //  A-center of LA < 2*COVER_LEN long
#define RGP 4  //  right end of an HQ-block
#define RGT 5  //  DEL position - COVER_LEN of LA
#define DEL 6  //  rightmost A-position of LA

#ifdef SHOW_EVENTS

static char   Symbol[7] = { 'A', 'L', '[', 'C', ']', 'R', 'D' };

#endif

typedef struct
  { int  type;
    int  pos;
  } Event;

  //  Wall: there are cnt LFT/RGT events ending in the interval [beg,end] going
  //          from coverage depth cov up to cov+cnt

typedef struct
  { int beg;
    int end;
    int cnt;
    int cov;
  } Wall;

/*******************************************************************************************
 *
 *  FIND ALL HIGH_QV BLOCKS OF EACH READ
 *
 ********************************************************************************************/

//  Find "good" blocks of trace point intervals:
//    0. A good block must begin and end with an interval <= GOOD_QV
//    1. Any stretch all < BAD_QV at least MIN_BLOCK long
//    2. Any stretch all <= GOOD_QV at least MIN_BLOCK-TRACE_SPACING long
//    3. Any stretch all <= GOOD_QV only 1 interval away from another good patch
//  Global Inputs:  QV, QV_IDX, GOOD_QV, BAD_QV
//  HQ_BLOCKS[0..*nblk-1] contain the good patches in increasing order across aread.
//  Parameter aread is input-only, and p_nblk is output-only.
static Interval *HQ_BLOCKS(int aread, int *p_nblk) { int nblk; static int *alive = NULL; static Interval *block = NULL; int alen, atick; uint8 *qvec; alen = DB->reads[aread].rlen; atick = (alen + (TRACE_SPACING-1))/TRACE_SPACING; if (alive == NULL) { int max = DB->maxlen/TRACE_SPACING+2; alive = (int *) Malloc(max*sizeof(int),"Allocating alive vector"); block = (Interval *) Malloc(max*sizeof(Interval),"Allocating block vector"); if (alive == NULL || block == NULL) exit (1); } qvec = QV + QV_IDX[aread]; nblk = 0; // Find all blocks < BAD_QV with either len >= MIN_BLOCK or all <= GOOD_QV in block[0..nblk) // Mark those satisfying 1. or 2. as "alive" (.alv) { int lmost = 0, rmost = 0, thr; int i, in; thr = (MIN_BLOCK-1)/TRACE_SPACING; in = 0; for (i = 0; i <= atick; i++) { int q, alv; if (i < atick) q = qvec[i]; else q = BAD_QV; if (in) { if (q >= BAD_QV) { alv = (lmost-rmost >= thr); if (alv) { block[nblk].beg = rmost; block[nblk].end = lmost + 1; alive[nblk] = alv; nblk += 1; } else { int j, k; for (j = rmost; j <= lmost; j = k) { for (k = j+1; k <= lmost; k++) if (qvec[k] > GOOD_QV) break; block[nblk].beg = j; block[nblk].end = k; alive[nblk] = (k-j >= thr); nblk += 1; for ( ; k <= lmost; k++) if (qvec[k] <= GOOD_QV) break; } } in = 0; } else if (q <= GOOD_QV) lmost = i; } else { if (q <= GOOD_QV) { rmost = lmost = i; in = 1; } } } } // Mark as alive all short, all-good blocks that satisfy 3. { int i, j; for (i = 0; i < nblk; i++) if (alive[i]) { for (j = i-1; j >= 0 && ! alive[j]; j--) if (block[j+1].beg - block[j].end == 1) alive[j] = 1; else break; for (j = i+1; j < nblk && ! 
alive[j]; j++) if (block[j].beg - block[j-1].end == 1) alive[j] = 1; else break; } } // Remove all blocks that are not alive { int i, j; j = 0; for (i = 0; i < nblk; i++) if (alive[i]) { block[j].beg = block[i].beg * TRACE_SPACING; block[j].end = block[i].end * TRACE_SPACING; j += 1; } nblk = j; if (nblk > 0 && block[nblk-1].end > alen) block[nblk-1].end = alen; } #ifdef DEBUG_HQ_BLOCKS { int i; printf(" %3d:",nblk); for (i = 0; i < nblk; i++) printf(" [%5d,%5d]",block[i].beg,block[i].end); printf("\n"); } #endif *p_nblk = nblk; return (block); } /******************************************************************************************* * * WALL ANALYZER TO HELP AVOID REPEAT BOUNDARIES * ********************************************************************************************/ // Find intervals of LFT/RGT events where no two events are separated by more than // MAX_SEP, the average arrival rate is AVE_SEP, and there are at least MIN_PNT // events in the interval. static Wall *wall_detector(int *ev, int b, int e, Wall *next) { int idx; { int i, n, max; double ave; n = e-b; if (n < MIN_PNT) return (next); // Too small: done idx = b; max = -1; // Find the position of the largest separation between for (i = b+1; i < e; i++) // two tips in ev[b..e) if (ev[i] - ev[i-1] > max) { max = ev[i] - ev[i-1]; idx = i; } ave = (ev[e-1] - ev[b]) / (n-1.); // Check if the current interval is a wall if (ave <= AVE_SEP && max <= MAX_SEP) { if (max <= 4.*(ave+1.)) // Max separation < 4*average separation ? 
{ next->beg = b; next->end = e; next->cnt = n; return (next+1); } } } next = wall_detector(ev,b,idx,next); // If not then split on the largest separation next = wall_detector(ev,idx,e,next); // and recurse on the two parts return (next); } // Find LFT/RGT event walls static Wall *find_walls(int novl, Event *queue, int *anum, int *dnum) { static int nmax = 0; Wall *aptr, *dptr; static Wall *wall = NULL; int ntip; static int *adds = NULL; static int *dels; if (novl == 0) return (NULL); if (novl > nmax) { nmax = novl*1.2 + 1000; wall = (Wall *) Realloc(wall,sizeof(Wall)*(nmax/MIN_PNT),"Reallocating wall vector"); adds = (int *) Realloc(adds,sizeof(int)*2*nmax,"Reallocating add+del vectors"); if (wall == NULL || adds == NULL) exit (1); dels = adds + nmax; } // Make separate arrays of add and del tips (LFT and RGT events) in sorted order in // which to seek "walls". { int i, j, x; i = x = 0; // A bit tricky: less than novl tips due to CTR events for (j = 0; x < novl; j++) // that don't generate tips, so analyze events until if (queue[j].type == CTR) // have counted all LA's. Furthermore adds and dels x += 1; // are sorted because queue is sorted. 
else if (queue[j].type == LFT) { x += 1; adds[i++] = queue[j].pos; } ntip = i; i = 0; for (j = 0; i < ntip; j++) if (queue[j].type == RGT) dels[i++] = queue[j].pos; } // Find LFT walls and RGT walls in [walls,aptr) and [aptr,dptr) aptr = wall_detector(adds,0,ntip,wall); dptr = wall_detector(dels,0,ntip,aptr); // For each wall, determine the coverage of its base with a merged traversal // of the adds and dels arrays { Wall *a, *d; int i, j, x; x = 0; a = wall; d = aptr;; i = j = 0; while (j < ntip) if (i < ntip && adds[i] < dels[j]) { if (a->beg == i) a->cov = x; else if (a->end == i+1) { a += 1; if (a >= aptr) a -= 1; } x += 1; i += 1; } else { if (d->beg == j) d->cov = x - d->cnt; else if (d->end == j+1) { d += 1; if (d >= dptr) d -= 1; } x -= 1; j += 1; } } // Sneaky, switch beg/end from an index into the adds or dels array, to the actually // coordinate of the event. { Wall *a; for (a = wall; a < aptr; a++) { a->beg = adds[a->beg]; a->end = adds[a->end-1]; } for (a = aptr; a < dptr; a++) { a->beg = dels[a->beg]; a->end = dels[a->end-1]; } } *anum = aptr-wall; *dnum = dptr-aptr; return (wall); } /******************************************************************************************* * * COVERAGE ANALYSIS TO FIND ALL HOLES (regions of very low coverage/support) * ********************************************************************************************/ // Find intervals for which there are MIN_COVER or fewer LAs that project at least COVER_LEN // bases to the left and right of the interval. These are called holes. // Holes are usually found between HQ-blocks. However occasionally they intersect one or // more blocks and this requires the HQ-blocks be refined as follows: // a. Hole spans an HQ-block: // The block needs to be removed as HQ *if* it is not based on 5 or more LA's // (this usually never happens, 10^-5 or less) // b. 
Hole is contained in an HQ-block: // The block needs to be split around the hole because one needs to verify that // the left and right regions on each side of a hole actually belong together // (this happens occasionaly, ~ 10^-3) // c. Hole overlaps an HQ-block: // If this happens, then the overlap is very small and the block is left unperturbed. // (this worries me a bit, but in all testing it (very small overlap) remains so) // Given the above possibilities, the list of HQ-blocks can be modified by FIND_HOLES. static int ESORT(const void *l, const void *r) { Event *x = (Event *) l; Event *y = (Event *) r; if (x->pos == y->pos) return (x->type - y->type); return (x->pos - y->pos); } static int FIND_HOLES(int aread, Overlap *ovls, int novl, Interval *block, int nblk) { static int nmax = 0; int nev; static Event *queue = NULL; // Event queue[0..nev) int nhole; static Interval *holes = NULL; // Detected holes[0..nhole) static int pmax; static Interval *cover = NULL; // Coverage at block ends [0..nblk) static Interval *nwblk; // Modified block list [0..nblk') int anum = 0, dnum = 0; // LFT and RGT walls, awall[0..anum) & dwall[0..dnum) Wall *awall, *dwall; if (cover == NULL) { pmax = DB->maxlen/TRACE_SPACING + 2; cover = (Interval *) Malloc(2*pmax*sizeof(Interval),"Allocating patch vector"); nwblk = cover + pmax; } if (4*novl + pmax > nmax) { nmax = 4.8*novl + pmax + 100; queue = (Event *) Realloc(queue,(nmax+1)*sizeof(Event),"Allocating event queue"); holes = (Interval *) Realloc(holes,(nmax/4)*sizeof(Interval),"Allocating hole vector"); if (queue == NULL || holes == NULL) exit (1); } { int i; // For each trimmed overlap: add its events to the queue nev = 0; for (i = 0; i < novl; i++) { queue[nev].type = ADD; queue[nev].pos = ovls[i].path.abpos; nev += 1; queue[nev].type = DEL; queue[nev].pos = ovls[i].path.aepos; nev += 1; if (ovls[i].path.abpos + 2*COVER_LEN + 10 > ovls[i].path.aepos) { queue[nev].type = CTR; queue[nev].pos = (ovls[i].path.abpos + 
ovls[i].path.aepos) / 2; nev += 1; } else { queue[nev].type = LFT; queue[nev].pos = ovls[i].path.abpos + COVER_LEN; nev += 1; queue[nev].type = RGT; queue[nev].pos = ovls[i].path.aepos - COVER_LEN; nev += 1; } } // For each HQ-block: add its events to the queue for (i = 0; i < nblk; i++) { queue[nev].type = LGP; queue[nev].pos = block[i].beg; nev += 1; queue[nev].type = RGP; queue[nev].pos = block[i].end; nev += 1; } queue[nev].pos = DB->reads[aread].rlen; } // Sort the events qsort(queue,nev,sizeof(Event),ESORT); // Find all LFT and RGT walls awall = find_walls(novl,queue,&anum,&dnum); dwall = awall + anum; #ifdef DEBUG_HOLE_FINDER { int i; printf("\n"); for (i = 0; i < anum; i++) printf(" Add [%5d,%5d] %d %d\n",awall[i].beg,awall[i].end,awall[i].cnt,awall[i].cov); for (i = 0; i < dnum; i++) printf(" Del [%5d,%5d] %d %d\n",dwall[i].beg,dwall[i].end,dwall[i].cnt,dwall[i].cov); printf("\n"); } #endif // Move through events in order keeping track of inc, dec, & cnf so that the // invariant stated below holds { int cnf, inc, dec; int cblk; int in; int nbeg, nend = 0; int first, last; int i; in = 1; first = -1; cblk = 0; nhole = 0; inc = dec = cnf = 0; for (i = 0; i < nev; i++) { switch (queue[i].type) { case ADD: inc += 1; break; case LFT: inc -= 1; cnf += 1; break; case LGP: cover[cblk].beg = cnf + inc + dec; // = coverage depth at block[cblk].beg continue; case CTR: inc -= 1; dec += 1; continue; case RGP: cover[cblk].end = cnf + inc + dec; // = coverage depth at block[cblk].end cblk += 1; continue; case RGT: cnf -= 1; dec += 1; break; case DEL: dec -= 1; break; } // For position x = queue[i].pos: // inc = # of LA's between (ADD,LFT] positions // dec = # of LA's between (RGT,DEL] positions // cnf = # of LA's between (LFT,RGT] positions (= # of LAs tat project at least // COVER_LEN bases to the right and left of x! 
#ifdef SHOW_EVENTS printf(" %5d %c: %3d< %3d >%3d %3d\n", queue[i].pos,Symbol[queue[i].type],inc,cnf,dec,dec-inc); #endif // When truncated coverage, cnf, transitions below MIN_COVER(3), note the fact (in = 1) // and record the index first of the event (must be a RGT) and the number of LA's // currently in their (RGT,DEL] interval if (cnf <= MIN_COVER) { if ( ! in) { in = 1; nend = dec; first = i; } } // When truncated coverage transitions above MIN_COVER, we declare it a hole // if interval below MIN_COVER is at least COVER_LEN long, there are at least // 4 LA's that are "ending" at the left (i.e. in (RGT,DEL] interval, and // at least 4 LA's ending at the right. else { if (in && first >= 0 && queue[i].pos - queue[first].pos >= COVER_LEN && nend >= 4 && inc >= 4) { int lflank, rflank; int dpos, apos; nbeg = inc; last = i; // Need to find the boundaries of the hole. In principle, this is // [dpos + COVER_LEN, apos - COVER_LEN] where apos = queue[first].pos // and dpos = queue[last].pos, i.e. the entry and exit into the low // truncated cover interval. However, walls induced by repeat boundaries // and/or uneveness in the end-points of LA's can cause the above to be // quite far off. So ... // First try the average of the 2nd and 3rd quartile of the nend RGT events // before dpos. The requisite number of events must exist by the definition // of nend. While one is at it determine the index of the first of the // nend RGT events in lflank. { int64 sum; int q1, q3, n; int a, d, k; int acov, dcov; q1 = nend/4; q3 = (3*nend)/4; sum = 0; n = 0; for (lflank = first; n < nend; lflank--) if (queue[lflank].type == RGT || queue[lflank].type == CTR) { if (n >= q1 && n < q3) sum += queue[lflank].pos; n += 1; } dpos = sum/(q3-q1); lflank += 1; #ifdef DEBUG_HOLE_FINDER printf(" Dev %5d-%3d-%5d -> %5d",queue[lflank].pos,nend,queue[first].pos,dpos); #endif // Second, look for the leftmost RGT-(LFT-)wall that overlaps the left (right) // flank, i.e. 
queue[lflank,first].pos (queue[last,rflank].pos), and if found // take the average position of the flank position within the wall. for (d = dnum-1; d >= 0; d--) if (dwall[d].beg <= queue[first].pos) break; if (d >= 0 && dwall[d].end >= queue[lflank].pos) { sum = 0; n = 0; for (k = first; k >= lflank; k--) if (queue[k].type == RGT || queue[k].type == CTR) { if (queue[k].pos < dwall[d].beg) break; if (queue[k].pos <= dwall[d].end) { sum += queue[k].pos; n += 1; } } dpos = sum/n; #ifdef DEBUG_HOLE_FINDER printf(" Map [%5d,%5d] -> %4d\n",dwall[d].beg,dwall[d].end,dpos); #endif dcov = dwall[d].cov + dwall[d].cnt; d -= 1; } else { dcov = nend + MIN_COVER; #ifdef DEBUG_HOLE_FINDER printf(" No wall mapping\n"); #endif } // First try on LFT events (replace nend with nbeg, RGT with LFT, before // with after, and dpos with apos, first with last, and lflank with rflank. q1 = nbeg/4; q3 = (3*nbeg)/4; sum = 0; n = 0; for (rflank = last; n < nbeg; rflank++) if (queue[rflank].type == LFT || queue[rflank].type == CTR) { if (n >= q1 && n < q3) sum += queue[rflank].pos; n += 1; } apos = sum/(q3-q1); rflank -= 1; #ifdef DEBUG_HOLE_FINDER printf(" Aev %5d-%3d-%5d -> %5d",queue[i].pos,nbeg,queue[rflank].pos,apos); #endif // Second look at LFT events. 
for (a = 0; a < anum; a++) if (awall[a].end >= queue[i].pos) break; if (a < anum && awall[a].beg <= queue[rflank].pos) { sum = 0; n = 0; for (k = i; k <= rflank; k++) if (queue[k].type == LFT || queue[k].type == CTR) { if (queue[k].pos > awall[a].end) break; if (queue[k].pos >= awall[a].beg) { sum += queue[k].pos; n += 1; } } apos = sum/n; #ifdef DEBUG_HOLE_FINDER printf(" Map [%5d,%5d] -> %4d\n",awall[a].beg,awall[a].end,apos); #endif acov = awall[a].cov + awall[a].cnt; a += 1; } else { acov = nbeg + MIN_COVER; #ifdef DEBUG_HOLE_FINDER printf(" No wall mapping\n"); #endif } // If apos and dpos are still so close that the implied hole boundaries // are out of order by 50 or more bases, then walk back through ascending // walls (if present) until this is no longer true or there are no more // more walls left. If both left and right options exist, always take // the wall starting at the lower current height. while (apos - dpos < 2*COVER_LEN - 50) { if (d >= 0 && dwall[d].cov >= dcov) if (a < anum && awall[a].cov >= acov) { if (dcov < acov) { dcov = dwall[d].cov + dwall[d].cnt; dpos = dwall[d--].beg; #ifdef DEBUG_HOLE_FINDER printf(" Push <- %d\n",dpos); #endif } else { acov = awall[a].cov + awall[a].cnt; apos = awall[a++].end; #ifdef DEBUG_HOLE_FINDER printf(" Push -> %d\n",apos); #endif } } else { dcov = dwall[d].cov + dwall[d].cnt; dpos = dwall[d--].beg; #ifdef DEBUG_HOLE_FINDER printf(" Push <- %d\n",dpos); #endif } else if (a < anum && awall[a].cov >= acov) { acov = awall[a].cov + awall[a].cnt; apos = awall[a++].end; #ifdef DEBUG_HOLE_FINDER printf(" Push -> %d\n",apos); #endif } else { #ifdef DEBUG_HOLE_FINDER printf(" FAULT\n"); #endif break; } } } // Finalize and record the hole boundaries. holes[nhole].beg = dpos + COVER_LEN; holes[nhole].end = apos - COVER_LEN; nhole += 1; } in = 0; } } } // See if the holes remove or split any HQ-blocks and build the revised list // in newblk[0..q). 
{ int i, p, q, x; int lhang, rhang; #ifdef DEBUG_HOLE_FINDER int reverse; #endif // For each hole in left-to-right order p = q = 0; for (i = 0; i < nhole; i++) { if (holes[i].beg > holes[i].end) { x = holes[i].beg; holes[i].beg = holes[i].end; holes[i].end = x; #ifdef DEBUG_HOLE_FINDER reverse = 1; #endif } #ifdef DEBUG_HOLE_FINDER else reverse = 0; #endif // Advance to the next block p that intersects with or is to the right of hole // moving blocks being skipped over to the new block list while (p < nblk && block[p].end <= holes[i].beg) nwblk[q++] = block[p++]; #ifdef DEBUG_HOLE_FINDER printf(" HOLE: %5d [%5d,%5d]\n", aread+1,holes[i].beg,holes[i].end); #endif // While the current block intersects the current hole while (p < nblk && block[p].beg < holes[i].end) { lhang = (holes[i].beg < block[p].beg); rhang = (holes[i].end > block[p].end); if (lhang) { if (rhang) // Hole i contains block p: remove it if coverage <= 4 at both ends { if (block[p].end - block[p].beg >= MIN_BLOCK && (cover[p].beg > 4 || cover[p].end > 4)) nwblk[q++] = block[p]; p += 1; #ifdef DEBUG_HOLE_FINDER printf(" INTERSECT %5d S [%5d,%5d] %3d %3d", aread+1,block[p-1].beg,block[p-1].end,cover[p-1].beg,cover[p-1].end); if (reverse) printf(" REV"); printf("\n"); #endif } // Hole i intersect the left tip of block p: nothing to do else { #ifdef DEBUG_HOLE_FINDER printf(" INTERSECT %5d Z %5d [..,%5d] %3d", aread+1,holes[i].end-block[p].beg,holes[i].end,cover[p].beg); if (reverse) printf(" REV"); printf("\n"); #endif break; } } else if (rhang) // Hole i intersect the right tip of block p: move p to new block list { nwblk[q++] = block[p++]; #ifdef DEBUG_HOLE_FINDER printf(" INTERSECT %5d Z %5d [%5d,..] 
%3d", aread+1,block[p-1].end-holes[i].beg,holes[i].beg,cover[p-1].end); if (reverse) printf(" REV"); printf("\n"); #endif } else // Hole i is contained within block p: Break block into two parts at // TRACE_SPACING ticks left and right of hole, and keep each piece // if they are greater than MIN_BLOCK long. { int beg, end; #ifdef DEBUG_HOLE_FINDER printf(" INTERSECT %5d C %5d [%5d,%5d]", aread+1,holes[i].end-holes[i].beg,block[p].beg,block[p].end); if (reverse) printf(" REV"); printf("\n"); #endif beg = (holes[i].beg/TRACE_SPACING); end = (holes[i].end-1)/TRACE_SPACING+1; if (beg == end) { beg -= 1; end += 1; } beg *= TRACE_SPACING; end *= TRACE_SPACING; if (beg - block[p].beg >= MIN_BLOCK) { nwblk[q].beg = block[p].beg; nwblk[q++].end = beg; } if (block[p].end - end >= MIN_BLOCK) block[p].beg = end; else p += 1; break; } } } // Remove any remaining blocks to the new list while (p < nblk) nwblk[q++] = block[p++]; nblk = q; // Transfer new blocks to original block vector for (i = 0; i < nblk; i++) block[i] = nwblk[i]; } #ifdef ANNOTATE { int i; for (i = 0; i < nhole; i++) if (holes[i].end - holes[i].beg < 75) { holes[i].end += 50; holes[i].beg -= 50; fwrite(&(holes[i].beg),sizeof(int),1,HL_DFILE); fwrite(&(holes[i].end),sizeof(int),1,HL_DFILE); holes[i].end -= 50; holes[i].beg += 50; } else { fwrite(&(holes[i].beg),sizeof(int),1,HL_DFILE); fwrite(&(holes[i].end),sizeof(int),1,HL_DFILE); } HL_INDEX += 2*nhole*sizeof(int); fwrite(&HL_INDEX,sizeof(int64),1,HL_AFILE); } #endif // Return the list of holes holes[0..nhole) and the new list of blocks, nwblk[0..nblk) return (nblk); } /******************************************************************************************* * * FIND ANY UNREMOVED ADAPTER (OR POLYMERASE SWITCHES) AND TRIM SMALLER PARTS * ********************************************************************************************/ typedef struct { int lidx; // left LA index int ridx; // right LA index int delta; // Difference between A-gap and B-gap int soft; 
// 0 (soft) = pair is not close to gap border or adjacent gap border on both sides // 1 (anciliary) = pair is not close to gap border but is to adjacent gap border // 2 (hard) = pair is close to gap border on both sides } Spanner; typedef struct { int bread; // bread^comp[beg..end] is the patch sequence int comp; int beg; int end; int anc; // maximum anchor interval match int bad; // number of segments that are bad int avg; // average QV of the patch } Patch; static int GSORT(const void *l, const void *r) { Spanner *x = (Spanner *) l; Spanner *y = (Spanner *) r; return (x->delta - y->delta); } #ifdef DEBUG_GAP_STATUS static int ASORT(const void *l, const void *r) { int *x = (int *) l; int *y = (int *) r; return (*x - *y); } #endif // Return match score of lov->bread with "anchor" lov->aread[lft-TRACE_SPACING,lft] static int eval_lft_anchor(int lft, Overlap *lov) { uint16 *tr; int te; if (lft > lov->path.aepos) return (50); tr = (uint16 *) lov->path.trace; te = 2 * (((lft + (TRACE_SPACING-1)) - lov->path.abpos)/TRACE_SPACING); if (te <= 0) return (50); return (tr[te-2]); } // Return match score of lov->bread with "anchor" lov->aread[rgt,rgt+TRACE_SPACING] static int eval_rgt_anchor(int rgt, Overlap *rov) { uint16 *tr; int te; if (rgt < rov->path.abpos) return (50); tr = (uint16 *) rov->path.trace; te = 2 * (((rgt + (TRACE_SPACING-1)) - rov->path.abpos)/TRACE_SPACING); if (te >= rov->path.tlen) return (50); return (tr[te]); } // Evaluate the quality of lov->bread = rov->bread spaning [lcv,rcv] as a patch static Patch *compute_patch(int lft, int rgt, Overlap *lov, Overlap *rov) { static Patch ans; uint16 *tr; int bread, bcomp, blen; int bb, be; int t, te; int bl, br; uint8 *qb; int avg, anc, bad; bread = lov->bread; bcomp = COMP(lov->flags); blen = DB->reads[bread].rlen; if (blen < HGAP_MIN) return (NULL); if (lft > lov->path.aepos || rgt < rov->path.abpos) // Cannot anchor return (NULL); if (lov->path.abpos > lft-TRACE_SPACING || rgt+TRACE_SPACING > rov->path.aepos) 
return (NULL); // Get max of left and right anchors as anchor score tr = (uint16 *) lov->path.trace; te = 2 * (((lft + (TRACE_SPACING-1)) - lov->path.abpos)/TRACE_SPACING); if (te == 0) return (NULL); anc = tr[te-2]; bb = lov->path.bbpos; for (t = 1; t < te; t += 2) bb += tr[t]; tr = (uint16 *) rov->path.trace; te = 2 * (((rgt + (TRACE_SPACING-1)) - rov->path.abpos)/TRACE_SPACING); if (te >= rov->path.tlen) return (NULL); if (tr[te] > anc) anc = tr[te]; be = rov->path.bepos; for (t = rov->path.tlen-1; t > te; t -= 2) be -= tr[t]; if (bb >= be) return (NULL); ans.bread = bread; ans.comp = bcomp; ans.beg = bb; ans.end = be; ans.anc = anc; // Compute metrics for b-read patch if (bcomp) { t = blen - be; be = blen - bb; bb = t; } bl = bb/TRACE_SPACING; br = (be+(TRACE_SPACING-1))/TRACE_SPACING; qb = QV + QV_IDX[bread]; if (bl >= br) { avg = qb[bl]; if (avg >= BAD_QV) bad = 1; else bad = 0; } else { avg = 0; bad = 0; for (t = bl; t < br; t++) { avg += qb[t]; if (qb[t] >= BAD_QV) bad += 1; } avg /= (br-bl); } ans.bad = bad; ans.avg = avg; return (&ans); } // Examine the spanning pairs for a gap. Group those with sufficient density // i.e. with 20 + 10% of the last one. If test == 1, keep groups that have at // least 4 members, and are either 60% hard or at least 8 hard pairs, but trim // away any extremal non-hard pairs. If test == 0, keep the largest group that has // at least 4 members, and is either 60% not soft or at least 8 non-soft pairs, // but trim awya ny extermal soft pairs. If "move" is non-zero then compress gsort // accordingly, return the size of gsort after trimming in all instances. 
static int analyze_gap_pairs(int gsize, Spanner *gsort, Overlap *ovls, int gcnt, int scnt, int test, int move) { int j, l, c, x, w; int bord, soft, keeper; int ncnt, biggest; (void) ovls; biggest = 0; ncnt = 0; #ifdef SHOW_PAIRS if (move) printf(" Gsort: %d\n",gsize); #endif c = gsize - gsort[0].delta; w = 0; for (j = 0; j <= gcnt; j++) { l = c; if (j >= gcnt) bord = 1; else { c = gsize - gsort[j].delta; if (l < 0) bord = (l-c >= 20-.1*c); else bord = (l-c >= 20+.1*l); } if (bord) { soft = 0; for (x = w; x < j; x++) soft += (gsort[x].soft <= test); keeper = (j-w >= 4 && (soft < .4*(j-w) || (j-w)-soft >= 8)); if (test == 0) { if (keeper && j-w > biggest) { biggest = j-w; ncnt = 0; } else keeper = 0; } #ifdef SHOW_PAIRS if (move) { printf("----\n"); for (x = w; x < j; x++) { printf(" %3d: %5d %5d",x,gsort[x].delta,gsize-gsort[x].delta); printf(" %5d",ovls[gsort[x].lidx].bread); if (gsort[x].soft == 0) printf(" @"); else if (gsort[x].soft == 1) printf(" #"); else printf(" "); if (!keeper) printf(" X"); printf("\n"); } } #endif if (keeper) { for (x = w; x < j; x++) if (gsort[x].soft > test) break; for (w = j; gsort[w-1].soft <= test; w--) ; if (move) while (x < w) gsort[ncnt++] = gsort[x++]; else ncnt += w-x; } w = j; } } if (move) for (x = gcnt; x < gcnt+scnt; x++) gsort[ncnt++] = gsort[x]; else ncnt += scnt; return (ncnt); } // Categorize each gap and if appropriate return the best patch for each static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rblock, int *p_lft, int *p_rgt) { static int nmax = 0; static Spanner *gsort = NULL; // A-B delta and idx-pair for all B-reads spanning a gap static int ANCHOR_THRESH; static Interval *FirstB; static Interval *LastB; int j; int lft, rgt; int lcv, rcv; int cnt; if (p_lft == NULL) { if (novl > nmax) { nmax = 1.2*novl + 500; gsort = (Spanner *) Realloc(gsort,nmax*sizeof(Spanner),"Allocating gap vector"); if (gsort == NULL) exit (1); ANCHOR_THRESH = ANCHOR_MATCH * TRACE_SPACING; } FirstB = lblock; LastB = 
rblock-1; return (0); } lft = lblock->end; rgt = rblock->beg; lcv = lft - COVER_LEN; rcv = rgt + COVER_LEN; if (lcv < lblock->beg) lcv = lblock->beg; if (rcv > rblock->end) rcv = rblock->end; #ifdef DEBUG_GAP_STATUS printf(" GAP [%5d,%5d] <%5d,%5d>\n",lft,rgt,lcv,rcv); #endif // If the gap flank [lcv,rcv] is covered by 10 or more LAs, then a LOWQ gap cnt = 0; for (j = 0; j < novl; j++) if (ovls[j].path.abpos <= lcv && ovls[j].path.aepos >= rcv) { cnt += 1; if (cnt >= 10) break; } // If so and it is patchable then report LOWQ if (cnt >= 10) { for (j = 0; j < novl; j++) if (ovls[j].path.abpos <= lcv && ovls[j].path.aepos >= rcv) { Patch *can; can = compute_patch(lft,rgt,ovls+j,ovls+j); if (can == NULL) continue; if (can->anc <= ANCHOR_THRESH && can->avg <= GOOD_QV && can->bad == 0) { #ifdef DEBUG_GAP_STATUS printf(" LOWQ PATCHABLE = %d%c[%d..%d] %d (%d)\n", can->bread,can->comp?'c':'n',can->beg, can->end,can->anc,can->avg); #endif return (LOWQ); } } #ifdef DEBUG_GAP_STATUS printf(" FAILING TO PATCH_LOWQ\n"); #endif } { int bread, bcomp, blen, blast; int ab, ae; int lstack[10], ltop; int rstack[10], rtop; int lcnt, rcnt, scnt, gcnt, acnt; int lidx, ridx, sidx, Lidx, Ridx; int k; // Find LA pairs or LAs spanning the gap flank [lcv,rcv] bread = -1; lcnt = rcnt = scnt = gcnt = acnt = 0; for (j = 0; j < novl; j = k) { blast = bread; bread = ovls[j].bread; blen = DB->reads[bread].rlen; bcomp = COMP(ovls[j].flags); Lidx = lidx; Ridx = ridx; ltop = rtop = 0; lidx = ridx = sidx = -1; // For all LA's with same b-read for (k = j; k < novl; k++) { if (ovls[k].bread != bread) break; if (COMP(ovls[k].flags) != (uint32) bcomp) // Note when b switches orientation break; ab = ovls[k].path.abpos; ae = ovls[k].path.aepos; #ifdef SHOW_PAIRS printf("\n %5d [%5d,%5d] %c",bread,ab,ae,COMP(ovls[k].flags)?'c':'n'); if (ab <= lcv && ae >= rcv) printf("s"); else printf(" "); #endif // Is LA a spanner, left-partner, or right partner // A partner is hard if end=point is within COVER_LEN of the 
gap boundary // Record rigthmost/leftmost left/right hard partners (if any) if (ab <= lcv && ae >= rcv) { sidx = k; lidx = ridx = -1; ltop = rtop = 0; continue; } if (sidx >= 0) continue; if (ae >= rcv && ab > lft) { if (rtop < 10) rstack[rtop++] = k; #ifdef SHOW_PAIRS printf("r"); #endif if (ab <= rcv && ridx < 0) { ridx = k; #ifdef SHOW_PAIRS printf("+"); #endif } } if (ab <= lcv && ae < rgt) { if (ltop < 10) lstack[ltop++] = k; #ifdef SHOW_PAIRS printf("l"); #endif if (ae >= lcv) { lidx = k; #ifdef SHOW_PAIRS printf("+"); #endif } } } // Check for a hard contra pair and if found add // Then check for a spanner and if so then add to gsort list. // Then check for a spanning pair: use hard pair if available, otherwise // use tightest pair and term it anciliary if endpoints are within an adjacent // gap boundary, or soft otherwise. // Finally, if left or right hard (but unpaired) then record as a conflict if // projection extends MIN_OVERLAP past the other side. if (blast == bread) { if (ridx < 0) { if (lidx >= 0 && Ridx >= 0 && Lidx < 0) { acnt += 1; if (Ridx >= 0 && ovls[Ridx].path.abpos - ovls[Ridx].path.bbpos <= lft - MIN_OVERLAP) rcnt -= 1; #ifdef SHOW_PAIRS printf(" = A"); #endif continue; } } else { if (lidx < 0 && Ridx < 0 && Lidx >= 0) { acnt += 1; if (Lidx >= 0 && ovls[Lidx].path.aepos + (blen-ovls[Lidx].path.bepos) >= rgt + MIN_OVERLAP) lcnt -= 1; #ifdef SHOW_PAIRS printf(" = A"); #endif continue; } } } if (sidx >= 0) { gsort[gcnt].delta = DB->maxlen; gsort[gcnt].lidx = sidx; gsort[gcnt].ridx = sidx; gcnt += 1; scnt += 1; #ifdef SHOW_PAIRS printf(" = S"); #endif continue; } if (ltop > 0 && rtop > 0) { int lok, rok, x; if (lidx < 0 || ridx < 0) { int dif, bst; int x, y; bst = 0x7fffffff; for (ltop--; ltop >= 0; ltop--) { x = lstack[ltop]; for (rtop--; rtop >= 0; rtop--) { y = rstack[rtop]; dif = (ovls[y].path.abpos - ovls[x].path.aepos) - (ovls[y].path.bbpos - ovls[x].path.bepos); dif = abs(dif); if (dif < bst) { bst = dif; lidx = x; ridx = y; dif = 
(ovls[ridx].path.abpos - ovls[lidx].path.aepos) - (ovls[ridx].path.bbpos - ovls[lidx].path.bepos); #ifdef SHOW_PAIRS printf(" C(%d,%d = %d)",x,y,dif); #endif } } } } lok = 2; if (ovls[lidx].path.aepos < lcv) { x = ovls[lidx].path.aepos; lok = (lblock > FirstB && x <= lblock->beg && x >= lblock[-1].end - COVER_LEN); } rok = 2; if (ovls[ridx].path.abpos > rcv) { x = ovls[ridx].path.abpos; rok = (rblock < LastB && x >= rblock->end && x <= rblock[1].beg + COVER_LEN); } if (lok >= 2 && rok >= 2) gsort[gcnt].soft = 2; else if (lok >= 1 && rok >= 1) gsort[gcnt].soft = 1; else gsort[gcnt].soft = 0; gsort[gcnt].delta = (ovls[ridx].path.abpos - ovls[lidx].path.aepos) - (ovls[ridx].path.bbpos - ovls[lidx].path.bepos); gsort[gcnt].lidx = lidx; gsort[gcnt].ridx = ridx; gcnt += 1; #ifdef SHOW_PAIRS printf(" = G%d",gsort[gcnt-1].delta); #endif continue; } if (ridx >= 0 && ovls[ridx].path.abpos - ovls[ridx].path.bbpos <= lft - MIN_OVERLAP) { rcnt += 1; #ifdef SHOW_PAIRS printf(" = R"); #endif } if (lidx >= 0 && ovls[lidx].path.aepos + (blen-ovls[lidx].path.bepos) >= rgt + MIN_OVERLAP) { lcnt += 1; #ifdef SHOW_PAIRS printf(" = L"); #endif } } #ifdef SHOW_PAIRS printf("\n"); #endif { int ccnt, ocnt; if (lcnt < rcnt) ccnt = lcnt; else ccnt = rcnt; // Analyze pair list gsort: if standard analysis (only hard pairs count) does not yield // a span, then consider anciliary pair spanners (rarely makes a difference but does // save a few. 
qsort(gsort,gcnt,sizeof(Spanner),GSORT); gcnt -= scnt; ocnt = gcnt; gcnt = analyze_gap_pairs(rgt-lft,gsort,ovls,gcnt,scnt,1,0); if (scnt < 4 && gcnt < 10 && gcnt < ccnt) { #ifdef SHOW_PAIRS printf(" SPECIAL\n"); #endif gcnt = analyze_gap_pairs(rgt-lft,gsort,ovls,ocnt,scnt,0,1); #ifdef SHOW_PAIRS if (gcnt >= 10 || gcnt >= ccnt) printf(" SWITCH\n"); #endif } else analyze_gap_pairs(rgt-lft,gsort,ovls,ocnt,scnt,1,1); #ifdef DEBUG_GAP_STATUS printf(" lcnt = %d gcnt = %d scnt = %d rcnt = %d acnt = %d\n", lcnt,gcnt-scnt,scnt,rcnt,acnt); #endif // Lots of contra pairs and less spanning support, call it an adaptamer gap. if (acnt >= .3*ccnt && gcnt < acnt) { #ifdef DEBUG_GAP_STATUS printf(" ADAPT %3d\n",std); #endif return (ADAPT); } // If there is insufficient evidence for a span, then split. if (scnt < 4 && gcnt < 10 && gcnt <= ccnt) { #ifdef DEBUG_GAP_STATUS if (ccnt >= 20) printf(" STRONG SPLIT\n"); else printf(" WEAK SPLIT\n"); if (gcnt >= 10) printf(" UNCERTAIN %5.1f %5d %3d\n",gcnt/(1.*ccnt),rgt-lft,gcnt); #endif return (SPLIT); } // Otherwise consider the gap spannable and try to find a viable patch, declaring a split // iff all patch attemtps fail else { Patch *can; int ncand; uint8 *qa; Interval *clb, *crb; qa = QV + QV_IDX[ovls[0].aread]; clb = lblock; crb = rblock; // First make sure enough partners provide anchors, and if not // shift them back to the next good segment of A-read { int nshort; nshort = 0; for (j = 0; j < gcnt; j++) { if (lft > ovls[gsort[j].lidx].path.aepos) nshort += 1; } if (nshort > .2*gcnt) do { lft -= TRACE_SPACING; if (lft <= clb->beg) { if (clb <= FirstB) break; clb -= 1; lft = clb->end; } } while (qa[lft/TRACE_SPACING-1] > GOOD_QV); nshort = 0; for (j = 0; j < gcnt; j++) { if (rgt < ovls[gsort[j].ridx].path.abpos) nshort += 1; } if (nshort > .2*gcnt) do { rgt += TRACE_SPACING; if (rgt >= crb->end) { if (crb >= LastB) break; crb += 1; rgt = crb->beg; } } while (qa[rgt/TRACE_SPACING] > GOOD_QV); // Could not find primary anchor pair, then 
declare a SPLIT if (clb < FirstB || crb > LastB) { #ifdef DEBUG_GAP_STATUS printf(" ANCHOR FAIL (BOUNDS)\n"); #endif return (SPLIT); } } // Count all patch candidates that have a good anchor pair ncand = 0; for (j = 0; j < gcnt; j++) { lidx = gsort[j].lidx; ridx = gsort[j].ridx; #ifdef DEBUG_PATCHING if (lidx != ridx) printf(" %5d [%5d,%5d] [%5d,%5d] %4d", ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos, ovls[ridx].path.abpos,ovls[ridx].path.aepos,gsort[j].delta); else printf(" %5d [%5d,%5d] SSS", ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos); #endif can = compute_patch(lft,rgt,ovls+lidx,ovls+ridx); if (can != NULL) { #ifdef DEBUG_PATCHING printf(" %d",can->end-can->beg); #endif if (can->anc <= ANCHOR_THRESH) { ncand += 1; #ifdef DEBUG_PATCHING printf(" AA %d %d(%d)",can->anc,can->bad,can->avg); #endif } } #ifdef DEBUG_PATCHING printf("\n"); #endif } // If there are less than 5 of them, then seek better anchor points a bit // further back if (ncand < 5) { int x, best, nlft, nrgt; int nanchor, ntry; #ifdef DEBUG_PATCHING printf(" NOT ENOUGH\n"); #endif // Try 4 additional anchor spots located at good intervals of A (if available) // One can cross other gaps in the search. Try the one with the most // partners having match scores below the anchor threshold. Do this to the // left and right. (A better search could be arranged (i.e. find smallest // spanning pair of adjusted anchors, but this situation happens 1 in 5000 // times, so felt it was not worth it). 
ntry = 0; nlft = lft; best = -1; for (x = lft; ntry < 5; x -= TRACE_SPACING) if (x <= clb->beg) { if (clb <= FirstB) break; clb -= 1; x = clb->end + TRACE_SPACING; } else if (qa[x/TRACE_SPACING-1] <= GOOD_QV) { ntry += 1; nanchor = 0; for (j = 0; j < gcnt; j++) if (eval_lft_anchor(x,ovls+gsort[j].lidx) <= ANCHOR_THRESH) nanchor += 1; #ifdef DEBUG_PATCHING printf(" %5d: %3d\n",x,nanchor); #endif if (nanchor > best) { best = nanchor; nlft = x; } } #ifdef DEBUG_PATCHING printf(" %5d->%5d\n",lft,nlft); #endif ntry = 0; nrgt = rgt; best = -1; for (x = rgt; ntry < 5; x += TRACE_SPACING) if (x >= crb->end) { if (crb >= LastB) break; crb += 1; x = crb->beg - TRACE_SPACING; } else if (qa[x/TRACE_SPACING] <= GOOD_QV) { ntry += 1; nanchor = 0; for (j = 0; j < gcnt; j++) if (eval_rgt_anchor(x,ovls+gsort[j].ridx) <= ANCHOR_THRESH) nanchor += 1; #ifdef DEBUG_PATCHING printf(" %5d: %3d\n",x,nanchor); #endif if (nanchor > best) { best = nanchor; nrgt = x; } } #ifdef DEBUG_PATCHING printf(" %5d->%5d\n",rgt,nrgt); #endif // If a better candidate pair of anchor points does not exist, then split. 
if (lft == nlft && rgt == nrgt) { #ifdef DEBUG_GAP_STATUS printf(" ANCHOR FAIL (ONCE) %d\n",ncand); #endif return (SPLIT); } lft = nlft; rgt = nrgt; // Check out if the new anchor pair has 5 or more candidate patches ncand = 0; for (j = 0; j < gcnt; j++) { lidx = gsort[j].lidx; ridx = gsort[j].ridx; #ifdef DEBUG_PATCHING if (lidx != ridx) printf(" %5d [%5d,%5d] [%5d,%5d] %4d", ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos, ovls[ridx].path.abpos,ovls[ridx].path.aepos,gsort[j].delta); else printf(" %5d [%5d,%5d] SSS", ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos); #endif if (lft <= ovls[lidx].path.aepos && rgt >= ovls[ridx].path.abpos) { can = compute_patch(lft,rgt,ovls+lidx,ovls+ridx); if (can != NULL) { #ifdef DEBUG_PATCHING printf(" %d",can->end-can->beg); #endif if (can->anc <= ANCHOR_THRESH) { ncand += 1; #ifdef DEBUG_PATCHING printf(" AA %d %d(%d)",can->anc,can->bad,can->avg); #endif } } } #ifdef DEBUG_PATCHING printf("\n"); #endif } // Could not arrange 5 patch candidates, give up and split. 
if (ncand < 5)
                { 
#ifdef DEBUG_GAP_STATUS
                  printf("         ANCHOR FAIL (TWICE) %d\n",ncand);
#endif
                  return (SPLIT);
                }
            }

          //  A usable anchor pair was found: report it to the caller and call the gap spannable

          *p_lft = lft;
          *p_rgt = rgt;

#ifdef DEBUG_GAP_STATUS
          printf("      SPAN %5d:  PATCHABLE\n",rgt-lft);
#endif
          return (SPAN);
        }
    }
  }
}

//  Classify every gap between consecutive HQ-blocks of a read and merge blocks across
//    gaps that are found to be spannable.
//
//    aread  : read index (only used by the DEBUG_CLASS print-out)
//    ovls   : the novl overlaps of the read's pile, passed through to gap_status
//    block  : array of *p_nblk intervals, rewritten in place
//    p_nblk : in: number of blocks, out: number of blocks remaining (j+1)
//
//    Returns a vector "status" where status[i], for i in [1,*p_nblk), is the value that
//    gap_status returned for the gap immediately to the left of block[i].  The vector is a
//    static buffer that is reused (and possibly reallocated) by the next call, so the caller
//    must not free it and must consume it before calling again.

static int *GAP_ANALYSIS(int aread, Overlap *ovls, int novl, Interval *block, int *p_nblk)
{ static int  bmax = 0;
  static int *status = NULL;    //  Status of gaps between HQ_blocks

#if defined(DEBUG_SUMMARY) || defined(DEBUG_CLASS)
  static char *status_string[4] = { "LOWQ", "SPAN", "SPLIT", "ADAPT" };
#endif

  int nblk;
  int i, j;
  int slft = 0, srgt = 0;

  (void) aread;       //  Silences the unused-parameter warning when no debug flag is set

  nblk = *p_nblk;

  //  Grow the static status vector (with 20% slack) if this read has more blocks than any before

  if (nblk > bmax)
    { bmax = 1.2*nblk + 100;
      status = (int *) Realloc(status,bmax*sizeof(int),"Allocating status vector");
      if (status == NULL)
        exit (1);
    }

  gap_status(ovls,novl,block,block+nblk,NULL,NULL);    //  Initialization call

  //  block[0..j] is the output list built so far, block[i] the next input block.
  //    The gap examined is always the one between output block j and input block i.

  j = 0;
  for (i = 1; i < nblk; i++)
    { status[i] = gap_status(ovls,novl,block+j,block+i,&slft,&srgt);
      if (status[i] == SPAN)
        { //  The returned anchors [slft,srgt] may lie beyond the two blocks that were
          //    passed in: back j up to the output block holding slft and advance i to the
          //    input block holding srgt, trimming both to the anchors.
          //  NOTE(review): neither scan is bounds checked here; they presumably rely on
          //    gap_status keeping the anchors inside the block list -- confirm.

          while (slft < block[j].beg)
            j -= 1;
          block[j].end = slft;
          j += 1;
          status[j] = status[i];
          while (srgt > block[i].end)
            i += 1;
          block[j] = block[i];
          block[j].beg = srgt;
        }
      else
        { //  Any other status: the block is simply appended to the output list

          j += 1;
          status[j] = status[i];
          block[j]  = block[i];
        }
    }
  nblk = j+1;

#ifdef DEBUG_SUMMARY
#ifdef DEBUG_GAP_STATUS
  printf("  FINAL:\n");
#endif
  printf("    [%d,%d]",block[0].beg,block[0].end);
  for (i = 1; i < nblk; i++)
    printf(" %s [%d,%d]",status_string[status[i]],block[i].beg,block[i].end);
#endif

#ifdef DEBUG_CLASS
  for (i = 1; i < nblk; i++)
    printf("AREAD %d %s [%d,%d]\n",aread,status_string[status[i]],block[i-1].end,block[i].beg);
#endif

  *p_nblk = nblk;
  return (status);
}

/*******************************************************************************************
 *
 *  SCRUB EACH PILE:
 *     Trim low-quality tips of reads and patch low quality intervals within a sequence
 *     Trim adapter (and associated redundant prefix or suffix)
 *     Break chimers or all unscaffoldable no-coverage gaps of reads
 *
********************************************************************************************/

//  Analyze all the gaps between the good patches found in the first pass.
//    Consider a hole between two good intervals [lb,le] and [rb,re].  An overlap
//    is anchored to the left of the hole if abpos <= le-COVER_LEN and aepos >= rb+COVER_LEN
//
//  GAPS is the per-pile action: it is handed read "aread" and its "novl" overlaps "ovls",
//    computes the read's HQ-blocks and the status of the gaps between them, updates the
//    global trimming statistics, and appends one record for the read to the .trim track
//    (and, when ANNOTATE is defined, to the auxiliary annotation tracks).  Exactly one index
//    entry is written to TR_AFILE on every path out of the routine.

static void GAPS(int aread, Overlap *ovls, int novl)
{ int       alen;
  int       nblk;
  Interval *block;
  int      *status;

#if defined(DEBUG_HQ_BLOCKS) || defined(DEBUG_HOLE_FINDER) || defined(DEBUG_GAP_STATUS) || defined(DEBUG_SUMMARY)
  printf("\n");
  printf("AREAD %d\n",aread);
#endif

  alen = DB->reads[aread].rlen;

  //  Reads shorter than HGAP_MIN are not processed: only the (unchanged) running index of
  //    each track is written, so the read gets an empty record, and it is not counted in
  //    nreads/totlen.

  if (alen < HGAP_MIN)
    {
#ifdef ANNOTATE
      fwrite(&HQ_INDEX,sizeof(int64),1,HQ_AFILE);
      fwrite(&SN_INDEX,sizeof(int64),1,SN_AFILE);
      fwrite(&SP_INDEX,sizeof(int64),1,SP_AFILE);
      fwrite(&AD_INDEX,sizeof(int64),1,AD_AFILE);
      fwrite(&HL_INDEX,sizeof(int64),1,HL_AFILE);
      fwrite(&KP_INDEX,sizeof(int64),1,KP_AFILE);
#endif
      fwrite(&TR_INDEX,sizeof(int64),1,TR_AFILE);
      return;
    }

  nreads += 1;
  totlen += alen;

  //  Partition into HQ-blocks

  block = HQ_BLOCKS(aread,&nblk);

  //  Find holes and modify HQ-blocks if necessary

  if (nblk > 0)
    nblk = FIND_HOLES(aread,ovls,novl,block,nblk);

  //  Determine the status of each gap between a pair of blocks

  if (nblk > 0)
    status = GAP_ANALYSIS(aread,ovls,novl,block,&nblk);

  //  No blocks?  ==> nothing to do
  //    (the whole read is counted as eliminated and gets an empty record in every track;
  //     "status" is only assigned when nblk > 0, and is only read below this return)

  if (nblk <= 0)
    { nelim   += 1;
      nelimbp += alen;
#ifdef ANNOTATE
      fwrite(&HQ_INDEX,sizeof(int64),1,HQ_AFILE);
      fwrite(&SN_INDEX,sizeof(int64),1,SN_AFILE);
      fwrite(&SP_INDEX,sizeof(int64),1,SP_AFILE);
      fwrite(&AD_INDEX,sizeof(int64),1,AD_AFILE);
      fwrite(&HL_INDEX,sizeof(int64),1,HL_AFILE);
      fwrite(&KP_INDEX,sizeof(int64),1,KP_AFILE);
#endif
      fwrite(&TR_INDEX,sizeof(int64),1,TR_AFILE);
      return;
    }

#ifdef ANNOTATE

  //  Annotation tracks: every block goes to the .hq track, and every gap [block[i-1].end,
  //    block[i].beg] goes to the track selected by its status (SPAN and LOWQ share .span).

  { int i;

    for (i = 0; i < nblk; i++)
      { fwrite(&(block[i].beg),sizeof(int),1,HQ_DFILE);
        fwrite(&(block[i].end),sizeof(int),1,HQ_DFILE);
        if (i > 0)
          { if (status[i] == SPAN || status[i] == LOWQ)
              { fwrite(&(block[i-1].end),sizeof(int),1,SN_DFILE);
                fwrite(&(block[i].beg),sizeof(int),1,SN_DFILE);
                SN_INDEX += 2*sizeof(int);
              }
            else if (status[i] == SPLIT)
              { fwrite(&(block[i-1].end),sizeof(int),1,SP_DFILE);
                fwrite(&(block[i].beg),sizeof(int),1,SP_DFILE);
                SP_INDEX += 2*sizeof(int);
              }
            else // status[i] == ADAPT
              { fwrite(&(block[i-1].end),sizeof(int),1,AD_DFILE);
                fwrite(&(block[i].beg),sizeof(int),1,AD_DFILE);
                AD_INDEX += 2*sizeof(int);
              }
          }
      }
    HQ_INDEX += 2*sizeof(int)*nblk;
    fwrite(&HQ_INDEX,sizeof(int64),1,HQ_AFILE);
    fwrite(&SN_INDEX,sizeof(int64),1,SN_AFILE);
    fwrite(&SP_INDEX,sizeof(int64),1,SP_AFILE);
    fwrite(&AD_INDEX,sizeof(int64),1,AD_AFILE);
  }
#endif

  //  Find largest non-adapter/subread range: block[abeg..aend)

  { int cmax, amax, abeg = 0, aend = 0;
    int p, i;

    //  p is the first block of the current adapter-free run and cmax its accumulated size;
    //    amax/abeg/aend remember the largest run seen so far.  A run is closed at every
    //    ADAPT gap.  Within a run a non-SPLIT gap contributes its own width as well as the
    //    next block (end-to-end distance), a SPLIT gap only the next block.

    amax = 0;
    p = 0;
    cmax = block[0].end-block[0].beg;
    for (i = 1; i < nblk; i++)
      if (status[i] == ADAPT)
        { if (cmax > amax)
            { amax = cmax;
              abeg = p;
              aend = i;
            }
          p = i;
          cmax = block[i].end - block[i].beg;
        }
      else if (status[i] != SPLIT)
        cmax += block[i].end - block[i-1].end;
      else
        cmax += block[i].end - block[i].beg;
    if (cmax > amax)
      { amax = cmax;
        abeg = p;
        aend = nblk;
      }

    //  Drop a final block that is shorter than one trace interval

    if (block[aend-1].end - block[aend-1].beg < TRACE_SPACING)
      { aend -= 1;
        nblk -= 1;
        //  assert: aend == nblk && status[aend-1] = SPLIT
      }

    //  A block ending exactly at the read end is rounded down to a multiple of TRACE_SPACING

    if (block[aend-1].end == alen)
      block[aend-1].end = (alen/TRACE_SPACING)*TRACE_SPACING;

#ifdef DEBUG_SUMMARY
    printf("  :::  Keeping [%d,%d]\n",block[abeg].beg,block[aend-1].end);
#endif

    //  Accumulate statistics
    //    5'/3' trim = bases before the first / after the last block, adapter trim = blocks
    //    outside the kept range [abeg,aend), and one gap record per gap inside the range.

    if (block[0].beg > 0)
      { n5trm   += 1;
        n5trmbp += block[0].beg;
      }
    if (block[nblk-1].end < alen)
      { n3trm   += 1;
        n3trmbp += alen - block[nblk-1].end;
      }
    if (abeg > 0)
      { natrm   += 1;
        natrmbp += block[abeg].beg - block[0].beg;
      }
    if (aend < nblk)
      { natrm   += 1;
        natrmbp += (block[nblk-1].end - block[aend-1].end);
      }

    for (i = abeg+1; i < aend; i++)
      { ngaps   += 1;
        ngapsbp += block[i].beg - block[i-1].end;
        if (status[i] == LOWQ)
          { nlowq   += 1;
            nlowqbp += block[i].beg - block[i-1].end;
          }
        else if (status[i] == SPAN)
          { nspan   += 1;
            nspanbp += block[i].beg - block[i-1].end;
          }
        else // status[i] == SPLIT
          { nchim   += 1;
            nchimbp += block[i].beg - block[i-1].end;
          }
      }

#ifdef ANNOTATE

    //  .keep track: the kept range, cut into separate intervals at every SPLIT gap

    fwrite(&(block[abeg].beg),sizeof(int),1,KP_DFILE);
    for (i = abeg+1; i < aend; i++)
      if (status[i] == SPLIT)
        { fwrite(&(block[i-1].end),sizeof(int),1,KP_DFILE);
          fwrite(&(block[i].beg),sizeof(int),1,KP_DFILE);
          KP_INDEX += 2*sizeof(int);
        }
    fwrite(&(block[aend-1].end),sizeof(int),1,KP_DFILE);
    KP_INDEX += 2*sizeof(int);
    fwrite(&KP_INDEX,sizeof(int64),1,KP_AFILE);
#endif

    // Output .trim track for this read
    //   Record layout: beg,end of the first kept block (2 ints), followed by a triple
    //   status,beg,end (3 ints) for every further kept block; TR_INDEX counts the bytes.

    fwrite(&(block[abeg].beg),sizeof(int),1,TR_DFILE);
    fwrite(&(block[abeg].end),sizeof(int),1,TR_DFILE);
    TR_INDEX += 2*sizeof(int);
    for (i = abeg+1; i < aend; i++)
      { fwrite(status+i,sizeof(int),1,TR_DFILE);
        fwrite(&(block[i].beg),sizeof(int),1,TR_DFILE);
        fwrite(&(block[i].end),sizeof(int),1,TR_DFILE);
        TR_INDEX += 3*sizeof(int);
      }
    fwrite(&TR_INDEX,sizeof(int64),1,TR_AFILE);
  }
}

  //  Read in each successive pile and call ACTION on it.
Read in the traces only if // "trace" is nonzero static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int trace) { static Overlap *ovls = NULL; static int omax = 500; static uint16 *paths = NULL; static int pmax = 100000; int64 i, j, novl; int n, a; int pcur; int max; int tbytes; if (ovls == NULL) { ovls = (Overlap *) Malloc(sizeof(Overlap)*omax,"Allocating overlap buffer"); if (ovls == NULL) exit (1); } if (trace && paths == NULL) { paths = (uint16 *) Malloc(sizeof(uint16)*pmax,"Allocating path buffer"); if (paths == NULL) exit (1); } rewind(input); fread(&novl,sizeof(int64),1,input); fread(&TRACE_SPACING,sizeof(int),1,input); if (TRACE_SPACING <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); if (Read_Overlap(input,ovls) != 0) ovls[0].aread = INT32_MAX; else if (trace) { if (ovls[0].path.tlen > pmax) { pmax = 1.2*(ovls[0].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); if (paths == NULL) exit (1); } fread(paths,tbytes,ovls[0].path.tlen,input); if (tbytes == 1) { ovls[0].path.trace = paths; Decompress_TraceTo16(ovls); } } else fseek(input,tbytes*ovls[0].path.tlen,SEEK_CUR); if (ovls[0].aread < DB_FIRST) { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n", Prog_Name,DB_PART); exit (1); } pcur = 0; n = max = 0; for (j = DB_FIRST; j < DB_LAST; j++) { ovls[0] = ovls[n]; a = ovls[0].aread; if (a != j) n = 0; else { if (trace) memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen); n = 1; pcur = ovls[0].path.tlen; while (1) { if (Read_Overlap(input,ovls+n) != 0) { ovls[n].aread = INT32_MAX; break; } if (trace) { if (pcur + ovls[n].path.tlen > pmax) { pmax = 1.2*(pcur+ovls[n].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); if (paths == NULL) exit (1); } fread(paths+pcur,tbytes,ovls[n].path.tlen,input); if (tbytes == 1) { ovls[n].path.trace = paths+pcur; Decompress_TraceTo16(ovls+n); } } else 
fseek(input,tbytes*ovls[n].path.tlen,SEEK_CUR); if (ovls[n].aread != a) break; pcur += ovls[n].path.tlen; n += 1; if (n >= omax) { omax = 1.2*n + 100; ovls = (Overlap *) Realloc(ovls,sizeof(Overlap)*omax,"Expanding overlap buffer"); if (ovls == NULL) exit (1); } } if (n >= max) max = n; pcur = 0; for (i = 0; i < n; i++) { ovls[i].path.trace = paths+pcur; pcur += ovls[i].path.tlen; } } ACTION(j,ovls,n); } if (ovls[n].aread < INT32_MAX) { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n", Prog_Name,DB_PART); exit (1); } return (max); } int main(int argc, char *argv[]) { char *root, *dpwd; int64 novl; DAZZ_TRACK *track; int c; DAZZ_EXTRA ex_hgap, ex_cest; DAZZ_EXTRA ex_good, ex_bad, ex_trim; char *good_name = "Good QV threshold"; char *bad_name = "Bad QV threshold"; char *trim_name = "Trimming statistics"; int64 good64, bad64, tstats[18]; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DAStrim") BAD_QV = -1; GOOD_QV = -1; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("v") break; case 'b': ARG_NON_NEGATIVE(BAD_QV,"Minimum QV score for being considered bad") break; case 'g': ARG_NON_NEGATIVE(GOOD_QV,"Maximum QV score for being considered good") break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc < 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n"); fprintf(stderr," -g: Use as good qv threshold (and not auto-estimate).\n"); fprintf(stderr," -b: Use as bad qv threshold (and not auto-estimate).\n"); exit (1); } } // Open trimmed DB and the qual-track { int status; status = Open_DB(argv[1],DB); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (DB->part) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } Trim_DB(DB); } 
// Get .qual track and extras track = Load_Track(DB,"qual"); if (track != NULL) { FILE *afile; char *aname; int extra, tracklen, size; DAZZ_EXTRA ex_qvs, ex_dif; QV_IDX = (int64 *) track->anno; QV = (uint8 *) track->data; aname = Strdup(Catenate(DB->path,".","qual",".anno"),"Allocating anno file"); if (aname == NULL) exit (1); afile = fopen(aname,"r"); fread(&tracklen,sizeof(int),1,afile); fread(&size,sizeof(int),1,afile); fseeko(afile,0,SEEK_END); extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int)); fseeko(afile,-extra,SEEK_END); ex_hgap.nelem = 0; if (Read_Extra(afile,aname,&ex_hgap) != 0) { fprintf(stderr,"%s: Hgap threshold extra missing from .qual track?\n",Prog_Name); exit (1); } ex_cest.nelem = 0; if (Read_Extra(afile,aname,&ex_cest) != 0) { fprintf(stderr,"%s: Coverage estimate extra missing from .qual track?\n",Prog_Name); exit (1); } ex_qvs.nelem = 0; if (Read_Extra(afile,aname,&ex_qvs) != 0) { fprintf(stderr,"%s: QV histogram extra missing from .qual track?\n",Prog_Name); exit (1); } ex_dif.nelem = 0; if (Read_Extra(afile,aname,&ex_dif) != 0) { fprintf(stderr,"%s: Differences histogram extra missing from .qual track?\n",Prog_Name); exit (1); } fclose(afile); COVERAGE = (int) ((int64 *) (ex_cest.value))[0]; HGAP_MIN = (int) ((int64 *) (ex_hgap.value))[0]; // Compute -g and -b parameters { int64 qsum, qtotal; int64 *qgram; int i, maxqv; int gv, bv; qgram = (int64 *) (ex_qvs.value); maxqv = ex_qvs.nelem - 1; qtotal = 0; for (i = 0; i < maxqv; i++) qtotal += qgram[i]; bv = gv = -1; qsum = 0; for (i = maxqv-1; i >= 0; i--) if (qgram[i] > 0) { qsum += qgram[i]; if ((100.*qsum)/qtotal > 7. && bv < 0) bv = i+1; if ((100.*qsum)/qtotal > 20. 
&& gv < 0) gv = i+1; } if (GOOD_QV < 0) GOOD_QV = gv; if (BAD_QV < 0) BAD_QV = bv; } } else { fprintf(stderr,"%s: Must have a 'qual' track, run DASqv\n",Prog_Name); exit (1); } if (GOOD_QV > BAD_QV) { fprintf(stderr,"%s: Good QV threshold (%d) > Bad QV threshold (%d) ?\n", Prog_Name,GOOD_QV,BAD_QV); exit (1); } // Setup extras ex_good.vtype = DB_INT; // Good QV threshold ex_good.nelem = 1; ex_good.accum = DB_EXACT; ex_good.name = good_name; good64 = GOOD_QV; ex_good.value = &good64; ex_bad.vtype = DB_INT; // Bad QV threshold ex_bad.nelem = 1; ex_bad.accum = DB_EXACT; ex_bad.name = bad_name; bad64 = BAD_QV; ex_bad.value = &bad64; ex_trim.vtype = DB_INT; // Trim statistics ex_trim.nelem = 16; ex_trim.accum = DB_SUM; ex_trim.name = trim_name; ex_trim.value = &tstats; // For each .las file do dpwd = PathTo(argv[1]); root = Root(argv[1],".db"); for (c = 2; c < argc; c++) { Block_Looper *parse; FILE *input; parse = Parse_Block_Arg(argv[c]); while ((input = Next_Block_Arg(parse)) != NULL) { DB_PART = 0; DB_FIRST = 0; DB_LAST = DB->nreads; // Determine if overlap block is being processed and if so get first and last read // from .db file { FILE *dbfile; char buffer[2*MAX_NAME+100]; char *p, *eptr; int i, part, nfiles, nblocks, cutoff, all, oindx; int64 size; p = rindex(Block_Arg_Root(parse),'.'); if (p != NULL) { part = strtol(p+1,&eptr,10); if (*eptr == '\0' && eptr != p+1) { dbfile = Fopen(Catenate(dpwd,"/",root,".db"),"r"); if (dbfile == NULL) exit (1); if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) SYSTEM_READ_ERROR for (i = 0; i < nfiles; i++) if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3) SYSTEM_READ_ERROR for (i = 1; i <= part; i++) if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2) SYSTEM_READ_ERROR if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2) SYSTEM_READ_ERROR fclose(dbfile); DB_PART = part; } } } // Set up QV trimming track 
#define SETUP(AFILE,DFILE,INDEX,anno,data,S) \ { int len, size; \ \ if (DB_PART > 0) \ { AFILE = Fopen(Catenate(dpwd,PATHSEP,root, \ Numbered_Suffix(".",DB_PART,anno)),"w"); \ DFILE = Fopen(Catenate(dpwd,PATHSEP,root, \ Numbered_Suffix(".",DB_PART,data)),"w"); \ } \ else \ { AFILE = Fopen(Catenate(dpwd,PATHSEP,root,anno),"w"); \ DFILE = Fopen(Catenate(dpwd,PATHSEP,root,data),"w"); \ } \ if (AFILE == NULL || DFILE == NULL) \ exit (1); \ \ len = DB_LAST - DB_FIRST; \ size = S; \ fwrite(&len,sizeof(int),1,AFILE); \ fwrite(&size,sizeof(int),1,AFILE); \ INDEX = 0; \ fwrite(&INDEX,sizeof(int64),1,AFILE); \ } SETUP(TR_AFILE,TR_DFILE,TR_INDEX,".trim.anno",".trim.data",8) #ifdef ANNOTATE SETUP(HQ_AFILE,HQ_DFILE,HQ_INDEX,".hq.anno",".hq.data",0) SETUP(SN_AFILE,SN_DFILE,SN_INDEX,".span.anno",".span.data",0) SETUP(SP_AFILE,SP_DFILE,SP_INDEX,".split.anno",".split.data",0) SETUP(AD_AFILE,AD_DFILE,AD_INDEX,".adapt.anno",".adapt.data",0) SETUP(HL_AFILE,HL_DFILE,HL_INDEX,".hole.anno",".hole.data",0) SETUP(KP_AFILE,KP_DFILE,KP_INDEX,".keep.anno",".keep.data",0) #endif // Get trace point spacing information fread(&novl,sizeof(int64),1,input); fread(&TRACE_SPACING,sizeof(int),1,input); // Initialize statistics gathering nreads = 0; totlen = 0; nelim = 0; n5trm = 0; n3trm = 0; natrm = 0; nelimbp = 0; n5trmbp = 0; n3trmbp = 0; natrmbp = 0; ngaps = 0; nlowq = 0; nspan = 0; nchim = 0; ngapsbp = 0; nlowqbp = 0; nspanbp = 0; nchimbp = 0; if (VERBOSE) { printf("\nDAStrim"); if (HGAP_MIN > 0) printf(" -H%d",HGAP_MIN); printf(" -c%d -g%d -b%d %s %s\n",COVERAGE,GOOD_QV,BAD_QV,argv[1],argv[c]); } // Process each read pile make_a_pass(input,GAPS,1); // Write out extras and close .trim track tstats[ 0] = nelim; tstats[ 1] = n5trm; tstats[ 2] = n3trm; tstats[ 3] = natrm; tstats[ 4] = nelimbp; tstats[ 5] = n5trmbp; tstats[ 6] = n3trmbp; tstats[ 7] = natrmbp; tstats[ 8] = ngaps; tstats[ 9] = nlowq; tstats[10] = nspan; tstats[11] = nchim; tstats[12] = ngapsbp; tstats[13] = nlowqbp; tstats[14] = 
nspanbp; tstats[15] = nchimbp; Write_Extra(TR_AFILE,&ex_hgap); Write_Extra(TR_AFILE,&ex_cest); Write_Extra(TR_AFILE,&ex_good); Write_Extra(TR_AFILE,&ex_bad); Write_Extra(TR_AFILE,&ex_trim); fclose(TR_AFILE); fclose(TR_DFILE); #ifdef ANNOTATE fclose(HQ_AFILE); fclose(HQ_DFILE); fclose(SN_AFILE); fclose(SN_DFILE); fclose(SP_AFILE); fclose(SP_DFILE); fclose(AD_AFILE); fclose(AD_DFILE); fclose(HL_AFILE); fclose(HL_DFILE); fclose(KP_AFILE); fclose(KP_DFILE); #endif fclose(input); // If verbose output statistics summary to stdout if (VERBOSE) { printf("\n Input: "); Print_Number((int64) nreads,7,stdout); printf(" (100.0%%) reads "); Print_Number(totlen,12,stdout); printf(" (100.0%%) bases"); if (HGAP_MIN > 0) { printf(" (another "); Print_Number((int64) ((DB_LAST-DB_FIRST)-nreads),0,stdout); printf(" were < H-length)"); } printf("\n"); printf(" Trimmed: "); Print_Number(nelim,7,stdout); printf(" (%5.1f%%) reads ",(100.*nelim)/nreads); Print_Number(nelimbp,12,stdout); printf(" (%5.1f%%) bases\n",(100.*nelimbp)/totlen); printf(" 5' trim: "); Print_Number(n5trm,7,stdout); printf(" (%5.1f%%) reads ",(100.*n5trm)/nreads); Print_Number(n5trmbp,12,stdout); printf(" (%5.1f%%) bases\n",(100.*n5trmbp)/totlen); printf(" 3' trim: "); Print_Number(n3trm,7,stdout); printf(" (%5.1f%%) reads ",(100.*n3trm)/nreads); Print_Number(n3trmbp,12,stdout); printf(" (%5.1f%%) bases\n",(100.*n3trmbp)/totlen); printf(" Adapter: "); Print_Number(natrm,7,stdout); printf(" (%5.1f%%) reads ",(100.*natrm)/nreads); Print_Number(natrmbp,12,stdout); printf(" (%5.1f%%) bases\n",(100.*natrmbp)/totlen); printf("\n"); printf(" Gaps: "); Print_Number(ngaps,7,stdout); printf(" (%5.1f%%) gaps ",(100.*(ngaps))/nreads); Print_Number(ngapsbp,12,stdout); printf(" (%5.1f%%) bases\n",(100.*(ngapsbp))/totlen); printf(" Low QV: "); Print_Number(nlowq,7,stdout); printf(" (%5.1f%%) gaps ",(100.*(nlowq))/nreads); Print_Number(nlowqbp,12,stdout); printf(" (%5.1f%%) bases\n",(100.*(nlowqbp))/totlen); printf(" Span'd: "); 
Print_Number(nspan,7,stdout); printf(" (%5.1f%%) gaps ",(100.*(nspan))/nreads); Print_Number(nspanbp,12,stdout); printf(" (%5.1f%%) bases\n",(100.*(nspanbp))/totlen); printf(" Break: "); Print_Number(nchim,7,stdout); printf(" (%5.1f%%) gaps ",(100.*(nchim))/nreads); Print_Number(nchimbp,12,stdout); printf(" (%5.1f%%) bases\n",(100.*(nchimbp))/totlen); printf("\n"); printf(" Clipped: "); Print_Number(n5trm+n3trm+nelim+natrm+nchim,7,stdout); printf(" clips "); Print_Number(n5trmbp+n3trmbp+nelimbp+natrmbp,12,stdout); printf(" (%5.1f%%) bases\n",(100.*(n5trmbp+n3trmbp+nelimbp+natrmbp+nchimbp))/totlen); printf(" Patched: "); Print_Number(nlowq+nspan,7,stdout); printf(" patches "); Print_Number(nlowqbp+nspanbp,12,stdout); printf(" (%5.1f%%) bases\n",(100.*(nlowqbp+nspanbp))/totlen); } } Free_Block_Arg(parse); } free(dpwd); free(root); Close_DB(DB); free(Prog_Name); exit (0); } DASCRUBBER-1.1/DB.c000066400000000000000000001642371327574206400134470ustar00rootroot00000000000000/******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif /******************************************************************************************* * * GENERAL UTILITIES * ********************************************************************************************/ char *Prog_Name; #ifdef INTERACTIVE char Ebuffer[1000]; #endif int Count_Args(char *var) { int cnt, lev; char *s; cnt = 1; lev = 0; for (s = var; *s != '\0'; s++) if (*s == ',') { if (lev == 0) cnt += 1; } else if (*s == '(') lev += 1; else if (*s == ')') lev -= 1; return (cnt); } void *Malloc(int64 size, char *mesg) { void *p; if ((p = malloc(size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } void *Realloc(void *p, int64 size, char *mesg) { if (size <= 0) size = 1; if ((p = realloc(p,size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } char *Strdup(char *name, char *mesg) { char *s; if (name == NULL) return (NULL); if ((s = strdup(name)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (s); } FILE *Fopen(char *name, char *mode) { FILE *f; if (name == NULL || mode == NULL) return (NULL); if ((f = fopen(name,mode)) == NULL) EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); return (f); } char *PathTo(char *name) { char *path, *find; if (name == NULL) return (NULL); if ((find = rindex(name,'/')) != NULL) { *find = '\0'; path = Strdup(name,"Extracting path from"); *find = '/'; } else path = Strdup(".","Allocating default path"); return (path); } char *Root(char *name, char *suffix) { char *path, *find, *dot; int epos; if (name == NULL) return (NULL); find = rindex(name,'/'); if (find == NULL) find = name; else find += 1; if (suffix == NULL) { dot = strchr(find,'.'); if (dot != NULL) *dot = 
'\0'; path = Strdup(find,"Extracting root from"); if (dot != NULL) *dot = '.'; } else { epos = strlen(find); epos -= strlen(suffix); if (epos > 0 && strcasecmp(find+epos,suffix) == 0) { find[epos] = '\0'; path = Strdup(find,"Extracting root from"); find[epos] = suffix[0]; } else path = Strdup(find,"Allocating root"); } return (path); } char *Catenate(char *path, char *sep, char *root, char *suffix) { static char *cat = NULL; static int max = -1; int len; if (path == NULL || root == NULL || sep == NULL || suffix == NULL) return (NULL); len = strlen(path); len += strlen(sep); len += strlen(root); len += strlen(suffix); if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); return (NULL); } } sprintf(cat,"%s%s%s%s",path,sep,root,suffix); return (cat); } char *Numbered_Suffix(char *left, int num, char *right) { static char *suffix = NULL; static int max = -1; int len; if (left == NULL || right == NULL) return (NULL); len = strlen(left); len += strlen(right) + 40; if (len > max) { max = ((int) (1.2*len)) + 100; if ((suffix = (char *) realloc(suffix,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); return (NULL); } } sprintf(suffix,"%s%d%s",left,num,right); return (suffix); } #define COMMA ',' // Print big integers with commas/periods for better readability void Print_Number(int64 num, int width, FILE *out) { if (width == 0) { if (num < 1000ll) fprintf(out,"%lld",num); else if (num < 1000000ll) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else if (num < 1000000000ll) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll, COMMA,(num%1000000000ll)/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); } else { if (num < 1000ll) fprintf(out,"%*lld",width,num); else if (num < 
1000000ll) { if (width <= 4) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll); } else if (num < 1000000000ll) { if (width <= 8) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); } else { if (width <= 12) fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); } } } // Return the number of digits, base 10, of num int Number_Digits(int64 num) { int digit; digit = 0; while (num >= 1) { num /= 10; digit += 1; } return (digit); } /******************************************************************************************* * * READ COMPRESSION/DECOMPRESSION UTILITIES * ********************************************************************************************/ // Compress read into 2-bits per base (from [0-3] per byte representation void Compress_Read(int len, char *s) { int i; char c, d; char *s0, *s1, *s2, *s3; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; c = s1[len]; d = s2[len]; s0[len] = s1[len] = s2[len] = 0; for (i = 0; i < len; i += 4) *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]); s1[len] = c; s2[len] = d; } // Uncompress read form 2-bits per base into [0-3] per byte representation void Uncompress_Read(int len, char *s) { int i, tlen, byte; char *s0, *s1, *s2, *s3; char *t; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; tlen = (len-1)/4; t = s+tlen; for (i = tlen*4; i >= 0; i -= 4) { byte = *t--; s0[i] = (char) ((byte >> 6) & 0x3); s1[i] = (char) ((byte >> 4) & 0x3); s2[i] = (char) ((byte >> 2) & 0x3); s3[i] = (char) (byte & 0x3); } s[len] = 4; } // Convert read in [0-3] 
//    representation to ascii representation (end with '\0')
//
//  Lower_Read, Upper_Read and Letter_Arrow rewrite, in place, a sequence of codes in [0,3]
//    terminated by the value 4 into the corresponding characters, and replace the
//    terminator with '\0'.  They differ only in the 4-letter alphabet used.

//  In-place worker for the three routines above: only the low two bits of a code select
//    the letter, so a stray code can never index outside the 4-entry alphabet.

static void codes_to_letters(char *s, const char *letter)
{ for ( ; *s != 4; s++)
    *s = letter[*s & 0x3];
  *s = '\0';
}

void Lower_Read(char *s)
{ static const char letter[4] = { 'a', 'c', 'g', 't' };

  codes_to_letters(s,letter);
}

void Upper_Read(char *s)
{ static const char letter[4] = { 'A', 'C', 'G', 'T' };

  codes_to_letters(s,letter);
}

void Letter_Arrow(char *s)
{ static const char letter[4] = { '1', '2', '3', '4' };

  codes_to_letters(s,letter);
}

//  Code of a base character: c/C => 1, g/G => 2, t/T => 3, everything else => 0.
//    Taking an unsigned char makes the mapping total, bytes >= 0x80 included.

static char base_code(unsigned char c)
{ switch (c)
  { case 'C':
    case 'c':
      return (1);
    case 'G':
    case 'g':
      return (2);
    case 'T':
    case 't':
      return (3);
    default:
      return (0);
  }
}

//  Code of an arrow (pulse-width) character: '1' => 0, '2' => 1, '3' => 2, everything else
//    => 3.  'G' also maps to 2; that entry is retained unchanged from the look-up table
//    this function replaces.

static char arrow_code(unsigned char c)
{ switch (c)
  { case '1':
      return (0);
    case '2':
      return (1);
    case '3':
    case 'G':
      return (2);
    default:
      return (3);
  }
}

//  Convert read in ascii representation to [0-3] representation (end with 4)
//    The '\0'-terminated string s is rewritten in place and its terminator replaced by 4.

void Number_Read(char *s)
{ for ( ; *s != '\0'; s++)
    *s = base_code((unsigned char) *s);
  *s = 4;
}

//  Same as Number_Read, but for a string of arrow characters.

void Number_Arrow(char *s)
{ for ( ; *s != '\0'; s++)
    *s = arrow_code((unsigned char) *s);
  *s = 4;
}


/*******************************************************************************************
 *
 *  DB OPEN, TRIM & CLOSE ROUTINES
 *
 ********************************************************************************************/


// Open the given database or dam, "path" into the supplied DAZZ_DB record "db". If the name has
//   a part # in it then just the part is opened.  The index array is allocated (for all or
//   just the part) and read in.
// Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char* path, DAZZ_DB *db) { DAZZ_DB dbcopy; char *root, *pwd, *bptr, *fptr, *cat; int nreads; FILE *index, *dbvis; int status, plen, isdam; int part, cutoff, all; int ufirst, tfirst, ulast, tlast; status = -1; dbcopy = *db; plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); pwd = PathTo(path); bptr = rindex(root,'.'); if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-') { part = strtol(bptr+1,&fptr,10); if (*fptr != '\0' || part == 0) part = 0; else *bptr = '\0'; } else part = 0; isdam = 0; cat = Catenate(pwd,"/",root,".db"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { cat = Catenate(pwd,"/",root,".dam"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path); goto error; } isdam = 1; } if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) goto error1; if (fread(db,sizeof(DAZZ_DB),1,index) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); goto error2; } { int p, nblocks, nfiles; int64 size; char fname[MAX_NAME], prolog[MAX_NAME]; nblocks = 0; if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } for (p = 0; p < nfiles; p++) if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) if (part == 0) { cutoff = 0; all = DB_ALL; } else { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", Prog_Name,root); goto error2; } else { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is 
junk\n",Prog_Name,root); goto error2; } if (part > nblocks) { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); goto error2; } } if (part > 0) { for (p = 1; p <= part; p++) if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } } else { ufirst = tfirst = 0; ulast = db->ureads; tlast = db->treads; } } db->trimmed = 0; db->tracks = NULL; db->part = part; db->cutoff = cutoff; db->allarr |= all; db->ufirst = ufirst; db->tfirst = tfirst; nreads = ulast-ufirst; if (part <= 0) { db->reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index"); if (db->reads == NULL) goto error2; db->reads += 1; if (fread(db->reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(db->reads-1); goto error2; } } else { DAZZ_READ *reads; int i, r, maxlen; int64 totlen; reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index"); if (reads == NULL) goto error2; reads += 1; fseeko(index,sizeof(DAZZ_READ)*ufirst,SEEK_CUR); if (fread(reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(reads-1); goto error2; } totlen = 0; maxlen = 0; for (i = 0; i < nreads; i++) { r = reads[i].rlen; totlen += r; if (r > maxlen) maxlen = r; } db->maxlen = maxlen; db->totlen = totlen; db->reads = reads; } ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part ((int *) (db->reads))[-2] = tlast - tfirst; db->nreads = nreads; db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); if (db->path == NULL) goto error2; db->bases = NULL; db->loaded = 0; status = isdam; error2: fclose(index); error1: fclose(dbvis); error: if (bptr != 
NULL) *bptr = '.'; free(pwd); free(root); if (status < 0) *db = dbcopy; return (status); } // Trim the DB or part thereof and all loaded tracks according to the cuttof and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(DAZZ_DB *db) { int i, j, r, f; int allflag, cutoff, css; int64 totlen; int maxlen, nreads; DAZZ_TRACK *record; DAZZ_READ *reads; if (db->trimmed) return; if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return; cutoff = db->cutoff; if ((db->allarr & DB_ALL) != 0) allflag = 0; else allflag = DB_BEST; reads = db->reads; nreads = db->nreads; for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,".@qvs") == 0) { uint16 *table = ((DAZZ_QV *) record)->table; j = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) table[j++] = table[i]; } else { int *anno4, size; int64 *anno8; char *anno, *data; size = record->size; data = (char *) record->data; if (data == NULL) { anno = (char *) record->anno; j = 0; for (i = r = 0; i < db->nreads; i++, r += size) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (record->anno); j = anno4[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (record->anno); j = anno8[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } record->data = 
Realloc(record->data,anno8[j],NULL); } record->anno = Realloc(record->anno,record->size*(j+1),NULL); } totlen = maxlen = 0; for (j = i = 0; i < nreads; i++) { f = reads[i].flags; if ((f & DB_CSS) == 0) css = 0; r = reads[i].rlen; if ((f & DB_BEST) >= allflag && r >= cutoff) { totlen += r; if (r > maxlen) maxlen = r; reads[j] = reads[i]; if (css) reads[j++].flags |= DB_CSS; else reads[j++].flags &= ~DB_CSS; css = 1; } } db->totlen = totlen; db->maxlen = maxlen; db->nreads = j; db->trimmed = 1; if (j < nreads) { db->reads = Realloc(reads-1,sizeof(DAZZ_READ)*(j+2),NULL); db->reads += 1; } } // The DB has already been trimmed, but a track over the untrimmed DB needs to be loaded. // Trim the track by rereading the untrimmed DB index from the file system. static int Late_Track_Trim(DAZZ_DB *db, DAZZ_TRACK *track, int ispart) { int i, j, r; int allflag, cutoff; int ureads; char *root; DAZZ_READ read; FILE *indx; if (!db->trimmed) return (0); if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return (0); cutoff = db->cutoff; if ((db->allarr & DB_ALL) != 0) allflag = 0; else allflag = DB_BEST; root = rindex(db->path,'/') + 2; indx = Fopen(Catenate(db->path,"","",".idx"),"r"); fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*db->ufirst,SEEK_SET); if (ispart) ureads = ((int *) (db->reads))[-1]; else ureads = db->ureads; if (strcmp(track->name,".@qvs") == 0) { EPRINTF(EPLACE,"%s: Cannot load QV track after trimming\n",Prog_Name); fclose(indx); EXIT(1); } { int *anno4, size; int64 *anno8; char *anno, *data; size = track->size; data = (char *) track->data; if (data == NULL) { anno = (char *) track->anno; j = r = 0; for (i = r = 0; i < ureads; i++, r += size) { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } r += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int 
ai; anno4 = (int *) (track->anno); j = anno4[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (track->anno); j = anno8[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno8[j],NULL); } track->anno = Realloc(track->anno,track->size*(j+1),NULL); } fclose(indx); return (0); } // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). 
void Close_DB(DAZZ_DB *db) { DAZZ_TRACK *t, *p; if (db->loaded) free(((char *) (db->bases)) - 1); else if (db->bases != NULL) fclose((FILE *) db->bases); if (db->reads != NULL) free(db->reads-1); free(db->path); Close_QVs(db); for (t = db->tracks; t != NULL; t = p) { p = t->next; free(t->anno); free(t->data); free(t); } } // Return the size in bytes of the memory occupied by a given DB int64 sizeof_DB(DAZZ_DB *db) { int64 s; DAZZ_TRACK *t; s = sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*(db->nreads+2) + strlen(db->path)+1 + (db->totlen+db->nreads+4); t = db->tracks; if (t != NULL && strcmp(t->name,".@qvs") == 0) { DAZZ_QV *q = (DAZZ_QV *) t; s += sizeof(DAZZ_QV) + sizeof(uint16) * db->nreads + q->ncodes * sizeof(QVcoding) + 6; t = t->next; } for (; t != NULL; t = t->next) { s += sizeof(DAZZ_TRACK) + strlen(t->name)+1 + t->size * (db->nreads+1); if (t->data != NULL) { if (t->size == 8) s += sizeof(int)*((int64 *) t->anno)[db->nreads]; else // t->size == 4 s += sizeof(int)*((int *) t->anno)[db->nreads]; } } return (s); } /******************************************************************************************* * * QV LOAD & CLOSE ROUTINES * ********************************************************************************************/ DAZZ_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" DAZZ_QV *Active_QV; // Becomes invalid after closing int Load_QVs(DAZZ_DB *db) { FILE *quiva, *istub, *indx; char *root; uint16 *table; DAZZ_QV *qvtrk; QVcoding *coding, *nx; int ncodes = 0; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) return (0); if (db->trimmed) { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); EXIT(1); } if (db->reads[db->nreads-1].coff < 0) { if (db->part > 0) { EPRINTF(EPLACE,"%s: All QVs for this block have not been added to the DB!\n",Prog_Name); EXIT(1); } else { EPRINTF(EPLACE,"%s: All QVs for this DB have not been added!\n",Prog_Name); EXIT(1); } } // Open .qvs, .idx, and .db files quiva = 
Fopen(Catenate(db->path,"","",".qvs"),"r"); if (quiva == NULL) return (-1); istub = NULL; indx = NULL; table = NULL; coding = NULL; qvtrk = NULL; root = rindex(db->path,'/'); if (root[1] == '.') { *root = '\0'; istub = Fopen(Catenate(db->path,"/",root+2,".db"),"r"); *root = '/'; } else istub = Fopen(Catenate(db->path,"","",".db"),"r"); if (istub == NULL) goto error; { int first, last, nfiles; char prolog[MAX_NAME], fname[MAX_NAME]; int i, j; if (fscanf(istub,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (db->part > 0) { int pfirst, plast; int fbeg, fend; int n, k; FILE *indx; // Determine first how many and which files span the block (fbeg to fend) pfirst = db->ufirst; plast = pfirst + db->nreads; first = 0; for (fbeg = 0; fbeg < nfiles; fbeg++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (last > pfirst) break; first = last; } for (fend = fbeg+1; fend <= nfiles; fend++) { if (last >= plast) break; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } first = last; } indx = Fopen(Catenate(db->path,"","",".idx"),"r"); ncodes = fend-fbeg; coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (indx == NULL || coding == NULL || table == NULL) { ncodes = 0; goto error; } // Carefully get the first coding scheme (its offset is most likely in a DAZZ_RECORD // in .idx that is *not* in memory). Get all the other coding schemes normally and // assign the tables # for each read in the block in "tables". 
rewind(istub); (void) fscanf(istub,DB_NFILE,&nfiles); first = 0; for (n = 0; n < fbeg; n++) { (void) fscanf(istub,DB_FDATA,&last,fname,prolog); first = last; } for (n = fbeg; n < fend; n++) { (void) fscanf(istub,DB_FDATA,&last,fname,prolog); i = n-fbeg; if (first < pfirst) { DAZZ_READ read; fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*first,SEEK_SET); if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); ncodes = i; goto error; } fseeko(quiva,read.coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; } else { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first-pfirst].coff = ftello(quiva); } j = first-pfirst; if (j < 0) j = 0; k = last-pfirst; if (k > db->nreads) k = db->nreads; while (j < k) table[j++] = (uint16) i; first = last; } fclose(indx); indx = NULL; } else { // Load in coding scheme for each file, adjust .coff of first read in the file, and // record which table each read uses ncodes = nfiles; coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (coding == NULL || table == NULL) goto error; first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } fseeko(quiva,db->reads[first].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first].coff = ftello(quiva); for (j = first; j < last; j++) table[j] = (uint16) i; first = last; } } // Allocate and fill in the DAZZ_QV record and add it to the front of the // track list qvtrk = (DAZZ_QV *) Malloc(sizeof(DAZZ_QV),"Allocating QV pseudo-track"); if (qvtrk == NULL) goto error; qvtrk->name = 
Strdup(".@qvs","Allocating QV pseudo-track name"); if (qvtrk->name == NULL) goto error; qvtrk->next = db->tracks; db->tracks = (DAZZ_TRACK *) qvtrk; qvtrk->ncodes = ncodes; qvtrk->table = table; qvtrk->coding = coding; qvtrk->quiva = quiva; } fclose(istub); return (0); error: if (qvtrk != NULL) free(qvtrk); if (table != NULL) free(table); if (coding != NULL) { int i; for (i = 0; i < ncodes; i++) Free_QVcoding(coding+i); free(coding); } if (indx != NULL) fclose(indx); if (istub != NULL) fclose(istub); fclose(quiva); EXIT(1); } // Close the QV stream, free the QV pseudo track and all associated memory void Close_QVs(DAZZ_DB *db) { DAZZ_TRACK *track; DAZZ_QV *qvtrk; int i; Active_DB = NULL; track = db->tracks; if (track != NULL && strcmp(track->name,".@qvs") == 0) { qvtrk = (DAZZ_QV *) track; for (i = 0; i < qvtrk->ncodes; i++) Free_QVcoding(qvtrk->coding+i); free(qvtrk->coding); free(qvtrk->table); fclose(qvtrk->quiva); db->tracks = track->next; free(track); } return; } /******************************************************************************************* * * TRACK LOAD & CLOSE ROUTINES * ********************************************************************************************/ // Return status of track: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track int Check_Track(DAZZ_DB *db, char *track, int *kind) { FILE *afile; int tracklen, size, ispart; int ureads, treads; afile = NULL; if (db->part > 0) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) return (-2); if (fread(&tracklen,sizeof(int),1,afile) != 1) { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); exit (1); } if (fread(&size,sizeof(int),1,afile) != 1) { fprintf(stderr,"%s: track files for %s are 
corrupted\n",Prog_Name,track); exit (1); } if (size == 0) *kind = MASK_TRACK; else if (size > 0) *kind = CUSTOM_TRACK; else { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); exit (1); } fclose(afile); if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (tracklen == ureads) return (0); else if (tracklen == treads) return (1); else return (-1); } // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created DAZZ_TRACK record. If the track does not exist or cannot be // opened for some reason, then NULL is returned. DAZZ_TRACK *Load_Track(DAZZ_DB *db, char *track) { FILE *afile, *dfile; int tracklen, size; int nreads, ispart; int treads, ureads; void *anno; void *data; char *name; DAZZ_TRACK *record; if (track[0] == '.') { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track); EXIT(NULL); } for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,track) == 0) return (record); afile = NULL; if (db->part) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track); return (NULL); } dfile = NULL; anno = NULL; data = NULL; record = NULL; if (ispart) name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data"); else name = Catenate(db->path,".",track,".data"); if (name == NULL) goto error; dfile = fopen(name,"r"); if (fread(&tracklen,sizeof(int),1,afile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (fread(&size,sizeof(int),1,afile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' annotation 
file is junk\n",Prog_Name,track); goto error; } if (size < 0) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (size == 0) size = 8; if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (db->trimmed) { if (tracklen != treads && tracklen != ureads) { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); goto error; } if ( ! ispart && db->part > 0) { if (tracklen == treads) fseeko(afile,size*db->tfirst,SEEK_CUR); else fseeko(afile,size*db->ufirst,SEEK_CUR); } } else { if (tracklen != ureads) { if (tracklen == treads) EPRINTF(EPLACE,"%s: Track '%s' is for a trimmed DB !\n",Prog_Name,track); else EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); goto error; } if ( ! ispart && db->part > 0) fseeko(afile,size*db->ufirst,SEEK_CUR); } if (tracklen == treads) nreads = ((int *) (db->reads))[-2]; else nreads = ((int *) (db->reads))[-1]; anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector"); if (anno == NULL) goto error; if (dfile != NULL) { int64 *anno8, off8, dlen; int *anno4, off4; int i; if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1)) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (size == 4) { anno4 = (int *) anno; off4 = anno4[0]; if (off4 != 0) { for (i = 0; i <= nreads; i++) anno4[i] -= off4; fseeko(dfile,off4,SEEK_SET); } dlen = anno4[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } else { anno8 = (int64 *) anno; off8 = anno8[0]; if (off8 != 0) { for (i = 0; i <= nreads; i++) anno8[i] -= off8; fseeko(dfile,off8,SEEK_SET); } dlen = anno8[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } if (data == NULL) goto error; if (dlen > 0) { if (fread(data,dlen,1,dfile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track); goto error; } } 
fclose(dfile); dfile = NULL; } else { if (fread(anno,size,nreads,afile) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } data = NULL; } fclose(afile); record = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK),"Allocating Track Record"); if (record == NULL) goto error; record->name = Strdup(track,"Allocating Track Name"); if (record->name == NULL) goto error; record->data = data; record->anno = anno; record->size = size; if (db->trimmed && tracklen != treads) { if (Late_Track_Trim(db,record,ispart)) goto error; } if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) { record->next = db->tracks->next; db->tracks->next = record; } else { record->next = db->tracks; db->tracks = record; } return (record); error: if (record != NULL) free(record); if (data != NULL) free(data); if (anno != NULL) free(anno); if (dfile != NULL) fclose(dfile); fclose(afile); EXIT (NULL); } // Assumming file pointer for afile is correctly positioned at the start of a extra item, // and aname is the name of the .anno file, decode the value present and places it in // extra if extra->nelem == 0, otherwise reduce the value just read into extra according // according the to the directive given by 'accum'. Leave the read poinrt at the next // extra or end-of-file. 
// Returns: // 1 if at the end of file, // 0 if item was read and folded correctly, // -1 if there was a system IO or allocation error (if interactive), and // -2 if the new value could not be reduced into the currenct value of extra (interactive) int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra) { int vtype, nelem, accum, slen; char *name; void *value; #define EREAD(v,s,n,file,ret) \ { if (fread(v,s,n,file) != (size_t) n) \ { if (ferror(file)) \ fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ else if (ret) \ return (1); \ else \ fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,aname); \ EXIT(-1); \ } \ } EREAD(&vtype,sizeof(int),1,afile,1) EREAD(&nelem,sizeof(int),1,afile,0) EREAD(&accum,sizeof(int),1,afile,0) EREAD(&slen,sizeof(int),1,afile,0) if (extra == NULL) { if (fseeko(afile,slen+8*nelem,SEEK_CUR) < 0) { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); EXIT(-1); } return (0); } name = (char *) Malloc(slen+1,"Allocating extra name"); value = Malloc(8*nelem,"Allocating extra value"); if (name == NULL || value == NULL) EXIT(-1); EREAD(name,1,slen,afile,0); EREAD(value,8,nelem,afile,0); name[slen] = '\0'; if (extra->nelem == 0) { extra->vtype = vtype; extra->nelem = nelem; extra->accum = accum; extra->name = name; extra->value = value; return (0); } if (vtype != extra->vtype) { fprintf(stderr,"%s: Type of extra %s does not agree with previous .anno block files\n", Prog_Name,name); goto error; } if (nelem != extra->nelem) { fprintf(stderr,"%s: Length of extra %s does not agree with previous .anno block files\n", Prog_Name,name); goto error; } if (accum != extra->accum) { fprintf(stderr,"%s: Reduction indicator of extra %s does not agree with",Prog_Name,name); fprintf(stderr," previos .anno block files\n"); goto error; } if (strcmp(name,extra->name) != 0) { fprintf(stderr,"%s: Expecting extra %s in .anno block file, not %s\n", Prog_Name,extra->name,name); goto error; } if (vtype == DB_INT) { int64 *ival = (int64 *) 
value; int64 *eval = (int64 *) (extra->value); int j; if (accum == DB_EXACT) { for (j = 0; j < nelem; j++) if (eval[j] != ival[j]) { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name); fprintf(stderr," with previous .anno block files\n"); goto error; } } else { for (j = 0; j < nelem; j++) eval[j] += ival[j]; } } else { double *ival = (double *) value; double *eval = (double *) (extra->value); int j; if (accum == DB_EXACT) { for (j = 0; j < nelem; j++) if (eval[j] != ival[j]) { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name); fprintf(stderr," with previous .anoo block files\n"); goto error; } } else { for (j = 0; j < nelem; j++) eval[j] += ival[j]; } } free(value); free(name); return (0); error: free(value); free(name); EXIT(1); } // Write extra record to end of file afile and advance write pointer // If interactive, then return non-zero on error, if bash, then print // and halt if an error int Write_Extra(FILE *afile, DAZZ_EXTRA *extra) { int slen; #define EWRITE(v,s,n,file) \ { if (fwrite(v,s,n,file) != (size_t) n) \ { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ EXIT(1); \ } \ } EWRITE(&(extra->vtype),sizeof(int),1,afile) FWRITE(&(extra->nelem),sizeof(int),1,afile) FWRITE(&(extra->accum),sizeof(int),1,afile) slen = strlen(extra->name); FWRITE(&slen,sizeof(int),1,afile) FWRITE(extra->name,1,slen,afile) FWRITE(extra->value,8,extra->nelem,afile) return (0); } void Close_Track(DAZZ_DB *db, char *track) { DAZZ_TRACK *record, *prev; prev = NULL; for (record = db->tracks; record != NULL; record = record->next) { if (strcmp(record->name,track) == 0) { free(record->anno); free(record->data); free(record->name); if (prev == NULL) db->tracks = record->next; else prev->next = record->next; free(record); return; } prev = record; } return; } /******************************************************************************************* * * READ BUFFER ALLOCATION AND READ ACCESS * 
********************************************************************************************/ // Allocate and return a buffer big enough for the largest read in 'db', leaving room // for an initial delimiter character char *New_Read_Buffer(DAZZ_DB *db) { char *read; read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer"); if (read == NULL) EXIT(NULL); return (read+1); } // Load into 'read' the i'th read in 'db'. As an upper case ASCII string if ascii is 2, as a // lower-case ASCII string is ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and // 3(T) otherwise. // // **NB**, the byte before read will be set to a delimiter character! int Load_Read(DAZZ_DB *db, int i, char *read, int ascii) { FILE *bases = (FILE *) db->bases; int64 off; int len, clen; DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); EXIT(1); } if (bases == NULL) { bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(1); db->bases = (void *) bases; } off = r[i].boff; len = r[i].rlen; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(read,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); EXIT(1); } } Uncompress_Read(len,read); if (ascii == 1) { Lower_Read(read); read[-1] = '\0'; } else if (ascii == 2) { Upper_Read(read); read[-1] = '\0'; } else read[-1] = 4; return (0); } // Load into 'read' the i'th arrow in 'db'. As an ASCII string if ascii is 1, // and as a numeric string otherwise. 
// DAZZ_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow" FILE *Arrow_File = NULL; // Becomes invalid after closing int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii) { FILE *arrow; int64 off; int len, clen; DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Arrow)\n",Prog_Name); EXIT(1); } if (Arrow_DB != db) { if (Arrow_File != NULL) fclose(Arrow_File); arrow = Fopen(Catenate(db->path,"","",".arw"),"r"); if (arrow == NULL) EXIT(1); Arrow_File = arrow; Arrow_DB = db; } else arrow = Arrow_File; off = r[i].boff; len = r[i].rlen; if (ftello(arrow) != off) fseeko(arrow,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(read,clen,1,arrow) != 1) { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Arrow)\n",Prog_Name); EXIT(1); } } Uncompress_Read(len,read); if (ascii == 1) { Letter_Arrow(read); read[-1] = '\0'; } else read[-1] = 4; return (0); } char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii) { FILE *bases = (FILE *) db->bases; int64 off; int len, clen; int bbeg, bend; DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); EXIT(NULL); } if (bases == NULL) { bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(NULL); db->bases = (void *) bases; } bbeg = beg/4; bend = (end-1)/4+1; off = r[i].boff + bbeg; len = end - beg; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = bend-bbeg; if (clen > 0) { if (fread(read,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); EXIT(NULL); } } Uncompress_Read(4*clen,read); read += beg%4; read[len] = 4; if (ascii == 1) { Lower_Read(read); read[-1] = '\0'; } else if (ascii == 2) { Upper_Read(read); read[-1] = '\0'; } else read[-1] = 4; return (read); } /******************************************************************************************* * * QV BUFFER ALLOCATION QV READ ACCESS * 
********************************************************************************************/ // Allocate and return a buffer of 5 vectors big enough for the largest read in 'db' char **New_QV_Buffer(DAZZ_DB *db) { char **entry; char *qvs; int i; qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer"); entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer"); if (qvs == NULL || entry == NULL) EXIT(NULL); for (i = 0; i < 5; i++) entry[i] = qvs + i*db->maxlen; return (entry); } // Load into entry the QV streams for the i'th read from db. The parameter ascii applies to // the DELTAG stream as described for Load_Read. int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii) { DAZZ_READ *reads; FILE *quiva; int rlen; if (db != Active_DB) { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0) { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name); EXIT(1); } Active_QV = (DAZZ_QV *) db->tracks; Active_DB = db; } if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); EXIT(1); } reads = db->reads; quiva = Active_QV->quiva; rlen = reads[i].rlen; fseeko(quiva,reads[i].coff,SEEK_SET); if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen)) EXIT(1); if (ascii != 1) { char *deltag = entry[1]; if (ascii != 2) { char x = deltag[rlen]; deltag[rlen] = '\0'; Number_Read(deltag); deltag[rlen] = x; } else { int j; int u = 'A'-'a'; for (j = 0; j < rlen; j++) deltag[j] = (char) (deltag[j]+u); } } return (0); } /******************************************************************************************* * * BLOCK LOAD OF ALL READS (PRIMARILY FOR DALIGNER) * ********************************************************************************************/ // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the 
bases file. If ascii is // non-zero then the reads are converted to ACGT ascii, otherwise the reads are left // as numeric strings over 0(A), 1(C), 2(G), and 3(T). int Read_All_Sequences(DAZZ_DB *db, int ascii) { FILE *bases; int nreads = db->nreads; DAZZ_READ *reads = db->reads; void (*translate)(char *s); char *seq; int64 o, off; int i, len, clen; bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(1); seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); if (seq == NULL) { fclose(bases); EXIT(1); } *seq++ = 4; if (ascii == 1) translate = Lower_Read; else translate = Upper_Read; o = 0; for (i = 0; i < nreads; i++) { len = reads[i].rlen; off = reads[i].boff; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(seq+o,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name); free(seq); fclose(bases); EXIT(1); } } Uncompress_Read(len,seq+o); if (ascii) translate(seq+o); reads[i].boff = o; o += (len+1); } reads[nreads].boff = o; fclose(bases); db->bases = (void *) seq; db->loaded = 1; return (0); } // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all // those of the form "prefix/[.]root.part" and call actor with the complete path to each file // pointed at by path, and the suffix of the path by extension. The . proceeds the root // name if the defined constant HIDE_FILES is set. Always the first call is with the // path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and // so this routine gives one a way to know all the tracks associated with a given DB. // -1 is returned if the path could not be found, and 1 is returned if an error (reported // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. 
int List_DB_Files(char *path, void actor(char *path, char *extension)) { int status, plen, rlen, dlen; char *root, *pwd, *name; int isdam; DIR *dirp; struct dirent *dp; status = 0; pwd = PathTo(path); plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); rlen = strlen(root); if (root == NULL || pwd == NULL) { free(pwd); free(root); EXIT(1); } if ((dirp = opendir(pwd)) == NULL) { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; goto error; } isdam = 0; while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary) { name = dp->d_name; if (strcmp(name,Catenate("","",root,".db")) == 0) break; if (strcmp(name,Catenate("","",root,".dam")) == 0) { isdam = 1; break; } } if (dp == NULL) { status = -1; closedir(dirp); goto error; } if (isdam) actor(Catenate(pwd,"/",root,".dam"),"dam"); else actor(Catenate(pwd,"/",root,".db"),"db"); rewinddir(dirp); // Report each auxiliary file while ((dp = readdir(dirp)) != NULL) { name = dp->d_name; dlen = strlen(name); #ifdef HIDE_FILES if (name[0] != '.') continue; dlen -= 1; name += 1; #endif if (dlen < rlen+1) continue; if (name[rlen] != '.') continue; if (strncmp(name,root,rlen) != 0) continue; actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); } closedir(dirp); error: free(pwd); free(root); return (status); } void Print_Read(char *s, int width) { int i; if (s[0] < 4) { for (i = 0; s[i] != 4; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%d",s[i]); } printf("\n"); } else { for (i = 0; s[i] != '\0'; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%c",s[i]); } printf("\n"); } } /******************************************************************************************* * * COMMAND LINE BLOCK PARSER * Take a command line argument and interpret the '@' block number ranges. 
* Parse_Block_Arg produces an Block_Looper iterator object that can then * be invoked multiple times to iterate through all the files implied by * the @ pattern/range. * ********************************************************************************************/ typedef struct { int first, last, next; char *root, *pwd, *ppnt; char *slice; } _Block_Looper; // Advance the iterator e_parse to the next file, open it, and return the file pointer // to it. Return NULL if at the end of the list of files. FILE *Next_Block_Arg(Block_Looper *e_parse) { _Block_Looper *parse = (_Block_Looper *) e_parse; char *disp; FILE *input; parse->next += 1; if (parse->next > parse->last) return (NULL); if (parse->next < 0) disp = parse->root; else disp = Numbered_Suffix(parse->root,parse->next,parse->ppnt); if ((input = fopen(Catenate(parse->pwd,"/",disp,".las"),"r")) == NULL) { if (parse->last != INT_MAX) { fprintf(stderr,"%s: %s.las is not present\n",Prog_Name,disp); exit (1); } return (NULL); } return (input); } // Reset the iterator e_parse to the first file void Reset_Block_Arg(Block_Looper *e_parse) { _Block_Looper *parse = (_Block_Looper *) e_parse; parse->next = parse->first - 1; } // Return a pointer to the path for the current file char *Block_Arg_Path(Block_Looper *e_parse) { _Block_Looper *parse = (_Block_Looper *) e_parse; return (parse->pwd); } // Return a pointer to the root name for the current file char *Block_Arg_Root(Block_Looper *e_parse) { _Block_Looper *parse = (_Block_Looper *) e_parse; if (parse->next < 0) return (parse->root); else return (Numbered_Suffix(parse->root,parse->next,parse->ppnt)); } // Free the iterator void Free_Block_Arg(Block_Looper *e_parse) { _Block_Looper *parse = (_Block_Looper *) e_parse; free(parse->root); free(parse->pwd); free(parse->slice); free(parse); } char *Next_Block_Slice(Block_Looper *e_parse, int slice) { _Block_Looper *parse = (_Block_Looper *) e_parse; if (parse->slice == NULL) { int size = strlen(parse->pwd) + 
strlen(Block_Arg_Root(parse)) + 30; parse->slice = (char *) Malloc(size,"Block argument slice"); if (parse->slice == NULL) exit (1); } if (parse->first < 0) sprintf(parse->slice,"%s/%s",parse->pwd,parse->root); else sprintf(parse->slice,"%s/%s%c%d-%d%s",parse->pwd,parse->root,BLOCK_SYMBOL,parse->next+1, parse->next+slice,parse->ppnt); parse->next += slice; return (parse->slice); } // Parse the command line argument and return an iterator to move through the // file names, setting it up to report the first file. Block_Looper *Parse_Block_Arg(char *arg) { _Block_Looper *parse; char *pwd, *root; char *ppnt, *cpnt; int first, last; parse = (_Block_Looper *) Malloc(sizeof(_Block_Looper),"Allocating parse node"); pwd = PathTo(arg); root = Root(arg,".las"); if (parse == NULL || pwd == NULL || root == NULL) exit (1); ppnt = index(root,BLOCK_SYMBOL); if (ppnt == NULL) first = last = -1; else { if (index(ppnt+1,BLOCK_SYMBOL) != NULL) { fprintf(stderr,"%s: Two or more occurences of %c-sign in source name '%s'\n", Prog_Name,BLOCK_SYMBOL,root); exit (1); } *ppnt++ = '\0'; first = strtol(ppnt,&cpnt,10); if (cpnt == ppnt) { first = 1; last = INT_MAX; } else { if (first < 0) { fprintf(stderr, "%s: Integer following %c-sigan is less than 0 in source name '%s'\n", Prog_Name,BLOCK_SYMBOL,root); exit (1); } if (*cpnt == '-') { ppnt = cpnt+1; last = strtol(ppnt,&cpnt,10); if (cpnt == ppnt) { fprintf(stderr,"%s: Second integer must follow - in source name '%s'\n", Prog_Name,root); exit (1); } if (last < first) { fprintf(stderr, "%s: 2nd integer is less than 1st integer in source name '%s'\n", Prog_Name,root); exit (1); } ppnt = cpnt; } else { last = INT_MAX; ppnt = cpnt; } } } parse->pwd = pwd; parse->root = root; parse->ppnt = ppnt; parse->first = first; parse->last = last; parse->next = first-1; parse->slice = NULL; return ((Block_Looper *) parse); } 
DASCRUBBER-1.1/DB.h000066400000000000000000000633231327574206400134460ustar00rootroot00000000000000/******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. Eventually will also * support compressed quality information. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #ifndef _DAZZ_DB #define _DAZZ_DB #include #include "QV.h" #define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden" // Undefine if you don't want this // For interactive applications where it is inappropriate to simply exit with an error // message to standard error, define the constant INTERACTIVE. If set, then error // messages are put in the global variable Ebuffer and the caller of a DB routine // can decide how to deal with the error. // // DB, QV, or alignment routines that can encounter errors function as before in // non-INTERACTIVE mode by exiting after printing an error message to stderr. In // INTERACTIVE mode the routines place a message at EPLACE and return an error // value. For such routines that were previously void, they are now int, and // return 1 if an error occured, 0 otherwise. 
#ifdef INTERACTIVE

#define EPRINTF sprintf
#define EPLACE  Ebuffer
#define EXIT(x) return (x)

#else // BATCH

#define EPRINTF fprintf
#define EPLACE  stderr
#define EXIT(x) exit (1)

#endif

typedef unsigned char      uint8;
typedef unsigned short     uint16;
typedef unsigned int       uint32;
typedef unsigned long long uint64;
typedef signed char        int8;
typedef signed short       int16;
typedef signed int         int32;
typedef signed long long   int64;
typedef float              float32;
typedef double             float64;

#define LAST_READ_SYMBOL  '$'
#define BLOCK_SYMBOL      '@'

/*******************************************************************************************
 *
 *  COMMAND LINE INTERPRETATION MACROS
 *
 ********************************************************************************************/

//  These macros expand in place and refer to the caller's locals by name: argv, flags,
//    eptr, and the indices i and k.

extern char *Prog_Name;   //  Name of program

#ifdef INTERACTIVE

extern char Ebuffer[];

#endif

#define ARG_INIT(name) \
  Prog_Name = Strdup(name,""); \
  for (i = 0; i < 128; i++) \
    flags[i] = 0;

#define ARG_FLAGS(set) \
  for (k = 1; argv[i][k] != '\0'; k++) \
    { if (index(set,argv[i][k]) == NULL) \
        { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \
          exit (1); \
        } \
      flags[(int) argv[i][k]] = 1; \
    }

#define ARG_POSITIVE(var,name) \
  var = strtol(argv[i]+2,&eptr,10); \
  if (*eptr != '\0' || argv[i][2] == '\0') \
    { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \
                     Prog_Name,argv[i][1],argv[i]+2); \
      exit (1); \
    } \
  if (var <= 0) \
    { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \
      exit (1); \
    }

#define ARG_NON_NEGATIVE(var,name) \
  var = strtol(argv[i]+2,&eptr,10); \
  if (*eptr != '\0' || argv[i][2] == '\0') \
    { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \
                     Prog_Name,argv[i][1],argv[i]+2); \
      exit (1); \
    } \
  if (var < 0) \
    { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \
      exit (1); \
    }

#define ARG_REAL(var) \
  var = strtod(argv[i]+2,&eptr); \
  if (*eptr != '\0' || argv[i][2] == '\0') \
    { fprintf(stderr,"%s: -%c '%s' argument is not a real number\n", \
                     Prog_Name,argv[i][1],argv[i]+2); \
      exit (1); \
    }

/*******************************************************************************************
 *
 *  GUARDED BATCH IO MACROS
 *
 ********************************************************************************************/

    //  Utilities

int Count_Args(char *arg);

#define SYSTEM_READ_ERROR \
  { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \
    exit (2); \
  }

#define SYSTEM_WRITE_ERROR \
  { fprintf(stderr,"%s: System error, write failed!\n",Prog_Name); \
    exit (2); \
  }

#define SYSTEM_CLOSE_ERROR \
  { fprintf(stderr,"%s: System error, file close failed!\n",Prog_Name); \
    exit (2); \
  }

    //  Output

#define FWRITE(v,s,n,file) \
  { if (fwrite(v,s,n,file) != (size_t) n) \
      SYSTEM_WRITE_ERROR \
  }

#define FPRINTF(file,...) \
  { if (fprintf(file,__VA_ARGS__) < 0) \
      SYSTEM_WRITE_ERROR \
  }

#define PRINTF(...) \
  { if (printf(__VA_ARGS__) < 0) \
      SYSTEM_WRITE_ERROR \
  }

#define FPUTS(x,file) \
  { if (fputs(x,file) == EOF) \
      SYSTEM_WRITE_ERROR \
  }

    //  Close

#define FCLOSE(file) \
  { if (fclose(file) != 0) \
      SYSTEM_CLOSE_ERROR \
  }

    //  Input  (the read macros report the name held in the caller's variable <file>_name)

#define FREAD(v,s,n,file) \
  { if (fread(v,s,n,file) != (size_t) n) \
      { if (ferror(file)) \
          SYSTEM_READ_ERROR \
        else \
          { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \
            exit (1); \
          } \
      } \
  }

#define FSCANF(file,...) \
  { if (fscanf(file,__VA_ARGS__) != Count_Args(#__VA_ARGS__)-1) \
      { if (ferror(file)) \
          SYSTEM_READ_ERROR \
        else \
          { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \
            exit (1); \
          } \
      } \
  }

#define FGETS(v,n,file) \
  { if (fgets(v,n,file) == NULL) \
      { if (ferror(file)) \
          SYSTEM_READ_ERROR \
        else \
          { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \
            exit (1); \
          } \
      } \
  }

#define FSEEKO(file,p,d) \
  { if (fseeko(file,p,d) < 0) \
      SYSTEM_READ_ERROR \
  }

    //  The offset is held in an int64 (not an int) so that positions of 2GB and beyond
    //    are not truncated.  The value of the macro is that offset.

#define FTELLO(file) \
  ( { int64 x = ftello(file); \
      if (x < 0) \
        SYSTEM_READ_ERROR \
      ; x; \
  } )

/*******************************************************************************************
 *
 *  UTILITIES
 *
 ********************************************************************************************/

//  The following general utilities return NULL if any of their input pointers are NULL, or if they
//    could not perform their function (in which case they also print an error to stderr).

void *Malloc(int64 size, char *mesg);                    //  Guarded versions of malloc, realloc
void *Realloc(void *object, int64 size, char *mesg);     //  and strdup, that output "mesg" to
char *Strdup(char *string, char *mesg);                  //  stderr if out of memory

FILE *Fopen(char *path, char *mode);     // Open file path for "mode"
char *PathTo(char *path);                // Return path portion of file name "path"
char *Root(char *path, char *suffix);    // Return the root name, excluding suffix, of "path"

// Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer
// Numbered_Suffix returns concatenation of left.<num>.right in a *temporary* buffer

char *Catenate(char *path, char *sep, char *root, char *suffix);
char *Numbered_Suffix(char *left, int num, char *right);


// DB-related utilities

void Print_Number(int64 num, int width, FILE *out);   //  Print readable big integer
int  Number_Digits(int64 num);                        //  Return # of digits in printed number

#define COMPRESSED_LEN(len)  (((len)+3) >> 2)

void   Compress_Read(int len, char *s);   //  
//  Compress read in-place into 2-bit form
void Uncompress_Read(int len, char *s);   //  Uncompress read in-place into numeric form
void Print_Read(char *s, int width);

void Lower_Read(char *s);     //  Convert read from numbers to lowercase letters (0-3 to acgt)
void Upper_Read(char *s);     //  Convert read from numbers to uppercase letters (0-3 to ACGT)
void Number_Read(char *s);    //  Convert read from letters to numbers

void Letter_Arrow(char *s);   //  Convert arrow pw's from numbers to uppercase letters (0-3 to 1234)
void Number_Arrow(char *s);   //  Convert arrow pw string from letters to numbers


/*******************************************************************************************
 *
 *  DB IN-CORE DATA STRUCTURES
 *
 ********************************************************************************************/

#define DB_QV   0x03ff   //  Mask for 3-digit quality value
#define DB_CSS  0x0400   //  This is the second or later of a group of reads from a given insert
#define DB_BEST 0x0800   //  This is the longest read of a given insert (may be the only 1)

#define DB_ARROW 0x2     //  DB is an arrow DB
#define DB_ALL   0x1     //  all wells are in the trimmed DB

//  One record per read.  Fields have different interpretations if a .db versus a .dam

typedef struct
  { int     origin; //  Well # (DB), Contig # (DAM)
    int     rlen;   //  Length of the sequence (Last pulse = fpulse + rlen)
    int     fpulse; //  First pulse (DB), left index of contig in scaffold (DAM)
    int64   boff;   //  Offset (in bytes) of compressed read in 'bases' file, or offset of
                    //    uncompressed bases in memory block
    int64   coff;   //  Offset (in bytes) of compressed quiva streams in '.qvs' file (DB),
                    //  Offset (in bytes) of scaffold header string in '.hdr' file (DAM)
                    //  4 compressed shorts containing snr info if an arrow DB.
    int     flags;  //  QV of read + flags above (DB only)
  } DAZZ_READ;

//  A track can be of 3 types:
//    data == NULL: there are nreads 'anno' records of size 'size'.
//    data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1])
//                                    contains the variable length data
//    data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1])
//                                    contains the variable length data

typedef struct _track
  { struct _track *next;  //  Link to next track
    char          *name;  //  Symbolic name of track
    int            size;  //  Size in bytes of anno records
    void          *anno;  //  over [0,nreads]: read i annotation: int, int64, or 'size' records
    void          *data;  //     data[anno[i] .. anno[i+1]-1] is data if data != NULL
  } DAZZ_TRACK;

//  The tailing part of a .anno track file can contain meta-information produced by the
//    command that produced the track.  For example, the coverage, or good/bad parameters
//    for trimming, or even say a histogram of QV values.  Each item is an array of 'nelem'
//    64-bit ints or floats ('vtype' = DB_INT or DB_REAL), has a 'name' string that
//    describes it, and an indicator as to whether the values should be equal across all
//    block tracks, or summed across all block tracks (by Catrack).  'value' points at the
//    array of values

#define DB_INT  0
#define DB_REAL 1

#define DB_EXACT 0
#define DB_SUM   1

typedef struct
  { int   vtype;  //  INT64 or FLOAT64
    int   nelem;  //  >= 1
    int   accum;  //  EXACT, SUM
    char *name;
    void *value;
  } DAZZ_EXTRA;

//  The information for accessing QV streams is in a DAZZ_QV record that is a "pseudo-track"
//    named ".@qvs" and is always the first track record in the list (if present).  Since normal
//    track names cannot begin with a . (this is enforced), this pseudo-track is never confused
//    with a normal track.

typedef struct
  { struct _track *next;
    char          *name;
    int            ncodes;  //  # of coding tables
    QVcoding      *coding;  //  array [0..ncodes-1] of coding schemes (see QV.h)
    uint16        *table;   //  for i in [0,db->nreads-1]: read i should be decompressed with
                            //    scheme coding[table[i]]
    FILE          *quiva;   //  the open file pointer to the .qvs file
  } DAZZ_QV;

//  The DB record holds all information about the current state of an active DB including an
//    array of DAZZ_READS, one per read, and a linked list of DAZZ_TRACKs the first of which
//    is always a DAZZ_QV pseudo-track (if the QVs have been loaded).

typedef struct
  { int         ureads;     //  Total number of reads in untrimmed DB
    int         treads;     //  Total number of reads in trimmed DB
    int         cutoff;     //  Minimum read length in block (-1 if not yet set)
    int         allarr;     //  DB_ALL | DB_ARROW
    float       freq[4];    //  frequency of A, C, G, T, respectively

    //  Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed)

    int         maxlen;     //  length of maximum read (initially over all DB)
    int64       totlen;     //  total # of bases (initially over all DB)

    int         nreads;     //  # of reads in actively loaded portion of DB
    int         trimmed;    //  DB has been trimmed by cutoff/all
    int         part;       //  DB block (if > 0), total DB (if == 0)
    int         ufirst;     //  Index of first read in block (without trimming)
    int         tfirst;     //  Index of first read in block (with trimming)

       //  In order to avoid forcing users to have to rebuild all their DBs to accommodate
       //    the addition of fields for the size of the actively loaded trimmed and untrimmed
       //    blocks, an additional read record is allocated in "reads" when a DB is loaded into
       //    memory (reads[-1]) and the two desired fields are crammed into the first two
       //    integer spaces of the record.

    char       *path;       //  Root name of DB for .bps, .qvs, and tracks
    int         loaded;     //  Are reads loaded in memory?
    void       *bases;      //  file pointer for bases file (to fetch reads from),
                            //    or memory pointer to uncompressed block of all sequences.
    DAZZ_READ  *reads;      //  Array [-1..nreads] of DAZZ_READ
    DAZZ_TRACK *tracks;     //  Linked list of loaded tracks
  } DAZZ_DB;


/*******************************************************************************************
 *
 *  DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock
 *
 ********************************************************************************************/

#define MAX_NAME 10000      //  Longest file name or fasta header line

#define DB_NFILE  "files = %9d\n"   //  number of files
#define DB_FDATA  " %9d %s %s\n"  //  last read index + 1, fasta prolog, file name
#define DB_NBLOCK "blocks = %9d\n"  //  number of blocks
#define DB_PARAMS "size = %10lld cutoff = %9d all = %1d\n"  //  block size, len cutoff, all in well
#define DB_BDATA  " %9d %9d\n"      //  First read index (untrimmed), first read index (trimmed)


/*******************************************************************************************
 *
 *  DB ROUTINES
 *
 ********************************************************************************************/

  // Suppose DB is the name of an original database.  Then there will be files .DB.idx, .DB.bps,
  //    .DB.qvs, and files .DB.<track>.anno and DB.<track>.data where <track> is a track name
  //    (not containing a . !).

  // A DAM is basically a DB except that:
  //    1. there are no QV's, instead .coff points the '\0' terminated fasta header of the read
  //          in the '.hdr' file of the DAM
  //    2. .origin contains the contig # of the read within a fasta entry (assembly sequences
  //          contain N-separated contigs), and .fpulse the first base of the contig in the
  //          fasta entry

  // Open the given database or dam, "path" into the supplied DAZZ_DB record "db".  If the name has
  //   a part # in it then just the part is opened.  The index array is allocated (for all or
  //   just the part) and read in.
  // Return status of routine:
  //    -1: The DB could not be opened for a reason reported by the routine to EPLACE
  //     0: Open of DB proceeded without mishap
  //     1: Open of DAM proceeded without mishap

int Open_DB(char *path, DAZZ_DB *db);

  // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings
  //   of the current DB partition.  Reallocate smaller memory blocks for the information kept
  //   for the retained reads.

void Trim_DB(DAZZ_DB *db);

  // Shut down an open 'db' by freeing all associated space, including tracks and QV structures,
  //   and any open file pointers.  The record pointed at by db however remains (the user
  //   supplied it and so should free it).

void Close_DB(DAZZ_DB *db);

  // Return the size in bytes of the given DB

int64 sizeof_DB(DAZZ_DB *db);

  // If QV pseudo track is not already in db's track list, then load it and set it up.
  //   The database must not have been trimmed yet.  -1 is returned if a .qvs file is not
  //   present, and 1 is returned if an error (reported to EPLACE) occurred and INTERACTIVE
  //   is defined.  Otherwise a 0 is returned.

int Load_QVs(DAZZ_DB *db);

  // Remove the QV pseudo track, all space associated with it, and close the .qvs file.

void Close_QVs(DAZZ_DB *db);

  // Look up the file and header in the file of the indicated track.  Return:
  //     1: Track is for trimmed DB
  //     0: Track is for untrimmed DB
  //    -1: Track is not the right size of DB either trimmed or untrimmed
  //    -2: Could not find the track
  // In addition, if opened (0 or 1 returned), then kind points at an integer indicating
  //   the type of track as follows:
  //      CUSTOM  0 => a custom track
  //      MASK    1 => a mask track

#define CUSTOM_TRACK 0
#define   MASK_TRACK 1

int Check_Track(DAZZ_DB *db, char *track, int *kind);

  // If track is not already in the db's track list, then allocate all the storage for it,
  //   read it in from the appropriate file, add it to the track list, and return a pointer
  //   to the newly created DAZZ_TRACK record.  If the track does not exist or cannot be
  //   opened for some reason, then NULL is returned if INTERACTIVE is defined.  Otherwise
  //   the routine prints an error message to stderr and exits if an error occurs, and returns
  //   with NULL only if the track does not exist.

DAZZ_TRACK *Load_Track(DAZZ_DB *db, char *track);

  // Assuming file pointer for afile is correctly positioned at the start of a extra item,
  //   and aname is the name of the .anno file, decode the value present and places it in
  //   extra if extra->nelem == 0, otherwise reduce the value just read into extra
  //   according to the directive given by 'accum'.  Leave the read pointer at the next
  //   extra or end-of-file.
  //   Returns:
  //      1 if at the end of file,
  //      0 if item was read and folded correctly,
  //     -1 if there was a system IO or allocation error (if interactive), and
  //     -2 if the new value could not be reduced into the current value of extra (interactive)

int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra);

  //  Write extra record to end of file afile and advance write pointer
  //  If interactive, then return non-zero on error, if batch, then print
  //  and halt if an error

int Write_Extra(FILE *afile, DAZZ_EXTRA *extra);

  // If track is on the db's track list, then it is removed and all storage associated with it
  //   is freed.

void Close_Track(DAZZ_DB *db, char *track);

  // Allocate and return a buffer big enough for the largest read in 'db'.
  // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte
  // are needed by the alignment algorithms.  If cannot allocate memory then return NULL
  // if INTERACTIVE is defined, or print error to stderr and exit otherwise.

char *New_Read_Buffer(DAZZ_DB *db);

  // Load into 'read' the i'th read in 'db'.  As a lower case ascii string if ascii is 1, an
  //   upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T)
  //   otherwise.  A '\0' (or 4) is prepended and appended to the string so it has a delimiter
  //   for traversals in either direction.  A non-zero value is returned if an error occurred
  //   and INTERACTIVE is defined.

int  Load_Read(DAZZ_DB *db, int i, char *read, int ascii);

  // Exactly the same as Load_Read, save the arrow information is loaded, not the DNA sequence,
  //   and there is only a choice between numeric (0) or ascii (1);

int  Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii);

  // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the
  //   the start of the subinterval (not necessarily = to read !!! ).  As a lower case ascii
  //   string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string
  //   over 0(A), 1(C), 2(G), and 3(T) otherwise.  A '\0' (or 4) is prepended and appended to
  //   the string holding the substring so it has a delimiter for traversals in either direction.
  //   A NULL pointer is returned if an error occurred and INTERACTIVE is defined.

char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii);

  // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur
  //   in the database.  If cannot allocate memory then return NULL if INTERACTIVE is defined,
  //   or print error to stderr and exit otherwise.

#define DEL_QV  0   //  The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer
#define DEL_TAG 1   //  The deleted characters
#define INS_QV  2   //  The insertion QVs
#define SUB_QV  3   //  The substitution QVs
#define MRG_QV  4   //  The merge QVs

char **New_QV_Buffer(DAZZ_DB *db);

  // Load into 'entry' the 5 QV vectors for i'th read in 'db'.  The deletion tag or characters
  //   are converted to a numeric or upper/lower case ascii string as per ascii.  Return with
  //   a zero, except when an error occurs and INTERACTIVE is defined in which case return with 1.

int   Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii);

  // Allocate a block big enough for all the uncompressed sequences, read them into it,
  //   reset the 'off' in each read record to be its in-memory offset, and set the
  //   bases pointer to point at the block after closing the bases file.  If ascii is
  //   1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and
  //   otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T).
  //   Return with a zero, except when an error occurs and INTERACTIVE is defined in which
  //   case return with 1.

int Read_All_Sequences(DAZZ_DB *db, int ascii);

  // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all
  //   those of the form "prefix/[.]root.part" and call actor with the complete path to each file
  //   pointed at by path, and the suffix of the path by extension.  The . precedes the root
  //   name if the defined constant HIDE_FILES is set.  Always the first call is with the
  //   path "prefix/root.[db|dam]" and extension "db" or "dam".  There will always be calls for
  //   "prefix/[.]root.idx" and "prefix/[.]root.bps".  All other calls are for *tracks* and
  //   so this routine gives one a way to know all the tracks associated with a given DB.
  //   -1 is returned if the path could not be found, and 1 is returned if an error (reported
  //   to EPLACE) occurred and INTERACTIVE is defined.  Otherwise a 0 is returned.

int List_DB_Files(char *path, void actor(char *path, char *extension));

  // Take a command line argument and interpret the '@' block number ranges.
  //   Parse_Block_Arg produces a Block_Looper iterator object that can then
  //   be invoked multiple times to iterate through all the files implied by
  //   the @ pattern/range.  Next_Block_Slice returns a string encoding the next
  //   slice files represented by an @-notation, and advances the iterator by
  //   that many files.
typedef void Block_Looper; Block_Looper *Parse_Block_Arg(char *arg); FILE *Next_Block_Arg(Block_Looper *e_parse); char *Next_Block_Slice(Block_Looper *e_parse,int slice); void Reset_Block_Arg(Block_Looper *e_parse); // Reset iterator to first file char *Block_Arg_Path(Block_Looper *e_parse); // Path of current file char *Block_Arg_Root(Block_Looper *e_parse); // Root name of current file void Free_Block_Arg(Block_Looper *e_parse); // Free the iterator #endif // _DAZZ_DB DASCRUBBER-1.1/LICENSE000066400000000000000000000053111327574206400140060ustar00rootroot00000000000000 Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: · Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. · Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. · The name of EWM may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
For any issues regarding this software and its use, contact EWM at: Eugene W. Myers Jr. Bautzner Str. 122e 01099 Dresden GERMANY Email: gene.myers@gmail.com DASCRUBBER-1.1/Makefile000066400000000000000000000025721327574206400144470ustar00rootroot00000000000000DEST_DIR = ~/bin CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing ALL = DAScover DASqv DAStrim DASpatch DASedit DASmap DASrealign REPcover REPqv REPtrim all: $(ALL) DAScover: DAScover.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DAScover DAScover.c align.c DB.c QV.c -lm REPcover: REPcover.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o REPcover REPcover.c DB.c QV.c -lm DASqv: DASqv.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DASqv DASqv.c align.c DB.c QV.c -lm REPqv: REPqv.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o REPqv REPqv.c DB.c QV.c -lm DAStrim: DAStrim.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DAStrim DAStrim.c align.c DB.c QV.c -lm REPtrim: REPtrim.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o REPtrim REPtrim.c DB.c QV.c -lm DASpatch: DASpatch.c align.h align.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DASpatch DASpatch.c align.c DB.c QV.c -lm DASedit: DASedit.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DASedit DASedit.c align.c DB.c QV.c -lm DASmap: DASmap.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DASmap DASmap.c DB.c QV.c -lm DASrealign: DASrealign.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DASrealign DASrealign.c align.c DB.c QV.c -lm clean: rm -f $(ALL) rm -fr *.dSYM rm -f scrubber.tar.gz install: cp $(ALL) $(DEST_DIR) package: make clean tar -zcf scrubber.tar.gz README.md Makefile *.h *.c DASCRUBBER-1.1/QV.c000066400000000000000000001132131327574206400134740ustar00rootroot00000000000000/******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. 
The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" #undef DEBUG #define MIN_BUFFER 1000 #define HUFF_CUTOFF 16 // This cannot be larger than 16 ! /******************************************************************************************* * * Endian flipping routines * ********************************************************************************************/ static int LittleEndian; // Little-endian machine ? // Referred by: Decode & Decode_Run static int Flip; // Flip endian of all coded shorts and ints // Referred by: Decode & Decode_Run & Read_Scheme static void Set_Endian(int flip) { uint32 x = 3; uint8 *b = (uint8 *) (&x); Flip = flip; LittleEndian = (b[0] == 3); } static void Flip_Long(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[3]; v[3] = x; x = v[1]; v[1] = v[2]; v[2] = x; } static void Flip_Short(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[1]; v[1] = x; } /******************************************************************************************* * * Routines for computing a Huffman Encoding Scheme * ********************************************************************************************/ typedef struct { int type; // 0 => normal, 1 => normal but has long codes, 2 => truncated uint32 codebits[256]; // If type = 2, then code 255 is the special code for int codelens[256]; // non-Huffman exceptions int lookup[0x10000]; // Lookup table (just for decoding) } HScheme; typedef struct _HTree { struct _HTree *lft, *rgt; uint64 count; } HTree; // Establish heap property from node s down (1 is root, siblings of n are 2n and 2n+1) // assuming s is the only perturbation in the tree. 
static void Reheap(int s, HTree **heap, int hsize) { int c, l, r; HTree *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; hr = heap[r]; if (r > hsize || hr->count > hl->count) { if (hs->count > hl->count) { heap[c] = hl; c = l; } else break; } else { if (hs->count > hr->count) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } // Given Huffman tree build a table of codes from it, the low-order codelens[s] bits // of codebits[s] contain the code for symbol s. static void Build_Table(HTree *node, int code, int len, uint32 *codebits, int *codelens) { if (node->rgt == NULL) { uint64 symbol = (uint64) (node->lft); codebits[symbol] = code; codelens[symbol] = len; } else { code <<= 1; len += 1; Build_Table(node->lft,code,len,codebits,codelens); Build_Table(node->rgt,code+1,len,codebits,codelens); } } // For the non-zero symbols in hist, compute a huffman tree over them, and then // build a table of the codes. If inscheme is not NULL, then place all symbols // with code 255 or with more than HUFF_CUTOFF bits in the encoding by inscheme // as a single united entity, whose code signals that the value of these symbols // occur explicitly in 8 (values) or 16 (run lengths) bits following the code. // All the symbols in this class will have the same entry in the code table and // 255 is always in this class. 
static HScheme *Huffman(uint64 *hist, HScheme *inscheme) { HScheme *scheme; HTree *heap[259]; HTree node[512]; int hsize; HTree *lft, *rgt; int value, range; int i; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); hsize = 0; // Load heap value = 0; if (inscheme != NULL) { node[0].count = 0; node[0].lft = (HTree *) (uint64) 255; node[0].rgt = NULL; heap[++hsize] = node+(value++); } for (i = 0; i < 256; i++) if (hist[i] > 0) { if (inscheme != NULL && (inscheme->codelens[i] > HUFF_CUTOFF || i == 255)) node[0].count += hist[i]; else { node[value].count = hist[i]; node[value].lft = (HTree *) (uint64) i; node[value].rgt = NULL; heap[++hsize] = node+(value++); } } for (i = hsize/2; i >= 1; i--) // Establish heap property Reheap(i,heap,hsize); range = value; // Merge pairs with smallest count until have a tree for (i = 1; i < value; i++) { lft = heap[1]; heap[1] = heap[hsize--]; Reheap(1,heap,hsize); rgt = heap[1]; node[range].lft = lft; node[range].rgt = rgt; node[range].count = lft->count + rgt->count; heap[1] = node+(range++); Reheap(1,heap,hsize); } for (i = 0; i < 256; i++) // Build the code table { scheme->codebits[i] = 0; scheme->codelens[i] = 0; } Build_Table(node+(range-1),0,0,scheme->codebits,scheme->codelens); if (inscheme != NULL) // Set scheme type and if truncated (2), map truncated codes { scheme->type = 2; // to code and length for 255 for (i = 0; i < 255; i++) if (inscheme->codelens[i] > HUFF_CUTOFF || scheme->codelens[i] > HUFF_CUTOFF) { scheme->codelens[i] = scheme->codelens[255]; scheme->codebits[i] = scheme->codebits[255]; } } else { scheme->type = 0; for (i = 0; i < 256; i++) { if (scheme->codelens[i] > HUFF_CUTOFF) scheme->type = 1; } } return (scheme); } #ifdef DEBUG // For debug, show the coding table static void Print_Table(HScheme *scheme, uint64 *hist, int infosize) { uint64 total_bits; uint32 specval, mask, code, *bits; int speclen, clen, *lens; int i, k; total_bits = 0; bits = 
scheme->codebits; lens = scheme->codelens; if (scheme->type == 2) { specval = bits[255]; speclen = lens[255]; } else specval = speclen = 0x7fffffff; printf("\nCode Table:\n"); for (i = 0; i < 256; i++) if (lens[i] > 0) { clen = lens[i]; mask = (1 << clen); code = bits[i]; printf(" %3d: %2d ",i,clen); for (k = 0; k < clen; k++) { mask >>= 1; if (code & mask) printf("1"); else printf("0"); } if (code == specval && clen == speclen) { printf(" ***"); if (hist != NULL) total_bits += (clen+infosize)*hist[i]; } else if (hist != NULL) total_bits += clen*hist[i]; printf("\n"); } if (hist != NULL) printf("\nTotal Bytes = %lld\n",(total_bits-1)/8+1); } // For debug, show the histogram static void Print_Histogram(uint64 *hist) { int i, low, hgh; uint64 count; for (hgh = 255; hgh >= 0; hgh--) if (hist[hgh] != 0) break; for (low = 0; low < 256; low++) if (hist[low] != 0) break; count = 0; for (i = low; i <= hgh; i++) count += hist[i]; for (i = hgh; i >= low; i--) printf(" %3d: %8llu %5.1f%%\n",i,hist[i],(hist[i]*100.)/count); } #endif /******************************************************************************************* * * Read and Write Huffman Schemes * ********************************************************************************************/ // Write the code table to out. static void Write_Scheme(HScheme *scheme, FILE *out) { int i; uint8 x; uint32 *bits; int *lens; lens = scheme->codelens; bits = scheme->codebits; x = (uint8) (scheme->type); fwrite(&x,1,1,out); for (i = 0; i < 256; i++) { x = (uint8) (lens[i]); fwrite(&x,1,1,out); if (x > 0) fwrite(bits+i,sizeof(uint32),1,out); } } // Allocate and read a code table from in, and return a pointer to it. 
static HScheme *Read_Scheme(FILE *in) { HScheme *scheme; int *look, *lens; uint32 *bits, base; int i, j, powr; uint8 x; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); lens = scheme->codelens; bits = scheme->codebits; look = scheme->lookup; if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n"); free(scheme); return (NULL); } scheme->type = x; for (i = 0; i < 256; i++) { if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i); return (NULL); } lens[i] = x; if (x > 0) { if (fread(bits+i,sizeof(uint32),1,in) != 1) { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i); free(scheme); return (NULL); } } else bits[i] = 0; } if (Flip) { for (i = 0; i < 256; i++) Flip_Long(bits+i); } for (i = 0; i < 256; i++) { if (lens[i] > 0) { base = (bits[i] << (16-lens[i])); powr = (1 << (16-lens[i])); for (j = 0; j < powr; j++) look[base+j] = i; } } return (scheme); } /******************************************************************************************* * * Encoders and Decoders * ********************************************************************************************/ // Encode read[0..rlen-1] according to scheme and write to out static void Encode(HScheme *scheme, FILE *out, uint8 *read, int rlen) { uint32 x, c, ocode; int n, k, olen, llen; int *nlens; uint32 *nbits; uint32 nspec; int nslen; nlens = scheme->codelens; nbits = scheme->codebits; if (scheme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; #define OCODE(L,C) \ { int len = olen + (L); \ uint32 code = (C); \ \ llen = olen; \ if (len >= 32) \ { olen = len-32; \ ocode |= (code >> olen); \ fwrite(&ocode,sizeof(uint32),1,out); \ if (olen > 0) \ ocode = (code << (32-olen)); \ else \ ocode = 0; \ } \ else \ { olen = len; \ ocode |= (code << (32-olen));; \ } \ } llen = 0; olen = 0; ocode = 0; for (k = 0; 
k < rlen; k++) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); } if (olen > 0) // Tricky: must pad so decoder does not read past { fwrite(&ocode,sizeof(uint32),1,out); // last integer int the coded output. if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Encode read[0..rlen-1] according to non-rchar table neme, and run-length table reme for // runs of rchar characters. Write to out. static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int rlen, int rchar) { uint32 x, c, ocode; int n, h, k, olen, llen; int *nlens, *rlens; uint32 *nbits, *rbits; uint32 nspec, rspec; int nslen, rslen; nlens = neme->codelens; nbits = neme->codebits; rlens = reme->codelens; rbits = reme->codebits; if (neme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; rspec = rbits[255]; rslen = rlens[255]; llen = 0; olen = 0; ocode = 0; k = 0; while (k < rlen) { h = k; while (k < rlen && read[k] == rchar) k += 1; if (k-h >= 255) x = 255; else x = k-h; n = rlens[x]; c = rbits[x]; OCODE(n,c); if (c == rspec && n == rslen) OCODE(16,k-h); if (k < rlen) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); k += 1; } } if (olen > 0) { fwrite(&ocode,sizeof(uint32),1,out); if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Read and decode from in, the next rlen symbols into read according to scheme static int Decode(HScheme *scheme, FILE *in, char *read, int rlen) { int *look, *lens; int signal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) 
+ 2; } if (scheme->type == 2) signal = 255; else signal = 256; lens = scheme->codelens; look = scheme->lookup; #define GET \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ return (1); \ } \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } #define GETFLIP \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ return (1); \ } \ Flip_Long(ipart); \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = look[*xpart]; n = lens[c]; if (c == signal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } else for (j = 0; j < rlen; j++) { GET c = look[*xpart]; n = lens[c]; if (c == signal) { GET c = *cpart; n = 8; } read[j] = (char) c; } return (0); } // Read and decode from in, the next rlen symbols into read according to non-rchar scheme // neme, and the rchar runlength shceme reme static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, int rlen, int rchar) { int *nlook, *nlens; int *rlook, *rlens; int nsignal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c, k; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (neme->type == 2) nsignal = 255; else nsignal = 256; nlens = neme->codelens; nlook = neme->lookup; rlens = reme->codelens; rlook = reme->lookup; n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GETFLIP c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GETFLIP c = nlook[*xpart]; n = 
nlens[c]; if (c == nsignal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } } else for (j = 0; j < rlen; j++) { GET c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GET c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GET c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GET c = *cpart; n = 8; } read[j] = (char) c; } } return (0); } /******************************************************************************************* * * Histogrammers * ********************************************************************************************/ // Histogram runlengths of symbol runChar in stream[0..rlen-1] into run. static void Histogram_Seqs(uint64 *hist, uint8 *stream, int rlen) { int k; for (k = 0; k < rlen; k++) hist[stream[k]] += 1; } static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar) { int k, h; k = 0; while (k < rlen) { h = k; while (k < rlen && stream[k] == runChar) k += 1; if (k-h >= 256) run[255] += 1; else run[k-h] += 1; if (k < rlen) k += 1; } } /******************************************************************************************* * * Reader * ********************************************************************************************/ static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan, static int Rmax = -1; // Compress_Next_QVentry static int Nline; // Referred by: QVcoding_Scan char *QVentry() { return (Read); } void Set_QV_Line(int line) { Nline = line; } int Get_QV_Line() { return (Nline); } // If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines // for a sequence. Place line j at Read+j*Rmax and the length of every line is returned // unless eof occurs in which case return -1. If any error occurs return -2. 
int Read_Lines(FILE *input, int nlines) { int i, rlen; int tmax; char *tread; char *other; if (Read == NULL) { tmax = MIN_BUFFER; tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; } Nline += 1; if (fgets(Read,Rmax,input) == NULL) return (-1); rlen = strlen(Read); while (Read[rlen-1] != '\n') { tmax = ((int) 1.4*Rmax) + MIN_BUFFER; tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; if (fgets(Read+rlen,Rmax-rlen,input) == NULL) { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline); EXIT(-2); } rlen += strlen(Read+rlen); } other = Read; for (i = 1; i < nlines; i++) { other += Rmax; Nline += 1; if (fgets(other,Rmax,input) == NULL) { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT(-2); } if (rlen != (int) strlen(other)) { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline); EXIT(-2); } } return (rlen-1); } /******************************************************************************************* * * Tag compression and decompression routines * ********************************************************************************************/ // Keep only the symbols in tags[0..rlen-1] for which qvs[k] != rchar and // return the # of symbols kept. static int Pack_Tag(char *tags, char *qvs, int rlen, int rchar) { int j, k; j = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) tags[j++] = tags[k]; tags[j] = '\0'; return (j); } // Count the # of non-rchar symbols in qvs[0..rlen-1] static int Packed_Length(char *qvs, int rlen, int rchar) { int k, clen; clen = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) clen += 1; return (clen); } // Unpack tags by moving its i'th char to position k where qvs[k] is the i'th non-rchar // symbol in qvs. All other chars are set to rchar. 
rlen is the length of qvs and // the unpacked result, clen is the initial length of tags. static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar) { int j, k; j = clen-1; for (k = rlen-1; k >= 0; k--) { if (qvs[k] == rchar) tags[k] = 'n'; else tags[k] = tags[j--]; } } /******************************************************************************************* * * Statistics Scan and Scheme creation and write * ********************************************************************************************/ // Read up to the next num entries or until eof from the .quiva file on input and record // frequency statistics. Copy these entries to the temporary file temp if != NULL. // If there is an error then -1 is returned, otherwise the number of entries read. static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256]; static uint64 totChar; static int delChar, subChar; // Referred by: QVcoding_Scan, Create_QVcoding void QVcoding_Scan1(int rlen, char *delQV, char *delTag, char *insQV, char *mergeQV, char *subQV) { if (rlen == 0) // Initialization call { int i; // Zero histograms bzero(delHist,sizeof(uint64)*256); bzero(mrgHist,sizeof(uint64)*256); bzero(insHist,sizeof(uint64)*256); bzero(subHist,sizeof(uint64)*256); for (i = 0; i < 256; i++) delRun[i] = subRun[i] = 1; totChar = 0; delChar = -1; subChar = -1; return; } // Add streams to accumulating histograms and figure out the run chars // for the deletion and substition streams Histogram_Seqs(delHist,(uint8 *) delQV,rlen); Histogram_Seqs(insHist,(uint8 *) insQV,rlen); Histogram_Seqs(mrgHist,(uint8 *) mergeQV,rlen); Histogram_Seqs(subHist,(uint8 *) subQV,rlen); if (delChar < 0) { int k; for (k = 0; k < rlen; k++) if (delTag[k] == 'n' || delTag[k] == 'N') { delChar = delQV[k]; break; } } if (delChar >= 0) Histogram_Runs( delRun,(uint8 *) delQV,rlen,delChar); totChar += rlen; if (subChar < 0) { if (totChar >= 100000) { int k; subChar = 0; for (k = 1; k < 256; k++) if 
(subHist[k] > subHist[subChar]) subChar = k; } } if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) subQV,rlen,subChar); return; } int QVcoding_Scan(FILE *input, int num, FILE *temp) { char *slash; int rlen; int i, r; // Zero histograms bzero(delHist,sizeof(uint64)*256); bzero(mrgHist,sizeof(uint64)*256); bzero(insHist,sizeof(uint64)*256); bzero(subHist,sizeof(uint64)*256); for (i = 0; i < 256; i++) delRun[i] = subRun[i] = 1; totChar = 0; delChar = -1; subChar = -1; // Make a sweep through the .quiva entries, histogramming the relevant things // and figuring out the run chars for the deletion and substition streams r = 0; for (i = 0; i < num; i++) { int well, beg, end, qv; rlen = Read_Lines(input,1); if (rlen == -2) EXIT(-1); if (rlen < 0) break; if (rlen == 0 || Read[0] != '@') { EPRINTF(EPLACE,"Line %d: Header in quiva file is missing\n",Nline); EXIT(-1); } slash = index(Read+1,'/'); if (slash == NULL) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(-1); } if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(-1); } if (temp != NULL) fputs(Read,temp); rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT(-1); } if (temp != NULL) { fputs(Read,temp); fputs(Read+Rmax,temp); fputs(Read+2*Rmax,temp); fputs(Read+3*Rmax,temp); fputs(Read+4*Rmax,temp); } Histogram_Seqs(delHist,(uint8 *) (Read),rlen); Histogram_Seqs(insHist,(uint8 *) (Read+2*Rmax),rlen); Histogram_Seqs(mrgHist,(uint8 *) (Read+3*Rmax),rlen); Histogram_Seqs(subHist,(uint8 *) (Read+4*Rmax),rlen); if (delChar < 0) { int k; char *del = Read+Rmax; for (k = 0; k < rlen; k++) if (del[k] == 'n' || del[k] == 'N') { delChar = Read[k]; break; } } if (delChar >= 0) Histogram_Runs( delRun,(uint8 *) (Read),rlen,delChar); totChar += rlen; if (subChar < 0) { if (totChar >= 100000) { int k; 
subChar = 0; for (k = 1; k < 256; k++) if (subHist[k] > subHist[subChar]) subChar = k; } } if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar); r += 1; } return (r); } // Using the statistics in the global stat tables, create the Huffman schemes and write // them to output. If lossy is set, then create a lossy table for the insertion and merge // QVs. QVcoding *Create_QVcoding(int lossy) { static QVcoding coding; HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; HScheme *dRunScheme, *sRunScheme; delScheme = NULL; dRunScheme = NULL; insScheme = NULL; mrgScheme = NULL; subScheme = NULL; sRunScheme = NULL; // Check whether using a subtitution run char is a win if (totChar < 200000 || subHist[subChar] < .5*totChar) subChar = -1; // If lossy encryption is enabled then scale insertions and merge QVs. if (lossy) { int k; for (k = 0; k < 256; k += 2) { insHist[k] += insHist[k+1]; insHist[k+1] = 0; } for (k = 0; k < 256; k += 4) { mrgHist[k] += mrgHist[k+1]; mrgHist[k] += mrgHist[k+2]; mrgHist[k] += mrgHist[k+3]; mrgHist[k+1] = 0; mrgHist[k+2] = 0; mrgHist[k+3] = 0; } } // Build a Huffman scheme for each stream entity from the histograms #define SCHEME_MACRO(meme,hist,label,bits) \ scheme = Huffman( (hist), NULL); \ if (scheme == NULL) \ goto error; \ if (scheme->type) \ { (meme) = Huffman( (hist), scheme); \ free(scheme); \ } \ else \ (meme) = scheme; #ifdef DEBUG #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) \ printf("\n%s\n", (label) ); \ Print_Histogram( (hist)); \ Print_Table( (meme), (hist), (bits)); #else #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) #endif { HScheme *scheme; if (delChar < 0) { MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs", 8); dRunScheme = NULL; } else { delHist[delChar] = 0; MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs less run char", 8); MAKE_SCHEME(dRunScheme,delRun, "Histogram of Deletion Runs QVs", 16); #ifdef DEBUG 
printf("\nRun char is '%c'\n",delChar); #endif } #ifdef DEBUG { int k; uint64 count; count = 0; for (k = 0; k < 256; k++) count += delHist[k]; printf("\nDelTag will require %lld bytes\n",count/4); } #endif MAKE_SCHEME(insScheme,insHist, "Hisotgram of Insertion QVs", 8); MAKE_SCHEME(mrgScheme,mrgHist, "Hisotgram of Merge QVs", 8); if (subChar < 0) { MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs", 8); sRunScheme = NULL; } else { subHist[subChar] = 0; MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs less run char", 8); MAKE_SCHEME(sRunScheme,subRun, "Histogram of Substitution Run QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",subChar); #endif } } // Setup endian handling Set_Endian(0); coding.delScheme = delScheme; coding.insScheme = insScheme; coding.mrgScheme = mrgScheme; coding.subScheme = subScheme; coding.dRunScheme = dRunScheme; coding.sRunScheme = sRunScheme; coding.delChar = delChar; coding.subChar = subChar; coding.prefix = NULL; coding.flip = 0; return (&coding); error: if (delScheme != NULL) free(delScheme); if (dRunScheme != NULL) free(dRunScheme); if (insScheme != NULL) free(insScheme); if (mrgScheme != NULL) free(mrgScheme); if (subScheme != NULL) free(subScheme); if (sRunScheme != NULL) free(sRunScheme); EXIT(NULL); } // Write the encoding scheme 'coding' to 'output' void Write_QVcoding(FILE *output, QVcoding *coding) { // Write out the endian key, run chars, and prefix (if not NULL) { uint16 half; int len; half = 0x33cc; fwrite(&half,sizeof(uint16),1,output); if (coding->delChar < 0) half = 256; else half = (uint16) (coding->delChar); fwrite(&half,sizeof(uint16),1,output); if (coding->subChar < 0) half = 256; else half = (uint16) (coding->subChar); fwrite(&half,sizeof(uint16),1,output); len = strlen(coding->prefix); fwrite(&len,sizeof(int),1,output); fwrite(coding->prefix,1,len,output); } // Write out the scheme tables Write_Scheme(coding->delScheme,output); if (coding->delChar >= 0) 
Write_Scheme(coding->dRunScheme,output); Write_Scheme(coding->insScheme,output); Write_Scheme(coding->mrgScheme,output); Write_Scheme(coding->subScheme,output); if (coding->subChar >= 0) Write_Scheme(coding->sRunScheme,output); } // Read the encoding scheme 'coding' to 'output' QVcoding *Read_QVcoding(FILE *input) { static QVcoding coding; // Read endian key, run chars, and short name common to all headers { uint16 half; int len; if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n"); EXIT(NULL); } coding.flip = (half != 0x33cc); if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.delChar = half; if (coding.delChar >= 256) coding.delChar = -1; if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.subChar = half; if (coding.subChar >= 256) coding.subChar = -1; // Read the short name common to all headers if (fread(&len,sizeof(int),1,input) != 1) { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Long(&len); coding.prefix = (char *) Malloc(len+1,"Allocating header prefix"); if (coding.prefix == NULL) EXIT(NULL); if (len > 0) { if (fread(coding.prefix,len,1,input) != 1) { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n"); EXIT(NULL); } } coding.prefix[len] = '\0'; } // Setup endian handling Set_Endian(coding.flip); // Read the Huffman schemes used to compress the data coding.delScheme = NULL; coding.dRunScheme = NULL; coding.insScheme = NULL; coding.mrgScheme = NULL; coding.subScheme = NULL; coding.sRunScheme = NULL; coding.delScheme = Read_Scheme(input); if (coding.delScheme == NULL) goto error; if (coding.delChar >= 0) { coding.dRunScheme = Read_Scheme(input); if (coding.dRunScheme == NULL) goto error; } 
coding.insScheme = Read_Scheme(input); if (coding.insScheme == NULL) goto error; coding.mrgScheme = Read_Scheme(input); if (coding.mrgScheme == NULL) goto error; coding.subScheme = Read_Scheme(input); if (coding.subScheme == NULL) goto error; if (coding.subChar >= 0) { coding.sRunScheme = Read_Scheme(input); if (coding.sRunScheme == NULL) goto error; } return (&coding); error: if (coding.delScheme != NULL) free(coding.delScheme); if (coding.dRunScheme != NULL) free(coding.dRunScheme); if (coding.insScheme != NULL) free(coding.insScheme); if (coding.mrgScheme != NULL) free(coding.mrgScheme); if (coding.subScheme != NULL) free(coding.subScheme); if (coding.sRunScheme != NULL) free(coding.sRunScheme); EXIT(NULL); } // Free all the auxilliary storage associated with the encoding argument void Free_QVcoding(QVcoding *coding) { if (coding->subChar >= 0) free(coding->sRunScheme); free(coding->subScheme); free(coding->mrgScheme); free(coding->insScheme); if (coding->delChar >= 0) free(coding->dRunScheme); free(coding->delScheme); free(coding->prefix); } /******************************************************************************************* * * Encode/Decode (w.r.t. 
coding) next entry from input and write to output * ********************************************************************************************/ void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub, FILE *output, QVcoding *coding, int lossy) { int clen; if (coding->delChar < 0) { Encode(coding->delScheme, output, (uint8 *) del, rlen); clen = rlen; } else { Encode_Run(coding->delScheme, coding->dRunScheme, output, (uint8 *) del, rlen, coding->delChar); clen = Pack_Tag(tag,del,rlen,coding->delChar); } Number_Read(tag); Compress_Read(clen,tag); fwrite(tag,1,COMPRESSED_LEN(clen),output); if (lossy) { uint8 *insert = (uint8 *) ins; uint8 *merge = (uint8 *) mrg; int k; for (k = 0; k < rlen; k++) { insert[k] = (uint8) ((insert[k] >> 1) << 1); merge[k] = (uint8) (( merge[k] >> 2) << 2); } } Encode(coding->insScheme, output, (uint8 *) ins, rlen); Encode(coding->mrgScheme, output, (uint8 *) mrg, rlen); if (coding->subChar < 0) Encode(coding->subScheme, output, (uint8 *) sub, rlen); else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) sub, rlen, coding->subChar); return; } int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) { int rlen, clen; // Get all 5 streams, compress each with its scheme, and output rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT (-1); } if (coding->delChar < 0) { Encode(coding->delScheme, output, (uint8 *) Read, rlen); clen = rlen; } else { Encode_Run(coding->delScheme, coding->dRunScheme, output, (uint8 *) Read, rlen, coding->delChar); clen = Pack_Tag(Read+Rmax,Read,rlen,coding->delChar); } Number_Read(Read+Rmax); Compress_Read(clen,Read+Rmax); fwrite(Read+Rmax,1,COMPRESSED_LEN(clen),output); if (lossy) { uint8 *insert = (uint8 *) (Read+2*Rmax); uint8 *merge = (uint8 *) (Read+3*Rmax); int k; for (k = 0; k < rlen; k++) { insert[k] = (uint8) ((insert[k] >> 1) << 1); merge[k] = 
(uint8) (( merge[k] >> 2) << 2); } } Encode(coding->insScheme, output, (uint8 *) (Read+2*Rmax), rlen); Encode(coding->mrgScheme, output, (uint8 *) (Read+3*Rmax), rlen); if (coding->subChar < 0) Encode(coding->subScheme, output, (uint8 *) (Read+4*Rmax), rlen); else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) (Read+4*Rmax), rlen, coding->subChar); return (rlen); } int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) { int clen, tlen; // Decode each stream and write to output if (coding->delChar < 0) { if (Decode(coding->delScheme, input, entry[0], rlen)) EXIT(1); clen = rlen; tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); } else { if (Decode_Run(coding->delScheme, coding->dRunScheme, input, entry[0], rlen, coding->delChar)) EXIT(1); clen = Packed_Length(entry[0],rlen,coding->delChar); tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar); } if (Decode(coding->insScheme, input, entry[2], rlen)) EXIT(1); if (Decode(coding->mrgScheme, input, entry[3], rlen)) EXIT(1); if (coding->subChar < 0) { if (Decode(coding->subScheme, input, entry[4], rlen)) EXIT(1); } else { if (Decode_Run(coding->subScheme, coding->sRunScheme, input, entry[4], rlen, coding->subChar)) EXIT(1); } return (0); } DASCRUBBER-1.1/QV.h000066400000000000000000000115151327574206400135030ustar00rootroot00000000000000/******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given 
file.  The two low complexity streams
 *    (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevalent
 *    character.
 *
 *  Author:   Gene Myers
 *  Date:     Jan 18, 2014
 *  Modified: July 25, 2014
 *
 ********************************************************************************************/

#ifndef _QV_COMPRESSOR

  //  NOTE(review): the header name of the include below is missing in this copy of the file.

#include

#define _QV_COMPRESSOR

  //  The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or
  //    batch version of the routines in this library are compiled.  In batch mode, routines
  //    print an error message and exit.  In interactive mode, the routines place the error
  //    message in EPLACE (also defined in DB.h) and return an error value, typically NULL
  //    if the routine returns a pointer, and an unusual integer value if the routine returns
  //    an integer.
  //  Below when an error return is described, one should understand that this value is returned
  //    only if the routine was compiled in INTERACTIVE mode.

  //  A PacBio compression scheme.  The run chars are -1 when a stream is not run-encoded,
  //    the library tests them with >= 0.

typedef struct
  { void    *delScheme;   //  Huffman scheme for deletion QVs
    void    *insScheme;   //  Huffman scheme for insertion QVs
    void    *mrgScheme;   //  Huffman scheme for merge QVs
    void    *subScheme;   //  Huffman scheme for substitution QVs
    void    *dRunScheme;  //  Huffman scheme for deletion run lengths (if delChar >= 0)
    void    *sRunScheme;  //  Huffman scheme for substitution run lengths (if subChar >= 0)
    int      delChar;     //  If >= 0, run-encoded deletion value
    int      subChar;     //  If >= 0, run-encoded substitution value
    int      flip;        //  Need to flip multi-byte integers
    char    *prefix;      //  Header line prefix
  } QVcoding;

  //  Read the next nlines of input, and QVentry returns a pointer to the first line if needed.
  //    If end-of-input is encountered before any further input, -1 is returned.  If there is
  //    an error then -2 is returned.  Otherwise the length of the line(s) read, not counting
  //    the newline, is returned.
int       Read_Lines(FILE *input, int nlines);
char     *QVentry();

  //  Get and set the line counter for error reporting

void      Set_QV_Line(int line);
int       Get_QV_Line();

  //  Read up to the next num entries or until eof from the .quiva file on input and record
  //    frequency statistics.  Copy these entries to the temporary file temp if != NULL.
  //    If there is an error then -1 is returned, otherwise the number of entries read.
  //  QVcoding_Scan1 records the statistics of a single entry of length rlen that is already
  //    in memory as its five streams.  A call with rlen == 0 resets the statistics.

int       QVcoding_Scan(FILE *input, int num, FILE *temp);
void      QVcoding_Scan1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub);

  //  Given QVcoding_Scan has been called at least once, create an encoding scheme based on
  //    the accumulated statistics and return a pointer to it.  The returned encoding object
  //    is *statically* allocated within the routine.  If lossy is set then use a lossy scaling
  //    for the insertion and merge streams.  If there is an error, then NULL is returned.

QVcoding *Create_QVcoding(int lossy);

  //  Read/write a coding scheme to input/output.  The encoding object returned by the reader
  //    is *statically* allocated within the routine.  If an error occurs while reading then
  //    NULL is returned.

QVcoding *Read_QVcoding(FILE *input);
void      Write_QVcoding(FILE *output, QVcoding *coding);

  //  Free all the auxiliary storage associated with coding (but not the object itself!)

void      Free_QVcoding(QVcoding *coding);

  //  Assuming the file pointer is positioned just beyond an entry header line, read the
  //    next set of 5 QV lines, compress them according to 'coding', and output.  If lossy
  //    is set then the scheme is a lossy one.  A negative value is returned if an error
  //    occurred, and the sequence length otherwise.
int      Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy);
void     Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub,
                                FILE *output, QVcoding *coding, int lossy);

  //  Assuming the input is positioned just beyond the compressed encoding of an entry header,
  //    read the set of compressed encodings for the ensuing 5 QV vectors, decompress them,
  //    and place their decompressed values into entry which is a 5 element array of character
  //    pointers.  The parameter rlen, computed from the preceding header line, critically
  //    provides the length of each of the 5 vectors.  A non-zero value is returned only if an
  //    error occurred.

int      Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen);

#endif // _QV_COMPRESSOR
DASCRUBBER-1.1/README.md000066400000000000000000000367731327574206400143000ustar00rootroot00000000000000 # Dascrubber: The Dazzler Read Scrubbing Suite ## _Author: Gene Myers_ ## _First: March 27, 2016_ For typeset documentation, examples of use, and design philosophy please go to my [blog](https://dazzlerblog.wordpress.com/command-guides/dascrubber-command-guide). This is still a preliminary release. The current set of commands provide a pipeline that one can use to scrub reads and if desired to scrub the alignment piles (with DASrealign). Ultimately DASpatch/DASedit and DASrealign will be replaced with more powerful programs that correct reads and not only scrub alignment piles, but also remove haplotype and repeat induced overlaps, prior to assembly via a string graph method. The goal of scrubbing is to produce a set of edited reads that are guaranteed to (a) be continuous stretches of the underlying genome (i\.e\. no unremoved adapters and not chimers), and (b) have no very low quality stretches (i\.e\. the error rate never exceeds some reasonable maximum, 20% or so in the case of Pacbio data). 
The secondary goal of scrubbing is to do so with the minimum removal of data and splitting of reads. Note carefully that the current scrubbing pipeline requires that one has employed repeat-masking in the daligner run as per the DAMASKER module described in this [post](https://dazzlerblog.wordpress.com/2016/04/01/detecting-and-soft-masking-repeats). The current \"DAS\" suite consists of a pipeline of several programs that in sequence accomplish the task of scrubbing: DAScover → DASqv → DAStrim → DASpatch → DASedit. For the commands, the \ argument must always refer to the entire DB, and only the \ arguments can involve a block number. If \ involves a block number, e\.g\. Ecoli.2.las, then the .las file is expected to contain all the overlaps where the A-read is in block 2 of the underlying database. The HPC.daligner scripts in the DALIGNER module produce such .las files as their final result. Parameters are propagated down the pipeline to subsequent phases via the annotation tracks so one need not specify the same parameter over and over again, i.e., the parameters/flags -H, -c, -g, and -b. All programs add suffixes (e.g. .db, .las) as needed. For the commands that take multiple .las block files as arguments, e.g. DAScover, DASqv, ..., one can place a @-sign in the name, which is then interpreted as the sequence of files obtained by replacing the @-sign by 1, 2, 3, ... in sequence until a number is reached for which no file matches. One can also place a @-sign followed by an integer, say, i, in which case the sequence starts at i. Lastly, one can also place @i-j where i and j are integers, in which case the sequence is from i to j, inclusive. ``` 1. DAScover [-v] [-H] [-m]+ ... ``` This command takes as input a database \ and a sequence of sorted local alignments blocks, \, produced by an overlap/daligner run for said database. Note carefully that \ must always refer to the entire DB, only the \ can involve block numbers. 
Using the local alignment-pile for each A-read, DAScover produces a histogram of the depth of coverage of each trace point tile that is not within one of the intervals of the optionally specified tracks. It places this histogram in a .covr track for the block and these block tracks are merged later with Catrack. If the -v option is set, the histogram for each block is displayed and an estimate of the coverage of the underlying target genome is output. If the overlap file contains a block number then the track files also contain a block number, e\.g\. \"DAScover DB OVL.2\" will result in the track files DB.2.covr.[anno,data]. Furthermore, if DAScover is run on .las blocks, then once it has been run on all the blocks of the DB, the block tracks must be concatenated into a single track for the entire database with Catrack in preparation for the next phase of scrubbing by DASqv. ``` 2. DASqv [-v] [-c] ... ``` This command takes as input a database \ and a sequence of sorted local alignments blocks, \, produced by an overlap/daligner run for said database. A .covr track obtained by running DAScover and Catrack must be present for the entire data base. Note carefully that \ must always refer to the entire DB, only the \ can involve a block number. Using the local alignment-pile for each A-read, DASqv produces a QV value for each complete segment of TRACE_SPACING bases (e\.g\. 100bp, the -s parameter to daligner). The quality value of each trace tile is the average of the best 25-50% of the estimated coverage alignment matches, where the estimated coverage is computed from the histogram of the .covr track. If one supplies the -c parameter, then this value explicitly overrides the estimated coverage produced by default. All quality values over 50 are clipped to 50. The -v option prints out a histogram of the segment align matches, and the quality values produced. 
This histogram is useful in assessing, for a given data set, what constitutes the thresholds -g and -b, to be used by downstream commands, for what is definitely a good segment and what is definitely a bad segment. The -H option is for HGAP-based assembly (see the -H option of daligner) wherein only reads longer than the -H parameter are considered for overlap, scrubbing, and assembly. With this option set, DASqv and all subsequent commands in the scrubbing pipeline, only perform their functions on reads of length -H or more. All other reads are used in the overlap piles for H-reads to help assess and scrub the H-reads, but are themselves not scrubbed. The quality values are written to a .qual track, that can be viewed by calling DBdump with the -i option set (\"i\" for \"intrinsic QV\"). Like DAScover and all other scrubber modules, block tracks are produced in response to block .las files and these must be concatenated with Catrack into a single .qual track for the entire DB in preparation for the next phase of scrubbing by DAStrim. ``` 3. DAStrim [-v] [-g] [-b] ... ``` A DB-wide .qual track produced by DASqv and Catrack is required as input to this command. This command further takes as input a database \ and a sequence of sorted local alignments blocks, \, produced by an overlap/daligner run for said database. A .qual track obtained by running DASqv must be present for the entire data base. Note carefully that \ must always refer to the entire DB, only \ can involve block numbers. Using the local alignment-pile for each A-read and the QV\'s for all the reads in the pile, DAStrim (1) finds and breaks all chimeric reads, (2) finds all missed adaptamers and retains only the longest subread between missed adaptamers, and (3) identifies all low-quality regions that should be improved/replaced by better sequence. 
It makes these inherently heuristic decisions conservatively so that what remains is very highly likely to be free of chimers, adaptamers, and undetected low-quality sequence segments. Some of these artifacts may still get through, but at very low odds, less than 1 in 10,000 in our experience. The decision process is guided by the parameters -g and -b which indicate the thresholds for considering intrinsic QV values good, bad, or unknown. By default these parameters are automatically set to be the 80th and 93rd percentiles of the qv-histograms hidden in the .qual track. They may however be explicitly set at the command line to override this default choice. The -v option prints out a report of how many chimer and adaptamer breaks were detected, how much sequence was trimmed, how many low-quality segments were spanned by alignments, and how many were rescued by many pairs of local alignments spanning the gap induced by the low-quality region, and so on. The retained high-quality intervals for each read are written to a .trim track, in left-to-right order with an indicator of whether the gap between two such intervals is spanned by local alignments or by span-consistent pairs of local alignments. Like DAScover and all other scrubber modules, block tracks are produced in response to block .las files and these must be concatenated with Catrack into a single .trim track for the entire DB in preparation for the next phase of scrubbing by DASpatch. ``` 4. DASpatch [-v] ... ``` This command takes as input a database \ and a sequence of sorted local alignments blocks, \, produced by an overlap/daligner run for said database. A .qual track and a .trim track obtained by running DASqv and DAStrim must be present for the entire data base. Note carefully that \ must always refer to the entire DB, only \ can involve block numbers. 
Using the local alignment-pile for each A-read, the QV\'s for all the reads in the pile, and the high-quality segments annotated by DAStrim, DASpatch selects a high-quality B-read segment with which to patch every intervening low-quality segment of an A-read. Given that these gaps are annotated before each read is trimmed by DAStrim, it may be the case in this second examination, that the gap is no longer spanned by the now trimmed B-reads in which case the span/patch can fail. This is very rare but does occur, and the number of such events is reported by DASpatch when the -v option is set. The B-read segments for each patch (or a special \"failure patch\") are written to a .patch track, in left-to-right order. Like DAScover and all other scrubber modules, block tracks are produced in response to block .las files and these must be concatenated with Catrack into a single .patch track for the entire DB in preparation for the next phase of scrubbing by DASedit. ``` 5. DASedit [-v] [-x] ``` This command takes as input a database \ for which .trim and .patch tracks have been produced by DAStrim and DASpatch in sequence (and perforce DAScover and DASqv before them). Using the information in the two tracks, DASedit produces a new database \ whose reads are the patched, high-quality sub-reads. That is, every low quality segment is patched with the relevant sequence of another read, and some reads give rise to two or more reads if deemed chimers, or no reads if the entire read was deemed junk. This command can take considerable time as the access pattern to read sequences (for the patching) is not sequential or localized, implying poor cache performance. The new database does not have a .qvs or .arr component, that is, it is a sequence or S-database (see the original Dazzler DB post). Very importantly, the new database has exactly the same block divisions as the original. 
That is, all patched subreads in a block of the new database have been derived from reads of the same block in the original database, and only from those reads. The new database does have a .map track that for each read encodes the original read in the source DB that was patched and the segments of that read that were high-quality (i\.e\. not patched). The program DASmap below can be used to output this information in either an easy-to-read or an easy-to-parse format. ``` 6. DASmap [-p] [ | ... ] ``` This command takes as input a database of patched reads \ produced by DASedit, and for the specified reads outputs a line for each showing the source read index and length in the originating DB, as well as annotating which segments were original and which were patched. The convention on interpreting the read arguments is as for DBshow and DBdump. As an example, with the -p option (pretty print) set one might see: ``` 55 -> 57(2946) [400,2946] 56 -> 58(11256) [700,1900] 57 -> 58(11256) [6600,9900] 83 [10000,11256] 58 -> 59(12282) [400,4100] 88 [4200,9400] 97 [9500,12282] ``` The first line indicates that read 55 in the patched database was derived from read 57 in the original database and is the segment from [400,2946] of that read. Reads 56 and 57 were both derived from read 58 in the original DB, and read 57 consists of segments [6600,9900] and [10000,11256] of read 58 with a patch between them of 83bp (but the source of the patch data is not given). The read length of each original read is given for convenience. With the -p option off, the output consists of space separated integers encoding the same information where the 4th field is always the number of integers in the segment description (always 3n+2 for some n): ``` 55 57 2946 2 400 2946 56 58 11256 2 700 1900 57 58 11256 5 6600 9900 83 10000 11256 58 59 12282 8 400 4100 88 4200 9400 97 9500 12282 ``` ``` 7. 
DASrealign [-v] [-l] ``` This command takes as input two blocks of a patched database \ created by DASedit, and the original .las file \ for the block pair. That is the .las file that was produced for the blocks when daligner was run on the original database blocks. DASrealign then produces the set of alignments (inferrable from those in the original) between the new patched reads, placing them in the file \. These new .las files can then be merged with LAmerge to form block .las files for the new database. The idea of this program is to avoid having to run daligner again on the patched reads, but rather to simply refine the alignments already computed with respect to the new read set. It has the drawback that there is some small chance that there are previously undetected overlaps between reads now that they are patched, and the patched trace point encoding, while still able to deliver alignments, is no longer usable for quality estimation as the tracepoint spacing in the A-read becomes irregular. This is contrasted with the speedup of the new process which is roughly 40X faster than the original daligner run and the collection of overlaps in the output can be fed directly into a string graph construction process. ``` 8. REPcover ... ``` This command takes as input a sequence of databases or blocks \ and for each outputs a histogram of the coverage of the unmasked portions of the reads in the source along with a recommendation of the -c value with which to run DASqv. The .covr track produced by DAScover must be present for all sources referred to. The command is a quick way to get the -v output of DAScover at any time after producing the coverage histograms and to get the histogram and coverage estimate for the entire data base (as opposed to a block of the database). ``` 9. REPqv ... 
``` This command takes as input a sequence of databases or blocks \ and for each outputs a histogram of the intrinsic quality values of the reads in the source along with a recommendation of the -g and -b values with which to run DAStrim. The .qual track produced by DASqv must be present for all sources referred to. The command is a quick way to get the -v output of DASqv at any time after producing the intrinsic quality values and to get the histograms for the entire data base (as opposed to a block of the database). ``` 10. REPtrim ... ``` This command takes as input a sequence of databases or blocks \ and for each outputs the scrubbing statistics for the source, i.e. the same report produced by DAStrim with the -v option set. The .trim track produced by DAStrim must be present for all sources referred to. The command is a quick way to get the -v output of DAStrim at any time after the fact and to get the statistics for the entire data base (as opposed to a block of the database). DASCRUBBER-1.1/REPcover.c000066400000000000000000000113611327574206400146340ustar00rootroot00000000000000/******************************************************************************************* * * Read the .covr track of each db or db block on the command line and output a histogram * of the coverage of the unmasked portions and an estimate of the coverage of the * underlying genome * * Author: Gene Myers * Date : January 2017 * *******************************************************************************************/ #include #include #include #include #include #include "DB.h" // Command format and global parameter variables static char *Usage = " ..."; int main(int argc, char *argv[]) { int c; // Process arguments Prog_Name = Strdup("REPcover",""); if (argc < 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } // Open trimmed DB and the qual-track for (c = 1; c < argc; c++) { DAZZ_DB _DB, *DB = &_DB; DAZZ_EXTRA ex_hgap, ex_covr; // Load DB { int status; status = 
Open_DB(argv[c],DB); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } Trim_DB(DB); } // Get .covr track extras { FILE *afile; char *aname; int extra; afile = NULL; if (DB->part) { aname = Strdup(Catenate(DB->path,Numbered_Suffix(".",DB->part,"."),"covr",".anno"), "Allocating anno file"); if (aname == NULL) exit (1); afile = fopen(aname,"r"); if (afile == NULL) { fprintf(stderr,"%s: Must have a 'covr.%d' track, run DAScover\n",Prog_Name,DB->part); exit (1); } } else { aname = Strdup(Catenate(DB->path,".","covr",".anno"),"Allocating anno file"); if (aname == NULL) exit (1); afile = fopen(aname,"r"); if (afile == NULL) { fprintf(stderr,"%s: Must have a 'covr' track, run DAScover\n",Prog_Name); exit (1); } } fseeko(afile,0,SEEK_END); extra = ftell(afile) - sizeof(int)*2; fseeko(afile,-extra,SEEK_END); ex_covr.nelem = 0; if (Read_Extra(afile,aname,&ex_covr) != 0) { fprintf(stderr,"%s: Histogram extra missing from .covr track?\n",Prog_Name); exit (1); } ex_hgap.nelem = 0; if (Read_Extra(afile,aname,&ex_hgap) != 0) { fprintf(stderr,"%s: Hgap threshold extra missing from .covr track?\n",Prog_Name); exit (1); } fclose(afile); } // Generate display { char *root; int i, cmax, hgap_min, cover; int64 nreads, totlen; int64 *cgram; int64 ssum, stotal; root = Root(argv[c],".db"); nreads = DB->nreads; totlen = DB->totlen; hgap_min = (int) ((int64 *) (ex_hgap.value))[0]; cgram = (int64 *) (ex_covr.value); cmax = ex_covr.nelem - 1; printf("\nDAScover"); if (hgap_min > 0) printf(" -H%d",hgap_min); printf(" %s\n\n",root); if (hgap_min > 0) { for (i = 0; i < DB->nreads; i++) if (DB->reads[i].rlen < hgap_min) { nreads -= 1; totlen -= DB->reads[i].rlen; } } // Display histogram printf("\nInput: "); Print_Number(nreads,7,stdout); printf("reads, "); Print_Number(totlen,12,stdout); printf(" bases"); if (hgap_min > 0) { printf(" (another "); Print_Number(DB->nreads-nreads,0,stdout); printf(" were < 
H-length)"); } printf("\n"); stotal = 0; for (i = 0; i <= cmax; i++) stotal += cgram[i]; printf("\nCoverage Histogram\n\n"); ssum = cgram[cmax]; if (ssum > 0) printf(" %4d: %9lld %5.1f%%\n\n", cmax,cgram[cmax],(100.*ssum)/stotal); stotal -= ssum; ssum = 0; for (i = cmax-1; i >= 0; i--) if (cgram[i] > 0) { ssum += cgram[i]; printf(" %4d: %9lld %5.1f%%\n", i,cgram[i],(100.*ssum)/stotal); } i = 0; while (cgram[i+1] < cgram[i]) i += 1; for (cover = i++; i < cmax; i++) if (cgram[cover] < cgram[i]) cover = i; printf("\n Coverage is estimated at %d\n\n",cover); free(root); Close_DB(DB); } } free(Prog_Name); exit (0); } DASCRUBBER-1.1/REPqv.c000066400000000000000000000144101327574206400141420ustar00rootroot00000000000000/******************************************************************************************* * * Read the .qual track of each db or db block on the command line and output a histogram * of the intrinsic QV's therein. * * Author: Gene Myers * Date : August 2017 * *******************************************************************************************/ #include #include #include #include #include #include "DB.h" // Command format and global parameter variables static char *Usage = " ..."; int main(int argc, char *argv[]) { int c; // Process arguments Prog_Name = Strdup("REPqv",""); if (argc < 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } // Open trimmed DB and the qual-track for (c = 1; c < argc; c++) { DAZZ_DB _DB, *DB = &_DB; DAZZ_EXTRA ex_hgap, ex_cest, ex_qvs, ex_dif; // Load DB { int status; status = Open_DB(argv[c],DB); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } Trim_DB(DB); } // Get .qual track extras { FILE *afile; char *aname; int extra, tracklen, size; afile = NULL; if (DB->part) { aname = Strdup(Catenate(DB->path,Numbered_Suffix(".",DB->part,"."),"qual",".anno"), "Allocating anno file"); if (aname == NULL) exit (1); afile = 
fopen(aname,"r"); if (afile == NULL) { fprintf(stderr,"%s: Must have a 'qual.%d' track, run DASqv\n",Prog_Name,DB->part); exit (1); } } else { aname = Strdup(Catenate(DB->path,".","qual",".anno"),"Allocating anno file"); if (aname == NULL) exit (1); afile = fopen(aname,"r"); if (afile == NULL) { fprintf(stderr,"%s: Must have a 'qual' track, run DASqv\n",Prog_Name); exit (1); } } fread(&tracklen,sizeof(int),1,afile); fread(&size,sizeof(int),1,afile); fseeko(afile,0,SEEK_END); extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int)); fseeko(afile,-extra,SEEK_END); ex_hgap.nelem = 0; if (Read_Extra(afile,aname,&ex_hgap) != 0) { fprintf(stderr,"%s: Hgap threshold extra missing from .qual track?\n",Prog_Name); exit (1); } ex_cest.nelem = 0; if (Read_Extra(afile,aname,&ex_cest) != 0) { fprintf(stderr,"%s: Coverage estimate extra missing from .qual track?\n",Prog_Name); exit (1); } ex_qvs.nelem = 0; if (Read_Extra(afile,aname,&ex_qvs) != 0) { fprintf(stderr,"%s: QV histogram extra missing from .qual track?\n",Prog_Name); exit (1); } ex_dif.nelem = 0; if (Read_Extra(afile,aname,&ex_dif) != 0) { fprintf(stderr,"%s: Differences histogram extra missing from .qual track?\n",Prog_Name); exit (1); } fclose(afile); } // Generate display { char *root; int64 nreads, totlen; int hgap_min, cover; int64 *qgram, *sgram; int maxqv; // Get relevant variables root = Root(argv[c],".db"); nreads = DB->nreads; totlen = DB->totlen; hgap_min = (int) ((int64 *) (ex_hgap.value))[0]; cover = (int) ((int64 *) (ex_cest.value))[0]; qgram = (int64 *) (ex_qvs.value); maxqv = ex_qvs.nelem - 1; sgram = (int64 *) (ex_dif.value); printf("\nDASqv"); if (hgap_min > 0) printf(" -H%d",hgap_min); printf(" -c%d %s\n\n",cover,root); if (hgap_min > 0) { int i; for (i = 0; i < DB->nreads; i++) if (DB->reads[i].rlen < hgap_min) { nreads -= 1; totlen -= DB->reads[i].rlen; } } // Display histograms printf("\n Input: "); Print_Number(nreads,7,stdout); printf("reads, "); Print_Number(totlen,12,stdout); printf(" 
bases"); if (hgap_min > 0) { printf(" (another "); Print_Number(DB->nreads - nreads,0,stdout); printf(" were < H-length)"); } printf("\n"); { int64 ssum, qsum; int64 stotal, qtotal; int qv_deep; int gval, bval; int i; stotal = qtotal = 0; for (i = 0; i <= maxqv; i++) { stotal += sgram[i]; qtotal += qgram[i]; } if (cover >= 40) qv_deep = cover/8; else if (cover >= 20) qv_deep = 5; else qv_deep = cover/4; printf("\n Histogram of q-values (average %d best)\n",qv_deep); printf("\n Input QV\n"); qsum = qgram[maxqv]; ssum = sgram[maxqv]; printf("\n %2d: %9lld %5.1f%% %9lld %5.1f%%\n\n", maxqv,sgram[maxqv],(100.*ssum)/stotal,qgram[maxqv],(100.*qsum)/qtotal); qtotal -= qsum; stotal -= ssum; ssum = qsum = 0; for (i = maxqv-1; i >= 0; i--) if (qgram[i] > 0) { ssum += sgram[i]; qsum += qgram[i]; printf(" %2d: %9lld %5.1f%% %9lld %5.1f%%\n", i,sgram[i],(100.*ssum)/stotal, qgram[i],(100.*qsum)/qtotal); } // Estimate -g and -b parameters bval = gval = -1; qsum = 0; for (i = maxqv-1; i >= 0; i--) if (qgram[i] > 0) { qsum += qgram[i]; if ((100.*qsum)/qtotal > 7. && bval < 0) bval = i+1; if ((100.*qsum)/qtotal > 20. && gval < 0) gval = i+1; } printf("\n Recommend \'DAStrim -g%d -b%d'\n\n",gval,bval); } free(root); Close_DB(DB); } } free(Prog_Name); exit (0); } DASCRUBBER-1.1/REPtrim.c000066400000000000000000000214361327574206400144750ustar00rootroot00000000000000/******************************************************************************************* * * Read the .trim track of each db or db block on the command line and output * a summary of the scrubbing that took place on that db or block. 
* * Author: Gene Myers * Date : August 2017 * *******************************************************************************************/ #include #include #include #include #include #include "DB.h" #include "align.h" // Command format and global parameter variables static char *Usage = " ..."; int main(int argc, char *argv[]) { int c; // Process arguments Prog_Name = Strdup("REPtrim",""); if (argc < 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } // Open trimmed DB and .qual and .trim tracks for (c = 1; c < argc; c++) { DAZZ_DB _DB, *DB = &_DB; DAZZ_EXTRA ex_hgap, ex_cest, ex_good, ex_bad, ex_trim; // Load DB { int status; status = Open_DB(argv[c],DB); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } Trim_DB(DB); } // Get .trim track extras { FILE *afile; char *aname; int extra, tracklen, size; afile = NULL; if (DB->part) { aname = Strdup(Catenate(DB->path,Numbered_Suffix(".",DB->part,"."),"trim",".anno"), "Allocating anno file"); if (aname == NULL) exit (1); afile = fopen(aname,"r"); if (afile == NULL) { fprintf(stderr,"%s: Must have a 'trim.%d' track, run DAStrim\n",Prog_Name,DB->part); exit (1); } } else { aname = Strdup(Catenate(DB->path,".","trim",".anno"),"Allocating anno file"); if (aname == NULL) exit (1); afile = fopen(aname,"r"); if (afile == NULL) { fprintf(stderr,"%s: Must have a 'trim' track, run DAStrim\n",Prog_Name); exit (1); } } fread(&tracklen,sizeof(int),1,afile); fread(&size,sizeof(int),1,afile); fseeko(afile,0,SEEK_END); extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int)); fseeko(afile,-extra,SEEK_END); ex_hgap.nelem = 0; if (Read_Extra(afile,aname,&ex_hgap) != 0) { fprintf(stderr,"%s: Hgap threshold extra missing from .trim track?\n",Prog_Name); exit (1); } ex_cest.nelem = 0; if (Read_Extra(afile,aname,&ex_cest) != 0) { fprintf(stderr,"%s: Coverage estimate extra missing from .trim track?\n",Prog_Name); exit (1); } ex_good.nelem 
= 0; if (Read_Extra(afile,aname,&ex_good) != 0) { fprintf(stderr,"%s: Good QV threshold extra missing from .trim track?\n",Prog_Name); exit (1); } ex_bad.nelem = 0; if (Read_Extra(afile,aname,&ex_bad) != 0) { fprintf(stderr,"%s: Bad QV threshdold extra missing from .trim track?\n",Prog_Name); exit (1); } ex_trim.nelem = 0; if (Read_Extra(afile,aname,&ex_trim) != 0) { fprintf(stderr,"%s: Trimming statistics extra missing from .trim track?\n",Prog_Name); exit (1); } fclose(afile); } // Generate Display { char *root; int64 nreads, totlen; int64 nelim, nelimbp; int64 n5trm, n5trmbp; int64 n3trm, n3trmbp; int64 natrm, natrmbp; int64 ngaps, ngapsbp; int64 nlowq, nlowqbp; int64 nspan, nspanbp; int64 nchim, nchimbp; int rlog, blog; int cover, hgap_min; int bad_qv, good_qv; int64 *tstats; // Get relevant variables root = Root(argv[c],".db"); nreads = DB->nreads; totlen = DB->totlen; hgap_min = (int) ((int64 *) (ex_hgap.value))[0]; cover = (int) ((int64 *) (ex_cest.value))[0]; good_qv = (int) ((int64 *) (ex_good.value))[0]; bad_qv = (int) ((int64 *) (ex_bad.value))[0]; tstats = (int64 *) (ex_trim.value); nelim = tstats[0]; n5trm = tstats[1]; n3trm = tstats[2]; natrm = tstats[3]; nelimbp = tstats[4]; n5trmbp = tstats[5]; n3trmbp = tstats[6]; natrmbp = tstats[7]; ngaps = tstats[8]; nlowq = tstats[9]; nspan = tstats[10]; nchim = tstats[11]; ngapsbp = tstats[12]; nlowqbp = tstats[13]; nspanbp = tstats[14]; nchimbp = tstats[15]; printf("\nDAStrim"); if (hgap_min > 0) printf(" [-H%d]",hgap_min); printf(" -c%d -g%d -b%d %s\n\n",cover,good_qv,bad_qv,root); // Compensate for HGAP if (hgap_min > 0) { int i; for (i = 0; i < DB->nreads; i++) if (DB->reads[i].rlen < hgap_min) { nreads -= 1; totlen -= DB->reads[i].rlen; } } // Compute maximum field widths of statistics { int64 mult; rlog = 0; mult = 1; while (mult <= nreads || mult <= ngaps) { mult *= 10; rlog += 1; } if (rlog <= 3) rlog = 3; else rlog += (rlog-1)/3; blog = 0; mult = 1; while (mult <= totlen) { mult *= 10; blog += 1; } if 
(blog <= 3) blog = 3; else blog += (blog-1)/3; } // Display the statistices printf(" Input: "); Print_Number((int64) nreads,rlog,stdout); printf(" (100.0%%) reads "); Print_Number(totlen,blog,stdout); printf(" (100.0%%) bases"); if (hgap_min > 0) { printf(" (another "); Print_Number((int64) (DB->nreads-nreads),0,stdout); printf(" were < H-length)"); } printf("\n"); printf(" Trimmed: "); Print_Number(nelim,rlog,stdout); printf(" (%5.1f%%) reads ",(100.*nelim)/nreads); Print_Number(nelimbp,blog,stdout); printf(" (%5.1f%%) bases\n",(100.*nelimbp)/totlen); printf(" 5' trim: "); Print_Number(n5trm,rlog,stdout); printf(" (%5.1f%%) reads ",(100.*n5trm)/nreads); Print_Number(n5trmbp,blog,stdout); printf(" (%5.1f%%) bases\n",(100.*n5trmbp)/totlen); printf(" 3' trim: "); Print_Number(n3trm,rlog,stdout); printf(" (%5.1f%%) reads ",(100.*n3trm)/nreads); Print_Number(n3trmbp,blog,stdout); printf(" (%5.1f%%) bases\n",(100.*n3trmbp)/totlen); printf(" Adapter: "); Print_Number(natrm,rlog,stdout); printf(" (%5.1f%%) reads ",(100.*natrm)/nreads); Print_Number(natrmbp,blog,stdout); printf(" (%5.1f%%) bases\n",(100.*natrmbp)/totlen); printf("\n"); printf(" Gaps: "); Print_Number(ngaps,rlog,stdout); printf(" (%5.1f%%) gaps ",(100.*(ngaps))/nreads); Print_Number(ngapsbp,blog,stdout); printf(" (%5.1f%%) bases\n",(100.*(ngapsbp))/totlen); printf(" Low QV: "); Print_Number(nlowq,rlog,stdout); printf(" (%5.1f%%) gaps ",(100.*(nlowq))/nreads); Print_Number(nlowqbp,blog,stdout); printf(" (%5.1f%%) bases\n",(100.*(nlowqbp))/totlen); printf(" Span'd: "); Print_Number(nspan,rlog,stdout); printf(" (%5.1f%%) gaps ",(100.*(nspan))/nreads); Print_Number(nspanbp,blog,stdout); printf(" (%5.1f%%) bases\n",(100.*(nspanbp))/totlen); printf(" Break: "); Print_Number(nchim,rlog,stdout); printf(" (%5.1f%%) gaps ",(100.*(nchim))/nreads); Print_Number(nchimbp,blog,stdout); printf(" (%5.1f%%) bases\n",(100.*(nchimbp))/totlen); printf("\n"); printf(" Clipped: "); 
Print_Number(n5trm+n3trm+nelim+nchim,rlog,stdout); printf(" clips "); Print_Number(n5trmbp+n3trmbp+nelimbp+nchimbp,blog,stdout); printf(" (%5.1f%%) bases\n",(100.*(n5trmbp+n3trmbp+nelimbp+nchimbp))/totlen); printf(" Patched: "); Print_Number(nlowq+nspan,rlog,stdout); printf(" patches "); Print_Number(nlowqbp+nspanbp,blog,stdout); printf(" (%5.1f%%) bases\n",(100.*(nlowqbp+nspanbp))/totlen); free(root); Close_DB(DB); } } free(Prog_Name); exit (0); } DASCRUBBER-1.1/align.c000066400000000000000000004206161327574206400142500ustar00rootroot00000000000000/******************************************************************************************* * * Fast alignment discovery and trace generation along with utilites for displaying alignments * Based on previously unpublished ideas from 2005, subsequently refined in 2013-14. Basic * idea is to keep a dynamically selected interval of the f.r. waves from my 1986 O(nd) paper. * A recent cool idea is to not record all the details of an alignment while discovering it * but simply record trace points through which the optimal alignment passes every 100bp, * allowing rapid recomputation of the alignment details between trace points. 
* * Author : Gene Myers * First : June 2013 * Current: June 1, 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #undef DEBUG_PASSES // Show forward / backward extension termini for Local_Alignment #undef DEBUG_POINTS // Show trace points #undef DEBUG_WAVE // Show waves of Local_Alignment #undef SHOW_MATCH_WAVE // For waves of Local_Alignment also show # of matches #undef SHOW_TRAIL // Show trace at the end of forward and reverse passes #undef SHOW_TPS // Show trace points as they are encountered in a wave #undef DEBUG_EXTEND // Show waves of Extend_Until_Overlap #undef DEBUG_ALIGN // Show division points of Compute_Trace #undef DEBUG_TRACE // Show trace additions for Compute_Trace #undef DEBUG_SCRIPT // Show script additions for Compute_Trace #undef DEBUG_AWAVE // Show F/R waves of Compute_Trace #undef SHOW_TRACE // Show full trace for Print_Alignment #undef WAVE_STATS /****************************************************************************************\ * * * Working Storage Abstraction * * * \****************************************************************************************/ typedef struct // Hidden from the user, working space for each thread { int vecmax; void *vector; int celmax; void *cells; int pntmax; void *points; int tramax; void *trace; } _Work_Data; Work_Data *New_Work_Data() { _Work_Data *work; work = (_Work_Data *) Malloc(sizeof(_Work_Data),"Allocating work data block"); if (work == NULL) EXIT(NULL); work->vecmax = 0; work->vector = NULL; work->pntmax = 0; work->points = NULL; work->tramax = 0; work->trace = NULL; work->celmax = 0; work->cells = NULL; return ((Work_Data *) work); } static int enlarge_vector(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->vector,max,"Enlarging DP vector"); if (vec == NULL) EXIT(1); work->vecmax = max; 
work->vector = vec; return (0); } /* enlarge_points: regrow work->points with Realloc to (int)(newmax*1.2) + 10000 and record the new capacity in work->pntmax. Returns 0, or takes the EXIT(1) path when Realloc fails. */ static int enlarge_points(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->points,max,"Enlarging point vector"); if (vec == NULL) EXIT(1); work->pntmax = max; work->points = vec; return (0); } /* enlarge_trace: same growth rule applied to work->trace / work->tramax. Returns 0, or takes the EXIT(1) path when Realloc fails. */ static int enlarge_trace(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->trace,max,"Enlarging trace vector"); if (vec == NULL) EXIT(1); work->tramax = max; work->trace = vec; return (0); } /* Free_Work_Data: release all four buffers and the record itself. A NULL handle is accepted and ignored (New_Work_Data has a NULL-returning failure path, so a caller may be holding one). The buffers go straight to free() because free(NULL) is a no-op. */ void Free_Work_Data(Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; if (work == NULL) return; free(work->vector); free(work->cells); free(work->trace); free(work->points); free(work); } /****************************************************************************************\ * * * ADAPTIVE PATH FINDING * * * \****************************************************************************************/ // Absolute/Fixed Parameters #define BVEC uint64 // Can be uint32 if PATH_LEN <= 32 #define TRIM_LEN 15 // Report as the tip, the last wave maximum for which the last // 2*TRIM_LEN edits are prefix-positive at rate ave_corr*f(bias) // (max value is 20) #define PATH_LEN 60 // Follow the last PATH_LEN columns/edges (max value is 63) // Derivative fixed parameters #define PATH_TOP 0x1000000000000000ll // Must be 1 << PATH_LEN #define PATH_INT 0x0fffffffffffffffll // Must be PATH_TOP-1 #define TRIM_MASK 0x7fff // Must be (1 << TRIM_LEN) - 1 #define TRIM_MLAG 200 // How far can last trim point be behind best point #define WAVE_LAG 30 // How far can worst point be behind the best point /* Bias_Factor: indexed by the bias value that New_Align_Spec derives from the base frequencies; it scales (1 - ave_corr) when ave_path and the trim-table match score are computed. */ static double Bias_Factor[10] = { .690, .690, .690, .690, .780, .850, .900, .933, .966, 1.000 }; // Adjustable parameters /* _Align_Spec: concrete record behind the opaque Align_Spec handle. New_Align_Spec fills every field; score and table point into a single allocation whose base is score. */ typedef struct { double ave_corr; int trace_space; int reach; float freq[4]; int ave_path; int16 *score; int16 *table; } _Align_Spec; /* Fill in bit table: 
TABLE[x] = 1 iff the alignment modeled by x (1 = match, 0 = mismatch) has a non-negative score for every suffix of the alignment under the scoring scheme where match = MATCH and mismatch = -1. MATCH is set so that an alignment with TRIM_PCT matches has zero score ( (1-TRIM_PCT) / TRIM_PCT ). */ #define FRACTION 1000 // Implicit fractional part of scores, i.e. score = x/FRACTION /* Table_Bits: parameters and output arrays threaded through set_table. mscore is added for a 1 bit and dscore subtracted for a 0 bit; table and score are the two arrays being filled, each indexed by a TRIM_LEN-bit pattern. */ typedef struct { int mscore; int dscore; int16 *table; int16 *score; } Table_Bits; /* set_table: recursively enumerate every TRIM_LEN-bit pattern, most significant bit first. prefix holds the bits chosen so far, score is their running total, and max is the largest running total seen before the current bit (starting from the caller's initial max). At a leaf, score[prefix] receives the total and table[prefix] receives the total minus that maximum, so table[prefix] >= 0 exactly when no earlier running total (including the initial one) exceeds the final one. */ static void set_table(int bit, int prefix, int score, int max, Table_Bits *parms) { if (bit >= TRIM_LEN) { parms->table[prefix] = (int16) (score-max); parms->score[prefix] = (int16) score; } else { if (score > max) max = score; set_table(bit+1,(prefix<<1),score - parms->dscore,max,parms); set_table(bit+1,(prefix<<1) | 1,score + parms->mscore,max,parms); } } /* Create an alignment specification record including path tip tables & values. ave_corr, trace_space, reach and the four base frequencies are copied into the record. freq[0]+freq[3], folded to at most .5, selects the Bias_Factor entry; below .2 a warning is printed on stderr and entry 3 is used. Returns the record as an opaque Align_Spec pointer, or takes the EXIT(NULL) path if an allocation fails. */ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq, int reach) { _Align_Spec *spec; Table_Bits parms; double match; int bias; spec = (_Align_Spec *) Malloc(sizeof(_Align_Spec),"Allocating alignment specification"); if (spec == NULL) EXIT(NULL); spec->ave_corr = ave_corr; spec->trace_space = trace_space; spec->reach = reach; spec->freq[0] = freq[0]; spec->freq[1] = freq[1]; spec->freq[2] = freq[2]; spec->freq[3] = freq[3]; match = freq[0] + freq[3]; if (match > .5) match = 1.-match; bias = (int) ((match+.025)*20.-1.); if (match < .2) { fprintf(stderr,"Warning: Base bias worse than 80/20%% ! (New_Align_Spec)\n"); fprintf(stderr," Capping bias at this ratio.\n"); bias = 3; } spec->ave_path = (int) (PATH_LEN * (1. - Bias_Factor[bias] * (1. - ave_corr))); parms.mscore = (int) (FRACTION * Bias_Factor[bias] * (1. 
- ave_corr)); parms.dscore = FRACTION - parms.mscore; /* One allocation holds both tables: score occupies the first TRIM_MASK+1 entries and table the second TRIM_MASK+1. */ parms.score = (int16 *) Malloc(sizeof(int16)*(TRIM_MASK+1)*2,"Allocating trim table"); if (parms.score == NULL) { free(spec); EXIT(NULL); } parms.table = parms.score + (TRIM_MASK+1); set_table(0,0,0,0,&parms); spec->table = parms.table; spec->score = parms.score; return ((Align_Spec *) spec); } /* Free_Align_Spec: release a record made by New_Align_Spec. Only spec->score is freed because spec->table points into the same allocation. */ void Free_Align_Spec(Align_Spec *espec) { _Align_Spec *spec = (_Align_Spec *) espec; free(spec->score); free(spec); } /* Accessors for the fields of an Align_Spec. Base_Frequencies returns a pointer to the record's own freq array, not a copy. */ double Average_Correlation(Align_Spec *espec) { return (((_Align_Spec *) espec)->ave_corr); } int Trace_Spacing(Align_Spec *espec) { return (((_Align_Spec *) espec)->trace_space); } float *Base_Frequencies(Align_Spec *espec) { return (((_Align_Spec *) espec)->freq); } int Overlap_If_Possible(Align_Spec *espec) { return (((_Align_Spec *) espec)->reach); } /****************************************************************************************\ * * * LOCAL ALIGNMENT FINDER: forward_/reverse_wave and Local_Alignment * * * \****************************************************************************************/ /* Optional wave-size statistics, compiled only when WAVE_STATS is defined. The wave loops update MAX (widest wave), TOT (summed wave widths) and NWV (number of waves). */ #ifdef WAVE_STATS static int64 MAX, TOT, NWV; static int64 RESTARTS; void Init_Stats() { MAX = TOT = NWV = 0; RESTARTS = 0; } void Print_Stats() { printf("\nMax = %lld Ave = %.1f # = %lld\n",MAX,(1.*TOT)/NWV,NWV); printf("\nRestarts = %lld\n",RESTARTS); } #endif /* print_wave (DEBUG_WAVE only): for diagonals low..hgh print how far each furthest-reaching value V[k] lags besta, then besta and the point it corresponds to; with SHOW_MATCH_WAVE also print M[k]. bestk starts at low so the summary is well defined even when no diagonal in the band currently holds besta. */ #ifdef DEBUG_WAVE static void print_wave(int *V, int *M, int low, int hgh, int besta) { int k, bestk = low; (void) M; printf(" [%6d,%6d]: ",low,hgh); for (k = low; k <= hgh; k++) { if (besta == V[k]) bestk = k; // printf(" %3d",(V[k]+k)/2); printf(" %3d",besta-V[k]); } printf(" : %d (%d,%d)\n",besta,(besta+bestk)/2,(besta-bestk)/2); #ifdef SHOW_MATCH_WAVE printf(" "); for (k = low; k <= hgh; k++) printf(" %3d",M[k]); printf("\n"); #endif fflush(stdout); } #endif /* At each furthest reaching point, keep a-coordinate of point (V), bitvector recording the last TRIM_LEN columns of the implied alignment (T), and the # of matches (1-bits) in the bitvector 
(M). */ /* Pebble: one trace-point record. Records are appended to the work->cells array and chained backwards through ptr (index of the previous record on the same path, -1 at the start of a chain). diag is the diagonal k the path was on, diff is the wave counter (dif) when the record was dropped, and mark is the trace-point coordinate it stands for. */ typedef struct { int ptr; int diag; int diff; int mark; } Pebble; /* VectorEl: bytes needed per diagonal by forward_wave/reverse_wave, namely six int arrays (V, M, HA, HB, NA, NB) plus one BVEC array (T), laid out back to back in work->vector. */ static int VectorEl = 6*sizeof(int) + sizeof(BVEC); /* forward_wave: extend forward from anti-diagonal mida over the diagonals [*mind,maxd], one wave per additional difference, dropping Pebble records at trace points so the A and B traces can be rebuilt afterwards. minp/maxp limit how far the diagonal band may widen, and aoff/boff are the offsets of the trace-point grids in A and B. Fills in the end point, difference count and trace of align->path and the trace of bpath, and overwrites *mind with a diagonal taken from the reconstructed B path. Returns 0, or takes the EXIT(1) path if a buffer cannot be grown. */ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath, int *mind, int maxd, int mida, int minp, int maxp, int aoff, int boff) { char *aseq = align->aseq; char *bseq = align->bseq; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *HB; int *_HA, *_HB; int *NA, *NB; int *_NA, *_NB; Pebble *cells; int avail, cmax; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int REACH = spec->reach; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha, trimhb; int morea, morey, mored; int moreha, morehb; int more, morem, lasta; int aclip, bclip; hgh = maxd; low = *mind; dif = 0; /* Centre the current band inside the shared vector and bias each array pointer by -vmin so it can be indexed directly by diagonal number. */ { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEl; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; } /* Compute 0-wave starting from mid-line */ more = 1; aclip = INT32_MAX; bclip = -INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; trimhb = morehb = 1; morem = -1; { int k; char *a; a = aseq + hgh; for (k = hgh; k >= low; k--) { int y, c, d; int ha, hb; int na, nb; Pebble *pb; y = (mida-k) >> 1; /* Two records (the A and B chain heads for this diagonal) are appended next, hence the cmax-1 test. */ if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = (((y+k)+(TRACE_SPACE-aoff))/TRACE_SPACE-1)*TRACE_SPACE+aoff; #ifdef 
SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; nb = ((y+(TRACE_SPACE-boff))/TRACE_SPACE-1)*TRACE_SPACE+boff; #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,-1,k,nb); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = nb; hb = avail++; nb += TRACE_SPACE; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; } c = (y << 1) + k; while (y+k >= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; } while (y >= nb) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = 0; pb->mark = nb; hb = avail++; nb += TRACE_SPACE; } if (c > besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; trimhb = hb; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; HB[k] = hb; NA[k] = na; NB[k] = nb; a -= 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; morehb = HB[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = 
M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } #ifdef DEBUG_WAVE printf("\nFORWARD WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif /* Compute successive waves until no furthest reaching points remain */ while (more && lasta >= besta - TRIM_MLAG) { int k, n; int ua, ub; BVEC t; int am, ac, ap; char *a; low -= 1; hgh += 1; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move; int64 vd, md, had, hbd, nad, nbd, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEl)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEl; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (hbd < 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (nbd < 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nbd > 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (nad > 0) 
memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (hbd > 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; } if (low >= minp) { NA[low] = NA[low+1]; NB[low] = NB[low+1]; V[low] = -1; } else low += 1; if (hgh <= maxp) { NA[hgh] = NA[hgh-1]; NB[hgh] = NB[hgh-1]; V[hgh] = am = -1; } else am = V[--hgh]; dif += 1; ac = V[hgh+1] = V[low-1] = -1; a = aseq + hgh; t = PATH_INT; n = PATH_LEN; ua = ub = -1; for (k = hgh; k >= low; k--) { int y, m; int ha, hb; int c, d; BVEC b; Pebble *pb; ap = ac; ac = am; am = V[d = k-1]; if (ac < am) if (am < ap) { c = ap+1; m = n; b = t; ha = ua; hb = ub; } else { c = am+1; m = M[d]; b = T[d]; ha = HA[d]; hb = HB[d]; } else if (ac < ap) { c = ap+1; m = n; b = t; ha = ua; hb = ub; } else { c = ac+2; m = M[k]; b = T[k]; ha = HA[k]; hb = HB[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k >= NA[k]) { if (cells[ha].mark < NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] += TRACE_SPACE; } while (y >= NB[k]) { if (cells[hb].mark < 
NB[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = dif; pb->mark = NB[k]; hb = avail++; } NB[k] += TRACE_SPACE; } if (c > besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; trimhb = hb; } } } t = T[k]; n = M[k]; ua = HA[k]; ub = HB[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; HB[k] = hb; a -= 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta-besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; morehb = HB[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } n = besta - WAVE_LAG; while (hgh >= low) if (V[hgh] < n) hgh -= 1; else { while (V[low] < n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; uint16 *btrace = (uint16 *) bpath->trace; int atlen, btlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0 && REACH) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; trimhb = morehb; } else trimx = trima-trimy; atlen = btlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = (mida-k)/2; e = 0; #ifdef SHOW_TRAIL printf(" A path = 
(%5d,%5d)\n",(mida+k)/2,b); fflush(stdout); #endif for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; atrace[atlen++] = (uint16) (d-e); atrace[atlen++] = (uint16) (a-b); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[atlen++] = (uint16) (trimd-e); atrace[atlen++] = (uint16) (trimy-b); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b)); atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } a = -1; for (h = trimhb; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = (mida+k)/2; e = 0; low = k; #ifdef SHOW_TRAIL printf(" B path = (%5d,%5d)\n",b,(mida-k)/2); fflush(stdout); #endif for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark + k; d = cells[h].diff; btrace[btlen++] = (uint16) (d-e); btrace[btlen++] = (uint16) (a-b); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,a-b); fflush(stdout); #endif b = a; e = d; } if (b-k != trimy) { btrace[btlen++] = (uint16) (trimd-e); btrace[btlen++] = (uint16) (trimx-b); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout); #endif } else if (b != trimx) { btrace[btlen-1] = (uint16) (btrace[btlen-1] + (trimx-b)); btrace[btlen-2] = (uint16) (btrace[btlen-2] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout); #endif } apath->aepos = trimx; apath->bepos = trimy; apath->diffs = trimd; apath->tlen = atlen; bpath->tlen = btlen; } *mind = low; return (0); } /*** Reverse Wave ***/ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment 
*align, Path *bpath, int mind, int maxd, int mida, int minp, int maxp, int aoff, int boff) { char *aseq = align->aseq - 1; char *bseq = align->bseq - 1; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *HB; int *_HA, *_HB; int *NA, *NB; int *_NA, *_NB; Pebble *cells; int avail, cmax; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int REACH = spec->reach; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha, trimhb; int morea, morey, mored; int moreha, morehb; int more, morem, lasta; int aclip, bclip; hgh = maxd; low = mind; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEl; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; } more = 1; aclip = -INT32_MAX; bclip = INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; trimhb = morehb = 1; morem = -1; { int k; char *a; a = aseq + low; for (k = low; k <= hgh; k++) { int y, c, d; int ha, hb; int na, nb; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = (((y+k)+(TRACE_SPACE-aoff)-1)/TRACE_SPACE-1)*TRACE_SPACE+aoff; #ifdef SHOW_TPS printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y+k; ha = avail++; nb = 
((y+(TRACE_SPACE-boff)-1)/TRACE_SPACE-1)*TRACE_SPACE+boff; #ifdef SHOW_TPS printf(" B %d: -1,%d,0,%d\n",avail,k,nb+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y; hb = avail++; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; } c = (y << 1) + k; while (y+k <= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na -= TRACE_SPACE; } while (y <= nb) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = 0; pb->mark = nb; hb = avail++; nb -= TRACE_SPACE; } if (c < besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; trimhb = hb; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; HB[k] = hb; NA[k] = na; NB[k] = nb; a += 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; morehb = HB[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } #ifdef DEBUG_WAVE printf("\nREVERSE WAVE:\n"); 
print_wave(V,M,low,hgh,besta); #endif while (more && lasta <= besta + TRIM_MLAG) { int k, n; int ua, ub; BVEC t; int am, ac, ap; char *a; low -= 1; hgh += 1; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move, vd, md, had, hbd, nad, nbd, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEl)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEl; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (hbd < 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (nbd < 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nbd > 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (hbd > 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - 
move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; } if (low >= minp) { NA[low] = NA[low+1]; NB[low] = NB[low+1]; V[low] = ap = INT32_MAX; } else ap = V[++low]; if (hgh <= maxp) { NA[hgh] = NA[hgh-1]; NB[hgh] = NB[hgh-1]; V[hgh] = INT32_MAX; } else hgh -= 1; dif += 1; ac = V[hgh+1] = V[low-1] = INT32_MAX; a = aseq + low; t = PATH_INT; n = PATH_LEN; ua = ub = -1; for (k = low; k <= hgh; k++) { int y, m; int ha, hb; int c, d; BVEC b; Pebble *pb; am = ac; ac = ap; ap = V[d = k+1]; if (ac > ap) if (ap > am) { c = am-1; m = n; b = t; ha = ua; hb = ub; } else { c = ap-1; m = M[d]; b = T[d]; ha = HA[d]; hb = HB[d]; } else if (ac > am) { c = am-1; m = n; b = t; ha = ua; hb = ub; } else { c = ac-2; m = M[k]; b = T[k]; ha = HA[k]; hb = HB[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k <= NA[k]) { if (cells[ha].mark > NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] -= TRACE_SPACE; } while (y <= NB[k]) { if (cells[hb].mark > NB[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS 
printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = dif; pb->mark = NB[k]; hb = avail++; } NB[k] -= TRACE_SPACE; } if (c < besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; trimhb = hb; } } } t = T[k]; n = M[k]; ua = HA[k]; ub = HB[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; HB[k] = hb; a += 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; morehb = HB[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } n = besta + WAVE_LAG; while (hgh >= low) if (V[hgh] > n) hgh -= 1; else { while (V[low] > n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; uint16 *btrace = (uint16 *) bpath->trace; int atlen, btlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0 && REACH) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; trimhb = morehb; } else trimx = trima-trimy; atlen = btlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark - k; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout); #endif if ((b+k)%TRACE_SPACE != aoff) { h = cells[h].ptr; if (h < 0) { a = trimy; d = trimd; } else { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): 
%3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif /* The first A segment is partial (its start is not on a trace point). Open a new (diffs, length) pair if the forward pass left no A trace; otherwise fold it into the forward pass's first pair. */ if (apath->tlen == 0) { atrace[--atlen] = (uint16) (b-a); atrace[--atlen] = (uint16) (d-e); } else { atrace[1] = (uint16) (atrace[1] + (b-a)); atrace[0] = (uint16) (atrace[0] + (d-e)); } b = a; e = d; } /* Walk the rest of the A chain. Pairs are written back to front at negative offsets, so the length goes in first and the diff count second. */ if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; atrace[--atlen] = (uint16) (b-a); d = cells[h].diff; atrace[--atlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif b = a; e = d; } /* Close the A trace at the trim point: either add a final pair or extend the last one written. */ if (b+k != trimx) { atrace[--atlen] = (uint16) (b-trimy); atrace[--atlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy)); atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } } /* Reverse the B chain in place so it can be walked from its first record. */ a = -1; for (h = trimhb; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark + k; e = 0; #ifdef SHOW_TRAIL printf(" B path = (%5d,%5d)\n",b,b-k); fflush(stdout); #endif if ((b-k)%TRACE_SPACE != boff) { h = cells[h].ptr; if (h < 0) { a = trimx; d = trimd; } else { k = cells[h].diag; a = cells[h].mark + k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); #endif /* Partial first B segment, handled like the A path above. In the new-pair case the diff slot takes d-e, matching the A path and the merge branch below; storing the length b-a there as well would corrupt the diff count of the first B trace pair. */ if (bpath->tlen == 0) { btrace[--btlen] = (uint16) (b-a); btrace[--btlen] = (uint16) (d-e); } else { btrace[1] = (uint16) (btrace[1] + (b-a)); btrace[0] = (uint16) (btrace[0] + (d-e)); } b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark + k; btrace[--btlen] = (uint16) (b-a); d = cells[h].diff; btrace[--btlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); #endif b = a; e = d; } 
if (b-k != trimy) { btrace[--btlen] = (uint16) (b-trimx); btrace[--btlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); #endif } else if (b != trimx) { btrace[btlen+1] = (uint16) (btrace[btlen+1] + (b-trimx)); btrace[btlen] = (uint16) (btrace[btlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); #endif } } /* Publish the start point. atlen and btlen are <= 0 here because the reverse pass wrote its pairs at negative offsets in front of the forward trace, so subtracting them grows tlen and adding them moves the trace pointer back. */ apath->abpos = trimx; apath->bbpos = trimy; apath->diffs = apath->diffs + trimd; apath->tlen = apath->tlen - atlen; apath->trace = atrace + atlen; bpath->tlen = bpath->tlen - btlen; bpath->trace = btrace + btlen; } return (0); } /* Find the longest local alignment between aseq and bseq that passes through the seed given by the diagonal range [low,hgh] on anti-diagonal anti (the older "(xcnt,ycnt)" wording does not match the parameter list). See associated .h file for the precise definition of the interface. lbord/hbord, when non-negative, cap how far below low / above hgh the diagonal band may grow. When negative the band is unbounded, except that for a self-comparison (aseq == bseq) a seed with low >= 0 is kept on diagonals >= 1 and a seed with hgh <= 0 on diagonals <= -1. The returned B path and both trace arrays are carved out of work->points, so presumably they stay valid only until the work data is used again. Takes the EXIT(NULL) path if a buffer cannot be grown or a wave pass fails. */ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec, int low, int hgh, int anti, int lbord, int hbord) { _Work_Data *work = ( _Work_Data *) ework; _Align_Spec *spec = (_Align_Spec *) espec; Path *apath, *bpath; int aoff, boff; int minp, maxp; int selfie; /* Size the wave vector for the seed band (at least 10000 diagonals) and make work->points large enough for the B Path record plus 4*maxtp uint16 trace entries. Each trace pointer is then placed inside that block with room in front of it for the reverse pass. */ { int alen, blen; int maxtp, wsize; alen = align->alen; blen = align->blen; if (hgh-low >= 7500) wsize = VectorEl*(hgh-low+1); else wsize = VectorEl*10000; if (wsize >= work->vecmax) if (enlarge_vector(work,wsize)) EXIT(NULL); if (alen < blen) maxtp = 2*(blen/spec->trace_space+2); else maxtp = 2*(alen/spec->trace_space+2); wsize = 4*maxtp*sizeof(uint16) + sizeof(Path); if (wsize > work->pntmax) if (enlarge_points(work,wsize)) EXIT(NULL); apath = align->path; bpath = (Path *) work->points; apath->trace = ((uint16 *) (bpath+1)) + maxtp; bpath->trace = ((uint16 *) apath->trace) + 2*maxtp; } #ifdef DEBUG_PASSES printf("\n"); #endif selfie = (align->aseq == align->bseq); if (lbord < 0) { if (selfie && low >= 0) minp = 1; else minp = -INT32_MAX; } else minp = low-lbord; if (hbord < 0) { if (selfie && hgh <= 0) maxp = -1; else maxp = INT32_MAX; } else maxp = hgh+hbord; if 
(ACOMP(align->flags)) { aoff = align->alen % spec->trace_space; boff = 0; } else if (COMP(align->flags)) { aoff = 0; boff = align->blen % spec->trace_space; } else { aoff = 0; boff = 0; } if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp,aoff,boff)) EXIT(NULL); #ifdef DEBUG_PASSES printf("F1 (%d,%d) ~ %d => (%d,%d) %d\n", (2*anti+(low+hgh))/4,(anti-(low+hgh))/4,hgh-low, apath->aepos,apath->bepos,apath->diffs); #endif if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp,aoff,boff)) EXIT(NULL); #ifdef DEBUG_PASSES printf("R1 (%d,%d) => (%d,%d) %d\n", (anti+low)/2,(anti-low)/2,apath->abpos,apath->bbpos,apath->diffs); #endif bpath->diffs = apath->diffs; if (ACOMP(align->flags)) { uint16 *trace = (uint16 *) apath->trace; uint16 p; int i, j; bpath->aepos = apath->bepos; bpath->bepos = apath->aepos; bpath->abpos = apath->bbpos; bpath->bbpos = apath->abpos; apath->abpos = align->alen - bpath->bepos; apath->bbpos = align->blen - bpath->aepos; apath->aepos = align->alen - bpath->bbpos; apath->bepos = align->blen - bpath->abpos; i = apath->tlen-2; j = 0; while (j < i) { p = trace[i]; trace[i] = trace[j]; trace[j] = p; p = trace[i+1]; trace[i+1] = trace[j+1]; trace[j+1] = p; i -= 2; j += 2; } } else if (COMP(align->flags)) { uint16 *trace = (uint16 *) bpath->trace; uint16 p; int i, j; bpath->abpos = align->blen - apath->bepos; bpath->bbpos = align->alen - apath->aepos; bpath->aepos = align->blen - apath->bbpos; bpath->bepos = align->alen - apath->abpos; i = bpath->tlen-2; j = 0; while (j < i) { p = trace[i]; trace[i] = trace[j]; trace[j] = p; p = trace[i+1]; trace[i+1] = trace[j+1]; trace[j+1] = p; i -= 2; j += 2; } } else { bpath->aepos = apath->bepos; bpath->bepos = apath->aepos; bpath->abpos = apath->bbpos; bpath->bbpos = apath->abpos; } #ifdef DEBUG_POINTS { uint16 *trace = (uint16 *) apath->trace; int a, h; printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos); printf(" %c\n",((COMP(align->flags) || 
ACOMP(align->flags)) ? 'c' : 'n')); a = apath->bbpos; for (h = 1; h < apath->tlen; h += 2) { int dif = trace[h-1]; int del = trace[h]; a += del; printf(" %d / %d (%d)\n",dif,del,a); } } { uint16 *trace = (uint16 *) bpath->trace; int a, h; printf("\nB-path (%d,%d)->(%d,%d)",bpath->abpos,bpath->bbpos,bpath->aepos,bpath->bepos); printf(" %c [%d,%d]\n",((COMP(align->flags) || ACOMP(align->flags)) ? 'c' : 'n'), align->blen,align->alen); a = bpath->bbpos; for (h = 1; h < bpath->tlen; h += 2) { int dif = trace[h-1]; int del = trace[h]; a += del; printf(" %d / %d (%d)\n",dif,del,a); } } #endif return (bpath); } /****************************************************************************************\ * * * EXTENSION VERSION OF LOCAL ALIGNMENT * * * \****************************************************************************************/ static int VectorEn = 4*sizeof(int) + sizeof(BVEC); static int forward_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align, int midd, int mida, int minp, int maxp) { char *aseq = align->aseq; char *bseq = align->bseq; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *NA; int *_HA, *_NA; Pebble *cells; int avail, cmax; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha; int morea, morey, mored; int moreha; int more, morem, lasta; int aclip, bclip; hgh = midd; low = midd; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEn; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; } /* Compute 0-wave starting from mid-line */ more = 1; aclip = INT32_MAX; bclip = 
-INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; morem = -1; { int k; char *a; a = aseq + hgh; for (k = hgh; k >= low; k--) { int y, c, d; int ha, na; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k)/TRACE_SPACE)*TRACE_SPACE; #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; } c = (y << 1) + k; while (y+k >= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; } if (c > besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; NA[k] = na; a -= 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } #ifdef DEBUG_WAVE printf("\nFORWARD WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif /* Compute 
successive waves until no furthest reaching points remain */ while (more && lasta >= besta - TRIM_MLAG) { int k, n; int ua; BVEC t; int am, ac, ap; char *a; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move; int64 vd, md, had, nad, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEn)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEn; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; } if (low > minp) { low -= 1; NA[low] = NA[low+1]; V[low] = -1; } if (hgh < maxp) { hgh += 1; NA[hgh] = NA[hgh-1]; V[hgh] = am = -1; } else am = V[hgh]; dif += 1; ac = V[hgh+1] = V[low-1] = -1; a = aseq + hgh; t = PATH_INT; n = PATH_LEN; ua = -1; for (k = hgh; k >= 
low; k--) { int y, m; int ha; int c, d; BVEC b; Pebble *pb; ap = ac; ac = am; am = V[d = k-1]; if (ac < am) if (am < ap) { c = ap+1; m = n; b = t; ha = ua; } else { c = am+1; m = M[d]; b = T[d]; ha = HA[d]; } else if (ac < ap) { c = ap+1; m = n; b = t; ha = ua; } else { c = ac+2; m = M[k]; b = T[k]; ha = HA[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k >= NA[k]) { if (cells[ha].mark < NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] += TRACE_SPACE; } if (c > besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; } } } t = T[k]; n = M[k]; ua = HA[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; a -= 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta-besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } n = besta - WAVE_LAG; while (hgh >= low) if (V[hgh] < n) hgh -= 1; else { while (V[low] < n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) 
MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; int atlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; } else trimx = trima-trimy; atlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = (mida-k)/2; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout); #endif for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; atrace[atlen++] = (uint16) (d-e); atrace[atlen++] = (uint16) (a-b); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[atlen++] = (uint16) (trimd-e); atrace[atlen++] = (uint16) (trimy-b); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b)); atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } apath->aepos = trimx; apath->bepos = trimy; apath->diffs = trimd; apath->tlen = atlen; } return (0); } static int reverse_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align, int midd, int mida, int minp, int maxp) { char *aseq = align->aseq - 1; char *bseq = align->bseq - 1; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *NA; int *_HA, *_NA; Pebble *cells; int avail, cmax; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha; int morea, morey, mored; int moreha; int more, morem, lasta; int aclip, bclip; hgh 
= midd; low = midd; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEn; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; } more = 1; aclip = -INT32_MAX; bclip = INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; morem = -1; { int k; char *a; a = aseq + low; for (k = low; k <= hgh; k++) { int y, c, d; int ha, na; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE; #ifdef SHOW_TPS printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y+k; ha = avail++; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; } c = (y << 1) + k; while (y+k <= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na -= TRACE_SPACE; } if (c < besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; NA[k] = na; a += 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - 
besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } #ifdef DEBUG_WAVE printf("\nREVERSE WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif while (more && lasta <= besta + TRIM_MLAG) { int k, n; int ua; BVEC t; int am, ac, ap; char *a; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move, vd, md, had, nad, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEn)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEn; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, 
span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; } if (low > minp) { low -= 1; NA[low] = NA[low+1]; V[low] = ap = INT32_MAX; } else ap = V[low]; if (hgh < maxp) { hgh += 1; NA[hgh] = NA[hgh-1]; V[hgh] = INT32_MAX; } dif += 1; ac = V[hgh+1] = V[low-1] = INT32_MAX; a = aseq + low; t = PATH_INT; n = PATH_LEN; ua = -1; for (k = low; k <= hgh; k++) { int y, m; int ha; int c, d; BVEC b; Pebble *pb; am = ac; ac = ap; ap = V[d = k+1]; if (ac > ap) if (ap > am) { c = am-1; m = n; b = t; ha = ua; } else { c = ap-1; m = M[d]; b = T[d]; ha = HA[d]; } else if (ac > am) { c = am-1; m = n; b = t; ha = ua; } else { c = ac-2; m = M[k]; b = T[k]; ha = HA[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k <= NA[k]) { if (cells[ha].mark > NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] -= TRACE_SPACE; } if (c < besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; } } } t = T[k]; n = M[k]; ua = HA[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; a += 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea 
- aclip)/2; mored = dif; moreha = HA[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } n = besta + WAVE_LAG; while (hgh >= low) if (V[hgh] > n) hgh -= 1; else { while (V[low] > n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; int atlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; } else trimx = trima-trimy; atlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark - k; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout); #endif if ((b+k)%TRACE_SPACE != 0) { h = cells[h].ptr; if (h < 0) { a = trimy; d = trimd; } else { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif atrace[--atlen] = (uint16) (b-a); atrace[--atlen] = (uint16) (d-e); b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; atrace[--atlen] = (uint16) (b-a); d = cells[h].diff; atrace[--atlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[--atlen] = (uint16) (b-trimy); atrace[--atlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy)); atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / 
%3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } } apath->abpos = trimx; apath->bbpos = trimy; apath->diffs = trimd; apath->tlen = - atlen; apath->trace = atrace + atlen; } return (0); } /* Find the longest local alignment between aseq and bseq through (xcnt,ycnt) See associated .h file for the precise definition of the interface. */ int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec, int diag, int anti, int lbord, int hbord, int prefix) { _Work_Data *work = ( _Work_Data *) ework; _Align_Spec *spec = (_Align_Spec *) espec; Path *apath; int minp, maxp; { int alen, blen; int maxtp, wsize; alen = align->alen; blen = align->blen; wsize = VectorEn*10000; if (wsize >= work->vecmax) if (enlarge_vector(work,wsize)) EXIT(1); if (alen < blen) maxtp = 2*(blen/spec->trace_space+2); else maxtp = 2*(alen/spec->trace_space+2); wsize = 2*maxtp*sizeof(uint16); if (wsize > work->pntmax) if (enlarge_points(work,wsize)) EXIT(1); apath = align->path; apath->trace = ((uint16 *) work->points) + maxtp; } #ifdef DEBUG_PASSES printf("\n"); #endif if (lbord < 0) minp = -INT32_MAX; else minp = diag-lbord; if (hbord < 0) maxp = INT32_MAX; else maxp = diag+hbord; if (prefix) { if (reverse_extend(work,spec,align,diag,anti,minp,maxp)) EXIT(1); apath->aepos = (anti+diag)/2; apath->bepos = (anti-diag)/2; #ifdef DEBUG_PASSES printf("E1 (%d,%d) => (%d,%d) %d\n", (anti+diag)/2,(anti-diag)/2,apath->abpos,apath->bbpos,apath->diffs); #endif } else { if (forward_extend(work,spec,align,diag,anti,minp,maxp)) EXIT(1); apath->abpos = (anti+diag)/2; apath->bbpos = (anti-diag)/2; #ifdef DEBUG_PASSES printf("F1 (%d,%d) => (%d,%d) %d\n", (anti+diag)/2,(anti-diag)/2,apath->aepos,apath->bepos,apath->diffs); #endif } #ifdef DEBUG_POINTS { uint16 *trace = (uint16 *) apath->trace; int a, h; printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos); printf(" %c\n",(COMP(align->flags) ? 
'c' : 'n')); a = apath->bbpos; for (h = 1; h < apath->tlen; h += 2) { int dif = trace[h-1]; int del = trace[h]; a += del; printf(" %d / %d (%d)\n",dif,del,a); } } #endif return (0); } /****************************************************************************************\ * * * OVERLAP MANIPULATION * * * \****************************************************************************************/ static int64 PtrSize = sizeof(void *); static int64 OvlIOSize = sizeof(Overlap) - sizeof(void *); int Read_Overlap(FILE *input, Overlap *ovl) { if (fread( ((char *) ovl) + PtrSize, OvlIOSize, 1, input) != 1) return (1); return (0); } int Read_Trace(FILE *input, Overlap *ovl, int tbytes) { if (tbytes > 0 && ovl->path.tlen > 0) { if (fread(ovl->path.trace, tbytes*ovl->path.tlen, 1, input) != 1) return (1); } return (0); } int Write_Overlap(FILE *output, Overlap *ovl, int tbytes) { if (fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output) != 1) return (1); if (ovl->path.trace != NULL) if (fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output) != (size_t) ovl->path.tlen) return (1); return (0); } int Compress_TraceTo8(Overlap *ovl, int check) { uint16 *t16 = (uint16 *) ovl->path.trace; uint8 *t8 = (uint8 *) ovl->path.trace; int j, x; if (check) for (j = 0; j < ovl->path.tlen; j++) { x = t16[j]; if (x > 255) { fprintf(stderr,"%s: Compression of trace to bytes fails, value too big\n",Prog_Name); EXIT(1); } t8[j] = (uint8) x; } else for (j = 0; j < ovl->path.tlen; j++) t8[j] = (uint8) (t16[j]); return (0); } void Decompress_TraceTo16(Overlap *ovl) { uint16 *t16 = (uint16 *) ovl->path.trace; uint8 *t8 = (uint8 *) ovl->path.trace; int j; for (j = ovl->path.tlen-1; j >= 0; j--) t16[j] = t8[j]; } void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent) { int i; fprintf(output,"%*s%d vs. ",indent,"",ovl->aread); if (COMP(ovl->flags)) fprintf(output,"c(%d)\n",ovl->bread); else fprintf(output,"%d\n",ovl->bread); fprintf(output,"%*s [%d,%d] vs [%d,%d] w. 
%d diffs\n",indent,"", ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos,ovl->path.diffs); if (tbytes == 1) { uint8 *trace = (uint8 *) (ovl->path.trace); if (trace != NULL) { int p = ovl->path.bbpos + trace[1]; fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); for (i = 3; i < ovl->path.tlen; i += 2) { if (i%10 == 0) fprintf(output,"\n%*s",indent+6,""); p += trace[i]; fprintf(output," %3d/%5d",trace[i-1],p); } fprintf(output,"\n"); } } else { uint16 *trace = (uint16 *) (ovl->path.trace); if (trace != NULL) { int p = ovl->path.bbpos + trace[1]; fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); for (i = 3; i < ovl->path.tlen; i += 2) { if (i%10 == 0) fprintf(output,"\n%*s",indent+6,""); p += trace[i]; fprintf(output," %3d/%5d",trace[i-1],p); } fprintf(output,"\n"); } } } int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname) { int i, p, q; if (tspace != 0) { if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2) { if (verbose) EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname); return (1); } p = ovl->path.bbpos; if (tspace <= TRACE_XOVR) { uint8 *trace8 = (uint8 *) ovl->path.trace; for (i = 1; i < ovl->path.tlen; i += 2) p += trace8[i]; } else { uint16 *trace16 = (uint16 *) ovl->path.trace; for (i = 1; i < ovl->path.tlen; i += 2) p += trace16[i]; } if (p != ovl->path.bepos) { if (verbose) EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname); return (1); } } else { uint16 *trace16 = (uint16 *) ovl->path.trace; p = ovl->path.bbpos; q = ovl->path.abpos; for (i = 1; i < ovl->path.tlen; i += 2) { p += trace16[i]; q += trace16[i-1]; } if (p != ovl->path.bepos || q != ovl->path.aepos) { if (verbose) EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname); return (1); } } return (0); } void Flip_Alignment(Alignment *align, int full) { char *aseq = align->aseq; char *bseq = align->bseq; int alen = align->alen; int blen = align->blen; Path *path = align->path; int comp 
/* Reverse-complement, in place, the len symbols starting at aseq.  Every symbol x is
   replaced by 3-x and the order of the symbols is reversed, so applying the routine
   twice restores the original contents.                                             */

void Complement_Seq(char *aseq, int len)
{ int lo, hi;

  //  Swap-and-complement pairs working inwards from both ends

  for (lo = 0, hi = len-1; lo < hi; lo++, hi--)
    { char left = aseq[lo];

      aseq[lo] = (char) (3 - aseq[hi]);
      aseq[hi] = (char) (3 - left);
    }

  //  An odd length leaves one middle symbol that is complemented where it stands

  if (lo == hi)
    aseq[lo] = (char) (3 - aseq[lo]);
}
//  Symbol code to character tables (lower and upper case):
//    0-3 bases, 4 '.', 5 '[', 6 ']', 7 '-'

static char ToL[8] = { 'a', 'c', 'g', 't', '.', '[', ']', '-' };
static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' };

/* Print to file the alignment recorded in align->path, whose trace is read as an array
   of ints: a negative entry -p advances both sequences to A-index p and then shows a
   '-' in the A row opposite b[j]; a non-negative entry p advances to B-index p and then
   shows a[i] opposite a '-' in the B row.  (Indices are 1-based.)

     indent - blanks put in front of every output line
     width  - alignment columns per output row
     border - maximum number of unaligned prefix / suffix columns shown
     upper  - non-zero selects the upper case symbol table
     coord  - if > 0, field width of the coordinate printed before each A and B row

   Every row is followed by its percentage of difference columns.  Rows are staged in
   work->vector, which is enlarged if needed.  Returns 0, or takes EXIT(1) if the buffer
   cannot be enlarged; nothing is printed when the trace pointer is NULL.

   match and diff are zeroed before any column is produced and the percentage is only
   printed when at least one aligned column was counted: a row can be flushed while the
   unaligned prefix is still being emitted (border > width), and at that point the
   counters would otherwise be read uninitialized and divided 0 by 0.                  */

int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework,
                    int indent, int width, int border, int upper, int coord)
{ _Work_Data *work  = (_Work_Data *) ework;
  int        *trace = align->path->trace;
  int         tlen  = align->path->tlen;

  char *Abuf, *Bbuf, *Dbuf;
  int   i, j, o;
  char *a, *b;
  char  mtag, dtag;
  int   prefa, prefb;
  int   aend, bend;
  int   comp, blen;
  int   sa, sb;
  int   match, diff;
  char *N2A;

  if (trace == NULL) return (0);

#ifdef SHOW_TRACE
  fprintf(file,"\nTrace:\n");
  for (i = 0; i < tlen; i++)
    fprintf(file," %3d\n",trace[i]);
#endif

  o = sizeof(char)*3*(width+1);
  if (o > work->vecmax)
    if (enlarge_vector(work,o))
      EXIT(1);

  if (upper)
    N2A = ToU;
  else
    N2A = ToL;

  Abuf = (char *) work->vector;
  Bbuf = Abuf + (width+1);
  Dbuf = Bbuf + (width+1);

  aend = align->path->aepos;
  bend = align->path->bepos;

  comp = COMP(align->flags);
  blen = align->blen;

  Abuf[width] = Bbuf[width] = Dbuf[width] = '\0';

  /* buffer/output next column: a full row is printed first if the buffer is full */

#define COLUMN(x,y)                                                     \
{ int u, v;                                                             \
  if (o >= width)                                                       \
    { fprintf(file,"\n");                                               \
      fprintf(file,"%*s",indent,"");                                    \
      if (coord > 0)                                                    \
        { if (sa < aend)                                                \
            fprintf(file," %*d",coord,sa);                              \
          else                                                          \
            fprintf(file," %*s",coord,"");                              \
          fprintf(file," %s\n",Abuf);                                   \
          fprintf(file,"%*s %*s %s\n",indent,"",coord,"",Dbuf);         \
          fprintf(file,"%*s",indent,"");                                \
          if (sb < bend)                                                \
            { if (comp)                                                 \
                fprintf(file," %*d",coord,blen-sb);                     \
              else                                                      \
                fprintf(file," %*d",coord,sb);                          \
            }                                                           \
          else                                                          \
            fprintf(file," %*s",coord,"");                              \
          fprintf(file," %s",Bbuf);                                     \
        }                                                               \
      else                                                              \
        { fprintf(file," %s\n",Abuf);                                   \
          fprintf(file,"%*s %s\n",indent,"",Dbuf);                      \
          fprintf(file,"%*s %s",indent,"",Bbuf);                        \
        }                                                               \
      if (diff+match > 0)                                               \
        fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match));            \
      else                                                              \
        fprintf(file,"\n");                                             \
      o  = 0;                                                           \
      sa = i-1;                                                         \
      sb = j-1;                                                         \
      match = diff = 0;                                                 \
    }                                                                   \
  u = (x);                                                              \
  v = (y);                                                              \
  if (u == 4 || v == 4)                                                 \
    Dbuf[o] = ' ';                                                      \
  else if (u == v)                                                      \
    Dbuf[o] = mtag;                                                     \
  else                                                                  \
    Dbuf[o] = dtag;                                                     \
  Abuf[o] = N2A[u];                                                     \
  Bbuf[o] = N2A[v];                                                     \
  o += 1;                                                               \
}

  a = align->aseq - 1;      //  a[] and b[] are indexed from 1
  b = align->bseq - 1;

  o = 0;
  i = j = 1;

  match = diff = 0;         //  must be defined before the first possible row flush

  prefa = align->path->abpos;
  prefb = align->path->bbpos;

  if (prefa > border)
    { i = prefa-(border-1);
      prefa = border;
    }
  if (prefb > border)
    { j = prefb-(border-1);
      prefb = border;
    }

  sa   = i-1;
  sb   = j-1;
  mtag = ':';
  dtag = ':';

  /* Output the unaligned prefix, padding the shorter side with '.' */

  while (prefa > prefb)
    { COLUMN(a[i],4)
      i += 1;
      prefa -= 1;
    }
  while (prefb > prefa)
    { COLUMN(4,b[j])
      j += 1;
      prefb -= 1;
    }
  while (prefa > 0)
    { COLUMN(a[i],b[j])
      i += 1;
      j += 1;
      prefa -= 1;
    }

  mtag = '[';
  if (prefb > 0)
    COLUMN(5,5)

  mtag = '|';
  dtag = '*';

  { int p, c;      /* Output columns of alignment til reach trace end */

    for (c = 0; c < tlen; c++)
      if ((p = trace[c]) < 0)
        { p = -p;
          while (i != p)
            { COLUMN(a[i],b[j])
              if (a[i] == b[j])
                match += 1;
              else
                diff += 1;
              i += 1;
              j += 1;
            }
          COLUMN(7,b[j])
          j += 1;
          diff += 1;
        }
      else
        { while (j != p)
            { COLUMN(a[i],b[j])
              if (a[i] == b[j])
                match += 1;
              else
                diff += 1;
              i += 1;
              j += 1;
            }
          COLUMN(a[i],7)
          i += 1;
          diff += 1;
        }
    p = align->path->aepos;
    while (i <= p)
      { COLUMN(a[i],b[j])
        if (a[i] == b[j])
          match += 1;
        else
          diff += 1;
        i += 1;
        j += 1;
      }
  }

  { int c;     /* Output remaining column including unaligned suffix */

    mtag = ']';
    if (a[i] != 4 && b[j] != 4 && border > 0)
      COLUMN(6,6)

    mtag = ':';
    dtag = ':';

    c = 0;
    while (c < border && (a[i] != 4 || b[j] != 4))
      { if (a[i] != 4)
          { if (b[j] != 4)
              { COLUMN(a[i],b[j])
                i += 1;
                j += 1;
              }
            else
              { COLUMN(a[i],4)
                i += 1;
              }
          }
        else
          { COLUMN(4,b[j])
            j += 1;
          }
        c += 1;
      }
  }

  /* Print remainder of buffered col.s */

  fprintf(file,"\n");
  fprintf(file,"%*s",indent,"");
  if (coord > 0)
    { if (sa < aend)
        fprintf(file," %*d",coord,sa);
      else
        fprintf(file," %*s",coord,"");
      fprintf(file," %.*s\n",o,Abuf);
      fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf);
      fprintf(file,"%*s",indent,"");
      if (sb < bend)
        { if (comp)
            fprintf(file," %*d",coord,blen-sb);
          else
            fprintf(file," %*d",coord,sb);
        }
      else
        fprintf(file," %*s",coord,"");
      fprintf(file," %.*s",o,Bbuf);
    }
  else
    { fprintf(file," %.*s\n",o,Abuf);
      fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf);
      fprintf(file,"%*s %.*s",indent,"",o,Bbuf);
    }
  if (diff+match > 0)
    fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match));
  else
    fprintf(file,"\n");

  fflush(file);
  return (0);
}
fprintf(file,"%*s %.*s",indent,"",o,Bbuf); } if (diff+match > 0) fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); else fprintf(file,"\n"); fflush(file); return (0); } int Print_Reference(FILE *file, Alignment *align, Work_Data *ework, int indent, int block, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = align->path->trace; int tlen = align->path->tlen; char *Abuf, *Bbuf, *Dbuf; int i, j, o; char *a, *b; char mtag, dtag; int prefa, prefb; int aend, bend; int comp, blen; int sa, sb, s0; int match, diff; char *N2A; int vmax; if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); for (i = 0; i < tlen; i++) fprintf(file," %3d\n",trace[i]); #endif vmax = work->vecmax/3; o = sizeof(char)*6*(block+1); if (o > vmax) { if (enlarge_vector(work,3*o)) EXIT(1); vmax = work->vecmax/3; } Abuf = (char *) work->vector; Bbuf = Abuf + vmax; Dbuf = Bbuf + vmax; if (upper) N2A = ToU; else N2A = ToL; aend = align->path->aepos; bend = align->path->bepos; comp = COMP(align->flags); blen = align->blen; #define BLOCK(x,y) \ { int u, v; \ if (i%block == 1 && i != s0 && x < 4 && o > 0) \ { fprintf(file,"\n"); \ fprintf(file,"%*s",indent,""); \ if (coord > 0) \ { if (sa < aend) \ fprintf(file," %*d",coord,sa); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %.*s\n",o,Abuf); \ fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); \ fprintf(file,"%*s",indent,""); \ if (sb < bend) \ if (comp) \ fprintf(file," %*d",coord,blen-sb); \ else \ fprintf(file," %*d",coord,sb); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %.*s",o,Bbuf); \ } \ else \ { fprintf(file," %.*s\n",o,Abuf); \ fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); \ fprintf(file,"%*s %.*s",indent,"",o,Bbuf); \ } \ fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \ o = 0; \ sa = i-1; \ sb = j-1; \ match = diff = 0; \ } \ u = (x); \ v = (y); \ if (u == 4 || v == 4) \ Dbuf[o] = ' '; \ else if (u == v) \ Dbuf[o] = mtag; \ else \ Dbuf[o] = dtag; \ 
Abuf[o] = N2A[u]; \ Bbuf[o] = N2A[v]; \ o += 1; \ if (o >= vmax) \ { if (enlarge_vector(work,3*o)) \ EXIT(1); \ vmax = work->vecmax/3; \ memmove(work->vector+2*vmax,Dbuf,o); \ memmove(work->vector+vmax,Bbuf,o); \ memmove(work->vector,Abuf,o); \ Abuf = (char *) work->vector; \ Bbuf = Abuf + vmax; \ Dbuf = Bbuf + vmax; \ } \ } a = align->aseq - 1; b = align->bseq - 1; o = 0; i = j = 1; prefa = align->path->abpos; prefb = align->path->bbpos; if (prefa > border) { i = prefa-(border-1); prefa = border; } if (prefb > border) { j = prefb-(border-1); prefb = border; } s0 = i; sa = i-1; sb = j-1; mtag = ':'; dtag = ':'; while (prefa > prefb) { BLOCK(a[i],4) i += 1; prefa -= 1; } while (prefb > prefa) { BLOCK(4,b[j]) j += 1; prefb -= 1; } while (prefa > 0) { BLOCK(a[i],b[j]) i += 1; j += 1; prefa -= 1; } mtag = '['; if (prefb > 0) BLOCK(5,5) mtag = '|'; dtag = '*'; match = diff = 0; { int p, c; /* Output columns of alignment til reach trace end */ for (c = 0; c < tlen; c++) if ((p = trace[c]) < 0) { p = -p; while (i != p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } BLOCK(7,b[j]) j += 1; diff += 1; } else { while (j != p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } BLOCK(a[i],7) i += 1; diff += 1; } p = align->path->aepos; while (i <= p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } } { int c; /* Output remaining column including unaligned suffix */ mtag = ']'; if (a[i] != 4 && b[j] != 4 && border > 0) BLOCK(6,6) mtag = ':'; dtag = ':'; c = 0; while (c < border && (a[i] != 4 || b[j] != 4)) { if (a[i] != 4) if (b[j] != 4) { BLOCK(a[i],b[j]) i += 1; j += 1; } else { BLOCK(a[i],4) i += 1; } else { BLOCK(4,b[j]) j += 1; } c += 1; } } /* Print remainder of buffered col.s */ fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (coord > 0) { if (sa < aend) fprintf(file," %*d",coord,sa); else fprintf(file," %*s",coord,""); fprintf(file," %.*s\n",o,Abuf); 
fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); fprintf(file,"%*s",indent,""); if (sb < bend) if (comp) fprintf(file," %*d",coord,blen-sb); else fprintf(file," %*d",coord,sb); else fprintf(file," %*s",coord,""); fprintf(file," %.*s",o,Bbuf); } else { fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); fprintf(file,"%*s %.*s",indent,"",o,Bbuf); } if (diff+match > 0) fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); else fprintf(file,"\n"); fflush(file); return (0); } /* Print an ASCII representation of the overlap in align between fragments a and b to given file. */ static inline void repchar(FILE *file, int symbol, int rep) { while (rep-- > 0) fputc(symbol,file); } void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord) { int alen = align->alen; int blen = align->blen; Path *path = align->path; int comp = COMP(align->flags); int w; fprintf(file,"%*s",indent,""); if (path->abpos > 0) fprintf(file," %*d ",coord,path->abpos); else fprintf(file,"%*s",coord+5,""); if (path->aepos < alen) fprintf(file,"%*s%d",coord+8,"",alen-path->aepos); fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (path->abpos > 0) { fprintf(file,"A "); w = Number_Digits((int64) path->abpos); repchar(file,' ',coord-w); repchar(file,'=',w+3); fputc('+',file); repchar(file,'-',coord+5); } else { fprintf(file,"A %*s",coord+4,""); repchar(file,'-',coord+5); } if (path->aepos < alen) { fputc('+',file); w = Number_Digits((int64) (alen-path->aepos)); repchar(file,'=',w+2); fputc('>',file); repchar(file,' ',w); } else { fputc('>',file); repchar(file,' ',coord+3); } { int asub, bsub; asub = path->aepos - path->abpos; bsub = path->bepos - path->bbpos; fprintf(file," dif/(len1+len2) = %d/(%d+%d) = %5.2f%%\n", path->diffs,asub,bsub,(200.*path->diffs)/(asub+bsub)); } { int sym1e, sym2e; int sym1p, sym2p; if (comp > 0) { sym1p = '<'; sym2p = '-'; sym1e = '<'; sym2e = '='; } else { sym1p = '-'; sym2p = '>'; sym1e = '='; sym2e = '>'; } 
fprintf(file,"%*s",indent,""); if (path->bbpos > 0) { fprintf(file,"B "); w = Number_Digits((int64) path->bbpos); repchar(file,' ',coord-w); fputc(sym1e,file); repchar(file,'=',w+2); fputc('+',file); repchar(file,'-',coord+5); } else { fprintf(file,"B "); repchar(file,' ',coord+3); fputc(sym1p,file); repchar(file,'-',coord+5); } if (path->bepos < blen) { fprintf(file,"+"); w = Number_Digits((int64) (blen-path->bepos)); repchar(file,'=',w+2); fprintf(file,"%c\n",sym2e); } else fprintf(file,"%c\n",sym2p); } fprintf(file,"%*s",indent,""); if (path->bbpos > 0) fprintf(file," %*d ",coord,path->bbpos); else fprintf(file,"%*s",coord+5,""); if (path->bepos < blen) fprintf(file,"%*s%d",coord+8,"",blen-path->bepos); fprintf(file,"\n"); fflush(file); } /****************************************************************************************\ * * * O(ND) trace algorithm * * * \****************************************************************************************/ #ifdef DEBUG_AWAVE static void print_awave(int *V, int low, int hgh) { int k; printf(" [%6d,%6d]: ",low,hgh); for (k = low; k <= hgh; k++) printf(" %3d",V[k]); printf("\n"); fflush(stdout); } #endif #ifdef DEBUG_ALIGN static int depth = 0; #endif typedef struct { int *Stop; // Ongoing stack of alignment indels uint16 *Trace; // Base of Trace Vector char *Aabs, *Babs; // Absolute base of A and B sequences int **PVF, **PHF; // List of waves for iterative np algorithms int mida, midb; // mid point division for mid-point algorithms int *VF, *VB; // Forward/Reverse waves for nd algorithms } Trace_Waves; static int split_nd(char *A, int M, char *B, int N, Trace_Waves *wave, int *px, int *py) { int x, y; int D; int *VF = wave->VF; int *VB = wave->VB; int flow; // fhgh == D ! 
int blow, bhgh; char *a; y = 0; if (N < M) while (y < N && B[y] == A[y]) y += 1; else { while (y < M && B[y] == A[y]) y += 1; if (y >= M && N == M) { *px = *py = M; return (0); } } flow = 0; VF[0] = y; VF[-1] = -2; x = N-M; a = A-x; y = N-1; if (N > M) while (y >= x && B[y] == a[y]) y -= 1; else while (y >= 0 && B[y] == a[y]) y -= 1; blow = bhgh = -x; VB += x; VB[blow] = y; VB[blow-1] = N+1; for (D = 1; 1; D += 1) { int k, r; int am, ac, ap; // Forward wave flow -= 1; am = ac = VF[flow-1] = -2; a = A + D; x = M - D; for (k = D; k >= flow; k--) { ap = ac; ac = am+1; am = VF[k-1]; if (ac < am) if (ap < am) y = am; else y = ap; else if (ap < ac) y = ac; else y = ap; if (blow <= k && k <= bhgh) { r = VB[k]; if (y > r) { D = (D<<1)-1; if (ap > r) y = ap; else if (ac > r) y = ac; else y = r+1; x = k+y; *px = x; *py = y; return (D); } } if (N < x) while (y < N && B[y] == a[y]) y += 1; else while (y < x && B[y] == a[y]) y += 1; VF[k] = y; a -= 1; x += 1; } #ifdef DEBUG_AWAVE print_awave(VF,flow,D); #endif // Reverse Wave bhgh += 1; blow -= 1; am = ac = VB[blow-1] = N+1; a = A + bhgh; x = -bhgh; for (k = bhgh; k >= blow; k--) { ap = ac+1; ac = am; am = VB[k-1]; if (ac > am) if (ap > am) y = am; else y = ap; else if (ap > ac) y = ac; else y = ap; if (flow <= k && k <= D) { r = VF[k]; if (y <= r) { D = (D << 1); if (ap <= r) y = ap; else if (ac <= r) y = ac; else y = r; x = k+y; *px = x; *py = y; return (D); } } y -= 1; if (x > 0) while (y >= x && B[y] == a[y]) y -= 1; else while (y >= 0 && B[y] == a[y]) y -= 1; VB[k] = y; a -= 1; x += 1; } #ifdef DEBUG_AWAVE print_awave(VB,blow,bhgh); #endif } } static int trace_nd(char *A, int M, char *B, int N, Trace_Waves *wave, int tspace) { int x, y; int D, s; #ifdef DEBUG_ALIGN printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); fflush(stdout); #endif if (M <= 0) { y = (((A-wave->Aabs)/tspace) << 1); wave->Trace[y] += N; wave->Trace[y+1] += N; #ifdef DEBUG_TRACE printf("%*s Adding1 (%d,%d) to tp 
%d(%d,%d)\n",depth,"",N,N,y>>1, wave->Trace[y+1],wave->Trace[y]); fflush(stdout); #endif return (N); } if (N <= 0) { x = A - wave->Aabs; y = x/tspace; x = (y+1)*tspace - x; y <<= 1; for (s = M; s > 0; s -= x, x = tspace) { if (x > s) x = s; wave->Trace[y] += x; #ifdef DEBUG_TRACE printf("%*s Adding2 (0,%d) to tp %d(%d,%d)\n",depth,"",x,y>>1, wave->Trace[y+1],wave->Trace[y]); fflush(stdout); #endif y += 2; } return (M); } D = split_nd(A,M,B,N,wave,&x,&y); if (D > 1) { #ifdef DEBUG_ALIGN printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D); fflush(stdout); depth += 2; #endif s = A-wave->Aabs; if ((s/tspace+1)*tspace - s >= x) { s = ((s/tspace)<<1); wave->Trace[s] += (D+1)/2; wave->Trace[s+1] += y; #ifdef DEBUG_TRACE printf("%*s Adding3 (%d,%d) to tp %d(%d,%d)\n",depth,"",y,(D+1)/2,s>>1, wave->Trace[s+1],wave->Trace[s]); fflush(stdout); #endif } else trace_nd(A,x,B,y,wave,tspace); s = (A+x)-wave->Aabs; if ((s/tspace+1)*tspace - s >= M-x) { s = ((s/tspace)<<1); wave->Trace[s] += D/2; wave->Trace[s+1] += N-y; #ifdef DEBUG_TRACE printf("%*s Adding4 (%d,%d)) to tp %d(%d,%d)\n",depth,"",N-y,D/2,s>>1, wave->Trace[s+1],wave->Trace[s]); fflush(stdout); #endif } else trace_nd(A+x,M-x,B+y,N-y,wave,tspace); #ifdef DEBUG_ALIGN depth -= 2; #endif } else { int u, v; if (D == 0 || M < N) s = x; else s = x-1; if (s > 0) { u = A - wave->Aabs; v = u/tspace; u = (v+1)*tspace - u; for (v <<= 1; s > 0; s -= u, u = tspace) { if (u > s) u = s; wave->Trace[v+1] += u; #ifdef DEBUG_TRACE printf("%*s Adding5 (%d,0)) to tp %d(%d,%d)\n",depth,"",u,v>>1, wave->Trace[v+1],wave->Trace[v]); fflush(stdout); #endif v += 2; } } if (D == 0) return (D); if (M < N) y = ((((A+x)-wave->Aabs)/tspace)<<1); else y = ((((A+(x-1))-wave->Aabs)/tspace)<<1); wave->Trace[y] += 1; if (M <= N) wave->Trace[y+1] += 1; #ifdef DEBUG_TRACE printf("%*s Adding5 (%d,1)) to tp %d(%d,%d)\n",depth,"",N>=M,y>>1, wave->Trace[y+1],wave->Trace[y]); fflush(stdout); #endif s = M-x; if (s > 0) { u = (A+x) - wave->Aabs; v = u/tspace; u = 
(v+1)*tspace - u; for (v <<= 1; s > 0; s -= u, u = tspace) { if (u > s) u = s; wave->Trace[v+1] += u; #ifdef DEBUG_TRACE printf("%*s Adding5 (%d,0)) to tp %d(%d,%d)\n",depth,"",u,v>>1, wave->Trace[v+1],wave->Trace[v]); fflush(stdout); #endif v += 2; } } } return (D); } static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave) { int x, y; int D; #ifdef DEBUG_ALIGN printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); #endif if (M <= 0) { x = (wave->Aabs-A)-1; for (y = 1; y <= N; y++) { *wave->Stop++ = x; #ifdef DEBUG_SCRIPT printf("%*s *I %ld(%ld)\n",depth,"",y+(B-wave->Babs),(A-wave->Aabs)+1); #endif } return (N); } if (N <= 0) { y = (B-wave->Babs)+1; for (x = 1; x <= M; x++) { *wave->Stop++ = y; #ifdef DEBUG_SCRIPT printf("%*s *D %ld(%ld)\n",depth,"",x+(A-wave->Aabs),(B-wave->Babs)+1); #endif } return (M); } D = split_nd(A,M,B,N,wave,&x,&y); if (D > 1) { #ifdef DEBUG_ALIGN printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D); fflush(stdout); depth += 2; #endif dandc_nd(A,x,B,y,wave); dandc_nd(A+x,M-x,B+y,N-y,wave); #ifdef DEBUG_ALIGN depth -= 2; #endif } else if (D == 1) { if (M > N) { *wave->Stop++ = (B-wave->Babs)+y+1; #ifdef DEBUG_SCRIPT printf("%*s D %ld(%ld)\n",depth,"",(A-wave->Aabs)+x,(B-wave->Babs)+y+1); #endif } else if (M < N) { *wave->Stop++ = (wave->Aabs-A)-x-1; #ifdef DEBUG_SCRIPT printf("%*s I %ld(%ld)\n",depth,"",(B-wave->Babs)+y,(A-wave->Aabs)+x+1); #endif } #ifdef DEBUG_SCRIPT else printf("%*s %ld S %ld\n",depth,"",(wave->Aabs-A)+x,(B-wave->Babs)+y); #endif } return (D); } int Compute_Alignment(Alignment *align, Work_Data *ework, int task, int tspace) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; int L, D; int asub, bsub; char *aseq, *bseq; Path *path; int *trace; uint16 *strace; path = align->path; asub = path->aepos-path->abpos; bsub = path->bepos-path->bbpos; aseq = align->aseq+path->abpos; bseq = align->bseq+path->bbpos; if (task != DIFF_ONLY) { if (task == DIFF_TRACE || task == PLUS_TRACE) L = 
2*(((path->aepos + (tspace-1))/tspace - path->abpos/tspace) + 1)*sizeof(uint16); else if (asub < bsub) L = bsub*sizeof(int); else L = asub*sizeof(int); if (L > work->tramax) if (enlarge_trace(work,L)) EXIT(1); } trace = ((int *) work->trace); strace = ((uint16 *) work->trace); if (asub > bsub) D = (4*asub+6)*sizeof(int); else D = (4*bsub+6)*sizeof(int); if (D > work->vecmax) if (enlarge_vector(work,D)) EXIT(1); if (asub > bsub) { wave.VF = ((int *) work->vector) + (asub+1); wave.VB = wave.VF + (2*asub+3); } else { wave.VF = ((int *) work->vector) + (bsub+1); wave.VB = wave.VF + (2*bsub+3); } wave.Aabs = align->aseq; wave.Babs = align->bseq; if (task == DIFF_ONLY) { wave.mida = -1; if (asub <= 0) path->diffs = bsub; else if (bsub <= 0) path->diffs = asub; else path->diffs = split_nd(aseq,asub,bseq,bsub,&wave,&wave.mida,&wave.midb); path->trace = NULL; path->tlen = -1; return (0); } else if (task < DIFF_ONLY && wave.mida >= 0) { int x = wave.mida; int y = wave.midb; if (task == PLUS_ALIGN) { wave.Stop = trace; dandc_nd(aseq,x,bseq,y,&wave); dandc_nd(aseq+x,asub-x,bseq+y,bsub-y,&wave); path->tlen = wave.Stop - trace; } else { int i, n; wave.Trace = strace - 2*(path->abpos/tspace); n = L/sizeof(uint16); for (i = 0; i < n; i++) strace[i] = 0; trace_nd(aseq,x,bseq,y,&wave,tspace); trace_nd(aseq+x,asub-x,bseq+y,bsub-y,&wave,tspace); if (strace[n-1] != 0) // Last element is to capture all inserts on TP boundary { strace[n-3] += strace[n-1]; strace[n-4] += strace[n-2]; } path->tlen = n-2; #ifdef DEBUG_SCRIPT printf(" Trace:\n"); for (i = 0; i < path->tlen; i += 2) printf(" %3d %3d\n",strace[i],strace[i+1]); fflush(stdout); #endif } } else { if (task == DIFF_ALIGN) { wave.Stop = trace; path->diffs = dandc_nd(aseq,asub,bseq,bsub,&wave); path->tlen = wave.Stop - trace; } else { int i, n; wave.Trace = strace - 2*(path->abpos/tspace); n = L/sizeof(uint16); for (i = 0; i < n; i++) strace[i] = 0; path->diffs = trace_nd(aseq,asub,bseq,bsub,&wave,tspace); if (strace[n-1] != 0) // 
Last element is to capture all inserts on TP boundary { strace[n-3] += strace[n-1]; strace[n-4] += strace[n-2]; } path->tlen = n-2; #ifdef DEBUG_SCRIPT printf(" Trace:\n"); for (i = 0; i < path->tlen; i += 2) printf(" %3d %3d\n",strace[i],strace[i+1]); fflush(stdout); #endif } } path->trace = trace; return (0); } /****************************************************************************************\ * * * O(NP) tracing algorithms * * * \****************************************************************************************/ /* Iterative O(np) algorithm for finding the alignment between two substrings (specified by a Path record). The variation includes handling substitutions and guarantees to find left-most alignments so that low complexity runs are always aligned in the same way. */ #ifdef DEBUG_ALIGN static int ToA[4] = { 'a', 'c', 'g', 't' }; #endif static char *TP_Align = "Bad alignment between trace points (Compute_Trace), source DB likely incorrect"; static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode, int dmax) { int **PVF = wave->PVF; int **PHF = wave->PHF; int D; int del = M-N; { int *F0, *F1, *F2; int *HF; int low, hgh; int posl, posh; #ifdef DEBUG_ALIGN printf("\n BASE %ld,%ld: %d vs %d\n",A-wave->Aabs,B-wave->Babs,M,N); printf(" A = "); for (D = 0; D < M; D++) printf("%c",ToA[(int) A[D]]); printf("\n"); printf(" B = "); for (D = 0; D < N; D++) printf("%c",ToA[(int) B[D]]); printf("\n"); #endif if (del >= 0) { low = 0; hgh = del; } else { low = del; hgh = 0; } posl = -dmax; posh = dmax; if (wave->Aabs == wave->Babs) { if (B == A) { EPRINTF(EPLACE,"%s: self comparison starts on diagonal 0 (Compute_Trace)\n",Prog_Name); EXIT(-1); } else if (B < A) { if ((B-A)+1 > posl) posl = (B-A)+1; } else { if ((B-A)-1 < posh) posh = (B-A)-1; } } F1 = PVF[-2]; F0 = PVF[-1]; for (D = low-1; D <= hgh+1; D++) F1[D] = F0[D] = -2; F0[0] = -1; low += 1; hgh -= 1; for (D = 0; 1; D += 1) { int k, i, j; int am, ac, ap; char *a; if (D > dmax) { 
EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Align); EXIT(-1); } F2 = F1; F1 = F0; F0 = PVF[D]; HF = PHF[D]; if ((D & 0x1) == 0) { if (low > posl) low -= 1; if (hgh < posh) hgh += 1; } F0[hgh+1] = F0[low-1] = -2; #define FS_MOVE(mdir,pdir) \ ac = F1[k]+1; \ if (ac < am) \ if (ap < am) \ { HF[k] = mdir; \ j = am; \ } \ else \ { HF[k] = pdir; \ j = ap; \ } \ else \ if (ap < ac) \ { HF[k] = 0; \ j = ac; \ } \ else \ { HF[k] = pdir; \ j = ap; \ } \ \ if (N < i) \ while (j < N && B[j] == a[j]) \ j += 1; \ else \ while (j < i && B[j] == a[j]) \ j += 1; \ F0[k] = j; j = -2; a = A + hgh; i = M - hgh; for (k = hgh; k > del; k--) { ap = j+1; am = F2[k-1]; FS_MOVE(-1,4) a -= 1; i += 1; } j = -2; a = A + low; i = M - low; for (k = low; k < del; k++) { ap = F2[k+1]+1; am = j; FS_MOVE(2,1) a += 1; i -= 1; } ap = F0[del+1]+1; am = j; FS_MOVE(2,4) #ifdef DEBUG_AWAVE print_awave(F0,low,hgh); print_awave(HF,low,hgh); #endif if (F0[del] >= N) break; } } { int k, h, m, e, c; int ap = (wave->Aabs-A)-1; int bp = (B-wave->Babs)+1; PHF[0][0] = 3; c = N; k = del; e = PHF[D][k]; PHF[D][k] = 3; if (mode == UPPERMOST) while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h < k) // => e = -1 or 2, UPPERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] <= c) c = PVF[D][h]-1; while (c >= m && a[c] == B[c]) c -= 1; if (e == -1) // => edge is 2, others are 1, and 0 { if (c <= PVF[D+2][k+1]) { e = 4; h = k+1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c+1; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c <= PVF[m][k+1]) { if (k == del) e = 4; else e = 1; h = k+1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c+1; } } m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } else if (mode == LOWERMOST) while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h > k) // => e = 1 or 4, LOWERMOST { 
char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] < c) c = PVF[D][h]; while (c >= m && a[c] == B[c]) c -= 1; if (e == 1) // => edge is 2, others are 1, and 0 { if (c < PVF[D+2][k-1]) { e = 2; h = k-1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c--; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c < PVF[m][k-1]) { if (k == del) e = 2; else e = -1; h = k-1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c--; } } m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } else // mode == GREEDIEST while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } k = D = 0; e = PHF[D][k]; while (e != 3) { h = k-e; c = PVF[D][k]; if (e > 1) h += 3; else if (e == 0) D += 1; else D += 2; #ifdef DEBUG_SCRIPT if (h > k) printf(" D %d(%d)\n",(c-k)-(ap-1),c+bp); else if (h < k) printf(" I %d(%d)\n",c+(bp-1),(c+k)-ap); else printf(" %d S %d\n",(c+k)-(ap+1),c+(bp-1)); #endif if (h > k) *wave->Stop++ = bp+c; else if (h < k) *wave->Stop++ = ap-(c+k); k = h; e = PHF[D][h]; } } return (D + abs(del)); } static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode, int dmax) { int **PVF = wave->PVF; int **PHF = wave->PHF; int D; int del = M-N; { int *F0, *F1, *F2; int *HF; int low, hgh; int posl, posh; #ifdef DEBUG_ALIGN printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); printf("%*s A = ",depth,""); for (D = 0; D < M; D++) printf("%c",ToA[(int) A[D]]); printf("\n"); printf("%*s B = ",depth,""); for (D = 0; D < N; D++) printf("%c",ToA[(int) B[D]]); printf("\n"); #endif if (del >= 0) { low = 0; hgh = del; } else { low = del; hgh = 0; } posl = -dmax; posh = dmax; if (wave->Aabs == wave->Babs) { if (B == A) { EPRINTF(EPLACE,"%s: self comparison starts on diagonal 0 (Compute_Trace)\n",Prog_Name); EXIT(1); } else if (B < A) { if ((B-A)+1 > 
posl) posl = (B-A)+1; } else { if ((B-A)-1 < posh) posh = (B-A)-1; } } F1 = PVF[-2]; F0 = PVF[-1]; for (D = low-1; D <= hgh+1; D++) F1[D] = F0[D] = -2; F0[0] = -1; low += 1; hgh -= 1; for (D = 0; 1; D += 1) { int k, i, j; int am, ac, ap; char *a; if (D > dmax) { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Align); EXIT(-1); } F2 = F1; F1 = F0; F0 = PVF[D]; HF = PHF[D]; if ((D & 0x1) == 0) { if (low > posl) low -= 1; if (hgh < posh) hgh += 1; } F0[hgh+1] = F0[low-1] = -2; j = -2; a = A + hgh; i = M - hgh; for (k = hgh; k > del; k--) { ap = j+1; am = F2[k-1]; FS_MOVE(-1,4) a -= 1; i += 1; } j = -2; a = A + low; i = M - low; for (k = low; k < del; k++) { ap = F2[k+1]+1; am = j; FS_MOVE(2,1) a += 1; i -= 1; } ap = F0[del+1]+1; am = j; FS_MOVE(2,4) #ifdef DEBUG_AWAVE print_awave(F0,low,hgh); print_awave(HF,low,hgh); #endif if (F0[del] >= N) break; } } { int k, h, m, e, c; int d, f; d = D + abs(del); c = N; k = del; if (mode == UPPERMOST) for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h < k) // => e = -1 or 2, UPPERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] <= c) c = PVF[D][h]-1; while (c >= m && a[c] == B[c]) c -= 1; if (e == -1) // => edge is 2, others are 1, and 0 { if (c <= PVF[D+2][k+1]) { e = 4; h = k+1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c+1; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c <= PVF[m][k+1]) { if (k == del) e = 4; else e = 1; h = k+1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c+1; } } k = h; } else if (mode == LOWERMOST) for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h > k) // => e = 1 or 4, LOWERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] < c) c = PVF[D][h]; while (c >= m && a[c] == B[c]) c -= 1; if (e == 1) // => edge 
is 2, others are 1, and 0 { if (c < PVF[D+2][k-1]) { e = 2; h = k-1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c--; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c < PVF[m][k-1]) { if (k == del) e = 2; else e = -1; h = k-1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c--; } } k = h; } else // mode == GREEDIEST for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; k = h; } wave->midb = (B-wave->Babs) + PVF[D][k]; wave->mida = (A-wave->Aabs) + k + PVF[D][k]; } return (0); } /****************************************************************************************\ * * * COMPUTE_TRACE FLAVORS * * * \****************************************************************************************/ static char *TP_Error = "Trace point out of bounds (Compute_Trace), source DB likely incorrect"; int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; int alen, blen; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs, dmax; alen = align->alen; blen = align->blen; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int nmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = 0; dmax = 0; for (d = 1; d < tlen; d += 2) { if (points[d-1] > dmax) dmax = points[d-1]; if (points[d] > nmax) nmax = points[d]; } if (tlen <= 1) nmax = N; s = (dmax+3)*2*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + 
(dmax+3); s = trace_spacing+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = (int *) (work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; diffs = 0; ab = path->abpos; ae = (ab/trace_spacing)*trace_spacing; bb = path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; if (ae > alen || be > blen) { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); EXIT(1); } d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; } ae = path->aepos; be = path->bepos; if (ae > alen || be > blen) { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); EXIT(1); } d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; int alen, blen; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs, dmax; alen = align->alen; blen = align->blen; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int nmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = 0; dmax = 0; for (d = 1; d < tlen; d += 2) { if (points[d-1] > dmax) dmax = points[d-1]; if (points[d] > nmax) nmax = points[d]; } if (tlen <= 1) nmax = N; s = (dmax+3)*4*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + 
(dmax+3); s = trace_spacing+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = ((int *) work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; int as, bs; int af, bf; diffs = 0; ab = as = af = path->abpos; ae = (ab/trace_spacing)*trace_spacing; bb = bs = bf = path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; if (ae > alen || be > blen) { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); EXIT(1); } if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax)) EXIT(1); af = wave.mida; bf = wave.midb; d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; as = af; bs = bf; } ae = path->aepos; be = path->bepos; if (ae > alen || be > blen) { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); EXIT(1); } if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax)) EXIT(1); af = wave.mida; bf = wave.midb; d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; as = af; bs = bf; d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; int alen, blen; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs, dmax; alen = align->alen; blen = align->blen; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int mmax, nmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); 
nmax = mmax = 0; for (d = 0; d < tlen; d += 2) { if (points[d] > mmax) mmax = points[d]; if (points[d+1] > nmax) nmax = points[d+1]; } if (tlen <= 1) { mmax = M; nmax = N; } if (mmax > nmax) dmax = nmax; else dmax = mmax; s = (dmax+3)*2*((mmax+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = mmax+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = (int *) (work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; diffs = 0; ab = path->abpos; bb = path->bbpos; for (i = 0; i < tlen; i += 2) { ae = ab + points[i]; be = bb + points[i+1]; if (ae > alen || be > blen) { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); EXIT(1); } d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; } } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } DASCRUBBER-1.1/align.h000066400000000000000000000473661327574206400142640ustar00rootroot00000000000000/******************************************************************************************* * * Local alignment module. Routines for finding local alignments given a seed position, * representing such an l.a. with its interval and a set of pass-thru points, so that * a detailed alignment can be efficiently computed on demand. * * All routines work on a numeric representation of DNA sequences, i.e. 0 for A, 1 for C, * 2 for G, and 3 for T. 
*
 *  Author:  Gene Myers
 *  Date  :  July 2013
 *
 ********************************************************************************************/

#ifndef _A_MODULE

#define _A_MODULE

#include "DB.h"

#define TRACE_XOVR 125   //  If the trace spacing is not more than this value, then can
                         //    and do compress traces pts to 8-bit unsigned ints

/*** INTERACTIVE vs BATCH version

     The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or
       batch version of the routines in this library is compiled.  In batch mode, routines
       print an error message and exit.  In interactive mode, the routines place the error
       message in EPLACE (also defined in DB.h) and return an error value, typically NULL
       if the routine returns a pointer, and an unusual integer value if the routine returns
       an integer.
     Below when an error return is described, one should understand that this value is
       returned only if the routine was compiled in INTERACTIVE mode.

***/

/*** PATH ABSTRACTION:

     Coordinates are *between* characters where 0 is the tick just before the first char,
     1 is the tick between the first and second character, and so on.  Our data structure
     is called a Path, referring to its conceptualization in an edit graph.

     A local alignment is specified by the point '(abpos,bbpos)' at which its path in
     the underlying edit graph starts, and the point '(aepos,bepos)' at which it ends.
     In other words A[abpos+1..aepos] is aligned to B[bbpos+1..bepos] (assuming X[1] is
     the *first* character of X).

     There are 'diffs' differences in an optimal local alignment between the beginning and
     end points of the alignment (if computed by Compute_Trace), or nearly so (if computed
     by Local_Alignment).

     Optionally, a Path can have additional information about the exact nature of the
     aligned substrings if the field 'trace' is not NULL.  Trace points to either an
     array of integers (if computed by a Compute_Trace routine), or an array of unsigned
     short integers (if computed by Local_Alignment).
     If computed by Local_Alignment 'trace' points at a list of 'tlen' (always even) short
     values:

            d_0, b_0, d_1, b_1, ... d_n-1, b_n-1, d_n, b_n

     to be interpreted as follows.  The alignment from (abpos,bbpos) to (aepos,bepos)
     passes through the n trace points for i in [1,n]:

            (a_i,b_i) where a_i = floor(abpos/TS)*TS + i*TS
                        and b_i = bbpos + (b_0 + b_1 + ... + b_(i-1))

     where also let a_0,b_0 = abpos,bbpos and a_(n+1),b_(n+1) = aepos,bepos.  That is, the
     interior (i.e. i != 0 and i != n+1) trace points pass through every TS'th position of
     the aread where TS is the "trace spacing" employed when finding the alignment (see
     New_Align_Spec).  Typically TS is 100.  Then d_i is the number of differences in the
     portion of the alignment between (a_i,b_i) and (a_(i+1),b_(i+1)).  These trace points
     allow the Compute_Trace routines to efficiently compute the exact alignment between
     the two reads by efficiently computing exact alignments between consecutive pairs of
     trace points.  Moreover, the diff values give one an idea of the quality of the
     alignment along every segment of TS symbols of the aread.

     If computed by a Compute_Trace routine, 'trace' points at a list of 'tlen' integers
     < i1, i2, ... in > that encodes an exact alignment as follows.  A negative number j
     indicates that a dash should be placed before A[-j] and a positive number k indicates
     that a dash should be placed before B[k], where A and B are the two sequences of the
     overlap.  The indels occur in the trace in the order in which they occur along the
     alignment.  For a good example of how to "decode" a trace into an alignment, see the
     code for the routine Print_Alignment.

***/

typedef struct
  { void *trace;
    int tlen;
    int diffs;
    int abpos, bbpos;
    int aepos, bepos;
  } Path;


/*** ALIGNMENT ABSTRACTION:

     An alignment is modeled by an Alignment record, which in addition to a *pointer* to a
     'path', gives pointers to the A and B sequences, their lengths, and indicates whether
     the B-sequence needs to be complemented ('comp' non-zero if so).
     The 'trace' pointer of the 'path' subrecord can be either NULL, a list of pass-through
     points, or an exact trace depending on what routines have been called on the record.

     One can (1) compute a trace, with Compute_Trace, either from scratch if 'path.trace' =
     NULL, or using the sequence of pass-through points in trace, (2) print an ASCII
     representation of an alignment, or (3) reverse the roles of A and B, and (4) complement
     a sequence (which is a reversible process).

     If the alignment record shows the B sequence as complemented, *** THEN IT IS THE
     RESPONSIBILITY OF THE CALLER *** to make sure that bseq points at a complement of
     the sequence before calling Compute_Trace or Print_Alignment.  Complement_Seq
     complements the sequence a of length n.  The operation does the
     complementation/reversal in place.  Calling it a second time on a given fragment
     restores it to its original state.

     With the introduction of the DAMAPPER, we need to code chains of alignments between a
     pair of sequences.  The alignments of a chain are expected to be found in order either
     on a file or in memory, where the START_FLAG marks the first alignment and the
     NEXT_FLAG all subsequent alignments in a chain.  A chain of a single LA is marked with
     the START_FLAG.  The BEST_FLAG marks one of the best chains for a pair of sequences.
     The convention is that either every record has a START- or NEXT-flag, or none of them
     do (e.g. as produced by daligner), so one can always check the flags of the first
     alignment to see whether or not the chain concept applies to a given collection.
***/

#define COMP_FLAG 0x1
#define ACOMP_FLAG 0x2   //  A-sequence is complemented, not B !  Only Local_Alignment notices

#define COMP(x) ((x) & COMP_FLAG)
#define ACOMP(x) ((x) & ACOMP_FLAG)

#define START_FLAG 0x4   //  LA is the first of a chain of 1 or more la's
#define NEXT_FLAG 0x8   //  LA is the next segment of a chain.
#define BEST_FLAG 0x10 // This is the start of the best chain #define CHAIN_START(x) ((x) & START_FLAG) #define CHAIN_NEXT(x) ((x) & NEXT_FLAG) #define BEST_CHAIN(x) ((x) & BEST_FLAG) #define ELIM_FLAG 0x20 // This LA should be ignored #define ELIM(x) ((x) & ELIM_FLAG) typedef struct { Path *path; uint32 flags; /* Pipeline status and complementation flags */ char *aseq; /* Pointer to A sequence */ char *bseq; /* Pointer to B sequence */ int alen; /* Length of A sequence */ int blen; /* Length of B sequence */ } Alignment; void Complement_Seq(char *a, int n); /* Many routines like Local_Alignment, Compute_Trace, and Print_Alignment need working storage that is more efficiently reused with each call, rather than being allocated anew with each call. Each *thread* can create a Work_Data object with New_Work_Data and this object holds and retains the working storage for routines of this module between calls to the routines. If enough memory for a Work_Data is not available then NULL is returned. Free_Work_Data frees a Work_Data object and all working storage held by it. */ typedef void Work_Data; Work_Data *New_Work_Data(); void Free_Work_Data(Work_Data *work); /* Local_Alignment seeks local alignments of a quality determined by a number of parameters. These are coded in an Align_Spec object that can be created with New_Align_Spec and freed with Free_Align_Spec when no longer needed. There are 4 essential parameters: ave_corr: the average correlation (1 - 2*error_rate) for the sought alignments. For Pacbio data we set this to .70 assuming an average of 15% error in each read. trace_space: the spacing interval for keeping trace points and segment differences (see description of 'trace' for Paths above) freq[4]: a 4-element vector where afreq[0] = frequency of A, f(A), freq[1] = f(C), freq[2] = f(G), and freq[3] = f(T). This vector is part of the header of every DAZZ database (see db.h). 
reach: a boolean, if set alignment extend to the boundary when reasonable, otherwise the terminate only at suffix-positive points. If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e. overlap), then the last/first 30 columns of the alignment are guaranteed to be suffix/prefix positive at correlation ave_corr * g(freq) where g is an empirically measured function that increases from 1 as the entropy of freq decreases. If memory is unavailable or the freq distribution is too skewed then NULL is returned. You can get back the original parameters used to create an Align_Spec with the simple utility functions below. */ typedef void Align_Spec; Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq, int reach); void Free_Align_Spec(Align_Spec *spec); int Trace_Spacing (Align_Spec *spec); double Average_Correlation(Align_Spec *spec); float *Base_Frequencies (Align_Spec *spec); int Overlap_If_Possible(Align_Spec *spec); /* Local_Alignment finds the longest significant local alignment between the sequences in 'align' subject to: (a) the alignment criterion given by the Align_Spec 'spec', (b) it passes through one of the points (anti+k)/2,(anti-k)/2 for k in [low,hgh] within the underlying dynamic programming matrix (i.e. the points on diagonals low to hgh on anti-diagonal anti or anti-1 (depending on whether the diagonal is odd or even)), (c) if lbord >= 0, then the alignment is always above diagonal low-lbord, and (d) if hbord >= 0, then the alignment is always below diagonal hgh+hbord. The path record of 'align' has its 'trace' filled from the point of view of an overlap between the aread and the bread. In addition a Path record from the point of view of the bread versus the aread is returned by the function, with this Path's 'trace' filled in appropriately. 
The space for the returned path and the two 'trace's are in the working storage supplied by the Work_Data packet and this space is reused with each call, so if one wants to retain the bread-path and the two trace point sequences, then they must be copied to user-allocated storage before calling the routine again. NULL is returned in the event of an error. Find_Extension is a variant of Local_Alignment that simply finds a local alignment that either ends (if prefix is non-zero) or begins (if prefix is zero) at the point (anti+diag)/2,(anti-diag)/2). All other parameters are as before. It returns a non-zero value only when INTERACTIVE is on and it cannot allocate the memory it needs. Only the path and trace with respect to the aread is returned. This routine is experimental and may not persist in later versions of the code. */ Path *Local_Alignment(Alignment *align, Work_Data *work, Align_Spec *spec, int low, int hgh, int anti, int lbord, int hbord); int Find_Extension(Alignment *align, Work_Data *work, Align_Spec *spec, // experimental !! int diag, int anti, int lbord, int hbord, int prefix); /* Given a legitimate Alignment object and associated trace point vector in 'align->path.trace', Compute_Trace_X, computes an exact trace for the alignment and resets 'align->path.trace' to point at an integer array within the storage of the Work_Data packet encoding an exact optimal trace from the start to end points. If the trace is needed beyond the next call to a routine that sets it, then it should be copied to an array allocated and managed by the caller. Compute_Trace_PTS computes a trace by computing the trace between successive trace points. It is much, much faster than Compute_Alignment below but at the tradeoff of not necessarily being optimal as pass-through points are not all perfect. Compute_Trace_MID computes a trace by computing the trace between the mid-points of alignments between two adjacent pairs of trace points. 
It is generally twice as slow as Compute_Trace_PTS, but it produces nearer optimal alignments. Both these routines return 1 if an error occurred and 0 otherwise. */ #define LOWERMOST -1 // Possible modes for "mode" parameter below) #define GREEDIEST 0 #define UPPERMOST 1 int Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing, int mode); int Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing, int mode); /* Compute_Trace_IRR (IRR for IRRegular) computes a trace for the given alignment where it assumes the spacing between trace points between both the A and B read varies, and futher assumes that the A-spacing is given in the short integers normally occupied by the differences in the alignment between the trace points. This routine is experimental and may not persist in later versions of the code. */ int Compute_Trace_IRR(Alignment *align, Work_Data *work, int mode); // experimental !! /* Compute Alignment determines the best alignment between the substrings specified by align. If the task is DIFF_ONLY, then only the difference of this alignment is computed and placed in the "diffs" field of align's path. If the task is PLUS_TRACE or DIFF_TRACE, then 'path.trace' is set to point at an integer array within the storage of the Work_Data packet encoding a trace point sequence for an optimal alignment, whereas if the task is PLUS_ALIGN or DIFF_ALIGN, then it points to an optimal trace of an optimatl alignment. The PLUS tasks can only be called if the immmediately proceeding call was a DIFF_ONLY on the same alignment record and sequences, in which case a little efficiency is gained by avoiding the repetition of the top level search for an optimal mid-point. 
*/ #define PLUS_ALIGN 0 #define PLUS_TRACE 1 #define DIFF_ONLY 2 #define DIFF_ALIGN 3 #define DIFF_TRACE 4 int Compute_Alignment(Alignment *align, Work_Data *work, int task, int trace_spacing); /* Alignment_Cartoon prints an ASCII representation of the overlap relationhip between the two reads of 'align' to the given 'file' indented by 'indent' space. Coord controls the display width of numbers, it must be not less than the width of any number to be displayed. If the alignment trace is an exact trace, then one can ask Print_Alignment to print an ASCII representation of the alignment 'align' to the file 'file'. Indent the display by "indent" spaces and put "width" columns per line in the display. Show "border" characters of sequence on each side of the aligned region. If upper is non-zero then display bases in upper case. If coord is greater than 0, then the positions of the first character in A and B in the given row is displayed with a field width given by coord's value. Print_Reference is like Print_Alignment but rather than printing exaclty "width" columns per segment, it prints "block" characters of the A sequence in each segment. This results in segments of different lengths, but is convenient when looking at two alignments involving A as segments are guaranteed to cover the same interval of A in a segment. Both Print routines return 1 if an error occurred (not enough memory), and 0 otherwise. Flip_Alignment modifies align so the roles of A and B are reversed. If full is off then the trace is ignored, otherwise the trace must be to a full alignment trace and this trace is also appropriately inverted. 
*/ void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord); int Print_Alignment(FILE *file, Alignment *align, Work_Data *work, int indent, int width, int border, int upper, int coord); int Print_Reference(FILE *file, Alignment *align, Work_Data *work, int indent, int block, int border, int upper, int coord); void Flip_Alignment(Alignment *align, int full); /*** OVERLAP ABSTRACTION: Externally, between modules an Alignment is modeled by an "Overlap" record, which (a) replaces the pointers to the two sequences with their ID's in the DAZZ data bases, (b) does not contain the length of the 2 sequences (must fetch from DB), and (c) contains its path as a subrecord rather than as a pointer (indeed, typically the corresponding Alignment record points at the Overlap's path sub-record). The trace pointer is always to a sequence of trace points and can be either compressed (uint8) or uncompressed (uint16). One can read and write binary records of an "Overlap". ***/ typedef struct { Path path; /* Path: begin- and end-point of alignment + diffs */ uint32 flags; /* Pipeline status and complementation flags */ int aread; /* Id # of A sequence */ int bread; /* Id # of B sequence */ } Overlap; /* Read_Overlap reads the next Overlap record from stream 'input', not including the trace (if any), and without modifying 'ovl's trace pointer. Read_Trace reads the ensuing trace into the memory pointed at by the trace field of 'ovl'. It is assumed to be big enough to accommodate the trace where each value take 'tbytes' bytes (1 if uint8 or 2 if uint16). Write_Overlap write 'ovl' to stream 'output' followed by its trace vector (if any) that occupies 'tbytes' bytes per value. It returns non-zero if there was an error writing. Print_Overlap prints an ASCII version of the contents of 'ovl' to stream 'output' where the trace occupes 'tbytes' per value and the print out is indented from the left margin by 'indent' spaces. 
Compress_TraceTo8 converts a trace fo 16-bit values to 8-bit values in place, and Decompress_TraceTo16 does the reverse conversion. If check is set in a call to Compress then it checks whether the values fit in 8-bits, and if not returns a non-zero result in interactive mode, or exits with an error message in batch mode. Check_Trace_Points checks that the number of trace points is correct and that the sum of the b-read displacements equals the b-read alignment interval, assuming the trace spacing is 'tspace'. It reports an error message if there is a problem and 'verbose' is non-zero. The 'ovl' came from the file names 'fname'. */ int Read_Overlap(FILE *input, Overlap *ovl); int Read_Trace(FILE *innput, Overlap *ovl, int tbytes); int Write_Overlap(FILE *output, Overlap *ovl, int tbytes); void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent); int Compress_TraceTo8(Overlap *ovl, int check); void Decompress_TraceTo16(Overlap *ovl); int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname); #endif // _A_MODULE