pax_global_header00006660000000000000000000000064126337367510014527gustar00rootroot0000000000000052 comment=29234506d87e2d3e207034340777550e81cf55a6 DALIGNER-master/000077500000000000000000000000001263373675100135515ustar00rootroot00000000000000DALIGNER-master/DB.c000066400000000000000000001277161263373675100142200ustar00rootroot00000000000000/******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif /******************************************************************************************* * * GENERAL UTILITIES * ********************************************************************************************/ char *Prog_Name; #ifdef INTERACTIVE char Ebuffer[1000]; #endif void *Malloc(int64 size, char *mesg) { void *p; if ((p = malloc(size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } void *Realloc(void *p, int64 size, char *mesg) { if ((p = realloc(p,size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } char *Strdup(char *name, char *mesg) { char *s; if (name == NULL) return (NULL); if ((s = strdup(name)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (s); } FILE *Fopen(char *name, char *mode) { FILE *f; if (name == NULL || mode == NULL) return (NULL); if ((f = fopen(name,mode)) == NULL) EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); return (f); } char *PathTo(char *name) { char *path, *find; if (name == NULL) return (NULL); if ((find = rindex(name,'/')) != NULL) { *find = '\0'; path = Strdup(name,"Extracting path from"); *find = '/'; } else path = Strdup(".","Allocating default path"); return (path); } char *Root(char *name, char *suffix) { char *path, *find, *dot; int epos; if (name == NULL) return (NULL); find = rindex(name,'/'); if (find == NULL) find = name; else find += 1; if (suffix == NULL) { dot = strchr(find,'.'); if (dot != NULL) *dot = '\0'; path = Strdup(find,"Extracting root from"); if (dot != NULL) *dot = '.'; } else { epos = strlen(find); epos -= strlen(suffix); if (epos > 0 && strcasecmp(find+epos,suffix) == 0) { find[epos] = '\0'; path = Strdup(find,"Extracting root 
from"); find[epos] = suffix[0]; } else path = Strdup(find,"Allocating root"); } return (path); } char *Catenate(char *path, char *sep, char *root, char *suffix) { static char *cat = NULL; static int max = -1; int len; if (path == NULL || root == NULL || sep == NULL || suffix == NULL) return (NULL); len = strlen(path); len += strlen(sep); len += strlen(root); len += strlen(suffix); if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); return (NULL); } } sprintf(cat,"%s%s%s%s",path,sep,root,suffix); return (cat); } char *Numbered_Suffix(char *left, int num, char *right) { static char *suffix = NULL; static int max = -1; int len; if (left == NULL || right == NULL) return (NULL); len = strlen(left); len += strlen(right) + 40; if (len > max) { max = ((int) (1.2*len)) + 100; if ((suffix = (char *) realloc(suffix,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); return (NULL); } } sprintf(suffix,"%s%d%s",left,num,right); return (suffix); } #define COMMA ',' // Print big integers with commas/periods for better readability void Print_Number(int64 num, int width, FILE *out) { if (width == 0) { if (num < 1000ll) fprintf(out,"%lld",num); else if (num < 1000000ll) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else if (num < 1000000000ll) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll, COMMA,(num%1000000000ll)/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); } else { if (num < 1000ll) fprintf(out,"%*lld",width,num); else if (num < 1000000ll) { if (width <= 4) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll); } else if (num < 1000000000ll) { if (width <= 8) 
fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); } else { if (width <= 12) fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); } } } // Return the number of digits, base 10, of num int Number_Digits(int64 num) { int digit; digit = 0; while (num >= 1) { num /= 10; digit += 1; } return (digit); } /******************************************************************************************* * * READ COMPRESSION/DECOMPRESSION UTILITIES * ********************************************************************************************/ // Compress read into 2-bits per base (from [0-3] per byte representation void Compress_Read(int len, char *s) { int i; char c, d; char *s0, *s1, *s2, *s3; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; c = s1[len]; d = s2[len]; s0[len] = s1[len] = s2[len] = 0; for (i = 0; i < len; i += 4) *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]); s1[len] = c; s2[len] = d; } // Uncompress read form 2-bits per base into [0-3] per byte representation void Uncompress_Read(int len, char *s) { int i, tlen, byte; char *s0, *s1, *s2, *s3; char *t; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; tlen = (len-1)/4; t = s+tlen; for (i = tlen*4; i >= 0; i -= 4) { byte = *t--; s0[i] = (char) ((byte >> 6) & 0x3); s1[i] = (char) ((byte >> 4) & 0x3); s2[i] = (char) ((byte >> 2) & 0x3); s3[i] = (char) (byte & 0x3); } s[len] = 4; } // Convert read in [0-3] representation to ascii representation (end with '\n') void Lower_Read(char *s) { static char letter[4] = { 'a', 'c', 'g', 't' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } void 
Upper_Read(char *s) { static char letter[4] = { 'A', 'C', 'G', 'T' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } // Convert read in ascii representation to [0-3] representation (end with 4) void Number_Read(char *s) { static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; for ( ; *s != '\0'; s++) *s = number[(int) *s]; *s = 4; } /******************************************************************************************* * * DB OPEN, TRIM & CLOSE ROUTINES * ********************************************************************************************/ // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. 
// Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char* path, HITS_DB *db) { HITS_DB dbcopy; char *root, *pwd, *bptr, *fptr, *cat; int nreads; FILE *index, *dbvis; int status, plen, isdam; int part, cutoff, all; int ufirst, tfirst, ulast, tlast; status = -1; dbcopy = *db; plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); pwd = PathTo(path); bptr = rindex(root,'.'); if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-') { part = strtol(bptr+1,&fptr,10); if (*fptr != '\0' || part == 0) part = 0; else *bptr = '\0'; } else part = 0; isdam = 0; cat = Catenate(pwd,"/",root,".db"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { cat = Catenate(pwd,"/",root,".dam"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path); goto error; } isdam = 1; } if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) goto error1; if (fread(db,sizeof(HITS_DB),1,index) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); goto error2; } { int p, nblocks, nfiles; int64 size; char fname[MAX_NAME], prolog[MAX_NAME]; nblocks = 0; if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } for (p = 0; p < nfiles; p++) if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) if (part == 0) { cutoff = 0; all = 1; } else { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", Prog_Name,root); goto error2; } else { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is 
junk\n",Prog_Name,root); goto error2; } if (part > nblocks) { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); goto error2; } } if (part > 0) { for (p = 1; p <= part; p++) if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } } else { ufirst = tfirst = 0; ulast = db->ureads; tlast = db->treads; } } db->trimmed = 0; db->tracks = NULL; db->part = part; db->cutoff = cutoff; db->all = all; db->ufirst = ufirst; db->tfirst = tfirst; nreads = ulast-ufirst; if (part <= 0) { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); if (db->reads == NULL) goto error2; db->reads += 1; if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(db->reads); goto error2; } } else { HITS_READ *reads; int i, r, maxlen; int64 totlen; reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); if (reads == NULL) goto error2; reads += 1; fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR); if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(reads); goto error2; } totlen = 0; maxlen = 0; for (i = 0; i < nreads; i++) { r = reads[i].rlen; totlen += r; if (r > maxlen) maxlen = r; } db->maxlen = maxlen; db->totlen = totlen; db->reads = reads; } ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part ((int *) (db->reads))[-2] = tlast - tfirst; db->nreads = nreads; db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); if (db->path == NULL) goto error2; db->bases = NULL; db->loaded = 0; status = isdam; error2: fclose(index); error1: fclose(dbvis); error: if (bptr != NULL) 
*bptr = '.'; free(pwd); free(root); if (status < 0) *db = dbcopy; return (status); } // Trim the DB or part thereof and all loaded tracks according to the cuttof and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(HITS_DB *db) { int i, j, r; int allflag, cutoff; int64 totlen; int maxlen, nreads; HITS_TRACK *record; HITS_READ *reads; if (db->trimmed) return; if (db->cutoff <= 0 && db->all) return; cutoff = db->cutoff; if (db->all) allflag = 0; else allflag = DB_BEST; reads = db->reads; nreads = db->nreads; for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,".@qvs") == 0) { uint16 *table = ((HITS_QV *) record)->table; j = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) table[j++] = table[i]; } else { int *anno4, size; int64 *anno8; char *anno, *data; size = record->size; data = (char *) record->data; if (data == NULL) { anno = (char *) record->anno; j = 0; for (i = r = 0; i < db->nreads; i++, r += size) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (record->anno); j = anno4[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (record->anno); j = anno8[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno8[j],NULL); } record->anno = 
Realloc(record->anno,record->size*(j+1),NULL); } totlen = maxlen = 0; for (j = i = 0; i < nreads; i++) { r = reads[i].rlen; if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff) { totlen += r; if (r > maxlen) maxlen = r; reads[j++] = reads[i]; } } db->totlen = totlen; db->maxlen = maxlen; db->nreads = j; db->trimmed = 1; if (j < nreads) { db->reads = Realloc(reads-1,sizeof(HITS_READ)*(j+2),NULL); db->reads += 1; } } // The DB has already been trimmed, but a track over the untrimmed DB needs to be loaded. // Trim the track by rereading the untrimmed DB index from the file system. static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart) { int i, j, r; int allflag, cutoff; int ureads; char *root; HITS_READ read; FILE *indx; if (!db->trimmed) return (0); if (db->cutoff <= 0 && db->all) return (0); cutoff = db->cutoff; if (db->all) allflag = 0; else allflag = DB_BEST; root = rindex(db->path,'/') + 2; indx = Fopen(Catenate(db->path,"","",".idx"),"r"); fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*db->ufirst,SEEK_SET); if (ispart) ureads = ((int *) (db->reads))[-1]; else ureads = db->ureads; if (strcmp(track->name,".@qvs") == 0) { EPRINTF(EPLACE,"%s: Cannot load QV track after trimming\n",Prog_Name); fclose(indx); EXIT(1); } { int *anno4, size; int64 *anno8; char *anno, *data; size = track->size; data = (char *) track->data; if (data == NULL) { anno = (char *) track->anno; j = r = 0; for (i = r = 0; i < ureads; i++, r += size) { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } r += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (track->anno); j = anno4[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); 
fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (track->anno); j = anno8[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno8[j],NULL); } track->anno = Realloc(track->anno,track->size*(j+1),NULL); } fclose(indx); return (0); } // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). 
void Close_DB(HITS_DB *db) { HITS_TRACK *t, *p; if (db->loaded) free(((char *) (db->bases)) - 1); else if (db->bases != NULL) fclose((FILE *) db->bases); if (db->reads != NULL) free(db->reads-1); free(db->path); Close_QVs(db); for (t = db->tracks; t != NULL; t = p) { p = t->next; free(t->anno); free(t->data); free(t); } } /******************************************************************************************* * * QV LOAD & CLOSE ROUTINES * ********************************************************************************************/ HITS_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" HITS_QV *Active_QV; // Becomes invalid after closing int Load_QVs(HITS_DB *db) { FILE *quiva, *istub, *indx; char *root; uint16 *table; HITS_QV *qvtrk; QVcoding *coding, *nx; int ncodes; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) return (0); if (db->trimmed) { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); EXIT(1); } if (db->reads[db->nreads-1].coff < 0) { EPRINTF(EPLACE,"%s: The requested QVs have not been added to the DB!\n",Prog_Name); EXIT(1); } // Open .qvs, .idx, and .db files quiva = Fopen(Catenate(db->path,"","",".qvs"),"r"); if (quiva == NULL) return (-1); istub = NULL; indx = NULL; table = NULL; coding = NULL; qvtrk = NULL; root = rindex(db->path,'/') + 2; istub = Fopen(Catenate(db->path,"/",root,".db"),"r"); if (istub == NULL) goto error; { int first, last, nfiles; char prolog[MAX_NAME], fname[MAX_NAME]; int i, j; if (fscanf(istub,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (db->part > 0) { int pfirst, plast; int fbeg, fend; int n, k; FILE *indx; // Determine first how many and which files span the block (fbeg to fend) pfirst = db->ufirst; plast = pfirst + db->nreads; first = 0; for (fbeg = 0; fbeg < nfiles; fbeg++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto 
error; } if (last > pfirst) break; first = last; } for (fend = fbeg+1; fend <= nfiles; fend++) { if (last >= plast) break; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } first = last; } indx = Fopen(Catenate(db->path,"","",".idx"),"r"); ncodes = fend-fbeg; coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (indx == NULL || coding == NULL || table == NULL) { ncodes = 0; goto error; } // Carefully get the first coding scheme (its offset is most likely in a HITS_RECORD // in .idx that is *not* in memory). Get all the other coding schemes normally and // assign the tables # for each read in the block in "tables". rewind(istub); fscanf(istub,DB_NFILE,&nfiles); first = 0; for (n = 0; n < fbeg; n++) { fscanf(istub,DB_FDATA,&last,fname,prolog); first = last; } for (n = fbeg; n < fend; n++) { fscanf(istub,DB_FDATA,&last,fname,prolog); i = n-fbeg; if (first < pfirst) { HITS_READ read; fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*first,SEEK_SET); if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); ncodes = i; goto error; } fseeko(quiva,read.coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; } else { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first-pfirst].coff = ftello(quiva); } j = first-pfirst; if (j < 0) j = 0; k = last-pfirst; if (k > db->nreads) k = db->nreads; while (j < k) table[j++] = (uint16) i; first = last; } fclose(indx); indx = NULL; } else { // Load in coding scheme for each file, adjust .coff of first read in the file, and // record which table each read uses ncodes = nfiles; coding = (QVcoding *) 
Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (coding == NULL || table == NULL) goto error; first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } fseeko(quiva,db->reads[first].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first].coff = ftello(quiva); for (j = first; j < last; j++) table[j] = (uint16) i; first = last; } } // Allocate and fill in the HITS_QV record and add it to the front of the // track list qvtrk = (HITS_QV *) Malloc(sizeof(HITS_QV),"Allocating QV pseudo-track"); if (qvtrk == NULL) goto error; qvtrk->name = Strdup(".@qvs","Allocating QV pseudo-track name"); if (qvtrk->name == NULL) goto error; qvtrk->next = db->tracks; db->tracks = (HITS_TRACK *) qvtrk; qvtrk->ncodes = ncodes; qvtrk->table = table; qvtrk->coding = coding; qvtrk->quiva = quiva; } fclose(istub); return (0); error: if (qvtrk != NULL) free(qvtrk); if (table != NULL) free(table); if (coding != NULL) { int i; for (i = 0; i < ncodes; i++) Free_QVcoding(coding+i); free(coding); } if (indx != NULL) fclose(indx); if (istub != NULL) fclose(istub); fclose(quiva); EXIT(1); } // Close the QV stream, free the QV pseudo track and all associated memory void Close_QVs(HITS_DB *db) { HITS_TRACK *track; HITS_QV *qvtrk; int i; Active_DB = NULL; track = db->tracks; if (track != NULL && strcmp(track->name,".@qvs") == 0) { qvtrk = (HITS_QV *) track; for (i = 0; i < qvtrk->ncodes; i++) Free_QVcoding(qvtrk->coding+i); free(qvtrk->coding); free(qvtrk->table); fclose(qvtrk->quiva); db->tracks = track->next; free(track); } return; } /******************************************************************************************* * * TRACK LOAD & CLOSE ROUTINES * 
********************************************************************************************/

//  Return status of track:
//     1: Track is for trimmed DB
//     0: Track is for untrimmed DB
//    -1: Track is not the right size of DB either trimmed or untrimmed
//    -2: Could not find the track
//  The block-specific file <path>.<part>.<track>.anno is tried first when a block is open,
//    then <path>.<track>.anno.  When the header has a valid size field, *kind is set to
//    MASK_TRACK (size 0) or CUSTOM_TRACK (size > 0).  The file is closed on every return.

int Check_Track(HITS_DB *db, char *track, int *kind)
{ FILE       *afile;
  int         tracklen, size, ispart;
  int         ureads, treads;

  afile = NULL;
  if (db->part > 0)
    { afile  = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r");
      ispart = 1;
    }
  if (afile == NULL)
    { afile  = fopen(Catenate(db->path,".",track,".anno"),"r");
      ispart = 0;
    }
  if (afile == NULL)
    return (-2);

  //  The header is two ints: the number of reads covered and the record size

  if (fread(&tracklen,sizeof(int),1,afile) != 1)
    { fclose(afile);
      return (-1);
    }
  if (fread(&size,sizeof(int),1,afile) != 1)
    { fclose(afile);
      return (-1);
    }

  fclose(afile);

  if (size == 0)
    *kind = MASK_TRACK;
  else if (size > 0)
    *kind = CUSTOM_TRACK;
  else
    return (-1);

  //  A block-specific track is measured against the block's read counts, which Open_DB
  //    stores in front of db->reads

  if (ispart)
    { ureads = ((int *) (db->reads))[-1];
      treads = ((int *) (db->reads))[-2];
    }
  else
    { ureads = db->ureads;
      treads = db->treads;
    }

  if (tracklen == ureads)
    return (0);
  else if (tracklen == treads)
    return (1);
  else
    return (-1);
}

// If track is not already in the db's track list, then allocate all the storage for it,
//   read it in from the appropriate file, add it to the track list, and return a pointer
//   to the newly created HITS_TRACK record.  If the track does not exist or cannot be
//   opened for some reason, then NULL is returned.
HITS_TRACK *Load_Track(HITS_DB *db, char *track)
{ FILE   *afile, *dfile;
  int     tracklen, size;
  int     nreads, ispart;
  int     treads, ureads;
  void   *anno;
  void   *data;
  char   *name;
  HITS_TRACK *record;

  if (track[0] == '.')
    { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track);
      EXIT(NULL);
    }

  for (record = db->tracks; record != NULL; record = record->next)
    if (strcmp(record->name,track) == 0)
      return (record);

  //  Prefer the block-specific track file when a block is open, else the whole-DB one

  afile = NULL;
  if (db->part)
    { afile  = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r");
      ispart = 1;
    }
  if (afile == NULL)
    { afile  = fopen(Catenate(db->path,".",track,".anno"),"r");
      ispart = 0;
    }
  if (afile == NULL)
    { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track);
      return (NULL);
    }

  dfile  = NULL;
  anno   = NULL;
  data   = NULL;
  record = NULL;

  //  The .data file is optional: without it the track is just one anno record per read

  if (ispart)
    name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data");
  else
    name = Catenate(db->path,".",track,".data");
  if (name == NULL)
    goto error;
  dfile = fopen(name,"r");

  if (fread(&tracklen,sizeof(int),1,afile) != 1)
    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
      goto error;
    }
  if (fread(&size,sizeof(int),1,afile) != 1)
    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
      goto error;
    }
  if (size < 0)
    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
      goto error;
    }
  if (size == 0)
    size = 8;

  if (ispart)
    { ureads = ((int *) (db->reads))[-1];
      treads = ((int *) (db->reads))[-2];
    }
  else
    { ureads = db->ureads;
      treads = db->treads;
    }

  if (db->trimmed)
    { if (tracklen != treads && tracklen != ureads)
        { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
          goto error;
        }
      if ( ! ispart && db->part > 0)
        { if (tracklen == treads)
            fseeko(afile,size*db->tfirst,SEEK_CUR);
          else
            fseeko(afile,size*db->ufirst,SEEK_CUR);
        }
    }
  else
    { if (tracklen != ureads)
        { if (tracklen == treads)
            EPRINTF(EPLACE,"%s: Track '%s' is for a trimmed DB !\n",Prog_Name,track);
          else
            EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
          goto error;
        }
      if ( ! ispart && db->part > 0)
        fseeko(afile,size*db->ufirst,SEEK_CUR);
    }

  //  A track over the untrimmed reads that is loaded into an already trimmed DB must be
  //    read in full (one entry per untrimmed read, a count Open_DB keeps in front of
  //    db->reads) so that Late_Track_Trim can then filter it down.

  if (db->trimmed && tracklen != treads)
    nreads = ((int *) (db->reads))[-1];
  else
    nreads = db->nreads;

  anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector");
  if (anno == NULL)
    goto error;

  if (dfile != NULL)
    { int64 *anno8, off8, dlen;
      int   *anno4, off4;
      int    i;

      if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1))
        { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
          goto error;
        }

      //  Rebase the offsets so that the first one is 0 and load just the data they span

      if (size == 4)
        { anno4 = (int *) anno;
          off4  = anno4[0];
          if (off4 != 0)
            { for (i = 0; i <= nreads; i++)
                anno4[i] -= off4;
              fseeko(dfile,off4,SEEK_SET);
            }
          dlen = anno4[nreads];
          data = (void *) Malloc(dlen,"Allocating Track Data Vector");
        }
      else
        { anno8 = (int64 *) anno;
          off8  = anno8[0];
          if (off8 != 0)
            { for (i = 0; i <= nreads; i++)
                anno8[i] -= off8;
              fseeko(dfile,off8,SEEK_SET);
            }
          dlen = anno8[nreads];
          data = (void *) Malloc(dlen,"Allocating Track Data Vector");
        }
      if (data == NULL)
        goto error;
      if (dlen > 0)
        { if (fread(data,dlen,1,dfile) != 1)
            { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track);
              goto error;
            }
        }
      fclose(dfile);
      dfile = NULL;
    }
  else
    { if (fread(anno,size,nreads,afile) != (size_t) nreads)
        { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
          goto error;
        }
      data = NULL;
    }

  fclose(afile);
  afile = NULL;                      //  So that the error exit below does not close it again

  record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record");
  if (record == NULL)
    goto error;
  record->name = Strdup(track,"Allocating Track Name");
  if (record->name == NULL)
    goto error;
  record->data = data;
  record->anno = anno;
  record->size = size;

  if (db->trimmed && tracklen != treads)
    { if (Late_Track_Trim(db,record,ispart))
        goto error;
    }

  //  The QV pseudo-track must stay first on the list, so link in behind it if present

  if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0)
    { record->next     = db->tracks->next;
      db->tracks->next = record;
    }
  else
    { record->next = db->tracks;
      db->tracks   = record;
    }

  return (record);

error:
  if (record != NULL)
    { free(record->name);
      free(record);
    }
  if (data != NULL)
    free(data);
  if (anno != NULL)
    free(anno);
  if (dfile != NULL)
    fclose(dfile);
  if (afile != NULL)
    fclose(afile);
  EXIT (NULL);
}

//  Remove the track named 'track' from the track list of 'db' and free all of its storage.
//    Nothing happens if no such track is loaded.

void Close_Track(HITS_DB *db, char *track)
{ HITS_TRACK *record, *prev;

  prev = NULL;
  for (record = db->tracks; record != NULL; record = record->next)
    { if (strcmp(record->name,track) == 0)
        { free(record->anno);
          free(record->data);
          free(record->name);
          if (prev == NULL)
            db->tracks = record->next;
          else
            prev->next = record->next;
          free(record);
          return;
        }
      prev = record;
    }
  return;
}


/*******************************************************************************************
 *
 *  READ BUFFER ALLOCATION AND READ ACCESS
 *
 ********************************************************************************************/

// Allocate and return a buffer big enough for the largest read in 'db', leaving room
//   for an initial delimiter character.  The pointer returned is one byte into the
//   allocated block, so the block itself starts at the returned pointer minus 1.

char *New_Read_Buffer(HITS_DB *db)
{ char *read;

  read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer");
  if (read == NULL)
    EXIT(NULL);
  return (read+1);
}

// Load into 'read' the i'th read in 'db'.  As an upper case ASCII string if ascii is 2, as a
//   lower-case ASCII string if ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and
//   3(T) otherwise.
//
// **NB**, the byte before read will be set to a delimiter character!
//  Returns 0 on success.  On any failure a message is written with EPRINTF to EPLACE and
//    EXIT(1) is invoked (DB.h defines EXIT to either return its argument or exit the process).
//    The .bps file is opened on first use and the handle is cached in db->bases.

int Load_Read(HITS_DB *db, int i, char *read, int ascii)
{ FILE      *bases = (FILE *) db->bases;
  HITS_READ *r     = db->reads;
  int64      off;
  int        len, clen;

  if (i < 0 || i >= db->nreads)
    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
      EXIT(1);
    }
  if (bases == NULL)
    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
      if (bases == NULL)
        EXIT(1);
      db->bases = (void *) bases;
    }

  off = r[i].boff;
  len = r[i].rlen;

  //  Only seek when the stream is not already positioned at the read

  if (ftello(bases) != off && fseeko(bases,off,SEEK_SET) != 0)
    { EPRINTF(EPLACE,"%s: Failed seek of .bps file (Load_Read)\n",Prog_Name);
      EXIT(1);
    }

  clen = COMPRESSED_LEN(len);
  if (clen > 0)
    { if (fread(read,clen,1,bases) != 1)
        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name);
          EXIT(1);
        }
    }
  Uncompress_Read(len,read);

  //  Convert to the requested alphabet and set the delimiter in the byte before the read

  if (ascii == 1)
    { Lower_Read(read);
      read[-1] = '\0';
    }
  else if (ascii == 2)
    { Upper_Read(read);
      read[-1] = '\0';
    }
  else
    read[-1] = 4;
  return (0);
}

//  Load into 'read' the end-beg bases of the i'th read of 'db' that start at position beg
//    (i.e. the half-open interval [beg,end)) and return a pointer to the first of them.
//    Whole compressed bytes are fetched and expanded, so the returned pointer is
//    read + beg%4 and not necessarily read itself.  The conversion selected by ascii and
//    the delimiter written in the byte before the returned pointer are as for Load_Read.
//    On failure a message is written with EPRINTF to EPLACE and EXIT(NULL) is invoked.
//    No check is made that [beg,end) lies within the read.

char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
{ FILE      *bases = (FILE *) db->bases;
  HITS_READ *r     = db->reads;
  int64      off;
  int        len, clen;
  int        bbeg, bend;

  if (i < 0 || i >= db->nreads)
    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Subread)\n",Prog_Name);
      EXIT(NULL);
    }
  if (bases == NULL)
    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
      if (bases == NULL)
        EXIT(NULL);
      db->bases = (void *) bases;
    }

  bbeg = beg/4;            //  first compressed byte holding a requested base
  bend = (end-1)/4+1;      //  one past the last compressed byte holding a requested base

  off = r[i].boff + bbeg;
  len = end - beg;

  if (ftello(bases) != off && fseeko(bases,off,SEEK_SET) != 0)
    { EPRINTF(EPLACE,"%s: Failed seek of .bps file (Load_Subread)\n",Prog_Name);
      EXIT(NULL);
    }

  clen = bend-bbeg;
  if (clen > 0)
    { if (fread(read,clen,1,bases) != 1)
        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Subread)\n",Prog_Name);
          EXIT(NULL);
        }
    }
  Uncompress_Read(4*clen,read);

  read += beg%4;           //  skip the bases of the first byte that precede beg
  read[len] = 4;           //  terminate the numeric string right after the last requested base

  if (ascii == 1)
    { Lower_Read(read);
      read[-1] = '\0';
    }
  else if (ascii == 2)
    { Upper_Read(read);
      read[-1] = '\0';
    }
  else
    read[-1] = 4;

  return (read);
}


/*******************************************************************************************
 *
 *  QV BUFFER ALLOCATION QV READ ACCESS
 *
 ********************************************************************************************/

// Allocate and return a buffer of 5
vectors big enough for the largest read in 'db' char **New_QV_Buffer(HITS_DB *db) { char **entry; char *qvs; int i; qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer"); entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer"); if (qvs == NULL || entry == NULL) EXIT(NULL); for (i = 0; i < 5; i++) entry[i] = qvs + i*db->maxlen; return (entry); } // Load into entry the QV streams for the i'th read from db. The parameter ascii applies to // the DELTAG stream as described for Load_Read. int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) { HITS_READ *reads; FILE *quiva; int rlen; if (db != Active_DB) { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0) { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name); EXIT(1); } Active_QV = (HITS_QV *) db->tracks; Active_DB = db; } if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); EXIT(1); } reads = db->reads; quiva = Active_QV->quiva; rlen = reads[i].rlen; fseeko(quiva,reads[i].coff,SEEK_SET); if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen)) EXIT(1); if (ascii != 1) { char *deltag = entry[1]; if (ascii != 2) { char x = deltag[rlen]; deltag[rlen] = '\0'; Number_Read(deltag); deltag[rlen] = x; } else { int j; int u = 'A'-'a'; for (j = 0; j < rlen; j++) deltag[j] = (char) (deltag[j]+u); } } return (0); } /******************************************************************************************* * * BLOCK LOAD OF ALL READS (PRIMARILY FOR DALIGNER) * ********************************************************************************************/ // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. 
If ascii is // non-zero then the reads are converted to ACGT ascii, otherwise the reads are left // as numeric strings over 0(A), 1(C), 2(G), and 3(T). int Read_All_Sequences(HITS_DB *db, int ascii) { FILE *bases; int nreads = db->nreads; HITS_READ *reads = db->reads; void (*translate)(char *s); char *seq; int64 o, off; int i, len, clen; bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(1); seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); if (seq == NULL) { fclose(bases); EXIT(1); } *seq++ = 4; if (ascii == 1) translate = Lower_Read; else translate = Upper_Read; o = 0; for (i = 0; i < nreads; i++) { len = reads[i].rlen; off = reads[i].boff; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(seq+o,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name); free(seq); fclose(bases); EXIT(1); } } Uncompress_Read(len,seq+o); if (ascii) translate(seq+o); reads[i].boff = o; o += (len+1); } reads[nreads].boff = o; fclose(bases); db->bases = (void *) seq; db->loaded = 1; return (0); } int List_DB_Files(char *path, void actor(char *path, char *extension)) { int status, plen, rlen, dlen; char *root, *pwd, *name; int isdam; DIR *dirp; struct dirent *dp; status = 0; pwd = PathTo(path); plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); rlen = strlen(root); if (root == NULL || pwd == NULL) { free(pwd); free(root); EXIT(1); } if ((dirp = opendir(pwd)) == NULL) { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; goto error; } isdam = 0; while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary) { name = dp->d_name; if (strcmp(name,Catenate("","",root,".db")) == 0) break; if (strcmp(name,Catenate("","",root,".dam")) == 0) { isdam = 1; break; } if (strcasecmp(name,Catenate("","",root,".db")) == 0) { 
strncpy(root,name,rlen); break; } if (strcasecmp(name,Catenate("","",root,".dam")) == 0) { strncpy(root,name,rlen); isdam = 1; break; } } if (dp == NULL) { EPRINTF(EPLACE,"%s: Cannot find %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; closedir(dirp); goto error; } if (isdam) actor(Catenate(pwd,"/",root,".dam"),"dam"); else actor(Catenate(pwd,"/",root,".db"),"db"); rewinddir(dirp); // Report each auxiliary file while ((dp = readdir(dirp)) != NULL) { name = dp->d_name; dlen = strlen(name); #ifdef HIDE_FILES if (name[0] != '.') continue; dlen -= 1; name += 1; #endif if (dlen < rlen+1) continue; if (name[rlen] != '.') continue; if (strncmp(name,root,rlen) != 0) continue; actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); } closedir(dirp); error: free(pwd); free(root); return (status); } void Print_Read(char *s, int width) { int i; if (s[0] < 4) { for (i = 0; s[i] != 4; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%d",s[i]); } printf("\n"); } else { for (i = 0; s[i] != '\0'; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%c",s[i]); } printf("\n"); } } DALIGNER-master/DB.h000066400000000000000000000474631263373675100142250ustar00rootroot00000000000000/******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. Eventually will also * support compressed quality information. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #ifndef _HITS_DB #define _HITS_DB #include #include "QV.h" #define HIDE_FILES // Auxiliary DB files start with a . 
//   so they are "hidden"
//   Undefine if you don't want this

//  For interactive applications where it is inappropriate to simply exit with an error
//    message to standard error, define the constant INTERACTIVE.  If set, then error
//    messages are put in the global variable Ebuffer and the caller of a DB routine
//    can decide how to deal with the error.
//
//  DB, QV, or alignment routines that can encounter errors function as before in
//    non-INTERACTIVE mode by exiting after printing an error message to stderr.  In
//    INTERACTIVE mode the routines place a message at EPLACE and return an error
//    value.  For such routines that were previously void, they are now int, and
//    return 1 if an error occurred, 0 otherwise.

#undef INTERACTIVE

#ifdef INTERACTIVE

#define EPRINTF sprintf
#define EPLACE  Ebuffer
#define EXIT(x) return (x)

#else // BATCH

//  NB: in batch mode EXIT ignores its argument and always exits with status 1

#define EPRINTF fprintf
#define EPLACE  stderr
#define EXIT(x) exit (1)

#endif

typedef unsigned char      uint8;
typedef unsigned short     uint16;
typedef unsigned int       uint32;
typedef unsigned long long uint64;
typedef signed char        int8;
typedef signed short       int16;
typedef signed int         int32;
typedef signed long long   int64;
typedef float              float32;
typedef double             float64;


/*******************************************************************************************
 *
 *  COMMAND LINE INTERPRETATION MACROS
 *
 ********************************************************************************************/

extern char *Prog_Name;   //  Name of program

#ifdef INTERACTIVE

extern char Ebuffer[];

#endif

//  Report a failed read via EPRINTF and exit with status 2 (in both modes)

#define SYSTEM_ERROR \
  { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \
    exit (2); \
  }

//  The ARG_ macros below expand to bare statements and refer to variables that must be in
//    scope at the point of use: 'argv', the int's 'i' and 'k', an int array 'flags' of at
//    least 128 elements, and a char * 'eptr'.  Within the visible callers they are written
//    without a trailing semicolon.

//  Set Prog_Name to a copy of name and clear flags[0..127]

#define ARG_INIT(name) \
  Prog_Name = Strdup(name,""); \
  for (i = 0; i < 128; i++) \
    flags[i] = 0;

//  Set flags[c] for every character c of argv[i] after the leading character; exit with
//    status 1 if some c does not occur in the string 'set'

#define ARG_FLAGS(set) \
  for (k = 1; argv[i][k] != '\0'; k++) \
    { if (index(set,argv[i][k]) == NULL) \
        { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \
          exit (1); \
        } \
      flags[(int) argv[i][k]] = 1; \
    }

//  Parse argv[i]+2 as a decimal integer into var; exit with status 1 if it is not an
//    integer or is not > 0

#define ARG_POSITIVE(var,name) \
  var = strtol(argv[i]+2,&eptr,10); \
  if (*eptr != '\0' || argv[i][2] == '\0') \
    { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \
      exit (1); \
    } \
  if (var <= 0) \
    { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \
      exit (1); \
    }

//  As ARG_POSITIVE, but 0 is accepted

#define ARG_NON_NEGATIVE(var,name) \
  var = strtol(argv[i]+2,&eptr,10); \
  if (*eptr != '\0' || argv[i][2] == '\0') \
    { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \
      exit (1); \
    } \
  if (var < 0) \
    { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \
      exit (1); \
    }

//  Parse argv[i]+2 as a real number into var; exit with status 1 if it is not one

#define ARG_REAL(var) \
  var = strtod(argv[i]+2,&eptr); \
  if (*eptr != '\0' || argv[i][2] == '\0') \
    { fprintf(stderr,"%s: -%c argument is not a real number\n",Prog_Name,argv[i][1]); \
      exit (1); \
    }


/*******************************************************************************************
 *
 *  UTILITIES
 *
 ********************************************************************************************/

//  The following general utilities return NULL if any of their input pointers are NULL, or if they
//    could not perform their function (in which case they also print an error to stderr).
void *Malloc(int64 size, char *mesg);                    //  Guarded versions of malloc, realloc
void *Realloc(void *object, int64 size, char *mesg);     //  and strdup, that output "mesg" to
char *Strdup(char *string, char *mesg);                  //  stderr if out of memory

FILE *Fopen(char *path, char *mode);     // Open file path for "mode"
char *PathTo(char *path);                // Return path portion of file name "path"
char *Root(char *path, char *suffix);    // Return the root name, excluding suffix, of "path"

// Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer
// Numbered_Suffix returns concatenation of left.<num>.right in a *temporary* buffer

char *Catenate(char *path, char *sep, char *root, char *suffix);
char *Numbered_Suffix(char *left, int num, char *right);


// DB-related utilities

void Print_Number(int64 num, int width, FILE *out);   //  Print readable big integer
int  Number_Digits(int64 num);                        //  Return # of digits in printed number

#define COMPRESSED_LEN(len)  (((len)+3) >> 2)         //  # of bytes for len bases at 4 per byte

void Compress_Read(int len, char *s);   //  Compress read in-place into 2-bit form
void Uncompress_Read(int len, char *s); //  Uncompress read in-place into numeric form
void Print_Read(char *s, int width);

void Lower_Read(char *s);     //  Convert read from numbers to lowercase letters (0-3 to acgt)
void Upper_Read(char *s);     //  Convert read from numbers to uppercase letters (0-3 to ACGT)
void Number_Read(char *s);    //  Convert read from letters to numbers


/*******************************************************************************************
 *
 *  DB IN-CORE DATA STRUCTURES
 *
 ********************************************************************************************/

#define DB_QV   0x03ff   //  Mask for 3-digit quality value
#define DB_CSS  0x0400   //  This is the second or later of a group of reads from a given insert
#define DB_BEST 0x0800   //  This is the longest read of a given insert (may be the only 1)

typedef struct
  { int     origin; //  Well #
    int     rlen;   //  Length of the sequence (Last pulse = fpulse + rlen)
    int     fpulse; //  First pulse
    int64   boff;   //  Offset (in bytes) of compressed read in 'bases' file, or offset of
                    //    uncompressed bases in memory block
    int64   coff;   //  Offset (in bytes) of compressed quiva streams in 'quiva' file
    int     flags;  //  QV of read + flags above
  } HITS_READ;

//  A track can be of 3 types:
//    data == NULL: there are nreads 'anno' records of size 'size'.
//    data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1])
//                                    contains the variable length data
//    data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1])
//                                    contains the variable length data

typedef struct _track
  { struct _track *next;  //  Link to next track
    char          *name;  //  Symbolic name of track
    int            size;  //  Size in bytes of anno records
    void          *anno;  //  over [0,nreads]: read i annotation: int, int64, or 'size' records
    void          *data;  //     data[anno[i] .. anno[i+1]-1] is data if data != NULL
  } HITS_TRACK;

//  The information for accessing QV streams is in a HITS_QV record that is a "pseudo-track"
//    named ".@qvs" and is always the first track record in the list (if present).  Since normal
//    track names cannot begin with a . (this is enforced), this pseudo-track is never confused
//    with a normal track.
//  The first two fields mirror those of HITS_TRACK; the code casts a HITS_TRACK pointer
//    to HITS_QV * after checking that its name is ".@qvs".

typedef struct
  { struct _track *next;
    char          *name;
    int            ncodes;  //  # of coding tables
    QVcoding      *coding;  //  array [0..ncodes-1] of coding schemes (see QV.h)
    uint16        *table;   //  for i in [0,db->nreads-1]: read i should be decompressed with
                            //    scheme coding[table[i]]
    FILE          *quiva;   //  the open file pointer to the .qvs file
  } HITS_QV;

//  The DB record holds all information about the current state of an active DB including an
//    array of HITS_READS, one per read, and a linked list of HITS_TRACKs the first of which
//    is always a HITS_QV pseudo-track (if the QVs have been loaded).
typedef struct
  { int         ureads;     //  Total number of reads in untrimmed DB
    int         treads;     //  Total number of reads in trimmed DB
    int         cutoff;     //  Minimum read length in block (-1 if not yet set)
    int         all;        //  Consider multiple reads from a given well
    float       freq[4];    //  frequency of A, C, G, T, respectively

    //  Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed)

    int         maxlen;     //  length of maximum read (initially over all DB)
    int64       totlen;     //  total # of bases (initially over all DB)

    int         nreads;     //  # of reads in actively loaded portion of DB
    int         trimmed;    //  DB has been trimmed by cutoff/all
    int         part;       //  DB block (if > 0), total DB (if == 0)
    int         ufirst;     //  Index of first read in block (without trimming)
    int         tfirst;     //  Index of first read in block (with trimming)

       //  In order to avoid forcing users to have to rebuild all their DBs to accommodate
       //    the addition of fields for the size of the actively loaded trimmed and untrimmed
       //    blocks, an additional read record is allocated in "reads" when a DB is loaded into
       //    memory (reads[-1]) and the two desired fields are crammed into the first two
       //    integer spaces of the record.

    char       *path;       //  Root name of DB for .bps, .qvs, and tracks
    int         loaded;     //  Are reads loaded in memory?
    void       *bases;      //  file pointer for bases file (to fetch reads from),
                            //    or memory pointer to uncompressed block of all sequences.
    HITS_READ  *reads;      //  Array [-1..nreads] of HITS_READ
    HITS_TRACK *tracks;     //  Linked list of loaded tracks
  } HITS_DB;


/*******************************************************************************************
 *
 *  DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock
 *
 ********************************************************************************************/

#define MAX_NAME 10000      //  Longest file name or fasta header line

#define DB_NFILE  "files = %9d\n"   //  number of files
#define DB_FDATA  " %9d %s %s\n"   //  last read index + 1, fasta prolog, file name
#define DB_NBLOCK "blocks = %9d\n"  //  number of blocks
#define DB_PARAMS "size = %9lld cutoff = %9d all = %1d\n"  //  block size, len cutoff, all in well
#define DB_BDATA  " %9d %9d\n"      //  First read index (untrimmed), first read index (trimmed)


/*******************************************************************************************
 *
 *  DB ROUTINES
 *
 ********************************************************************************************/

  // Suppose DB is the name of an original database.  Then there will be files .DB.idx, .DB.bps,
  //    .DB.qvs, and files .DB.<track>.anno and DB.<track>.data where <track> is a track name
  //    (not containing a . !).

  // A DAM is basically a DB except that:
  //    1. there are no QV's, instead .coff points the '\0' terminated fasta header of the read
  //          in the .hdr file of the DAM
  //    2. .origin contains the contig # of the read within a fasta entry (assembly sequences
  //          contain N-separated contigs), and .fpulse the first base of the contig in the
  //          fasta entry

  // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has
  //   a part # in it then just the part is opened.  The index array is allocated (for all or
  //   just the part) and read in.
  // Return status of routine:
  //    -1: The DB could not be opened for a reason reported by the routine to EPLACE
  //     0: Open of DB proceeded without mishap
  //     1: Open of DAM proceeded without mishap

int Open_DB(char *path, HITS_DB *db);

  // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings
  //   of the current DB partition.  Reallocate smaller memory blocks for the information kept
  //   for the retained reads.

void Trim_DB(HITS_DB *db);

  // Shut down an open 'db' by freeing all associated space, including tracks and QV structures,
  //   and any open file pointers.  The record pointed at by db however remains (the user
  //   supplied it and so should free it).

void Close_DB(HITS_DB *db);

  // If QV pseudo track is not already in db's track list, then load it and set it up.
  //   The database must not have been trimmed yet.  -1 is returned if a .qvs file is not
  //   present, and 1 is returned if an error (reported to EPLACE) occurred and INTERACTIVE
  //   is defined.  Otherwise a 0 is returned.

int Load_QVs(HITS_DB *db);

  // Remove the QV pseudo track, all space associated with it, and close the .qvs file.

void Close_QVs(HITS_DB *db);

  // Look up the file and header in the file of the indicated track.  Return:
  //     1: Track is for trimmed DB
  //     0: Track is for untrimmed DB
  //    -1: Track is not the right size of DB either trimmed or untrimmed
  //    -2: Could not find the track
  // In addition, if opened (0 or 1 returned), then kind points at an integer indicating
  //   the type of track as follows:
  //      CUSTOM  0 => a custom track
  //      MASK    1 => a mask track

#define CUSTOM_TRACK 0
#define   MASK_TRACK 1

int Check_Track(HITS_DB *db, char *track, int *kind);

  // If track is not already in the db's track list, then allocate all the storage for it,
  //   read it in from the appropriate file, add it to the track list, and return a pointer
  //   to the newly created HITS_TRACK record.  If the track does not exist or cannot be
  //   opened for some reason, then NULL is returned if INTERACTIVE is defined.  Otherwise
  //   the routine prints an error message to stderr and exits if an error occurs, and returns
  //   with NULL only if the track does not exist.

HITS_TRACK *Load_Track(HITS_DB *db, char *track);

  // If track is on the db's track list, then it is removed and all storage associated with it
  //   is freed.

void Close_Track(HITS_DB *db, char *track);

  // Allocate and return a buffer big enough for the largest read in 'db'.
  // **NB** the pointer returned, say x, is one byte past the start of the allocation, so
  // release it with free(x-1).  The extra room is there because a *prefix* and a suffix
  // '\0'(4)-byte are needed by the alignment algorithms.  If cannot allocate memory then
  // return NULL if INTERACTIVE is defined, or print error to stderr and exit otherwise.

char *New_Read_Buffer(HITS_DB *db);

  // Load into 'read' the i'th read in 'db'.  As a lower case ascii string if ascii is 1, an
  //   upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T)
  //   otherwise.  A '\0' (or 4) is prepended and appended to the string so it has a delimiter
  //   for traversals in either direction.  A non-zero value is returned if an error occurred
  //   and INTERACTIVE is defined.

int  Load_Read(HITS_DB *db, int i, char *read, int ascii);

  // Load into 'read' the subread [beg,end) of the i'th read in 'db' and return a pointer to the
  //   the start of the subinterval (not necessarily = to read !!! ).  As a lower case ascii
  //   string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string
  //   over 0(A), 1(C), 2(G), and 3(T) otherwise.  A '\0' (or 4) is prepended and appended to
  //   the string holding the substring so it has a delimiter for traversals in either direction.
  //   A NULL pointer is returned if an error occurred and INTERACTIVE is defined.

char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii);

  // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur
  //   in the database.  If cannot allocate memory then return NULL if INTERACTIVE is defined,
  //   or print error to stderr and exit otherwise.

#define DEL_QV  0   //  The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer
#define DEL_TAG 1   //  The deleted characters
#define INS_QV  2   //  The insertion QVs
#define SUB_QV  3   //  The substitution QVs
#define MRG_QV  4   //  The merge QVs

char **New_QV_Buffer(HITS_DB *db);

  // Load into 'entry' the 5 QV vectors for i'th read in 'db'.  The deletion tag or characters
  //   are converted to a numeric or upper/lower case ascii string as per ascii.  Return with
  //   a zero, except when an error occurs and INTERACTIVE is defined in which case return with 1.

int   Load_QVentry(HITS_DB *db, int i, char **entry, int ascii);

  // Allocate a block big enough for all the uncompressed sequences, read them into it,
  //   reset the 'off' in each read record to be its in-memory offset, and set the
  //   bases pointer to point at the block after closing the bases file.  If ascii is
  //   1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and
  //   otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T).
  //   Return with a zero, except when an error occurs and INTERACTIVE is defined in which
  //   case return with 1.

int Read_All_Sequences(HITS_DB *db, int ascii);

  // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all
  //   those of the form "prefix/[.]root.part" and call actor with the complete path to each file
  //   pointed at by path, and the suffix of the path by extension.  The . precedes the root
  //   name if the defined constant HIDE_FILES is set.  Always the first call is with the
  //   path "prefix/root.[db|dam]" and extension "db" or "dam".  There will always be calls for
  //   "prefix/[.]root.idx" and "prefix/[.]root.bps".  All other calls are for *tracks* and
  //   so this routine gives one a way to know all the tracks associated with a given DB.
// -1 is returned if the path could not be found, and 1 is returned if an error (reported // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. int List_DB_Files(char *path, void actor(char *path, char *extension)); #endif // _HITS_DB DALIGNER-master/HPCdaligner.c000066400000000000000000000367331263373675100160510ustar00rootroot00000000000000/*********************************************************************************************\ * * Produce a script to compute overlaps for all block pairs of a DB, and then sort and merge * them into as many .las files as their are blocks. * * Author: Gene Myers * Date : June 1, 2014 * *********************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #include "filter.h" #undef LSF // define if want a directly executable LSF script static char *Usage[] = { "[-vbAI] [-k] [-w] [-h] [-t] [-M]", " [-e] [-s]", " [-m]+ [-dal] [-deg]", " [[-]" }; static int power(int base, int exp) { int i, pow; pow = 1; for (i = 0; i < exp; i++) pow *= base; return (pow); } #define LSF_ALIGN "bsub -q medium -n 4 -o ALIGN.out -e ALIGN.err -R span[hosts=1] -J align#%d" #define LSF_MERGE "bsub -q short -n 12 -o MERGE.out -e MERGE.err -R span[hosts=1] -J merge#%d" int main(int argc, char *argv[]) { int nblocks; int useblock; int fblock, lblock; #ifdef LSF int jobid; #endif char *pwd, *root; int MUNIT, DUNIT; int VON, BON, AON, ION; int WINT, TINT, HGAP, HINT, KINT, SINT, LINT, MINT; double EREL; int MMAX, MTOP; char **MASK; { int i, j, k; // Process options int flags[128]; char *eptr; ARG_INIT("HPCdaligner") DUNIT = 4; MUNIT = 25; KINT = 14; WINT = 6; HINT = 35; TINT = 0; HGAP = 0; EREL = 0.; LINT = 1000; SINT = 100; MINT = -1; MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); if (MASK == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch 
(argv[i][1]) { default: ARG_FLAGS("vbAI"); break; case 'k': ARG_POSITIVE(KINT,"K-mer length") break; case 'w': ARG_POSITIVE(WINT,"Log of bin width") break; case 'h': ARG_POSITIVE(HINT,"Hit threshold (in bp.s)") break; case 't': ARG_POSITIVE(TINT,"Tuple suppression frequency") break; case 'H': ARG_POSITIVE(HGAP,"HGAP threshold (in bp.s)") break; case 'e': ARG_REAL(EREL) if (EREL < .7 || EREL >= 1.) { fprintf(stderr,"%s: Average correlation must be in [.7,1.) (%g)\n",Prog_Name,EREL); exit (1); } break; case 'l': ARG_POSITIVE(LINT,"Minimum ovlerap length") break; case 's': ARG_POSITIVE(SINT,"Trace spacing") break; case 'M': ARG_NON_NEGATIVE(MINT,"Memory allocation (in Gb)") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; case 'd': if (argv[i][2] == 'e' && argv[i][3] == 'g') { MUNIT = strtol(argv[i]+4,&eptr,10); if (*eptr != '\0' || argv[i][4] == '\0') { fprintf(stderr,"%s: -mrg argument is not an integer\n",Prog_Name); exit (1); } if (MUNIT <= 0) { fprintf(stderr,"%s: Files per merge must be positive (%d)\n", Prog_Name,MUNIT); exit (1); } if (MUNIT < 3) { fprintf(stderr,"%s: Files per merge must be at least 3 (%d)\n", Prog_Name,MUNIT); exit (1); } } else if (argv[i][2] == 'a' && argv[i][3] == 'l') { DUNIT = strtol(argv[i]+4,&eptr,10); if (*eptr != '\0' || argv[i][4] == '\0') { fprintf(stderr,"%s: -dal argument is not an integer\n",Prog_Name); exit (1); } if (DUNIT <= 0) { fprintf(stderr,"%s: Blocks per daligner call must be positive (%d)\n", Prog_Name,DUNIT); exit (1); } } else { fprintf(stderr,"%s: -%.3s is an illegal option\n",Prog_Name,argv[i]+1); exit (1); } break; } else argv[j++] = argv[i]; argc = j; VON = flags['v']; BON = flags['b']; AON = flags['A']; ION = flags['I']; if (argc < 2 || argc > 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) 
strlen(Prog_Name),"",Usage[1]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[3]); exit (1); } } // Make sure DB exists and is partitioned, get number of blocks in partition pwd = PathTo(argv[1]); if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0) root = Root(argv[1],".dam"); else root = Root(argv[1],".db"); { int i, nfiles; FILE *dbvis; dbvis = fopen(Catenate(pwd,"/",root,".dam"),"r"); if (dbvis == NULL) { dbvis = Fopen(Catenate(pwd,"/",root,".db"),"r"); if (dbvis == NULL) exit (1); } if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) SYSTEM_ERROR for (i = 0; i < nfiles; i++) { char buffer[30001]; if (fgets(buffer,30000,dbvis) == NULL) SYSTEM_ERROR } useblock = 1; if (fscanf(dbvis,"blocks = %d\n",&nblocks) != 1) { useblock = 0; nblocks = 1; } } // Set range fblock-lblock checking that DB..las exists & DB..las does not { char *eptr, *fptr; FILE *file; if (argc == 3) { fblock = strtol(argv[2],&eptr,10); if (*eptr != '\0' && *eptr != '-') { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n", Prog_Name,argv[2]); exit (1); } if (*eptr == '-') { lblock = strtol(eptr+1,&fptr,10); if (*fptr != '\0') { fprintf(stderr,"%s: second part of range '%s' is not an integer\n", Prog_Name,eptr+1); exit (1); } } else lblock = fblock; if (fblock < 1 || lblock > nblocks || fblock > lblock) { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock); exit (1); } } else { fblock = 1; lblock = nblocks; } if (fblock > 1) { file = fopen(Catenate(root,Numbered_Suffix(".",fblock-1,".las"),"",""),"r"); if (file == NULL) { fprintf(stderr,"%s: File %s.%d.las should already be present!\n", Prog_Name,root,fblock-1); exit (1); } else fclose(file); } file = fopen(Catenate(root,Numbered_Suffix(".",fblock,".las"),"",""),"r"); if (file != NULL) { fprintf(stderr,"%s: File %s.%d.las should not yet exist!\n", Prog_Name,root,fblock); exit (1); } } { int level, njobs; int i, j, k; 
int usepath; // Produce all necessary daligner jobs ... usepath = (strcmp(pwd,".") != 0); njobs = 0; for (i = fblock; i <= lblock; i++) njobs += (i-1)/DUNIT+1; printf("# Daligner jobs (%d)\n",njobs); #ifdef LSF jobid = 1; #endif for (i = fblock; i <= lblock; i++) { int bits; int low, hgh; bits = (i-1)/DUNIT+1; low = 1; for (j = 1; j <= bits; j++) { #ifdef LSF printf(LSF_ALIGN,jobid++); printf(" \""); #endif printf("daligner"); if (VON) printf(" -v"); if (BON) printf(" -b"); if (AON) printf(" -A"); if (ION) printf(" -I"); if (KINT != 14) printf(" -k%d",KINT); if (WINT != 6) printf(" -w%d",WINT); if (HINT != 35) printf(" -h%d",HINT); if (TINT > 0) printf(" -t%d",TINT); if (HGAP > 0) printf(" -H%d",HGAP); if (EREL > .1) printf(" -e%g",EREL); if (LINT != 1000) printf(" -l%d",LINT); if (SINT != 100) printf(" -s%d",SINT); if (MINT >= 0) printf(" -M%d",MINT); for (k = 0; k < MTOP; k++) printf(" -m%s",MASK[k]); if (useblock) if (usepath) printf(" %s/%s.%d",pwd,root,i); else printf(" %s.%d",root,i); else if (usepath) printf(" %s/%s",pwd,root); else printf(" %s",root); hgh = (i*j)/bits + 1; for (k = low; k < hgh; k++) if (useblock) if (usepath) printf(" %s/%s.%d",pwd,root,k); else printf(" %s.%d",root,k); else if (usepath) printf(" %s/%s",pwd,root); else printf(" %s",root); #ifdef LSF printf("\""); #endif printf("\n"); low = hgh; } } // ... and then all the initial sort & merge jobs for each block pair printf("# Initial sort jobs (%d)\n", lblock*lblock - (fblock-1)*(fblock-1) ); #ifdef LSF jobid = 1; #endif for (i = 1; i <= lblock; i++) for (j = (i < fblock ? 
fblock : 1); j <= lblock; j++) { #ifdef LSF printf(LSF_MERGE,jobid++); printf(" \""); #endif printf("LAsort"); if (VON) printf(" -v"); for (k = 0; k < NTHREADS; k++) if (useblock) { printf(" %s.%d.%s.%d.C%d",root,i,root,j,k); printf(" %s.%d.%s.%d.N%d",root,i,root,j,k); } else { printf(" %s.%s.C%d",root,root,k); printf(" %s.%s.N%d",root,root,k); } printf(" && LAmerge"); if (VON) printf(" -v"); if (lblock == 1) printf(" %s.%d",root,i); else if (i < fblock) printf(" L1.%d.%d",i,(j-fblock)+1); else printf(" L1.%d.%d",i,j); for (k = 0; k < NTHREADS; k++) if (useblock) { printf(" %s.%d.%s.%d.C%d.S",root,i,root,j,k); printf(" %s.%d.%s.%d.N%d.S",root,i,root,j,k); } else { printf(" %s.%s.C%d.S",root,root,k); printf(" %s.%s.N%d.S",root,root,k); } printf(" && rm"); for (k = 0; k < NTHREADS; k++) if (useblock) { printf(" %s.%d.%s.%d.C%d.S.las",root,i,root,j,k); printf(" %s.%d.%s.%d.N%d.S.las",root,i,root,j,k); } else { printf(" %s.%s.C%d.S.las",root,root,k); printf(" %s.%s.N%d.S.las",root,root,k); } #ifdef LSF printf("\""); #endif printf("\n"); } // Higher level merges (if lblock > 1) if (lblock > 1) { int pow, mway; // Determine most balance mway for merging in ceil(log_mrg lblock) levels pow = 1; for (level = 0; pow < lblock; level++) pow *= MUNIT; for (mway = MUNIT; mway >= 3; mway--) if (power(mway,level) < lblock) break; mway += 1; // Issue the commands for each merge level { int p, cnt, dnt; cnt = lblock; dnt = (lblock-fblock)+1; for (i = 1; i <= level; i++) { int bits, dits; int low, hgh; bits = (cnt-1)/mway+1; dits = (dnt-1)/mway+1; // Incremental update merges #ifdef LSF jobid = 1; #endif if (dnt >= 1) { int last; last = (dnt == 1 || i == level); printf("# Level %d jobs (%d)\n",i,bits*((lblock-fblock)+1) + dits*(fblock-1)); for (j = 1; j < fblock; j++) { #ifdef LSF printf(LSF_MERGE,jobid++); printf(" \""); #endif if (last) printf("mv %s.%d.las L%d.%d.0.las && ",root,j,i,j); low = 1; for (p = 1; p <= dits; p++) { hgh = (dnt*p)/dits; #ifdef LSF if (p > 1) { 
printf(LSF_MERGE,jobid++); printf(" \""); } #endif printf("LAmerge"); if (VON) printf(" -v"); if (last) printf(" %s.%d L%d.%d.0",root,j,i,j); else printf(" L%d.%d.%d",i+1,j,p); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d",i,j,k); printf(" && rm"); if (last) printf(" L%d.%d.0.las",i,j); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d.las",i,j,k); #ifdef LSF printf("\""); #endif printf("\n"); low = hgh+1; } } if (dnt > 1) dnt = dits; else dnt = 0; } else printf("# Level %d jobs (%d)\n",i,bits*((lblock-fblock)+1)); // New block merges for (j = fblock; j <= lblock; j++) { low = 1; for (p = 1; p <= bits; p++) { hgh = (cnt*p)/bits; #ifdef LSF printf(LSF_MERGE,jobid++); printf(" \""); #endif printf("LAmerge"); if (VON) printf(" -v"); if (i == level) printf(" %s.%d",root,j); else printf(" L%d.%d.%d",i+1,j,p); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d",i,j,k); printf(" && rm"); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d.las",i,j,k); #ifdef LSF printf("\""); #endif printf("\n"); low = hgh+1; } } cnt = bits; } } } } free(root); free(pwd); exit (0); } DALIGNER-master/HPCmapper.c000066400000000000000000000407671263373675100155520ustar00rootroot00000000000000/*********************************************************************************************\ * * Produce a script to compute overlaps for all block pairs between two DBs, and then sort * and merge * them into as many .las files as their are blocks of the 1st DB. 
* * Author: Gene Myers * Date : December 31, 2014 * *********************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #include "filter.h" #undef LSF // define if want a directly executable LSF script static char *Usage[] = { "[-vb] [-k] [-w] [-h] [-t] [-M]", " [-e] [-s]", " [-m]+ [-dal] [-deg]", " [[-]]" }; static int power(int base, int exp) { int i, pow; pow = 1; for (i = 0; i < exp; i++) pow *= base; return (pow); } #define LSF_ALIGN "bsub -q medium -n 4 -o ALIGN.out -e ALIGN.err -R span[hosts=1] -J align#%d" #define LSF_MERGE "bsub -q short -n 12 -o MERGE.out -e MERGE.err -R span[hosts=1] -J merge#%d" int main(int argc, char *argv[]) { int nblocks1, nblocks2; int useblock1, useblock2; int fblock, lblock; #ifdef LSF int jobid; #endif char *pwd1, *root1; char *pwd2, *root2; int MUNIT, DUNIT; int VON, BON, CON; int WINT, TINT, HGAP, HINT, KINT, SINT, LINT, MINT; double EREL; int MMAX, MTOP; char **MASK; { int i, j, k; // Process options int flags[128]; char *eptr; ARG_INIT("HPCmapper") DUNIT = 4; MUNIT = 25; KINT = 20; WINT = 6; HINT = 50; TINT = 0; HGAP = 0; EREL = 0.; LINT = 1000; SINT = 100; MINT = -1; MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); if (MASK == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vbc"); break; case 'k': ARG_POSITIVE(KINT,"K-mer length") break; case 'w': ARG_POSITIVE(WINT,"Log of bin width") break; case 'h': ARG_POSITIVE(HINT,"Hit threshold (in bp.s)") break; case 't': ARG_POSITIVE(TINT,"Tuple suppression frequency") break; case 'H': ARG_POSITIVE(HGAP,"HGAP threshold (in bp.s)") break; case 'e': ARG_REAL(EREL) if (EREL < .7 || EREL >= 1.) { fprintf(stderr,"%s: Average correlation must be in [.7,1.) 
(%g)\n",Prog_Name,EREL); exit (1); } break; case 'l': ARG_POSITIVE(LINT,"Minimum ovlerap length") break; case 's': ARG_POSITIVE(SINT,"Trace spacing") break; case 'M': ARG_NON_NEGATIVE(MINT,"Memory allocation (in Gb)") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; case 'd': if (argv[i][2] == 'e' && argv[i][3] == 'g') { MUNIT = strtol(argv[i]+4,&eptr,10); if (*eptr != '\0' || argv[i][4] == '\0') { fprintf(stderr,"%s: -mrg argument is not an integer\n",Prog_Name); exit (1); } if (MUNIT <= 0) { fprintf(stderr,"%s: Files per merge must be positive (%d)\n", Prog_Name,MUNIT); exit (1); } if (MUNIT < 3) { fprintf(stderr,"%s: Files per merge must be at least 3 (%d)\n", Prog_Name,MUNIT); exit (1); } } else if (argv[i][2] == 'a' && argv[i][3] == 'l') { DUNIT = strtol(argv[i]+4,&eptr,10); if (*eptr != '\0' || argv[i][4] == '\0') { fprintf(stderr,"%s: -dal argument is not an integer\n",Prog_Name); exit (1); } if (DUNIT <= 0) { fprintf(stderr,"%s: Blocks per daligner call must be positive (%d)\n", Prog_Name,DUNIT); exit (1); } } else { fprintf(stderr,"%s: -%.3s is an illegal option\n",Prog_Name,argv[i]+1); exit (1); } break; } else argv[j++] = argv[i]; argc = j; VON = flags['v']; BON = flags['b']; CON = flags['c']; if (argc < 3 || argc > 4) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[3]); exit (1); } } // Make sure DAM and DB exist and the DB is partitioned, get number of blocks in partition pwd1 = PathTo(argv[1]); if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0) root1 = Root(argv[1],".dam"); else root1 = Root(argv[1],".db"); { int i, nfiles; FILE *dbvis; dbvis = fopen(Catenate(pwd1,"/",root1,".dam"),"r"); if (dbvis == 
NULL) { dbvis = Fopen(Catenate(pwd1,"/",root1,".db"),"r"); if (dbvis == NULL) exit (1); } if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) SYSTEM_ERROR for (i = 0; i < nfiles; i++) { char buffer[30001]; if (fgets(buffer,30000,dbvis) == NULL) SYSTEM_ERROR } useblock1 = 1; if (fscanf(dbvis,"blocks = %d\n",&nblocks1) != 1) { useblock1 = 0; nblocks1 = 1; } fclose(dbvis); } pwd2 = PathTo(argv[2]); if (strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0) root2 = Root(argv[2],".dam"); else root2 = Root(argv[2],".db"); if (strcmp(root2,root1) == 0 && strcmp(pwd1,pwd2) == 0) { fprintf(stderr,"%s: Comparing the same data base %s/%s against itself, use HPCdaligner\n", Prog_Name,pwd1,root1); exit (1); } { int i, nfiles; FILE *dbvis; dbvis = fopen(Catenate(pwd2,"/",root2,".dam"),"r"); if (dbvis == NULL) { dbvis = Fopen(Catenate(pwd2,"/",root2,".db"),"r"); if (dbvis == NULL) exit (1); } if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) SYSTEM_ERROR for (i = 0; i < nfiles; i++) { char buffer[30001]; if (fgets(buffer,30000,dbvis) == NULL) SYSTEM_ERROR } useblock2 = 1; if (fscanf(dbvis,"blocks = %d\n",&nblocks2) != 1) { useblock2 = 0; nblocks2 = 1; } fclose(dbvis); } // Set range fblock-lblock checking that DB..las exists & DB..las does not { char *eptr, *fptr; FILE *file; if (argc == 4) { fblock = strtol(argv[3],&eptr,10); if (*eptr != '\0' && *eptr != '-') { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n", Prog_Name,argv[3]); exit (1); } if (*eptr == '-') { lblock = strtol(eptr+1,&fptr,10); if (*fptr != '\0') { fprintf(stderr,"%s: second part of range '%s' is not an integer\n", Prog_Name,eptr+1); exit (1); } } else lblock = fblock; if (fblock < 1 || lblock > nblocks2 || fblock > lblock) { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock); exit (1); } } else { fblock = 1; lblock = nblocks2; } if (fblock > 1) { file = fopen(Catenate(root1,".",root2,Numbered_Suffix(".",fblock-1,".las")),"r"); if (file == NULL) { 
fprintf(stderr,"%s: File %s.%s.%d.las should already be present!\n", Prog_Name,root1,root2,fblock-1); exit (1); } else fclose(file); } if (useblock2) { file = fopen(Catenate(root1,".",root2,Numbered_Suffix(".",fblock,".las")),"r"); if (file != NULL) { fprintf(stderr,"%s: File %s.%s.%d.las should not yet exist!\n", Prog_Name,root1,root2,fblock); exit (1); } } else { file = fopen(Catenate(root1,".",root2,".las"),"r"); if (file != NULL) { fprintf(stderr,"%s: File %s.%s.las should not yet exist!\n", Prog_Name,root1,root2); exit (1); } } } { int level, njobs; int i, j, k; int usepath1, usepath2; // Produce all necessary daligner jobs ... usepath1 = (strcmp(pwd1,".") != 0); usepath2 = (strcmp(pwd2,".") != 0); njobs = nblocks1 * ( (lblock-fblock)/DUNIT + 1); printf("# Daligner jobs (%d)\n",njobs); #ifdef LSF jobid = 1; #endif for (i = 1; i <= nblocks1; i++) { int bits; int low, hgh; bits = (lblock-fblock)/DUNIT+1; low = fblock; for (j = 1; j <= bits; j++) { #ifdef LSF printf(LSF_ALIGN,jobid++); printf(" \""); #endif printf("daligner -A"); if (VON) printf(" -v"); if (BON) printf(" -b"); printf(" -k%d",KINT); if (WINT != 6) printf(" -w%d",WINT); printf(" -h%d",HINT); if (TINT > 0) printf(" -t%d",TINT); if (HGAP > 0) printf(" -H%d",HGAP); if (EREL > .1) printf(" -e%g",EREL); else printf(" -e.85"); if (LINT != 1000) printf(" -l%d",LINT); if (SINT != 100) printf(" -s%d",SINT); if (MINT >= 0) printf(" -M%d",MINT); for (k = 0; k < MTOP; k++) printf(" -m%s",MASK[k]); if (useblock1) if (usepath1) printf(" %s/%s.%d",pwd1,root1,i); else printf(" %s.%d",root1,i); else if (usepath1) printf(" %s/%s",pwd1,root1); else printf(" %s",root1); hgh = fblock + (((lblock-fblock)+1)*j)/bits; for (k = low; k < hgh; k++) if (useblock2) if (usepath2) printf(" %s/%s.%d",pwd2,root2,k); else printf(" %s.%d",root2,k); else if (usepath2) printf(" %s/%s",pwd2,root2); else printf(" %s",root2); #ifdef LSF printf("\""); #endif printf("\n"); low = hgh; } } // ... 
and then all the initial sort & merge jobs for each block pair printf("# Initial sort jobs (%d)\n", nblocks1*((lblock-fblock)+1)); #ifdef LSF jobid = 1; #endif for (i = 1; i <= nblocks1; i++) for (j = fblock; j <= lblock; j++) { #ifdef LSF printf(LSF_MERGE,jobid++); printf(" \""); #endif printf("LAsort"); if (VON) printf(" -v"); if (CON) printf(" -c"); for (k = 0; k < NTHREADS; k++) { if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.C%d",root2,j,k); else printf(".%s.C%d",root2,k); if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.N%d",root2,j,k); else printf(".%s.N%d",root2,k); } printf(" && LAmerge"); if (VON) printf(" -v"); if (CON) printf(" -c"); if (nblocks1 == 1) if (useblock2) printf(" %s.%s.%d",root1,root2,j); else printf(" %s.%s",root1,root2); else printf(" L1.%d.%d",i,j); for (k = 0; k < NTHREADS; k++) { if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.C%d.S",root2,j,k); else printf(".%s.C%d.S",root2,k); if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.N%d.S",root2,j,k); else printf(".%s.N%d.S",root2,k); } printf(" && rm"); for (k = 0; k < NTHREADS; k++) { if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.C%d.S.las",root2,j,k); else printf(".%s.C%d.S.las",root2,k); if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.N%d.S.las",root2,j,k); else printf(".%s.N%d.S.las",root2,k); if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.C%d.las",root2,j,k); else printf(".%s.C%d.las",root2,k); if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.N%d.las",root2,j,k); else printf(".%s.N%d.las",root2,k); } #ifdef LSF printf("\""); #endif printf("\n"); } // Higher level merges (if lblock > 1) if (nblocks1 
> 1) { int pow, mway; // Determine most balance mway for merging in ceil(log_mrg nblock1) levels pow = 1; for (level = 0; pow < nblocks1; level++) pow *= MUNIT; for (mway = MUNIT; mway >= 3; mway--) if (power(mway,level) < nblocks1) break; mway += 1; // Issue the commands for each merge level { int p, cnt; cnt = nblocks1; for (i = 1; i <= level; i++) { int bits; int low, hgh; bits = (cnt-1)/mway+1; printf("# Level %d jobs (%d)\n",i,bits*((lblock-fblock)+1)); // Block merges #ifdef LSF jobid = 1; #endif for (j = fblock; j <= lblock; j++) { low = 1; for (p = 1; p <= bits; p++) { hgh = (cnt*p)/bits; #ifdef LSF printf(LSF_MERGE,jobid++); printf(" \""); #endif printf("LAmerge"); if (VON) printf(" -v"); if (CON) printf(" -c"); if (i == level) if (useblock2) printf(" %s.%s.%d",root1,root2,j); else printf(" %s.%s",root1,root2); else printf(" L%d.%d.%d",i+1,j,p); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d",i,k,j); printf(" && rm"); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d.las",i,k,j); #ifdef LSF printf("\""); #endif printf("\n"); low = hgh+1; } } cnt = bits; } } } } free(root2); free(pwd2); free(root1); free(pwd1); exit (0); } DALIGNER-master/LAcat.c000066400000000000000000000100621263373675100147000ustar00rootroot00000000000000/******************************************************************************************* * * Merge together in index order, overlap files .1.las, .2.las, ... 
into a * single overlap file and output to the standard output * * Author: Gene Myers * Date : July 2013 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = " > .las"; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { char *iblock, *oblock; FILE *input; int64 novl, bsize, ovlsize, ptrsize; int tspace, tbytes; char *pwd, *root; Prog_Name = Strdup("LAcat",""); if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; oblock = (char *) Malloc(bsize,"Allocating output block"); iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); if (oblock == NULL || iblock == NULL) exit (1); iblock += ptrsize; pwd = PathTo(argv[1]); root = Root(argv[1],".las"); { int64 povl; int i, mspace; novl = 0; tspace = 0; mspace = 0; tbytes = 0; for (i = 0; 1; i++) { char *name = Catenate(pwd,"/",root,Numbered_Suffix(".",i+1,".las")); if ((input = fopen(name,"r")) == NULL) break; if (fread(&povl,sizeof(int64),1,input) != 1) SYSTEM_ERROR novl += povl; if (fread(&mspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (i == 0) { tspace = mspace; if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); } else if (tspace != mspace) { fprintf(stderr,"%s: PT-point spacing conflict (%d vs %d)\n",Prog_Name,tspace,mspace); exit (1); } fclose(input); } fwrite(&novl,sizeof(int64),1,stdout); fwrite(&tspace,sizeof(int),1,stdout); } { int i, j; Overlap *w; int64 tsize, povl; int mspace; char *iptr, *itop; char *optr, *otop; optr = oblock; otop = oblock + bsize; for (i = 0; 1; i++) { char *name = Catenate(pwd,"/",root,Numbered_Suffix(".",i+1,".las")); if ((input = fopen(name,"r")) == NULL) break; if (fread(&povl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if 
(fread(&mspace,sizeof(int),1,input) != 1) SYSTEM_ERROR iptr = iblock; itop = iblock + fread(iblock,1,bsize,input); for (j = 0; j < povl; j++) { if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } w = (Overlap *) (iptr - ptrsize); tsize = w->path.tlen*tbytes; if (optr + ovlsize + tsize > otop) { fwrite(oblock,1,optr-oblock,stdout); optr = oblock; } memcpy(optr,iptr,ovlsize); optr += ovlsize; iptr += ovlsize; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } memcpy(optr,iptr,tsize); optr += tsize; iptr += tsize; } fclose(input); } if (optr > oblock) fwrite(oblock,1,optr-oblock,stdout); } free(pwd); free(root); free(oblock); free(iblock-ptrsize); exit (0); } DALIGNER-master/LAcheck.c000066400000000000000000000214231263373675100152110ustar00rootroot00000000000000/******************************************************************************************* * * Check the structural integrity of .las files * * Author: Gene Myers * Date : July 2014 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-vS] [ ] ..."; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { HITS_DB _db1, *db1 = &_db1; HITS_DB _db2, *db2 = &_db2; int VERBOSE; int SORTED; int ISTWO; // Process options { int i, j, k; int flags[128]; ARG_INIT("LAcheck") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vS") break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; SORTED = flags['S']; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open trimmed 
DB { int status; char *pwd, *root; FILE *input; ISTWO = 0; status = Open_DB(argv[1],db1); if (status < 0) exit (1); if (db1->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } pwd = PathTo(argv[2]); root = Root(argv[2],".las"); if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) == NULL) { ISTWO = 1; if (argc <= 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } status = Open_DB(argv[2],db2); if (status < 0) exit (1); if (db2->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); exit (1); } Trim_DB(db2); } else { fclose(input); db2 = db1; } Trim_DB(db1); free(root); free(pwd); } { char *iblock; int64 bsize, ovlsize, ptrsize; int i, j; HITS_READ *reads1 = db1->reads; int nreads1 = db1->nreads; HITS_READ *reads2 = db2->reads; int nreads2 = db2->nreads; // Setup IO buffers ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; iblock = (char *) Malloc(bsize+ptrsize,"Allocating input block"); if (iblock == NULL) exit (1); iblock += ptrsize; // For each file do for (i = 2+ISTWO; i < argc; i++) { char *pwd, *root; FILE *input; char *iptr, *itop; Overlap last; int64 novl; int tspace, tbytes; // Establish IO and (novl,tspace) header pwd = PathTo(argv[i]); root = Root(argv[i],".las"); if ((input = Fopen(Catenate(pwd,"/",root,".las"),"r")) == NULL) goto error; if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (novl < 0) { if (VERBOSE) fprintf(stderr," %s: Number of alignments < 0\n",root); goto error; } if (tspace < 0) { if (VERBOSE) fprintf(stderr," %s: Trace spacing < 0\n",root); goto error; } if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); iptr = iblock; itop = iblock + fread(iblock,1,bsize,input); // For each record in file do last.aread = -1; last.bread = -1; last.flags = 0; last.path.bbpos = last.path.abpos = 0; last.path.bepos = 
last.path.aepos = 0; for (j = 0; j < novl; j++) { Overlap ovl; int tsize; int equal; // Fetch next record if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); if (iptr + ovlsize > itop) { if (VERBOSE) fprintf(stderr," %s: Too few alignment records\n",root); goto error; } } ovl = *((Overlap *) (iptr - ptrsize)); iptr += ovlsize; tsize = ovl.path.tlen*tbytes; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); if (iptr + tsize > itop) { if (VERBOSE) fprintf(stderr," %s: Too few alignment records\n",root); goto error; } } ovl.path.trace = iptr; iptr += tsize; // Basic checks if (ovl.aread < 0 || ovl.bread < 0) { if (VERBOSE) fprintf(stderr," %s: Read indices < 0\n",root); goto error; } if (ovl.aread >= nreads1 || ovl.bread >= nreads2) { if (VERBOSE) fprintf(stderr," %s: Read indices out of range\n",root); goto error; } if (ovl.path.abpos >= ovl.path.aepos || ovl.path.aepos > reads1[ovl.aread].rlen || ovl.path.bbpos >= ovl.path.bepos || ovl.path.bepos > reads2[ovl.bread].rlen || ovl.path.abpos < 0 || ovl.path.bbpos < 0 ) { if (VERBOSE) fprintf(stderr," %s: Non-sense alignment intervals\n",root); goto error; } if (ovl.path.diffs < 0 || ovl.path.diffs > reads1[ovl.aread].rlen || ovl.path.diffs > reads2[ovl.bread].rlen) { if (VERBOSE) fprintf(stderr," %s: Non-sense number of differences\n",root); goto error; } if (Check_Trace_Points(&ovl,tspace,VERBOSE,root)) goto error; // Duplicate check and sort check if -S set equal = 0; if (SORTED) { if (ovl.aread > last.aread) goto inorder; if (ovl.aread == last.aread) { if (ovl.bread > last.bread) goto inorder; if (ovl.bread == last.bread) { if (COMP(ovl.flags) > COMP(last.flags)) goto inorder; if (COMP(ovl.flags) == COMP(last.flags)) { if (ovl.path.abpos > last.path.abpos) 
goto inorder; if (ovl.path.abpos == last.path.abpos) { equal = 1; goto inorder; } } } } if (VERBOSE) fprintf(stderr," %s: Reads are not sorted (%d vs %d)\n", root,ovl.aread+1,ovl.bread+1); goto error; } else { if (ovl.aread == last.aread && ovl.bread == last.bread && COMP(ovl.flags) == COMP(last.flags) && ovl.path.abpos == last.path.abpos) equal = 1; } inorder: if (equal) { if (ovl.path.aepos == last.path.aepos && ovl.path.bbpos == last.path.bbpos && ovl.path.bepos == last.path.bepos) { if (VERBOSE) fprintf(stderr," %s: Duplicate overlap (%d vs %d)\n", root,ovl.aread+1,ovl.bread+1); goto error; } } last = ovl; } // File processing epilog: Check all data read and print OK if -v if (iptr < itop) { if (VERBOSE) fprintf(stderr," %s: Too many alignment records\n",root); goto error; } if (VERBOSE) { fprintf(stderr," %s: ",root); Print_Number(novl,0,stderr); fprintf(stderr," all OK\n"); } error: fclose(input); free(pwd); free(root); } free(iblock-ptrsize); } Close_DB(db1); if (ISTWO) Close_DB(db2); exit (0); } DALIGNER-master/LAdump.c000066400000000000000000000266431263373675100151120ustar00rootroot00000000000000/******************************************************************************************* * * Utility for displaying the information in the overlaps of a .las file in a very * simple to parse format. * * Author: Gene Myers * Creation: July 2013 * Last Mod: Jan 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-cdt] [-o] [ ] [ | ... 
]"; #define LAST_READ_SYMBOL '$' static int ORDER(const void *l, const void *r) { int x = *((int *) l); int y = *((int *) r); return (x-y); } int main(int argc, char *argv[]) { HITS_DB _db1, *db1 = &_db1; HITS_DB _db2, *db2 = &_db2; Overlap _ovl, *ovl = &_ovl; FILE *input; int64 novl; int tspace, tbytes, small; int reps, *pts; int input_pts; int OVERLAP; int DOCOORDS, DODIFFS, DOTRACE; int ISTWO; // Process options { int i, j, k; int flags[128]; ARG_INIT("LAdump") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("ocdtUF") break; } else argv[j++] = argv[i]; argc = j; OVERLAP = flags['o']; DOCOORDS = flags['c']; DODIFFS = flags['d']; DOTRACE = flags['t']; if (DOTRACE) DOCOORDS = 1; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open trimmed DB or DB pair { int status; char *pwd, *root; FILE *input; ISTWO = 0; status = Open_DB(argv[1],db1); if (status < 0) exit (1); if (db1->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } if (argc > 3) { pwd = PathTo(argv[3]); root = Root(argv[3],".las"); if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) { ISTWO = 1; fclose(input); status = Open_DB(argv[2],db2); if (status < 0) exit (1); if (db2->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); exit (1); } Trim_DB(db2); } else db2 = db1; free(root); free(pwd); } else db2 = db1; Trim_DB(db1); } // Process read index arguments into a sorted list of read ranges input_pts = 0; if (argc == ISTWO+4) { if (argv[ISTWO+3][0] != LAST_READ_SYMBOL || argv[ISTWO+3][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[ISTWO+3],&eptr,10); if (eptr > argv[ISTWO+3] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if 
(input_pts) { int v, x; FILE *input; input = Fopen(argv[ISTWO+3],"r"); if (input == NULL) exit (1); reps = 0; while ((v = fscanf(input," %d",&x)) != EOF) if (v == 0) { fprintf(stderr,"%s: %d'th item of input file %s is not an integer\n", Prog_Name,reps+1,argv[2]); exit (1); } else reps += 1; reps *= 2; pts = (int *) Malloc(sizeof(int)*reps,"Allocating read parameters"); if (pts == NULL) exit (1); rewind(input); for (v = 0; v < reps; v += 2) { fscanf(input," %d",&x); pts[v] = pts[v+1] = x; } fclose(input); } else { pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 3+ISTWO) { int c, b, e; char *eptr, *fptr; for (c = 3+ISTWO; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db1->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == '\0') { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = INT32_MAX; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } qsort(pts,reps/2,sizeof(int64),ORDER); b = 0; for (c = 0; c < reps; c += 2) if (b > 0 && pts[b-1] >= pts[c]-1) { if (pts[c+1] > pts[b-1]) pts[b-1] = pts[c+1]; } else { pts[b++] = pts[c]; pts[b++] = pts[c+1]; } pts[b++] = INT32_MAX; reps = b; } else { pts[reps++] = 1; pts[reps++] = INT32_MAX; } } // Initiate file reading and read header { char *over, *pwd, *root; pwd = PathTo(argv[2+ISTWO]); root = Root(argv[2+ISTWO],".las"); over = Catenate(pwd,"/",root,".las"); input = Fopen(over,"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if 
(fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = sizeof(uint16); } free(pwd); free(root); } // Scan to count sizes of things { int j, al, tlen; int in, npt, idx, ar; int64 novls, odeg, omax, sdeg, smax, ttot, tmax; in = 0; npt = pts[0]; idx = 1; // For each record do novls = omax = smax = ttot = tmax = 0; sdeg = odeg = 0; al = 0; for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); tlen = ovl->path.tlen; fseeko(input,tlen*tbytes,SEEK_CUR); // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != db1->reads[ovl->aread].rlen && ovl->path.bepos != db2->reads[ovl->bread].rlen) continue; } if (ar != al) { if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; sdeg = odeg = 0; al = ar; } novls += 1; odeg += 1; sdeg += tlen; ttot += tlen; if (tlen > tmax) tmax = tlen; } if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; printf("+ P %lld\n",novls); printf("%% P %lld\n",omax); printf("+ T %lld\n",ttot); printf("%% T %lld\n",smax); printf("@ T %lld\n",tmax); } // Read the file and display selected records { int j; uint16 *trace; int tmax; int in, npt, idx, ar; int64 verse; rewind(input); fread(&verse,sizeof(int64),1,input); fread(&tspace,sizeof(int),1,input); if (verse < 0) { for (j = 0; j < 5; j++) fread(&verse,sizeof(int64),1,input); } tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); in = 0; npt = pts[0]; idx = 1; // For each record do for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); if (ovl->path.tlen > tmax) { 
tmax = ((int) 1.2*ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); } ovl->path.trace = (void *) trace; Read_Trace(input,ovl,tbytes); // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != db1->reads[ovl->aread].rlen && ovl->path.bepos != db2->reads[ovl->bread].rlen) continue; } // Display it printf("P %d %d",ovl->aread+1,ovl->bread+1); if (COMP(ovl->flags)) printf(" c\n"); else printf(" n\n"); if (DOCOORDS) printf("C %d %d %d %d\n",ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos); if (DODIFFS) printf("D %d\n",ovl->path.diffs); if (DOTRACE) { uint16 *trace = (uint16 *) ovl->path.trace; int tlen = ovl->path.tlen; if (small) Decompress_TraceTo16(ovl); printf("T %d\n",tlen>>1); for (j = 0; j < tlen; j += 2) printf(" %3d %3d\n",trace[j],trace[j+1]); } } free(trace); } Close_DB(db1); if (ISTWO) Close_DB(db2); exit (0); } DALIGNER-master/LAindex.c000066400000000000000000000121531263373675100152430ustar00rootroot00000000000000/******************************************************************************************* * * Create an index with extension .las.idx for a .las file. * Utility expects the .las file to be sorted. * Header contains total # of trace points, max # of trace points for * a given overlap, max # of trace points in all the overlaps for a given aread, and * max # of overlaps for a given aread. The remainder are the offsets into each pile. 
* * Author: Gene Myers * Date : Sept 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-v] ..."; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { char *iblock; FILE *input, *output; int64 novl, bsize, ovlsize, ptrsize; int tspace, tbytes; char *pwd, *root; int64 tmax, ttot; int64 omax, smax; int64 odeg, sdeg; int i; int VERBOSE; // Process options { int j, k; int flags[128]; ARG_INIT("LAindex") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("v") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // For each file do ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); if (iblock == NULL) exit (1); iblock += ptrsize; for (i = 1; i < argc; i++) { pwd = PathTo(argv[i]); root = Root(argv[i],".las"); input = Fopen(Catenate(pwd,"/",root,".las"),"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); output = Fopen(Catenate(pwd,"/.",root,".las.idx"),"w"); if (output == NULL) exit (1); free(pwd); free(root); if (VERBOSE) { printf(" Indexing %s: ",root); Print_Number(novl,0,stdout); printf(" records ... 
"); fflush(stdout); } fwrite(&novl,sizeof(int64),1,output); fwrite(&novl,sizeof(int64),1,output); fwrite(&novl,sizeof(int64),1,output); fwrite(&novl,sizeof(int64),1,output); { int j, alst; Overlap *w; int64 tsize; int64 optr; char *iptr, *itop; int64 tlen; optr = sizeof(int64) + sizeof(int32); iptr = iblock; itop = iblock + fread(iblock,1,bsize,input); alst = -1; odeg = sdeg = 0; omax = smax = 0; tmax = ttot = 0; for (j = 0; j < novl; j++) { if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } w = (Overlap *) (iptr - ptrsize); tlen = w->path.tlen; if (alst < 0) { fwrite(&optr,sizeof(int64),1,output); alst = w->aread; } else while (alst < w->aread) { if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; fwrite(&optr,sizeof(int64),1,output); odeg = sdeg = 0; alst += 1; } if (tlen > tmax) tmax = tlen; ttot += tlen; odeg += 1; sdeg += tlen; iptr += ovlsize; tsize = tlen*tbytes; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } optr += ovlsize + tsize; iptr += tsize; } fwrite(&optr,sizeof(int64),1,output); } if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; rewind(output); fwrite(&omax,sizeof(int64),1,output); fwrite(&ttot,sizeof(int64),1,output); fwrite(&smax,sizeof(int64),1,output); fwrite(&tmax,sizeof(int64),1,output); if (VERBOSE) { Print_Number(ttot,0,stdout); printf(" trace points\n"); fflush(stdout); } fclose(input); fclose(output); } free(iblock-ptrsize); exit (0); } DALIGNER-master/LAmerge.c000066400000000000000000000204451263373675100152360ustar00rootroot00000000000000/******************************************************************************************* * * Given a list of sorted .las files, merge them into a single sorted .las file. 
* * Author: Gene Myers * Date : July 2013 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-v] ..."; #define MEMORY 4000 // in Mb #undef DEBUG // Heap sort of records according to (aread,bread,COMP(flags),abpos) order #define COMPARE(lp,rp) \ if (lp->aread > rp->aread) \ bigger = 1; \ else if (lp->aread < rp->aread) \ bigger = 0; \ else if (lp->bread > rp->bread) \ bigger = 1; \ else if (lp->bread < rp->bread) \ bigger = 0; \ else if (COMP(lp->flags) > COMP(rp->flags)) \ bigger = 1; \ else if (COMP(lp->flags) < COMP(rp->flags)) \ bigger = 0; \ else if (lp->path.abpos > rp->path.abpos) \ bigger = 1; \ else \ bigger = 0; static void reheap(int s, Overlap **heap, int hsize) { int c, l, r; int bigger; Overlap *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; if (r > hsize) bigger = 1; else { hr = heap[r]; COMPARE(hr,hl) } if (bigger) { COMPARE(hs,hl) if (bigger) { heap[c] = hl; c = l; } else break; } else { COMPARE(hs,hr) if (bigger) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } // Heap sort of records according to (aread,abpos) order #define MAPARE(lp,rp) \ if (lp->aread > rp->aread) \ bigger = 1; \ else if (lp->aread < rp->aread) \ bigger = 0; \ else if (lp->path.abpos > rp->path.abpos) \ bigger = 1; \ else \ bigger = 0; static void maheap(int s, Overlap **heap, int hsize) { int c, l, r; int bigger; Overlap *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; if (r > hsize) bigger = 1; else { hr = heap[r]; MAPARE(hr,hl) } if (bigger) { MAPARE(hs,hl) if (bigger) { heap[c] = hl; c = l; } else break; } else { MAPARE(hs,hr) if (bigger) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } #ifdef DEBUG static void showheap(Overlap **heap, int hsize) { int i; printf("\n"); for (i = 1; i <= hsize; 
i++) printf(" %3d: %5d, %5d\n",i,heap[i]->aread,heap[i]->bread); } #endif // Input block data structure and block fetcher typedef struct { FILE *stream; char *block; char *ptr; char *top; int64 count; } IO_block; static void ovl_reload(IO_block *in, int64 bsize) { int64 remains; remains = in->top - in->ptr; if (remains > 0) memcpy(in->block, in->ptr, remains); in->ptr = in->block; in->top = in->block + remains; in->top += fread(in->top,1,bsize-remains,in->stream); } // The program int main(int argc, char *argv[]) { IO_block *in; int64 bsize, osize, psize; char *block, *oblock; int i, fway; Overlap **heap; int hsize; Overlap *ovls; int64 totl; int tspace, tbytes; FILE *output; char *optr, *otop; int VERBOSE; int MAP_SORT; // Process command line { int j, k; int flags[128]; ARG_INIT("LAmerge") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("vc") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; MAP_SORT = flags['c']; if (argc < 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } fway = argc-2; if (fway > 252) { fprintf(stderr,"Exceeded maximum # of inputs and outputs (252) of merge\n"); exit (1); } } // Open all the input files and initialize their buffers psize = sizeof(void *); osize = sizeof(Overlap) - psize; bsize = (MEMORY*1000000ll)/(fway + 1); block = (char *) Malloc(bsize*(fway+1)+psize,"Allocating LAmerge blocks"); in = (IO_block *) Malloc(sizeof(IO_block)*fway,"Allocating LAmerge IO-reacords"); if (block == NULL || in == NULL) exit (1); block += psize; totl = 0; tbytes = 0; tspace = 0; for (i = 0; i < fway; i++) { int64 novl; int mspace; FILE *input; char *pwd, *root; char *iblock; pwd = PathTo(argv[i+2]); root = Root(argv[i+2],".las"); input = Fopen(Catenate(pwd,"/",root,".las"),"r"); if (input == NULL) exit (1); free(pwd); free(root); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR totl += novl; if (fread(&mspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (i == 0) { tspace = mspace; if (tspace <= 
TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); } else if (tspace != mspace) { fprintf(stderr,"%s: PT-point spacing conflict (%d vs %d)\n",Prog_Name,tspace,mspace); exit (1); } in[i].stream = input; in[i].block = iblock = block+i*bsize; in[i].ptr = iblock; in[i].top = iblock + fread(in[i].block,1,bsize,input); in[i].count = 0; } // Open the output file buffer and write (novl,tspace) header { char *pwd, *root; pwd = PathTo(argv[1]); root = Root(argv[1],".las"); output = Fopen(Catenate(pwd,"/",root,".las"),"w"); if (output == NULL) exit (1); free(pwd); free(root); fwrite(&totl,sizeof(int64),1,output); fwrite(&tspace,sizeof(int),1,output); oblock = block+fway*bsize; optr = oblock; otop = oblock + bsize; } if (VERBOSE) { printf("Merging %d files totalling ",fway); Print_Number(totl,0,stdout); printf(" records\n"); } // Initialize the heap heap = (Overlap **) Malloc(sizeof(Overlap *)*(fway+1),"Allocating heap"); ovls = (Overlap *) Malloc(sizeof(Overlap)*fway,"Allocating heap"); if (heap == NULL || ovls == NULL) exit (1); hsize = 0; for (i = 0; i < fway; i++) { if (in[i].ptr < in[i].top) { ovls[i] = *((Overlap *) (in[i].ptr - psize)); in[i].ptr += osize; hsize += 1; heap[hsize] = ovls + i; } } if (hsize > 3) { if (MAP_SORT) for (i = hsize/2; i > 1; i--) maheap(i,heap,hsize); else for (i = hsize/2; i > 1; i--) reheap(i,heap,hsize); } // While the heap is not empty do while (hsize > 0) { Overlap *ov; IO_block *src; int64 tsize, span; if (MAP_SORT) maheap(1,heap,hsize); else reheap(1,heap,hsize); ov = heap[1]; src = in + (ov - ovls); src->count += 1; tsize = ov->path.tlen*tbytes; span = osize + tsize; if (src->ptr + span > src->top) ovl_reload(src,bsize); if (optr + span > otop) { fwrite(oblock,1,optr-oblock,output); optr = oblock; } memcpy(optr,((char *) ov) + psize,osize); optr += osize; memcpy(optr,src->ptr,tsize); optr += tsize; src->ptr += tsize; if (src->ptr < src->top) { *ov = *((Overlap *) (src->ptr - psize)); src->ptr += osize; } else { heap[1] = 
heap[hsize]; hsize -= 1; } } // Flush output buffer and wind up if (optr > oblock) fwrite(oblock,1,optr-oblock,output); fclose(output); for (i = 0; i < fway; i++) fclose(in[i].stream); for (i = 0; i < fway; i++) totl -= in[i].count; if (totl != 0) { fprintf(stderr,"%s: Did not write all records (%lld)\n",argv[0],totl); exit (1); } free(ovls); free(heap); free(in); free(block-psize); exit (0); } DALIGNER-master/LAshow.c000066400000000000000000000403321263373675100151140ustar00rootroot00000000000000/******************************************************************************************* * * Utility for displaying the overlaps in a .las file in a variety of ways including * a minimal listing of intervals, a cartoon, and a full out alignment. * * Author: Gene Myers * Creation: July 2013 * Last Mod: Jan 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage[] = { "[-caroUF] [-i] [-w] [-b] ", " [ ] [ | ... 
]" }; #define LAST_READ_SYMBOL '$' static int ORDER(const void *l, const void *r) { int x = *((int *) l); int y = *((int *) r); return (x-y); } int main(int argc, char *argv[]) { HITS_DB _db1, *db1 = &_db1; HITS_DB _db2, *db2 = &_db2; Overlap _ovl, *ovl = &_ovl; Alignment _aln, *aln = &_aln; FILE *input; int sameDB; int64 novl; int tspace, tbytes, small; int reps, *pts; int input_pts; int ALIGN, CARTOON, REFERENCE, OVERLAP; int FLIP, MAP; int INDENT, WIDTH, BORDER, UPPERCASE; int ISTWO; // Process options { int i, j, k; int flags[128]; char *eptr; ARG_INIT("LAshow") INDENT = 4; WIDTH = 100; BORDER = 10; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("caroUFM") break; case 'i': ARG_NON_NEGATIVE(INDENT,"Indent") break; case 'w': ARG_POSITIVE(WIDTH,"Alignment width") break; case 'b': ARG_NON_NEGATIVE(BORDER,"Alignment border") break; } else argv[j++] = argv[i]; argc = j; CARTOON = flags['c']; ALIGN = flags['a']; REFERENCE = flags['r']; OVERLAP = flags['o']; UPPERCASE = flags['U']; FLIP = flags['F']; MAP = flags['M']; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); exit (1); } } // Open trimmed DB or DB pair { int status; char *pwd, *root; FILE *input; struct stat stat1, stat2; ISTWO = 0; status = Open_DB(argv[1],db1); if (status < 0) exit (1); if (db1->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } sameDB = 1; if (argc > 3) { pwd = PathTo(argv[3]); root = Root(argv[3],".las"); if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) { ISTWO = 1; fclose(input); status = Open_DB(argv[2],db2); if (status < 0) exit (1); if (db2->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); exit (1); } stat(Catenate(db1->path,"","",".idx"),&stat1); stat(Catenate(db2->path,"","",".idx"),&stat2); if (stat1.st_ino != stat2.st_ino) sameDB = 0; Trim_DB(db2); } 
else db2 = db1; free(root); free(pwd); } else db2 = db1; Trim_DB(db1); } // Process read index arguments into a sorted list of read ranges input_pts = 0; if (argc == ISTWO+4) { if (argv[ISTWO+3][0] != LAST_READ_SYMBOL || argv[ISTWO+3][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[ISTWO+3],&eptr,10); if (eptr > argv[ISTWO+3] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if (input_pts) { int v, x; FILE *input; input = Fopen(argv[ISTWO+3],"r"); if (input == NULL) exit (1); reps = 0; while ((v = fscanf(input," %d",&x)) != EOF) if (v == 0) { fprintf(stderr,"%s: %d'th item of input file %s is not an integer\n", Prog_Name,reps+1,argv[2]); exit (1); } else reps += 1; reps *= 2; pts = (int *) Malloc(sizeof(int)*reps,"Allocating read parameters"); if (pts == NULL) exit (1); rewind(input); for (v = 0; v < reps; v += 2) { fscanf(input," %d",&x); pts[v] = pts[v+1] = x; } fclose(input); } else { pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 3+ISTWO) { int c, b, e; char *eptr, *fptr; for (c = 3+ISTWO; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db1->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == '\0') { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = INT32_MAX; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } 
qsort(pts,reps/2,sizeof(int64),ORDER); b = 0; for (c = 0; c < reps; c += 2) if (b > 0 && pts[b-1] >= pts[c]-1) { if (pts[c+1] > pts[b-1]) pts[b-1] = pts[c+1]; } else { pts[b++] = pts[c]; pts[b++] = pts[c+1]; } pts[b++] = INT32_MAX; reps = b; } else { pts[reps++] = 1; pts[reps++] = INT32_MAX; } } // Initiate file reading and read (novl, tspace) header { char *over, *pwd, *root; pwd = PathTo(argv[2+ISTWO]); root = Root(argv[2+ISTWO],".las"); over = Catenate(pwd,"/",root,".las"); input = Fopen(over,"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = sizeof(uint16); } printf("\n%s: ",root); Print_Number(novl,0,stdout); printf(" records\n"); free(pwd); free(root); } // Read the file and display selected records { int j; uint16 *trace; Work_Data *work; int tmax; int in, npt, idx, ar; int64 tps; char *abuffer, *bbuffer; int ar_wide, br_wide; int ai_wide, bi_wide; int mn_wide, mx_wide; int tp_wide; int blast, match, seen, lhalf, rhalf; aln->path = &(ovl->path); if (ALIGN || REFERENCE) { work = New_Work_Data(); abuffer = New_Read_Buffer(db1); bbuffer = New_Read_Buffer(db2); } else { abuffer = NULL; bbuffer = NULL; work = NULL; } tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); in = 0; npt = pts[0]; idx = 1; ar_wide = Number_Digits((int64) db1->nreads); br_wide = Number_Digits((int64) db2->nreads); ai_wide = Number_Digits((int64) db1->maxlen); bi_wide = Number_Digits((int64) db2->maxlen); if (db1->maxlen < db2->maxlen) { mn_wide = ai_wide; mx_wide = bi_wide; tp_wide = Number_Digits((int64) db1->maxlen/tspace+2); } else { mn_wide = bi_wide; mx_wide = ai_wide; tp_wide = Number_Digits((int64) db2->maxlen/tspace+2); } ar_wide += (ar_wide-1)/3; br_wide += (br_wide-1)/3; ai_wide += (ai_wide-1)/3; bi_wide += (bi_wide-1)/3; 
mn_wide += (mn_wide-1)/3; tp_wide += (tp_wide-1)/3; if (FLIP) { int x; x = ar_wide; ar_wide = br_wide; br_wide = x; x = ai_wide; ai_wide = bi_wide; bi_wide = x; } // For each record do blast = -1; match = 0; seen = 0; lhalf = rhalf = 0; for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); if (ovl->path.tlen > tmax) { tmax = ((int) 1.2*ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); } ovl->path.trace = (void *) trace; Read_Trace(input,ovl,tbytes); // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ovl->path.tlen/2; if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != aln->alen && ovl->path.bepos != aln->blen) continue; } // If -M option then check the completeness of the implied mapping if (MAP) { while (ovl->bread != blast) { if (!match && seen && !(lhalf && rhalf)) { printf("Missing "); Print_Number((int64) blast+1,br_wide+1,stdout); printf(" %d ->%lld\n",db2->reads[blast].rlen,db2->reads[blast].coff); } match = 0; seen = 0; lhalf = rhalf = 0; blast += 1; } seen = 1; if (ovl->path.abpos == 0) rhalf = 1; if (ovl->path.aepos == aln->alen) lhalf = 1; if (ovl->path.bbpos != 0 || ovl->path.bepos != aln->blen) continue; match = 1; } // Display it if (ALIGN || CARTOON || REFERENCE) printf("\n"); if (FLIP) { Flip_Alignment(aln,0); Print_Number((int64) ovl->bread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) ovl->aread+1,br_wide+1,stdout); } else { Print_Number((int64) ovl->aread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) 
ovl->bread+1,br_wide+1,stdout); } if (COMP(ovl->flags)) printf(" c"); else printf(" n"); printf(" ["); Print_Number((int64) ovl->path.abpos,ai_wide,stdout); printf(".."); Print_Number((int64) ovl->path.aepos,ai_wide,stdout); printf("] x ["); Print_Number((int64) ovl->path.bbpos,bi_wide,stdout); printf(".."); Print_Number((int64) ovl->path.bepos,bi_wide,stdout); printf("]"); if (ALIGN || CARTOON || REFERENCE) { if (ALIGN || REFERENCE) { char *aseq, *bseq; int amin, amax; int bmin, bmax; int self; if (FLIP) Flip_Alignment(aln,0); if (small) Decompress_TraceTo16(ovl); self = sameDB && (ovl->aread == ovl->bread) && !COMP(ovl->flags); amin = ovl->path.abpos - BORDER; if (amin < 0) amin = 0; amax = ovl->path.aepos + BORDER; if (amax > aln->alen) amax = aln->alen; if (COMP(aln->flags)) { bmin = (aln->blen-ovl->path.bepos) - BORDER; if (bmin < 0) bmin = 0; bmax = (aln->blen-ovl->path.bbpos) + BORDER; if (bmax > aln->blen) bmax = aln->blen; } else { bmin = ovl->path.bbpos - BORDER; if (bmin < 0) bmin = 0; bmax = ovl->path.bepos + BORDER; if (bmax > aln->blen) bmax = aln->blen; if (self) { if (bmin < amin) amin = bmin; if (bmax > amax) amax = bmax; } } aseq = Load_Subread(db1,ovl->aread,amin,amax,abuffer,0); if (!self) bseq = Load_Subread(db2,ovl->bread,bmin,bmax,bbuffer,0); else bseq = aseq; aln->aseq = aseq - amin; if (COMP(aln->flags)) { Complement_Seq(bseq,bmax-bmin); aln->bseq = bseq - (aln->blen - bmax); } else if (self) aln->bseq = aln->aseq; else aln->bseq = bseq - bmin; Compute_Trace_PTS(aln,work,tspace,GREEDIEST); if (FLIP) { if (COMP(aln->flags)) { Complement_Seq(aseq,amax-amin); Complement_Seq(bseq,bmax-bmin); aln->aseq = aseq - (aln->alen - amax); aln->bseq = bseq - bmin; } Flip_Alignment(aln,1); } } if (CARTOON) { printf(" ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n\n"); Alignment_Cartoon(stdout,aln,INDENT,mx_wide); } else { printf(" : = "); Print_Number((int64) ovl->path.diffs,mn_wide,stdout); printf(" diffs ("); 
Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n"); } if (REFERENCE) Print_Reference(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); if (ALIGN) Print_Alignment(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); } else { printf(" : < "); Print_Number((int64) ovl->path.diffs,mn_wide,stdout); printf(" diffs ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n"); } } free(trace); if (ALIGN) { free(bbuffer-1); free(abuffer-1); Free_Work_Data(work); } } Close_DB(db1); if (ISTWO) Close_DB(db2); exit (0); } DALIGNER-master/LAsort.c000066400000000000000000000132011263373675100151160ustar00rootroot00000000000000/******************************************************************************************* * * Load a file U.las of overlaps into memory, sort them all by A,B index, * and then output the result to U.S.las * * Author: Gene Myers * Date : July 2013 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-v] ..."; #define MEMORY 1000 // How many megabytes for output buffer static char *IBLOCK; static int SORT_OVL(const void *x, const void *y) { int64 l = *((int64 *) x); int64 r = *((int64 *) y); Overlap *ol, *or; int al, ar; int bl, br; int cl, cr; int pl, pr; ol = (Overlap *) (IBLOCK+l); or = (Overlap *) (IBLOCK+r); al = ol->aread; ar = or->aread; if (al != ar) return (al-ar); bl = ol->bread; br = or->bread; if (bl != br) return (bl-br); cl = COMP(ol->flags); cr = COMP(ol->flags); if (cl != cr) return (cl-cr); pl = ol->path.abpos; pr = or->path.abpos; return (pl-pr); } static int SORT_MAP(const void *x, const void *y) { int64 l = *((int64 *) x); int64 r = *((int64 *) y); Overlap *ol, *or; int al, ar; int pl, pr; ol = (Overlap *) (IBLOCK+l); or = (Overlap *) (IBLOCK+r); al = ol->aread; ar = or->aread; if (al != ar) return (al-ar); pl = ol->path.abpos; pr = or->path.abpos; return 
(pl-pr); } int main(int argc, char *argv[]) { char *iblock, *fblock; int64 isize, osize; int64 ovlsize, ptrsize; int tspace, tbytes; int i; int VERBOSE; int MAP_ORDER; // Process options { int j, k; int flags[128]; ARG_INIT("LAsort") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("vc") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; MAP_ORDER = flags['c']; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // For each file do ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; isize = 0; iblock = NULL; osize = MEMORY * 1000000ll; fblock = Malloc(osize,"Allocating LAsort output block"); for (i = 1; i < argc; i++) { int64 *perm; FILE *input, *foutput; int64 novl; // Read in the entire file and output header { int64 size; struct stat info; char *pwd, *root, *name; pwd = PathTo(argv[i]); root = Root(argv[i],".las"); name = Catenate(pwd,"/",root,".las"); input = Fopen(name,"r"); if (input == NULL) exit (1); stat(name,&info); size = info.st_size; if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); if (VERBOSE) { printf(" %s: ",root); Print_Number(novl,0,stdout); printf(" records "); Print_Number(size-novl*ovlsize,0,stdout); printf(" trace bytes\n"); fflush(stdout); } foutput = Fopen(Catenate(pwd,"/",root,".S.las"),"w"); if (foutput == NULL) exit (1); fwrite(&novl,sizeof(int64),1,foutput); fwrite(&tspace,sizeof(int),1,foutput); free(pwd); free(root); if (size > isize) { if (iblock == NULL) iblock = Malloc(size+ptrsize,"Allocating LAsort input block"); else iblock = Realloc(iblock-ptrsize,size+ptrsize,"Allocating LAsort input block"); if (iblock == NULL) exit (1); iblock += ptrsize; isize = size; } size -= (sizeof(int64) + sizeof(int)); if (size > 0) { if (fread(iblock,size,1,input) != 1) SYSTEM_ERROR } fclose(input); } // Set up unsorted permutation array 
perm = (int64 *) Malloc(sizeof(int64)*novl,"Allocating LAsort permutation vector"); if (perm == NULL) exit (1); { int64 off; int j; off = -ptrsize; for (j = 0; j < novl; j++) { perm[j] = off; off += ovlsize + ((Overlap *) (iblock+off))->path.tlen*tbytes; } } // Sort permutation array of ptrs to records IBLOCK = iblock; if (MAP_ORDER) qsort(perm,novl,sizeof(int64),SORT_MAP); else qsort(perm,novl,sizeof(int64),SORT_OVL); // Output the records in sorted order { int j; Overlap *w; int64 tsize, span; char *fptr, *ftop; fptr = fblock; ftop = fblock + osize; for (j = 0; j < novl; j++) { w = (Overlap *) (iblock+perm[j]); tsize = w->path.tlen*tbytes; span = ovlsize + tsize; if (fptr + span > ftop) { fwrite(fblock,1,fptr-fblock,foutput); fptr = fblock; } memcpy(fptr,((char *) w)+ptrsize,ovlsize); fptr += ovlsize; memcpy(fptr,(char *) (w+1),tsize); fptr += tsize; } if (fptr > fblock) fwrite(fblock,1,fptr-fblock,foutput); } free(perm); fclose(foutput); } if (iblock != NULL) free(iblock - ptrsize); free(fblock); exit (0); } DALIGNER-master/LAsplit.c000066400000000000000000000130341263373675100152660ustar00rootroot00000000000000/******************************************************************************************* * * Split an OVL file arriving from the standard input into 'parts' equal sized .las-files * .1.las, .2.las ... 
or according to a current partitioning of * * Author: Gene Myers * Date : June 2014 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = " ( | ) < .las"; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { char *iblock, *oblock; FILE *output, *dbvis; int64 novl, bsize, ovlsize, ptrsize; int parts, tspace, tbytes; int olast, blast; char *root, *pwd; Prog_Name = Strdup("LAsplit",""); if (argc != 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } { char *eptr; int nfiles, cutoff, all; int64 size; char buffer[2*MAX_NAME+100]; parts = strtol(argv[2],&eptr,10); if (*eptr != '\0') { pwd = PathTo(argv[2]); if (strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0) root = Root(argv[2],".dam"); else root = Root(argv[2],".db"); dbvis = fopen(Catenate(pwd,"/",root,".dam"),"r"); if (dbvis == NULL) { dbvis = fopen(Catenate(pwd,"/",root,".db"),"r"); if (dbvis == NULL) { fprintf(stderr,"%s: Second argument '%s' is not an integer or a DB\n", Prog_Name,argv[2]); exit (1); } } free(pwd); free(root); if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR while (nfiles-- > 0) if (fgets(buffer,2*MAX_NAME+100,dbvis) == NULL) SYSTEM_ERROR parts = 0; if (fscanf(dbvis,DB_NBLOCK,&parts) != 1) { fprintf(stderr,"%s: DB %s has not been partitioned\n",Prog_Name,argv[2]); exit (1); } if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) SYSTEM_ERROR if (fscanf(dbvis,DB_BDATA,&olast,&blast) != 2) SYSTEM_ERROR } else { dbvis = NULL; if (parts <= 0) { fprintf(stderr,"%s: Number of parts is not positive\n",Prog_Name); exit (1); } } } ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; oblock = (char *) Malloc(bsize,"Allocating output block"); iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); if (oblock == NULL || iblock == NULL) exit 
(1); iblock += ptrsize; pwd = PathTo(argv[1]); root = Root(argv[1],".las"); if (fread(&novl,sizeof(int64),1,stdin) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,stdin) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); { int i, j; Overlap *w; int low, hgh, last; int64 tsize, povl; char *iptr, *itop; char *optr, *otop; iptr = iblock; itop = iblock + fread(iblock,1,bsize,stdin); hgh = 0; for (i = 0; i < parts; i++) { output = Fopen(Catenate(pwd,"/",root,Numbered_Suffix(".",i+1,".las")),"w"); if (output == NULL) exit (1); low = hgh; if (dbvis != NULL) { if (fscanf(dbvis,DB_BDATA,&olast,&blast) != 2) SYSTEM_ERROR last = blast-1; hgh = 0; } else { last = 0; hgh = (novl*(i+1))/parts; } povl = 0; fwrite(&povl,sizeof(int64),1,output); fwrite(&tspace,sizeof(int),1,output); optr = oblock; otop = oblock + bsize; for (j = low; j < novl; j++) { if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,stdin); } w = (Overlap *) (iptr-ptrsize); if (dbvis == NULL) { if (j >= hgh && w->aread > last) break; last = w->aread; } else { if (w->aread > last) break; } tsize = w->path.tlen*tbytes; if (optr + ovlsize + tsize > otop) { fwrite(oblock,1,optr-oblock,output); optr = oblock; } memcpy(optr,iptr,ovlsize); optr += ovlsize; iptr += ovlsize; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,stdin); } memcpy(optr,iptr,tsize); optr += tsize; iptr += tsize; } hgh = j; if (optr > oblock) fwrite(oblock,1,optr-oblock,output); rewind(output); povl = hgh-low; fwrite(&povl,sizeof(int64),1,output); fclose(output); } } free(pwd); free(root); free(iblock-ptrsize); free(oblock); exit (0); } 
DALIGNER-master/LAupgrade.Dec.31.2014.c000066400000000000000000000072711263373675100170710ustar00rootroot00000000000000/******************************************************************************************* * * Convert older .las files so that the alen and blen fields are removed. * * Author: Gene Myers * Date : Dec 2014 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" typedef struct { void *trace; uint16 tlen; uint16 diffs; uint16 abpos, bbpos; uint16 aepos, bepos; } PathOld; typedef struct { PathOld path; int aread; int bread; uint16 alen; uint16 blen; int flags; } OverlapOld; static char *Usage = " > .las"; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { char *iblock, *oblock; FILE *input; int64 novl, bsize, ovlsize, newsize, ptrsize; int tspace, tbytes; char *pwd, *root; Prog_Name = Strdup("Upgrade",""); if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } ptrsize = sizeof(void *); ovlsize = sizeof(OverlapOld) - ptrsize; newsize = sizeof(Overlap ) - ptrsize; bsize = MEMORY * 1000000ll; oblock = (char *) Malloc(bsize,"Allocating output block"); iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); if (oblock == NULL || iblock == NULL) exit (1); iblock += ptrsize; pwd = PathTo(argv[1]); root = Root(argv[1],".las"); input = Fopen(Catenate(pwd,"/",root,".las"),"r"); if (input == NULL) exit (1); free(pwd); free(root); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); fwrite(&novl,sizeof(int64),1,stdout); fwrite(&tspace,sizeof(int),1,stdout); { int j; OverlapOld *w; Overlap *v; int64 tsize; char *iptr, *itop; char *optr, *otop; optr = oblock; otop = oblock + bsize; iptr = iblock; itop = 
iblock + fread(iblock,1,bsize,input); for (j = 0; j < novl; j++) { if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } w = (OverlapOld *) (iptr - ptrsize); tsize = w->path.tlen*tbytes; if (optr + newsize + tsize > otop) { fwrite(oblock,1,optr-oblock,stdout); optr = oblock; } v = (Overlap *) (optr - ptrsize); v->path.abpos = w->path.abpos; v->path.bbpos = w->path.bbpos; v->path.aepos = w->path.aepos; v->path.bepos = w->path.bepos; v->path.diffs = w->path.diffs; v->path.tlen = w->path.tlen; v->aread = w->aread; v->bread = w->bread; v->flags = w->flags; optr += newsize; iptr += ovlsize; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } memcpy(optr,iptr,tsize); optr += tsize; iptr += tsize; } if (optr > oblock) fwrite(oblock,1,optr-oblock,stdout); } fclose(input); free(oblock); free(iblock-ptrsize); exit (0); } DALIGNER-master/LICENSE000066400000000000000000000053111263373675100145560ustar00rootroot00000000000000 Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: · Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. · Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. · The name of EWM may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. For any issues regarding this software and its use, contact EWM at: Eugene W. Myers Jr. Bautzner Str. 122e 01099 Dresden GERMANY Email: gene.myers@gmail.com DALIGNER-master/Makefile000066400000000000000000000032721263373675100152150ustar00rootroot00000000000000CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing ALL = daligner HPCdaligner HPCmapper LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck LAindex all: $(ALL) daligner: daligner.c filter.c filter.h align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o daligner daligner.c filter.c align.c DB.c QV.c -lpthread -lm HPCdaligner: HPCdaligner.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o HPCdaligner HPCdaligner.c DB.c QV.c -lm HPCmapper: HPCmapper.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o HPCmapper HPCmapper.c DB.c QV.c -lm LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm LAsplit: LAsplit.c align.h 
DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm LAupgrade.Dec.31.2014: LAupgrade.Dec.31.2014.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAupgrade.Dec.31.2014 LAupgrade.Dec.31.2014.c align.c DB.c QV.c -lm LAindex: LAindex.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAindex LAindex.c align.c DB.c QV.c -lm clean: rm -f $(ALL) rm -fr *.dSYM rm -f LAupgrade.Dec.31.2014 rm -f daligner.tar.gz install: cp $(ALL) ~/bin package: make clean tar -zcf daligner.tar.gz README *.h *.c Makefile DALIGNER-master/QV.c000066400000000000000000001051451263373675100142510ustar00rootroot00000000000000/******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" #undef DEBUG #define MIN_BUFFER 1000 #define HUFF_CUTOFF 16 // This cannot be larger than 16 ! /******************************************************************************************* * * Endian flipping routines * ********************************************************************************************/ static int LittleEndian; // Little-endian machine ? 
// Referred by: Decode & Decode_Run static int Flip; // Flip endian of all coded shorts and ints // Referred by: Decode & Decode_Run & Read_Scheme static void Set_Endian(int flip) { uint32 x = 3; uint8 *b = (uint8 *) (&x); Flip = flip; LittleEndian = (b[0] == 3); } static void Flip_Long(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[3]; v[3] = x; x = v[1]; v[1] = v[2]; v[2] = x; } static void Flip_Short(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[1]; v[1] = x; } /******************************************************************************************* * * Routines for computing a Huffman Encoding Scheme * ********************************************************************************************/ typedef struct { int type; // 0 => normal, 1 => normal but has long codes, 2 => truncated uint32 codebits[256]; // If type = 2, then code 255 is the special code for int codelens[256]; // non-Huffman exceptions int lookup[0x10000]; // Lookup table (just for decoding) } HScheme; typedef struct _HTree { struct _HTree *lft, *rgt; uint64 count; } HTree; // Establish heap property from node s down (1 is root, siblings of n are 2n and 2n+1) // assuming s is the only perturbation in the tree. static void Reheap(int s, HTree **heap, int hsize) { int c, l, r; HTree *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; hr = heap[r]; if (r > hsize || hr->count > hl->count) { if (hs->count > hl->count) { heap[c] = hl; c = l; } else break; } else { if (hs->count > hr->count) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } // Given Huffman tree build a table of codes from it, the low-order codelens[s] bits // of codebits[s] contain the code for symbol s. 
static void Build_Table(HTree *node, int code, int len, uint32 *codebits, int *codelens) { if (node->rgt == NULL) { uint64 symbol = (uint64) (node->lft); codebits[symbol] = code; codelens[symbol] = len; } else { code <<= 1; len += 1; Build_Table(node->lft,code,len,codebits,codelens); Build_Table(node->rgt,code+1,len,codebits,codelens); } } // For the non-zero symbols in hist, compute a huffman tree over them, and then // build a table of the codes. If inscheme is not NULL, then place all symbols // with code 255 or with more than HUFF_CUTOFF bits in the encoding by inscheme // as a single united entity, whose code signals that the value of these symbols // occur explicitly in 8 (values) or 16 (run lengths) bits following the code. // All the symbols in this class will have the same entry in the code table and // 255 is always in this class. static HScheme *Huffman(uint64 *hist, HScheme *inscheme) { HScheme *scheme; HTree *heap[259]; HTree node[512]; int hsize; HTree *lft, *rgt; int value, range; int i; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); hsize = 0; // Load heap value = 0; if (inscheme != NULL) { node[0].count = 0; node[0].lft = (HTree *) (uint64) 255; node[0].rgt = NULL; heap[++hsize] = node+(value++); } for (i = 0; i < 256; i++) if (hist[i] > 0) { if (inscheme != NULL && (inscheme->codelens[i] > HUFF_CUTOFF || i == 255)) node[0].count += hist[i]; else { node[value].count = hist[i]; node[value].lft = (HTree *) (uint64) i; node[value].rgt = NULL; heap[++hsize] = node+(value++); } } for (i = hsize/2; i >= 1; i--) // Establish heap property Reheap(i,heap,hsize); range = value; // Merge pairs with smallest count until have a tree for (i = 1; i < value; i++) { lft = heap[1]; heap[1] = heap[hsize--]; Reheap(1,heap,hsize); rgt = heap[1]; node[range].lft = lft; node[range].rgt = rgt; node[range].count = lft->count + rgt->count; heap[1] = node+(range++); Reheap(1,heap,hsize); } for (i = 0; i < 
256; i++) // Build the code table { scheme->codebits[i] = 0; scheme->codelens[i] = 0; } Build_Table(node+(range-1),0,0,scheme->codebits,scheme->codelens); if (inscheme != NULL) // Set scheme type and if truncated (2), map truncated codes { scheme->type = 2; // to code and length for 255 for (i = 0; i < 255; i++) if (inscheme->codelens[i] > HUFF_CUTOFF || scheme->codelens[i] > HUFF_CUTOFF) { scheme->codelens[i] = scheme->codelens[255]; scheme->codebits[i] = scheme->codebits[255]; } } else { scheme->type = 0; for (i = 0; i < 256; i++) { if (scheme->codelens[i] > HUFF_CUTOFF) scheme->type = 1; } } return (scheme); } #ifdef DEBUG // For debug, show the coding table static void Print_Table(HScheme *scheme, uint64 *hist, int infosize) { uint64 total_bits; uint32 specval, mask, code, *bits; int speclen, clen, *lens; int i, k; total_bits = 0; bits = scheme->codebits; lens = scheme->codelens; if (scheme->type == 2) { specval = bits[255]; speclen = lens[255]; } else specval = speclen = 0x7fffffff; printf("\nCode Table:\n"); for (i = 0; i < 256; i++) if (lens[i] > 0) { clen = lens[i]; mask = (1 << clen); code = bits[i]; printf(" %3d: %2d ",i,clen); for (k = 0; k < clen; k++) { mask >>= 1; if (code & mask) printf("1"); else printf("0"); } if (code == specval && clen == speclen) { printf(" ***"); if (hist != NULL) total_bits += (clen+infosize)*hist[i]; } else if (hist != NULL) total_bits += clen*hist[i]; printf("\n"); } if (hist != NULL) printf("\nTotal Bytes = %lld\n",(total_bits-1)/8+1); } // For debug, show the histogram static void Print_Histogram(uint64 *hist) { int i, low, hgh; uint64 count; for (hgh = 255; hgh >= 0; hgh--) if (hist[hgh] != 0) break; for (low = 0; low < 256; low++) if (hist[low] != 0) break; count = 0; for (i = low; i <= hgh; i++) count += hist[i]; for (i = hgh; i >= low; i--) printf(" %3d: %8llu %5.1f%%\n",i,hist[i],(hist[i]*100.)/count); } #endif /******************************************************************************************* * * Read and 
Write Huffman Schemes * ********************************************************************************************/ // Write the code table to out. static void Write_Scheme(HScheme *scheme, FILE *out) { int i; uint8 x; uint32 *bits; int *lens; lens = scheme->codelens; bits = scheme->codebits; x = (uint8) (scheme->type); fwrite(&x,1,1,out); for (i = 0; i < 256; i++) { x = (uint8) (lens[i]); fwrite(&x,1,1,out); if (x > 0) fwrite(bits+i,sizeof(uint32),1,out); } } // Allocate and read a code table from in, and return a pointer to it. static HScheme *Read_Scheme(FILE *in) { HScheme *scheme; int *look, *lens; uint32 *bits, base; int i, j, powr; uint8 x; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); lens = scheme->codelens; bits = scheme->codebits; look = scheme->lookup; if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n"); free(scheme); return (NULL); } scheme->type = x; for (i = 0; i < 256; i++) { if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i); return (NULL); } lens[i] = x; if (x > 0) { if (fread(bits+i,sizeof(uint32),1,in) != 1) { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i); free(scheme); return (NULL); } } else bits[i] = 0; } if (Flip) { for (i = 0; i < 256; i++) Flip_Long(bits+i); } for (i = 0; i < 256; i++) { if (lens[i] > 0) { base = (bits[i] << (16-lens[i])); powr = (1 << (16-lens[i])); for (j = 0; j < powr; j++) look[base+j] = i; } } return (scheme); } /******************************************************************************************* * * Encoders and Decoders * ********************************************************************************************/ // Encode read[0..rlen-1] according to scheme and write to out static void Encode(HScheme *scheme, FILE *out, uint8 *read, int rlen) { uint32 x, c, ocode; int n, k, olen, llen; int *nlens; uint32 *nbits; 
uint32 nspec; int nslen; nlens = scheme->codelens; nbits = scheme->codebits; if (scheme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; #define OCODE(L,C) \ { int len = olen + (L); \ uint32 code = (C); \ \ llen = olen; \ if (len >= 32) \ { olen = len-32; \ ocode |= (code >> olen); \ fwrite(&ocode,sizeof(uint32),1,out); \ if (olen > 0) \ ocode = (code << (32-olen)); \ else \ ocode = 0; \ } \ else \ { olen = len; \ ocode |= (code << (32-olen));; \ } \ } llen = 0; olen = 0; ocode = 0; for (k = 0; k < rlen; k++) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); } if (olen > 0) // Tricky: must pad so decoder does not read past { fwrite(&ocode,sizeof(uint32),1,out); // last integer int the coded output. if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Encode read[0..rlen-1] according to non-rchar table neme, and run-length table reme for // runs of rchar characters. Write to out. 
static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int rlen, int rchar) { uint32 x, c, ocode; int n, h, k, olen, llen; int *nlens, *rlens; uint32 *nbits, *rbits; uint32 nspec, rspec; int nslen, rslen; nlens = neme->codelens; nbits = neme->codebits; rlens = reme->codelens; rbits = reme->codebits; if (neme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; rspec = rbits[255]; rslen = rlens[255]; llen = 0; olen = 0; ocode = 0; k = 0; while (k < rlen) { h = k; while (k < rlen && read[k] == rchar) k += 1; if (k-h >= 255) x = 255; else x = k-h; n = rlens[x]; c = rbits[x]; OCODE(n,c); if (c == rspec && n == rslen) OCODE(16,k-h); if (k < rlen) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); k += 1; } } if (olen > 0) { fwrite(&ocode,sizeof(uint32),1,out); if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Read and decode from in, the next rlen symbols into read according to scheme static int Decode(HScheme *scheme, FILE *in, char *read, int rlen) { int *look, *lens; int signal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (scheme->type == 2) signal = 255; else signal = 256; lens = scheme->codelens; look = scheme->lookup; #define GET \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ return (1); \ } \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } #define GETFLIP \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more 
bits (Decode)\n"); \ return (1); \ } \ Flip_Long(ipart); \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = look[*xpart]; n = lens[c]; if (c == signal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } else for (j = 0; j < rlen; j++) { GET c = look[*xpart]; n = lens[c]; if (c == signal) { GET c = *cpart; n = 8; } read[j] = (char) c; } return (0); } // Read and decode from in, the next rlen symbols into read according to non-rchar scheme // neme, and the rchar runlength shceme reme static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, int rlen, int rchar) { int *nlook, *nlens; int *rlook, *rlens; int nsignal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c, k; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (neme->type == 2) nsignal = 255; else nsignal = 256; nlens = neme->codelens; nlook = neme->lookup; rlens = reme->codelens; rlook = reme->lookup; n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GETFLIP c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GETFLIP c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } } else for (j = 0; j < rlen; j++) { GET c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GET c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GET c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GET c = *cpart; n = 8; } read[j] = (char) c; } } return (0); } /******************************************************************************************* * * Histogrammers * 
********************************************************************************************/ // Histogram runlengths of symbol runChar in stream[0..rlen-1] into run. static void Histogram_Seqs(uint64 *hist, uint8 *stream, int rlen) { int k; for (k = 0; k < rlen; k++) hist[stream[k]] += 1; } static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar) { int k, h; k = 0; while (k < rlen) { h = k; while (k < rlen && stream[k] == runChar) k += 1; if (k-h >= 256) run[255] += 1; else run[k-h] += 1; if (k < rlen) k += 1; } } /******************************************************************************************* * * Reader * ********************************************************************************************/ static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan, static int Rmax = -1; // Compress_Next_QVentry static int Nline; // Referred by: QVcoding_Scan char *QVentry() { return (Read); } // If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines // for a sequence. Place line j at Read+j*Rmax and the length of every line is returned // unless eof occurs in which case return -1. If any error occurs return -2. 
int Read_Lines(FILE *input, int nlines) { int i, rlen; int tmax; char *tread; char *other; if (Read == NULL) { tmax = MIN_BUFFER; tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; } Nline += 1; if (fgets(Read,Rmax,input) == NULL) return (-1); rlen = strlen(Read); while (Read[rlen-1] != '\n') { tmax = ((int) 1.4*Rmax) + MIN_BUFFER; tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; if (fgets(Read+rlen,Rmax-rlen,input) == NULL) { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline); EXIT(-2); } rlen += strlen(Read+rlen); } other = Read; for (i = 1; i < nlines; i++) { other += Rmax; Nline += 1; if (fgets(other,Rmax,input) == NULL) { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT(-2); } if (rlen != (int) strlen(other)) { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline); EXIT(-2); } } return (rlen-1); } /******************************************************************************************* * * Tag compression and decompression routines * ********************************************************************************************/ // Keep only the symbols in tags[0..rlen-1] for which qvs[k] != rchar and // return the # of symbols kept. static int Pack_Tag(char *tags, char *qvs, int rlen, int rchar) { int j, k; j = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) tags[j++] = tags[k]; tags[j] = '\0'; return (j); } // Count the # of non-rchar symbols in qvs[0..rlen-1] static int Packed_Length(char *qvs, int rlen, int rchar) { int k, clen; clen = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) clen += 1; return (clen); } // Unpack tags by moving its i'th char to position k where qvs[k] is the i'th non-rchar // symbol in qvs. All other chars are set to rchar. 
rlen is the length of qvs and // the unpacked result, clen is the initial length of tags. static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar) { int j, k; j = clen-1; for (k = rlen-1; k >= 0; k--) { if (qvs[k] == rchar) tags[k] = 'n'; else tags[k] = tags[j--]; } } /******************************************************************************************* * * Statistics Scan and Scheme creation and write * ********************************************************************************************/ // Read .quiva file from input, recording stats in the histograms. If zero is set then // start the stats anew with this file. static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256]; static uint64 totChar; static int delChar, subChar; // Referred by: QVcoding_Scan, Create_QVcoding int QVcoding_Scan(FILE *input) { char *slash; int rlen; // Zero histograms bzero(delHist,sizeof(uint64)*256); bzero(mrgHist,sizeof(uint64)*256); bzero(insHist,sizeof(uint64)*256); bzero(subHist,sizeof(uint64)*256); { int i; for (i = 0; i < 256; i++) delRun[i] = subRun[i] = 1; } totChar = 0; delChar = -1; subChar = -1; // Make a sweep through the .quiva entries, histogramming the relevant things // and figuring out the run chars for the deletion and substition streams Nline = 0; while (1) { int well, beg, end, qv; rlen = Read_Lines(input,1); if (rlen == -2) EXIT(1); if (rlen < 0) break; if (rlen == 0 || Read[0] != '@') { EPRINTF(EPLACE,"Line %d: Header in quiv file is missing\n",Nline); EXIT(1); } slash = index(Read+1,'/'); if (slash == NULL) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(1); } if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(1); } rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv 
file\n",Nline); EXIT(1); } Histogram_Seqs(delHist,(uint8 *) (Read),rlen); Histogram_Seqs(insHist,(uint8 *) (Read+2*Rmax),rlen); Histogram_Seqs(mrgHist,(uint8 *) (Read+3*Rmax),rlen); Histogram_Seqs(subHist,(uint8 *) (Read+4*Rmax),rlen); if (delChar < 0) { int k; char *del = Read+Rmax; for (k = 0; k < rlen; k++) if (del[k] == 'n' || del[k] == 'N') { delChar = Read[k]; break; } } if (delChar >= 0) Histogram_Runs( delRun,(uint8 *) (Read),rlen,delChar); totChar += rlen; if (subChar < 0) { if (totChar >= 100000) { int k; subChar = 0; for (k = 1; k < 256; k++) if (subHist[k] > subHist[subChar]) subChar = k; } } if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar); } return (0); } // Using the statistics in the global stat tables, create the Huffman schemes and write // them to output. If lossy is set, then create a lossy table for the insertion and merge // QVs. QVcoding *Create_QVcoding(int lossy) { static QVcoding coding; HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; HScheme *dRunScheme, *sRunScheme; delScheme = NULL; dRunScheme = NULL; insScheme = NULL; mrgScheme = NULL; subScheme = NULL; sRunScheme = NULL; // Check whether using a subtitution run char is a win if (totChar < 200000 || subHist[subChar] < .5*totChar) subChar = -1; // If lossy encryption is enabled then scale insertions and merge QVs. 
if (lossy) { int k; for (k = 0; k < 256; k += 2) { insHist[k] += insHist[k+1]; insHist[k+1] = 0; } for (k = 0; k < 256; k += 4) { mrgHist[k] += mrgHist[k+1]; mrgHist[k] += mrgHist[k+2]; mrgHist[k] += mrgHist[k+3]; mrgHist[k+1] = 0; mrgHist[k+2] = 0; mrgHist[k+3] = 0; } } // Build a Huffman scheme for each stream entity from the histograms #define SCHEME_MACRO(meme,hist,label,bits) \ scheme = Huffman( (hist), NULL); \ if (scheme == NULL) \ goto error; \ if (scheme->type) \ { (meme) = Huffman( (hist), scheme); \ free(scheme); \ } \ else \ (meme) = scheme; #ifdef DEBUG #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) \ printf("\n%s\n", (label) ); \ Print_Histogram( (hist)); \ Print_Table( (meme), (hist), (bits)); #else #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) #endif { HScheme *scheme; if (delChar < 0) { MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs", 8); dRunScheme = NULL; } else { delHist[delChar] = 0; MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs less run char", 8); MAKE_SCHEME(dRunScheme,delRun, "Histogram of Deletion Runs QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",delChar); #endif } #ifdef DEBUG { int k; uint64 count; count = 0; for (k = 0; k < 256; k++) count += delHist[k]; printf("\nDelTag will require %lld bytes\n",count/4); } #endif MAKE_SCHEME(insScheme,insHist, "Hisotgram of Insertion QVs", 8); MAKE_SCHEME(mrgScheme,mrgHist, "Hisotgram of Merge QVs", 8); if (subChar < 0) { MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs", 8); sRunScheme = NULL; } else { subHist[subChar] = 0; MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs less run char", 8); MAKE_SCHEME(sRunScheme,subRun, "Histogram of Substitution Run QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",subChar); #endif } } // Setup endian handling Set_Endian(0); coding.delScheme = delScheme; coding.insScheme = insScheme; coding.mrgScheme = mrgScheme; coding.subScheme = 
subScheme; coding.dRunScheme = dRunScheme; coding.sRunScheme = sRunScheme; coding.delChar = delChar; coding.subChar = subChar; coding.prefix = NULL; coding.flip = 0; return (&coding); error: if (delScheme != NULL) free(delScheme); if (dRunScheme != NULL) free(dRunScheme); if (insScheme != NULL) free(insScheme); if (mrgScheme != NULL) free(mrgScheme); if (subScheme != NULL) free(subScheme); if (sRunScheme != NULL) free(sRunScheme); EXIT(NULL); } // Write the encoding scheme 'coding' to 'output' void Write_QVcoding(FILE *output, QVcoding *coding) { // Write out the endian key, run chars, and prefix (if not NULL) { uint16 half; int len; half = 0x33cc; fwrite(&half,sizeof(uint16),1,output); if (coding->delChar < 0) half = 256; else half = (uint16) (coding->delChar); fwrite(&half,sizeof(uint16),1,output); if (coding->subChar < 0) half = 256; else half = (uint16) (coding->subChar); fwrite(&half,sizeof(uint16),1,output); len = strlen(coding->prefix); fwrite(&len,sizeof(int),1,output); fwrite(coding->prefix,1,len,output); } // Write out the scheme tables Write_Scheme(coding->delScheme,output); if (coding->delChar >= 0) Write_Scheme(coding->dRunScheme,output); Write_Scheme(coding->insScheme,output); Write_Scheme(coding->mrgScheme,output); Write_Scheme(coding->subScheme,output); if (coding->subChar >= 0) Write_Scheme(coding->sRunScheme,output); } // Read the encoding scheme 'coding' to 'output' QVcoding *Read_QVcoding(FILE *input) { static QVcoding coding; // Read endian key, run chars, and short name common to all headers { uint16 half; int len; if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n"); EXIT(NULL); } coding.flip = (half != 0x33cc); if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.delChar = half; if (coding.delChar >= 256) coding.delChar = -1; if (fread(&half,sizeof(uint16),1,input) != 1) 
{ EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.subChar = half; if (coding.subChar >= 256) coding.subChar = -1; // Read the short name common to all headers if (fread(&len,sizeof(int),1,input) != 1) { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Long(&len); coding.prefix = (char *) Malloc(len+1,"Allocating header prefix"); if (coding.prefix == NULL) EXIT(NULL); if (len > 0) { if (fread(coding.prefix,len,1,input) != 1) { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n"); EXIT(NULL); } } coding.prefix[len] = '\0'; } // Setup endian handling Set_Endian(coding.flip); // Read the Huffman schemes used to compress the data coding.delScheme = NULL; coding.dRunScheme = NULL; coding.insScheme = NULL; coding.mrgScheme = NULL; coding.subScheme = NULL; coding.sRunScheme = NULL; coding.delScheme = Read_Scheme(input); if (coding.delScheme == NULL) goto error; if (coding.delChar >= 0) { coding.dRunScheme = Read_Scheme(input); if (coding.dRunScheme == NULL) goto error; } coding.insScheme = Read_Scheme(input); if (coding.insScheme == NULL) goto error; coding.mrgScheme = Read_Scheme(input); if (coding.mrgScheme == NULL) goto error; coding.subScheme = Read_Scheme(input); if (coding.subScheme == NULL) goto error; if (coding.subChar >= 0) { coding.sRunScheme = Read_Scheme(input); if (coding.sRunScheme == NULL) goto error; } return (&coding); error: if (coding.delScheme != NULL) free(coding.delScheme); if (coding.dRunScheme != NULL) free(coding.dRunScheme); if (coding.insScheme != NULL) free(coding.insScheme); if (coding.mrgScheme != NULL) free(coding.mrgScheme); if (coding.subScheme != NULL) free(coding.subScheme); if (coding.sRunScheme != NULL) free(coding.sRunScheme); EXIT(NULL); } // Free all the auxilliary storage associated with the encoding argument void Free_QVcoding(QVcoding *coding) { if (coding->subChar >= 0) 
free(coding->sRunScheme); free(coding->subScheme); free(coding->mrgScheme); free(coding->insScheme); if (coding->delChar >= 0) free(coding->dRunScheme); free(coding->delScheme); free(coding->prefix); } /******************************************************************************************* * * Encode/Decode (w.r.t. coding) next entry from input and write to output * ********************************************************************************************/ int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) { int rlen, clen; // Get all 5 streams, compress each with its scheme, and output rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT (1); } if (coding->delChar < 0) { Encode(coding->delScheme, output, (uint8 *) Read, rlen); clen = rlen; } else { Encode_Run(coding->delScheme, coding->dRunScheme, output, (uint8 *) Read, rlen, coding->delChar); clen = Pack_Tag(Read+Rmax,Read,rlen,coding->delChar); } Number_Read(Read+Rmax); Compress_Read(clen,Read+Rmax); fwrite(Read+Rmax,1,COMPRESSED_LEN(clen),output); if (lossy) { uint8 *insert = (uint8 *) (Read+2*Rmax); uint8 *merge = (uint8 *) (Read+3*Rmax); int k; for (k = 0; k < rlen; k++) { insert[k] = (uint8) ((insert[k] >> 1) << 1); merge[k] = (uint8) (( merge[k] >> 2) << 2); } } Encode(coding->insScheme, output, (uint8 *) (Read+2*Rmax), rlen); Encode(coding->mrgScheme, output, (uint8 *) (Read+3*Rmax), rlen); if (coding->subChar < 0) Encode(coding->subScheme, output, (uint8 *) (Read+4*Rmax), rlen); else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) (Read+4*Rmax), rlen, coding->subChar); return (0); } int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) { int clen, tlen; // Decode each stream and write to output if (coding->delChar < 0) { if (Decode(coding->delScheme, input, entry[0], rlen)) EXIT(1); clen = rlen; tlen = COMPRESSED_LEN(clen); if (tlen > 0) 
{ if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); } else { if (Decode_Run(coding->delScheme, coding->dRunScheme, input, entry[0], rlen, coding->delChar)) EXIT(1); clen = Packed_Length(entry[0],rlen,coding->delChar); tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar); } if (Decode(coding->insScheme, input, entry[2], rlen)) EXIT(1); if (Decode(coding->mrgScheme, input, entry[3], rlen)) EXIT(1); if (coding->subChar < 0) { if (Decode(coding->subScheme, input, entry[4], rlen)) EXIT(1); } else { if (Decode_Run(coding->subScheme, coding->sRunScheme, input, entry[4], rlen, coding->subChar)) EXIT(1); } return (0); } DALIGNER-master/QV.h000066400000000000000000000104001263373675100142430ustar00rootroot00000000000000/******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #ifndef _QV_COMPRESSOR #define _QV_COMPRESSOR // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or // batch version of the routines in this library are compiled. In batch mode, routines // print an error message and exit. 
//  In interactive mode, the routines place the error
//    message in EPLACE (also defined in DB.h) and return an error value, typically NULL
//    if the routine returns a pointer, and an unusual integer value if the routine returns
//    an integer.
//  Below when an error return is described, one should understand that this value is returned
//    only if the routine was compiled in INTERACTIVE mode.

//  A PacBio compression scheme.
//    The run-length schemes are only used when the corresponding run character is
//    non-negative: the coder tests "delChar < 0" / "subChar < 0" to choose plain Huffman
//    coding, and Free_QVcoding frees dRunScheme only when "delChar >= 0".

typedef struct
  { void    *delScheme;   //  Huffman scheme for deletion QVs
    void    *insScheme;   //  Huffman scheme for insertion QVs
    void    *mrgScheme;   //  Huffman scheme for merge QVs
    void    *subScheme;   //  Huffman scheme for substitution QVs
    void    *dRunScheme;  //  Huffman scheme for deletion run lengths (if delChar >= 0)
    void    *sRunScheme;  //  Huffman scheme for substitution run lengths (if subChar >= 0)
    int      delChar;     //  If >= 0, run-encoded deletion value (negative: no run encoding)
    int      subChar;     //  If >= 0, run-encoded substitution value (negative: no run encoding)
    int      flip;        //  Need to flip multi-byte integers
    char    *prefix;      //  Header line prefix
  } QVcoding;

//  Read the next nlines of input, and QVentry returns a pointer to the first line if needed.
//    If end-of-input is encountered before any further input, -1 is returned.  If there is
//    an error then -2 is returned.  Otherwise the length of the line(s) read is returned.

int       Read_Lines(FILE *input, int nlines);
char     *QVentry();

//  Read the .quiva file on input and record frequency statistics.  If there is an error
//    then 1 is returned, otherwise 0.

int       QVcoding_Scan(FILE *input);

//  Given QVcoding_Scan has been called at least once, create an encoding scheme based on
//    the accumulated statistics and return a pointer to it.  The returned encoding object
//    is *statically* allocated within the routine.  If lossy is set then use a lossy scaling
//    for the insertion and merge streams.  If there is an error, then NULL is returned.

QVcoding *Create_QVcoding(int lossy);

//  Read/write a coding scheme to input/output.
The encoding object returned by the reader // is *statically* allocated within the routine. If an error occurs while reading then // NULL is returned. QVcoding *Read_QVcoding(FILE *input); void Write_QVcoding(FILE *output, QVcoding *coding); // Free all the auxiliary storage associated with coding (but not the object itself!) void Free_QVcoding(QVcoding *coding); // Assuming the file pointer is positioned just beyond an entry header line, read the // next set of 5 QV lines, compress them according to 'coding', and output. If lossy // is set then the scheme is a lossy one. A non-zero value is returned only if an // error occurred. int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); // Assuming the input is positioned just beyond the compressed encoding of an entry header, // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them, // and place their decompressed values into entry which is a 5 element array of character // pointers. The parameter rlen computed from the preceding header line, critically // provides the length of each of the 5 vectors. A non-zero value is returned only if an // error occurred. int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen); #endif // _QV_COMPRESSOR DALIGNER-master/README000066400000000000000000000550061263373675100144370ustar00rootroot00000000000000 *** PLEASE GO TO THE DAZZLER BLOG (https://dazzlerblog.wordpress.com) FOR TYPESET *** DOCUMENTATION, EXAMPLES OF USE, AND DESIGN PHILOSOPHY. /************************************************************************************\ UPGRADE & DEVELOPER NOTES ! ! ! If you have already performed a big comparison and don't want to recompute all your local alignments in .las files, but do want to use a more recent version of the software that entails a change to the data structures (currently the update on December 31, 2014), please note the routine LAupgrade.Dec.31.2014.
This takes a .las file, say X.las, as an argument, and writes to standard output the .las file in the new format. The program can be made with "make" but is not by default created when make is called without an argument. For those interested in the details, on December 30, the "alen" and "blen" fields were dropped to save space as they can always be gotten from the underlying DB. \************************************************************************************/ The Daligner Overlap Library Author: Gene Myers First: July 17, 2013 Current: December 31, 2014 The commands below permit one to find all significant local alignments between reads encoded in a Dazzler database. The assumption is that the reads are from a PACBIO RS II long read sequencer. That is, the reads are long and noisy, up to 15% on average. Recall that a database has a current partition that divides it into blocks of a size that can conveniently be handled by calling the "dalign" overlapper on all the pairs of blocks producing a collection of .las local alignment files that can then be sorted and merged into an ordered sequence of sorted files containing all alignments between reads in the data set. The alignment records are parsimonious in that they do not record an alignment but simply a set of trace points, typically every 100bp or so, that allow the efficient reconstruction of alignments on demand. 1. daligner [-vbAI] [-k] [-w] [-h] [-t] [-M] [-e] [-H] [-m]+ ... Compare sequences in the trimmed block against those in the list of blocks searching for local alignments involving at least -l base pairs (default 1000), that have an average correlation rate of -e (default 70%). The local alignments found will be output in a sparse encoding where a trace point on the alignment is recorded every -s base pairs of the a-read (default 100bp). Reads are compared in both orientations and local alignments meeting the criteria are output to one of several created files described below.
The -v option turns on a verbose reporting mode that gives statistics on each major step of the computation. The options -k, -h, and -w control the initial filtration search for possible matches between reads. Specifically, our search code looks for a pair of diagonal bands of width 2^w (default 2^6 = 64) that contain a collection of exact matching k-mers (default 14) between the two reads, such that the total number of bases covered by the k-mer hits is h (default 35). k cannot be larger than 32 in the current implementation. If the -b option is set, then the daligner assumes the data has a strong compositional bias (e.g. >65% AT rich), and at the cost of a bit more time, dynamically adjusts k-mer sizes depending on compositional bias, so that the mers used have an effective specificity of 4^k. If there are one or more interval tracks specified with the -m option, then the reads of the DB or DB's to which the mask applies are soft masked with the union of the intervals of all the interval tracks that apply, that is any k-mers that contain any bases in any of the masked intervals are ignored for the purposes of seeding a match. An interval track is a track, such as the "dust" track created by DBdust, that encodes a set of intervals over either the untrimmed or trimmed DB. Invariably, some k-mers are significantly over-represented (e.g. homopolymer runs). These k-mers create an excessive number of matching k-mer pairs and left unaddressed would cause daligner to overflow the available physical memory. One way to deal with this is to explicitly set the -t parameter which suppresses the use of any k-mer that occurs more than t times in either the subject or target block. However, a better way to handle the situation is to let the program automatically select a value of t that meets a given memory usage limit specified (in Gb) by the -M parameter. By default daligner will use the amount of physical memory as the choice for -M. 
If you want to use less, say only 8Gb on a 24Gb HPC cluster node because you want to run 3 daligner jobs on the node, then specify -M8. Specifying -M0 basically indicates that you do not want daligner to self adjust k-mer suppression to fit within a given amount of memory. For each subject, target pair of blocks, say X and Y, the program reports alignments where the a-read is in X and the b-read is in Y, and vice versa. However, if the -A option is set ("A" for "asymmetric") then just overlaps where the a-read is in X and the b-read is in Y are reported, and if X = Y, then it further reports only those overlaps where the a-read index is less than the b-read index. In either case, if the -I option is set ("I" for "identity") then when X = Y, overlaps between different portions of the same read will also be found and reported. Each found alignment is recorded as -- a[ab,ae] x bo[bb,be] -- where a and b are the indices (in the trimmed DB) of the reads that overlap, o indicates whether the b-read is from the same or opposite strand, and [ab,ae] and [bb,be] are the intervals of a and bo, respectively, that align. The program places these alignment records in files whose name is of the form X.Y.[C|N]#.las where C indicates that the b-reads are complemented and N indicates they are not (both comparisons are performed) and # is the thread that detected and wrote out the collection of alignments contained in the file. That is the file X.Y.O#.las contains the alignments produced by thread # for which the a-read is from X and the b-read is from Y and in orientation O. The command "daligner -A X Y" produces 2*NTHREAD thread files X.Y.?.las and "daligner X Y" produces 4*NTHREAD files X.Y.?.las and Y.X.?.las (unless X=Y in which case only NTHREAD files, X.X.?.las, are produced). By default daligner compares all overlaps between reads in the database that are greater than the minimum cutoff set when the DB or DBs were split, typically 1 or 2 Kbp. 
However, the HGAP assembly pipeline only wants to correct large reads, say 8Kbp or over, and so needs only the overlaps where the a-read is one of the large reads. By setting the -H parameter to say N, one alters daligner so that it only reports overlaps where the a-read is over N base-pairs long. While the default parameter settings are good for raw Pacbio data, daligner can be used for efficiently finding alignments in corrected reads or other less noisy reads. For example, for mapping applications against .dams we run "daligner -k20 -h60 -e.85" and on corrected reads, we typically run "daligner -k25 -w5 -h60 -e.95 -s500" and at these settings it is very fast. 2. LAsort [-v] ... Sort each .las alignment file specified on the command line. For each file it reads in all the overlaps in the file and sorts them in lexicographical order of (a,b,o,ab) assuming each alignment is recorded as a[ab,ae] x b^o[bb,be]. It then writes them all to a file named .S.las (assuming that the input file was .las). With the -v option set, the program reports the number of records read and written. 3. LAmerge [-v] ... Merge the .las files into a single sorted file , where it is assumed that the input files are sorted. Due to operating system limits, the number of files must be <= 252. With the -v option set the program reports the # of records read and written. Used correctly, LAmerge and LAsort together allow one to perform an "external" sort that produces a collection of sorted files containing in aggregate all the local alignments found by the daligner, such that their concatenation is sorted in order of (a,b,o,ab). In particular, this means that all the alignments for a given a-read will be found consecutively in one of the files. So computations that need to look at all the alignments for a given read can operate in simple sequential scans of these sorted files. 4. LAshow [-caroUF] [-i] [-w] [-b] [ ] [ | ...
] LAshow produces a printed listing of the local alignments contained in the specified .las file, where the a- and b-reads come from src1 or from src1 and src2, respectively. If a file or list of read ranges is given then only the overlaps for which the a-read is in the set specified by the file or list are displayed. See DBshow for an explanation of how the file and list of read ranges are interpreted. If the -F option is set then the roles of the a- and b- reads are reversed in the display. If the -c option is given then a cartoon rendering is displayed, and if -a or -r option is set then an alignment of the local alignment is displayed. The -a option puts exactly -w columns per segment of the display, whereas the -r option puts exactly -w a-read symbols in each segment of the display. The -r display mode is useful when one wants to visually compare two alignments involving the same a-read. If a combination of the -c, -a, and -r flags is set, then the cartoon comes first, then the -a alignment, and lastly the -r alignment. The -i option sets the indent for the cartoon and/or alignment displays, if they are requested. The -b option sets the number of symbols on either side of the aligned segments in an alignment display, and -U specifies that uppercase should be used for DNA sequence instead of the default lowercase. If the -o option is set then only alignments that are proper overlaps (a sequence end occurs at each end of the alignment) are displayed. 5. LAdump [-cdt] [-o] [ ] [ | ... ] Like LAshow, LAdump allows one to display the local alignments (LAs) of a subset of the piles in an .las file and select which information to show about them. The difference is that the information is written in a very simple "1-code" ASCII format that makes it easy for one to read and parse the information for further use. For each LA the pair of reads is output on a line. -c requests that the coordinates of the LA segments also be output.
The -d option requests that the number of differences in the LA be output, and -t requests that the tracepoint information be output. Finally, -o requests that only LAs that are proper overlaps be output. The format is very simple. Each requested piece of information occurs on a line. The first character of every line is a "1-code" character that tells you what information to expect on the line. The rest of the line contains information where each item is separated by a single blank space. The trace point line gives the number of trace point intervals in the LA and is immediately followed by that many lines containing a pair of integers giving the # of differences and b-displacement in each successive trace point interval. P #a #b - (#a,#b) have an LA between them C #ab #ae #bb #be - [#ab,#ae] aligns with [#bb,#be] D # - there are # differences in the LA T #n - there are #n trace point intervals for the LA (#d #y )^#n - there are #d differences aligning the #y bp's of B with the next fixed-size interval of A + X # - Total amount of X (X = P or T) % X # - Maximum amount of X in any pile (X = P or T) @ T # - Maximum number of trace points in any trace 1-code lines that begin with +, %, or @ are always the first lines in the output. They give size information about what is contained in the output. Specifically, '+ X #' gives the total number of LAs (X=P), or the total number of trace point intervals (X=T) in the file . '% X #' gives the maximum number of LAs (X=P) or the maximum number of trace point intervals (X=T) in a given *pile* (collection of LAs all with the same a-read; applies only to sorted .las files). Finally @ T # gives the maximum # of trace point intervals in any trace within the file. 6. LAindex -v ... LAindex takes a series of one or more sorted .las files and produces a "pile index" for each one. If the input file has name "X.las", then the name of its index file is ".X.las.idx".
For each A-read pile encoded in the .las file, the index contains the offset to the first local alignment with A in the file. The index starts with four 64-bit integers that encode the numbers % P, + T, % T, and @ T described for LAdump above, and then an offset for each pile beginning with the first A-read in the file (which may not be read 0). The index is meant to allow programs that process piles to more efficiently read just the piles they need at any moment in time, as opposed to having to sequentially scan through the .las file. 7. LAcat > .las Given argument , find all files .1.las, .2.las, ... .n. where .i.las exists for every i in [1,n]. Then concatenate these files in order into a single .las file and pipe the result to the standard output. 8. LAsplit ( | ) < .las If the second argument is an integer n, then divide the alignment file , piped in through the standard input, as evenly as possible into n alignment files with the name .i.las for i in [1,n], subject to the restriction that all alignment records for a given a-read are in the same file. If the second argument refers to a database .db that has been partitioned, then divide the input alignment file into block .las files where all records whose a-read is in .i.db are in .i.las. 9. LAcheck [-vS] [ ] ... LAcheck checks each .las file for structural integrity, where the a- and b-sequences come from src1 or from src1 and src2, respectively. That is, it makes sure each file makes sense as a plausible .las file, e.g. values are not out of bound, the number of records is correct, the number of trace points for a record is correct, and so on. If the -S option is set then it further checks that the alignments are in sorted order. If the -v option is set then a line is output for each .las file saying either the file is OK or reporting the first error. If the -v option is not set then the program runs silently.
The exit status is 0 if every file is deemed good, and 1 if at least one of the files looks corrupted. 10. HPCdaligner [-vbAI] [-k] [-w] [-h] [-t] [-M] [-e] [-H] [-m]+ [-dal] [-deg] [[-]] HPCdaligner writes a UNIX shell script to the standard output that consists of a sequence of commands that effectively run daligner on all pairs of blocks of a split database and then externally sorts and merges them using LAsort and LAmerge into a collection of alignment files with names .#.las where # ranges from 1 to the number of blocks the data base is split into. These sorted files if concatenated by say LAcat would contain all the alignments in sorted order (of a-read, then b-read, ...). Moreover, all overlaps for a given a-read are guaranteed to not be split across files, so one can run artifact analyzers or error correction on each sorted file in parallel. The data base must have been previously split by DBsplit and all the parameters, except -v, -dal, and -deg, are passed through to the calls to daligner. The defaults for these parameters are as for daligner. The -v flag, for verbose-mode, is also passed to all calls to LAsort and LAmerge. -dal and -deg options are described later. For a database divided into N sub-blocks, the calls to daligner will produce in total 2TN^2 .las files assuming daligner runs with T threads. These will then be sorted and merged into N^2 sorted .las files, one for each block pair. These are then merged in ceil(log_deg N) phases where the number of files decreases geometrically in -deg until there is 1 file per row of the N x N block matrix. So at the end one has N sorted .las files that when concatenated would give a single large sorted overlap file. The -dal option (default 4) gives the desired number of block comparisons per call to daligner. Some must contain dal-1 comparisons, and the first dal-2 block comparisons even less, but the HPCdaligner "planner" does the best it can to give an average load of dal block comparisons per command. 
The -deg option (default 25) gives the maximum number of files that will be merged in a single LAmerge command. The planner makes the most even k-ary tree of merges, where the number of levels is ceil(log_deg N). If the integers and are missing then the script produced is for every block in the database. If is present then HPCdaligner produces an incremental script that compares blocks through ( = if not present) against each other and all previous blocks 1 through -1, and then incrementally updates the .las files for blocks 1 through -1, and creates the .las files for blocks through . Each UNIX command line output by the HPCdaligner can be a batch job (we use the && operator to combine several commands into one line to make this so). Dependencies between jobs can be maintained simply by first running all the daligner jobs, then all the initial sort jobs, and then all the jobs in each phase of the external merge sort. Each of these phases is separated by an informative comment line for your scripting convenience. 11. HPCmapper [-vb] [-k] [-w] [-h] [-t] [-M] [-e] [-H] [-m]+ [-dal] [-deg] [[-]] HPCmapper writes a UNIX shell script to the standard output that consists of a sequence of commands that effectively "maps" every read in the DB against a reference set of sequences in the DB , recording all the found local alignments in the sequence of files ..1.las, ..2.las, ... where ..k.las contains the alignments between all of and the k'th block of . The parameters are exactly the same as for HPCdaligner save that the -k, -h, and -e defaults are set appropriately for mapping, and the -A and -I options make no sense as and are expected to be distinct data sets. If the integers and are missing then the script produced is for every block in the database . If is present then HPCmapper produces a script that compares blocks through ( = if not present) against DAM .
Example: // Recall G.db from the example in DAZZ_DB/README > cat G.db files = 1 1862 G Sim blocks = 2 size = 11 cutoff = 0 all = 0 0 0 1024 1024 1862 1862 > HPCdaligner -mdust -t5 G | csh -v // Run the HPCdaligner script # Dazzler jobs (2) dazzler -d -t5 -mdust G.1 G.1 dazzler -d -t5 -mdust G.2 G.1 G.2 # Initial sort jobs (4) LAsort G.1.G.1.*.las && LAmerge G.L1.1.1 G.1.G.1.*.S.las && rm G.1.G.1.*.S.las LAsort G.1.G.2.*.las && LAmerge G.L1.1.2 G.1.G.2.*.S.las && rm G.1.G.2.*.S.las LAsort G.2.G.1.*.las && LAmerge G.L1.2.1 G.2.G.1.*.S.las && rm G.2.G.1.*.S.las LAsort G.2.G.2.*.las && LAmerge G.L1.2.2 G.2.G.2.*.S.las && rm G.2.G.2.*.S.las # Level 1 jobs (2) LAmerge G.1 G.L1.1.1 G.L1.1.2 && rm G.L1.1.1.las G.L1.1.2.las LAmerge G.2 G.L1.2.1 G.L1.2.2 && rm G.L1.2.1.las G.L1.2.2.las > LAshow -c -a:G -w50 G.1 | more // Take a look at the result ! G.1: 34,510 records 1 9 c [ 0.. 1,876] x [ 9,017..10,825] ( 18 trace pts) 12645 A ---------+====> dif/(len1+len2) = 398/(1876+1808) = 21.61% B <====+--------- 9017 1 ..........gtg-cggt--caggggtgcctgc-t-t-atcgcaatgtta |||*||||**||||||||*||||*|*|*||**|*|*|||| 9008 gagaggccaagtggcggtggcaggggtg-ctgcgtcttatatccaggtta 27.5% 35 ta-ctgggtggttaaacttagccaggaaacctgttgaaataa-acggtgg ||*|||||||||||||*|**|*||*|*||||||*|**|||||*|*||||| 9057 tagctgggtggttaaa-tctg-ca-g-aacctg-t--aataacatggtgg 24.0% 83 -ctagtggcttgccgtttacccaacagaagcataatgaaa-tttgaaagt *||||||||*||||||||*||**||||*|||**|||||||*||||*|||| 9100 gctagtggc-tgccgttt-ccgcacag-agc--aatgaaaatttg-aagt 20.0% 131 ggtaggttcctgctgtct-acatacagaacgacggagcgaaaaggtaccg ||*|||||||||||||*|*||||*|*|*||||||||||*||||||||||* 9144 gg-aggttcctgctgt-tcacat-c-ggacgacggagc-aaaaggtacc- 16.0% ... > LAcat G >G.las // Combine G.1.las & G.2.las into a single .las file > LAshow G G | more // Take another look, now at G.las G: 62,654 records 1 9 c [ 0.. 1,876] x [ 9,017..10,825] : < 398 diffs ( 18 trace pts) 1 38 c [ 0.. 7,107] x [ 5,381..12,330] : < 1,614 diffs ( 71 trace pts) 1 49 n [ 5,493..14,521] x [ 0.. 
9,065] : < 2,028 diffs ( 91 trace pts) 1 68 n [12,809..14,521] x [ 0.. 1,758] : < 373 diffs ( 17 trace pts) 1 147 c [ 0..13,352] x [ 854..14,069] : < 2,993 diffs (133 trace pts) 1 231 n [10,892..14,521] x [ 0.. 3,735] : < 816 diffs ( 37 trace pts) 1 292 c [ 3,835..14,521] x [ 0..10,702] : < 2,353 diffs (107 trace pts) 1 335 n [ 7,569..14,521] x [ 0.. 7,033] : < 1,544 diffs ( 70 trace pts) 1 377 c [ 9,602..14,521] x [ 0.. 5,009] : < 1,104 diffs ( 49 trace pts) 1 414 c [ 6,804..14,521] x [ 0.. 7,812] : < 1,745 diffs ( 77 trace pts) 1 415 c [ 0.. 3,613] x [ 7,685..11,224] : < 840 diffs ( 36 trace pts) 1 445 c [ 9,828..14,521] x [ 0.. 4,789] : < 1,036 diffs ( 47 trace pts) 1 464 n [ 0.. 1,942] x [12,416..14,281] : < 411 diffs ( 19 trace pts) ... DALIGNER-master/align.c000066400000000000000000003770141263373675100150230ustar00rootroot00000000000000/******************************************************************************************* * * Fast alignment discovery and trace generation along with utilites for displaying alignments * Based on previously unpublished ideas from 2005, subsequently refined in 2013-14. Basic * idea is to keep a dynamically selected interval of the f.r. waves from my 1986 O(nd) paper. * A recent cool idea is to not record all the details of an alignment while discovering it * but simply record trace points through which the optimal alignment passes every 100bp, * allowing rapid recomputation of the alignment details between trace points. 
* * Author : Gene Myers * First : June 2013 * Current: June 1, 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #undef DEBUG_PASSES // Show forward / backward extension termini for Local_Alignment #undef DEBUG_POINTS // Show trace points #undef DEBUG_WAVE // Show waves of Local_Alignment #undef SHOW_MATCH_WAVE // For waves of Local_Alignment also show # of matches #undef SHOW_TRAIL // Show trace at the end of forward and reverse passes #undef SHOW_TPS // Show trace points as they are encountered in a wave #undef DEBUG_EXTEND // Show waves of Extend_Until_Overlap #undef DEBUG_ALIGN // Show division points of Compute_Trace #undef DEBUG_SCRIPT // Show trace additions for Compute_Trace #undef DEBUG_AWAVE // Show F/R waves of Compute_Trace #undef SHOW_TRACE // Show full trace for Print_Alignment #undef WAVE_STATS /****************************************************************************************\ * * * Working Storage Abstraction * * * \****************************************************************************************/ typedef struct // Hidden from the user, working space for each thread { int vecmax; void *vector; int celmax; void *cells; int pntmax; void *points; int tramax; void *trace; } _Work_Data; Work_Data *New_Work_Data() { _Work_Data *work; work = (_Work_Data *) Malloc(sizeof(_Work_Data),"Allocating work data block"); if (work == NULL) EXIT(NULL); work->vecmax = 0; work->vector = NULL; work->pntmax = 0; work->points = NULL; work->tramax = 0; work->trace = NULL; work->celmax = 0; work->cells = NULL; return ((Work_Data *) work); } static int enlarge_vector(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->vector,max,"Enlarging DP vector"); if (vec == NULL) EXIT(1); work->vecmax = max; work->vector = vec; return (0); } static int 
enlarge_points(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->points,max,"Enlarging point vector"); if (vec == NULL) EXIT(1); work->pntmax = max; work->points = vec; return (0); } static int enlarge_trace(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->trace,max,"Enlarging trace vector"); if (vec == NULL) EXIT(1); work->tramax = max; work->trace = vec; return (0); } void Free_Work_Data(Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; if (work->vector != NULL) free(work->vector); if (work->cells != NULL) free(work->cells); if (work->trace != NULL) free(work->trace); if (work->points != NULL) free(work->points); free(work); } /****************************************************************************************\ * * * ADAPTIVE PATH FINDING * * * \****************************************************************************************/ // Absolute/Fixed Parameters #define BVEC uint64 // Can be uint32 if PATH_LEN <= 32 #define TRIM_LEN 15 // Report as the tip, the last wave maximum for which the last // 2*TRIM_LEN edits are prefix-positive at rate ave_corr*f(bias) // (max value is 20) #define PATH_LEN 60 // Follow the last PATH_LEN columns/edges (max value is 63) // Derivative fixed parameters #define PATH_TOP 0x1000000000000000ll // Must be 1 << PATH_LEN #define PATH_INT 0x0fffffffffffffffll // Must be PATH_TOP-1 #define TRIM_MASK 0x7fff // Must be (1 << TRIM_LEN) - 1 #define TRIM_MLAG 200 // How far can last trim point be behind best point #define WAVE_LAG 30 // How far can worst point be behind the best point static double Bias_Factor[10] = { .690, .690, .690, .690, .780, .850, .900, .933, .966, 1.000 }; // Adjustable paramters typedef struct { double ave_corr; int trace_space; float freq[4]; int ave_path; int16 *score; int16 *table; } _Align_Spec; /* Fill in bit table: TABLE[x] = 1 iff the alignment modeled by x (1 = match, 0 = 
mismatch) has a non-negative score for every suffix of the alignment under the scoring scheme where match = MATCH and mismatch = -1. MATCH is set so that an alignment with TRIM_PCT matches has zero score ( (1-TRIM_PCT) / TRIM_PCT ). */ #define FRACTION 1000 // Implicit fractional part of scores, i.e. score = x/FRACTION typedef struct { int mscore; int dscore; int16 *table; int16 *score; } Table_Bits; static void set_table(int bit, int prefix, int score, int max, Table_Bits *parms) { if (bit >= TRIM_LEN) { parms->table[prefix] = (int16) (score-max); parms->score[prefix] = (int16) score; } else { if (score > max) max = score; set_table(bit+1,(prefix<<1),score - parms->dscore,max,parms); set_table(bit+1,(prefix<<1) | 1,score + parms->mscore,max,parms); } } /* Create an alignment specification record including path tip tables & values */ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq) { _Align_Spec *spec; Table_Bits parms; double match; int bias; spec = (_Align_Spec *) Malloc(sizeof(_Align_Spec),"Allocating alignment specification"); if (spec == NULL) EXIT(NULL); spec->ave_corr = ave_corr; spec->trace_space = trace_space; spec->freq[0] = freq[0]; spec->freq[1] = freq[1]; spec->freq[2] = freq[2]; spec->freq[3] = freq[3]; match = freq[0] + freq[3]; if (match > .5) match = 1.-match; bias = (int) ((match+.025)*20.-1.); if (match < .2) { fprintf(stderr,"Warning: Base bias worse than 80/20%% ! (New_Align_Spec)\n"); fprintf(stderr," Capping bias at this ratio.\n"); bias = 3; } spec->ave_path = (int) (PATH_LEN * (1. - Bias_Factor[bias] * (1. - ave_corr))); parms.mscore = (int) (FRACTION * Bias_Factor[bias] * (1. 
- ave_corr)); parms.dscore = FRACTION - parms.mscore; parms.score = (int16 *) Malloc(sizeof(int16)*(TRIM_MASK+1)*2,"Allocating trim table"); if (parms.score == NULL) { free(spec); EXIT(NULL); } parms.table = parms.score + (TRIM_MASK+1); set_table(0,0,0,0,&parms); spec->table = parms.table; spec->score = parms.score; return ((Align_Spec *) spec); } void Free_Align_Spec(Align_Spec *espec) { _Align_Spec *spec = (_Align_Spec *) espec; free(spec->score); free(spec); } double Average_Correlation(Align_Spec *espec) { return (((_Align_Spec *) espec)->ave_corr); } int Trace_Spacing(Align_Spec *espec) { return (((_Align_Spec *) espec)->trace_space); } float *Base_Frequencies(Align_Spec *espec) { return (((_Align_Spec *) espec)->freq); } /****************************************************************************************\ * * * LOCAL ALIGNMENT FINDER: forward_/reverse_wave and Local_Alignment * * * \****************************************************************************************/ #ifdef WAVE_STATS static int64 MAX, TOT, NWV; static int64 RESTARTS; void Init_Stats() { MAX = TOT = NWV = 0; RESTARTS = 0; } void Print_Stats() { printf("\nMax = %lld Ave = %.1f # = %lld\n",MAX,(1.*TOT)/NWV,NWV); printf("\nRestarts = %lld\n",RESTARTS); } #endif #ifdef DEBUG_WAVE static void print_wave(int *V, int *M, int low, int hgh, int besta) { int k, bestk; (void) M; printf(" [%6d,%6d]: ",low,hgh); for (k = low; k <= hgh; k++) { if (besta == V[k]) bestk = k; // printf(" %3d",(V[k]+k)/2); printf(" %3d",besta-V[k]); } printf(" : %d (%d,%d)\n",besta,(besta+bestk)/2,(besta-bestk)/2); #ifdef SHOW_MATCH_WAVE printf(" "); for (k = low; k <= hgh; k++) printf(" %3d",M[k]); printf("\n"); #endif fflush(stdout); } #endif /* At each furthest reaching point, keep a-coordinate of point (V), bitvector recording the last TRIM_LEN columns of the implied alignment (T), and the # of matches (1-bits) in the bitvector (M). 
*/

/* One trace-point record ("pebble").  Records are chained through ptr (an index
   into the cells array, -1 ends a chain); diag is the diagonal, diff the wave
   number (dif) at creation, and mark the trace-point coordinate that was crossed. */

typedef struct
  { int ptr;
    int diag;
    int diff;
    int mark;
  } Pebble;

/* Bytes needed per diagonal: six int arrays (V,M,HA,HB,NA,NB) plus one BVEC array (T). */

static int VectorEl = 6*sizeof(int) + sizeof(BVEC);

/* Forward wave pass.  Starting on anti-diagonal mida over diagonals [*mind,maxd],
   repeatedly extends furthest-reaching points (one more difference per wave) while
   'more' is set and lasta >= besta - TRIM_MLAG; the band is widened by one diagonal
   per side per wave only while it stays within [minp,maxp].  Pebbles are dropped each
   time a path crosses an A- or B-trace point, and at the end the chains for the chosen
   tip are reversed and written as (diff,length) pairs into apath->trace / bpath->trace,
   together with the end coordinates, diffs and tlen of both paths.  *mind is set to the
   diagonal of the first pebble of the B chain.  Returns 0; allocation failures leave
   through EXIT(1).                                                                     */

static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath,
                        int *mind, int maxd, int mida, int minp, int maxp)
{ char *aseq  = align->aseq;
  char *bseq  = align->bseq;
  Path *apath = align->path;

  int     hgh, low, dif;
  int     vlen, vmin, vmax;
  int    *V, *M;
  int    *_V, *_M;
  BVEC   *T;
  BVEC   *_T;

  int    *HA, *HB;
  int    *_HA, *_HB;
  int    *NA, *NB;
  int    *_NA, *_NB;
  Pebble *cells;
  int     avail, cmax, boff;

  int     TRACE_SPACE = spec->trace_space;
  int     PATH_AVE    = spec->ave_path;
  int16  *SCORE       = spec->score;
  int16  *TABLE       = spec->table;

  int     besta, besty;
  int     trima, trimy, trimd;
  int     trimha, trimhb;
  int     morea, morey, mored;
  int     moreha, morehb;
  int     more, morem, lasta;
  int     aclip, bclip;

  hgh = maxd;
  low = *mind;
  dif = 0;

  /* Carve the work vector into the seven per-diagonal arrays, centred on [low,hgh];
     the un-prefixed names are biased by -vmin so they can be indexed by diagonal.   */

  { int span, wing;

    span = (hgh-low)+1;
    vlen = work->vecmax/VectorEl;
    wing = (vlen - span)/2;
    vmin = low - wing;
    vmax = hgh + wing;

    _V  = ((int *) work->vector);
    _M  = _V + vlen;
    _HA = _M + vlen;
    _HB = _HA + vlen;
    _NA = _HB + vlen;
    _NB = _NA + vlen;
    _T  = ((BVEC *) (_NB + vlen));

    V  = _V-vmin;
    M  = _M-vmin;
    HA = _HA-vmin;
    HB = _HB-vmin;
    NA = _NA-vmin;
    NB = _NB-vmin;
    T  = _T-vmin;

    cells = (Pebble *) (work->cells);
    cmax  = work->celmax;
    avail = 0;

    if (COMP(align->flags))
      boff = align->blen % TRACE_SPACE;
    else
      boff = 0;
  }

  /* Compute 0-wave starting from mid-line */

  more  = 1;
  aclip =  INT32_MAX;
  bclip = -INT32_MAX;

  besta  = trima  = morea = lasta = mida;
  besty  = trimy  = morey = (mida-hgh) >> 1;
  trimd  = mored  = 0;
  trimha = moreha = 0;
  trimhb = morehb = 1;
  morem  = -1;

  { int   k;
    char *a;

    a  = aseq + hgh;
    for (k = hgh; k >= low; k--)
      { int     y, c, d;
        int     ha, hb;
        int     na, nb;
        Pebble *pb;

        y = (mida-k) >> 1;

        /* Two cells (A and B chain heads) are taken unconditionally below. */

        if (avail >= cmax-1)
          { cmax = ((int) (avail*1.2)) + 10000;
            cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
            if (cells == NULL)
              EXIT(1);
            work->celmax = cmax;
            work->cells  = (void *) cells;
          }

        na = ((y+k)/TRACE_SPACE)*TRACE_SPACE;
#ifdef SHOW_TPS
        printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout);
#endif
        pb = cells+avail;
        pb->ptr  = -1;
        pb->diag = k;
        pb->diff = 0;
        pb->mark = na;
        ha  = avail++;
        na += TRACE_SPACE;

        nb = ((y+(TRACE_SPACE-boff))/TRACE_SPACE-1)*TRACE_SPACE+boff;
#ifdef SHOW_TPS
        printf(" B %d: %d,%d,0,%d\n",avail,-1,k,nb); fflush(stdout);
#endif
        pb = cells+avail;
        pb->ptr  = -1;
        pb->diag = k;
        pb->diff = 0;
        pb->mark = nb;
        hb  = avail++;
        nb += TRACE_SPACE;

        /* Slide along diagonal k while symbols match; the value 4 in either
           sequence stops the slide and records the clipping diagonal.        */

        while (1)
          { c = bseq[y];
            if (c == 4)
              { more = 0;
                if (bclip < k)
                  bclip = k;
                break;
              }
            d = a[y];
            if (c != d)
              { if (d == 4)
                  { more  = 0;
                    aclip = k;
                  }
                break;
              }
            y += 1;
          }
        c = (y << 1) + k;

        while (y+k >= na)
          { if (avail >= cmax)
              { cmax = ((int) (avail*1.2)) + 10000;
                cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
                if (cells == NULL)
                  EXIT(1);
                work->celmax = cmax;
                work->cells  = (void *) cells;
              }
#ifdef SHOW_TPS
            printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout);
#endif
            pb = cells+avail;
            pb->ptr  = ha;
            pb->diag = k;
            pb->diff = 0;
            pb->mark = na;
            ha  = avail++;
            na += TRACE_SPACE;
          }
        while (y >= nb)
          { if (avail >= cmax)
              { cmax = ((int) (avail*1.2)) + 10000;
                cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
                if (cells == NULL)
                  EXIT(1);
                work->celmax = cmax;
                work->cells  = (void *) cells;
              }
#ifdef SHOW_TPS
            printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout);
#endif
            pb = cells+avail;
            pb->ptr  = hb;
            pb->diag = k;
            pb->diff = 0;
            pb->mark = nb;
            hb  = avail++;
            nb += TRACE_SPACE;
          }

        if (c > besta)
          { besta  = trima = lasta = c;
            besty  = trimy = y;
            trimha = ha;
            trimhb = hb;
          }

        V[k]  = c;
        T[k]  = PATH_INT;
        M[k]  = PATH_LEN;
        HA[k] = ha;
        HB[k] = hb;
        NA[k] = na;
        NB[k] = nb;

        a -= 1;
      }
  }

  /* A sequence end was hit: keep going only if the best tip is not itself at an
     end, shrink the band past the clipped diagonals, and remember the clipped
     tip with the largest M as the "more" candidate.                             */

  if (more == 0)
    { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
        more = 1;
      if (hgh >= aclip)
        { hgh = aclip-1;
          if (morem <= M[aclip])
            { morem  = M[aclip];
              morea  = V[aclip];
              morey  = (morea - aclip)/2;
              moreha = HA[aclip];
              morehb = HB[aclip];
            }
        }
      if (low <= bclip)
        { low = bclip+1;
          if (morem <= M[bclip])
            { morem  = M[bclip];
              morea  = V[bclip];
              morey  = (morea - bclip)/2;
              moreha = HA[bclip];
              morehb = HB[bclip];
            }
        }
      aclip =  INT32_MAX;
      bclip = -INT32_MAX;
    }

#ifdef DEBUG_WAVE
  printf("\nFORWARD WAVE:\n");
  print_wave(V,M,low,hgh,besta);
#endif

  /* Compute successive waves until no furthest reaching points remain */

  while (more && lasta >= besta - TRIM_MLAG)
    { int     k, n;
      int     ua, ub;
      BVEC    t;
      int     am, ac, ap;
      char   *a;

      low -= 1;
      hgh += 1;

      /* The band reached the edge of the arrays: enlarge the work vector if the
         band fills more than 80% of it, then re-centre all seven arrays.  The
         order of the memmoves depends on the sign of each shift so that no
         array is overwritten before it has been moved.                          */

      if (low <= vmin || hgh >= vmax)
        { int   span, wing;
          int64 move;
          int64 vd, md, had, hbd, nad, nbd, td;

          span = (hgh-low)+1;
          if (.8*vlen < span)
            { if (enlarge_vector(work,vlen*VectorEl))
                EXIT(1);

              move = ((void *) _V) - work->vector;
              vlen = work->vecmax/VectorEl;

              _V  = (int *) work->vector;
              _M  = _V + vlen;
              _HA = _M + vlen;
              _HB = _HA + vlen;
              _NA = _HB + vlen;
              _NB = _NA + vlen;
              _T  = ((BVEC *) (_NB + vlen));
            }
          else
            move = 0;

          wing = (vlen - span)/2;

          vd  = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move);
          md  = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move);
          had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move);
          hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move);
          nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move);
          nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move);
          td  = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move);

          if (vd < 0)
            memmove( _V+wing,  ((void *) ( V+low)) - move, span*sizeof(int));
          if (md < 0)
            memmove( _M+wing,  ((void *) ( M+low)) - move, span*sizeof(int));
          if (had < 0)
            memmove(_HA+wing,  ((void *) (HA+low)) - move, span*sizeof(int));
          if (hbd < 0)
            memmove(_HB+wing,  ((void *) (HB+low)) - move, span*sizeof(int));
          if (nad < 0)
            memmove(_NA+wing,  ((void *) (NA+low)) - move, span*sizeof(int));
          if (nbd < 0)
            memmove(_NB+wing,  ((void *) (NB+low)) - move, span*sizeof(int));
          if (td < 0)
            memmove( _T+wing,  ((void *) ( T+low)) - move, span*sizeof(BVEC));

          if (td > 0)
            memmove( _T+wing,  ((void *) ( T+low)) - move, span*sizeof(BVEC));
          if (nbd > 0)
            memmove(_NB+wing,  ((void *) (NB+low)) - move, span*sizeof(int));
          if (nad > 0)
            memmove(_NA+wing,  ((void *) (NA+low)) - move, span*sizeof(int));
          if (hbd > 0)
            memmove(_HB+wing,  ((void *) (HB+low)) - move, span*sizeof(int));
          if (had > 0)
            memmove(_HA+wing,  ((void *) (HA+low)) - move, span*sizeof(int));
          if (md > 0)
            memmove( _M+wing,  ((void *) ( M+low)) - move, span*sizeof(int));
          if (vd > 0)
            memmove( _V+wing,  ((void *) ( V+low)) - move, span*sizeof(int));

          vmin = low-wing;
          vmax = hgh+wing;

          V  =  _V-vmin;
          M  =  _M-vmin;
          HA = _HA-vmin;
          HB = _HB-vmin;
          NA = _NA-vmin;
          NB = _NB-vmin;
          T  =  _T-vmin;
        }

      /* Admit the new outer diagonals only if they are inside [minp,maxp];
         otherwise undo the widening done above.                            */

      if (low >= minp)
        { NA[low] = NA[low+1];
          NB[low] = NB[low+1];
          V[low]  = -1;
        }
      else
        low += 1;

      if (hgh <= maxp)
        { NA[hgh] = NA[hgh-1];
          NB[hgh] = NB[hgh-1];
          V[hgh]  = am = -1;
        }
      else
        am = V[--hgh];

      dif += 1;

      ac = V[hgh+1] = V[low-1] = -1;
      a  = aseq + hgh;
      t  = PATH_INT;
      n  = PATH_LEN;
      ua = ub = -1;
      for (k = hgh; k >= low; k--)
        { int     y, m;
          int     ha, hb;
          int     c, d;
          BVEC    b;
          Pebble *pb;

          /* ap, ac, am hold the previous wave's values on diagonals k+1, k, k-1;
             t, n, ua, ub are the saved previous-wave T, M, HA, HB of diagonal k+1. */

          ap = ac;
          ac = am;
          am = V[d = k-1];

          if (ac < am)
            if (am < ap)
              { c  = ap+1;
                m  = n;
                b  = t;
                ha = ua;
                hb = ub;
              }
            else
              { c  = am+1;
                m  = M[d];
                b  = T[d];
                ha = HA[d];
                hb = HB[d];
              }
          else
            if (ac < ap)
              { c  = ap+1;
                m  = n;
                b  = t;
                ha = ua;
                hb = ub;
              }
            else
              { c  = ac+2;
                m  = M[k];
                b  = T[k];
                ha = HA[k];
                hb = HB[k];
              }

          /* Shift a 0 (the new difference) into the bit vector, keeping m equal
             to the number of 1-bits still inside it.                            */

          if ((b & PATH_TOP) != 0)
            m -= 1;
          b <<= 1;

          y = (c-k) >> 1;
          while (1)
            { c = bseq[y];
              if (c == 4)
                { more = 0;
                  if (bclip < k)
                    bclip = k;
                  break;
                }
              d = a[y];
              if (c != d)
                { if (d == 4)
                    { more  = 0;
                      aclip = k;
                    }
                  break;
                }
              y += 1;
              if ((b & PATH_TOP) == 0)
                m += 1;
              b = (b << 1) | 1;
            }
          c = (y << 1) + k;

          while (y+k >= NA[k])
            { if (cells[ha].mark < NA[k])
                { if (avail >= cmax)
                    { cmax = ((int) (avail*1.2)) + 10000;
                      cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
                                                       "Reallocating trace cells");
                      if (cells == NULL)
                        EXIT(1);
                      work->celmax = cmax;
                      work->cells  = (void *) cells;
                    }
#ifdef SHOW_TPS
                  printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout);
#endif
                  pb = cells+avail;
                  pb->ptr  = ha;
                  pb->diag = k;
                  pb->diff = dif;
                  pb->mark = NA[k];
                  ha = avail++;
                }
              NA[k] += TRACE_SPACE;
            }

          while (y >= NB[k])
            { if (cells[hb].mark < NB[k])
                { if (avail >= cmax)
                    { cmax = ((int) (avail*1.2)) + 10000;
                      cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
                                                       "Reallocating trace cells");
                      if (cells == NULL)
                        EXIT(1);
                      work->celmax = cmax;
                      work->cells  = (void *) cells;
                    }
#ifdef SHOW_TPS
                  printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout);
#endif
                  pb = cells+avail;
                  pb->ptr  = hb;
                  pb->diag = k;
                  pb->diff = dif;
                  pb->mark = NB[k];
                  hb = avail++;
                }
              NB[k] += TRACE_SPACE;
            }

          /* A new best tip becomes the trim point only if its window has at least
             PATH_AVE matches and both table tests on the bit vector are non-negative. */

          if (c > besta)
            { besta = c;
              besty = y;
              if (m >= PATH_AVE)
                { lasta = c;
                  if (TABLE[b & TRIM_MASK] >= 0)
                    if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0)
                      { trima  = c;
                        trimy  = y;
                        trimd  = dif;
                        trimha = ha;
                        trimhb = hb;
                      }
                }
            }

          t  = T[k];
          n  = M[k];
          ua = HA[k];
          ub = HB[k];
          V[k]  = c;
          T[k]  = b;
          M[k]  = m;
          HA[k] = ha;
          HB[k] = hb;

          a -= 1;
        }

      if (more == 0)
        { if (bseq[besty] != 4 && aseq[besta-besty] != 4)
            more = 1;
          if (hgh >= aclip)
            { hgh = aclip-1;
              if (morem <= M[aclip])
                { morem  = M[aclip];
                  morea  = V[aclip];
                  morey  = (morea - aclip)/2;
                  mored  = dif;
                  moreha = HA[aclip];
                  morehb = HB[aclip];
                }
            }
          if (low <= bclip)
            { low = bclip+1;
              if (morem <= M[bclip])
                { morem  = M[bclip];
                  morea  = V[bclip];
                  morey  = (morea - bclip)/2;
                  mored  = dif;
                  moreha = HA[bclip];
                  morehb = HB[bclip];
                }
            }
          aclip =  INT32_MAX;
          bclip = -INT32_MAX;
        }

      /* Drop outer diagonals whose tip lags the best by more than WAVE_LAG. */

      n = besta - WAVE_LAG;
      while (hgh >= low)
        if (V[hgh] < n)
          hgh -= 1;
        else
          { while (V[low] < n)
              low += 1;
            break;
          }

#ifdef WAVE_STATS
      k = (hgh-low)+1;
      if (k > MAX)
        MAX = k;
      TOT += k;
      NWV += 1;
#endif

#ifdef DEBUG_WAVE
      print_wave(V,M,low,hgh,besta);
#endif
    }

  /* Emit the traces: a clipped ("more") tip, if any, overrides the trim tip.  Each
     pebble chain is reversed in place, then walked from its head writing
     (diff delta, coordinate delta) pairs; a final pair reaches the tip itself.     */

  { uint16 *atrace = (uint16 *) apath->trace;
    uint16 *btrace = (uint16 *) bpath->trace;
    int     atlen, btlen;
    int     trimx;
    int     a, b, k, h;
    int     d, e;

    if (morem >= 0)
      { trimx  = morea-morey;
        trimy  = morey;
        trimd  = mored;
        trimha = moreha;
        trimhb = morehb;
      }
    else
      trimx = trima-trimy;

    atlen = btlen = 0;

    a = -1;
    for (h = trimha; h >= 0; h = b)
      { b = cells[h].ptr;
        cells[h].ptr = a;
        a = h;
      }
    h = a;

    k = cells[h].diag;
    b = (mida-k)/2;
    e = 0;
#ifdef SHOW_TRAIL
    printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout);
#endif
    for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
      { k = cells[h].diag;
        a = cells[h].mark - k;
        d = cells[h].diff;
        atrace[atlen++] = (uint16) (d-e);
        atrace[atlen++] = (uint16) (a-b);
#ifdef SHOW_TRAIL
        printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout);
#endif
        b = a;
        e = d;
      }
    if (b+k != trimx)
      { atrace[atlen++] = (uint16) (trimd-e);
        atrace[atlen++] = (uint16) (trimy-b);
#ifdef SHOW_TRAIL
        printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout);
#endif
      }
    else if (b != trimy)
      { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b));
        atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e));
#ifdef SHOW_TRAIL
        printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout);
#endif
      }

    a = -1;
    for (h = trimhb; h >= 0; h = b)
      { b = cells[h].ptr;
        cells[h].ptr = a;
        a = h;
      }
    h = a;

    k = cells[h].diag;
    b = (mida+k)/2;
    e = 0;
    low = k;
#ifdef SHOW_TRAIL
    printf(" B path = (%5d,%5d)\n",b,(mida-k)/2); fflush(stdout);
#endif
    for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
      { k = cells[h].diag;
        a = cells[h].mark + k;
        d = cells[h].diff;
        btrace[btlen++] = (uint16) (d-e);
        btrace[btlen++] = (uint16) (a-b);
#ifdef SHOW_TRAIL
        printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,a-b); fflush(stdout);
#endif
        b = a;
        e = d;
      }
    if (b-k != trimy)
      { btrace[btlen++] = (uint16) (trimd-e);
        btrace[btlen++] = (uint16) (trimx-b);
#ifdef SHOW_TRAIL
        printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout);
#endif
      }
    else if (b != trimx)
      { btrace[btlen-1] = (uint16) (btrace[btlen-1] + (trimx-b));
        btrace[btlen-2] = (uint16) (btrace[btlen-2] + (trimd-e));
#ifdef SHOW_TRAIL
        printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout);
#endif
      }

    apath->aepos = trimx;
    apath->bepos = trimy;
    apath->diffs = trimd;
    apath->tlen  = atlen;
    if (COMP(align->flags))
      { bpath->abpos = align->blen - apath->bepos;
        bpath->bbpos = align->alen - apath->aepos;
      }
    else
      { bpath->aepos =
apath->bepos; bpath->bepos = apath->aepos; } bpath->diffs = trimd; bpath->tlen = btlen; } *mind = low; return (0); } /*** Reverse Wave ***/ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath, int mind, int maxd, int mida, int minp, int maxp) { char *aseq = align->aseq - 1; char *bseq = align->bseq - 1; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *HB; int *_HA, *_HB; int *NA, *NB; int *_NA, *_NB; Pebble *cells; int avail, cmax, boff; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha, trimhb; int morea, morey, mored; int moreha, morehb; int more, morem, lasta; int aclip, bclip; hgh = maxd; low = mind; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEl; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; if (COMP(align->flags)) boff = align->blen % TRACE_SPACE; else boff = 0; } more = 1; aclip = -INT32_MAX; bclip = INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; trimhb = morehb = 1; morem = -1; { int k; char *a; a = aseq + low; for (k = low; k <= hgh; k++) { int y, c, d; int ha, hb; int na, nb; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE; #ifdef 
SHOW_TPS printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y+k; ha = avail++; nb = ((y+(TRACE_SPACE-boff)-1)/TRACE_SPACE-1)*TRACE_SPACE+boff; #ifdef SHOW_TPS printf(" B %d: -1,%d,0,%d\n",avail,k,nb+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y; hb = avail++; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; } c = (y << 1) + k; while (y+k <= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na -= TRACE_SPACE; } while (y <= nb) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = 0; pb->mark = nb; hb = avail++; nb -= TRACE_SPACE; } if (c < besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; trimhb = hb; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; HB[k] = hb; NA[k] = na; NB[k] = nb; a += 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; morehb = HB[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = 
V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } #ifdef DEBUG_WAVE printf("\nREVERSE WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif while (more && lasta <= besta + TRIM_MLAG) { int k, n; int ua, ub; BVEC t; int am, ac, ap; char *a; low -= 1; hgh += 1; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move, vd, md, had, hbd, nad, nbd, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEl)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEl; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (hbd < 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (nbd < 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nbd > 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (hbd > 0) memmove(_HB+wing, ((void *) 
(HB+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; } if (low >= minp) { NA[low] = NA[low+1]; NB[low] = NB[low+1]; V[low] = ap = INT32_MAX; } else ap = V[++low]; if (hgh <= maxp) { NA[hgh] = NA[hgh-1]; NB[hgh] = NB[hgh-1]; V[hgh] = INT32_MAX; } else hgh -= 1; dif += 1; ac = V[hgh+1] = V[low-1] = INT32_MAX; a = aseq + low; t = PATH_INT; n = PATH_LEN; ua = ub = -1; for (k = low; k <= hgh; k++) { int y, m; int ha, hb; int c, d; BVEC b; Pebble *pb; am = ac; ac = ap; ap = V[d = k+1]; if (ac > ap) if (ap > am) { c = am-1; m = n; b = t; ha = ua; hb = ub; } else { c = ap-1; m = M[d]; b = T[d]; ha = HA[d]; hb = HB[d]; } else if (ac > am) { c = am-1; m = n; b = t; ha = ua; hb = ub; } else { c = ac-2; m = M[k]; b = T[k]; ha = HA[k]; hb = HB[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k <= NA[k]) { if (cells[ha].mark > NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] -= TRACE_SPACE; } while (y <= NB[k]) { if (cells[hb].mark > NB[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) 
Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = dif; pb->mark = NB[k]; hb = avail++; } NB[k] -= TRACE_SPACE; } if (c < besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; trimhb = hb; } } } t = T[k]; n = M[k]; ua = HA[k]; ub = HB[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; HB[k] = hb; a += 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; morehb = HB[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } n = besta + WAVE_LAG; while (hgh >= low) if (V[hgh] > n) hgh -= 1; else { while (V[low] > n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; uint16 *btrace = (uint16 *) bpath->trace; int atlen, btlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; trimhb = morehb; } else trimx = trima-trimy; atlen = btlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark - k; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout); #endif if ((b+k)%TRACE_SPACE != 0) { h = cells[h].ptr; 
if (h < 0) { a = trimy; d = trimd; } else { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif if (apath->tlen == 0) { atrace[--atlen] = (uint16) (b-a); atrace[--atlen] = (uint16) (d-e); } else { atrace[1] = (uint16) (atrace[1] + (b-a)); atrace[0] = (uint16) (atrace[0] + (d-e)); } b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; atrace[--atlen] = (uint16) (b-a); d = cells[h].diff; atrace[--atlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[--atlen] = (uint16) (b-trimy); atrace[--atlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy)); atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } } a = -1; for (h = trimhb; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark + k; e = 0; #ifdef SHOW_TRAIL printf(" B path = (%5d,%5d)\n",b,b-k); fflush(stdout); #endif if ((b-k)%TRACE_SPACE != boff) { h = cells[h].ptr; if (h < 0) { a = trimx; d = trimd; } else { k = cells[h].diag; a = cells[h].mark + k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); #endif if (bpath->tlen == 0) { btrace[--btlen] = (uint16) (b-a); btrace[--btlen] = (uint16) (b-a); } else { btrace[1] = (uint16) (btrace[1] + (b-a)); btrace[0] = (uint16) (btrace[0] + (d-e)); } b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark + k; btrace[--btlen] = (uint16) (b-a); d = 
cells[h].diff; btrace[--btlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); #endif b = a; e = d; } if (b-k != trimy) { btrace[--btlen] = (uint16) (b-trimx); btrace[--btlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); #endif } else if (b != trimx) { btrace[btlen+1] = (uint16) (btrace[btlen+1] + (b-trimx)); btrace[btlen] = (uint16) (btrace[btlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); #endif } } apath->abpos = trimx; apath->bbpos = trimy; apath->diffs = apath->diffs + trimd; apath->tlen = apath->tlen - atlen; apath->trace = atrace + atlen; if (COMP(align->flags)) { bpath->aepos = align->blen - apath->bbpos; bpath->bepos = align->alen - apath->abpos; } else { bpath->abpos = apath->bbpos; bpath->bbpos = apath->abpos; } bpath->diffs = bpath->diffs + trimd; bpath->tlen = bpath->tlen - btlen; bpath->trace = btrace + btlen; } return (0); } /* Find the longest local alignment between aseq and bseq through (xcnt,ycnt) See associated .h file for the precise definition of the interface. 
*/

/* Sizes the work vector and the trace-point buffer, runs forward_wave over diagonals
   [low,hgh] from anti-diagonal anti, then reverse_wave from the single diagonal that
   forward_wave hands back in 'low'.  The A-read path is left in align->path and the
   B-read path, stored at the start of work->points, is returned.  A negative lbord or
   hbord leaves that side of the band unbounded, except that for a self-comparison
   (aseq == bseq) a band on one side of diagonal 0 is kept from crossing it.
   Any failure leaves through EXIT(NULL).                                             */

Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
                      int low, int hgh, int anti, int lbord, int hbord)
{ _Work_Data  *work = ( _Work_Data *) ework;
  _Align_Spec *spec = (_Align_Spec *) espec;

  Path *apath, *bpath;
  int   minp, maxp;
  int   selfie;

  { int alen, blen;
    int maxtp, wsize;

    alen = align->alen;
    blen = align->blen;

    /* Room for at least 10000 diagonals, or the whole band if it is wide. */

    if (hgh-low >= 7500)
      wsize = VectorEl*(hgh-low+1);
    else
      wsize = VectorEl*10000;
    if (wsize >= work->vecmax)
      if (enlarge_vector(work,wsize))
        EXIT(NULL);

    if (alen < blen)
      maxtp = 2*(blen/spec->trace_space+2);
    else
      maxtp = 2*(alen/spec->trace_space+2);
    wsize = 4*maxtp*sizeof(uint16) + sizeof(Path);
    if (wsize > work->pntmax)
      if (enlarge_points(work,wsize))
        EXIT(NULL);

    /* Layout of work->points: the B Path record, then two trace areas of 2*maxtp
       uint16 each.  Each trace pointer starts maxtp into its area so reverse_wave
       can push pairs in front of it and forward_wave can append after it.         */

    apath = align->path;
    bpath = (Path *) work->points;

    apath->trace = ((uint16 *) (bpath+1)) + maxtp;
    bpath->trace = ((uint16 *) apath->trace) +  2*maxtp;
  }

#ifdef DEBUG_PASSES
  printf("\n");
#endif

  selfie = (align->aseq == align->bseq);

  if (lbord < 0)
    { if (selfie && low >= 0)
        minp = 1;
      else
        minp = -INT32_MAX;
    }
  else
    minp = low-lbord;
  if (hbord < 0)
    { if (selfie && hgh <= 0)
        maxp = -1;
      else
        maxp = INT32_MAX;
    }
  else
    maxp = hgh+hbord;

  if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp))
    EXIT(NULL);

#ifdef DEBUG_PASSES
  printf("F1 (%d,%d) ~ %d => (%d,%d) %d\n",
         (2*anti+(low+hgh))/4,(anti-(low+hgh))/4,hgh-low,
         apath->aepos,apath->bepos,apath->diffs);
#endif

  /* forward_wave replaced 'low' with a single diagonal; start the reverse pass there. */

  if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp))
    EXIT(NULL);

#ifdef DEBUG_PASSES
  printf("R1 (%d,%d) => (%d,%d) %d\n",
         (anti+low)/2,(anti-low)/2,apath->abpos,apath->bbpos,apath->diffs);
#endif

  /* For a complemented alignment, reverse the order of the B trace pairs
     (each pair is swapped as a unit, so its two values keep their order). */

  if (COMP(align->flags))
    { uint16 *trace = (uint16 *) bpath->trace;
      uint16  p;
      int     i, j;

      i = bpath->tlen-2;
      j = 0;
      while (j < i)
        { p = trace[i];
          trace[i] = trace[j];
          trace[j] = p;
          p = trace[i+1];
          trace[i+1] = trace[j+1];
          trace[j+1] = p;
          i -= 2;
          j += 2;
        }
    }

#ifdef DEBUG_POINTS
  { uint16 *trace = (uint16 *) apath->trace;
    int     a, h;

    printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos);
    printf(" %c\n",(COMP(align->flags) ? 'c' : 'n'));
    a = apath->bbpos;
    for (h = 1; h < apath->tlen; h += 2)
      { int dif = trace[h-1];
        int del = trace[h];
        a += del;
        printf(" %d / %d (%d)\n",dif,del,a);
      }
  }

  { uint16 *trace = (uint16 *) bpath->trace;
    int     a, h;

    printf("\nB-path (%d,%d)->(%d,%d)",bpath->abpos,bpath->bbpos,bpath->aepos,bpath->bepos);
    printf(" %c [%d,%d]\n",(COMP(align->flags) ? 'c' : 'n'),align->blen,align->alen);
    a = bpath->bbpos;
    for (h = 1; h < bpath->tlen; h += 2)
      { int dif = trace[h-1];
        int del = trace[h];
        a += del;
        printf(" %d / %d (%d)\n",dif,del,a);
      }
  }
#endif

  return (bpath);
}


/****************************************************************************************\
*                                                                                        *
*  EXTENSION VERSION OF LOCAL ALIGNMENT                                                  *
*                                                                                        *
\****************************************************************************************/

static int VectorEn = 4*sizeof(int) + sizeof(BVEC);

static int forward_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align,
                          int midd, int mida, int minp, int maxp)
{ char *aseq  = align->aseq;
  char *bseq  = align->bseq;
  Path *apath = align->path;

  int     hgh, low, dif;
  int     vlen, vmin, vmax;
  int    *V, *M;
  int    *_V, *_M;
  BVEC   *T;
  BVEC   *_T;

  int    *HA, *NA;
  int    *_HA, *_NA;
  Pebble *cells;
  int     avail, cmax;

  int     TRACE_SPACE = spec->trace_space;
  int     PATH_AVE    = spec->ave_path;
  int16  *SCORE       = spec->score;
  int16  *TABLE       = spec->table;

  int     besta, besty;
  int     trima, trimy, trimd;
  int     trimha;
  int     morea, morey, mored;
  int     moreha;
  int     more, morem, lasta;
  int     aclip, bclip;

  hgh = midd;
  low = midd;
  dif = 0;

  { int span, wing;

    span = (hgh-low)+1;
    vlen = work->vecmax/VectorEn;
    wing = (vlen - span)/2;
    vmin = low - wing;
    vmax = hgh + wing;

    _V  = ((int *) work->vector);
    _M  = _V + vlen;
    _HA = _M + vlen;
    _NA = _HA + vlen;
    _T  = ((BVEC *) (_NA + vlen));

    V  = _V-vmin;
    M  = _M-vmin;
    HA = _HA-vmin;
    NA = _NA-vmin;
    T  = _T-vmin;

    cells = (Pebble *) (work->cells);
    cmax  = work->celmax;
    avail = 0;
  }

  /* Compute 0-wave starting
from mid-line */ more = 1; aclip = INT32_MAX; bclip = -INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; morem = -1; { int k; char *a; a = aseq + hgh; for (k = hgh; k >= low; k--) { int y, c, d; int ha, na; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k)/TRACE_SPACE)*TRACE_SPACE; #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; } c = (y << 1) + k; while (y+k >= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; } if (c > besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; NA[k] = na; a -= 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } #ifdef DEBUG_WAVE printf("\nFORWARD 
WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif /* Compute successive waves until no furthest reaching points remain */ while (more && lasta >= besta - TRIM_MLAG) { int k, n; int ua; BVEC t; int am, ac, ap; char *a; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move; int64 vd, md, had, nad, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEn)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEn; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; } if (low > minp) { low -= 1; NA[low] = NA[low+1]; V[low] = -1; } if (hgh < maxp) { hgh += 1; NA[hgh] = NA[hgh-1]; V[hgh] = am = -1; } else am = V[hgh]; dif += 1; ac = V[hgh+1] = V[low-1] = -1; a = aseq + hgh; 
t = PATH_INT; n = PATH_LEN; ua = -1; for (k = hgh; k >= low; k--) { int y, m; int ha; int c, d; BVEC b; Pebble *pb; ap = ac; ac = am; am = V[d = k-1]; if (ac < am) if (am < ap) { c = ap+1; m = n; b = t; ha = ua; } else { c = am+1; m = M[d]; b = T[d]; ha = HA[d]; } else if (ac < ap) { c = ap+1; m = n; b = t; ha = ua; } else { c = ac+2; m = M[k]; b = T[k]; ha = HA[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k >= NA[k]) { if (cells[ha].mark < NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] += TRACE_SPACE; } if (c > besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; } } } t = T[k]; n = M[k]; ua = HA[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; a -= 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta-besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } n = besta - WAVE_LAG; while (hgh >= low) if (V[hgh] < n) hgh -= 1; else { while (V[low] < n) low += 1; 
break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; int atlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; } else trimx = trima-trimy; atlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = (mida-k)/2; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout); #endif for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; atrace[atlen++] = (uint16) (d-e); atrace[atlen++] = (uint16) (a-b); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[atlen++] = (uint16) (trimd-e); atrace[atlen++] = (uint16) (trimy-b); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b)); atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } apath->aepos = trimx; apath->bepos = trimy; apath->diffs = trimd; apath->tlen = atlen; } return (0); } static int reverse_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align, int midd, int mida, int minp, int maxp) { char *aseq = align->aseq - 1; char *bseq = align->bseq - 1; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *NA; int *_HA, *_NA; Pebble *cells; int avail, cmax; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha; int morea, morey, mored; 
int moreha; int more, morem, lasta; int aclip, bclip; hgh = midd; low = midd; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEn; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; } more = 1; aclip = -INT32_MAX; bclip = INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; morem = -1; { int k; char *a; a = aseq + low; for (k = low; k <= hgh; k++) { int y, c, d; int ha, na; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE; #ifdef SHOW_TPS printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y+k; ha = avail++; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; } c = (y << 1) + k; while (y+k <= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na -= TRACE_SPACE; } if (c < besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; NA[k] = na; a += 1; } } 
if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } #ifdef DEBUG_WAVE printf("\nREVERSE WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif while (more && lasta <= besta + TRIM_MLAG) { int k, n; int ua; BVEC t; int am, ac, ap; char *a; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move, vd, md, had, nad, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEn)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEn; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) 
memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; } if (low > minp) { low -= 1; NA[low] = NA[low+1]; V[low] = ap = INT32_MAX; } else ap = V[low]; if (hgh < maxp) { hgh += 1; NA[hgh] = NA[hgh-1]; V[hgh] = INT32_MAX; } dif += 1; ac = V[hgh+1] = V[low-1] = INT32_MAX; a = aseq + low; t = PATH_INT; n = PATH_LEN; ua = -1; for (k = low; k <= hgh; k++) { int y, m; int ha; int c, d; BVEC b; Pebble *pb; am = ac; ac = ap; ap = V[d = k+1]; if (ac > ap) if (ap > am) { c = am-1; m = n; b = t; ha = ua; } else { c = ap-1; m = M[d]; b = T[d]; ha = HA[d]; } else if (ac > am) { c = am-1; m = n; b = t; ha = ua; } else { c = ac-2; m = M[k]; b = T[k]; ha = HA[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k <= NA[k]) { if (cells[ha].mark > NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] -= TRACE_SPACE; } if (c < besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; } } } t = T[k]; n = M[k]; ua = HA[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; a += 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem 
= M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } n = besta + WAVE_LAG; while (hgh >= low) if (V[hgh] > n) hgh -= 1; else { while (V[low] > n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; int atlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; } else trimx = trima-trimy; atlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark - k; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout); #endif if ((b+k)%TRACE_SPACE != 0) { h = cells[h].ptr; if (h < 0) { a = trimy; d = trimd; } else { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif atrace[--atlen] = (uint16) (b-a); atrace[--atlen] = (uint16) (d-e); b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; atrace[--atlen] = (uint16) (b-a); d = cells[h].diff; atrace[--atlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[--atlen] = (uint16) (b-trimy); atrace[--atlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy)); atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ 
(%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } } apath->abpos = trimx; apath->bbpos = trimy; apath->diffs = trimd; apath->tlen = - atlen; apath->trace = atrace + atlen; } return (0); } /* Find the longest local alignment between aseq and bseq through (xcnt,ycnt) See associated .h file for the precise definition of the interface. */ int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec, int diag, int anti, int lbord, int hbord, int prefix) { _Work_Data *work = ( _Work_Data *) ework; _Align_Spec *spec = (_Align_Spec *) espec; Path *apath; int minp, maxp; { int alen, blen; int maxtp, wsize; alen = align->alen; blen = align->blen; wsize = VectorEn*10000; if (wsize >= work->vecmax) if (enlarge_vector(work,wsize)) EXIT(1); if (alen < blen) maxtp = 2*(blen/spec->trace_space+2); else maxtp = 2*(alen/spec->trace_space+2); wsize = 2*maxtp*sizeof(uint16); if (wsize > work->pntmax) if (enlarge_points(work,wsize)) EXIT(1); apath = align->path; apath->trace = ((uint16 *) work->points) + maxtp; } #ifdef DEBUG_PASSES printf("\n"); #endif if (lbord < 0) minp = -INT32_MAX; else minp = diag-lbord; if (hbord < 0) maxp = INT32_MAX; else maxp = diag+hbord; if (prefix) { if (reverse_extend(work,spec,align,diag,anti,minp,maxp)) EXIT(1); apath->aepos = (anti-diag)/2; apath->bepos = (anti+diag)/2; #ifdef DEBUG_PASSES printf("E1 (%d,%d) => (%d,%d) %d\n", (anti+diag)/2,(anti-diag)/2,apath->abpos,apath->bbpos,apath->diffs); #endif } else { if (forward_extend(work,spec,align,diag,anti,minp,maxp)) EXIT(1); apath->abpos = (anti-diag)/2; apath->bbpos = (anti+diag)/2; #ifdef DEBUG_PASSES printf("F1 (%d,%d) => (%d,%d) %d\n", (anti+diag)/2,(anti-diag)/2,apath->aepos,apath->bepos,apath->diffs); #endif } #ifdef DEBUG_POINTS { uint16 *trace = (uint16 *) apath->trace; int a, h; printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos); printf(" %c\n",(COMP(align->flags) ? 
'c' : 'n')); a = apath->bbpos; for (h = 1; h < apath->tlen; h += 2) { int dif = trace[h-1]; int del = trace[h]; a += del; printf(" %d / %d (%d)\n",dif,del,a); } } #endif return (0); } /****************************************************************************************\ * * * OVERLAP MANIPULATION * * * \****************************************************************************************/ static int64 PtrSize = sizeof(void *); static int64 OvlIOSize = sizeof(Overlap) - sizeof(void *); int Read_Overlap(FILE *input, Overlap *ovl) { if (fread( ((char *) ovl) + PtrSize, OvlIOSize, 1, input) != 1) return (1); return (0); } int Read_Trace(FILE *input, Overlap *ovl, int tbytes) { if (tbytes > 0 && ovl->path.tlen > 0) { if (fread(ovl->path.trace, tbytes*ovl->path.tlen, 1, input) != 1) return (1); } return (0); } void Write_Overlap(FILE *output, Overlap *ovl, int tbytes) { fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output); if (ovl->path.trace != NULL) fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output); } void Compress_TraceTo8(Overlap *ovl) { uint16 *t16 = (uint16 *) ovl->path.trace; uint8 *t8 = (uint8 *) ovl->path.trace; int j; for (j = 0; j < ovl->path.tlen; j++) t8[j] = (uint8) (t16[j]); } void Decompress_TraceTo16(Overlap *ovl) { uint16 *t16 = (uint16 *) ovl->path.trace; uint8 *t8 = (uint8 *) ovl->path.trace; int j; for (j = ovl->path.tlen-1; j >= 0; j--) t16[j] = t8[j]; } void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent) { int i; fprintf(output,"%*s%d vs. ",indent,"",ovl->aread); if (COMP(ovl->flags)) fprintf(output,"c(%d)\n",ovl->bread); else fprintf(output,"%d\n",ovl->bread); fprintf(output,"%*s [%d,%d] vs [%d,%d] w. 
%d diffs\n",indent,"", ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos,ovl->path.diffs); if (tbytes == 1) { uint8 *trace = (uint8 *) (ovl->path.trace); if (trace != NULL) { int p = ovl->path.bbpos + trace[1]; fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); for (i = 3; i < ovl->path.tlen; i += 2) { if (i%10 == 0) fprintf(output,"\n%*s",indent+6,""); p += trace[i]; fprintf(output," %3d/%5d",trace[i-1],p); } fprintf(output,"\n"); } } else { uint16 *trace = (uint16 *) (ovl->path.trace); if (trace != NULL) { int p = ovl->path.bbpos + trace[1]; fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); for (i = 3; i < ovl->path.tlen; i += 2) { if (i%10 == 0) fprintf(output,"\n%*s",indent+6,""); p += trace[i]; fprintf(output," %3d/%5d",trace[i-1],p); } fprintf(output,"\n"); } } } int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname) { int i, p; if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2) { if (verbose) EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname); return (1); } p = ovl->path.bbpos; if (tspace <= TRACE_XOVR) { uint8 *trace8 = (uint8 *) ovl->path.trace; for (i = 1; i < ovl->path.tlen; i += 2) p += trace8[i]; } else { uint16 *trace16 = (uint16 *) ovl->path.trace; for (i = 1; i < ovl->path.tlen; i += 2) p += trace16[i]; } if (p != ovl->path.bepos) { if (verbose) EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname); return (1); } return (0); } void Flip_Alignment(Alignment *align, int full) { char *aseq = align->aseq; char *bseq = align->bseq; int alen = align->alen; int blen = align->blen; Path *path = align->path; int comp = COMP(align->flags); int *trace = (int *) path->trace; int tlen = path->tlen; int i, j, p; if (comp) { p = path->abpos; path->abpos = blen - path->bepos; path->bepos = alen - p; p = path->aepos; path->aepos = blen - path->bbpos; path->bbpos = alen - p; if (full) { alen += 2; blen += 2; for (i = 0; i < tlen; i++) if ((p = trace[i]) < 0) trace[i] 
/* Reverse-complement the len-base fragment at aseq in place: the order of the bases is
   reversed and every code c is replaced by 3-c.  The operation is an involution, so a
   second call on the same fragment restores its original contents.                    */

void Complement_Seq(char *aseq, int len)
{ int lo, hi;

  for (lo = 0, hi = len-1; lo < hi; lo++, hi--)
    { int left = 3 - aseq[lo];

      aseq[lo] = (char) (3 - aseq[hi]);
      aseq[hi] = (char) left;
    }
  if (lo == hi)                        /* odd length: complement the middle base */
    aseq[lo] = (char) (3 - aseq[lo]);
}
/* Symbol tables indexed by column code: 0-3 are the bases, 4 is "no base here",
   5 and 6 are the '[' / ']' markers bracketing the aligned region, 7 is a gap.  */

static char ToL[8] = { 'a', 'c', 'g', 't', '.', '[', ']', '-' };
static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' };

/* Print_Alignment: print the alignment in align to file as rows of three lines
   (A, match/diff tags, B), each row followed by its percentage of differences.

     indent : number of blanks printed before every line
     width  : number of alignment columns per row
     border : at most this many unaligned bases are shown before and after the alignment
     upper  : non-zero selects upper-case base letters
     coord  : if > 0, rows are prefixed with the A and B start coordinates in a field
              of this width

   align->path->trace is read as an int array of tlen entries: a negative entry -p
   means "advance both sequences until the A position is p, then put a gap in A";
   a positive entry p means "advance until the B position is p, then put a gap in B".
   Sequence positions are 1-based (aseq-1 / bseq-1 are used as bases) and a code of 4
   marks the end of a sequence.

   Returns 0; a failure to enlarge the work vector leaves via EXIT(1).  Nothing is
   printed when the trace is NULL.

   Fixes relative to the previous version: match/diff are zeroed before the unaligned
   prefix is emitted (they were read uninitialised if a row was flushed during the
   prefix, i.e. when border >= width), and a flushed row with no aligned columns now
   prints no percentage instead of dividing 0 by 0, matching the final-row logic.     */

int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework,
                    int indent, int width, int border, int upper, int coord)
{ _Work_Data *work  = (_Work_Data *) ework;
  int        *trace = align->path->trace;
  int         tlen  = align->path->tlen;

  char *Abuf, *Bbuf, *Dbuf;
  int   i, j, o;
  char *a, *b;
  char  mtag, dtag;
  int   prefa, prefb;
  int   aend, bend;
  int   sa, sb;
  int   match, diff;
  char *N2A;

  if (trace == NULL) return (0);

#ifdef SHOW_TRACE
  fprintf(file,"\nTrace:\n");
  for (i = 0; i < tlen; i++)
    fprintf(file," %3d\n",trace[i]);
#endif

  /* Three row buffers of width+1 characters each live in the work vector */

  o = sizeof(char)*3*(width+1);
  if (o > work->vecmax)
    if (enlarge_vector(work,o))
      EXIT(1);

  if (upper)
    N2A = ToU;
  else
    N2A = ToL;

  Abuf = (char *) work->vector;
  Bbuf = Abuf + (width+1);
  Dbuf = Bbuf + (width+1);

  aend = align->path->aepos;
  bend = align->path->bepos;

  Abuf[width] = Bbuf[width] = Dbuf[width] = '\0';

  match = diff = 0;     /* Must be defined before any row can be flushed */

                                           /* buffer/output next column */
#define COLUMN(x,y)							\
{ int u, v;								\
  if (o >= width)							\
    { fprintf(file,"\n");						\
      fprintf(file,"%*s",indent,"");					\
      if (coord > 0)							\
        { if (sa <= aend)						\
            fprintf(file," %*d",coord,sa);				\
          else								\
            fprintf(file," %*s",coord,"");				\
          fprintf(file," %s\n",Abuf);					\
          fprintf(file,"%*s %*s %s\n",indent,"",coord,"",Dbuf);	\
          fprintf(file,"%*s",indent,"");				\
          if (sb <= bend)						\
            fprintf(file," %*d",coord,sb);				\
          else								\
            fprintf(file," %*s",coord,"");				\
          fprintf(file," %s",Bbuf);					\
        }								\
      else								\
        { fprintf(file," %s\n",Abuf);					\
          fprintf(file,"%*s %s\n",indent,"",Dbuf);			\
          fprintf(file,"%*s %s",indent,"",Bbuf);			\
        }								\
      if (diff+match > 0)						\
        fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match));		\
      else								\
        fprintf(file,"\n");						\
      o  = 0;								\
      sa = i;								\
      sb = j;								\
      match = diff = 0;							\
    }									\
  u = (x);								\
  v = (y);								\
  if (u == 4 || v == 4)							\
    Dbuf[o] = ' ';							\
  else if (u == v)							\
    Dbuf[o] = mtag;							\
  else									\
    Dbuf[o] = dtag;							\
  Abuf[o] = N2A[u];							\
  Bbuf[o] = N2A[v];							\
  o += 1;								\
}

  a = align->aseq - 1;
  b = align->bseq - 1;

  o = 0;
  i = j = 1;

  /* Unaligned prefix: show at most border bases of each sequence, right-justified */

  prefa = align->path->abpos;
  prefb = align->path->bbpos;

  if (prefa > border)
    { i = prefa-(border-1);
      prefa = border;
    }
  if (prefb > border)
    { j = prefb-(border-1);
      prefb = border;
    }

  sa   = i;
  sb   = j;
  mtag = ':';
  dtag = ':';

  while (prefa > prefb)
    { COLUMN(a[i],4)
      i += 1;
      prefa -= 1;
    }
  while (prefb > prefa)
    { COLUMN(4,b[j])
      j += 1;
      prefb -= 1;
    }
  while (prefa > 0)
    { COLUMN(a[i],b[j])
      i += 1;
      j += 1;
      prefa -= 1;
    }

  mtag = '[';
  if (prefb > 0)
    COLUMN(5,5)

  mtag = '|';
  dtag = '*';

  { int p, c;      /* Output columns of alignment til reach trace end */

    for (c = 0; c < tlen; c++)
      if ((p = trace[c]) < 0)
        { p = -p;
          while (i != p)
            { COLUMN(a[i],b[j])
              if (a[i] == b[j])
                match += 1;
              else
                diff += 1;
              i += 1;
              j += 1;
            }
          COLUMN(7,b[j])
          j += 1;
          diff += 1;
        }
      else
        { while (j != p)
            { COLUMN(a[i],b[j])
              if (a[i] == b[j])
                match += 1;
              else
                diff += 1;
              i += 1;
              j += 1;
            }
          COLUMN(a[i],7)
          i += 1;
          diff += 1;
        }
    p = align->path->aepos;
    while (i <= p)
      { COLUMN(a[i],b[j])
        if (a[i] == b[j])
          match += 1;
        else
          diff += 1;
        i += 1;
        j += 1;
      }
  }

  { int c;     /* Output remaining column including unaligned suffix */

    mtag = ']';
    if (a[i] != 4 && b[j] != 4 && border > 0)
      COLUMN(6,6)

    mtag = ':';
    dtag = ':';

    c = 0;
    while (c < border && (a[i] != 4 || b[j] != 4))
      { if (a[i] != 4)
          { if (b[j] != 4)
              { COLUMN(a[i],b[j])
                i += 1;
                j += 1;
              }
            else
              { COLUMN(a[i],4)
                i += 1;
              }
          }
        else
          { COLUMN(4,b[j])
            j += 1;
          }
        c += 1;
      }
  }

  /* Print remainder of buffered col.s */

  fprintf(file,"\n");
  fprintf(file,"%*s",indent,"");
  if (coord > 0)
    { if (sa <= aend)
        fprintf(file," %*d",coord,sa);
      else
        fprintf(file," %*s",coord,"");
      fprintf(file," %.*s\n",o,Abuf);
      fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf);
      fprintf(file,"%*s",indent,"");
      if (sb <= bend)
        fprintf(file," %*d",coord,sb);
      else
        fprintf(file," %*s",coord,"");
      fprintf(file," %.*s",o,Bbuf);
    }
  else
    { fprintf(file," %.*s\n",o,Abuf);
      fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf);
      fprintf(file,"%*s %.*s",indent,"",o,Bbuf);
    }
  if (diff+match > 0)
    fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match));
  else
    fprintf(file,"\n");

  fflush(file);
  return (0);
}
Print_Reference(FILE *file, Alignment *align, Work_Data *ework, int indent, int block, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = align->path->trace; int tlen = align->path->tlen; char *Abuf, *Bbuf, *Dbuf; int i, j, o; char *a, *b; char mtag, dtag; int prefa, prefb; int aend, bend; int sa, sb, s0; int match, diff; char *N2A; int vmax; if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); for (i = 0; i < tlen; i++) fprintf(file," %3d\n",trace[i]); #endif vmax = work->vecmax/3; o = sizeof(char)*6*(block+1); if (o > vmax) { if (enlarge_vector(work,3*o)) EXIT(1); vmax = work->vecmax/3; } Abuf = (char *) work->vector; Bbuf = Abuf + vmax; Dbuf = Bbuf + vmax; if (upper) N2A = ToU; else N2A = ToL; aend = align->path->aepos; bend = align->path->bepos; #define BLOCK(x,y) \ { int u, v; \ if (i%block == 1 && i != s0 && x < 4 && o > 0) \ { fprintf(file,"\n"); \ fprintf(file,"%*s",indent,""); \ if (coord > 0) \ { if (sa <= aend) \ fprintf(file," %*d",coord,sa); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %.*s\n",o,Abuf); \ fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); \ fprintf(file,"%*s",indent,""); \ if (sb <= bend) \ fprintf(file," %*d",coord,sb); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %.*s",o,Bbuf); \ } \ else \ { fprintf(file," %.*s\n",o,Abuf); \ fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); \ fprintf(file,"%*s %.*s",indent,"",o,Bbuf); \ } \ fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \ o = 0; \ sa = i; \ sb = j; \ match = diff = 0; \ } \ u = (x); \ v = (y); \ if (u == 4 || v == 4) \ Dbuf[o] = ' '; \ else if (u == v) \ Dbuf[o] = mtag; \ else \ Dbuf[o] = dtag; \ Abuf[o] = N2A[u]; \ Bbuf[o] = N2A[v]; \ o += 1; \ if (o >= vmax) \ { if (enlarge_vector(work,3*o)) \ EXIT(1); \ vmax = work->vecmax/3; \ memmove(work->vector+2*vmax,Dbuf,o); \ memmove(work->vector+vmax,Bbuf,o); \ memmove(work->vector,Abuf,o); \ Abuf = (char *) work->vector; \ Bbuf = Abuf + vmax; \ 
Dbuf = Bbuf + vmax; \ } \ } a = align->aseq - 1; b = align->bseq - 1; o = 0; i = j = 1; prefa = align->path->abpos; prefb = align->path->bbpos; if (prefa > border) { i = prefa-(border-1); prefa = border; } if (prefb > border) { j = prefb-(border-1); prefb = border; } s0 = i; sa = i; sb = j; mtag = ':'; dtag = ':'; while (prefa > prefb) { BLOCK(a[i],4) i += 1; prefa -= 1; } while (prefb > prefa) { BLOCK(4,b[j]) j += 1; prefb -= 1; } while (prefa > 0) { BLOCK(a[i],b[j]) i += 1; j += 1; prefa -= 1; } mtag = '['; if (prefb > 0) BLOCK(5,5) mtag = '|'; dtag = '*'; match = diff = 0; { int p, c; /* Output columns of alignment til reach trace end */ for (c = 0; c < tlen; c++) if ((p = trace[c]) < 0) { p = -p; while (i != p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } BLOCK(7,b[j]) j += 1; diff += 1; } else { while (j != p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } BLOCK(a[i],7) i += 1; diff += 1; } p = align->path->aepos; while (i <= p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } } { int c; /* Output remaining column including unaligned suffix */ mtag = ']'; if (a[i] != 4 && b[j] != 4 && border > 0) BLOCK(6,6) mtag = ':'; dtag = ':'; c = 0; while (c < border && (a[i] != 4 || b[j] != 4)) { if (a[i] != 4) if (b[j] != 4) { BLOCK(a[i],b[j]) i += 1; j += 1; } else { BLOCK(a[i],4) i += 1; } else { BLOCK(4,b[j]) j += 1; } c += 1; } } /* Print remainder of buffered col.s */ fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (coord > 0) { if (sa <= aend) fprintf(file," %*d",coord,sa); else fprintf(file," %*s",coord,""); fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); fprintf(file,"%*s",indent,""); if (sb <= bend) fprintf(file," %*d",coord,sb); else fprintf(file," %*s",coord,""); fprintf(file," %.*s",o,Bbuf); } else { fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); fprintf(file,"%*s 
/* Write the character symbol to file rep times; nothing is written if rep <= 0. */

static inline void repchar(FILE *file, int symbol, int rep)
{ int n;

  for (n = rep; n > 0; n--)
    fputc(symbol,file);
}
#ifdef DEBUG_AWAVE

/* Debug aid: print the wave values V[low..hgh] on one line of stdout, preceded by
   the diagonal range.                                                             */

static void print_awave(int *V, int low, int hgh)
{ int k;

  printf(" [%6d,%6d]: ",low,hgh);
  for (k = low; k <= hgh; k++)
    printf(" %3d",V[k]);
  printf("\n");
  fflush(stdout);
}

#endif

#ifdef DEBUG_ALIGN

/* Current indentation of the debug output of the recursive divide-and-conquer */

static int depth = 0;

#endif

/* State shared by the trace-computing routines. */

typedef struct
  { int    *Stop;          // Next free slot of the indel script being built; dandc_nd
                           //   appends with *Stop++ and the caller derives tlen from it
    char   *Aabs, *Babs;   // Bases of the complete A and B sequences; sub-problem pointers
                           //   are turned into absolute positions relative to these
    int   **PVF, **PHF;    // List of waves for iterative np algorithms
                           //   (not referenced by the O(ND) routines in this section)
    int     mida,  midb;   // mid point division for mid-point algorithms
                           //   (not referenced by the O(ND) routines in this section)
    int    *VF,   *VB;     // Forward/Reverse waves for nd algorithms; originally annotated
                           //   "defunct: were used for O(nd) algorithms", but dandc_nd
                           //   still reads and writes both
  } Trace_Waves;
int blow, bhgh; char *a; y = 0; if (N < M) while (y < N && B[y] == A[y]) y += 1; else { while (y < M && B[y] == A[y]) y += 1; if (y >= M && N == M) return (0); } flow = 0; VF[0] = y; VF[-1] = -2; x = N-M; a = A-x; y = N-1; if (N > M) while (y >= x && B[y] == a[y]) y -= 1; else while (y >= 0 && B[y] == a[y]) y -= 1; blow = bhgh = -x; VB += x; VB[blow] = y; VB[blow-1] = N+1; for (D = 1; 1; D += 1) { int k, r; int am, ac, ap; // Forward wave flow -= 1; am = ac = VF[flow-1] = -2; a = A + D; x = M - D; for (k = D; k >= flow; k--) { ap = ac; ac = am+1; am = VF[k-1]; if (ac < am) if (ap < am) y = am; else y = ap; else if (ap < ac) y = ac; else y = ap; if (blow <= k && k <= bhgh) { r = VB[k]; if (y > r) { D = (D<<1)-1; if (ap > r) y = ap; else if (ac > r) y = ac; else y = r+1; x = k+y; goto OVERLAP2; } } if (N < x) while (y < N && B[y] == a[y]) y += 1; else while (y < x && B[y] == a[y]) y += 1; VF[k] = y; a -= 1; x += 1; } #ifdef DEBUG_AWAVE print_awave(VF,flow,D); #endif // Reverse Wave bhgh += 1; blow -= 1; am = ac = VB[blow-1] = N+1; a = A + bhgh; x = -bhgh; for (k = bhgh; k >= blow; k--) { ap = ac+1; ac = am; am = VB[k-1]; if (ac > am) if (ap > am) y = am; else y = ap; else if (ap > ac) y = ac; else y = ap; if (flow <= k && k <= D) { r = VF[k]; if (y <= r) { D = (D << 1); if (ap <= r) y = ap; else if (ac <= r) y = ac; else y = r; x = k+y; goto OVERLAP2; } } y -= 1; if (x > 0) while (y >= x && B[y] == a[y]) y -= 1; else while (y >= 0 && B[y] == a[y]) y -= 1; VB[k] = y; a -= 1; x += 1; } #ifdef DEBUG_AWAVE print_awave(VB,blow,bhgh); #endif } } OVERLAP2: #ifdef DEBUG_ALIGN printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D); fflush(stdout); #endif if (D > 1) { #ifdef DEBUG_ALIGN depth += 2; #endif dandc_nd(A,x,B,y,wave); dandc_nd(A+x,M-x,B+y,N-y,wave); #ifdef DEBUG_ALIGN depth -= 2; #endif } else if (D == 1) { if (M > N) { *wave->Stop++ = (B-wave->Babs)+y+1; #ifdef DEBUG_SCRIPT printf("%*s D %ld(%ld)\n",depth,"",(A-wave->Aabs)+x,(B-wave->Babs)+y+1); #endif } else if (M < N) { 
*wave->Stop++ = (wave->Aabs-A)-x-1; #ifdef DEBUG_SCRIPT printf("%*s I %ld(%ld)\n",depth,"",(B-wave->Babs)+y,(A-wave->Aabs)+x+1); #endif } #ifdef DEBUG_SCRIPT else printf("%*s %ld S %ld\n",depth,"",(wave->Aabs-A)+x,(B-wave->Babs)+y); #endif } return (D); } static int Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; int L, D; int asub, bsub; Path *path; int *trace; path = align->path; asub = path->aepos-path->abpos; bsub = path->bepos-path->bbpos; if (asub < bsub) L = bsub; else L = asub; L *= sizeof(int); if (L > work->tramax) if (enlarge_trace(work,L)) EXIT(1); trace = wave.Stop = ((int *) work->trace); D = 2*(path->diffs + 4)*sizeof(int); if (D > work->vecmax) if (enlarge_vector(work,D)) EXIT(1); D = (path->diffs+3)/2; wave.VF = ((int *) work->vector) + (D+1); wave.VB = wave.VF + (2*D+1); wave.Aabs = align->aseq; wave.Babs = align->bseq; path->diffs = dandc_nd(align->aseq+path->abpos,path->aepos-path->abpos, align->bseq+path->bbpos,path->bepos-path->bbpos,&wave); path->trace = trace; path->tlen = wave.Stop - trace; return (0); } /****************************************************************************************\ * * * O(NP) tracing algorithms * * * \****************************************************************************************/ /* Iterative O(np) algorithm for finding the alignment between two substrings (specified by a Path record). The variation includes handling substitutions and guarantees to find left-most alignments so that low complexity runs are always aligned in the same way. 
/* Iterative O(np) algorithm for finding the alignment between two substrings (specified
     by a Path record).  The variation includes handling substitutions and guarantees
     to find left-most alignments so that low complexity runs are always aligned in
     the same way.

   A[0..M-1] and B[0..N-1] are the substrings to align; wave supplies the per-level
   wave arrays PVF[D] (furthest points) and PHF[D] (back-pointer codes), the absolute
   sequence bases Aabs/Babs, and the output cursor Stop.  mode is one of UPPERMOST,
   LOWERMOST, or GREEDIEST and selects how the traceback is adjusted.

   Indels are appended at wave->Stop: a value bp+c for a step to a higher diagonal and
   ap-(c+k) for a step to a lower diagonal (labelled D and I respectively in the
   DEBUG_SCRIPT output).  The value returned is D + abs(M-N), where D is the wave level
   at which diagonal M-N first reached N.  If A and B start at the same position of the
   same sequence, an error is reported and EXIT(-1) is taken.                           */

#ifdef DEBUG_ALIGN
static int ToA[4] = { 'a', 'c', 'g', 't' };
#endif

static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode)
{ int **PVF = wave->PVF;
  int **PHF = wave->PHF;
  int   D;
  int   del = M-N;      //  Diagonal on which the alignment must end

  //  Forward phase: extend waves level by level until diagonal del reaches row N

  { int  *F0, *F1, *F2;  //  Waves of the current level, one level back, two levels back
    int  *HF;            //  Back-pointer codes being recorded for the current level
    int   low, hgh;      //  Current diagonal range of the wave
    int   posl, posh;    //  Limits on low/hgh, only tightened for a self comparison

#ifdef DEBUG_ALIGN
    printf("\n BASE %ld,%ld: %d vs %d\n",A-wave->Aabs,B-wave->Babs,M,N);
    printf(" A = ");
    for (D = 0; D < M; D++)
      printf("%c",ToA[(int) A[D]]);
    printf("\n");
    printf(" B = ");
    for (D = 0; D < N; D++)
      printf("%c",ToA[(int) B[D]]);
    printf("\n");
#endif

    if (del >= 0)
      { low = 0;
        hgh = del;
      }
    else
      { low = del;
        hgh = 0;
      }

    posl = -INT32_MAX;
    posh =  INT32_MAX;
    if (wave->Aabs == wave->Babs)    //  A and B come from the same sequence buffer
      { if (B == A)
          { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n");
            EXIT(-1);
          }
        else if (B < A)              //  NOTE(review): presumably stops the band from
          posl = (B-A)+1;            //    reaching the identity diagonal -- confirm
        else
          posh = (B-A)-1;
      }

    //  PVF[-2] and PVF[-1] serve as the two "previous" waves for level 0

    F1 = PVF[-2];
    F0 = PVF[-1];

    for (D = low-1; D <= hgh+1; D++)
      F1[D] = F0[D] = -2;
    F0[0] = -1;

    low += 1;
    hgh -= 1;

    for (D = 0; 1; D += 1)
      { int   k, i, j;
        int   am, ac, ap;
        char *a;

        F2 = F1;
        F1 = F0;
        F0 = PVF[D];
        HF = PHF[D];

        if ((D & 0x1) == 0)          //  Band widens by one on each side every other level
          { if (low > posl)
              low -= 1;
            if (hgh < posh)
              hgh += 1;
          }
        F0[hgh+1] = F0[low-1] = -2;

        //  FS_MOVE sets j to the largest of am, ac = F1[k]+1, and ap (ap wins ties, ac
        //    wins a tie with am), records in HF[k] which one was taken (mdir for am,
        //    0 for ac, pdir for ap), slides j along matching symbols, and stores the
        //    result in F0[k].  The macro stays defined after this function.

#define FS_MOVE(mdir,pdir)                      \
  ac = F1[k]+1;                                 \
  if (ac < am)                                  \
    if (ap < am)                                \
      { HF[k] = mdir;                           \
        j = am;                                 \
      }                                         \
    else                                        \
      { HF[k] = pdir;                           \
        j = ap;                                 \
      }                                         \
  else                                          \
    if (ap < ac)                                \
      { HF[k] = 0;                              \
        j = ac;                                 \
      }                                         \
    else                                        \
      { HF[k] = pdir;                           \
        j = ap;                                 \
      }                                         \
                                                \
  if (N < i)                                    \
    while (j < N && B[j] == a[j])               \
      j += 1;                                   \
  else                                          \
    while (j < i && B[j] == a[j])               \
      j += 1;                                   \
  F0[k] = j;

        //  Diagonals above del, from hgh downward

        j = -2;
        a = A + hgh;
        i = M - hgh;
        for (k = hgh; k > del; k--)
          { ap = j+1;
            am = F2[k-1];
            FS_MOVE(-1,4)
            a -= 1;
            i += 1;
          }

        //  Diagonals below del, from low upward

        j = -2;
        a = A + low;
        i = M - low;
        for (k = low; k < del; k++)
          { ap = F2[k+1]+1;
            am = j;
            FS_MOVE(2,1)
            a += 1;
            i -= 1;
          }

        //  Diagonal del itself (k, a, and i are as left by the loop above)

        ap = F0[del+1]+1;
        am = j;
        FS_MOVE(2,4)

#ifdef DEBUG_AWAVE
        print_awave(F0,low,hgh);
        print_awave(HF,low,hgh);
#endif

        if (F0[del] >= N)            //  End point reached
          break;
      }
  }

  //  Traceback phase

  { int   k, h, m, e, c;
    int   ap = (wave->Aabs-A)-1;
    int   bp = (B-wave->Babs)+1;

    PHF[0][0] = 3;                   //  Code 3 marks the end of a pointer chain

    c = N;
    k = del;
    e = PHF[D][k];
    PHF[D][k] = 3;

    //  First pass: walk from (D,del) back to (0,0), reversing the pointers in PHF as
    //    we go.  For UPPERMOST/LOWERMOST the path may be rerouted along the way.

    if (mode == UPPERMOST)

      while (e != 3)
        { h = k+e;
          if (e > 1)
            h -= 3;
          else if (e == 0)
            D -= 1;
          else
            D -= 2;
          if (h < k)       // => e = -1 or 2,  UPPERMOST
            { char *a;

              a = A + k;
              if (k < 0)
                m = -k;
              else
                m = 0;
              if (PVF[D][h] <= c)
                c = PVF[D][h]-1;
              while (c >= m && a[c] == B[c])
                c -= 1;
              if (e == -1)  //  => edge is 2, others are 1, and 0
                { if (c <= PVF[D+2][k+1])
                    { e = 4;
                      h = k+1;
                      D = D+2;
                    }
                  else if (c == PVF[D+1][k])
                    { e = 0;
                      h = k;
                      D = D+1;
                    }
                  else
                    PVF[D][h] = c+1;
                }
              else      //   => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise)
                { if (k == del)
                    m = D;
                  else
                    m = D-2;
                  if (c <= PVF[m][k+1])
                    { if (k == del)
                        e = 4;
                      else
                        e = 1;
                      h = k+1;
                      D = m;
                    }
                  else if (c == PVF[D-1][k])
                    { e = 0;
                      h = k;
                      D = D-1;
                    }
                  else
                    PVF[D][h] = c+1;
                }
            }
          m = PHF[D][h];
          PHF[D][h] = e;
          e = m;
          k = h;
        }

    else if (mode == LOWERMOST)

      while (e != 3)
        { h = k+e;
          if (e > 1)
            h -= 3;
          else if (e == 0)
            D -= 1;
          else
            D -= 2;
          if (h > k)       // => e = 1 or 4,   LOWERMOST
            { char *a;

              a = A + k;
              if (k < 0)
                m = -k;
              else
                m = 0;
              if (PVF[D][h] < c)
                c = PVF[D][h];
              while (c >= m && a[c] == B[c])
                c -= 1;
              if (e == 1)  //  => edge is 2, others are 1, and 0
                { if (c < PVF[D+2][k-1])
                    { e = 2;
                      h = k-1;
                      D = D+2;
                    }
                  else if (c == PVF[D+1][k])
                    { e = 0;
                      h = k;
                      D = D+1;
                    }
                  else
                    PVF[D][h] = c--;
                }
              else      //   => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise)
                { if (k == del)
                    m = D;
                  else
                    m = D-2;
                  if (c < PVF[m][k-1])
                    { if (k == del)
                        e = 2;
                      else
                        e = -1;
                      h = k-1;
                      D = m;
                    }
                  else if (c == PVF[D-1][k])
                    { e = 0;
                      h = k;
                      D = D-1;
                    }
                  else
                    PVF[D][h] = c--;
                }
            }
          m = PHF[D][h];
          PHF[D][h] = e;
          e = m;
          k = h;
        }

    else //  mode == GREEDIEST

      while (e != 3)
        { h = k+e;
          if (e > 1)
            h -= 3;
          else if (e == 0)
            D -= 1;
          else
            D -= 2;
          m = PHF[D][h];
          PHF[D][h] = e;
          e = m;
          k = h;
        }

    //  Second pass: follow the reversed pointers forward from (0,0), emitting one
    //    trace entry per diagonal change.  D ends at the level reached in the
    //    forward phase, which the return value relies on.

    k = D = 0;
    e = PHF[D][k];
    while (e != 3)
      { h = k-e;
        c = PVF[D][k];
        if (e > 1)
          h += 3;
        else if (e == 0)
          D += 1;
        else
          D += 2;
#ifdef DEBUG_SCRIPT
        if (h > k)
          printf(" D %d(%d)\n",(c-k)-(ap-1),c+bp);
        else if (h < k)
          printf(" I %d(%d)\n",c+(bp-1),(c+k)-ap);
        else
          printf(" %d S %d\n",(c+k)-(ap+1),c+(bp-1));
#endif
        if (h > k)
          *wave->Stop++ = bp+c;
        else if (h < k)
          *wave->Stop++ = ap-(c+k);
        k = h;
        e = PHF[D][h];
      }
  }

  return (D + abs(del));
}
PVF[D][h]-1; while (c >= m && a[c] == B[c]) c -= 1; if (e == -1) // => edge is 2, others are 1, and 0 { if (c <= PVF[D+2][k+1]) { e = 4; h = k+1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c+1; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c <= PVF[m][k+1]) { if (k == del) e = 4; else e = 1; h = k+1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c+1; } } k = h; } else if (mode == LOWERMOST) for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h > k) // => e = 1 or 4, LOWERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] < c) c = PVF[D][h]; while (c >= m && a[c] == B[c]) c -= 1; if (e == 1) // => edge is 2, others are 1, and 0 { if (c < PVF[D+2][k-1]) { e = 2; h = k-1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c--; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c < PVF[m][k-1]) { if (k == del) e = 2; else e = -1; h = k-1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c--; } } k = h; } else // mode == GREEDIEST for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; k = h; } wave->midb = (B-wave->Babs) + PVF[D][k]; wave->mida = (A-wave->Aabs) + k + PVF[D][k]; } return (0); } /****************************************************************************************\ * * * COMPUTE_TRACE FLAVORS * * * \****************************************************************************************/ int Compute_Trace_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; int M, N, D; path = align->path; aseq = align->aseq; bseq = align->bseq; M = path->aepos-path->abpos; N = 
path->bepos-path->bbpos; { int64 s; int d; int dmax; int **PVF, **PHF; if (M < N) s = N; else s = M; s *= sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); dmax = path->diffs - abs(M-N); s = (dmax+3)*2*((M+N+3)*sizeof(int) + sizeof(int *)); if (s > 256000000) return (Compute_Trace_ND_ALL(align,ework)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = M+N+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (N+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = ((int *) work->trace); wave.Aabs = aseq; wave.Babs = bseq; D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave,GREEDIEST); if (D < 0) EXIT(1); path->diffs = D; path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); return (0); } int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int dmax, nmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = 0; dmax = 0; for (d = 1; d < tlen; d += 2) { if (points[d-1] > dmax) dmax = points[d-1]; if (points[d] > nmax) nmax = points[d]; } if (tlen <= 1) nmax = N; s = (dmax+3)*2*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = trace_spacing+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + 
s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = (int *) (work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; diffs = 0; ab = path->abpos; ae = (ab/trace_spacing)*trace_spacing; bb = path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; } ae = path->aepos; be = path->bepos; d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); if (d < 0) EXIT(1); diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int dmax, nmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = 0; dmax = 0; for (d = 1; d < tlen; d += 2) { if (points[d-1] > dmax) dmax = points[d-1]; if (points[d] > nmax) nmax = points[d]; } if (tlen <= 1) nmax = N; s = (dmax+3)*4*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = trace_spacing+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = ((int *) work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; int as, bs; int af, bf; diffs = 0; ab = as = af = path->abpos; ae = 
(ab/trace_spacing)*trace_spacing; bb = bs = bf = path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode)) EXIT(1); af = wave.mida; bf = wave.midb; d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; as = af; bs = bf; } ae = path->aepos; be = path->bepos; if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode)) EXIT(1); af = wave.mida; bf = wave.midb; d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode); if (d < 0) EXIT(1); diffs += d; as = af; bs = bf; d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave,mode); if (d < 0) EXIT(1); diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int mmax, nmax, dmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = mmax = 0; for (d = 0; d < tlen; d += 2) { if (points[d] > mmax) mmax = points[d]; if (points[d+1] > nmax) nmax = points[d+1]; } if (tlen <= 1) { mmax = M; nmax = N; } if (mmax > nmax) dmax = nmax; else dmax = mmax; s = (dmax+3)*2*((mmax+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = mmax+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } 
wave.Stop = (int *) (work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; diffs = 0; ab = path->abpos; bb = path->bbpos; for (i = 0; i < tlen; i += 2) { ae = ab + points[i]; be = bb + points[i+1]; d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; } } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } DALIGNER-master/align.h000066400000000000000000000425641263373675100150270ustar00rootroot00000000000000/******************************************************************************************* * * Local alignment module. Routines for finding local alignments given a seed position, * representing such an l.a. with its interval and a set of pass-thru points, so that * a detailed alignment can be efficiently computed on demand. * * All routines work on a numeric representation of DNA sequences, i.e. 0 for A, 1 for C, * 2 for G, and 3 for T. * * Author: Gene Myers * Date : July 2013 * ********************************************************************************************/ #ifndef _A_MODULE #define _A_MODULE #include "DB.h" #define TRACE_XOVR 125 // If the trace spacing is not more than this value, then can // and do compress traces pts to 8-bit unsigned ints /*** INTERACTIVE vs BATCH version The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or batch version of the routines in this library are compiled. In batch mode, routines print an error message and exit. In interactive mode, the routines place the error message in EPLACE (also defined in DB.h) and return an error value, typically NULL if the routine returns a pointer, and an unusual integer value if the routine returns an integer. Below when an error return is described, one should understand that this value is returned only if the routine was compiled in INTERACTIVE mode. 
***/ /*** PATH ABSTRACTION: Coordinates are *between* characters where 0 is the tick just before the first char, 1 is the tick between the first and second character, and so on. Our data structure is called a Path referring to its conceptualization in an edit graph. A local alignment is specified by the point '(abpos,bbpos)' at which its path in the underlying edit graph starts, and the point '(aepos,bepos)' at which it ends. In other words A[abpos+1..aepos] is aligned to B[bbpos+1..bepos] (assuming X[1] is the *first* character of X). There are 'diffs' differences in an optimal local alignment between the beginning and end points of the alignment (if computed by Compute_Trace), or nearly so (if computed by Local_Alignment). Optionally, a Path can have additional information about the exact nature of the aligned substrings if the field 'trace' is not NULL. Trace points to either an array of integers (if computed by a Compute_Trace routine), or an array of unsigned short integers (if computed by Local_Alignment). If computed by Local_Alignment 'trace' points at a list of 'tlen' (always even) short values: d_0, b_0, d_1, b_1, ... d_n-1, b_n-1, d_n, b_n to be interpreted as follows. The alignment from (abpos,bbpos) to (aepos,bepos) passes through the n trace points for i in [1,n]: (a_i,b_i) where a_i = floor(abpos/TS)*TS + i*TS and b_i = bbpos + (b_0 + b_1 + ... + b_i-1) where also let a_0,b_0 = abpos,bbpos and a_(n+1),b_(n+1) = aepos,bepos. That is, the interior (i.e. i != 0 and i != n+1) trace points pass through every TS'th position of the aread where TS is the "trace spacing" employed when finding the alignment (see New_Align_Spec). Typically TS is 100. Then d_i is the number of differences in the portion of the alignment between (a_i,b_i) and (a_i+1,b_i+1). These trace points allow the Compute_Trace routines to efficiently compute the exact alignment between the two reads by efficiently computing exact alignments between consecutive pairs of trace points.
Moreover, the diff values give one an idea of the quality of the alignment along every segment of TS symbols of the aread. If computed by a Compute_Trace routine, 'trace' points at a list of 'tlen' integers < i1, i2, ... in > that encodes an exact alignment as follows. A negative number j indicates that a dash should be placed before A[-j] and a positive number k indicates that a dash should be placed before B[k], where A and B are the two sequences of the overlap. The indels occur in the trace in the order in which they occur along the alignment. For a good example of how to "decode" a trace into an alignment, see the code for the routine Print_Alignment. ***/ typedef struct { void *trace; int tlen; int diffs; int abpos, bbpos; int aepos, bepos; } Path; /*** ALIGNMENT ABSTRACTION: An alignment is modeled by an Alignment record, which in addition to a *pointer* to a 'path', gives pointers to the A and B sequences, their lengths, and indicates whether the B-sequence needs to be complemented ('comp' non-zero if so). The 'trace' pointer of the 'path' subrecord can be either NULL, a list of pass-through points, or an exact trace depending on what routines have been called on the record. One can (1) compute a trace, with Compute_Trace, either from scratch if 'path.trace' = NULL, or using the sequence of pass-through points in trace, (2) print an ASCII representation of an alignment, or (3) reverse the roles of A and B, and (4) complement a sequence (which is a reversible process). If the alignment record shows the B sequence as complemented, *** THEN IT IS THE RESPONSIBILITY OF THE CALLER *** to make sure that bseq points at a complement of the sequence before calling Compute_Trace or Print_Alignment. Complement_Seq complements the sequence a of length n. The operation does the complementation/reversal in place. Calling it a second time on a given fragment restores it to its original state. 
***/ #define COMP(x) ((x) & 0x1) #define COMP_FLAG 0x1 typedef struct { Path *path; uint32 flags; /* Pipeline status and complementation flags */ char *aseq; /* Pointer to A sequence */ char *bseq; /* Pointer to B sequence */ int alen; /* Length of A sequence */ int blen; /* Length of B sequence */ } Alignment; void Complement_Seq(char *a, int n); /* Many routines like Local_Alignment, Compute_Trace, and Print_Alignment need working storage that is more efficiently reused with each call, rather than being allocated anew with each call. Each *thread* can create a Work_Data object with New_Work_Data and this object holds and retains the working storage for routines of this module between calls to the routines. If enough memory for a Work_Data is not available then NULL is returned. Free_Work_Data frees a Work_Data object and all working storage held by it. */ typedef void Work_Data; Work_Data *New_Work_Data(); void Free_Work_Data(Work_Data *work); /* Local_Alignment seeks local alignments of a quality determined by a number of parameters. These are coded in an Align_Spec object that can be created with New_Align_Spec and freed with Free_Align_Spec when no longer needed. There are 4 essential parameters: ave_corr: the average correlation (1 - 2*error_rate) for the sought alignments. For Pacbio data we set this to .70 assuming an average of 15% error in each read. trace_space: the spacing interval for keeping trace points and segment differences (see description of 'trace' for Paths above) freq[4]: a 4-element vector where afreq[0] = frequency of A, f(A), freq[1] = f(C), freq[2] = f(G), and freq[3] = f(T). This vector is part of the header of every HITS database (see db.h). If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e. 
overlap), then the last/first 30 columns of the alignment are guaranteed to be suffix/prefix positive at correlation ave_corr * g(freq) where g is an empirically measured function that increases from 1 as the entropy of freq decreases. If memory is unavailable or the freq distribution is too skewed then NULL is returned. You can get back the original parameters used to create an Align_Spec with the simple utility functions below. */ typedef void Align_Spec; Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq); void Free_Align_Spec(Align_Spec *spec); int Trace_Spacing (Align_Spec *spec); double Average_Correlation(Align_Spec *spec); float *Base_Frequencies (Align_Spec *spec); /* Local_Alignment finds the longest significant local alignment between the sequences in 'align' subject to: (a) the alignment criterion given by the Align_Spec 'spec', (b) it passes through one of the points (anti+k)/2,(anti-k)/2 for k in [low,hgh] within the underlying dynamic programming matrix (i.e. the points on diagonals low to hgh on anti-diagonal anti or anti-1 (depending on whether the diagonal is odd or even)), (c) if lbord >= 0, then the alignment is always above diagonal low-lbord, and (d) if hbord >= 0, then the alignment is always below diagonal hgh+hbord. The path record of 'align' has its 'trace' filled from the point of view of an overlap between the aread and the bread. In addition a Path record from the point of view of the bread versus the aread is returned by the function, with this Path's 'trace' filled in appropriately. The space for the returned path and the two 'trace's are in the working storage supplied by the Work_Data packet and this space is reused with each call, so if one wants to retain the bread-path and the two trace point sequences, then they must be copied to user-allocated storage before calling the routine again. NULL is returned in the event of an error. 
Find_Extension is a variant of Local_Alignment that simply finds a local alignment that either ends (if prefix is non-zero) or begins (if prefix is zero) at the point (anti+diag)/2,(anti-diag)/2). All other parameters are as before. It returns a non-zero value only when INTERACTIVE is on and it cannot allocate the memory it needs. Only the path and trace with respect to the aread is returned. This routine is experimental and may not persist in later versions of the code. */ Path *Local_Alignment(Alignment *align, Work_Data *work, Align_Spec *spec, int low, int hgh, int anti, int lbord, int hbord); int Find_Extension(Alignment *align, Work_Data *work, Align_Spec *spec, // experimental !! int diag, int anti, int lbord, int hbord, int prefix); /* Given a legitimate Alignment object, Compute_Trace_X computes an exact trace for the alignment. If 'path.trace' is non-NULL, then it is assumed to be a sequence of pass-through points and diff levels computed by Local_Alignment. In either case 'path.trace' is set to point at an integer array within the storage of the Work_Data packet encoding an exact optimal trace from the start to end points. If the trace is needed beyond the next call to a routine that sets it, then it should be copied to an array allocated and managed by the caller. Compute_Trace_ALL does not require a sequence of pass-through points, as it computes the best alignment between (path->abpos,path->bbpos) and (path->aepos,path->bepos) in the edit graph between the sequences. Compute_Trace_PTS computes a trace by computing the trace between successive pass through points. It is much, much faster than Compute_Trace_ALL but at the tradeoff of not necessarily being optimal as pass-through points are not all perfect. Compute_Trace_MID computes a trace by computing the trace between the mid-points of alignments between two adjacent pairs of pass through points. It is generally twice as slow as Compute_Trace_PTS, but it produces nearer optimal alignments. 
All these routines return 1 if an error occurred and 0 otherwise. */ #define LOWERMOST -1 // Possible modes for "mode" parameter below) #define GREEDIEST 0 #define UPPERMOST 1 int Compute_Trace_ALL(Alignment *align, Work_Data *work); int Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing, int mode); int Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing, int mode); /* Compute_Trace_IRR (IRR for IRRegular) computes a trace for the given alignment where it assumes the spacing between trace points between both the A and B read varies, and further assumes that the A-spacing is given in the short integers normally occupied by the differences in the alignment between the trace points. This routine is experimental and may not persist in later versions of the code. */ int Compute_Trace_IRR(Alignment *align, Work_Data *work, int mode); // experimental !! /* Alignment_Cartoon prints an ASCII representation of the overlap relationship between the two reads of 'align' to the given 'file' indented by 'indent' space. Coord controls the display width of numbers, it must be not less than the width of any number to be displayed. If the alignment trace is an exact trace, then one can ask Print_Alignment to print an ASCII representation of the alignment 'align' to the file 'file'. Indent the display by "indent" spaces and put "width" columns per line in the display. Show "border" characters of sequence on each side of the aligned region. If upper is non-zero then display bases in upper case. If coord is greater than 0, then the positions of the first character in A and B in the given row is displayed with a field width given by coord's value. Print_Reference is like Print_Alignment but rather than printing exactly "width" columns per segment, it prints "block" characters of the A sequence in each segment.
This results in segments of different lengths, but is convenient when looking at two alignments involving A as segments are guaranteed to cover the same interval of A in a segment. Both Print routines return 1 if an error occurred (not enough memory), and 0 otherwise. Flip_Alignment modifies align so the roles of A and B are reversed. If full is off then the trace is ignored, otherwise the trace must be to a full alignment trace and this trace is also appropriately inverted. */ void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord); int Print_Alignment(FILE *file, Alignment *align, Work_Data *work, int indent, int width, int border, int upper, int coord); int Print_Reference(FILE *file, Alignment *align, Work_Data *work, int indent, int block, int border, int upper, int coord); void Flip_Alignment(Alignment *align, int full); /*** OVERLAP ABSTRACTION: Externally, between modules an Alignment is modeled by an "Overlap" record, which (a) replaces the pointers to the two sequences with their ID's in the HITS data bases, (b) does not contain the length of the 2 sequences (must fetch from DB), and (c) contains its path as a subrecord rather than as a pointer (indeed, typically the corresponding Alignment record points at the Overlap's path sub-record). The trace pointer is always to a sequence of trace points and can be either compressed (uint8) or uncompressed (uint16). One can read and write binary records of an "Overlap". ***/ typedef struct { Path path; /* Path: begin- and end-point of alignment + diffs */ uint32 flags; /* Pipeline status and complementation flags */ int aread; /* Id # of A sequence */ int bread; /* Id # of B sequence */ } Overlap; /* Read_Overlap reads the next Overlap record from stream 'input', not including the trace (if any), and without modifying 'ovl's trace pointer. Read_Trace reads the ensuing trace into the memory pointed at by the trace field of 'ovl'. 
It is assumed to be big enough to accommodate the trace where each value take 'tbytes' bytes (1 if uint8 or 2 if uint16). Write_Overlap write 'ovl' to stream 'output' followed by its trace vector (if any) that occupies 'tbytes' bytes per value. Print_Overlap prints an ASCII version of the contents of 'ovl' to stream 'output' where the trace occupes 'tbytes' per value and the print out is indented from the left margin by 'indent' spaces. Compress_TraceTo8 converts a trace fo 16-bit values to 8-bit values in place, and Decompress_TraceTo16 does the reverse conversion. Check_Trace_Points checks that the number of trace points is correct and that the sum of the b-read displacements equals the b-read alignment interval, assuming the trace spacing is 'tspace'. It reports an error message if there is a problem and 'verbose' is non-zero. The 'ovl' came from the file names 'fname'. */ int Read_Overlap(FILE *input, Overlap *ovl); int Read_Trace(FILE *innput, Overlap *ovl, int tbytes); void Write_Overlap(FILE *output, Overlap *ovl, int tbytes); void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent); void Compress_TraceTo8(Overlap *ovl); void Decompress_TraceTo16(Overlap *ovl); int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname); #endif // _A_MODULE DALIGNER-master/daligner.c000066400000000000000000000457511263373675100155160ustar00rootroot00000000000000/*********************************************************************************************\ * * Find all local alignment between long, noisy DNA reads: * Compare sequences in 'subject' database against those in the list of 'target' databases * searching for local alignments of 1000bp or more (defined constant MIN_OVERLAP in * filter.c). Subject is compared in both orientations againt each target. An output * stream of 'Overlap' records (see align.h) is written in binary to the standard output, * each encoding a given found local alignment between two of the sequences. 
The -v * option turns on a verbose reporting mode that gives statistics on each major stage. * * There cannot be more than 65,535 reads in a given db, and each read must be less than * 66,535 characters long. * * The filter operates by looking for a pair of diagonal bands of width 2^'s' that contain * a collection of exact matching 'k'-mers between the two sequences, such that the total * number of bases covered by 'k'-mer hits is 'h'. k cannot be larger than 15 in the * current implementation. * * Some k-mers are significantly over-represented (e.g. homopolymer runs). These are * suppressed as seed hits, with the parameter 'm' -- any k-mer that occurs more than * 'm' times in either the subject or target is not counted as a seed hit. If the -m * option is absent then no k-mer is suppressed. * * For each subject, target pair, say XXX and YYY, the program outputs a file containing * overlaps of the form XXX.YYY.[C|N]#.las where C implies that the reads in XXX were * complemented and N implies they were not (both comparisons are performed), and # is * the thread that detected and wrote out the collection of overlaps. For example, if * NTHREAD in the program is 4, then 8 files are output for each subject, target pair. 
* * Author: Gene Myers * Date : June 1, 2014 * *********************************************************************************************/ #include #include #include #include #include #include #include #include #include #if defined(BSD) #include #endif #include "DB.h" #include "filter.h" static char *Usage[] = { "[-vbAI] [-k] [-w] [-h] [-t] [-M]", " [-e] [-s] [-H]", " [-m]+ ...", }; int VERBOSE; // Globally visible to filter.c int BIASED; int MINOVER; int HGAP_MIN; int SYMMETRIC; int IDENTITY; uint64 MEM_LIMIT; uint64 MEM_PHYSICAL; /* Adapted from code by David Robert Nadeau (http://NadeauSoftware.com) licensed under * "Creative Commons Attribution 3.0 Unported License" * (http://creativecommons.org/licenses/by/3.0/deed.en_US) * * I removed Windows options, reformated, and return int64 instead of size_t */ static int64 getMemorySize( ) { #if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64)) // OSX, NetBSD, OpenBSD int mib[2]; size_t size = 0; size_t len = sizeof( size ); mib[0] = CTL_HW; #if defined(HW_MEMSIZE) mib[1] = HW_MEMSIZE; // OSX #elif defined(HW_PHYSMEM64) mib[1] = HW_PHYSMEM64; // NetBSD, OpenBSD #endif if (sysctl(mib,2,&size,&len,NULL,0) == 0) return ((size_t) size); return (0); #elif defined(_SC_AIX_REALMEM) // AIX return ((size_t) sysconf( _SC_AIX_REALMEM ) * ((size_t) 1024L)); #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) // FreeBSD, Linux, OpenBSD, & Solaris size_t size = 0; size = (size_t) sysconf(_SC_PHYS_PAGES); return (size * ((size_t) sysconf(_SC_PAGESIZE))); #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE) // ? Legacy ? 
size_t size = 0; size = (size_t) sysconf(_SC_PHYS_PAGES); return (size * ((size_t) sysconf(_SC_PAGE_SIZE))); #elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM)) // DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX int mib[2]; unsigned int size = 0; size_t len = sizeof( size ); mib[0] = CTL_HW; #if defined(HW_REALMEM) mib[1] = HW_REALMEM; // FreeBSD #elif defined(HW_PYSMEM) mib[1] = HW_PHYSMEM; // Others #endif if (sysctl(mib,2,&size,&len,NULL,0) == 0) return (size_t)size; return (0); #else return (0); #endif } typedef struct { int *ano; int *end; int idx; int out; } Event; static void reheap(int s, Event **heap, int hsize) { int c, l, r; Event *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; hr = heap[r]; if (hr->idx > hl->idx) { if (hs->idx > hl->idx) { heap[c] = hl; c = l; } else break; } else { if (hs->idx > hr->idx) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } int64 Merge_Size(HITS_DB *block, int mtop) { Event ev[mtop+1]; Event *heap[mtop+2]; int r, mhalf; int64 nsize; { HITS_TRACK *track; int i; track = block->tracks; for (i = 0; i < mtop; i++) { ev[i].ano = ((int *) (track->data)) + ((int64 *) (track->anno))[0]; ev[i].out = 1; heap[i+1] = ev+i; track = track->next; } ev[mtop].idx = INT32_MAX; heap[mtop+1] = ev+mtop; } mhalf = mtop/2; nsize = 0; for (r = 0; r < block->nreads; r++) { int i, level, hsize; HITS_TRACK *track; track = block->tracks; for (i = 0; i < mtop; i++) { ev[i].end = ((int *) (track->data)) + ((int64 *) (track->anno))[r+1]; if (ev[i].ano < ev[i].end) ev[i].idx = *(ev[i].ano); else ev[i].idx = INT32_MAX; track = track->next; } hsize = mtop; for (i = mhalf; i > 1; i--) reheap(i,heap,hsize); level = 0; while (1) { Event *p; reheap(1,heap,hsize); p = heap[1]; if (p->idx == INT32_MAX) break; p->out = 1-p->out; if (p->out) { level -= 1; if (level == 0) nsize += 1; } else { if (level == 0) nsize += 1; level += 1; } p->ano += 1; if (p->ano >= p->end) p->idx = INT32_MAX; 
else p->idx = *(p->ano); } } return (nsize); } HITS_TRACK *Merge_Tracks(HITS_DB *block, int mtop, int64 nsize) { HITS_TRACK *ntrack; Event ev[mtop+1]; Event *heap[mtop+2]; int r, mhalf; int64 *anno; int *data; ntrack = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating merged track"); if (ntrack == NULL) exit (1); ntrack->name = Strdup("merge","Allocating merged track"); ntrack->anno = anno = (int64 *) Malloc(sizeof(int64)*(block->nreads+1),"Allocating merged track"); ntrack->data = data = (int *) Malloc(sizeof(int)*nsize,"Allocating merged track"); ntrack->size = sizeof(int); ntrack->next = NULL; if (anno == NULL || data == NULL || ntrack->name == NULL) exit (1); { HITS_TRACK *track; int i; track = block->tracks; for (i = 0; i < mtop; i++) { ev[i].ano = ((int *) (track->data)) + ((int64 *) (track->anno))[0]; ev[i].out = 1; heap[i+1] = ev+i; track = track->next; } ev[mtop].idx = INT32_MAX; heap[mtop+1] = ev+mtop; } mhalf = mtop/2; nsize = 0; for (r = 0; r < block->nreads; r++) { int i, level, hsize; HITS_TRACK *track; anno[r] = nsize; track = block->tracks; for (i = 0; i < mtop; i++) { ev[i].end = ((int *) (track->data)) + ((int64 *) (track->anno))[r+1]; if (ev[i].ano < ev[i].end) ev[i].idx = *(ev[i].ano); else ev[i].idx = INT32_MAX; track = track->next; } hsize = mtop; for (i = mhalf; i > 1; i--) reheap(i,heap,hsize); level = 0; while (1) { Event *p; reheap(1,heap,hsize); p = heap[1]; if (p->idx == INT32_MAX) break; p->out = 1-p->out; if (p->out) { level -= 1; if (level == 0) data[nsize++] = p->idx; } else { if (level == 0) data[nsize++] = p->idx; level += 1; } p->ano += 1; if (p->ano >= p->end) p->idx = INT32_MAX; else p->idx = *(p->ano); } } anno[r] = nsize; return (ntrack); } static int read_DB(HITS_DB *block, char *name, char **mask, int *mstat, int mtop, int kmer) { int i, isdam, status, kind, stop; isdam = Open_DB(name,block); if (isdam < 0) exit (1); for (i = 0; i < mtop; i++) { status = Check_Track(block,mask[i],&kind); if (status >= 0) if (kind == 
MASK_TRACK) mstat[i] = 0; else mstat[i] = -3; else mstat[i] = status; if (status == 0 && kind == MASK_TRACK) Load_Track(block,mask[i]); } Trim_DB(block); stop = 0; for (i = 0; i < mtop; i++) { HITS_TRACK *track; int64 *anno; int j; status = Check_Track(block,mask[i],&kind); if (status < 0 || kind != MASK_TRACK) continue; stop += 1; track = Load_Track(block,mask[i]); anno = (int64 *) (track->anno); for (j = 0; j <= block->nreads; j++) anno[j] /= sizeof(int); } if (stop > 1) { int64 nsize; HITS_TRACK *track; nsize = Merge_Size(block,stop); track = Merge_Tracks(block,stop,nsize); while (block->tracks != NULL) Close_Track(block,block->tracks->name); block->tracks = track; } if (block->cutoff < kmer) { for (i = 0; i < block->nreads; i++) if (block->reads[i].rlen < kmer) { fprintf(stderr,"%s: Block %s contains reads < %dbp long ! Run DBsplit.\n", Prog_Name,name,kmer); exit (1); } } Read_All_Sequences(block,0); return (isdam); } static void complement(char *s, int len) { char *t; int c; t = s + (len-1); while (s < t) { c = *s; *s = (char) (3-*t); *t = (char) (3-c); s += 1; t -= 1; } if (s == t) *s = (char) (3-*s); } static HITS_DB *complement_DB(HITS_DB *block, int inplace) { static HITS_DB _cblock, *cblock = &_cblock; int nreads; HITS_READ *reads; char *seq; nreads = block->nreads; reads = block->reads; if (inplace) { seq = (char *) block->bases; cblock = block; } else { seq = (char *) Malloc(block->reads[nreads].boff+1,"Allocating dazzler sequence block"); if (seq == NULL) exit (1); *seq++ = 4; memcpy(seq,block->bases,block->reads[nreads].boff); *cblock = *block; cblock->bases = (void *) seq; cblock->tracks = NULL; } { int i; float x; x = cblock->freq[0]; cblock->freq[0] = cblock->freq[3]; cblock->freq[3] = x; x = cblock->freq[1]; cblock->freq[1] = cblock->freq[2]; cblock->freq[2] = x; for (i = 0; i < nreads; i++) complement(seq+reads[i].boff,reads[i].rlen); } { HITS_TRACK *src, *trg; int *data, *tata; int i, x, rlen; int64 *tano, *anno; int64 j, k; for (src = 
block->tracks; src != NULL; src = src->next) { tano = (int64 *) src->anno; tata = (int *) src->data; if (inplace) { data = tata; anno = tano; trg = src; } else { data = (int *) Malloc(sizeof(int)*tano[nreads], "Allocating dazzler interval track data"); anno = (int64 *) Malloc(sizeof(int64)*(nreads+1), "Allocating dazzler interval track index"); trg = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK), "Allocating dazzler interval track header"); if (data == NULL || trg == NULL || anno == NULL) exit (1); trg->name = Strdup(src->name,"Copying track name"); if (trg->name == NULL) exit (1); trg->size = 4; trg->anno = (void *) anno; trg->data = (void *) data; trg->next = cblock->tracks; cblock->tracks = trg; } for (i = 0; i < nreads; i++) { rlen = reads[i].rlen; anno[i] = tano[i]; j = tano[i+1]-1; k = tano[i]; while (k < j) { x = tata[j]; data[j--] = rlen - tata[k]; data[k++] = rlen - x; } if (k == j) data[k] = rlen - tata[k]; } anno[nreads] = tano[nreads]; } } return (cblock); } int main(int argc, char *argv[]) { HITS_DB _ablock, _bblock; HITS_DB *ablock = &_ablock, *bblock = &_bblock; char *afile, *bfile; char *aroot, *broot; void *aindex, *bindex; int alen, blen; Align_Spec *asettings; int isdam; int MMAX, MTOP, *MSTAT; char **MASK; int KMER_LEN; int BIN_SHIFT; int MAX_REPS; int HIT_MIN; double AVE_ERROR; int SPACING; { int i, j, k; int flags[128]; char *eptr; ARG_INIT("daligner") KMER_LEN = 14; HIT_MIN = 35; BIN_SHIFT = 6; MAX_REPS = 0; HGAP_MIN = 0; AVE_ERROR = .70; SPACING = 100; MINOVER = 1000; // Globally visible to filter.c MEM_PHYSICAL = getMemorySize(); MEM_LIMIT = MEM_PHYSICAL; if (MEM_PHYSICAL == 0) { fprintf(stderr,"\nWarning: Could not get physical memory size\n"); fflush(stderr); } MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); MSTAT = (int *) Malloc(MMAX*sizeof(int),"Allocating mask status array"); if (MASK == NULL || MSTAT == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch 
(argv[i][1]) { default: ARG_FLAGS("vbAI") break; case 'k': ARG_POSITIVE(KMER_LEN,"K-mer length") break; case 'w': ARG_POSITIVE(BIN_SHIFT,"Log of bin width") break; case 'h': ARG_POSITIVE(HIT_MIN,"Hit threshold (in bp.s)") break; case 't': ARG_POSITIVE(MAX_REPS,"Tuple supression frequency") break; case 'H': ARG_POSITIVE(HGAP_MIN,"HGAP threshold (in bp.s)") break; case 'e': ARG_REAL(AVE_ERROR) if (AVE_ERROR < .7 || AVE_ERROR >= 1.) { fprintf(stderr,"%s: Average correlation must be in [.7,1.) (%g)\n", Prog_Name,AVE_ERROR); exit (1); } break; case 'l': ARG_POSITIVE(MINOVER,"Minimum alignment length") break; case 's': ARG_POSITIVE(SPACING,"Trace spacing") break; case 'M': { int limit; ARG_NON_NEGATIVE(limit,"Memory allocation (in Gb)") MEM_LIMIT = limit * 0x40000000ll; break; } case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); MSTAT = (int *) Realloc(MSTAT,MMAX*sizeof(int),"Reallocating mask status array"); if (MASK == NULL || MSTAT == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; // Globally declared in filter.h BIASED = flags['b']; // Globally declared in filter.h SYMMETRIC = 1-flags['A']; IDENTITY = flags['I']; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); exit (1); } } MINOVER *= 2; if (Set_Filter_Params(KMER_LEN,BIN_SHIFT,MAX_REPS,HIT_MIN)) { fprintf(stderr,"Illegal combination of filter parameters\n"); exit (1); } /* Read in the reads in A */ afile = argv[1]; isdam = read_DB(ablock,afile,MASK,MSTAT,MTOP,KMER_LEN); if (isdam) aroot = Root(afile,".dam"); else aroot = Root(afile,".db"); asettings = New_Align_Spec( AVE_ERROR, SPACING, ablock->freq); /* Compare against reads in B in both orientations */ { int i, j; aindex = NULL; broot = NULL; for (i = 2; i < argc; i++) 
{ bfile = argv[i]; if (strcmp(afile,bfile) != 0) { isdam = read_DB(bblock,bfile,MASK,MSTAT,MTOP,KMER_LEN); if (isdam) broot = Root(bfile,".dam"); else broot = Root(bfile,".db"); } if (i == 2) { for (j = 0; j < MTOP; j++) { if (MSTAT[j] == -2) printf("%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]); else if (MSTAT[j] == -1) printf("%s: Warning: %s track not sync'd with relevant db.\n",Prog_Name,MASK[i]); else if (MSTAT[j] == -3) printf("%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]); } if (VERBOSE) printf("\nBuilding index for %s\n",aroot); aindex = Sort_Kmers(ablock,&alen); } if (strcmp(afile,bfile) != 0) { if (VERBOSE) printf("\nBuilding index for %s\n",broot); bindex = Sort_Kmers(bblock,&blen); Match_Filter(aroot,ablock,broot,bblock,aindex,alen,bindex,blen,0,asettings); bblock = complement_DB(bblock,1); if (VERBOSE) printf("\nBuilding index for c(%s)\n",broot); bindex = Sort_Kmers(bblock,&blen); Match_Filter(aroot,ablock,broot,bblock,aindex,alen,bindex,blen,1,asettings); free(broot); } else { Match_Filter(aroot,ablock,aroot,ablock,aindex,alen,aindex,alen,0,asettings); bblock = complement_DB(ablock,0); if (VERBOSE) printf("\nBuilding index for c(%s)\n",aroot); bindex = Sort_Kmers(bblock,&blen); Match_Filter(aroot,ablock,aroot,bblock,aindex,alen,bindex,blen,1,asettings); bblock->reads = NULL; // ablock & bblock share "reads" vector, don't let Close_DB // free it ! } Close_DB(bblock); } } exit (0); } DALIGNER-master/filter.c000066400000000000000000001715331263373675100152140ustar00rootroot00000000000000/******************************************************************************************* * * Fast local alignment filter for long, noisy reads based on "dumbing down" of my RECOMB 2005 * filter with Jens Stoye, and a "smarting up" of the k-mer matching by turning it into * a threaded sort and merge paradigm using a super cache coherent radix sort. 
Local
 *     alignment is accomplished with dynamically-banded O(nd) algorithm that terminates when
 *     it fails to find an e-matching patch for a significant distance, and polishes the match
 *     to the last e-prefix-positive 32-mer.
 *
 *  Author :  Gene Myers
 *  First  :  June 2013
 *  Current:  June 1, 2014
 *
 ********************************************************************************************/

//  A complete threaded code for the filter

//  NOTE(review): the header names of the seven #include lines below are missing in this
//    copy of the file -- restore them from upstream.

#include
#include
#include
#include
#include
#include
#include

#include "DB.h"
#include "filter.h"
#include "align.h"

#define THREAD    pthread_t

#define MAX_BIAS  2    //  In -b mode, don't consider tuples with specificity
                       //     <= 4 ^ -(kmer-MAX_BIAS)
#define MAXGRAM 10000  //  Cap on k-mer count histogram (in count_thread, merge_thread)

#define PANEL_SIZE     50000   //  Size to break up very long A-reads
#define PANEL_OVERLAP  10000   //  Overlap of A-panels

#define MATCH_CHUNK    100     //  Max expected number of hits between two reads
#define TRACE_CHUNK  20000     //  Max expected trace points in hits between two reads

//  Debug switches: change an #undef to a #define to enable the matching #ifdef section

#undef  TEST_LSORT
#undef  TEST_KSORT
#undef  TEST_PAIRS
#undef  TEST_CSORT
#define    HOW_MANY   3000   //  Print first HOW_MANY items for each of the TEST options above

#undef  TEST_GATHER
#undef  TEST_CONTAIN
#undef  SHOW_OVERLAP          //  Show the cartoon
#undef  SHOW_ALIGNMENT        //  Show the alignment
#define   ALIGN_WIDTH    80   //     Parameters for alignment
#define   ALIGN_INDENT   20
#define   ALIGN_BORDER   10

//  Each of the following display/test modes defines NOTHREAD

#ifdef SHOW_OVERLAP
#define NOTHREAD
#endif

#ifdef TEST_GATHER
#define NOTHREAD
#endif

#ifdef TEST_CONTAIN
#define NOTHREAD
#endif

//  Two 64-bit words; the radix sort (lex_sort/lex_thread) operates on arrays of these

typedef struct
  { uint64 p1;   //  The lower half
    uint64 p2;
  } Double;

//  KmerPos and SeedPair are declared with a different field order per byte order.
//  NOTE(review): presumably so that the records sort correctly when viewed as a Double
//    (Sort_Kmers casts KmerPos arrays to Double arrays) -- confirm.

#if __ORDER_LITTLE_ENDIAN__ == __BYTE_ORDER__

typedef struct
  { uint64 code;
    int    rpos;
    int    read;
  } KmerPos;

typedef struct
  { int    diag;
    int    apos;
    int    aread;
    int    bread;
  } SeedPair;

#else

typedef struct
  { uint64 code;
    int    read;
    int    rpos;
  } KmerPos;

typedef struct
  { int    apos;
    int    diag;
    int    bread;
    int    aread;
  } SeedPair;

#endif
/******************************************************************************************* * * PARAMETER SETUP * ********************************************************************************************/ static int Kmer; static int Hitmin; static int Binshift; static int Suppress; static int Kshift; // 2*Kmer static uint64 Kmask; // 4^Kmer-1 static uint64 Kpowr; // 4^Kmer static int TooFrequent; // (Suppress != 0) ? Suppress : INT32_MAX int Set_Filter_Params(int kmer, int binshift, int suppress, int hitmin) { if (kmer <= 1) return (1); Kmer = kmer; Binshift = binshift; Suppress = suppress; Hitmin = hitmin; Kshift = 2*Kmer; Kpowr = (0x1llu << Kshift); Kmask = Kpowr-1; if (Suppress == 0) TooFrequent = INT32_MAX; else TooFrequent = Suppress; return (0); } /******************************************************************************************* * * LEXICOGRAPHIC SORT * ********************************************************************************************/ #define BMER 4 #define BSHIFT 8 // = 2*BMER #define BPOWR 256 // = 2^BSHIFT #define BMASK 0xffllu // = BPOWR-1 static uint64 QMASK; // = BMASK << NSHIFT static int LEX_shift; static int64 LEX_zsize; static int LEX_last; static int LEX_next; static Double *LEX_src; static Double *LEX_trg; typedef struct { int64 beg; int64 end; int64 tptr[BPOWR]; int64 sptr[NTHREADS*BPOWR]; } Lex_Arg; static void *lex_thread(void *arg) { Lex_Arg *data = (Lex_Arg *) arg; int64 *sptr = data->sptr; int64 *tptr = data->tptr; int shift = LEX_shift; // Must be a multiple of 8 in [0,120] int qshift = (LEX_next - LEX_shift) - NSHIFT; int64 zsize = LEX_zsize; Double *src = LEX_src; Double *trg = LEX_trg; int64 i, n, x; uint64 c, b; n = data->end; if (shift >= 64) { shift -= 64; if (LEX_last) for (i = data->beg; i < n; i++) { c = src[i].p2; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; } else for (i = data->beg; i < n; i++) { c = src[i].p2; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; sptr[((b >> qshift) & 
QMASK) + x/zsize] += 1; } } else if ( ! LEX_last && LEX_next >= 64) // && LEX_shift < 64 { qshift = (LEX_next - 64) - NSHIFT; if (qshift < 0) for (i = data->beg; i < n; i++) { c = src[i].p1; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; sptr[((src[i].p2 << NSHIFT) & QMASK) + x/zsize] += 1; } else for (i = data->beg; i < n; i++) { c = src[i].p1; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; sptr[((src[i].p2 >> qshift) & QMASK) + x/zsize] += 1; } } else // LEX_last || LEX_next < 64 if (LEX_last) if (shift == 0) for (i = data->beg; i < n; i++) { c = src[i].p1; x = tptr[c&BMASK]++; trg[x] = src[i]; } else for (i = data->beg; i < n; i++) { c = src[i].p1; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; } else if (shift == 0) for (i = data->beg; i < n; i++) { c = src[i].p1; x = tptr[c&BMASK]++; trg[x] = src[i]; sptr[((c >> qshift) & QMASK) + x/zsize] += 1; } else for (i = data->beg; i < n; i++) { c = src[i].p1; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; sptr[((b >> qshift) & QMASK) + x/zsize] += 1; } return (NULL); } static Double *lex_sort(int bytes[16], Double *src, Double *trg, Lex_Arg *parmx) { THREAD threads[NTHREADS]; int64 len, x, y; Double *xch; int i, j, k, z; int b, c, fb; len = parmx[NTHREADS-1].end; LEX_zsize = (len-1)/NTHREADS + 1; LEX_src = src; LEX_trg = trg; QMASK = (BMASK << NSHIFT); for (c = 0; c < 16; c++) if (bytes[c]) break; fb = c; for (b = c; b < 16; b = c) { for (c = b+1; c < 16; c++) if (bytes[c]) break; LEX_last = (c >= 16); LEX_shift = (b << 3); LEX_next = (c << 3); if (b == fb) { for (i = 0; i < NTHREADS; i++) for (z = 0; z < NTHREADS*BPOWR; z++) parmx[i].sptr[z] = 0; } else { x = 0; for (i = 0; i < NTHREADS; i++) { parmx[i].beg = x; x = LEX_zsize*(i+1); if (x > len) x = len; parmx[i].end = x; for (j = 0; j < BPOWR; j++) parmx[i].tptr[j] = 0; } parmx[NTHREADS-1].end = len; for (j = 0; j < BPOWR; j++) { k = (j << NSHIFT); for (z = 0; z < NTHREADS; z++) for (i = 0; i < NTHREADS; i++) { parmx[i].tptr[j] += 
parmx[z].sptr[k+i]; parmx[z].sptr[k+i] = 0; } } } x = 0; for (j = 0; j < BPOWR; j++) for (i = 0; i < NTHREADS; i++) { y = parmx[i].tptr[j]; parmx[i].tptr[j] = x; x += y; } for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,lex_thread,parmx+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); xch = LEX_src; LEX_src = LEX_trg; LEX_trg = xch; #ifdef TEST_LSORT printf("\nLSORT %d\n",LEX_shift); if (LEX_shift >= 64) { x = (1 << ((LEX_shift-64)+BSHIFT))-1; for (i = 0; i < len; i++) { printf("%6d: %8llx %8llx %8llx %8llx : %4llx", i,LEX_src[i].p2>>32,(LEX_src[i].p2)&0xffffffffll,LEX_src[i].p1>>32, LEX_src[i].p1&0xffffffffll,LEX_src[i].p2&x); if (i > 0 && (LEX_src[i].p1 < LEX_src[i].p1 || (LEX_src[i].p1 == LEX_src[i].p1 && (LEX_src[i].p2 & x) < (LEX_src[i-1].p2 & x)))) printf(" OO"); printf("\n"); } } else { x = (1 << (LEX_shift+BSHIFT))-1; for (i = 0; i < len; i++) { printf("%6d: %8llx %8llx %8llx %8llx : %4llx", i,LEX_src[i].p2>>32,(LEX_src[i].p2)&0xffffffffll,LEX_src[i].p1>>32, LEX_src[i].p1&0xffffffffll,LEX_src[i].p1&x); if (i > 0 && (LEX_src[i].p1 & x) < (LEX_src[i-1].p1 & x)) printf(" OO"); printf("\n"); } } #endif } return (LEX_src); } /******************************************************************************************* * * INDEX BUILD * ********************************************************************************************/ static int *NormShift = NULL; static int LogNorm, LogThresh; static int LogBase[4]; static HITS_DB *TA_block; static KmerPos *TA_list; static HITS_TRACK *TA_track; typedef struct { int tnum; int64 *kptr; int fill; } Tuple_Arg; static void *tuple_thread(void *arg) { Tuple_Arg *data = (Tuple_Arg *) arg; int tnum = data->tnum; int64 *kptr = data->kptr; KmerPos *list = TA_list; int i, m, n, x, p; uint64 c; char *s; c = TA_block->nreads; i = (c * tnum) >> NSHIFT; n = TA_block->reads[i].boff; s = ((char *) (TA_block->bases)) + n; n -= Kmer*i; if (TA_track != NULL) { HITS_READ *reads = TA_block->reads; int64 *anno1 = 
((int64 *) (TA_track->anno)) + 1; int *point = (int *) (TA_track->data); int64 a, b, f; int q; f = anno1[i-1]; for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++) { b = f; f = anno1[i]; q = reads[i].rlen; for (a = b; a <= f; a += 2) { if (a == b) p = 0; else p = point[a-1]; if (a == f) q = reads[i].rlen; else q = point[a]; if (p+Kmer <= q) { c = 0; for (x = 1; x < Kmer; x++) c = (c << 2) | s[p++]; while (p < q) { x = s[p]; c = ((c << 2) | x) & Kmask; list[n].read = i; list[n].rpos = p++; list[n].code = c; n += 1; kptr[c & BMASK] += 1; } } } s += (q+1); } m = TA_block->reads[m].boff - Kmer*m; kptr[BMASK] += (data->fill = m-n); while (n < m) { list[n].code = 0xffffffffffffffffllu; list[n].read = 0xffffffff; list[n].rpos = 0xffffffff; n += 1; } } else for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++) { c = p = 0; for (x = 1; x < Kmer; x++) c = (c << 2) | s[p++]; while ((x = s[p]) != 4) { c = ((c << 2) | x) & Kmask; list[n].read = i; list[n].rpos = p++; list[n].code = c; n += 1; kptr[c & BMASK] += 1; } s += (p+1); } return (NULL); } static void *biased_tuple_thread(void *arg) { Tuple_Arg *data = (Tuple_Arg *) arg; int tnum = data->tnum; int64 *kptr = data->kptr; KmerPos *list = TA_list; int n, i, m; int x, a, k, p; uint64 d, c; char *s, *t; c = TA_block->nreads; i = (c * tnum) >> NSHIFT; n = TA_block->reads[i].boff; s = ((char *) (TA_block->bases)) + n; n -= Kmer*i; if (TA_track != NULL) { HITS_READ *reads = TA_block->reads; int64 *anno1 = ((int64 *) (TA_track->anno)) + 1; int *point = (int *) (TA_track->data); int64 j, b, f; int q; f = anno1[i-1]; for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++) { b = f; f = anno1[i]; t = s+1; q = reads[i].rlen; for (j = b; j <= f; j += 2) { if (j == b) p = 0; else p = point[j-1]; if (j == f) q = reads[i].rlen; else q = point[j]; if (p+Kmer <= q) { c = 0; a = 0; k = 1; while (p < q) { x = s[p]; a += LogBase[x]; c = ((c << 2) | x); while (a < LogNorm && k < Kmer) { if (++p >= q) break; k += 1; x = s[p]; a += LogBase[x]; c = ((c << 2) | x); } 
while (1) { int u = a-LogBase[(int) t[p-k]]; if (u < LogNorm) break; a = u; k -= 1; } if (a > LogThresh) { d = ((c << NormShift[k]) & Kmask); list[n].read = i; list[n].rpos = p; list[n].code = d; n += 1; kptr[d & BMASK] += 1; } p += 1; a -= LogBase[(int) s[p-k]]; } } } s += (q+1); } } else for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++) { t = s+1; c = 0; p = a = 0; k = 1; while ((x = s[p]) != 4) { a += LogBase[x]; c = ((c << 2) | x); while (a < LogNorm && k < Kmer) { if ((x = s[++p]) == 4) goto eoread2; k += 1; a += LogBase[x]; c = ((c << 2) | x); } while (1) { int u = a-LogBase[(int) t[p-k]]; if (u < LogNorm) break; a = u; k -= 1; } if (a > LogThresh) { d = ((c << NormShift[k]) & Kmask); list[n].read = i; list[n].rpos = p; list[n].code = d; n += 1; kptr[d & BMASK] += 1; } p += 1; a -= LogBase[(int) s[p-k]]; } eoread2: s += (p+1); } m = TA_block->reads[m].boff - Kmer*m; kptr[BMASK] += (data->fill = m-n); while (n < m) { list[n].code = 0xffffffffffffffffllu; list[n].read = 0xffffffff; list[n].rpos = 0xffffffff; n += 1; } return (NULL); } static KmerPos *FR_src; static KmerPos *FR_trg; typedef struct { int beg; int end; int kept; } Comp_Arg; static void *compsize_thread(void *arg) { Comp_Arg *data = (Comp_Arg *) arg; int end = data->end; KmerPos *src = FR_src; int n, i, c, p; uint64 h, g; i = data->beg; h = src[i].code; n = 0; while (i < end) { p = i++; while ((g = src[i].code) == h) i += 1; if ((c = (i-p)) < TooFrequent) n += c; h = g; } data->kept = n; return (NULL); } static void *compress_thread(void *arg) { Comp_Arg *data = (Comp_Arg *) arg; int end = data->end; KmerPos *src = FR_src; KmerPos *trg = FR_trg; int n, i, p; uint64 h, g; i = data->beg; h = src[i].code; n = data->kept; while (i < end) { p = i++; while ((g = src[i].code) == h) i += 1; if (i-p < TooFrequent) { while (p < i) trg[n++] = src[p++]; } h = g; } return (NULL); } void *Sort_Kmers(HITS_DB *block, int *len) { THREAD threads[NTHREADS]; Tuple_Arg parmt[NTHREADS]; Comp_Arg parmf[NTHREADS]; Lex_Arg 
parmx[NTHREADS]; int mersort[16]; KmerPos *src, *trg, *rez; int kmers, nreads; int i, j, x, z; uint64 h; for (i = 0; i < 16; i++) mersort[i] = 0; for (i = 0; i < Kshift; i += 8) mersort[i>>3] = 1; if (NormShift == NULL && BIASED) { double scale; NormShift = (int *) Malloc(sizeof(int)*(Kmer+1),"Allocating Sort_Kmers bias shift"); if (NormShift == NULL) exit (1); for (i = 0; i <= Kmer; i++) NormShift[i] = Kshift - 2*i; LogNorm = 10000 * Kmer; LogThresh = 10000 * (Kmer-MAX_BIAS); scale = -10000. / log(4.); for (i = 0; i < 4; i++) LogBase[i] = (int) ceil( scale * log(block->freq[i]) ); } nreads = block->nreads; kmers = block->reads[nreads].boff - Kmer * nreads; if (kmers <= 0) goto no_mers; if (( (Kshift-1)/BSHIFT + (TooFrequent < INT32_MAX) ) & 0x1) { trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+1),"Allocating Sort_Kmers vectors"); src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+1),"Allocating Sort_Kmers vectors"); } else { src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+1),"Allocating Sort_Kmers vectors"); trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+1),"Allocating Sort_Kmers vectors"); } if (src == NULL || trg == NULL) exit (1); if (VERBOSE) { printf("\n Kmer count = "); Print_Number((int64) kmers,0,stdout); printf("\n Using %.2fGb of space\n",(1. 
* kmers) / 33554432); fflush(stdout); } TA_block = block; TA_list = src; TA_track = block->tracks; for (i = 0; i < NTHREADS; i++) { parmt[i].tnum = i; parmt[i].kptr = parmx[i].tptr; for (j = 0; j < BPOWR; j++) parmt[i].kptr[j] = 0; } if (BIASED) for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,biased_tuple_thread,parmt+i); else for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,tuple_thread,parmt+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); x = 0; for (i = 0; i < NTHREADS; i++) { parmx[i].beg = x; j = (int) ((((int64) nreads) * (i+1)) >> NSHIFT); parmx[i].end = x = block->reads[j].boff - j*Kmer; } rez = (KmerPos *) lex_sort(mersort,(Double *) src,(Double *) trg,parmx); if (BIASED || TA_track != NULL) for (i = 0; i < NTHREADS; i++) kmers -= parmt[i].fill; rez[kmers].code = Kpowr; if (TooFrequent < INT32_MAX && kmers > 0) { parmf[0].beg = 0; for (i = 1; i < NTHREADS; i++) { x = (((int64) i)*kmers) >> NSHIFT; h = rez[x-1].code; while (rez[x].code == h) x += 1; parmf[i-1].end = parmf[i].beg = x; } parmf[NTHREADS-1].end = kmers; if (src == rez) { FR_src = src; FR_trg = rez = trg; } else { FR_src = trg; FR_trg = rez = src; } for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,compsize_thread,parmf+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); x = 0; for (i = 0; i < NTHREADS; i++) { z = parmf[i].kept; parmf[i].kept = x; x += z; } kmers = x; for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,compress_thread,parmf+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); rez[kmers].code = Kpowr; } if (src != rez) free(src); else free(trg); #ifdef TEST_KSORT { int i; printf("\nKMER SORT:\n"); for (i = 0; i < HOW_MANY && i < kmers; i++) { KmerPos *c = rez+i; printf(" %5d / %5d / %10lld\n",c->read,c->rpos,c->code); } fflush(stdout); } #endif if (VERBOSE) { if (TooFrequent < INT32_MAX) { printf(" Revised kmer count = "); Print_Number((int64) kmers,0,stdout); printf("\n"); } printf(" Index 
occupies %.2fGb\n",(1. * kmers) / 67108864); fflush(stdout); } if (kmers <= 0) { free(rez); goto no_mers; } if (MEM_LIMIT > 0 && kmers > (int64) (MEM_LIMIT/(4*sizeof(KmerPos)))) { fprintf(stderr,"Warning: Block size too big, index occupies more than 1/4 of"); if (MEM_LIMIT == MEM_PHYSICAL) fprintf(stderr," physical memory (%.1fGb)\n",(1.*MEM_LIMIT)/0x40000000ll); else fprintf(stderr," desired memory allocation (%.1fGb)\n",(1.*MEM_LIMIT)/0x40000000ll); fflush(stderr); } *len = kmers; return (rez); no_mers: *len = 0; return (NULL); } /******************************************************************************************* * * FILTER MATCH * ********************************************************************************************/ static int find_tuple(uint64 x, KmerPos *a, int n) { int l, r, m; // smallest k s.t. a[k].code >= x (or n if does not exist) l = 0; r = n; while (l < r) { m = ((l+r) >> 1); if (a[m].code < x) l = m+1; else r = m; } return (l); } // Determine what *will* be the size of the merged list and histogram of sizes for given cutoffs static KmerPos *MG_alist; static KmerPos *MG_blist; static SeedPair *MG_hits; static int MG_comp; static int MG_self; typedef struct { int abeg, aend; int bbeg, bend; int64 *kptr; int64 nhits; int limit; int64 hitgram[MAXGRAM]; } Merge_Arg; static void *count_thread(void *arg) { Merge_Arg *data = (Merge_Arg *) arg; KmerPos *asort = MG_alist; KmerPos *bsort = MG_blist; int64 *gram = data->hitgram; int64 nhits = 0; int aend = data->aend; int64 ct; int ia, ib; int jb, ja; uint64 ca, cb; uint64 da, db; int ar; ia = data->abeg; ca = asort[ia].code; ib = data->bbeg; cb = bsort[ib].code; if (MG_self) { while (1) { while (cb < ca) cb = bsort[++ib].code; while (cb > ca) ca = asort[++ia].code; if (cb == ca) { if (ia >= aend) break; ct = 0; jb = ib; db = cb; if (IDENTITY) do { ar = asort[ia].read; if (MG_comp) while (db == cb && bsort[ib].read <= ar) db = bsort[++ib].code; else { while (db == cb && bsort[ib].read < ar) db = 
// Counting pass of the k-mer merge (thread body, arg is a Merge_Arg *).
//   Walks this thread's slice of the code-sorted lists MG_alist and MG_blist in
//   tandem.  For every code that occurs in both lists it computes ct, the number
//   of (a,b) seed pairs that code will contribute, adds ct to a running total, and
//   if ct < MAXGRAM tallies it in data->hitgram[ct].  The total is left in
//   data->nhits.  Always returns NULL.
// NOTE(review): none of the scanning loops tests an index bound; they presumably
//   rely on a sentinel code terminating each list -- confirm against Sort_Kmers.

static void *count_thread(void *arg)
{ Merge_Arg  *data  = (Merge_Arg *) arg;
  KmerPos    *asort = MG_alist;
  KmerPos    *bsort = MG_blist;
  int64      *gram  = data->hitgram;
  int64       nhits = 0;
  int         aend  = data->aend;

  int64  ct;
  int    ia, ib;
  int    jb, ja;
  uint64 ca, cb;
  uint64 da, db;
  int    ar;

  ia = data->abeg;
  ca = asort[ia].code;
  ib = data->bbeg;
  cb = bsort[ib].code;
  if (MG_self)
    { // Self comparison: an a-entry is paired only with the b-entries of the same
      //   code that come before the cut-off established by the inner while loops.
      while (1)
        { // Advance whichever list is behind until the codes agree
          while (cb < ca)
            cb = bsort[++ib].code;
          while (cb > ca)
            ca = asort[++ia].code;
          if (cb == ca)
            { if (ia >= aend) break;   // Past the end of this thread's a-slice

              ct = 0;
              jb = ib;                 // First b-entry carrying this code
              db = cb;
              if (IDENTITY)
                do
                  { ar = asort[ia].read;
                    if (MG_comp)
                      // Complement pass: count b-entries with read <= ar
                      while (db == cb && bsort[ib].read <= ar)
                        db = bsort[++ib].code;
                    else
                      { // Forward pass: count b-entries with read < ar, then those
                        //   of the same read at a smaller position
                        while (db == cb && bsort[ib].read < ar)
                          db = bsort[++ib].code;
                        while (db == cb && bsort[ib].read == ar && bsort[ib].rpos < asort[ia].rpos)
                          db = bsort[++ib].code;
                      }
                    ct += (ib-jb);
                  }
                while ((da = asort[++ia].code) == ca);
              else
                do
                  { ar = asort[ia].read;
                    // Count only b-entries from strictly smaller reads
                    while (db == cb && bsort[ib].read < ar)
                      db = bsort[++ib].code;
                    ct += (ib-jb);
                  }
                while ((da = asort[++ia].code) == ca);
              // Skip whatever remains of this code's run in b
              while (db == cb)
                db = bsort[++ib].code;

              nhits += ct;
              ca = da;
              cb = db;

              if (ct < MAXGRAM)
                gram[ct] += 1;
            }
        }
    }
  else
    { // Two distinct blocks: every a-entry pairs with every b-entry of the same code
      while (1)
        { while (cb < ca)
            cb = bsort[++ib].code;
          while (cb > ca)
            ca = asort[++ia].code;
          if (cb == ca)
            { if (ia >= aend) break;   // Past the end of this thread's a-slice

              // Measure the run of equal codes in each list
              ja = ia++;
              while ((da = asort[ia].code) == ca)
                ia += 1;
              jb = ib++;
              while ((db = bsort[ib].code) == cb)
                ib += 1;

              // Pair count is the product of the two run lengths (formed in 64 bits)
              ct  = (ia-ja);
              ct *= (ib-jb);

              nhits += ct;
              ca = da;
              cb = db;

              if (ct < MAXGRAM)
                gram[ct] += 1;
            }
        }
    }

  data->nhits = nhits;

  return (NULL);
}
// Produce the merged list now that the list has been allocated and
//   the appropriate cutoff determined.
//
//   Thread body, arg is a Merge_Arg *.  Repeats the tandem walk of count_thread
//   over the same slice of MG_alist/MG_blist, but this time writes a SeedPair into
//   MG_hits for every (a,b) pair of a shared code whose pair count ct is below
//   data->limit.  Writing starts at index data->nhits, which the caller sets to
//   this thread's starting offset.  For each a-entry that yields pairs,
//   kptr[apos & BMASK] is increased by the number of pairs written for it.
//   Always returns NULL.
// NOTE(review): kptr aliases a Lex_Arg tptr array in the caller, so these look like
//   first-pass bucket counts for the subsequent lex_sort -- confirm.
// NOTE(review): as in count_thread, the scans have no index bound and presumably
//   depend on a sentinel code at the end of each list.

static void *merge_thread(void *arg)
{ Merge_Arg  *data  = (Merge_Arg *) arg;
  int64      *kptr  = data->kptr;
  KmerPos    *asort = MG_alist;
  KmerPos    *bsort = MG_blist;
  SeedPair   *hits  = MG_hits;
  int64       nhits = data->nhits;
  int         aend  = data->aend;
  int         limit = data->limit;

  int64  ct;
  int    ia, ib;
  int    jb, ja;
  uint64 ca, cb;
  uint64 da, db;
  int    ar, ap;
  int    a, b;

  ia = data->abeg;
  ca = asort[ia].code;
  ib = data->bbeg;
  cb = bsort[ib].code;
  if (MG_self)
    { while (1)
        { // Advance whichever list is behind until the codes agree
          while (cb < ca)
            cb = bsort[++ib].code;
          while (cb > ca)
            ca = asort[++ia].code;
          if (cb == ca)
            { if (ia >= aend) break;   // Past the end of this thread's a-slice

              // First sweep: recompute ct for this code exactly as count_thread does
              ct = 0;
              ja = ia;
              jb = ib;
              db = cb;
              if (IDENTITY)
                do
                  { ar = asort[ia].read;
                    ap = asort[ia].rpos;
                    if (MG_comp)
                      while (db == cb && bsort[ib].read <= ar)
                        db = bsort[++ib].code;
                    else
                      { while (db == cb && bsort[ib].read < ar)
                          db = bsort[++ib].code;
                        while (db == cb && bsort[ib].read == ar && bsort[ib].rpos < ap)
                          db = bsort[++ib].code;
                      }
                    ct += (ib-jb);
                  }
                while ((da = asort[++ia].code) == ca);
              else
                do
                  { ar = asort[ia].read;
                    while (db == cb && bsort[ib].read < ar)
                      db = bsort[++ib].code;
                    ct += (ib-jb);
                  }
                while ((da = asort[++ia].code) == ca);
              while (db == cb)
                db = bsort[++ib].code;

              if (ct < limit)
                { // Second sweep: rewind b to the start of the run and emit the pairs
                  ib = jb;
                  db = cb;
                  if (IDENTITY)
                    for (a = ja; a < ia; a++)
                      { ap = asort[a].rpos;
                        ar = asort[a].read;
                        if (MG_comp)
                          while (db == cb && bsort[ib].read <= ar)
                            db = bsort[++ib].code;
                        else
                          { while (db == cb && bsort[ib].read < ar)
                              db = bsort[++ib].code;
                            while (db == cb && bsort[ib].read == ar && bsort[ib].rpos < ap)
                              db = bsort[++ib].code;
                          }
                        if ((ct = ib-jb) > 0)
                          { kptr[ap & BMASK] += ct;
                            for (b = jb; b < ib; b++)
                              { hits[nhits].bread = bsort[b].read;
                                hits[nhits].aread = ar;
                                hits[nhits].apos  = ap;
                                hits[nhits].diag  = ap - bsort[b].rpos;
                                nhits += 1;
                              }
                          }
                      }
                  else
                    for (a = ja; a < ia; a++)
                      { ap = asort[a].rpos;
                        ar = asort[a].read;
                        while (db == cb && bsort[ib].read < ar)
                          db = bsort[++ib].code;
                        if ((ct = ib-jb) > 0)
                          { kptr[ap & BMASK] += ct;
                            for (b = jb; b < ib; b++)
                              { hits[nhits].bread = bsort[b].read;
                                hits[nhits].aread = ar;
                                hits[nhits].apos  = ap;
                                hits[nhits].diag  = ap - bsort[b].rpos;
                                nhits += 1;
                              }
                          }
                      }
                  // Leave ib/db just past this code's run in b again
                  while (db == cb)
                    db = bsort[++ib].code;
                }
              ca = da;
              cb = db;
            }
        }
    }
  else
    { while (1)
        { while (cb < ca)
            cb = bsort[++ib].code;
          while (cb > ca)
            ca = asort[++ia].code;
          if (cb == ca)
            { if (ia >= aend) break;   // Past the end of this thread's a-slice

              // Delimit the run of equal codes in each list
              ja = ia++;
              while ((da = asort[ia].code) == ca)
                ia += 1;
              jb = ib++;
              while ((db = bsort[ib].code) == cb)
                ib += 1;

              // Emit the full cross product unless it reaches the cap
              ct = ib-jb;
              if ((ia-ja)*ct < limit)
                { for (a = ja; a < ia; a++)
                    { ap = asort[a].rpos;
                      kptr[ap & BMASK] += ct;
                      for (b = jb; b < ib; b++)
                        { hits[nhits].bread = bsort[b].read;
                          hits[nhits].aread = asort[a].read;
                          hits[nhits].apos  = ap;
                          hits[nhits].diag  = ap - bsort[b].rpos;
                          nhits += 1;
                        }
                    }
                }
              ca = da;
              cb = db;
            }
        }
    }

  return (NULL);
}
static HITS_DB *MR_ablock; static HITS_DB *MR_bblock; static SeedPair *MR_hits; static int MR_two; static Align_Spec *MR_spec; static int MR_tspace; typedef struct { uint64 max; uint64 top; uint16 *trace; } Trace_Buffer; static int Entwine(Path *jpath, Path *kpath, Trace_Buffer *tbuf, int *where) { int ac, b2, y2, ae; int i, j, k; int num, den, min; #ifdef SEE_ENTWINE int strt = 1; int iflare, oflare; #endif uint16 *ktrace = tbuf->trace + (uint64) (kpath->trace); uint16 *jtrace = tbuf->trace + (uint64) (jpath->trace); min = 10000; num = 0; den = 0; #ifdef SEE_ENTWINE printf("\n"); #endif y2 = jpath->bbpos; j = jpath->abpos/MR_tspace; b2 = kpath->bbpos; k = kpath->abpos/MR_tspace; if (j < k) { ac = k*MR_tspace; j = 1 + 2*(k-j); k = 1; for (i = 1; i < j; i += 2) y2 += jtrace[i]; } else { ac = j*MR_tspace; k = 1 + 2*(j-k); j = 1; for (i = 1; i < k; i += 2) b2 += ktrace[i]; } ae = jpath->aepos; if (ae > kpath->aepos) ae = kpath->aepos; while (1) { ac += MR_tspace; if (ac >= ae) break; y2 += jtrace[j]; b2 += ktrace[k]; j += 2; k += 2; #ifdef SEE_ENTWINE printf(" @ %5d : %5d %5d = %4d\n",ac,y2,b2,abs(b2-y2)); #endif i = abs(y2-b2); if (i <= min) { min = i; if (i == 0) *where = ac; } num += i; den += 1; #ifdef SEE_ENTWINE if (strt) { strt = 0; iflare = i; } oflare = i; #endif } #ifdef SEE_ENTWINE if (den == 0) printf("Nothing\n"); else printf("MINIM = %d AVERAGE = %d IFLARE = %d OFLARE = %d\n",min,num/den,iflare,oflare); #endif if (den == 0) return (-1); else return (min); } // Produce the concatentation of path1 and path2 where they are known to meet at // the trace point with coordinate ap. 
Place this result in a big growing buffer, // that gets reset when fusion is called with path1 = NULL static void Fusion(Path *path1, int ap, Path *path2, Trace_Buffer *tbuf) { int k, k1, k2; int len, diff; uint16 *trace; k1 = 2 * ((ap/MR_tspace) - (path1->abpos/MR_tspace)); k2 = 2 * ((ap/MR_tspace) - (path2->abpos/MR_tspace)); len = k1+(path2->tlen-k2); if (tbuf->top + len >= tbuf->max) { tbuf->max = 1.2*(tbuf->top+len) + 1000; tbuf->trace = (uint16 *) Realloc(tbuf->trace,sizeof(uint16)*tbuf->max,"Allocating paths"); if (tbuf->trace == NULL) exit (1); } trace = tbuf->trace + tbuf->top; tbuf->top += len; diff = 0; len = 0; if (k1 > 0) { uint16 *t = tbuf->trace + (uint64) (path1->trace); for (k = 0; k < k1; k += 2) { trace[len++] = t[k]; trace[len++] = t[k+1]; diff += t[k]; } } if (k2 < path2->tlen) { uint16 *t = tbuf->trace + (uint64) (path2->trace); for (k = k2; k < path2->tlen; k += 2) { trace[len++] = t[k]; trace[len++] = t[k+1]; diff += t[k]; } } path1->aepos = path2->aepos; path1->bepos = path2->bepos; path1->diffs = diff; path1->trace = (void *) (trace - tbuf->trace); path1->tlen = len; } static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buffer *tbuf) { Path *jpath, *kpath; int j, k, no; int dist, awhen, bwhen; int hasB; #ifdef TEST_CONTAIN for (j = 0; j < novls; j++) printf(" %3d: [%5d,%5d] x [%5d,%5d]\n",j,amatch[j].abpos,amatch[j].aepos, amatch[j].bbpos,amatch[j].bepos); #endif hasB = (bmatch != NULL); no = 0; for (j = 1; j < novls; j++) { jpath = amatch+j; for (k = no; k >= 0; k--) { kpath = amatch+k; if (jpath->abpos < kpath->abpos) { if (kpath->abpos <= jpath->aepos && kpath->bbpos <= jpath->bepos) { dist = Entwine(jpath,kpath,tbuf,&awhen); if (dist == 0) { if (kpath->aepos > jpath->aepos) { if (hasB) { if (MG_comp) { dist = Entwine(bmatch+k,bmatch+j,tbuf,&bwhen); if (dist != 0) continue; Fusion(jpath,awhen,kpath,tbuf); amatch[k] = *jpath; Fusion(bmatch+k,bwhen,bmatch+j,tbuf); #ifdef TEST_CONTAIN printf(" Really 1"); #endif } 
// Remove or fuse redundant local alignments found for one read pair.
//   amatch[0..novls-1] are the A-side paths; bmatch, if not NULL, holds the
//   corresponding B-side paths at the same indices.  Trace fields are offsets into
//   tbuf.  Paths are processed in order: path j is compared against the paths kept
//   so far (indices no down to 0).  When j and a kept path k overlap in both
//   coordinates and Entwine reports that they coincide at a trace point (distance
//   0), the pair is reduced to a single entry at index k -- either by keeping the
//   one that extends further or by fusing the two with Fusion.  If a B-side list is
//   present the B-side paths must also entwine at distance 0, otherwise the scan
//   simply moves on to the next kept path.  A path that matches no kept path is
//   appended to the kept list.  The arrays are compacted in place and the number
//   of paths kept is returned.
// NOTE(review): the loop starts at j = 1 with entry 0 already counted as kept, so
//   the result is 1 even for novls == 0; the visible caller only invokes this with
//   novls > 1.

static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buffer *tbuf)
{ Path *jpath, *kpath;
  int   j, k, no;
  int   dist, awhen, bwhen;
  int   hasB;

#ifdef TEST_CONTAIN
  for (j = 0; j < novls; j++)
    printf(" %3d: [%5d,%5d] x [%5d,%5d]\n",j,amatch[j].abpos,amatch[j].aepos,
                                              amatch[j].bbpos,amatch[j].bepos);
#endif

  hasB = (bmatch != NULL);

  no = 0;                                 // Index of the last kept path
  for (j = 1; j < novls; j++)
    { jpath = amatch+j;
      for (k = no; k >= 0; k--)
        { kpath = amatch+k;

          if (jpath->abpos < kpath->abpos)

            { // j starts first: do the two overlap in both A and B?
              if (kpath->abpos <= jpath->aepos && kpath->bbpos <= jpath->bepos)
                { dist = Entwine(jpath,kpath,tbuf,&awhen);
                  if (dist == 0)
                    { if (kpath->aepos > jpath->aepos)
                        { // k extends past j: fuse j followed by k
                          if (hasB)
                            { if (MG_comp)
                                { // Argument order of the B-side paths is swapped
                                  //   relative to the non-complement case below
                                  dist = Entwine(bmatch+k,bmatch+j,tbuf,&bwhen);
                                  if (dist != 0)
                                    continue;
                                  Fusion(jpath,awhen,kpath,tbuf);
                                  amatch[k] = *jpath;
                                  Fusion(bmatch+k,bwhen,bmatch+j,tbuf);
#ifdef TEST_CONTAIN
                                  printf(" Really 1");
#endif
                                }
                              else
                                { dist = Entwine(bmatch+j,bmatch+k,tbuf,&bwhen);
                                  if (dist != 0)
                                    continue;
                                  Fusion(jpath,awhen,kpath,tbuf);
                                  amatch[k] = *jpath;
                                  Fusion(bmatch+j,bwhen,bmatch+k,tbuf);
                                  bmatch[k] = bmatch[j];
#ifdef TEST_CONTAIN
                                  printf(" Really 2");
#endif
                                }
                            }
                          else
                            { Fusion(jpath,awhen,kpath,tbuf);
                              amatch[k] = *jpath;
#ifdef TEST_CONTAIN
                              printf(" Really 3");
#endif
                            }
                        }
                      else
                        { // j starts first and ends no earlier: j replaces k
                          amatch[k] = *jpath;
                          if (hasB)
                            bmatch[k] = bmatch[j];
                        }
#ifdef TEST_CONTAIN
                      printf(" Fuse! A %d %d\n",j,k);
#endif
                      break;
                    }
                }
            }

          else // kpath->abpos <= jpath->abpos

            { if (jpath->abpos <= kpath->aepos && jpath->bbpos <= kpath->bepos)
                { dist = Entwine(kpath,jpath,tbuf,&awhen);
                  if (dist == 0)
                    { if (kpath->abpos == jpath->abpos)
                        { // Same start: keep whichever reaches further
                          if (kpath->aepos < jpath->aepos)
                            { amatch[k] = *jpath;
                              if (hasB)
                                bmatch[k] = bmatch[j];
                            }
                        }
                      else if (jpath->aepos > kpath->aepos)
                        { // j extends past k: fuse k followed by j, result stays at k
                          if (hasB)
                            { if (MG_comp)
                                { dist = Entwine(bmatch+j,bmatch+k,tbuf,&bwhen);
                                  if (dist != 0)
                                    continue;
                                  Fusion(kpath,awhen,jpath,tbuf);
                                  Fusion(bmatch+j,bwhen,bmatch+k,tbuf);
                                  bmatch[k] = bmatch[j];
#ifdef TEST_CONTAIN
                                  printf(" Really 4");
#endif
                                }
                              else
                                { dist = Entwine(bmatch+k,bmatch+j,tbuf,&bwhen);
                                  if (dist != 0)
                                    continue;
                                  Fusion(kpath,awhen,jpath,tbuf);
                                  Fusion(bmatch+k,bwhen,bmatch+j,tbuf);
#ifdef TEST_CONTAIN
                                  printf(" Really 5");
#endif
                                }
                            }
                          else
                            { Fusion(kpath,awhen,jpath,tbuf);
#ifdef TEST_CONTAIN
                              printf(" Really 6");
#endif
                            }
                        }
                      // Otherwise j lies within k and is simply dropped
#ifdef TEST_CONTAIN
                      printf(" Fuse! B %d %d\n",j,k);
#endif
                      break;
                    }
                }
            }
        }
      if (k < 0)
        { // No kept path absorbed j: append it to the kept list
          no += 1;
          amatch[no] = *jpath;
          if (hasB)
            bmatch[no] = bmatch[j];
        }
    }

  novls = no+1;

#ifdef TEST_CONTAIN
  for (j = 0; j < novls; j++)
    printf(" %3d: [%5d,%5d] x [%5d,%5d]\n",j,amatch[j].abpos,amatch[j].aepos,
                                              amatch[j].bbpos,amatch[j].bepos);
#endif

  return (novls);
}
B %d %d\n",j,k); #endif break; } } } } if (k < 0) { no += 1; amatch[no] = *jpath; if (hasB) bmatch[no] = bmatch[j]; } } novls = no+1; #ifdef TEST_CONTAIN for (j = 0; j < novls; j++) printf(" %3d: [%5d,%5d] x [%5d,%5d]\n",j,amatch[j].abpos,amatch[j].aepos, amatch[j].bbpos,amatch[j].bepos); #endif return (novls); } void Diagonal_Span(Path *path, int *mind, int *maxd) { uint16 *points; int i, tlen; int dd, low, hgh; points = path->trace; tlen = path->tlen; dd = path->abpos - path->bbpos; low = hgh = dd; dd = path->aepos - path->bepos; if (dd < low) low = dd; else if (dd > hgh) hgh = dd; dd = (path->abpos/MR_tspace)*MR_tspace - path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { dd += MR_tspace - points[i]; if (dd < low) low = dd; else if (dd > hgh) hgh = dd; } *mind = (low >> Binshift)-1; *maxd = (hgh >> Binshift)+1; } typedef struct { int64 beg, end; int *score; int *lastp; int *lasta; Work_Data *work; FILE *ofile1; FILE *ofile2; int64 nfilt; int64 ncheck; } Report_Arg; static void *report_thread(void *arg) { Report_Arg *data = (Report_Arg *) arg; SeedPair *hits = MR_hits; Double *hitd = (Double *) MR_hits; char *aseq = (char *) (MR_ablock->bases); char *bseq = (char *) (MR_bblock->bases); HITS_READ *aread = MR_ablock->reads; HITS_READ *bread = MR_bblock->reads; int *score = data->score; int *scorp = data->score + 1; int *scorm = data->score - 1; int *lastp = data->lastp; int *lasta = data->lasta; Work_Data *work = data->work; FILE *ofile1 = data->ofile1; FILE *ofile2 = data->ofile2; int afirst = MR_ablock->tfirst; int bfirst = MR_bblock->tfirst; int maxdiag = ( MR_ablock->maxlen >> Binshift); int mindiag = (-MR_bblock->maxlen >> Binshift); Overlap _ovla, *ovla = &_ovla; Overlap _ovlb, *ovlb = &_ovlb; Alignment _align, *align = &_align; Path *apath = &(ovla->path); Path *bpath; int64 nfilt = 0; int64 ahits = 0; int64 bhits = 0; int small, tbytes; int AOmax, BOmax; int novla, novlb; Path *amatch, *bmatch; Trace_Buffer _tbuf, *tbuf = &_tbuf; Double *hitc; int 
// Report thread (arg is a Report_Arg *): scan the slice [data->beg,data->end) of the
//   sorted seed-hit list MR_hits and, for each read pair with enough seed hits,
//   look for diagonal bins whose accumulated score reaches Hitmin, run
//   Local_Alignment from such seeds, collect the resulting paths, prune them with
//   Handle_Redundancies, and write them as overlaps to data->ofile1 (and ofile2).
//   Each output file begins with a placeholder overlap count and MR_tspace; the
//   count is rewritten with the final value before the file is closed.
//   data->nfilt receives the number of alignments attempted and data->ncheck the
//   number of overlaps written.  Always returns NULL; exits on allocation failure.
// NOTE(review): results of fwrite/rewind/fclose are not checked.

static void *report_thread(void *arg)
{ Report_Arg  *data   = (Report_Arg *) arg;
  SeedPair    *hits   = MR_hits;
  Double      *hitd   = (Double *) MR_hits;    // Same array, viewed as Double records
  char        *aseq   = (char *) (MR_ablock->bases);
  char        *bseq   = (char *) (MR_bblock->bases);
  HITS_READ   *aread  = MR_ablock->reads;
  HITS_READ   *bread  = MR_bblock->reads;
  int         *score  = data->score;
  int         *scorp  = data->score + 1;       // score[] of the next bin up
  int         *scorm  = data->score - 1;       // score[] of the next bin down
  int         *lastp  = data->lastp;
  int         *lasta  = data->lasta;
  Work_Data   *work   = data->work;
  FILE        *ofile1 = data->ofile1;
  FILE        *ofile2 = data->ofile2;
  int          afirst = MR_ablock->tfirst;
  int          bfirst = MR_bblock->tfirst;
  int          maxdiag = ( MR_ablock->maxlen >> Binshift);
  int          mindiag = (-MR_bblock->maxlen >> Binshift);

  Overlap     _ovla, *ovla = &_ovla;
  Overlap     _ovlb, *ovlb = &_ovlb;
  Alignment   _align, *align = &_align;
  Path        *apath = &(ovla->path);
  Path        *bpath;
  int64        nfilt = 0;
  int64        ahits = 0;
  int64        bhits = 0;
  int          small, tbytes;

  int    AOmax, BOmax;
  int    novla, novlb;
  Path  *amatch, *bmatch;

  Trace_Buffer _tbuf, *tbuf = &_tbuf;

  Double *hitc;
  int     minhit;
  uint64  cpair, npair;
  int64   nidx, eidx;

  //  In ovl and align roles of A and B are reversed, as the B sequence must be the
  //    complemented sequence !!

  align->flags = ovla->flags = ovlb->flags = MG_comp;
  align->path  = apath;

  // Trace values are written as bytes when the trace spacing is small enough
  if (MR_tspace <= TRACE_XOVR)
    { small  = 1;
      tbytes = sizeof(uint8);
    }
  else
    { small  = 0;
      tbytes = sizeof(uint16);
    }

  AOmax = BOmax = MATCH_CHUNK;
  amatch = Malloc(sizeof(Path)*AOmax,"Allocating match vector");
  bmatch = Malloc(sizeof(Path)*BOmax,"Allocating match vector");

  tbuf->max   = 2*TRACE_CHUNK;
  tbuf->trace = Malloc(sizeof(short)*tbuf->max,"Allocating trace vector");

  if (amatch == NULL || bmatch == NULL || tbuf->trace == NULL)
    exit (1);

  // File headers: the counts written here are placeholders, patched at the end
  fwrite(&ahits,sizeof(int64),1,ofile1);
  fwrite(&MR_tspace,sizeof(int),1,ofile1);
  if (MR_two)
    { fwrite(&bhits,sizeof(int64),1,ofile2);
      fwrite(&MR_tspace,sizeof(int),1,ofile2);
    }

  // A read pair is examined only if it has at least minhit seed hits; hitc[i] is
  //   the record minhit-1 places after hitd[i]
  minhit = (Hitmin-1)/Kmer + 1;
  hitc   = hitd + (minhit-1);
  eidx   = data->end - minhit;
  nidx   = data->beg;
  for (cpair = hitd[nidx].p2; nidx < eidx; cpair = npair)
    if (hitc[nidx].p2 != cpair)
      { // Fewer than minhit hits for this pair: skip to the next pair
        nidx += 1;
        while ((npair = hitd[nidx].p2) == cpair)
          nidx += 1;
      }
    else
      { int   ar, br;
        int   alen, blen;
        int   doA, doB;
        int   setaln, amark, amark2;
        int   apos, bpos, diag;
        int64 lidx, sidx;
        int64 f, h2;

        ar = hits[nidx].aread;
        br = hits[nidx].bread;
        alen = aread[ar].rlen;
        blen = bread[br].rlen;
        if (alen < HGAP_MIN && blen < HGAP_MIN)
          { // Neither read is long enough to be reported: skip the pair
            nidx += 1;
            while ((npair = hitd[nidx].p2) == cpair)
              nidx += 1;
            continue;
          }

#ifdef TEST_GATHER
        printf("%5d vs %5d : %5d x %5d\n",br+bfirst,ar+afirst,blen,alen);
#endif
        setaln = 1;
        doA = doB = 0;
        amark2 = 0;
        novla  = novlb = 0;
        tbuf->top = 0;
        // Process the pair's hits in overlapping panels of A-positions; h2 marks
        //   where the next panel starts
        for (sidx = nidx; hitd[nidx].p2 == cpair; nidx = h2)
          { amark  = amark2 + PANEL_SIZE;
            amark2 = amark - PANEL_OVERLAP;

            h2 = lidx = nidx;
            do
              { apos  = hits[nidx].apos;
                npair = hitd[++nidx].p2;
                if (apos <= amark2)
                  h2 = nidx;
              }
            while (npair == cpair && apos <= amark);

            if (nidx-lidx < minhit) continue;

            // Accumulate, per diagonal bin, the bases covered by seed hits
            for (f = lidx; f < nidx; f++)
              { apos = hits[f].apos;
                diag = hits[f].diag >> Binshift;
                if (apos - lastp[diag] >= Kmer)
                  score[diag] += Kmer;
                else
                  score[diag] += apos - lastp[diag];
                lastp[diag] = apos;
              }

#ifdef TEST_GATHER
            printf(" %6lld upto %6d",nidx-lidx,amark);
#endif

            // Launch an alignment from each hit lying beyond the last alignment
            //   recorded for its bin, provided the bin plus one neighbor reaches Hitmin
            for (f = lidx; f < nidx; f++)
              { apos = hits[f].apos;
                diag = hits[f].diag;
                bpos = apos - diag;
                diag = diag >> Binshift;
                if (apos > lasta[diag] &&
                     (score[diag] + scorp[diag] >= Hitmin || score[diag] + scorm[diag] >= Hitmin))
                  { if (setaln)
                      { // First alignment for this pair: set up the shared records
                        setaln = 0;
                        align->aseq = aseq + aread[ar].boff;
                        align->bseq = bseq + bread[br].boff;
                        align->alen = alen;
                        align->blen = blen;
                        ovlb->bread = ovla->aread = ar + afirst;
                        ovlb->aread = ovla->bread = br + bfirst;
                        doA = (alen >= HGAP_MIN);
                        doB = (SYMMETRIC && blen >= HGAP_MIN &&
                                   (ar != br || !MG_self || !MG_comp));
                      }
#ifdef TEST_GATHER
                    else
                      printf("\n ");

                    if (scorm[diag] > scorp[diag])
                      printf(" %5d.. x %5d.. %5d (%3d)",
                             bpos,apos,apos-bpos,score[diag]+scorm[diag]);
                    else
                      printf(" %5d.. x %5d.. %5d (%3d)",
                             bpos,apos,apos-bpos,score[diag]+scorp[diag]);
#endif
                    nfilt += 1;

                    bpath = Local_Alignment(align,work,MR_spec,apos-bpos,apos-bpos,apos+bpos,-1,-1);

                    // Record the alignment's A end-point on every bin it spans
                    { int low, hgh, ae;

                      Diagonal_Span(apath,&low,&hgh);
                      if (diag < low)
                        low = diag;
                      else if (diag > hgh)
                        hgh = diag;
                      ae = apath->aepos;
                      for (diag = low; diag <= hgh; diag++)
                        if (ae > lasta[diag])
                          lasta[diag] = ae;
#ifdef TEST_GATHER
                      printf(" %d - %d @ %d",low,hgh,apath->aepos);
#endif
                    }

                    if ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos) >= MINOVER)
                      { if (doA)
                          { // Save the A-side path; its trace is copied into tbuf and
                            //   referenced by offset
                            if (novla >= AOmax)
                              { AOmax = 1.2*novla + MATCH_CHUNK;
                                amatch = Realloc(amatch,sizeof(Path)*AOmax,
                                                 "Reallocating match vector");
                                if (amatch == NULL)
                                  exit (1);
                              }
                            if (tbuf->top + apath->tlen > tbuf->max)
                              { tbuf->max = 1.2*(tbuf->top+apath->tlen) + TRACE_CHUNK;
                                tbuf->trace = Realloc(tbuf->trace,sizeof(short)*tbuf->max,
                                                      "Reallocating trace vector");
                                if (tbuf->trace == NULL)
                                  exit (1);
                              }
                            amatch[novla] = *apath;
                            amatch[novla].trace = (void *) (tbuf->top);
                            memcpy(tbuf->trace+tbuf->top,apath->trace,sizeof(short)*apath->tlen);
                            novla += 1;
                            tbuf->top += apath->tlen;
                          }
                        if (doB)
                          { // Likewise for the B-side path returned by Local_Alignment
                            if (novlb >= BOmax)
                              { BOmax = 1.2*novlb + MATCH_CHUNK;
                                bmatch = Realloc(bmatch,sizeof(Path)*BOmax,
                                                        "Reallocating match vector");
                                if (bmatch == NULL)
                                  exit (1);
                              }
                            if (tbuf->top + bpath->tlen > tbuf->max)
                              { tbuf->max = 1.2*(tbuf->top+bpath->tlen) + TRACE_CHUNK;
                                tbuf->trace = Realloc(tbuf->trace,sizeof(short)*tbuf->max,
                                                      "Reallocating trace vector");
                                if (tbuf->trace == NULL)
                                  exit (1);
                              }
                            bmatch[novlb] = *bpath;
                            bmatch[novlb].trace = (void *) (tbuf->top);
                            memcpy(tbuf->trace+tbuf->top,bpath->trace,sizeof(short)*bpath->tlen);
                            novlb += 1;
                            tbuf->top += bpath->tlen;
                          }

#ifdef TEST_GATHER
                        printf(" [%5d,%5d] x [%5d,%5d] = %4d",
                               apath->abpos,apath->aepos,apath->bbpos,apath->bepos,apath->diffs);
#endif
#ifdef SHOW_OVERLAP
                        printf("\n\n %d(%d) vs %d(%d)\n\n",
                               ovla->aread,ovla->alen,ovla->bread,ovla->blen);
                        Print_ACartoon(stdout,align,ALIGN_INDENT);
#ifdef SHOW_ALIGNMENT
                        Compute_Trace_ALL(align,work);
                        printf("\n Diff = %d\n",align->path->diffs);
                        Print_Alignment(stdout,align,work,
                                        ALIGN_INDENT,ALIGN_WIDTH,ALIGN_BORDER,0,5);
#endif
#endif // SHOW_OVERLAP

                      }
#ifdef TEST_GATHER
                    else
                      printf(" No alignment %d",
                              ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos))/2);
#endif
                  }
              }

            // Clear the score/lastp bins this panel touched
            for (f = lidx; f < nidx; f++)
              { diag = hits[f].diag >> Binshift;
                score[diag] = lastp[diag] = 0;
              }
#ifdef TEST_GATHER
            printf("\n");
#endif
          }

        // Clear lasta: from each hit's bin, zero outward in both directions until
        //   an already-zero entry or the diagonal limits are reached
        for (f = sidx; f < nidx; f++)
          { int d;

            diag = hits[f].diag >> Binshift;
            for (d = diag; d <= maxdiag; d++)
              if (lasta[d] == 0)
                break;
              else
                lasta[d] = 0;
            for (d = diag-1; d >= mindiag; d--)
              if (lasta[d] == 0)
                break;
              else
                lasta[d] = 0;
          }

        // Prune redundant paths, then write out what is left
        { int i;

#ifdef TEST_CONTAIN
          if (novla > 1 || novlb > 1)
            printf("\n%5d vs %5d:\n",ar,br);
#endif

          if (novla > 1)
            { if (novlb > 1)
                novla = novlb = Handle_Redundancies(amatch,novla,bmatch,tbuf);
              else
                novla = Handle_Redundancies(amatch,novla,NULL,tbuf);
            }
          else if (novlb > 1)
            novlb = Handle_Redundancies(bmatch,novlb,NULL,tbuf);

          for (i = 0; i < novla; i++)
            { ovla->path = amatch[i];
              // Turn the stored buffer offset back into a pointer
              ovla->path.trace = tbuf->trace + (uint64) (ovla->path.trace);
              if (small)
                Compress_TraceTo8(ovla);
              Write_Overlap(ofile1,ovla,tbytes);
            }
          for (i = 0; i < novlb; i++)
            { ovlb->path = bmatch[i];
              ovlb->path.trace = tbuf->trace + (uint64) (ovlb->path.trace);
              if (small)
                Compress_TraceTo8(ovlb);
              Write_Overlap(ofile2,ovlb,tbytes);
            }
          ahits += novla;
          bhits += novlb;
        }
      }

  free(tbuf->trace);
  free(bmatch);
  free(amatch);

  data->nfilt  = nfilt;
  data->ncheck = ahits + bhits;

  // Patch the real overlap counts into the file headers.  When there is no separate
  //   second file, the B-side count is folded into the first file's count.
  if (MR_two)
    { rewind(ofile2);
      fwrite(&bhits,sizeof(int64),1,ofile2);
      fclose(ofile2);
    }
  else
    ahits += bhits;

  rewind(ofile1);
  fwrite(&ahits,sizeof(int64),1,ofile1);
  fclose(ofile1);

  return (NULL);
}
if (small) Compress_TraceTo8(ovla); Write_Overlap(ofile1,ovla,tbytes); } for (i = 0; i < novlb; i++) { ovlb->path = bmatch[i]; ovlb->path.trace = tbuf->trace + (uint64) (ovlb->path.trace); if (small) Compress_TraceTo8(ovlb); Write_Overlap(ofile2,ovlb,tbytes); } ahits += novla; bhits += novlb; } } free(tbuf->trace); free(bmatch); free(amatch); data->nfilt = nfilt; data->ncheck = ahits + bhits; if (MR_two) { rewind(ofile2); fwrite(&bhits,sizeof(int64),1,ofile2); fclose(ofile2); } else ahits += bhits; rewind(ofile1); fwrite(&ahits,sizeof(int64),1,ofile1); fclose(ofile1); return (NULL); } /******************************************************************************************* * * THE ALGORITHM * ********************************************************************************************/ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock, void *vasort, int alen, void *vbsort, int blen, int comp, Align_Spec *aspec) { THREAD threads[NTHREADS]; Merge_Arg parmm[NTHREADS]; Lex_Arg parmx[NTHREADS]; Report_Arg parmr[NTHREADS]; int pairsort[16]; SeedPair *khit, *hhit; SeedPair *work1, *work2; int64 nhits; int64 nfilt, ncheck; KmerPos *asort, *bsort; int64 atot, btot; asort = (KmerPos *) vasort; bsort = (KmerPos *) vbsort; atot = ablock->totlen; btot = bblock->totlen; { int64 powr; int i, nbyte; for (i = 0; i < 16; i++) pairsort[i] = 0; powr = 1; for (nbyte = 0; powr < ablock->maxlen; nbyte += 1) powr <<= 8; for (i = 4; i < 4+nbyte; i++) pairsort[i] = 1; powr = 1; for (nbyte = 0; powr < ablock->nreads; nbyte += 1) powr <<= 8; for (i = 8; i < 8+nbyte; i++) pairsort[i] = 1; powr = 1; for (nbyte = 0; powr < bblock->nreads; nbyte += 1) powr <<= 8; for (i = 12; i < 12+nbyte; i++) pairsort[i] = 1; } nfilt = ncheck = nhits = 0; if (VERBOSE) { if (comp) printf("\nComparing %s to c(%s)\n",aname,bname); else printf("\nComparing %s to %s\n",aname,bname); } if (alen == 0 || blen == 0) goto zerowork; { int i, j, p; uint64 c; int limit; MG_alist = asort; 
MG_blist = bsort; MG_self = (aname == bname); MG_comp = comp; parmm[0].abeg = parmm[0].bbeg = 0; for (i = 1; i < NTHREADS; i++) { p = (int) ((((int64) alen) * i) >> NSHIFT); if (p > 0) { c = asort[p-1].code; while (asort[p].code == c) p += 1; } parmm[i].abeg = parmm[i-1].aend = p; parmm[i].bbeg = parmm[i-1].bend = find_tuple(asort[p].code,bsort,blen); } parmm[NTHREADS-1].aend = alen; parmm[NTHREADS-1].bend = blen; for (i = 0; i < NTHREADS; i++) for (j = 0; j < MAXGRAM; j++) parmm[i].hitgram[j] = 0; for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,count_thread,parmm+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); if (VERBOSE) printf("\n"); if (MEM_LIMIT > 0) { int64 histo[MAXGRAM]; int64 tom, avail; for (j = 0; j < MAXGRAM; j++) histo[j] = parmm[0].hitgram[j]; for (i = 1; i < NTHREADS; i++) for (j = 0; j < MAXGRAM; j++) histo[j] += parmm[i].hitgram[j]; if (asort == bsort || (int64) (MEM_LIMIT/sizeof(Double)) > alen + 2*blen) avail = (MEM_LIMIT/sizeof(Double) - alen) / 2; else avail = MEM_LIMIT/sizeof(Double) - (alen + blen); avail *= .98; tom = 0; for (j = 0; j < MAXGRAM; j++) { tom += j*histo[j]; if (tom > avail) break; } limit = j; if (limit <= 1) { fprintf(stderr,"\nError: Insufficient "); if (MEM_LIMIT == MEM_PHYSICAL) fprintf(stderr," physical memory (%.1fGb), reduce block size\n", (1.*MEM_LIMIT)/0x40000000ll); else { fprintf(stderr," memory allocation (%.1fGb),",(1.*MEM_LIMIT)/0x40000000ll); fprintf(stderr," reduce block size or increase allocation\n"); } fflush(stderr); exit (1); } if (limit < 10) { fprintf(stderr,"\nWarning: Sensitivity hampered by low "); if (MEM_LIMIT == MEM_PHYSICAL) fprintf(stderr," physical memory (%.1fGb), reduce block size\n", (1.*MEM_LIMIT)/0x40000000ll); else { fprintf(stderr," memory allocation (%.1fGb),",(1.*MEM_LIMIT)/0x40000000ll); fprintf(stderr," reduce block size or increase allocation\n"); } fflush(stderr); } if (VERBOSE) { printf(" Capping mutual k-mer matches over %d (effectively -t%d)\n", 
limit,(int) sqrt(1.*limit)); fflush(stdout); } for (i = 0; i < NTHREADS; i++) { parmm[i].nhits = 0; for (j = 1; j < limit; j++) parmm[i].nhits += j * parmm[i].hitgram[j]; parmm[i].limit = limit; } } else for (i = 0; i < NTHREADS; i++) parmm[i].limit = INT32_MAX; nhits = parmm[0].nhits; for (i = 1; i < NTHREADS; i++) parmm[i].nhits = nhits += parmm[i].nhits; if (VERBOSE) { printf(" Hit count = "); Print_Number(nhits,0,stdout); if (asort == bsort || nhits >= blen) printf("\n Highwater of %.2fGb space\n", (1. * (alen + 2*nhits)) / 67108864); else printf("\n Highwater of %.2fGb space\n", (1. * (alen + blen + nhits)) / 67108864); fflush(stdout); } if (nhits == 0) goto zerowork; if (asort == bsort) hhit = work1 = (SeedPair *) Malloc(sizeof(SeedPair)*(nhits+1), "Allocating dazzler hit vectors"); else { if (nhits >= blen) bsort = (KmerPos *) Realloc(bsort,sizeof(SeedPair)*(nhits+1), "Reallocating dazzler sort vectors"); hhit = work1 = (SeedPair *) bsort; } khit = work2 = (SeedPair *) Malloc(sizeof(SeedPair)*(nhits+1),"Allocating dazzler hit vectors"); if (hhit == NULL || khit == NULL || bsort == NULL) exit (1); MG_blist = bsort; MG_hits = khit; for (i = NTHREADS-1; i > 0; i--) parmm[i].nhits = parmm[i-1].nhits; parmm[0].nhits = 0; for (i = 0; i < NTHREADS; i++) { parmm[i].kptr = parmx[i].tptr; for (p = 0; p < BPOWR; p++) parmm[i].kptr[p] = 0; } for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,merge_thread,parmm+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); #ifdef TEST_PAIRS printf("\nSETUP SORT:\n"); for (i = 0; i < HOW_MANY && i < nhits; i++) { SeedPair *c = khit+i; printf(" %5d / %5d / %5d /%5d\n",c->aread,c->bread,c->apos,c->apos-c->diag); } #endif } { int i; int64 x; x = 0; for (i = 0; i < NTHREADS-1; i++) { parmx[i].beg = x; parmx[i].end = x = parmm[i+1].nhits; } parmx[NTHREADS-1].beg = x; parmx[NTHREADS-1].end = nhits; khit = (SeedPair *) lex_sort(pairsort,(Double *) khit,(Double *) hhit,parmx); khit[nhits].aread = 0x7fffffff; 
khit[nhits].bread = 0x7fffffff; khit[nhits].diag = 0x7fffffff; khit[nhits].apos = 0; #ifdef TEST_CSORT printf("\nCROSS SORT %lld:\n",nhits); for (i = 0; i < HOW_MANY && i <= nhits; i++) { SeedPair *c = khit+i; printf(" %5d / %5d / %5d /%5d\n",c->aread,c->bread,c->apos,c->apos-c->diag); } #endif } { int i, w; int64 p; int d; int *counters; MR_ablock = ablock; MR_bblock = bblock; MR_hits = khit; MR_two = ! MG_self && SYMMETRIC; MR_spec = aspec; MR_tspace = Trace_Spacing(aspec); parmr[0].beg = 0; for (i = 1; i < NTHREADS; i++) { p = (nhits * i) >> NSHIFT; if (p > 0) { d = khit[p-1].bread; while ((khit[p].bread) == d) p += 1; } parmr[i].beg = parmr[i-1].end = p; } parmr[NTHREADS-1].end = nhits; w = ((ablock->maxlen >> Binshift) - ((-bblock->maxlen) >> Binshift)) + 1; counters = (int *) Malloc(NTHREADS*3*w*sizeof(int),"Allocating diagonal buckets"); if (counters == NULL) exit (1); for (i = 0; i < 3*w*NTHREADS; i++) counters[i] = 0; for (i = 0; i < NTHREADS; i++) { if (i == 0) parmr[i].score = counters - ((-bblock->maxlen) >> Binshift); else parmr[i].score = parmr[i-1].lasta + w; parmr[i].lastp = parmr[i].score + w; parmr[i].lasta = parmr[i].lastp + w; parmr[i].work = New_Work_Data(); parmr[i].ofile1 = Fopen(Catenate(aname,".",bname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w"); if (parmr[i].ofile1 == NULL) exit (1); if (MG_self) parmr[i].ofile2 = parmr[i].ofile1; else if (SYMMETRIC) { parmr[i].ofile2 = Fopen(Catenate(bname,".",aname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w"); if (parmr[i].ofile2 == NULL) exit (1); } } #ifdef NOTHREAD for (i = 0; i < NTHREADS; i++) report_thread(parmr+i); #else for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,report_thread,parmr+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); #endif if (VERBOSE) for (i = 0; i < NTHREADS; i++) { nfilt += parmr[i].nfilt; ncheck += parmr[i].ncheck; } for (i = 0; i < NTHREADS; i++) Free_Work_Data(parmr[i].work); free(counters); } free(work2); free(work1); goto epilogue; 
zerowork: { FILE *ofile; int i; nhits = 0; for (i = 0; i < NTHREADS; i++) { ofile = Fopen(Catenate(aname,".",bname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w"); fwrite(&nhits,sizeof(int64),1,ofile); fwrite(&MR_tspace,sizeof(int),1,ofile); fclose(ofile); if (! MG_self && SYMMETRIC) { ofile = Fopen(Catenate(bname,".",aname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w"); fwrite(&nhits,sizeof(int64),1,ofile); fwrite(&MR_tspace,sizeof(int),1,ofile); fclose(ofile); } } } epilogue: if (VERBOSE) { int width; if (nhits <= 0) width = 1; else width = ((int) log10((double) nhits)) + 1; width += (width-1)/3; printf("\n "); Print_Number(nhits,width,stdout); printf(" %d-mers (%e of matrix)\n ",Kmer,(1.*nhits/atot)/btot); Print_Number(nfilt,width,stdout); printf(" seed hits (%e of matrix)\n ",(1.*nfilt/atot)/btot); Print_Number(ncheck,width,stdout); printf(" confirmed hits (%e of matrix)\n",(1.*ncheck/atot)/btot); fflush(stdout); } } DALIGNER-master/filter.h000066400000000000000000000017041263373675100152110ustar00rootroot00000000000000/******************************************************************************************* * * Filter interface for the dazzler. * * Author: Gene Myers * Date : July 2013 * ********************************************************************************************/ #ifndef _FILTER #define _FILTER #include "DB.h" #include "align.h" extern int BIASED; extern int VERBOSE; extern int MINOVER; extern int HGAP_MIN; extern int SYMMETRIC; extern int IDENTITY; extern uint64 MEM_LIMIT; extern uint64 MEM_PHYSICAL; #define NTHREADS 4 // Must be a power of 2 #define NSHIFT 2 // log_2 NTHREADS int Set_Filter_Params(int kmer, int binshift, int suppress, int hitmin); void *Sort_Kmers(HITS_DB *block, int *len); void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock, void *atable, int alen, void *btable, int blen, int comp, Align_Spec *asettings); #endif