pax_global_header00006660000000000000000000000064132246522450014517gustar00rootroot0000000000000052 comment=233274ac5a069342715cc6119e61a7fd86c507bf DALIGNER-master/000077500000000000000000000000001322465224500135415ustar00rootroot00000000000000DALIGNER-master/DB.c000066400000000000000000001522361322465224500142030ustar00rootroot00000000000000/******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif /******************************************************************************************* * * GENERAL UTILITIES * ********************************************************************************************/ char *Prog_Name; #ifdef INTERACTIVE char Ebuffer[1000]; #endif int Count_Args(char *var) { int cnt, lev; char *s; cnt = 1; lev = 0; for (s = var; *s != '\0'; s++) if (*s == ',') { if (lev == 0) cnt += 1; } else if (*s == '(') lev += 1; else if (*s == ')') lev -= 1; return (cnt); } void *Malloc(int64 size, char *mesg) { void *p; if ((p = malloc(size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } void *Realloc(void *p, int64 size, char *mesg) { if (size <= 0) size = 1; if ((p = realloc(p,size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } char *Strdup(char *name, char *mesg) { char *s; if (name == NULL) return (NULL); if ((s = strdup(name)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (s); } FILE *Fopen(char *name, char *mode) { FILE *f; if (name == NULL || mode == NULL) return (NULL); if ((f = fopen(name,mode)) == NULL) EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); return (f); } char *PathTo(char *name) { char *path, *find; if (name == NULL) return (NULL); if ((find = rindex(name,'/')) != NULL) { *find = '\0'; path = Strdup(name,"Extracting path from"); *find = '/'; } else path = Strdup(".","Allocating default path"); return (path); } char *Root(char *name, char *suffix) { char *path, *find, *dot; int epos; if (name == NULL) return (NULL); find = rindex(name,'/'); if (find == NULL) find = name; else find += 1; if (suffix == NULL) { dot = strchr(find,'.'); if (dot != NULL) *dot = 
'\0'; path = Strdup(find,"Extracting root from"); if (dot != NULL) *dot = '.'; } else { epos = strlen(find); epos -= strlen(suffix); if (epos > 0 && strcasecmp(find+epos,suffix) == 0) { find[epos] = '\0'; path = Strdup(find,"Extracting root from"); find[epos] = suffix[0]; } else path = Strdup(find,"Allocating root"); } return (path); } char *Catenate(char *path, char *sep, char *root, char *suffix) { static char *cat = NULL; static int max = -1; int len; if (path == NULL || root == NULL || sep == NULL || suffix == NULL) return (NULL); len = strlen(path); len += strlen(sep); len += strlen(root); len += strlen(suffix); if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); return (NULL); } } sprintf(cat,"%s%s%s%s",path,sep,root,suffix); return (cat); } char *Numbered_Suffix(char *left, int num, char *right) { static char *suffix = NULL; static int max = -1; int len; if (left == NULL || right == NULL) return (NULL); len = strlen(left); len += strlen(right) + 40; if (len > max) { max = ((int) (1.2*len)) + 100; if ((suffix = (char *) realloc(suffix,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); return (NULL); } } sprintf(suffix,"%s%d%s",left,num,right); return (suffix); } #define COMMA ',' // Print big integers with commas/periods for better readability void Print_Number(int64 num, int width, FILE *out) { if (width == 0) { if (num < 1000ll) fprintf(out,"%lld",num); else if (num < 1000000ll) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else if (num < 1000000000ll) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll, COMMA,(num%1000000000ll)/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); } else { if (num < 1000ll) fprintf(out,"%*lld",width,num); else if (num < 
1000000ll) { if (width <= 4) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll); } else if (num < 1000000000ll) { if (width <= 8) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); } else { if (width <= 12) fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); } } } // Return the number of digits, base 10, of num int Number_Digits(int64 num) { int digit; digit = 0; while (num >= 1) { num /= 10; digit += 1; } return (digit); } /******************************************************************************************* * * READ COMPRESSION/DECOMPRESSION UTILITIES * ********************************************************************************************/ // Compress read into 2-bits per base (from [0-3] per byte representation void Compress_Read(int len, char *s) { int i; char c, d; char *s0, *s1, *s2, *s3; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; c = s1[len]; d = s2[len]; s0[len] = s1[len] = s2[len] = 0; for (i = 0; i < len; i += 4) *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]); s1[len] = c; s2[len] = d; } // Uncompress read form 2-bits per base into [0-3] per byte representation void Uncompress_Read(int len, char *s) { int i, tlen, byte; char *s0, *s1, *s2, *s3; char *t; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; tlen = (len-1)/4; t = s+tlen; for (i = tlen*4; i >= 0; i -= 4) { byte = *t--; s0[i] = (char) ((byte >> 6) & 0x3); s1[i] = (char) ((byte >> 4) & 0x3); s2[i] = (char) ((byte >> 2) & 0x3); s3[i] = (char) (byte & 0x3); } s[len] = 4; } // Convert read in [0-3] 
//  representation to ascii representation.  The three routines below walk the read up to
//    its end marker 4, replace every code 0..3 by a character, and overwrite the marker
//    with the string terminator '\0'.  Every byte before the marker must be in 0..3.

//  Codes 0,1,2,3 become 'a','c','g','t'.

void Lower_Read(char *s)
{ static char letter[4] = { 'a', 'c', 'g', 't' };

  for ( ; *s != 4; s++)
    *s = letter[(int) *s];
  *s = '\0';
}

//  Codes 0,1,2,3 become 'A','C','G','T'.

void Upper_Read(char *s)
{ static char letter[4] = { 'A', 'C', 'G', 'T' };

  for ( ; *s != 4; s++)
    *s = letter[(int) *s];
  *s = '\0';
}

//  Codes 0,1,2,3 become '1','2','3','4'.

void Letter_Arrow(char *s)
{ static char letter[4] = { '1', '2', '3', '4' };

  for ( ; *s != 4; s++)
    *s = letter[(int) *s];
  *s = '\0';
}

//  Convert read in ascii representation to [0-3] representation (end with 4)
//    c/C -> 1, g/G -> 2, t/T -> 3 and every other character -> 0.  The table only covers
//    the 7-bit codes, so a byte outside 0..127 is mapped to the table's default (0) instead
//    of being used as an index: with a plain char index such a byte would address memory
//    outside the table.

void Number_Read(char *s)
{ static char number[128] =
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    };
  unsigned char c;

  for ( ; *s != '\0'; s++)
    { c = (unsigned char) *s;
      if (c < 128)
        *s = number[c];
      else
        *s = 0;
    }
  *s = 4;
}

//  Convert an ascii "arrow" string to [0-3] representation (end with 4)
//    '1' -> 0, '2' -> 1, '3' -> 2, 'G' -> 2 and every other character -> 3.  As in
//    Number_Read a byte outside 0..127 gets the table's default (3) rather than indexing
//    outside the table.

void Number_Arrow(char *s)
{ static char arrow[128] =
    { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    };
  unsigned char c;

  for ( ; *s != '\0'; s++)
    { c = (unsigned char) *s;
      if (c < 128)
        *s = arrow[c];
      else
        *s = 3;
    }
  *s = 4;
}


/*******************************************************************************************
 *
 *  DB OPEN, TRIM & CLOSE ROUTINES
 *
 ********************************************************************************************/


// Open the given database or dam, "path" into the supplied DAZZ_DB record "db".  If the name has
//   a part # in it then just the part is opened.  The index array is allocated (for all or
//   just the part) and read in.
// Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char* path, DAZZ_DB *db) { DAZZ_DB dbcopy; char *root, *pwd, *bptr, *fptr, *cat; int nreads; FILE *index, *dbvis; int status, plen, isdam; int part, cutoff, all; int ufirst, tfirst, ulast, tlast; status = -1; dbcopy = *db; plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); pwd = PathTo(path); bptr = rindex(root,'.'); if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-') { part = strtol(bptr+1,&fptr,10); if (*fptr != '\0' || part == 0) part = 0; else *bptr = '\0'; } else part = 0; isdam = 0; cat = Catenate(pwd,"/",root,".db"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { cat = Catenate(pwd,"/",root,".dam"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path); goto error; } isdam = 1; } if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) goto error1; if (fread(db,sizeof(DAZZ_DB),1,index) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); goto error2; } { int p, nblocks, nfiles; int64 size; char fname[MAX_NAME], prolog[MAX_NAME]; nblocks = 0; if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } for (p = 0; p < nfiles; p++) if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) if (part == 0) { cutoff = 0; all = DB_ALL; } else { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", Prog_Name,root); goto error2; } else { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is 
junk\n",Prog_Name,root); goto error2; } if (part > nblocks) { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); goto error2; } } if (part > 0) { for (p = 1; p <= part; p++) if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } } else { ufirst = tfirst = 0; ulast = db->ureads; tlast = db->treads; } } db->trimmed = 0; db->tracks = NULL; db->part = part; db->cutoff = cutoff; db->allarr |= all; db->ufirst = ufirst; db->tfirst = tfirst; nreads = ulast-ufirst; if (part <= 0) { db->reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index"); if (db->reads == NULL) goto error2; db->reads += 1; if (fread(db->reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(db->reads-1); goto error2; } } else { DAZZ_READ *reads; int i, r, maxlen; int64 totlen; reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index"); if (reads == NULL) goto error2; reads += 1; fseeko(index,sizeof(DAZZ_READ)*ufirst,SEEK_CUR); if (fread(reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(reads-1); goto error2; } totlen = 0; maxlen = 0; for (i = 0; i < nreads; i++) { r = reads[i].rlen; totlen += r; if (r > maxlen) maxlen = r; } db->maxlen = maxlen; db->totlen = totlen; db->reads = reads; } ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part ((int *) (db->reads))[-2] = tlast - tfirst; db->nreads = nreads; db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); if (db->path == NULL) goto error2; db->bases = NULL; db->loaded = 0; status = isdam; error2: fclose(index); error1: fclose(dbvis); error: if (bptr != 
NULL) *bptr = '.'; free(pwd); free(root); if (status < 0) *db = dbcopy; return (status); } // Trim the DB or part thereof and all loaded tracks according to the cuttof and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(DAZZ_DB *db) { int i, j, r; int allflag, cutoff; int64 totlen; int maxlen, nreads; DAZZ_TRACK *record; DAZZ_READ *reads; if (db->trimmed) return; if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return; cutoff = db->cutoff; if ((db->allarr & DB_ALL) != 0) allflag = 0; else allflag = DB_BEST; reads = db->reads; nreads = db->nreads; for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,".@qvs") == 0) { uint16 *table = ((DAZZ_QV *) record)->table; j = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) table[j++] = table[i]; } else { int *anno4, size; int64 *anno8; char *anno, *data; size = record->size; data = (char *) record->data; if (data == NULL) { anno = (char *) record->anno; j = 0; for (i = r = 0; i < db->nreads; i++, r += size) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (record->anno); j = anno4[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (record->anno); j = anno8[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno8[j],NULL); } 
record->anno = Realloc(record->anno,record->size*(j+1),NULL); } totlen = maxlen = 0; for (j = i = 0; i < nreads; i++) { r = reads[i].rlen; if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff) { totlen += r; if (r > maxlen) maxlen = r; reads[j++] = reads[i]; } } db->totlen = totlen; db->maxlen = maxlen; db->nreads = j; db->trimmed = 1; if (j < nreads) { db->reads = Realloc(reads-1,sizeof(DAZZ_READ)*(j+2),NULL); db->reads += 1; } } // The DB has already been trimmed, but a track over the untrimmed DB needs to be loaded. // Trim the track by rereading the untrimmed DB index from the file system. static int Late_Track_Trim(DAZZ_DB *db, DAZZ_TRACK *track, int ispart) { int i, j, r; int allflag, cutoff; int ureads; char *root; DAZZ_READ read; FILE *indx; if (!db->trimmed) return (0); if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return (0); cutoff = db->cutoff; if ((db->allarr & DB_ALL) != 0) allflag = 0; else allflag = DB_BEST; root = rindex(db->path,'/') + 2; indx = Fopen(Catenate(db->path,"","",".idx"),"r"); fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*db->ufirst,SEEK_SET); if (ispart) ureads = ((int *) (db->reads))[-1]; else ureads = db->ureads; if (strcmp(track->name,".@qvs") == 0) { EPRINTF(EPLACE,"%s: Cannot load QV track after trimming\n",Prog_Name); fclose(indx); EXIT(1); } { int *anno4, size; int64 *anno8; char *anno, *data; size = track->size; data = (char *) track->data; if (data == NULL) { anno = (char *) track->anno; j = r = 0; for (i = r = 0; i < ureads; i++, r += size) { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } r += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (track->anno); j = anno4[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: 
Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (track->anno); j = anno8[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno8[j],NULL); } track->anno = Realloc(track->anno,track->size*(j+1),NULL); } fclose(indx); return (0); } // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). 
void Close_DB(DAZZ_DB *db) { DAZZ_TRACK *t, *p; if (db->loaded) free(((char *) (db->bases)) - 1); else if (db->bases != NULL) fclose((FILE *) db->bases); if (db->reads != NULL) free(db->reads-1); free(db->path); Close_QVs(db); for (t = db->tracks; t != NULL; t = p) { p = t->next; free(t->anno); free(t->data); free(t); } } // Return the size in bytes of the memory occupied by a given DB int64 sizeof_DB(DAZZ_DB *db) { int64 s; DAZZ_TRACK *t; s = sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*(db->nreads+2) + strlen(db->path)+1 + (db->totlen+db->nreads+4); t = db->tracks; if (t != NULL && strcmp(t->name,".@qvs") == 0) { DAZZ_QV *q = (DAZZ_QV *) t; s += sizeof(DAZZ_QV) + sizeof(uint16) * db->nreads + q->ncodes * sizeof(QVcoding) + 6; t = t->next; } for (; t != NULL; t = t->next) { s += sizeof(DAZZ_TRACK) + strlen(t->name)+1 + t->size * (db->nreads+1); if (t->data != NULL) { if (t->size == 8) s += sizeof(int)*((int64 *) t->anno)[db->nreads]; else // t->size == 4 s += sizeof(int)*((int *) t->anno)[db->nreads]; } } return (s); } /******************************************************************************************* * * QV LOAD & CLOSE ROUTINES * ********************************************************************************************/ DAZZ_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" DAZZ_QV *Active_QV; // Becomes invalid after closing int Load_QVs(DAZZ_DB *db) { FILE *quiva, *istub, *indx; char *root; uint16 *table; DAZZ_QV *qvtrk; QVcoding *coding, *nx; int ncodes = 0; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) return (0); if (db->trimmed) { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); EXIT(1); } if (db->reads[db->nreads-1].coff < 0) { if (db->part > 0) { EPRINTF(EPLACE,"%s: All QVs for this block have not been added to the DB!\n",Prog_Name); EXIT(1); } else { EPRINTF(EPLACE,"%s: All QVs for this DB have not been added!\n",Prog_Name); EXIT(1); } } // Open .qvs, .idx, and .db files quiva = 
Fopen(Catenate(db->path,"","",".qvs"),"r"); if (quiva == NULL) return (-1); istub = NULL; indx = NULL; table = NULL; coding = NULL; qvtrk = NULL; root = rindex(db->path,'/'); if (root[1] == '.') { *root = '\0'; istub = Fopen(Catenate(db->path,"/",root+2,".db"),"r"); *root = '/'; } else istub = Fopen(Catenate(db->path,"","",".db"),"r"); if (istub == NULL) goto error; { int first, last, nfiles; char prolog[MAX_NAME], fname[MAX_NAME]; int i, j; if (fscanf(istub,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (db->part > 0) { int pfirst, plast; int fbeg, fend; int n, k; FILE *indx; // Determine first how many and which files span the block (fbeg to fend) pfirst = db->ufirst; plast = pfirst + db->nreads; first = 0; for (fbeg = 0; fbeg < nfiles; fbeg++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (last > pfirst) break; first = last; } for (fend = fbeg+1; fend <= nfiles; fend++) { if (last >= plast) break; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } first = last; } indx = Fopen(Catenate(db->path,"","",".idx"),"r"); ncodes = fend-fbeg; coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (indx == NULL || coding == NULL || table == NULL) { ncodes = 0; goto error; } // Carefully get the first coding scheme (its offset is most likely in a DAZZ_RECORD // in .idx that is *not* in memory). Get all the other coding schemes normally and // assign the tables # for each read in the block in "tables". 
rewind(istub); (void) fscanf(istub,DB_NFILE,&nfiles); first = 0; for (n = 0; n < fbeg; n++) { (void) fscanf(istub,DB_FDATA,&last,fname,prolog); first = last; } for (n = fbeg; n < fend; n++) { (void) fscanf(istub,DB_FDATA,&last,fname,prolog); i = n-fbeg; if (first < pfirst) { DAZZ_READ read; fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*first,SEEK_SET); if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); ncodes = i; goto error; } fseeko(quiva,read.coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; } else { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first-pfirst].coff = ftello(quiva); } j = first-pfirst; if (j < 0) j = 0; k = last-pfirst; if (k > db->nreads) k = db->nreads; while (j < k) table[j++] = (uint16) i; first = last; } fclose(indx); indx = NULL; } else { // Load in coding scheme for each file, adjust .coff of first read in the file, and // record which table each read uses ncodes = nfiles; coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (coding == NULL || table == NULL) goto error; first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } fseeko(quiva,db->reads[first].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first].coff = ftello(quiva); for (j = first; j < last; j++) table[j] = (uint16) i; first = last; } } // Allocate and fill in the DAZZ_QV record and add it to the front of the // track list qvtrk = (DAZZ_QV *) Malloc(sizeof(DAZZ_QV),"Allocating QV pseudo-track"); if (qvtrk == NULL) goto error; qvtrk->name = 
Strdup(".@qvs","Allocating QV pseudo-track name"); if (qvtrk->name == NULL) goto error; qvtrk->next = db->tracks; db->tracks = (DAZZ_TRACK *) qvtrk; qvtrk->ncodes = ncodes; qvtrk->table = table; qvtrk->coding = coding; qvtrk->quiva = quiva; } fclose(istub); return (0); error: if (qvtrk != NULL) free(qvtrk); if (table != NULL) free(table); if (coding != NULL) { int i; for (i = 0; i < ncodes; i++) Free_QVcoding(coding+i); free(coding); } if (indx != NULL) fclose(indx); if (istub != NULL) fclose(istub); fclose(quiva); EXIT(1); } // Close the QV stream, free the QV pseudo track and all associated memory void Close_QVs(DAZZ_DB *db) { DAZZ_TRACK *track; DAZZ_QV *qvtrk; int i; Active_DB = NULL; track = db->tracks; if (track != NULL && strcmp(track->name,".@qvs") == 0) { qvtrk = (DAZZ_QV *) track; for (i = 0; i < qvtrk->ncodes; i++) Free_QVcoding(qvtrk->coding+i); free(qvtrk->coding); free(qvtrk->table); fclose(qvtrk->quiva); db->tracks = track->next; free(track); } return; } /******************************************************************************************* * * TRACK LOAD & CLOSE ROUTINES * ********************************************************************************************/ // Return status of track: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track int Check_Track(DAZZ_DB *db, char *track, int *kind) { FILE *afile; int tracklen, size, ispart; int ureads, treads; afile = NULL; if (db->part > 0) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) return (-2); if (fread(&tracklen,sizeof(int),1,afile) != 1) { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); exit (1); } if (fread(&size,sizeof(int),1,afile) != 1) { fprintf(stderr,"%s: track files for %s are 
corrupted\n",Prog_Name,track); exit (1); } if (size == 0) *kind = MASK_TRACK; else if (size > 0) *kind = CUSTOM_TRACK; else { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); exit (1); } fclose(afile); if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (tracklen == ureads) return (0); else if (tracklen == treads) return (1); else return (-1); } // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created DAZZ_TRACK record. If the track does not exist or cannot be // opened for some reason, then NULL is returned. DAZZ_TRACK *Load_Track(DAZZ_DB *db, char *track) { FILE *afile, *dfile; int tracklen, size; int nreads, ispart; int treads, ureads; void *anno; void *data; char *name; DAZZ_TRACK *record; if (track[0] == '.') { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track); EXIT(NULL); } for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,track) == 0) return (record); afile = NULL; if (db->part) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track); return (NULL); } dfile = NULL; anno = NULL; data = NULL; record = NULL; if (ispart) name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data"); else name = Catenate(db->path,".",track,".data"); if (name == NULL) goto error; dfile = fopen(name,"r"); if (fread(&tracklen,sizeof(int),1,afile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (fread(&size,sizeof(int),1,afile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' annotation 
file is junk\n",Prog_Name,track); goto error; } if (size < 0) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (size == 0) size = 8; if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (db->trimmed) { if (tracklen != treads && tracklen != ureads) { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); goto error; } if ( ! ispart && db->part > 0) { if (tracklen == treads) fseeko(afile,size*db->tfirst,SEEK_CUR); else fseeko(afile,size*db->ufirst,SEEK_CUR); } } else { if (tracklen != ureads) { if (tracklen == treads) EPRINTF(EPLACE,"%s: Track '%s' is for a trimmed DB !\n",Prog_Name,track); else EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); goto error; } if ( ! ispart && db->part > 0) fseeko(afile,size*db->ufirst,SEEK_CUR); } if (tracklen == treads) nreads = ((int *) (db->reads))[-2]; else nreads = ((int *) (db->reads))[-1]; anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector"); if (anno == NULL) goto error; if (dfile != NULL) { int64 *anno8, off8, dlen; int *anno4, off4; int i; if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1)) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } if (size == 4) { anno4 = (int *) anno; off4 = anno4[0]; if (off4 != 0) { for (i = 0; i <= nreads; i++) anno4[i] -= off4; fseeko(dfile,off4,SEEK_SET); } dlen = anno4[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } else { anno8 = (int64 *) anno; off8 = anno8[0]; if (off8 != 0) { for (i = 0; i <= nreads; i++) anno8[i] -= off8; fseeko(dfile,off8,SEEK_SET); } dlen = anno8[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } if (data == NULL) goto error; if (dlen > 0) { if (fread(data,dlen,1,dfile) != 1) { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track); goto error; } } 
fclose(dfile); dfile = NULL; } else { if (fread(anno,size,nreads,afile) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); goto error; } data = NULL; } fclose(afile); record = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK),"Allocating Track Record"); if (record == NULL) goto error; record->name = Strdup(track,"Allocating Track Name"); if (record->name == NULL) goto error; record->data = data; record->anno = anno; record->size = size; if (db->trimmed && tracklen != treads) { if (Late_Track_Trim(db,record,ispart)) goto error; } if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) { record->next = db->tracks->next; db->tracks->next = record; } else { record->next = db->tracks; db->tracks = record; } return (record); error: if (record != NULL) free(record); if (data != NULL) free(data); if (anno != NULL) free(anno); if (dfile != NULL) fclose(dfile); fclose(afile); EXIT (NULL); } // Assumming file pointer for afile is correctly positioned at the start of a extra item, // and aname is the name of the .anno file, decode the value present and places it in // extra if extra->nelem == 0, otherwise reduce the value just read into extra according // according the to the directive given by 'accum'. Leave the read poinrt at the next // extra or end-of-file. 
// Returns: // 1 if at the end of file, // 0 if item was read and folded correctly, // -1 if there was a system IO or allocation error (if interactive), and // -2 if the new value could not be reduced into the currenct value of extra (interactive) int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra) { int vtype, nelem, accum, slen; char *name; void *value; #define EREAD(v,s,n,file,ret) \ { if (fread(v,s,n,file) != (size_t) n) \ { if (ferror(file)) \ fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ else if (ret) \ return (1); \ else \ fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,aname); \ EXIT(-1); \ } \ } EREAD(&vtype,sizeof(int),1,afile,1) EREAD(&nelem,sizeof(int),1,afile,0) EREAD(&accum,sizeof(int),1,afile,0) EREAD(&slen,sizeof(int),1,afile,0) if (extra == NULL) { if (fseeko(afile,slen+8*nelem,SEEK_CUR) < 0) { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); EXIT(-1); } return (0); } name = (char *) Malloc(slen+1,"Allocating extra name"); value = Malloc(8*nelem,"Allocating extra value"); if (name == NULL || value == NULL) EXIT(-1); EREAD(name,1,slen,afile,0); EREAD(value,8,nelem,afile,0); name[slen] = '\0'; if (extra->nelem == 0) { extra->vtype = vtype; extra->nelem = nelem; extra->accum = accum; extra->name = name; extra->value = value; return (0); } if (vtype != extra->vtype) { fprintf(stderr,"%s: Type of extra %s does not agree with previous .anno block files\n", Prog_Name,name); goto error; } if (nelem != extra->nelem) { fprintf(stderr,"%s: Length of extra %s does not agree with previous .anno block files\n", Prog_Name,name); goto error; } if (accum != extra->accum) { fprintf(stderr,"%s: Reduction indicator of extra %s does not agree with",Prog_Name,name); fprintf(stderr," previos .anno block files\n"); goto error; } if (strcmp(name,extra->name) != 0) { fprintf(stderr,"%s: Expecting extra %s in .anno block file, not %s\n", Prog_Name,extra->name,name); goto error; } if (vtype == DB_INT) { int64 *ival = (int64 *) 
value; int64 *eval = (int64 *) (extra->value); int j; if (accum == DB_EXACT) { for (j = 0; j < nelem; j++) if (eval[j] != ival[j]) { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name); fprintf(stderr," with previous .anno block files\n"); goto error; } } else { for (j = 0; j < nelem; j++) eval[j] += ival[j]; } } else { double *ival = (double *) value; double *eval = (double *) (extra->value); int j; if (accum == DB_EXACT) { for (j = 0; j < nelem; j++) if (eval[j] != ival[j]) { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name); fprintf(stderr," with previous .anoo block files\n"); goto error; } } else { for (j = 0; j < nelem; j++) eval[j] += ival[j]; } } free(value); free(name); return (0); error: free(value); free(name); EXIT(1); } // Write extra record to end of file afile and advance write pointer // If interactive, then return non-zero on error, if bash, then print // and halt if an error int Write_Extra(FILE *afile, DAZZ_EXTRA *extra) { int slen; #define EWRITE(v,s,n,file) \ { if (fwrite(v,s,n,file) != (size_t) n) \ { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ EXIT(1); \ } \ } EWRITE(&(extra->vtype),sizeof(int),1,afile) FWRITE(&(extra->nelem),sizeof(int),1,afile) FWRITE(&(extra->accum),sizeof(int),1,afile) slen = strlen(extra->name); FWRITE(&slen,sizeof(int),1,afile) FWRITE(extra->name,1,slen,afile) FWRITE(extra->value,8,extra->nelem,afile) return (0); } void Close_Track(DAZZ_DB *db, char *track) { DAZZ_TRACK *record, *prev; prev = NULL; for (record = db->tracks; record != NULL; record = record->next) { if (strcmp(record->name,track) == 0) { free(record->anno); free(record->data); free(record->name); if (prev == NULL) db->tracks = record->next; else prev->next = record->next; free(record); return; } prev = record; } return; } /******************************************************************************************* * * READ BUFFER ALLOCATION AND READ ACCESS * 
********************************************************************************************/ // Allocate and return a buffer big enough for the largest read in 'db', leaving room // for an initial delimiter character char *New_Read_Buffer(DAZZ_DB *db) { char *read; read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer"); if (read == NULL) EXIT(NULL); return (read+1); } // Load into 'read' the i'th read in 'db'. As an upper case ASCII string if ascii is 2, as a // lower-case ASCII string is ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and // 3(T) otherwise. // // **NB**, the byte before read will be set to a delimiter character! int Load_Read(DAZZ_DB *db, int i, char *read, int ascii) { FILE *bases = (FILE *) db->bases; int64 off; int len, clen; DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); EXIT(1); } if (bases == NULL) { bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(1); db->bases = (void *) bases; } off = r[i].boff; len = r[i].rlen; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(read,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); EXIT(1); } } Uncompress_Read(len,read); if (ascii == 1) { Lower_Read(read); read[-1] = '\0'; } else if (ascii == 2) { Upper_Read(read); read[-1] = '\0'; } else read[-1] = 4; return (0); } // Load into 'read' the i'th arrow in 'db'. As an ASCII string if ascii is 1, // and as a numeric string otherwise. 
// DAZZ_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow" FILE *Arrow_File = NULL; // Becomes invalid after closing int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii) { FILE *arrow; int64 off; int len, clen; DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Arrow)\n",Prog_Name); EXIT(1); } if (Arrow_DB != db) { if (Arrow_File != NULL) fclose(Arrow_File); arrow = Fopen(Catenate(db->path,"","",".arw"),"r"); if (arrow == NULL) EXIT(1); Arrow_File = arrow; Arrow_DB = db; } else arrow = Arrow_File; off = r[i].boff; len = r[i].rlen; if (ftello(arrow) != off) fseeko(arrow,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(read,clen,1,arrow) != 1) { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Arrow)\n",Prog_Name); EXIT(1); } } Uncompress_Read(len,read); if (ascii == 1) { Letter_Arrow(read); read[-1] = '\0'; } else read[-1] = 4; return (0); } char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii) { FILE *bases = (FILE *) db->bases; int64 off; int len, clen; int bbeg, bend; DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); EXIT(NULL); } if (bases == NULL) { bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(NULL); db->bases = (void *) bases; } bbeg = beg/4; bend = (end-1)/4+1; off = r[i].boff + bbeg; len = end - beg; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = bend-bbeg; if (clen > 0) { if (fread(read,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); EXIT(NULL); } } Uncompress_Read(4*clen,read); read += beg%4; read[len] = 4; if (ascii == 1) { Lower_Read(read); read[-1] = '\0'; } else if (ascii == 2) { Upper_Read(read); read[-1] = '\0'; } else read[-1] = 4; return (read); } /******************************************************************************************* * * QV BUFFER ALLOCATION QV READ ACCESS * 
********************************************************************************************/ // Allocate and return a buffer of 5 vectors big enough for the largest read in 'db' char **New_QV_Buffer(DAZZ_DB *db) { char **entry; char *qvs; int i; qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer"); entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer"); if (qvs == NULL || entry == NULL) EXIT(NULL); for (i = 0; i < 5; i++) entry[i] = qvs + i*db->maxlen; return (entry); } // Load into entry the QV streams for the i'th read from db. The parameter ascii applies to // the DELTAG stream as described for Load_Read. int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii) { DAZZ_READ *reads; FILE *quiva; int rlen; if (db != Active_DB) { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0) { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name); EXIT(1); } Active_QV = (DAZZ_QV *) db->tracks; Active_DB = db; } if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); EXIT(1); } reads = db->reads; quiva = Active_QV->quiva; rlen = reads[i].rlen; fseeko(quiva,reads[i].coff,SEEK_SET); if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen)) EXIT(1); if (ascii != 1) { char *deltag = entry[1]; if (ascii != 2) { char x = deltag[rlen]; deltag[rlen] = '\0'; Number_Read(deltag); deltag[rlen] = x; } else { int j; int u = 'A'-'a'; for (j = 0; j < rlen; j++) deltag[j] = (char) (deltag[j]+u); } } return (0); } /******************************************************************************************* * * BLOCK LOAD OF ALL READS (PRIMARILY FOR DALIGNER) * ********************************************************************************************/ // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the 
bases file. If ascii is // non-zero then the reads are converted to ACGT ascii, otherwise the reads are left // as numeric strings over 0(A), 1(C), 2(G), and 3(T). int Read_All_Sequences(DAZZ_DB *db, int ascii) { FILE *bases; int nreads = db->nreads; DAZZ_READ *reads = db->reads; void (*translate)(char *s); char *seq; int64 o, off; int i, len, clen; bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(1); seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); if (seq == NULL) { fclose(bases); EXIT(1); } *seq++ = 4; if (ascii == 1) translate = Lower_Read; else translate = Upper_Read; o = 0; for (i = 0; i < nreads; i++) { len = reads[i].rlen; off = reads[i].boff; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(seq+o,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name); free(seq); fclose(bases); EXIT(1); } } Uncompress_Read(len,seq+o); if (ascii) translate(seq+o); reads[i].boff = o; o += (len+1); } reads[nreads].boff = o; fclose(bases); db->bases = (void *) seq; db->loaded = 1; return (0); } // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all // those of the form "prefix/[.]root.part" and call actor with the complete path to each file // pointed at by path, and the suffix of the path by extension. The . proceeds the root // name if the defined constant HIDE_FILES is set. Always the first call is with the // path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and // so this routine gives one a way to know all the tracks associated with a given DB. // -1 is returned if the path could not be found, and 1 is returned if an error (reported // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. 
int List_DB_Files(char *path, void actor(char *path, char *extension)) { int status, plen, rlen, dlen; char *root, *pwd, *name; int isdam; DIR *dirp; struct dirent *dp; status = 0; pwd = PathTo(path); plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); rlen = strlen(root); if (root == NULL || pwd == NULL) { free(pwd); free(root); EXIT(1); } if ((dirp = opendir(pwd)) == NULL) { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; goto error; } isdam = 0; while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary) { name = dp->d_name; if (strcmp(name,Catenate("","",root,".db")) == 0) break; if (strcmp(name,Catenate("","",root,".dam")) == 0) { isdam = 1; break; } } if (dp == NULL) { status = -1; closedir(dirp); goto error; } if (isdam) actor(Catenate(pwd,"/",root,".dam"),"dam"); else actor(Catenate(pwd,"/",root,".db"),"db"); rewinddir(dirp); // Report each auxiliary file while ((dp = readdir(dirp)) != NULL) { name = dp->d_name; dlen = strlen(name); #ifdef HIDE_FILES if (name[0] != '.') continue; dlen -= 1; name += 1; #endif if (dlen < rlen+1) continue; if (name[rlen] != '.') continue; if (strncmp(name,root,rlen) != 0) continue; actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); } closedir(dirp); error: free(pwd); free(root); return (status); } void Print_Read(char *s, int width) { int i; if (s[0] < 4) { for (i = 0; s[i] != 4; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%d",s[i]); } printf("\n"); } else { for (i = 0; s[i] != '\0'; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%c",s[i]); } printf("\n"); } } DALIGNER-master/DB.h000066400000000000000000000614371322465224500142120ustar00rootroot00000000000000/******************************************************************************************* * * Compressed data base module. 
Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. Eventually will also * support compressed quality information. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #ifndef _DAZZ_DB #define _DAZZ_DB #include #include "QV.h" #define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden" // Undefine if you don't want this // For interactive applications where it is inappropriate to simply exit with an error // message to standard error, define the constant INTERACTIVE. If set, then error // messages are put in the global variable Ebuffer and the caller of a DB routine // can decide how to deal with the error. // // DB, QV, or alignment routines that can encounter errors function as before in // non-INTERACTIVE mode by exiting after printing an error message to stderr. In // INTERACTIVE mode the routines place a message at EPLACE and return an error // value. For such routines that were previously void, they are now int, and // return 1 if an error occured, 0 otherwise. 
#ifdef INTERACTIVE #define EPRINTF sprintf #define EPLACE Ebuffer #define EXIT(x) return (x) #else // BATCH #define EPRINTF fprintf #define EPLACE stderr #define EXIT(x) exit (1) #endif typedef unsigned char uint8; typedef unsigned short uint16; typedef unsigned int uint32; typedef unsigned long long uint64; typedef signed char int8; typedef signed short int16; typedef signed int int32; typedef signed long long int64; typedef float float32; typedef double float64; /******************************************************************************************* * * COMMAND LINE INTERPRETATION MACROS * ********************************************************************************************/ extern char *Prog_Name; // Name of program #ifdef INTERACTIVE extern char Ebuffer[]; #endif #define ARG_INIT(name) \ Prog_Name = Strdup(name,""); \ for (i = 0; i < 128; i++) \ flags[i] = 0; #define ARG_FLAGS(set) \ for (k = 1; argv[i][k] != '\0'; k++) \ { if (index(set,argv[i][k]) == NULL) \ { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \ exit (1); \ } \ flags[(int) argv[i][k]] = 1; \ } #define ARG_POSITIVE(var,name) \ var = strtol(argv[i]+2,&eptr,10); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \ Prog_Name,argv[i][1],argv[i]+2); \ exit (1); \ } \ if (var <= 0) \ { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_NON_NEGATIVE(var,name) \ var = strtol(argv[i]+2,&eptr,10); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \ Prog_Name,argv[i][1],argv[i]+2); \ exit (1); \ } \ if (var < 0) \ { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_REAL(var) \ var = strtod(argv[i]+2,&eptr); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c '%s' argument is not a real number\n", \ Prog_Name,argv[i][1],argv[i]+2); \ exit (1); \ } 
/******************************************************************************************* * * GUARDED BATCH IO MACROS * ********************************************************************************************/ // Utilitieis int Count_Args(char *arg); #define SYSTEM_READ_ERROR \ { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ exit (2); \ } #define SYSTEM_WRITE_ERROR \ { fprintf(stderr,"%s: System error, write failed!\n",Prog_Name); \ exit (2); \ } #define SYSTEM_CLOSE_ERROR \ { fprintf(stderr,"%s: System error, file close failed!\n",Prog_Name); \ exit (2); \ } // Output #define FWRITE(v,s,n,file) \ { if (fwrite(v,s,n,file) != (size_t) n) \ SYSTEM_WRITE_ERROR \ } #define FPRINTF(file,...) \ { if (fprintf(file,__VA_ARGS__) < 0) \ SYSTEM_WRITE_ERROR \ } #define PRINTF(...) \ { if (printf(__VA_ARGS__) < 0) \ SYSTEM_WRITE_ERROR \ } #define FPUTS(x,file) \ { if (fputs(x,file) == EOF) \ SYSTEM_WRITE_ERROR \ } // Close #define FCLOSE(file) \ { if (fclose(file) != 0) \ SYSTEM_CLOSE_ERROR \ } // Input #define FREAD(v,s,n,file) \ { if (fread(v,s,n,file) != (size_t) n) \ { if (ferror(file)) \ SYSTEM_READ_ERROR \ else \ { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ exit (1); \ } \ } \ } #define FSCANF(file,...) 
\ { if (fscanf(file,__VA_ARGS__) != Count_Args(#__VA_ARGS__)-1) \ { if (ferror(file)) \ SYSTEM_READ_ERROR \ else \ { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ exit (1); \ } \ } \ } #define FGETS(v,n,file) \ { if (fgets(v,n,file) == NULL) \ { if (ferror(file)) \ SYSTEM_READ_ERROR \ else \ { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ exit (1); \ } \ } \ } #define FSEEKO(file,p,d) \ { if (fseeko(file,p,d) < 0) \ SYSTEM_READ_ERROR \ } #define FTELLO(file) \ ( { int x = ftello(file); \ if (x < 0) \ SYSTEM_READ_ERROR \ ; x; \ } ) /******************************************************************************************* * * UTILITIES * ********************************************************************************************/ // The following general utilities return NULL if any of their input pointers are NULL, or if they // could not perform their function (in which case they also print an error to stderr). void *Malloc(int64 size, char *mesg); // Guarded versions of malloc, realloc void *Realloc(void *object, int64 size, char *mesg); // and strdup, that output "mesg" to char *Strdup(char *string, char *mesg); // stderr if out of memory FILE *Fopen(char *path, char *mode); // Open file path for "mode" char *PathTo(char *path); // Return path portion of file name "path" char *Root(char *path, char *suffix); // Return the root name, excluding suffix, of "path" // Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer // Numbered_Suffix returns concatenation of left..right in a *temporary* buffer char *Catenate(char *path, char *sep, char *root, char *suffix); char *Numbered_Suffix(char *left, int num, char *right); // DB-related utilities void Print_Number(int64 num, int width, FILE *out); // Print readable big integer int Number_Digits(int64 num); // Return # of digits in printed number #define COMPRESSED_LEN(len) (((len)+3) >> 2) void Compress_Read(int len, char *s); // 
Compress read in-place into 2-bit form void Uncompress_Read(int len, char *s); // Uncompress read in-place into numeric form void Print_Read(char *s, int width); void Lower_Read(char *s); // Convert read from numbers to lowercase letters (0-3 to acgt) void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT) void Number_Read(char *s); // Convert read from letters to numbers void Letter_Arrow(char *s); // Convert arrow pw's from numbers to uppercase letters (0-3 to 1234) void Number_Arrow(char *s); // Convert arrow pw string from letters to numbers /******************************************************************************************* * * DB IN-CORE DATA STRUCTURES * ********************************************************************************************/ #define DB_QV 0x03ff // Mask for 3-digit quality value #define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert #define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1) #define DB_ARROW 0x2 // DB is an arrow DB #define DB_ALL 0x1 // all wells are in the trimmed DB // Fields have different interpretations if a .db versus a .dam typedef struct { int origin; // Well # (DB), Contig # (DAM) int rlen; // Length of the sequence (Last pulse = fpulse + rlen) int fpulse; // First pulse (DB), left index of contig in scaffold (DAM) int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in '.qvs' file (DB), // Offset (in bytes) of scaffold header string in '.hdr' file (DAM) // 4 compressed shorts containing snr info if an arrow DB. int flags; // QV of read + flags above (DB only) } DAZZ_READ; // A track can be of 3 types: // data == NULL: there are nreads 'anno' records of size 'size'. 
// data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1]) // contains the variable length data // data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1]) // contains the variable length data typedef struct _track { struct _track *next; // Link to next track char *name; // Symbolic name of track int size; // Size in bytes of anno records void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records void *data; // data[anno[i] .. anno[i+1]-1] is data if data != NULL } DAZZ_TRACK; // The tailing part of a .anno track file can contain meta-information produced by the // command that produced the track. For example, the coverage, or good/bad parameters // for trimming, or even say a histogram of QV values. Each item is an array of 'nelem' // 64-bit ints or floats ('vtype' = DB_INT or DB_REAL), has a 'name' string that // describes it, and an indicator as to whether the values should be equal accross all // block tracks, or summed accross all block tracks (by Catrack). 'value' points at the // array of values #define DB_INT 0 #define DB_REAL 1 #define DB_EXACT 0 #define DB_SUM 1 typedef struct { int vtype; // INT64 or FLOAST64 int nelem; // >= 1 int accum; // EXACT, SUM char *name; void *value; } DAZZ_EXTRA; // The information for accessing QV streams is in a DAZZ_QV record that is a "pseudo-track" // named ".@qvs" and is always the first track record in the list (if present). Since normal // track names cannot begin with a . (this is enforced), this pseudo-track is never confused // with a normal track. 
typedef struct { struct _track *next; char *name; int ncodes; // # of coding tables QVcoding *coding; // array [0..ncodes-1] of coding schemes (see QV.h) uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with // scheme coding[table[i]] FILE *quiva; // the open file pointer to the .qvs file } DAZZ_QV; // The DB record holds all information about the current state of an active DB including an // array of DAZZ_READS, one per read, and a linked list of DAZZ_TRACKs the first of which // is always a DAZZ_QV pseudo-track (if the QVs have been loaded). typedef struct { int ureads; // Total number of reads in untrimmed DB int treads; // Total number of reads in trimmed DB int cutoff; // Minimum read length in block (-1 if not yet set) int allarr; // DB_ALL | DB_ARROW float freq[4]; // frequency of A, C, G, T, respectively // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed) int maxlen; // length of maximum read (initially over all DB) int64 totlen; // total # of bases (initially over all DB) int nreads; // # of reads in actively loaded portion of DB int trimmed; // DB has been trimmed by cutoff/all int part; // DB block (if > 0), total DB (if == 0) int ufirst; // Index of first read in block (without trimming) int tfirst; // Index of first read in block (with trimming) // In order to avoid forcing users to have to rebuild all thier DBs to accommodate // the addition of fields for the size of the actively loaded trimmed and untrimmed // blocks, an additional read record is allocated in "reads" when a DB is loaded into // memory (reads[-1]) and the two desired fields are crammed into the first two // integer spaces of the record. char *path; // Root name of DB for .bps, .qvs, and tracks int loaded; // Are reads loaded in memory? void *bases; // file pointer for bases file (to fetch reads from), // or memory pointer to uncompressed block of all sequences. 
DAZZ_READ *reads; // Array [-1..nreads] of DAZZ_READ DAZZ_TRACK *tracks; // Linked list of loaded tracks } DAZZ_DB; /******************************************************************************************* * * DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock * ********************************************************************************************/ #define MAX_NAME 10000 // Longest file name or fasta header line #define DB_NFILE "files = %9d\n" // number of files #define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name #define DB_NBLOCK "blocks = %9d\n" // number of blocks #define DB_PARAMS "size = %10lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well #define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed) /******************************************************************************************* * * DB ROUTINES * ********************************************************************************************/ // Suppose DB is the name of an original database. Then there will be files .DB.idx, .DB.bps, // .DB.qvs, and files .DB..anno and DB..data where is a track name // (not containing a . !). // A DAM is basically a DB except that: // 1. there are no QV's, instead .coff points the '\0' terminated fasta header of the read // in the file ..hdr file // 2. .origin contains the contig # of the read within a fasta entry (assembly sequences // contain N-separated contigs), and .fpulse the first base of the contig in the // fasta entry // Open the given database or dam, "path" into the supplied DAZZ_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. 
// Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char *path, DAZZ_DB *db); // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(DAZZ_DB *db); // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). void Close_DB(DAZZ_DB *db); // Return the size in bytes of the given DB int64 sizeof_DB(DAZZ_DB *db); // If QV pseudo track is not already in db's track list, then load it and set it up. // The database must not have been trimmed yet. -1 is returned if a .qvs file is not // present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE // is defined. Otherwise a 0 is returned. int Load_QVs(DAZZ_DB *db); // Remove the QV pseudo track, all space associated with it, and close the .qvs file. void Close_QVs(DAZZ_DB *db); // Look up the file and header in the file of the indicated track. Return: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track // In addition, if opened (0 or 1 returned), then kind points at an integer indicating // the type of track as follows: // CUSTOM 0 => a custom track // MASK 1 => a mask track #define CUSTOM_TRACK 0 #define MASK_TRACK 1 int Check_Track(DAZZ_DB *db, char *track, int *kind); // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created DAZZ_TRACK record. 
If the track does not exist or cannot be
  //     opened for some reason, then NULL is returned if INTERACTIVE is defined.  Otherwise
  //     the routine prints an error message to stderr and exits if an error occurs, and returns
  //     with NULL only if the track does not exist.

DAZZ_TRACK *Load_Track(DAZZ_DB *db, char *track);

  // Assuming the file pointer for afile is correctly positioned at the start of an extra item,
  //   and aname is the name of the .anno file, decode the value present and place it in
  //   extra if extra->nelem == 0, otherwise reduce the value just read into extra
  //   according to the directive given by 'accum'.  Leave the read pointer at the next
  //   extra or end-of-file.
  //   Returns:
  //      1 if at the end of file,
  //      0 if item was read and folded correctly,
  //     -1 if there was a system IO or allocation error (if interactive), and
  //     -2 if the new value could not be reduced into the current value of extra (interactive)

int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra);

  //  Write extra record to end of file afile and advance the write pointer.
  //  If interactive, then return non-zero on error; if batch, then print
  //  and halt on an error.

int Write_Extra(FILE *afile, DAZZ_EXTRA *extra);

  // If track is on the db's track list, then it is removed and all storage associated with it
  //   is freed.

void Close_Track(DAZZ_DB *db, char *track);

  // Allocate and return a buffer big enough for the largest read in 'db'.
  // **NB** free(x-1) if x is the value returned, as a *prefix* and suffix '\0'(4)-byte
  // are needed by the alignment algorithms.  If memory cannot be allocated then return NULL
  // if INTERACTIVE is defined, or print error to stderr and exit otherwise.

char *New_Read_Buffer(DAZZ_DB *db);

  // Load into 'read' the i'th read in 'db'.  As a lower case ascii string if ascii is 1, an
  //   upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T)
  //   otherwise.  A '\0' (or 4) is prepended and appended to the string so it has a delimiter
  //   for traversals in either direction.  A non-zero value is returned if an error occurred
  //   and INTERACTIVE is defined.

int Load_Read(DAZZ_DB *db, int i, char *read, int ascii);

  // Exactly the same as Load_Read, save that the arrow information is loaded, not the DNA
  //   sequence, and there is only a choice between numeric (0) or ascii (1).

int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii);

  // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to
  //   the start of the subinterval (not necessarily = to read !!! ).  As a lower case ascii
  //   string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string
  //   over 0(A), 1(C), 2(G), and 3(T) otherwise.  A '\0' (or 4) is prepended and appended to
  //   the string holding the substring so it has a delimiter for traversals in either direction.
  //   A NULL pointer is returned if an error occurred and INTERACTIVE is defined.

char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii);

  // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur
  //   in the database.  If memory cannot be allocated then return NULL if INTERACTIVE is defined,
  //   or print error to stderr and exit otherwise.

#define DEL_QV 0 // The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer
#define DEL_TAG 1 // The deleted characters
#define INS_QV 2 // The insertion QVs
#define SUB_QV 3 // The substitution QVs
#define MRG_QV 4 // The merge QVs

char **New_QV_Buffer(DAZZ_DB *db);

  // Load into 'entry' the 5 QV vectors for i'th read in 'db'.  The deletion tag or characters
  //   are converted to a numeric or upper/lower case ascii string as per ascii.  Return with
  //   a zero, except when an error occurs and INTERACTIVE is defined in which case return with 1.

int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii);

  // Allocate a block big enough for all the uncompressed sequences, read them into it,
  //   reset the 'off' in each read record to be its in-memory offset, and set the
  //   bases pointer to point at the block after closing the bases file.  If ascii is
  //   1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and
  //   otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T).
  //   Return with a zero, except when an error occurs and INTERACTIVE is defined in which
  //   case return with 1.

int Read_All_Sequences(DAZZ_DB *db, int ascii);

  // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all
  //   those of the form "prefix/[.]root.part" and call actor with the complete path to each file
  //   pointed at by path, and the suffix of the path by extension.  The . precedes the root
  //   name if the defined constant HIDE_FILES is set.  Always the first call is with the
  //   path "prefix/root.[db|dam]" and extension "db" or "dam".  There will always be calls for
  //   "prefix/[.]root.idx" and "prefix/[.]root.bps".  All other calls are for *tracks* and
  //   so this routine gives one a way to know all the tracks associated with a given DB.
  //   -1 is returned if the path could not be found, and 1 is returned if an error (reported
  //   to EPLACE) occurred and INTERACTIVE is defined.  Otherwise a 0 is returned.

int List_DB_Files(char *path, void actor(char *path, char *extension));

#endif // _DAZZ_DB
DALIGNER-master/HPC.daligner.c000066400000000000000000001240421322465224500161060ustar00rootroot00000000000000/*********************************************************************************************\
 *
 *  Produce a script to compute overlaps for all block pairs of a DB, and then sort and merge
 *  them into as many .las files as there are blocks.
* * Author: Gene Myers * Date : June 1, 2014 * *********************************************************************************************/ #include #include #include #include #include #include #include #include #include #include "DB.h" #include "filter.h" #undef LSF // define if want a directly executable LSF script static char *Usage[] = { "[-vbad] [-t] [-w] [-l] [-s]", " [-M] [-B] [-D] [-T] [-f]", " ( [-k] [-h] [-e] [-H]", " [-k] [-h] [-e] )", " [-m]+ [[-]]" }; // Command Options static int DUNIT, BUNIT; static int VON, BON, CON, DON; static int WINT, TINT, HGAP, HINT, KINT, SINT, LINT, MINT; static int NTHREADS; static double EREL; static int MMAX, MTOP; static char **MASK; static char *ONAME; static char *PDIR; #define LSF_ALIGN "bsub -q medium -n 4 -o DALIGNER.out -e DALIGNER.err -R span[hosts=1] -J align#%d" #define LSF_MERGE \ "bsub -q short -n 12 -o MERGE%d.DAL.out -e MERGE%d.DAL.err -R span[hosts=1] -J merge#%d" #define LSF_CHECK \ "bsub -q short -n 12 -o CHECK%d.DAL.out -e CHECK%d.DAL.err -R span[hosts=1] -J check#%d" void daligner_script(int argc, char *argv[]) { int nblocks; int usepath; int useblock; int fblock, lblock; #ifdef LSF int jobid; #endif FILE *out; char name[100]; char *pwd, *root; // Make sure DB exists and is partitioned, get number of blocks in partition pwd = PathTo(argv[1]); if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0) root = Root(argv[1],".dam"); else root = Root(argv[1],".db"); { int i, nfiles; FILE *dbvis; dbvis = fopen(Catenate(pwd,"/",root,".dam"),"r"); if (dbvis == NULL) { dbvis = Fopen(Catenate(pwd,"/",root,".db"),"r"); if (dbvis == NULL) exit (1); } if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) SYSTEM_READ_ERROR for (i = 0; i < nfiles; i++) { char buffer[30001]; if (fgets(buffer,30000,dbvis) == NULL) SYSTEM_READ_ERROR } useblock = 1; if (fscanf(dbvis,"blocks = %d\n",&nblocks) != 1 || nblocks == 1) { useblock = 0; nblocks = 1; } usepath = (strcmp(pwd,".") != 0); } // Set range fblock-lblock checking that DB..las 
exists & DB..las does not { char *eptr, *fptr; FILE *file; if (argc == 3) { fblock = strtol(argv[2],&eptr,10); if (*eptr != '\0' && *eptr != '-') { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n", Prog_Name,argv[2]); exit (1); } useblock = 1; if (*eptr == '-') { lblock = strtol(eptr+1,&fptr,10); if (*fptr != '\0') { fprintf(stderr,"%s: second part of range '%s' is not an integer\n", Prog_Name,eptr+1); exit (1); } } else lblock = fblock; if (fblock < 1 || lblock > nblocks || fblock > lblock) { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock); exit (1); } } else { fblock = 1; lblock = nblocks; } if (fblock > 1) { file = fopen(Catenate(pwd,"/",root,Numbered_Suffix(".",fblock-1,".las")),"r"); if (file == NULL) { if (usepath) fprintf(stderr,"%s: File %s/%s.%d.las should already be present!\n", Prog_Name,pwd,root,fblock-1); else fprintf(stderr,"%s: File %s.%d.las should already be present!\n", Prog_Name,root,fblock-1); exit (1); } else fclose(file); } if (useblock) file = fopen(Catenate(pwd,"/",root,Numbered_Suffix(".",fblock,".las")),"r"); else file = fopen(Catenate(pwd,"/",root,".las"),"r"); if (file != NULL) { if (usepath) if (useblock) fprintf(stderr,"%s: File %s/%s.%d.las should not yet exist!\n", Prog_Name,pwd,root,fblock); else fprintf(stderr,"%s: File %s/%s.las should not yet exist!\n",Prog_Name,pwd,root); else if (useblock) fprintf(stderr,"%s: File %s.%d.las should not yet exist!\n",Prog_Name,root,fblock); else fprintf(stderr,"%s: File %s.las should not yet exist!\n",Prog_Name,root); exit (1); } DON = (DON && (lblock > 1)); out = stdout; } { int level, njobs; int i, j, k; // Create all work subdirectories if DON if (DON) { if (ONAME != NULL) { sprintf(name,"%s.00.MKDIR",ONAME); out = fopen(name,"w"); } fprintf(out,"# Create work subdirectories\n"); for (i = fblock; i <= lblock; i++) fprintf(out,"mkdir work%d\n",i); if (ONAME != NULL) fclose(out); } // Produce all necessary daligner jobs if (ONAME != 
NULL) { sprintf(name,"%s.01.OVL",ONAME); out = fopen(name,"w"); } njobs = 0; for (i = fblock; i <= lblock; i++) njobs += (i-1)/BUNIT+1; fprintf(out,"# Daligner jobs (%d)\n",njobs); #ifdef LSF jobid = 1; #endif for (i = fblock; i <= lblock; i++) { int bits; int low, hgh; bits = (i-1)/BUNIT+1; low = 1; for (j = 1; j <= bits; j++) { #ifdef LSF fprintf(out,LSF_ALIGN,jobid++); fprintf(out," \""); #endif fprintf(out,"daligner"); if (VON) fprintf(out," -v"); if (BON) fprintf(out," -b"); if (KINT != 14) fprintf(out," -k%d",KINT); if (WINT != 6) fprintf(out," -w%d",WINT); if (HINT != 35) fprintf(out," -h%d",HINT); if (TINT > 0) fprintf(out," -t%d",TINT); if (HGAP > 0) fprintf(out," -H%d",HGAP); if (EREL > 0.) fprintf(out," -e%g",EREL); if (LINT != 1000) fprintf(out," -l%d",LINT); if (SINT != 100) fprintf(out," -s%d",SINT); if (MINT >= 0) fprintf(out," -M%d",MINT); if (PDIR != NULL) fprintf(out," -P%s",PDIR); if (NTHREADS != 4) fprintf(out," -T%d",NTHREADS); for (k = 0; k < MTOP; k++) fprintf(out," -m%s",MASK[k]); if (useblock) if (usepath) fprintf(out," %s/%s.%d",pwd,root,i); else fprintf(out," %s.%d",root,i); else if (usepath) fprintf(out," %s/%s",pwd,root); else fprintf(out," %s",root); hgh = (i*j)/bits + 1; for (k = low; k < hgh; k++) if (useblock) if (usepath) fprintf(out," %s/%s.%d",pwd,root,k); else fprintf(out," %s.%d",root,k); else if (usepath) fprintf(out," %s/%s",pwd,root); else fprintf(out," %s",root); if (lblock == 1) // ==> i = 1, [low,hgh) = [1,2) { fprintf(out," && mv"); if (useblock) fprintf(out," %s.1.%s.1.las",root,root); else fprintf(out," %s.%s.las",root,root); if (usepath) fprintf(out," %s/",pwd); else fprintf(out," "); if (useblock) fprintf(out,"%s.1.las",root); else fprintf(out,"%s.las",root); } else if (DON) { fprintf(out," && mv"); for (k = low; k < hgh; k++) fprintf(out," %s.%d.%s.%d.las",root,i,root,k); fprintf(out," work%d",i); for (k = low; k < hgh; k++) if (k != i) fprintf(out," && mv %s.%d.%s.%d.las work%d",root,k,root,i,k); } #ifdef LSF 
fprintf(out,"\""); #endif fprintf(out,"\n"); low = hgh; } } // Check .las files (optional) if (ONAME != NULL) { fclose(out); sprintf(name,"%s.02.CHECK.OPT",ONAME); out = fopen(name,"w"); } fprintf(out,"# Check initial .las files jobs (%d) (optional but recommended)\n", (fblock-1) * ((lblock-fblock)/(BUNIT+1) + 1) + (lblock-fblock+1) * ((lblock-1)/(BUNIT+1) + 1) ); #ifdef LSF jobid = 1; #endif for (i = 1; i <= lblock; i++) for (j = (i < fblock ? fblock : 1); j <= lblock; ) { k = j+BUNIT; if (k > lblock) k = lblock; #ifdef LSF fprintf(out,LSF_CHECK,0,0,jobid++); fprintf(out," \""); #endif fprintf(out,"LAcheck -vS"); if (usepath) fprintf(out," %s/%s",pwd,root); else fprintf(out," %s",root); while (j <= k) { if (lblock == 1) { if (usepath) if (useblock) fprintf(out," %s/%s.1",pwd,root); else fprintf(out," %s/%s",pwd,root); else if (useblock) fprintf(out," %s.1",root); else fprintf(out," %s",root); } else { if (DON) fprintf(out," work%d/%s.%d.%s.%d",i,root,i,root,j); else fprintf(out," %s.%d.%s.%d",root,i,root,j); } j += 1; } #ifdef LSF fprintf(out,"\""); #endif fprintf(out,"\n"); } if (ONAME != NULL) fclose(out); // Higher level merges (if lblock > 1) if (lblock > 1) { int pow, stage; // Determine the number of merging levels stage = 3; pow = 1; for (level = 0; pow < lblock; level++) pow *= DUNIT; // Issue the commands for each merge level { int p, cnt, dnt; cnt = lblock; dnt = (lblock-fblock)+1; for (i = 1; i <= level; i++) { int bits, dits; int low, hgh; if (ONAME != NULL) { sprintf(name,"%s.%02d.MERGE",ONAME,stage++); out = fopen(name,"w"); } bits = (cnt-1)/DUNIT+1; dits = (dnt-1)/DUNIT+1; // Incremental update merges #ifdef LSF jobid = 1; #endif if (dnt >= 1) { int last; last = (dnt == 1 || i == level); fprintf(out,"# Level %d merge jobs (%d)\n", i,bits*((lblock-fblock)+1) + dits*(fblock-1)); for (j = 1; j < fblock; j++) { #ifdef LSF fprintf(out,LSF_MERGE,i,i,jobid++); fprintf(out," \""); #endif if (last) { if (DON) { if (usepath) fprintf(out,"mv %s/%s.%d.las 
work%d/L%d.%d.0.las && ", pwd,root,j,j,i,j); else fprintf(out,"mv %s.%d.las work%d/L%d.%d.0.las && ",root,j,j,i,j); } else { if (usepath) fprintf(out,"mv %s/%s.%d.las L%d.%d.0.las && ",pwd,root,j,i,j); else fprintf(out,"mv %s.%d.las L%d.%d.0.las && ",root,j,i,j); } } low = 1; for (p = 1; p <= dits; p++) { hgh = (dnt*p)/dits; #ifdef LSF if (p > 1) { fprintf(out,LSF_MERGE,i,i,jobid++); fprintf(out," \""); } #endif fprintf(out,"LAmerge"); if (VON) fprintf(out," -v"); if (CON) fprintf(out," -a"); if (last) if (DON) if (usepath) fprintf(out," %s/%s.%d work%d/L%d.%d.0",pwd,root,j,j,i,j); else fprintf(out," %s.%d work%d/L%d.%d.0",root,j,j,i,j); else if (usepath) fprintf(out," %s/%s.%d L%d.%d.0",pwd,root,j,i,j); else fprintf(out," %s.%d L%d.%d.0",root,j,i,j); else if (DON) fprintf(out," work%d/L%d.%d.%d",j,i+1,j,p); else fprintf(out," L%d.%d.%d",i+1,j,p); for (k = low; k <= hgh; k++) if (i == 1) if (DON) fprintf(out," work%d/%s.%d.%s.%d",j,root,j,root,k+(fblock-1)); else fprintf(out," %s.%d.%s.%d",root,j,root,k+(fblock-1)); else if (DON) fprintf(out," work%d/L%d.%d.%d",j,i,j,k); else fprintf(out," L%d.%d.%d",i,j,k); #ifdef LSF fprintf(out,"\""); #endif fprintf(out,"\n"); low = hgh+1; } } } else fprintf(out,"# Level %d merge jobs (%d)\n",i,bits*((lblock-fblock)+1)); // New block merges for (j = fblock; j <= lblock; j++) { low = 1; for (p = 1; p <= bits; p++) { hgh = (cnt*p)/bits; #ifdef LSF fprintf(out,LSF_MERGE,i,i,jobid++); fprintf(out," \""); #endif fprintf(out,"LAmerge"); if (VON) fprintf(out," -v"); if (CON) fprintf(out," -a"); if (i == level) if (usepath) fprintf(out," %s/%s.%d",pwd,root,j); else fprintf(out," %s.%d",root,j); else if (DON) fprintf(out," work%d/L%d.%d.%d",j,i+1,j,p); else fprintf(out," L%d.%d.%d",i+1,j,p); for (k = low; k <= hgh; k++) if (i == 1) if (DON) fprintf(out," work%d/%s.%d.%s.%d",j,root,j,root,k); else fprintf(out," %s.%d.%s.%d",root,j,root,k); else if (DON) fprintf(out," work%d/L%d.%d.%d",j,i,j,k); else fprintf(out," L%d.%d.%d",i,j,k); #ifdef 
LSF fprintf(out,"\""); #endif fprintf(out,"\n"); low = hgh+1; } } // Check new .las (optional) if (ONAME != NULL) { fclose(out); sprintf(name,"%s.%02d.CHECK.OPT",ONAME,stage++); out = fopen(name,"w"); } fprintf(out,"# Check level %d .las files jobs (%d) (optional but recommended)\n", i+1,(fblock-1)*((dits-1)/(BUNIT+1)+1) + (lblock-fblock+1)*((bits-1)/(BUNIT+1)+1) ); #ifdef LSF jobid = 1; #endif if (dnt >= 1) { int last; last = (dnt == 1 || i == level); for (j = 1; j < fblock; j++) for (p = 1; p <= dits;) { k = p+BUNIT; if (k > dits) k = dits; #ifdef LSF fprintf(out,LSF_CHECK,i,i,jobid++); fprintf(out," \""); #endif fprintf(out,"LAcheck -vS"); if (usepath) fprintf(out," %s/%s",pwd,root); else fprintf(out," %s",root); while (p <= k) { if (last) if (usepath) fprintf(out," %s/%s.%d",pwd,root,j); else fprintf(out," %s.%d",root,j); else if (DON) fprintf(out," work%d/L%d.%d.%d",j,i+1,j,p); else fprintf(out," L%d.%d.%d",i+1,j,p); p += 1; } #ifdef LSF fprintf(out,"\""); #endif fprintf(out,"\n"); } } for (j = fblock; j <= lblock; j++) for (p = 1; p <= bits;) { k = p+BUNIT; if (k > bits) k = bits; #ifdef LSF fprintf(out,LSF_CHECK,i,i,jobid++); fprintf(out," \""); #endif fprintf(out,"LAcheck -vS"); if (usepath) fprintf(out," %s/%s",pwd,root); else fprintf(out," %s",root); while (p <= k) { if (i == level) if (usepath) fprintf(out," %s/%s.%d",pwd,root,j); else fprintf(out," %s.%d",root,j); else if (DON) fprintf(out," work%d/L%d.%d.%d",j,i+1,j,p); else fprintf(out," L%d.%d.%d",i+1,j,p); p += 1; } #ifdef LSF fprintf(out,"\""); #endif fprintf(out,"\n"); } // Cleanup (optional) if (ONAME != NULL) { fclose(out); if (i == 1) sprintf(name,"%s.%02d.RM.OPT",ONAME,stage++); else sprintf(name,"%s.%02d.RM",ONAME,stage++); out = fopen(name,"w"); } if (i == 1) fprintf(out,"# Remove level %d .las files (optional)\n",i); else fprintf(out,"# Remove level %d .las files\n",i); if (dnt >= 1) { int last; last = (dnt == 1 || i == level); for (j = 1; j < fblock; j++) { low = 1; for (p = 1; p <= dits; 
p++) { hgh = (dnt*p)/dits; if (DON) fprintf(out,"cd work%d; ",j); fprintf(out,"rm"); if (last) fprintf(out," L%d.%d.0.las",i,j); for (k = low; k <= hgh; k++) if (i == 1) fprintf(out," %s.%d.%s.%d.las",root,j,root,k+(fblock-1)); else fprintf(out," L%d.%d.%d.las",i,j,k); if (DON) fprintf(out,"; cd .."); fprintf(out,"\n"); low = hgh+1; } } } for (j = fblock; j <= lblock; j++) { low = 1; for (p = 1; p <= bits; p++) { hgh = (cnt*p)/bits; if (DON) fprintf(out,"cd work%d; ",j); fprintf(out,"rm"); for (k = low; k <= hgh; k++) if (i == 1) fprintf(out," %s.%d.%s.%d.las",root,j,root,k); else fprintf(out," L%d.%d.%d.las",i,j,k); if (DON) fprintf(out,"; cd .."); fprintf(out,"\n"); low = hgh+1; } } if (ONAME != NULL) fclose(out); if (dnt >= 1) { if (dnt > 1) dnt = dits; else dnt = 0; } cnt = bits; } } } } free(root); free(pwd); } /*********************************************************************************************\ * * Produce a script to compute overlaps for all block pairs between two DBs, and then sort * and merge them into as many .las files as their are blocks of the 1st DB. 
* * Author: Gene Myers * Date : December 31, 2014 * *********************************************************************************************/ #define LSF_MALIGN "bsub -q medium -n 4 -o MAPALL.out -e MAPALL.err -R span[hosts=1] -J align#%d" #define LSF_MSORT "bsub -q short -n 12 -o SORT.ALL.out -e SORT.ALL.err -R span[hosts=1] -J sort#%d" #define LSF_MMERGE \ "bsub -q short -n 12 -o MERGE%d.ALL.out -e MERGE%d.ALL.err -R span[hosts=1] -J merge#%d" void mapper_script(int argc, char *argv[]) { int nblocks1, nblocks2; int useblock1, useblock2; int usepath1, usepath2; int fblock, lblock; #ifdef LSF int jobid; #endif FILE *out; char name[100]; char *pwd1, *root1; char *pwd2, *root2; // Make sure DAM and DB exist and the DB is partitioned, get number of blocks in partition pwd1 = PathTo(argv[1]); if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0) root1 = Root(argv[1],".dam"); else root1 = Root(argv[1],".db"); { int i, nfiles; FILE *dbvis; dbvis = fopen(Catenate(pwd1,"/",root1,".dam"),"r"); if (dbvis == NULL) { dbvis = Fopen(Catenate(pwd1,"/",root1,".db"),"r"); if (dbvis == NULL) exit (1); } if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) SYSTEM_READ_ERROR for (i = 0; i < nfiles; i++) { char buffer[30001]; if (fgets(buffer,30000,dbvis) == NULL) SYSTEM_READ_ERROR } useblock1 = 1; if (fscanf(dbvis,"blocks = %d\n",&nblocks1) != 1 || nblocks1 == 1) { useblock1 = 0; nblocks1 = 1; } usepath1 = (strcmp(pwd1,".") != 0); fclose(dbvis); } pwd2 = PathTo(argv[2]); if (strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0) root2 = Root(argv[2],".dam"); else root2 = Root(argv[2],".db"); if (strcmp(root2,root1) == 0 && strcmp(pwd1,pwd2) == 0) { fprintf(stderr,"%s: Comparing the same data base %s/%s against itself, use HPCdaligner\n", Prog_Name,pwd1,root1); exit (1); } { int i, nfiles; FILE *dbvis; dbvis = fopen(Catenate(pwd2,"/",root2,".dam"),"r"); if (dbvis == NULL) { dbvis = Fopen(Catenate(pwd2,"/",root2,".db"),"r"); if (dbvis == NULL) exit (1); } if (fscanf(dbvis,"files = 
%d\n",&nfiles) != 1) SYSTEM_READ_ERROR for (i = 0; i < nfiles; i++) { char buffer[30001]; if (fgets(buffer,30000,dbvis) == NULL) SYSTEM_READ_ERROR } useblock2 = 1; if (fscanf(dbvis,"blocks = %d\n",&nblocks2) != 1 || nblocks2 == 1) { useblock2 = 0; nblocks2 = 1; } usepath2 = (strcmp(pwd2,".") != 0); fclose(dbvis); } // Set range fblock-lblock checking that DB..las exists & DB..las does not { char *eptr, *fptr, *src2; FILE *file; if (argc == 4) { fblock = strtol(argv[3],&eptr,10); if ((*eptr != '\0' && *eptr != '-') || eptr <= argv[3]) { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n", Prog_Name,argv[3]); exit (1); } useblock2 = 1; if (*eptr == '-') { lblock = strtol(eptr+1,&fptr,10); if (*fptr != '\0' || fptr <= eptr+1) { fprintf(stderr,"%s: second part of range '%s' is not an integer\n", Prog_Name,eptr+1); exit (1); } } else lblock = fblock; if (fblock < 1 || lblock > nblocks2 || fblock > lblock) { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock); exit (1); } } else { fblock = 1; lblock = nblocks2; } if (usepath2) src2 = Strdup(Catenate(pwd2,"/",root2,""),"Allocating small string!"); else src2 = Strdup(root2,"Allocating small string!"); if (src2 == NULL) exit (1); if (fblock > 1) { file = fopen(Catenate(src2,".",root1,Numbered_Suffix(".",fblock-1,".las")),"r"); if (file == NULL) { fprintf(stderr,"%s: File %s.%d.%s.las should already be present!\n", Prog_Name,src2,fblock-1,root1); exit (1); } else fclose(file); } if (useblock2) { file = fopen(Catenate(src2,".",root1,Numbered_Suffix(".",fblock,".las")),"r"); if (file != NULL) { fprintf(stderr,"%s: File %s.%d.%s.las should not yet exist!\n", Prog_Name,src2,fblock,root1); exit (1); } } else { file = fopen(Catenate(src2,".",root1,".las"),"r"); if (file != NULL) { fprintf(stderr,"%s: File %s.%s.las should not yet exist!\n", Prog_Name,src2,root1); exit (1); } } free(src2); DON = (DON && (nblocks1 > 1)); out = stdout; } { int level, njobs; int i, j, k; // 
Create all work subdirectories if DON if (DON) { if (ONAME != NULL) { sprintf(name,"%s.00.MKDIR",ONAME); out = fopen(name,"w"); } fprintf(out,"# Create work subdirectories\n"); for (i = fblock; i <= lblock; i++) fprintf(out,"mkdir work%d\n",i); if (ONAME != NULL) fclose(out); } // Produce all necessary daligner jobs ... if (ONAME != NULL) { sprintf(name,"%s.01.CMP",ONAME); out = fopen(name,"w"); } njobs = nblocks1 * ( (lblock-fblock)/BUNIT + 1); fprintf(out,"# Daligner jobs (%d)\n",njobs); #ifdef LSF jobid = 1; #endif for (i = fblock; i <= lblock; i++) { int bits; int low, hgh; bits = (nblocks1-1)/BUNIT+1; low = 1; for (j = 1; j <= bits; j++) { #ifdef LSF fprintf(out,LSF_MALIGN,jobid++); fprintf(out," \""); #endif fprintf(out,"daligner -A"); if (VON) fprintf(out," -v"); if (BON) fprintf(out," -b"); fprintf(out," -k%d",KINT); if (WINT != 6) fprintf(out," -w%d",WINT); fprintf(out," -h%d",HINT); if (TINT > 0) fprintf(out," -t%d",TINT); if (EREL > 0.) fprintf(out," -e%g",EREL); else fprintf(out," -e.85"); if (LINT != 1000) fprintf(out," -l%d",LINT); if (SINT != 100) fprintf(out," -s%d",SINT); if (NTHREADS != 4) fprintf(out," -T%d",NTHREADS); if (MINT >= 0) fprintf(out," -M%d",MINT); if (PDIR != NULL) fprintf(out," -P%s",PDIR); for (k = 0; k < MTOP; k++) fprintf(out," -m%s",MASK[k]); fprintf(out," "); if (usepath2) fprintf(out,"%s/",pwd2); fprintf(out,"%s",root2); if (useblock2) fprintf(out,".%d",i); hgh = 1 + (nblocks1*j)/bits; for (k = low; k < hgh; k++) { fprintf(out," "); if (usepath1) fprintf(out,"%s/",pwd1); fprintf(out,"%s",root1); if (useblock1) fprintf(out,".%d",k); } if (nblocks1 == 1) { if (useblock1 || usepath2) { fprintf(out," && mv %s",root2); if (useblock2) fprintf(out,".%d.las",i); if (useblock1) fprintf(out,".%s.1.las ",root1); else fprintf(out,".%s.las ",root1); if (useblock1) { if (usepath2) fprintf(out,"%s/",pwd2); fprintf(out,"%s",root2); if (useblock2) fprintf(out,".%d",i); fprintf(out,".%s.las",root1); } else fprintf(out,"%s",pwd2); } } else if 
(DON) { fprintf(out," && mv"); for (k = low; k < hgh; k++) { fprintf(out," %s",root2); if (useblock2) fprintf(out,".%d",i); fprintf(out,".%s.%d.las",root1,k); } fprintf(out," work%d",i); } #ifdef LSF fprintf(out,"\""); #endif fprintf(out,"\n"); low = hgh; } } // Check .las files (optional) if (ONAME != NULL) { fclose(out); sprintf(name,"%s.02.CHECK.OPT",ONAME); out = fopen(name,"w"); } fprintf(out,"# Check initial .las files jobs (%d) (optional but recommended)\n", (lblock-fblock+1) * ((nblocks1-1)/(BUNIT+1) + 1) ); #ifdef LSF jobid = 1; #endif for (j = fblock; j <= lblock; j++) for (i = 1; i <= nblocks1; ) { k = i+BUNIT; if (k > nblocks1) k = nblocks1; #ifdef LSF fprintf(out,LSF_CHECK,0,0,jobid++); fprintf(out," \""); #endif fprintf(out,"LAcheck -vS"); if (usepath2) fprintf(out," %s/%s",pwd2,root2); else fprintf(out," %s",root2); if (usepath1) fprintf(out," %s/%s",pwd1,root1); else fprintf(out," %s",root1); while (i <= k) { fprintf(out," "); if (nblocks1 == 1) { if (usepath2) fprintf(out,"%s/",pwd2); fprintf(out,"%s",root2); if (useblock2) fprintf(out,".%d",j); fprintf(out,".%s",root1); } else { if (DON) fprintf(out,"work%d/",j); fprintf(out,"%s",root2); if (useblock2) fprintf(out,".%d",j); fprintf(out,".%s.%d",root1,i); } i += 1; } #ifdef LSF fprintf(out,"\""); #endif fprintf(out,"\n"); } if (ONAME != NULL) fclose(out); // Higher level merges (if lblock > 1) if (nblocks1 > 1) { int pow, stage; // Determine the number of merging levels stage = 3; pow = 1; for (level = 0; pow < nblocks1; level++) pow *= DUNIT; // Issue the commands for each merge level { int p, cnt; cnt = nblocks1; for (i = 1; i <= level; i++) { int bits; int low, hgh; if (ONAME != NULL) { sprintf(name,"%s.%02d.MERGE",ONAME,stage++); out = fopen(name,"w"); } bits = (cnt-1)/DUNIT+1; fprintf(out,"# Level %d jobs (%d)\n",i,bits*((lblock-fblock)+1)); // Block merges #ifdef LSF jobid = 1; #endif for (j = fblock; j <= lblock; j++) { low = 1; for (p = 1; p <= bits; p++) { hgh = (cnt*p)/bits; #ifdef LSF 
fprintf(out,LSF_MMERGE,i,i,jobid++); fprintf(out," \""); #endif fprintf(out,"LAmerge "); if (VON) fprintf(out,"-v "); if (CON) fprintf(out,"-a "); if (i == level) { if (usepath2) fprintf(out,"%s/",pwd2); fprintf(out,"%s",root2); if (useblock2) fprintf(out,".%d",j); fprintf(out,".%s",root1); } else { if (DON) fprintf(out,"work%d/",j); fprintf(out,"L%d.%d.%d",i+1,j,p); } for (k = low; k <= hgh; k++) if (i == 1) { if (DON) fprintf(out," work%d/",j); else fprintf(out," "); fprintf(out,"%s",root2); if (useblock2) fprintf(out,".%d",j); fprintf(out,".%s.%d",root1,k); } else if (DON) fprintf(out," work%d/L%d.%d.%d",j,i,j,k); else fprintf(out," L%d.%d.%d",i,j,k); #ifdef LSF fprintf(out,"\""); #endif fprintf(out,"\n"); low = hgh+1; } } // Check new .las (optional) if (ONAME != NULL) { fclose(out); sprintf(name,"%s.%02d.CHECK.OPT",ONAME,stage++); out = fopen(name,"w"); } fprintf(out,"# Check level %d .las files jobs (%d) (optional but recommended)\n", i+1,(lblock-fblock+1)*((bits-1)/(BUNIT+1)+1) ); #ifdef LSF jobid = 1; #endif for (j = fblock; j <= lblock; j++) for (p = 1; p <= bits; ) { k = p+BUNIT; if (k > bits) k = bits; #ifdef LSF fprintf(out,LSF_CHECK,0,0,jobid++); fprintf(out," \""); #endif fprintf(out,"LAcheck -vS"); if (usepath2) fprintf(out," %s/%s",pwd2,root2); else fprintf(out," %s",root2); if (usepath1) fprintf(out," %s/%s",pwd1,root1); else fprintf(out," %s",root1); while (p <= k) { fprintf(out," "); if (i == level) { if (usepath2) fprintf(out,"%s/",pwd2); fprintf(out,"%s",root2); if (useblock2) fprintf(out,".%d",j); fprintf(out,".%s",root1); } else { if (DON) fprintf(out,"work%d/",j); fprintf(out,"L%d.%d.%d",i+1,j,p); } p += 1; } #ifdef LSF fprintf(out,"\""); #endif fprintf(out,"\n"); } // Cleanup (optional) if (ONAME != NULL) { fclose(out); sprintf(name,"%s.%02d.RM",ONAME,stage++); out = fopen(name,"w"); } fprintf(out,"# Remove level %d .las files\n",i); for (j = fblock; j <= lblock; j++) { low = 1; for (p = 1; p <= bits; p++) { hgh = (cnt*p)/bits; if (DON) 
fprintf(out,"cd work%d; ",j); fprintf(out,"rm"); for (k = low; k <= hgh; k++) if (i == 1) { fprintf(out," %s",root2); if (useblock2) fprintf(out,".%d",j); fprintf(out,".%s.%d.las",root1,k); } else fprintf(out," L%d.%d.%d.las",i,j,k); if (DON) fprintf(out,"; cd .."); fprintf(out,"\n"); low = hgh+1; } } if (ONAME != NULL) fclose(out); cnt = bits; } } } } free(root2); free(pwd2); free(root1); free(pwd1); exit (0); } int main(int argc, char *argv[]) { int i, j, k; int flags[128]; char *eptr; int mapper; // Process options and decide if its a overlap or mapper script ARG_INIT("HPC.daligner") KINT = 0; HINT = 0; HGAP = 0; EREL = 0.; BUNIT = 4; DUNIT = 250; TINT = 0; WINT = 6; LINT = 1000; SINT = 100; MINT = -1; PDIR = NULL; MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); if (MASK == NULL) exit (1); ONAME = NULL; NTHREADS = 4; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vbadAI"); break; case 'e': ARG_REAL(EREL) if (EREL < .7 || EREL >= 1.) { fprintf(stderr,"%s: Average correlation must be in [.7,1.) 
(%g)\n",Prog_Name,EREL); exit (1); } break; case 'f': ONAME = argv[i]+2; break; case 'h': ARG_POSITIVE(HINT,"Hit threshold (in bp.s)") break; case 'k': ARG_POSITIVE(KINT,"K-mer length") if (KINT > 32) { fprintf(stderr,"%s: K-mer length must be 32 or less\n",Prog_Name); exit (1); } break; case 'l': ARG_POSITIVE(LINT,"Minimum ovlerap length") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; case 's': ARG_POSITIVE(SINT,"Trace spacing") break; case 't': ARG_POSITIVE(TINT,"Tuple suppression frequency") break; case 'w': ARG_POSITIVE(WINT,"Log of bin width") break; case 'B': ARG_NON_NEGATIVE(BUNIT,"Blocks per command") break; case 'D': ARG_NON_NEGATIVE(DUNIT,"File per merge") if (DUNIT < 3) { fprintf(stderr,"%s: Files per merge must be at least 3 (%d)\n", Prog_Name,DUNIT); exit (1); } break; case 'H': ARG_POSITIVE(HGAP,"HGAP threshold (in bp.s)") break; case 'M': ARG_NON_NEGATIVE(MINT,"Memory allocation (in Gb)") break; case 'P': PDIR = argv[i]+2; break; case 'T': ARG_POSITIVE(NTHREADS,"Number of threads") break; } else argv[j++] = argv[i]; argc = j; VON = flags['v']; BON = flags['b']; CON = flags['a']; DON = flags['d']; if (argc < 2 || argc > 4) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[3]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[4]); exit (1); } if (argc == 2) mapper = 0; else if (argc == 4) mapper = 1; else { (void) strtol(argv[2],&eptr,10); if ((*eptr == '\0' || *eptr == '-') && eptr > argv[2]) mapper = 0; else mapper = 1; } if (mapper) { if (HGAP > 0) { fprintf(stderr,"%s: Cannot use -H option in a comparison script\n",Prog_Name); exit (1); } if (KINT <= 0) KINT = 20; if (HINT <= 0) HINT = 50; 
if (EREL <= 0.) EREL = .85; } else { if (KINT <= 0) KINT = 14; if (HINT <= 0) HINT = 35; } for (j = 1; 2*j <= NTHREADS; j *= 2) ; NTHREADS = j; if (mapper) mapper_script(argc,argv); else daligner_script(argc,argv); exit (0); } DALIGNER-master/LAcat.c000066400000000000000000000120701322465224500146710ustar00rootroot00000000000000/******************************************************************************************* * * Merge together in index order, overlap files .1.las, .2.las, ... into a * single overlap file and output to the standard output * * Author: Gene Myers * Date : July 2013 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-v] > .las"; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { char *iblock, *oblock; FILE *input; int64 novl, bsize, ovlsize, ptrsize; int tspace, tbytes; char *pwd, *root, *root2; int VERBOSE; // Process options { int i, j, k; int flags[128]; ARG_INIT("LAcat") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("v") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; oblock = (char *) Malloc(bsize,"Allocating output block"); iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); if (oblock == NULL || iblock == NULL) exit (1); iblock += ptrsize; pwd = PathTo(argv[1]); root = Root(argv[1],".las"); root2 = index(root,'#'); if (root2 == NULL) { fprintf(stderr,"%s: No #-sign in source name '%s'\n",Prog_Name,root); exit (1); } if (index(root2+1,'#') != NULL) { fprintf(stderr,"%s: Two or more occurences of #-sign in source name '%s'\n",Prog_Name,root); exit (1); } *root2++ = '\0'; { int64 povl; int i, mspace; novl = 0; 
tspace = 0; mspace = 0; tbytes = 0; for (i = 0; 1; i++) { char *name = Catenate(pwd,"/",Numbered_Suffix(root,i+1,root2),".las"); if ((input = fopen(name,"r")) == NULL) break; if (fread(&povl,sizeof(int64),1,input) != 1) SYSTEM_READ_ERROR novl += povl; if (fread(&mspace,sizeof(int),1,input) != 1) SYSTEM_READ_ERROR if (i == 0) { tspace = mspace; if (tspace <= TRACE_XOVR && tspace != 0) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); } else if (tspace != mspace) { fprintf(stderr,"%s: PT-point spacing conflict (%d vs %d)\n",Prog_Name,tspace,mspace); exit (1); } fclose(input); } if (fwrite(&novl,sizeof(int64),1,stdout) != 1) SYSTEM_READ_ERROR if (fwrite(&tspace,sizeof(int),1,stdout) != 1) SYSTEM_READ_ERROR } { int i, j; Overlap *w; int64 tsize, povl; int mspace; char *iptr, *itop; char *optr, *otop; optr = oblock; otop = oblock + bsize; for (i = 0; 1; i++) { char *name = Catenate(pwd,"/",Numbered_Suffix(root,i+1,root2),".las"); if ((input = fopen(name,"r")) == NULL) break; if (fread(&povl,sizeof(int64),1,input) != 1) SYSTEM_READ_ERROR if (fread(&mspace,sizeof(int),1,input) != 1) SYSTEM_READ_ERROR if (VERBOSE) fprintf(stderr," Concatenating %s: %lld la\'s\n",Numbered_Suffix(root,i+1,root2),povl); iptr = iblock; itop = iblock + fread(iblock,1,bsize,input); for (j = 0; j < povl; j++) { if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memmove(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } w = (Overlap *) (iptr - ptrsize); tsize = w->path.tlen*tbytes; if (optr + ovlsize + tsize > otop) { if (fwrite(oblock,1,optr-oblock,stdout) != (size_t) (optr-oblock)) SYSTEM_READ_ERROR optr = oblock; } memmove(optr,iptr,ovlsize); optr += ovlsize; iptr += ovlsize; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memmove(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } memmove(optr,iptr,tsize); optr += tsize; iptr += tsize; 
} fclose(input); } if (optr > oblock) { if (fwrite(oblock,1,optr-oblock,stdout) != (size_t) (optr-oblock)) SYSTEM_READ_ERROR } } if (VERBOSE) fprintf(stderr," Totalling %lld la\'s\n",novl); free(pwd); free(root); free(oblock); free(iblock-ptrsize); exit (0); } DALIGNER-master/LAcheck.c000066400000000000000000000256241322465224500152100ustar00rootroot00000000000000/******************************************************************************************* * * Check the structural integrity of .las files * * Author: Gene Myers * Date : July 2014 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-vS] [ ] ..."; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { DAZZ_DB _db1, *db1 = &_db1; DAZZ_DB _db2, *db2 = &_db2; int VERBOSE; int SORTED; int ISTWO; int status; // Process options { int i, j, k; int flags[128]; ARG_INIT("LAcheck") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vS") break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; SORTED = flags['S']; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open trimmed DB { int status; char *pwd, *root; FILE *input; ISTWO = 0; status = Open_DB(argv[1],db1); if (status < 0) exit (1); if (db1->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } pwd = PathTo(argv[2]); root = Root(argv[2],".las"); if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) == NULL) { ISTWO = 1; if (argc <= 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } status = Open_DB(argv[2],db2); if (status < 0) exit (1); if (db2->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); exit (1); } Trim_DB(db2); } else { fclose(input); db2 = db1; } 
Trim_DB(db1); free(root); free(pwd); } { char *iblock; int64 bsize, ovlsize, ptrsize; int i, j; DAZZ_READ *reads1 = db1->reads; int nreads1 = db1->nreads; DAZZ_READ *reads2 = db2->reads; int nreads2 = db2->nreads; // Setup IO buffers ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; iblock = (char *) Malloc(bsize+ptrsize,"Allocating input block"); if (iblock == NULL) exit (1); iblock += ptrsize; // For each file do status = 0; for (i = 2+ISTWO; i < argc; i++) { char *pwd, *root; FILE *input; char *iptr, *itop; Overlap last, prev; int64 novl; int tspace, tbytes; int has_chains; // Establish IO and (novl,tspace) header pwd = PathTo(argv[i]); root = Root(argv[i],".las"); if ((input = Fopen(Catenate(pwd,"/",root,".las"),"r")) == NULL) goto error; if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_READ_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_READ_ERROR if (novl < 0) { if (VERBOSE) fprintf(stderr," %s: Number of alignments < 0\n",root); goto error; } if (tspace < 0) { if (VERBOSE) fprintf(stderr," %s: Trace spacing < 0\n",root); goto error; } if (tspace <= TRACE_XOVR && tspace != 0) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); iptr = iblock; itop = iblock + fread(iblock,1,bsize,input); // For each record in file do has_chains = 0; last.aread = -1; last.bread = -1; last.flags = 0; last.path.bbpos = last.path.abpos = 0; last.path.bepos = last.path.aepos = 0; prev = last; for (j = 0; j < novl; j++) { Overlap ovl; int tsize; int equal; // Fetch next record if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memmove(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); if (iptr + ovlsize > itop) { if (VERBOSE) fprintf(stderr," %s: Too few alignment records\n",root); goto error; } } ovl = *((Overlap *) (iptr - ptrsize)); iptr += ovlsize; tsize = ovl.path.tlen*tbytes; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) 
memmove(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); if (iptr + tsize > itop) { if (VERBOSE) fprintf(stderr," %s: Too few alignment records\n",root); goto error; } } ovl.path.trace = iptr; iptr += tsize; // Basic checks if (ovl.aread < 0 || ovl.bread < 0) { if (VERBOSE) fprintf(stderr," %s: Read indices < 0\n",root); goto error; } if (ovl.aread >= nreads1 || ovl.bread >= nreads2) { if (VERBOSE) fprintf(stderr," %s: Read indices out of range\n",root); goto error; } if (ovl.path.abpos >= ovl.path.aepos || ovl.path.aepos > reads1[ovl.aread].rlen || ovl.path.bbpos >= ovl.path.bepos || ovl.path.bepos > reads2[ovl.bread].rlen || ovl.path.abpos < 0 || ovl.path.bbpos < 0 ) { if (VERBOSE) fprintf(stderr," %s: Non-sense alignment intervals\n",root); goto error; } if (ovl.path.diffs < 0 || ovl.path.diffs > reads1[ovl.aread].rlen || ovl.path.diffs > reads2[ovl.bread].rlen) { if (VERBOSE) fprintf(stderr," %s: Non-sense number of differences\n",root); goto error; } if (Check_Trace_Points(&ovl,tspace,VERBOSE,root)) goto error; if (j == 0) has_chains = ((ovl.flags & (START_FLAG | NEXT_FLAG | BEST_FLAG)) != 0); if (has_chains) { if ((ovl.flags & (START_FLAG | NEXT_FLAG)) == 0) { if (VERBOSE) fprintf(stderr," %s: LA has both start & next flag set\n",root); goto error; } if (BEST_CHAIN(ovl.flags) && CHAIN_NEXT(ovl.flags)) { if (VERBOSE) fprintf(stderr," %s: LA has both best & next flag set\n",root); goto error; } } else { if ((ovl.flags & (START_FLAG | NEXT_FLAG | BEST_FLAG)) != 0) { if (VERBOSE) fprintf(stderr," %s: LAs should not have chain flags\n",root); goto error; } } // Duplicate check and sort check if -S set equal = 0; if (SORTED) { if (CHAIN_NEXT(ovl.flags) || !has_chains) { if (ovl.aread > last.aread) goto inorder; if (ovl.aread == last.aread) { if (ovl.bread > last.bread) goto inorder; if (ovl.bread == last.bread) { if (COMP(ovl.flags) > COMP(last.flags)) goto inorder; if (COMP(ovl.flags) == COMP(last.flags)) { 
if (ovl.path.abpos > last.path.abpos) goto inorder; if (ovl.path.abpos == last.path.abpos) { equal = 1; goto inorder; } } } } if (VERBOSE) { if (CHAIN_NEXT(ovl.flags)) fprintf(stderr," %s: Chain is not valid (%d vs %d)\n", root,ovl.aread+1,ovl.bread+1); else fprintf(stderr," %s: Reads are not sorted (%d vs %d)\n", root,ovl.aread+1,ovl.bread+1); } goto error; } else { if (ovl.aread > prev.aread) goto inorder; if (ovl.aread == prev.aread) { if (ovl.path.abpos > prev.path.abpos) goto inorder; if (ovl.path.abpos == prev.path.abpos) goto dupcheck; } if (VERBOSE) fprintf(stderr," %s: Chains are not sorted (%d vs %d)\n", root,ovl.aread+1,ovl.bread+1); goto error; } } dupcheck: if (ovl.aread == last.aread && ovl.bread == last.bread && COMP(ovl.flags) == COMP(last.flags) && ovl.path.abpos == last.path.abpos) equal = 1; inorder: if (equal) { if (ovl.path.aepos == last.path.aepos && ovl.path.bbpos == last.path.bbpos && ovl.path.bepos == last.path.bepos) { if (VERBOSE) fprintf(stderr," %s: Duplicate overlap (%d vs %d)\n", root,ovl.aread+1,ovl.bread+1); goto error; } } last = ovl; if (CHAIN_START(ovl.flags)) prev = ovl; } // File processing epilog: Check all data read and print OK if -v if (iptr < itop) { if (VERBOSE) fprintf(stderr," %s: Too many alignment records\n",root); goto error; } if (VERBOSE) { fprintf(stderr," %s: ",root); Print_Number(novl,0,stderr); fprintf(stderr," all OK\n"); } goto cleanup; error: status = 1; cleanup: if (input != NULL) fclose(input); free(pwd); free(root); } free(iblock-ptrsize); } Close_DB(db1); if (ISTWO) Close_DB(db2); exit (status); } DALIGNER-master/LAdump.c000066400000000000000000000314461322465224500150770ustar00rootroot00000000000000/******************************************************************************************* * * Utility for displaying the information in the overlaps of a .las file in a very * simple to parse format. 
* * Author: Gene Myers * Creation: July 2013 * Last Mod: Jan 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-cdtlo] [] [ | ...]"; #define LAST_READ_SYMBOL '$' static int ORDER(const void *l, const void *r) { int x = *((int *) l); int y = *((int *) r); return (x-y); } int main(int argc, char *argv[]) { DAZZ_DB _db1, *db1 = &_db1; DAZZ_DB _db2, *db2 = &_db2; Overlap _ovl, *ovl = &_ovl; FILE *input; int64 novl; int tspace, tbytes, small; int tmax; int reps, *pts; int input_pts; int OVERLAP; int DOCOORDS, DODIFFS, DOTRACE, DOLENS; int ISTWO; // Process options { int i, j, k; int flags[128]; ARG_INIT("LAdump") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("ocdtl") break; } else argv[j++] = argv[i]; argc = j; OVERLAP = flags['o']; DOCOORDS = flags['c']; DODIFFS = flags['d']; DOTRACE = flags['t']; DOLENS = flags['l']; if (DOTRACE) DOCOORDS = 1; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); fprintf(stderr,"\n"); fprintf(stderr," P #a #b #o #c -"); fprintf(stderr," (#a,#b^#o) have an LA between them where #o is 'n' or 'c' and\n"); fprintf(stderr," "); fprintf(stderr," #c is '>' (start of best chain), '+' (start of alternate chain),\n"); fprintf(stderr," "); fprintf(stderr," '-' (continuation of chain), or '.' 
(no chains in file).\n"); fprintf(stderr,"\n"); fprintf(stderr," -c: C #ab #ae #bb #be - #a[#ab,#ae] aligns with #b^#o[#bb,#be]\n"); fprintf(stderr," -d: D # - there are # differences in the LA\n"); fprintf(stderr," -t: T #n -"); fprintf(stderr," there are #n trace point intervals for the LA\n"); fprintf(stderr," (#d #y )^#n -"); fprintf(stderr," there are #d difference aligning the #y bp's of B with the\n"); fprintf(stderr," next fixed-size interval of A\n"); fprintf(stderr," -l: L #la #lb -"); fprintf(stderr," #la is the length of the a-read and #lb that of the b-read\n"); fprintf(stderr,"\n"); fprintf(stderr," -o: Output proper overlaps only\n"); exit (1); } } // Open trimmed DB or DB pair { int status; char *pwd, *root; FILE *input; ISTWO = 0; status = Open_DB(argv[1],db1); if (status < 0) exit (1); if (db1->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } if (argc > 3) { pwd = PathTo(argv[3]); root = Root(argv[3],".las"); if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) { ISTWO = 1; fclose(input); status = Open_DB(argv[2],db2); if (status < 0) exit (1); if (db2->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); exit (1); } Trim_DB(db2); } else db2 = db1; free(root); free(pwd); } else db2 = db1; Trim_DB(db1); } // Process read index arguments into a sorted list of read ranges input_pts = 0; if (argc == ISTWO+4) { if (argv[ISTWO+3][0] != LAST_READ_SYMBOL || argv[ISTWO+3][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[ISTWO+3],&eptr,10); if (eptr > argv[ISTWO+3] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if (input_pts) { int v, x; FILE *input; input = Fopen(argv[ISTWO+3],"r"); if (input == NULL) exit (1); reps = 0; while ((v = fscanf(input," %d",&x)) != EOF) if 
(v == 0) { fprintf(stderr,"%s: %d'th item of input file %s is not an integer\n", Prog_Name,reps+1,argv[2]); exit (1); } else reps += 1; reps *= 2; pts = (int *) Malloc(sizeof(int)*reps,"Allocating read parameters"); if (pts == NULL) exit (1); rewind(input); for (v = 0; v < reps; v += 2) { fscanf(input," %d",&x); pts[v] = pts[v+1] = x; } fclose(input); } else { pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 3+ISTWO) { int c, b, e; char *eptr, *fptr; for (c = 3+ISTWO; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db1->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == '\0') { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = INT32_MAX; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } qsort(pts,reps/2,sizeof(int64),ORDER); b = 0; for (c = 0; c < reps; c += 2) if (b > 0 && pts[b-1] >= pts[c]-1) { if (pts[c+1] > pts[b-1]) pts[b-1] = pts[c+1]; } else { pts[b++] = pts[c]; pts[b++] = pts[c+1]; } pts[b++] = INT32_MAX; reps = b; } else { pts[reps++] = 1; pts[reps++] = INT32_MAX; } } // Initiate file reading and read header { char *over, *pwd, *root; pwd = PathTo(argv[2+ISTWO]); root = Root(argv[2+ISTWO],".las"); over = Catenate(pwd,"/",root,".las"); input = Fopen(over,"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_READ_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_READ_ERROR if (tspace <= TRACE_XOVR && tspace != 0) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = 
sizeof(uint16); } free(pwd); free(root); } // Scan to count sizes of things { int j, al, tlen; int in, npt, idx, ar; int64 novls, odeg, omax, sdeg, smax, ttot; in = 0; npt = pts[0]; idx = 1; // For each record do novls = omax = smax = ttot = tmax = 0; sdeg = odeg = 0; al = 0; for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); tlen = ovl->path.tlen; fseeko(input,tlen*tbytes,SEEK_CUR); // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != db1->reads[ovl->aread].rlen && ovl->path.bepos != db2->reads[ovl->bread].rlen) continue; } if (ar != al) { if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; sdeg = odeg = 0; al = ar; } novls += 1; odeg += 1; sdeg += tlen; ttot += tlen; if (tlen > tmax) tmax = tlen; } if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; printf("+ P %lld\n",novls); printf("%% P %lld\n",omax); if (DOTRACE) { printf("+ T %lld\n",ttot); printf("%% T %lld\n",smax); printf("@ T %d\n",tmax); } } // Read the file and display selected records { int j, k; uint16 *trace; int in, npt, idx, ar; DAZZ_READ *read1, *read2; rewind(input); fread(&novl,sizeof(int64),1,input); fread(&tspace,sizeof(int),1,input); trace = (uint16 *) Malloc(sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); read1 = db1->reads; read2 = db2->reads; in = 0; npt = pts[0]; idx = 1; // For each record do for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); ovl->path.trace = (void *) trace; Read_Trace(input,ovl,tbytes); // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = 
pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != db1->reads[ovl->aread].rlen && ovl->path.bepos != db2->reads[ovl->bread].rlen) continue; } // Display it printf("P %d %d",ovl->aread+1,ovl->bread+1); if (COMP(ovl->flags)) printf(" c"); else printf(" n"); if (CHAIN_NEXT(ovl->flags)) printf(" -"); else if (BEST_CHAIN(ovl->flags)) printf(" >"); else if (CHAIN_START(ovl->flags)) printf(" +"); else printf(" ."); printf("\n"); if (DOLENS) printf("L %d %d\n",read1[ovl->aread].rlen,read2[ovl->bread].rlen); if (DOCOORDS) printf("C %d %d %d %d\n",ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos); if (DODIFFS) printf("D %d\n",ovl->path.diffs); if (DOTRACE) { uint16 *trace = (uint16 *) ovl->path.trace; int tlen = ovl->path.tlen; if (small) Decompress_TraceTo16(ovl); printf("T %d\n",tlen>>1); for (k = 0; k < tlen; k += 2) printf(" %3d %3d\n",trace[k],trace[k+1]); } } free(trace); } Close_DB(db1); if (ISTWO) Close_DB(db2); exit (0); } DALIGNER-master/LAindex.c000066400000000000000000000122061322465224500152320ustar00rootroot00000000000000/******************************************************************************************* * * Create an index with extension .las.idx for a .las file. * Utility expects the .las file to be sorted. * Header contains total # of trace points, max # of trace points for * a given overlap, max # of trace points in all the overlaps for a given aread, and * max # of overlaps for a given aread. The remainder are the offsets into each pile. 
* * Author: Gene Myers * Date : Sept 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-v] ..."; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { char *iblock; FILE *input, *output; int64 novl, bsize, ovlsize, ptrsize; int tspace, tbytes; char *pwd, *root; int64 tmax, ttot; int64 omax, smax; int64 odeg, sdeg; int i; int VERBOSE; // Process options { int j, k; int flags[128]; ARG_INIT("LAindex") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("v") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // For each file do ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); if (iblock == NULL) exit (1); iblock += ptrsize; for (i = 1; i < argc; i++) { pwd = PathTo(argv[i]); root = Root(argv[i],".las"); input = Fopen(Catenate(pwd,"/",root,".las"),"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_READ_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_READ_ERROR if (tspace <= TRACE_XOVR && tspace != 0) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); output = Fopen(Catenate(pwd,"/.",root,".las.idx"),"w"); if (output == NULL) exit (1); free(pwd); free(root); if (VERBOSE) { printf(" Indexing %s: ",root); Print_Number(novl,0,stdout); printf(" records ... 
"); fflush(stdout); } fwrite(&novl,sizeof(int64),1,output); fwrite(&novl,sizeof(int64),1,output); fwrite(&novl,sizeof(int64),1,output); fwrite(&novl,sizeof(int64),1,output); { int j, alst; Overlap *w; int64 tsize; int64 optr; char *iptr, *itop; int64 tlen; optr = sizeof(int64) + sizeof(int32); iptr = iblock; itop = iblock + fread(iblock,1,bsize,input); alst = -1; odeg = sdeg = 0; omax = smax = 0; tmax = ttot = 0; for (j = 0; j < novl; j++) { if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memmove(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } w = (Overlap *) (iptr - ptrsize); tlen = w->path.tlen; if (alst < 0) { fwrite(&optr,sizeof(int64),1,output); alst = w->aread; } else while (alst < w->aread) { if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; fwrite(&optr,sizeof(int64),1,output); odeg = sdeg = 0; alst += 1; } if (tlen > tmax) tmax = tlen; ttot += tlen; odeg += 1; sdeg += tlen; iptr += ovlsize; tsize = tlen*tbytes; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memmove(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } optr += ovlsize + tsize; iptr += tsize; } fwrite(&optr,sizeof(int64),1,output); } if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; rewind(output); fwrite(&omax,sizeof(int64),1,output); fwrite(&ttot,sizeof(int64),1,output); fwrite(&smax,sizeof(int64),1,output); fwrite(&tmax,sizeof(int64),1,output); if (VERBOSE) { Print_Number(ttot,0,stdout); printf(" trace points\n"); fflush(stdout); } fclose(input); fclose(output); } free(iblock-ptrsize); exit (0); } DALIGNER-master/LAmerge.c000066400000000000000000000215711322465224500152270ustar00rootroot00000000000000/******************************************************************************************* * * Given a list of sorted .las files, merge them into a single sorted .las file. 
* * Author: Gene Myers * Date : July 2013 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-va] ..."; #define MEMORY 4000 // in Mb #undef DEBUG // Heap sort of records according to (aread,bread,COMP(flags),abpos) order #define COMPARE(lp,rp) \ if (lp->aread > rp->aread) \ bigger = 1; \ else if (lp->aread < rp->aread) \ bigger = 0; \ else if (lp->bread > rp->bread) \ bigger = 1; \ else if (lp->bread < rp->bread) \ bigger = 0; \ else if (COMP(lp->flags) > COMP(rp->flags)) \ bigger = 1; \ else if (COMP(lp->flags) < COMP(rp->flags)) \ bigger = 0; \ else if (lp->path.abpos > rp->path.abpos) \ bigger = 1; \ else if (lp->path.abpos < rp->path.abpos) \ bigger = 0; \ else if (lp > rp) \ bigger = 1; \ else \ bigger = 0; static void reheap(int s, Overlap **heap, int hsize) { int c, l, r; int bigger; Overlap *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; if (r > hsize) bigger = 1; else { hr = heap[r]; COMPARE(hr,hl) } if (bigger) { COMPARE(hs,hl) if (bigger) { heap[c] = hl; c = l; } else break; } else { COMPARE(hs,hr) if (bigger) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } // Heap sort of records according to (aread,abpos) order #define MAPARE(lp,rp) \ if (lp->aread > rp->aread) \ bigger = 1; \ else if (lp->aread < rp->aread) \ bigger = 0; \ else if (lp->path.abpos > rp->path.abpos) \ bigger = 1; \ else if (lp->path.abpos < rp->path.abpos) \ bigger = 0; \ else if (lp > rp) \ bigger = 1; \ else \ bigger = 0; static void maheap(int s, Overlap **heap, int hsize) { int c, l, r; int bigger; Overlap *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; if (r > hsize) bigger = 1; else { hr = heap[r]; MAPARE(hr,hl) } if (bigger) { MAPARE(hs,hl) if (bigger) { heap[c] = hl; c = l; } else break; } else { MAPARE(hs,hr) if 
(bigger) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } #ifdef DEBUG static void showheap(Overlap **heap, int hsize) { int i; printf("\n"); for (i = 1; i <= hsize; i++) printf(" %3d: %5d, %5d\n",i,heap[i]->aread,heap[i]->bread); } #endif // Input block data structure and block fetcher typedef struct { FILE *stream; char *block; char *ptr; char *top; int64 count; } IO_block; static void ovl_reload(IO_block *in, int64 bsize) { int64 remains; remains = in->top - in->ptr; if (remains > 0) memmove(in->block, in->ptr, remains); in->ptr = in->block; in->top = in->block + remains; in->top += fread(in->top,1,bsize-remains,in->stream); } // The program int main(int argc, char *argv[]) { IO_block *in; int64 bsize, osize, psize; char *block, *oblock; int i, fway; Overlap **heap; int hsize; Overlap *ovls; int64 totl; int tspace, tbytes; FILE *output; char *optr, *otop; int VERBOSE; int MAP_SORT; // Process command line { int j, k; int flags[128]; ARG_INIT("LAmerge") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("va") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; MAP_SORT = flags['a']; if (argc < 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } fway = argc-2; if (fway > 252) { fprintf(stderr,"Exceeded maximum # of inputs and outputs (252) of merge\n"); exit (1); } } // Open all the input files and initialize their buffers psize = sizeof(void *); osize = sizeof(Overlap) - psize; bsize = (MEMORY*1000000ll)/(fway + 1); block = (char *) Malloc(bsize*(fway+1)+psize,"Allocating LAmerge blocks"); in = (IO_block *) Malloc(sizeof(IO_block)*fway,"Allocating LAmerge IO-reacords"); if (block == NULL || in == NULL) exit (1); block += psize; totl = 0; tbytes = 0; tspace = 0; for (i = 0; i < fway; i++) { int64 novl; int mspace; FILE *input; char *pwd, *root; char *iblock; pwd = PathTo(argv[i+2]); root = Root(argv[i+2],".las"); input = Fopen(Catenate(pwd,"/",root,".las"),"r"); if (input == NULL) exit (1); free(pwd); 
free(root); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_READ_ERROR totl += novl; if (fread(&mspace,sizeof(int),1,input) != 1) SYSTEM_READ_ERROR if (i == 0) { tspace = mspace; if (tspace <= TRACE_XOVR && tspace != 0) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); } else if (tspace != mspace) { fprintf(stderr,"%s: PT-point spacing conflict (%d vs %d)\n",Prog_Name,tspace,mspace); exit (1); } in[i].stream = input; in[i].block = iblock = block+i*bsize; in[i].ptr = iblock; in[i].top = iblock + fread(in[i].block,1,bsize,input); in[i].count = 0; } // Open the output file buffer and write (novl,tspace) header { char *pwd, *root; pwd = PathTo(argv[1]); root = Root(argv[1],".las"); output = Fopen(Catenate(pwd,"/",root,".las"),"w"); if (output == NULL) exit (1); free(pwd); free(root); if (fwrite(&totl,sizeof(int64),1,output) != 1) SYSTEM_READ_ERROR if (fwrite(&tspace,sizeof(int),1,output) != 1) SYSTEM_READ_ERROR oblock = block+fway*bsize; optr = oblock; otop = oblock + bsize; } if (VERBOSE) { printf("Merging %d files totalling ",fway); Print_Number(totl,0,stdout); printf(" records\n"); } // Initialize the heap heap = (Overlap **) Malloc(sizeof(Overlap *)*(fway+1),"Allocating heap"); ovls = (Overlap *) Malloc(sizeof(Overlap)*fway,"Allocating heap"); if (heap == NULL || ovls == NULL) exit (1); hsize = 0; for (i = 0; i < fway; i++) { if (in[i].ptr < in[i].top) { ovls[i] = *((Overlap *) (in[i].ptr - psize)); in[i].ptr += osize; hsize += 1; heap[hsize] = ovls + i; } } if (hsize > 3) { if (MAP_SORT) for (i = hsize/2; i > 1; i--) maheap(i,heap,hsize); else for (i = hsize/2; i > 1; i--) reheap(i,heap,hsize); } // While the heap is not empty do while (hsize > 0) { Overlap *ov; IO_block *src; int64 tsize, span; if (MAP_SORT) maheap(1,heap,hsize); else reheap(1,heap,hsize); ov = heap[1]; src = in + (ov - ovls); do { src->count += 1; tsize = ov->path.tlen*tbytes; span = osize + tsize; if (src->ptr + span > src->top) ovl_reload(src,bsize); if (optr + span > otop) { if 
(fwrite(oblock,1,optr-oblock,output) != (size_t) (optr-oblock)) SYSTEM_READ_ERROR optr = oblock; } memmove(optr,((char *) ov) + psize,osize); optr += osize; memmove(optr,src->ptr,tsize); optr += tsize; src->ptr += tsize; if (src->ptr >= src->top) { heap[1] = heap[hsize]; hsize -= 1; break; } *ov = *((Overlap *) (src->ptr - psize)); src->ptr += osize; } while (CHAIN_NEXT(ov->flags)); } // Flush output buffer and wind up if (optr > oblock) { if (fwrite(oblock,1,optr-oblock,output) != (size_t) (optr-oblock)) SYSTEM_READ_ERROR } fclose(output); for (i = 0; i < fway; i++) fclose(in[i].stream); for (i = 0; i < fway; i++) totl -= in[i].count; if (totl != 0) { fprintf(stderr,"%s: Did not write all records to %s (%lld)\n",argv[0],argv[1],totl); exit (1); } free(ovls); free(heap); free(in); free(block-psize); exit (0); } DALIGNER-master/LAshow.c000066400000000000000000000436271322465224500151160ustar00rootroot00000000000000/******************************************************************************************* * * Utility for displaying the overlaps in a .las file in a variety of ways including * a minimal listing of intervals, a cartoon, and a full out alignment. * * Author: Gene Myers * Creation: July 2013 * Last Mod: Jan 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage[] = { "[-caroUF] [-i] [-w] [-b] ", " [ ] [ | ... 
]" }; #define LAST_READ_SYMBOL '$' static int ORDER(const void *l, const void *r) { int x = *((int *) l); int y = *((int *) r); return (x-y); } int main(int argc, char *argv[]) { DAZZ_DB _db1, *db1 = &_db1; DAZZ_DB _db2, *db2 = &_db2; Overlap _ovl, *ovl = &_ovl; Alignment _aln, *aln = &_aln; FILE *input; int sameDB; int64 novl; int tspace, tbytes, small; int reps, *pts; int input_pts; int ALIGN, CARTOON, REFERENCE, OVERLAP; int FLIP, MAP; int INDENT, WIDTH, BORDER, UPPERCASE; int ISTWO; // Process options { int i, j, k; int flags[128]; char *eptr; ARG_INIT("LAshow") INDENT = 4; WIDTH = 100; BORDER = 10; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("caroUFM") break; case 'i': ARG_NON_NEGATIVE(INDENT,"Indent") break; case 'w': ARG_POSITIVE(WIDTH,"Alignment width") break; case 'b': ARG_NON_NEGATIVE(BORDER,"Alignment border") break; } else argv[j++] = argv[i]; argc = j; CARTOON = flags['c']; ALIGN = flags['a']; REFERENCE = flags['r']; OVERLAP = flags['o']; UPPERCASE = flags['U']; FLIP = flags['F']; MAP = flags['M']; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); exit (1); } } // Open trimmed DB or DB pair { int status; char *pwd, *root; FILE *input; struct stat stat1, stat2; ISTWO = 0; status = Open_DB(argv[1],db1); if (status < 0) exit (1); if (db1->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } sameDB = 1; if (argc > 3) { pwd = PathTo(argv[3]); root = Root(argv[3],".las"); if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) { ISTWO = 1; fclose(input); status = Open_DB(argv[2],db2); if (status < 0) exit (1); if (db2->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); exit (1); } stat(Catenate(db1->path,"","",".idx"),&stat1); stat(Catenate(db2->path,"","",".idx"),&stat2); if (stat1.st_ino != stat2.st_ino) sameDB = 0; Trim_DB(db2); } 
else db2 = db1; free(root); free(pwd); } else db2 = db1; Trim_DB(db1); } // Process read index arguments into a sorted list of read ranges input_pts = 0; if (argc == ISTWO+4) { if (argv[ISTWO+3][0] != LAST_READ_SYMBOL || argv[ISTWO+3][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[ISTWO+3],&eptr,10); if (eptr > argv[ISTWO+3] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if (input_pts) { int v, x; FILE *input; input = Fopen(argv[ISTWO+3],"r"); if (input == NULL) exit (1); reps = 0; while ((v = fscanf(input," %d",&x)) != EOF) if (v == 0) { fprintf(stderr,"%s: %d'th item of input file %s is not an integer\n", Prog_Name,reps+1,argv[2]); exit (1); } else reps += 1; reps *= 2; pts = (int *) Malloc(sizeof(int)*reps,"Allocating read parameters"); if (pts == NULL) exit (1); rewind(input); for (v = 0; v < reps; v += 2) { fscanf(input," %d",&x); pts[v] = pts[v+1] = x; } fclose(input); } else { pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 3+ISTWO) { int c, b, e; char *eptr, *fptr; for (c = 3+ISTWO; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db1->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == '\0') { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = INT32_MAX; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } 
qsort(pts,reps/2,sizeof(int64),ORDER); b = 0; for (c = 0; c < reps; c += 2) if (b > 0 && pts[b-1] >= pts[c]-1) { if (pts[c+1] > pts[b-1]) pts[b-1] = pts[c+1]; } else { pts[b++] = pts[c]; pts[b++] = pts[c+1]; } pts[b++] = INT32_MAX; reps = b; } else { pts[reps++] = 1; pts[reps++] = INT32_MAX; } } // Initiate file reading and read (novl, tspace) header { char *over, *pwd, *root; pwd = PathTo(argv[2+ISTWO]); root = Root(argv[2+ISTWO],".las"); over = Catenate(pwd,"/",root,".las"); input = Fopen(over,"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_READ_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_READ_ERROR if (tspace < 0) { fprintf(stderr,"%s: Garbage .las file, trace spacing < 0 !\n",Prog_Name); exit (1); } if (tspace <= TRACE_XOVR && tspace != 0) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = sizeof(uint16); } printf("\n%s: ",root); Print_Number(novl,0,stdout); printf(" records\n"); free(pwd); free(root); } // Read the file and display selected records { int j; uint16 *trace; Work_Data *work; int tmax; int in, npt, idx, ar; int64 tps; char *abuffer, *bbuffer; int ar_wide, br_wide; int ai_wide, bi_wide; int mn_wide, mx_wide; int tp_wide; int blast, match, seen, lhalf, rhalf; aln->path = &(ovl->path); if (ALIGN || REFERENCE) { work = New_Work_Data(); abuffer = New_Read_Buffer(db1); bbuffer = New_Read_Buffer(db2); } else { abuffer = NULL; bbuffer = NULL; work = NULL; } tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); in = 0; npt = pts[0]; idx = 1; ar_wide = Number_Digits((int64) db1->nreads); br_wide = Number_Digits((int64) db2->nreads); ai_wide = Number_Digits((int64) db1->maxlen); bi_wide = Number_Digits((int64) db2->maxlen); if (db1->maxlen < db2->maxlen) { mn_wide = ai_wide; mx_wide = bi_wide; if (tspace > 0) tp_wide = Number_Digits((int64) db1->maxlen/tspace+2); else tp_wide = 0; } else { mn_wide = bi_wide; mx_wide = ai_wide; if 
(tspace > 0) tp_wide = Number_Digits((int64) db2->maxlen/tspace+2); else tp_wide = 0; } ar_wide += (ar_wide-1)/3; br_wide += (br_wide-1)/3; ai_wide += (ai_wide-1)/3; bi_wide += (bi_wide-1)/3; mn_wide += (mn_wide-1)/3; tp_wide += (tp_wide-1)/3; if (FLIP) { int x; x = ar_wide; ar_wide = br_wide; br_wide = x; x = ai_wide; ai_wide = bi_wide; bi_wide = x; } // For each record do blast = -1; match = 0; seen = 0; lhalf = rhalf = 0; for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); if (ovl->path.tlen > tmax) { tmax = ((int) 1.2*ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); } ovl->path.trace = (void *) trace; Read_Trace(input,ovl,tbytes); if (ovl->aread >= db1->nreads) { fprintf(stderr,"%s: A-read is out-of-range of DB %s\n",Prog_Name,argv[1]); exit (1); } if (ovl->bread >= db2->nreads) { fprintf(stderr,"%s: B-read is out-of-range of DB %s\n",Prog_Name,argv[1+ISTWO]); exit (1); } // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ovl->path.tlen/2; if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != aln->alen && ovl->path.bepos != aln->blen) continue; } // If -M option then check the completeness of the implied mapping if (MAP) { while (ovl->bread != blast) { if (!match && seen && !(lhalf && rhalf)) { printf("Missing "); Print_Number((int64) blast+1,br_wide+1,stdout); printf(" %d ->%lld\n",db2->reads[blast].rlen,db2->reads[blast].coff); } match = 0; seen = 0; lhalf = rhalf = 0; blast += 1; } seen = 1; if (ovl->path.abpos == 0) rhalf = 1; if 
(ovl->path.aepos == aln->alen) lhalf = 1; if (ovl->path.bbpos != 0 || ovl->path.bepos != aln->blen) continue; match = 1; } // Display it if (ALIGN || CARTOON || REFERENCE) printf("\n"); if (BEST_CHAIN(ovl->flags)) printf("> "); else if (CHAIN_START(ovl->flags)) printf("+ "); else if (CHAIN_NEXT(ovl->flags)) printf(" -"); if (FLIP) { Flip_Alignment(aln,0); Print_Number((int64) ovl->bread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) ovl->aread+1,br_wide+1,stdout); } else { Print_Number((int64) ovl->aread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) ovl->bread+1,br_wide+1,stdout); } if (COMP(ovl->flags)) printf(" c"); else printf(" n"); if (ovl->path.abpos == 0) printf(" <"); else printf(" ["); Print_Number((int64) ovl->path.abpos,ai_wide,stdout); printf(".."); Print_Number((int64) ovl->path.aepos,ai_wide,stdout); if (ovl->path.aepos == aln->alen) printf("> x "); else printf("] x "); if (ovl->path.bbpos == 0) printf("<"); else printf("["); if (COMP(ovl->flags)) { Print_Number((int64) (aln->blen - ovl->path.bbpos),bi_wide,stdout); printf(".."); Print_Number((int64) (aln->blen - ovl->path.bepos),bi_wide,stdout); } else { Print_Number((int64) ovl->path.bbpos,bi_wide,stdout); printf(".."); Print_Number((int64) ovl->path.bepos,bi_wide,stdout); } if (ovl->path.bepos == aln->blen) printf(">"); else printf("]"); if (!CARTOON) printf(" ~ %5.2f%% ",(200.*ovl->path.diffs) / ((ovl->path.aepos - ovl->path.abpos) + (ovl->path.bepos - ovl->path.bbpos)) ); printf(" ("); if (FLIP) { Print_Number(aln->alen,ai_wide,stdout); printf(" x "); Print_Number(aln->blen,bi_wide,stdout); } else { Print_Number(aln->blen,bi_wide,stdout); printf(" x "); Print_Number(aln->alen,ai_wide,stdout); } printf(" bps,"); if (CARTOON) { Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n\n"); } else { Print_Number((int64) ovl->path.diffs,mn_wide,stdout); printf(" diffs, "); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n"); } if (ALIGN || CARTOON || REFERENCE) { if (ALIGN || 
REFERENCE) { char *aseq, *bseq; int amin, amax; int bmin, bmax; int self; if (FLIP) Flip_Alignment(aln,0); if (small) Decompress_TraceTo16(ovl); self = sameDB && (ovl->aread == ovl->bread) && !COMP(ovl->flags); amin = ovl->path.abpos - BORDER; if (amin < 0) amin = 0; amax = ovl->path.aepos + BORDER; if (amax > aln->alen) amax = aln->alen; if (COMP(aln->flags)) { bmin = (aln->blen-ovl->path.bepos) - BORDER; if (bmin < 0) bmin = 0; bmax = (aln->blen-ovl->path.bbpos) + BORDER; if (bmax > aln->blen) bmax = aln->blen; } else { bmin = ovl->path.bbpos - BORDER; if (bmin < 0) bmin = 0; bmax = ovl->path.bepos + BORDER; if (bmax > aln->blen) bmax = aln->blen; if (self) { if (bmin < amin) amin = bmin; if (bmax > amax) amax = bmax; } } aseq = Load_Subread(db1,ovl->aread,amin,amax,abuffer,0); if (!self) bseq = Load_Subread(db2,ovl->bread,bmin,bmax,bbuffer,0); else bseq = aseq; aln->aseq = aseq - amin; if (COMP(aln->flags)) { Complement_Seq(bseq,bmax-bmin); aln->bseq = bseq - (aln->blen - bmax); } else if (self) aln->bseq = aln->aseq; else aln->bseq = bseq - bmin; if (tspace == 0) Compute_Trace_IRR(aln,work,GREEDIEST); else Compute_Trace_PTS(aln,work,tspace,GREEDIEST); if (FLIP) { if (COMP(aln->flags)) { Complement_Seq(aseq,amax-amin); Complement_Seq(bseq,bmax-bmin); aln->aseq = aseq - (aln->alen - amax); aln->bseq = bseq - bmin; } Flip_Alignment(aln,1); } } if (CARTOON) Alignment_Cartoon(stdout,aln,INDENT,mx_wide); if (REFERENCE) Print_Reference(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); if (ALIGN) Print_Alignment(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); } } free(trace); if (ALIGN) { free(bbuffer-1); free(abuffer-1); Free_Work_Data(work); } } Close_DB(db1); if (ISTWO) Close_DB(db2); exit (0); } DALIGNER-master/LAsort.c000066400000000000000000000153641322465224500151220ustar00rootroot00000000000000/******************************************************************************************* * * Load a file U.las of overlaps into memory, sort them all by 
A,B index, * and then output the result to U.S.las * * Author: Gene Myers * Date : July 2013 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-va] ..."; #define MEMORY 1000 // How many megabytes for output buffer static char *IBLOCK; static int SORT_OVL(const void *x, const void *y) { int64 l = *((int64 *) x); int64 r = *((int64 *) y); Overlap *ol, *or; int al, ar; int bl, br; int cl, cr; int pl, pr; ol = (Overlap *) (IBLOCK+l); or = (Overlap *) (IBLOCK+r); al = ol->aread; ar = or->aread; if (al != ar) return (al-ar); bl = ol->bread; br = or->bread; if (bl != br) return (bl-br); cl = COMP(ol->flags); cr = COMP(or->flags); if (cl != cr) return (cl-cr); pl = ol->path.abpos; pr = or->path.abpos; if (pl != pr) return (pl-pr); if (ol < or) return (-1); else if (ol > or) return (1); else return (0); } static int SORT_MAP(const void *x, const void *y) { int64 l = *((int64 *) x); int64 r = *((int64 *) y); Overlap *ol, *or; int al, ar; int pl, pr; ol = (Overlap *) (IBLOCK+l); or = (Overlap *) (IBLOCK+r); al = ol->aread; ar = or->aread; if (al != ar) return (al-ar); pl = ol->path.abpos; pr = or->path.abpos; if (pl != pr) return (pl-pr); if (ol < or) return (-1); else if (ol > or) return (1); else return (0); } int main(int argc, char *argv[]) { char *iblock, *fblock, *iend; int64 isize, osize; int64 ovlsize, ptrsize; int tspace, tbytes; int i; int VERBOSE; int MAP_ORDER; // Process options { int j, k; int flags[128]; ARG_INIT("LAsort") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("va") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; MAP_ORDER = flags['a']; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // For each file do ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; isize = 0; iblock = NULL; osize = MEMORY * 1000000ll; 
fblock = Malloc(osize,"Allocating LAsort output block"); for (i = 1; i < argc; i++) { int64 *perm; FILE *input, *foutput; int64 novl, sov; // Read in the entire file and output header { int64 size; struct stat info; char *pwd, *root, *name; pwd = PathTo(argv[i]); root = Root(argv[i],".las"); name = Catenate(pwd,"/",root,".las"); input = Fopen(name,"r"); if (input == NULL) exit (1); stat(name,&info); size = info.st_size; if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_READ_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_READ_ERROR if (tspace <= TRACE_XOVR && tspace != 0) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); if (VERBOSE) { printf(" %s: ",root); Print_Number(novl,0,stdout); printf(" records "); Print_Number(size-novl*ovlsize,0,stdout); printf(" trace bytes\n"); fflush(stdout); } foutput = Fopen(Catenate(pwd,"/",root,".S.las"),"w"); if (foutput == NULL) exit (1); if (fwrite(&novl,sizeof(int64),1,foutput) != 1) SYSTEM_READ_ERROR if (fwrite(&tspace,sizeof(int),1,foutput) != 1) SYSTEM_READ_ERROR free(pwd); free(root); if (size > isize) { if (iblock == NULL) iblock = Malloc(size+ptrsize,"Allocating LAsort input block"); else iblock = Realloc(iblock-ptrsize,size+ptrsize,"Allocating LAsort input block"); if (iblock == NULL) exit (1); iblock += ptrsize; isize = size; } size -= (sizeof(int64) + sizeof(int)); if (size > 0) { if (fread(iblock,size,1,input) != 1) SYSTEM_READ_ERROR } fclose(input); iend = iblock + (size - ptrsize); } // Set up unsorted permutation array perm = (int64 *) Malloc(sizeof(int64)*novl,"Allocating LAsort permutation vector"); if (perm == NULL) exit (1); { int64 off; int j; if (CHAIN_START(((Overlap *) (iblock-ptrsize))->flags)) { sov = 0; off = -ptrsize; for (j = 0; j < novl; j++) { if (CHAIN_START(((Overlap *) (iblock+off))->flags)) perm[sov++] = off; off += ovlsize + ((Overlap *) (iblock+off))->path.tlen*tbytes; } } else { off = -ptrsize; for (j = 0; j < novl; j++) { perm[j] = off; off += ovlsize + ((Overlap *) 
(iblock+off))->path.tlen*tbytes; } sov = novl; } } // Sort permutation array of ptrs to records IBLOCK = iblock; if (MAP_ORDER) qsort(perm,sov,sizeof(int64),SORT_MAP); else qsort(perm,sov,sizeof(int64),SORT_OVL); // Output the records in sorted order { int j; Overlap *w; int64 tsize, span; char *fptr, *ftop, *wo; fptr = fblock; ftop = fblock + osize; for (j = 0; j < sov; j++) { w = (Overlap *) (wo = iblock+perm[j]); do { tsize = w->path.tlen*tbytes; span = ovlsize + tsize; if (fptr + span > ftop) { if (fwrite(fblock,1,fptr-fblock,foutput) != (size_t) (fptr-fblock)) SYSTEM_READ_ERROR fptr = fblock; } memmove(fptr,((char *) w)+ptrsize,ovlsize); fptr += ovlsize; memmove(fptr,(char *) (w+1),tsize); fptr += tsize; w = (Overlap *) (wo += span); } while (wo < iend && CHAIN_NEXT(w->flags)); } if (fptr > fblock) { if (fwrite(fblock,1,fptr-fblock,foutput) != (size_t) (fptr-fblock)) SYSTEM_READ_ERROR } } free(perm); fclose(foutput); } if (iblock != NULL) free(iblock - ptrsize); free(fblock); exit (0); } DALIGNER-master/LAsplit.c000066400000000000000000000145441322465224500152650ustar00rootroot00000000000000/******************************************************************************************* * * Split an OVL file arriving from the standard input into 'parts' equal sized .las-files * .1.las, .2.las ... 
or according to a current partitioning of * * Author: Gene Myers * Date : June 2014 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "-v ( | ) < .las"; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { char *iblock, *oblock; FILE *output, *dbvis; int64 novl, bsize, ovlsize, ptrsize; int parts, tspace, tbytes; int olast, blast; char *pwd, *root, *root2; int VERBOSE; // Process options { int i, j, k; int flags[128]; ARG_INIT("LAsplit") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("v") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc != 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } { char *eptr; int nfiles, cutoff, all; int64 size; char buffer[2*MAX_NAME+100]; parts = strtol(argv[2],&eptr,10); if (*eptr != '\0') { pwd = PathTo(argv[2]); if (strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0) root = Root(argv[2],".dam"); else root = Root(argv[2],".db"); dbvis = fopen(Catenate(pwd,"/",root,".dam"),"r"); if (dbvis == NULL) { dbvis = fopen(Catenate(pwd,"/",root,".db"),"r"); if (dbvis == NULL) { fprintf(stderr,"%s: Second argument '%s' is not an integer or a DB\n", Prog_Name,argv[2]); exit (1); } } free(pwd); free(root); if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) SYSTEM_READ_ERROR while (nfiles-- > 0) if (fgets(buffer,2*MAX_NAME+100,dbvis) == NULL) SYSTEM_READ_ERROR parts = 0; if (fscanf(dbvis,DB_NBLOCK,&parts) != 1) { fprintf(stderr,"%s: DB %s has not been partitioned\n",Prog_Name,argv[2]); exit (1); } if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) SYSTEM_READ_ERROR if (fscanf(dbvis,DB_BDATA,&olast,&blast) != 2) SYSTEM_READ_ERROR } else { dbvis = NULL; if (parts <= 0) { fprintf(stderr,"%s: Number of parts is not positive\n",Prog_Name); exit (1); } } } ptrsize = sizeof(void *); ovlsize = 
sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; oblock = (char *) Malloc(bsize,"Allocating output block"); iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); if (oblock == NULL || iblock == NULL) exit (1); iblock += ptrsize; pwd = PathTo(argv[1]); root = Root(argv[1],".las"); root2 = index(root,'#'); if (root2 == NULL) { fprintf(stderr,"%s: No #-sign in source name '%s'\n",Prog_Name,root); exit (1); } if (index(root2+1,'#') != NULL) { fprintf(stderr,"%s: Two or more occurences of #-sign in source name '%s'\n",Prog_Name,root); exit (1); } *root2++ = '\0'; if (fread(&novl,sizeof(int64),1,stdin) != 1) SYSTEM_READ_ERROR if (fread(&tspace,sizeof(int),1,stdin) != 1) SYSTEM_READ_ERROR if (tspace <= TRACE_XOVR && tspace != 0) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); if (VERBOSE) fprintf(stderr," Distributing %lld la\'s\n",novl); { int i; Overlap *w; int64 j, low, hgh, last; int64 tsize, povl; char *iptr, *itop; char *optr, *otop; iptr = iblock; itop = iblock + fread(iblock,1,bsize,stdin); hgh = 0; for (i = 0; i < parts; i++) { output = Fopen(Catenate(pwd,"/",Numbered_Suffix(root,i+1,root2),".las"),"w"); if (output == NULL) exit (1); low = hgh; if (dbvis != NULL) { if (fscanf(dbvis,DB_BDATA,&olast,&blast) != 2) SYSTEM_READ_ERROR last = blast-1; hgh = 0; } else { last = 0; hgh = (novl*(i+1))/parts; } povl = 0; fwrite(&povl,sizeof(int64),1,output); fwrite(&tspace,sizeof(int),1,output); optr = oblock; otop = oblock + bsize; for (j = low; j < novl; j++) { if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memmove(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,stdin); } w = (Overlap *) (iptr-ptrsize); if (dbvis == NULL) { if (j >= hgh && w->aread > last) break; last = w->aread; } else { if (w->aread > last) break; } tsize = w->path.tlen*tbytes; if (optr + ovlsize + tsize > otop) { fwrite(oblock,1,optr-oblock,output); optr = oblock; } memmove(optr,iptr,ovlsize); optr += 
ovlsize; iptr += ovlsize; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memmove(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,stdin); } memmove(optr,iptr,tsize); optr += tsize; iptr += tsize; } hgh = j; if (optr > oblock) fwrite(oblock,1,optr-oblock,output); rewind(output); povl = hgh-low; fwrite(&povl,sizeof(int64),1,output); if (VERBOSE) fprintf(stderr," Split off %s: %lld la\'s\n",Numbered_Suffix(root,i+1,root2),povl); fclose(output); } } free(pwd); free(root); free(iblock-ptrsize); free(oblock); exit (0); } DALIGNER-master/LICENSE000066400000000000000000000053111322465224500145460ustar00rootroot00000000000000 Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: · Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. · Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. · The name of EWM may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. For any issues regarding this software and its use, contact EWM at: Eugene W. Myers Jr. Bautzner Str. 122e 01099 Dresden GERMANY Email: gene.myers@gmail.com DALIGNER-master/Makefile000066400000000000000000000031561322465224500152060ustar00rootroot00000000000000DEST_DIR = ~/bin CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck LAindex all: $(ALL) daligner: daligner.c filter.c filter.h align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o daligner daligner.c filter.c align.c DB.c QV.c -lpthread -lm HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm LAupgrade.Dec.31.2014: LAupgrade.Dec.31.2014.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o 
LAupgrade.Dec.31.2014 LAupgrade.Dec.31.2014.c align.c DB.c QV.c -lm LAindex: LAindex.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAindex LAindex.c align.c DB.c QV.c -lm clean: rm -f $(ALL) rm -fr *.dSYM rm -f LAupgrade.Dec.31.2014 rm -f daligner.tar.gz install: cp $(ALL) $(DEST_DIR) package: make clean tar -zcf daligner.tar.gz README.md Makefile *.h *.c DALIGNER-master/QV.c000066400000000000000000001132131322465224500142340ustar00rootroot00000000000000/******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occurring in a given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevalent * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" #undef DEBUG #define MIN_BUFFER 1000 #define HUFF_CUTOFF 16 // This cannot be larger than 16 ! /******************************************************************************************* * * Endian flipping routines * ********************************************************************************************/ static int LittleEndian; // Little-endian machine ?
// Referred by: Decode & Decode_Run static int Flip; // Flip endian of all coded shorts and ints // Referred by: Decode & Decode_Run & Read_Scheme static void Set_Endian(int flip) { uint32 x = 3; uint8 *b = (uint8 *) (&x); Flip = flip; LittleEndian = (b[0] == 3); } static void Flip_Long(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[3]; v[3] = x; x = v[1]; v[1] = v[2]; v[2] = x; } static void Flip_Short(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[1]; v[1] = x; } /******************************************************************************************* * * Routines for computing a Huffman Encoding Scheme * ********************************************************************************************/ typedef struct { int type; // 0 => normal, 1 => normal but has long codes, 2 => truncated uint32 codebits[256]; // If type = 2, then code 255 is the special code for int codelens[256]; // non-Huffman exceptions int lookup[0x10000]; // Lookup table (just for decoding) } HScheme; typedef struct _HTree { struct _HTree *lft, *rgt; uint64 count; } HTree; // Establish heap property from node s down (1 is root, siblings of n are 2n and 2n+1) // assuming s is the only perturbation in the tree. static void Reheap(int s, HTree **heap, int hsize) { int c, l, r; HTree *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; hr = heap[r]; if (r > hsize || hr->count > hl->count) { if (hs->count > hl->count) { heap[c] = hl; c = l; } else break; } else { if (hs->count > hr->count) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } // Given Huffman tree build a table of codes from it, the low-order codelens[s] bits // of codebits[s] contain the code for symbol s. 
static void Build_Table(HTree *node, int code, int len, uint32 *codebits, int *codelens) { if (node->rgt == NULL) { uint64 symbol = (uint64) (node->lft); codebits[symbol] = code; codelens[symbol] = len; } else { code <<= 1; len += 1; Build_Table(node->lft,code,len,codebits,codelens); Build_Table(node->rgt,code+1,len,codebits,codelens); } } // For the non-zero symbols in hist, compute a huffman tree over them, and then // build a table of the codes. If inscheme is not NULL, then place all symbols // with code 255 or with more than HUFF_CUTOFF bits in the encoding by inscheme // as a single united entity, whose code signals that the value of these symbols // occur explicitly in 8 (values) or 16 (run lengths) bits following the code. // All the symbols in this class will have the same entry in the code table and // 255 is always in this class. static HScheme *Huffman(uint64 *hist, HScheme *inscheme) { HScheme *scheme; HTree *heap[259]; HTree node[512]; int hsize; HTree *lft, *rgt; int value, range; int i; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); hsize = 0; // Load heap value = 0; if (inscheme != NULL) { node[0].count = 0; node[0].lft = (HTree *) (uint64) 255; node[0].rgt = NULL; heap[++hsize] = node+(value++); } for (i = 0; i < 256; i++) if (hist[i] > 0) { if (inscheme != NULL && (inscheme->codelens[i] > HUFF_CUTOFF || i == 255)) node[0].count += hist[i]; else { node[value].count = hist[i]; node[value].lft = (HTree *) (uint64) i; node[value].rgt = NULL; heap[++hsize] = node+(value++); } } for (i = hsize/2; i >= 1; i--) // Establish heap property Reheap(i,heap,hsize); range = value; // Merge pairs with smallest count until have a tree for (i = 1; i < value; i++) { lft = heap[1]; heap[1] = heap[hsize--]; Reheap(1,heap,hsize); rgt = heap[1]; node[range].lft = lft; node[range].rgt = rgt; node[range].count = lft->count + rgt->count; heap[1] = node+(range++); Reheap(1,heap,hsize); } for (i = 0; i < 
256; i++) // Build the code table { scheme->codebits[i] = 0; scheme->codelens[i] = 0; } Build_Table(node+(range-1),0,0,scheme->codebits,scheme->codelens); if (inscheme != NULL) // Set scheme type and if truncated (2), map truncated codes { scheme->type = 2; // to code and length for 255 for (i = 0; i < 255; i++) if (inscheme->codelens[i] > HUFF_CUTOFF || scheme->codelens[i] > HUFF_CUTOFF) { scheme->codelens[i] = scheme->codelens[255]; scheme->codebits[i] = scheme->codebits[255]; } } else { scheme->type = 0; for (i = 0; i < 256; i++) { if (scheme->codelens[i] > HUFF_CUTOFF) scheme->type = 1; } } return (scheme); } #ifdef DEBUG // For debug, show the coding table static void Print_Table(HScheme *scheme, uint64 *hist, int infosize) { uint64 total_bits; uint32 specval, mask, code, *bits; int speclen, clen, *lens; int i, k; total_bits = 0; bits = scheme->codebits; lens = scheme->codelens; if (scheme->type == 2) { specval = bits[255]; speclen = lens[255]; } else specval = speclen = 0x7fffffff; printf("\nCode Table:\n"); for (i = 0; i < 256; i++) if (lens[i] > 0) { clen = lens[i]; mask = (1 << clen); code = bits[i]; printf(" %3d: %2d ",i,clen); for (k = 0; k < clen; k++) { mask >>= 1; if (code & mask) printf("1"); else printf("0"); } if (code == specval && clen == speclen) { printf(" ***"); if (hist != NULL) total_bits += (clen+infosize)*hist[i]; } else if (hist != NULL) total_bits += clen*hist[i]; printf("\n"); } if (hist != NULL) printf("\nTotal Bytes = %lld\n",(total_bits-1)/8+1); } // For debug, show the histogram static void Print_Histogram(uint64 *hist) { int i, low, hgh; uint64 count; for (hgh = 255; hgh >= 0; hgh--) if (hist[hgh] != 0) break; for (low = 0; low < 256; low++) if (hist[low] != 0) break; count = 0; for (i = low; i <= hgh; i++) count += hist[i]; for (i = hgh; i >= low; i--) printf(" %3d: %8llu %5.1f%%\n",i,hist[i],(hist[i]*100.)/count); } #endif /******************************************************************************************* * * Read and 
Write Huffman Schemes * ********************************************************************************************/ // Write the code table to out. static void Write_Scheme(HScheme *scheme, FILE *out) { int i; uint8 x; uint32 *bits; int *lens; lens = scheme->codelens; bits = scheme->codebits; x = (uint8) (scheme->type); fwrite(&x,1,1,out); for (i = 0; i < 256; i++) { x = (uint8) (lens[i]); fwrite(&x,1,1,out); if (x > 0) fwrite(bits+i,sizeof(uint32),1,out); } } // Allocate and read a code table from in, and return a pointer to it. static HScheme *Read_Scheme(FILE *in) { HScheme *scheme; int *look, *lens; uint32 *bits, base; int i, j, powr; uint8 x; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); lens = scheme->codelens; bits = scheme->codebits; look = scheme->lookup; if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n"); free(scheme); return (NULL); } scheme->type = x; for (i = 0; i < 256; i++) { if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i); return (NULL); } lens[i] = x; if (x > 0) { if (fread(bits+i,sizeof(uint32),1,in) != 1) { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i); free(scheme); return (NULL); } } else bits[i] = 0; } if (Flip) { for (i = 0; i < 256; i++) Flip_Long(bits+i); } for (i = 0; i < 256; i++) { if (lens[i] > 0) { base = (bits[i] << (16-lens[i])); powr = (1 << (16-lens[i])); for (j = 0; j < powr; j++) look[base+j] = i; } } return (scheme); } /******************************************************************************************* * * Encoders and Decoders * ********************************************************************************************/ // Encode read[0..rlen-1] according to scheme and write to out static void Encode(HScheme *scheme, FILE *out, uint8 *read, int rlen) { uint32 x, c, ocode; int n, k, olen, llen; int *nlens; uint32 *nbits; 
uint32 nspec; int nslen; nlens = scheme->codelens; nbits = scheme->codebits; if (scheme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; #define OCODE(L,C) \ { int len = olen + (L); \ uint32 code = (C); \ \ llen = olen; \ if (len >= 32) \ { olen = len-32; \ ocode |= (code >> olen); \ fwrite(&ocode,sizeof(uint32),1,out); \ if (olen > 0) \ ocode = (code << (32-olen)); \ else \ ocode = 0; \ } \ else \ { olen = len; \ ocode |= (code << (32-olen));; \ } \ } llen = 0; olen = 0; ocode = 0; for (k = 0; k < rlen; k++) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); } if (olen > 0) // Tricky: must pad so decoder does not read past { fwrite(&ocode,sizeof(uint32),1,out); // last integer int the coded output. if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Encode read[0..rlen-1] according to non-rchar table neme, and run-length table reme for // runs of rchar characters. Write to out. 
static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int rlen, int rchar) { uint32 x, c, ocode; int n, h, k, olen, llen; int *nlens, *rlens; uint32 *nbits, *rbits; uint32 nspec, rspec; int nslen, rslen; nlens = neme->codelens; nbits = neme->codebits; rlens = reme->codelens; rbits = reme->codebits; if (neme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; rspec = rbits[255]; rslen = rlens[255]; llen = 0; olen = 0; ocode = 0; k = 0; while (k < rlen) { h = k; while (k < rlen && read[k] == rchar) k += 1; if (k-h >= 255) x = 255; else x = k-h; n = rlens[x]; c = rbits[x]; OCODE(n,c); if (c == rspec && n == rslen) OCODE(16,k-h); if (k < rlen) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); k += 1; } } if (olen > 0) { fwrite(&ocode,sizeof(uint32),1,out); if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Read and decode from in, the next rlen symbols into read according to scheme static int Decode(HScheme *scheme, FILE *in, char *read, int rlen) { int *look, *lens; int signal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (scheme->type == 2) signal = 255; else signal = 256; lens = scheme->codelens; look = scheme->lookup; #define GET \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ return (1); \ } \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } #define GETFLIP \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more 
bits (Decode)\n"); \ return (1); \ } \ Flip_Long(ipart); \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = look[*xpart]; n = lens[c]; if (c == signal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } else for (j = 0; j < rlen; j++) { GET c = look[*xpart]; n = lens[c]; if (c == signal) { GET c = *cpart; n = 8; } read[j] = (char) c; } return (0); } // Read and decode from in, the next rlen symbols into read according to non-rchar scheme // neme, and the rchar runlength shceme reme static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, int rlen, int rchar) { int *nlook, *nlens; int *rlook, *rlens; int nsignal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c, k; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (neme->type == 2) nsignal = 255; else nsignal = 256; nlens = neme->codelens; nlook = neme->lookup; rlens = reme->codelens; rlook = reme->lookup; n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GETFLIP c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GETFLIP c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } } else for (j = 0; j < rlen; j++) { GET c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GET c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GET c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GET c = *cpart; n = 8; } read[j] = (char) c; } } return (0); } /******************************************************************************************* * * Histogrammers * 
********************************************************************************************/ // Histogram runlengths of symbol runChar in stream[0..rlen-1] into run. static void Histogram_Seqs(uint64 *hist, uint8 *stream, int rlen) { int k; for (k = 0; k < rlen; k++) hist[stream[k]] += 1; } static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar) { int k, h; k = 0; while (k < rlen) { h = k; while (k < rlen && stream[k] == runChar) k += 1; if (k-h >= 256) run[255] += 1; else run[k-h] += 1; if (k < rlen) k += 1; } } /******************************************************************************************* * * Reader * ********************************************************************************************/ static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan, static int Rmax = -1; // Compress_Next_QVentry static int Nline; // Referred by: QVcoding_Scan char *QVentry() { return (Read); } void Set_QV_Line(int line) { Nline = line; } int Get_QV_Line() { return (Nline); } // If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines // for a sequence. Place line j at Read+j*Rmax and the length of every line is returned // unless eof occurs in which case return -1. If any error occurs return -2. 
int Read_Lines(FILE *input, int nlines) { int i, rlen; int tmax; char *tread; char *other; if (Read == NULL) { tmax = MIN_BUFFER; tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; } Nline += 1; if (fgets(Read,Rmax,input) == NULL) return (-1); rlen = strlen(Read); while (Read[rlen-1] != '\n') { tmax = ((int) 1.4*Rmax) + MIN_BUFFER; tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; if (fgets(Read+rlen,Rmax-rlen,input) == NULL) { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline); EXIT(-2); } rlen += strlen(Read+rlen); } other = Read; for (i = 1; i < nlines; i++) { other += Rmax; Nline += 1; if (fgets(other,Rmax,input) == NULL) { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT(-2); } if (rlen != (int) strlen(other)) { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline); EXIT(-2); } } return (rlen-1); } /******************************************************************************************* * * Tag compression and decompression routines * ********************************************************************************************/ // Keep only the symbols in tags[0..rlen-1] for which qvs[k] != rchar and // return the # of symbols kept. static int Pack_Tag(char *tags, char *qvs, int rlen, int rchar) { int j, k; j = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) tags[j++] = tags[k]; tags[j] = '\0'; return (j); } // Count the # of non-rchar symbols in qvs[0..rlen-1] static int Packed_Length(char *qvs, int rlen, int rchar) { int k, clen; clen = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) clen += 1; return (clen); } // Unpack tags by moving its i'th char to position k where qvs[k] is the i'th non-rchar // symbol in qvs. All other chars are set to rchar. 
rlen is the length of qvs and // the unpacked result, clen is the initial length of tags. static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar) { int j, k; j = clen-1; for (k = rlen-1; k >= 0; k--) { if (qvs[k] == rchar) tags[k] = 'n'; else tags[k] = tags[j--]; } } /******************************************************************************************* * * Statistics Scan and Scheme creation and write * ********************************************************************************************/ // Read up to the next num entries or until eof from the .quiva file on input and record // frequency statistics. Copy these entries to the temporary file temp if != NULL. // If there is an error then -1 is returned, otherwise the number of entries read. static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256]; static uint64 totChar; static int delChar, subChar; // Referred by: QVcoding_Scan, Create_QVcoding void QVcoding_Scan1(int rlen, char *delQV, char *delTag, char *insQV, char *mergeQV, char *subQV) { if (rlen == 0) // Initialization call { int i; // Zero histograms bzero(delHist,sizeof(uint64)*256); bzero(mrgHist,sizeof(uint64)*256); bzero(insHist,sizeof(uint64)*256); bzero(subHist,sizeof(uint64)*256); for (i = 0; i < 256; i++) delRun[i] = subRun[i] = 1; totChar = 0; delChar = -1; subChar = -1; return; } // Add streams to accumulating histograms and figure out the run chars // for the deletion and substition streams Histogram_Seqs(delHist,(uint8 *) delQV,rlen); Histogram_Seqs(insHist,(uint8 *) insQV,rlen); Histogram_Seqs(mrgHist,(uint8 *) mergeQV,rlen); Histogram_Seqs(subHist,(uint8 *) subQV,rlen); if (delChar < 0) { int k; for (k = 0; k < rlen; k++) if (delTag[k] == 'n' || delTag[k] == 'N') { delChar = delQV[k]; break; } } if (delChar >= 0) Histogram_Runs( delRun,(uint8 *) delQV,rlen,delChar); totChar += rlen; if (subChar < 0) { if (totChar >= 100000) { int k; subChar = 0; for (k = 1; k < 256; k++) if 
(subHist[k] > subHist[subChar]) subChar = k; } } if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) subQV,rlen,subChar); return; } int QVcoding_Scan(FILE *input, int num, FILE *temp) { char *slash; int rlen; int i, r; // Zero histograms bzero(delHist,sizeof(uint64)*256); bzero(mrgHist,sizeof(uint64)*256); bzero(insHist,sizeof(uint64)*256); bzero(subHist,sizeof(uint64)*256); for (i = 0; i < 256; i++) delRun[i] = subRun[i] = 1; totChar = 0; delChar = -1; subChar = -1; // Make a sweep through the .quiva entries, histogramming the relevant things // and figuring out the run chars for the deletion and substition streams r = 0; for (i = 0; i < num; i++) { int well, beg, end, qv; rlen = Read_Lines(input,1); if (rlen == -2) EXIT(-1); if (rlen < 0) break; if (rlen == 0 || Read[0] != '@') { EPRINTF(EPLACE,"Line %d: Header in quiva file is missing\n",Nline); EXIT(-1); } slash = index(Read+1,'/'); if (slash == NULL) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(-1); } if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(-1); } if (temp != NULL) fputs(Read,temp); rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT(-1); } if (temp != NULL) { fputs(Read,temp); fputs(Read+Rmax,temp); fputs(Read+2*Rmax,temp); fputs(Read+3*Rmax,temp); fputs(Read+4*Rmax,temp); } Histogram_Seqs(delHist,(uint8 *) (Read),rlen); Histogram_Seqs(insHist,(uint8 *) (Read+2*Rmax),rlen); Histogram_Seqs(mrgHist,(uint8 *) (Read+3*Rmax),rlen); Histogram_Seqs(subHist,(uint8 *) (Read+4*Rmax),rlen); if (delChar < 0) { int k; char *del = Read+Rmax; for (k = 0; k < rlen; k++) if (del[k] == 'n' || del[k] == 'N') { delChar = Read[k]; break; } } if (delChar >= 0) Histogram_Runs( delRun,(uint8 *) (Read),rlen,delChar); totChar += rlen; if (subChar < 0) { if (totChar >= 100000) { int k; 
subChar = 0; for (k = 1; k < 256; k++) if (subHist[k] > subHist[subChar]) subChar = k; } } if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar); r += 1; } return (r); } // Using the statistics in the global stat tables, create the Huffman schemes and write // them to output. If lossy is set, then create a lossy table for the insertion and merge // QVs. QVcoding *Create_QVcoding(int lossy) { static QVcoding coding; HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; HScheme *dRunScheme, *sRunScheme; delScheme = NULL; dRunScheme = NULL; insScheme = NULL; mrgScheme = NULL; subScheme = NULL; sRunScheme = NULL; // Check whether using a subtitution run char is a win if (totChar < 200000 || subHist[subChar] < .5*totChar) subChar = -1; // If lossy encryption is enabled then scale insertions and merge QVs. if (lossy) { int k; for (k = 0; k < 256; k += 2) { insHist[k] += insHist[k+1]; insHist[k+1] = 0; } for (k = 0; k < 256; k += 4) { mrgHist[k] += mrgHist[k+1]; mrgHist[k] += mrgHist[k+2]; mrgHist[k] += mrgHist[k+3]; mrgHist[k+1] = 0; mrgHist[k+2] = 0; mrgHist[k+3] = 0; } } // Build a Huffman scheme for each stream entity from the histograms #define SCHEME_MACRO(meme,hist,label,bits) \ scheme = Huffman( (hist), NULL); \ if (scheme == NULL) \ goto error; \ if (scheme->type) \ { (meme) = Huffman( (hist), scheme); \ free(scheme); \ } \ else \ (meme) = scheme; #ifdef DEBUG #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) \ printf("\n%s\n", (label) ); \ Print_Histogram( (hist)); \ Print_Table( (meme), (hist), (bits)); #else #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) #endif { HScheme *scheme; if (delChar < 0) { MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs", 8); dRunScheme = NULL; } else { delHist[delChar] = 0; MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs less run char", 8); MAKE_SCHEME(dRunScheme,delRun, "Histogram of Deletion Runs QVs", 16); #ifdef DEBUG 
printf("\nRun char is '%c'\n",delChar); #endif } #ifdef DEBUG { int k; uint64 count; count = 0; for (k = 0; k < 256; k++) count += delHist[k]; printf("\nDelTag will require %lld bytes\n",count/4); } #endif MAKE_SCHEME(insScheme,insHist, "Hisotgram of Insertion QVs", 8); MAKE_SCHEME(mrgScheme,mrgHist, "Hisotgram of Merge QVs", 8); if (subChar < 0) { MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs", 8); sRunScheme = NULL; } else { subHist[subChar] = 0; MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs less run char", 8); MAKE_SCHEME(sRunScheme,subRun, "Histogram of Substitution Run QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",subChar); #endif } } // Setup endian handling Set_Endian(0); coding.delScheme = delScheme; coding.insScheme = insScheme; coding.mrgScheme = mrgScheme; coding.subScheme = subScheme; coding.dRunScheme = dRunScheme; coding.sRunScheme = sRunScheme; coding.delChar = delChar; coding.subChar = subChar; coding.prefix = NULL; coding.flip = 0; return (&coding); error: if (delScheme != NULL) free(delScheme); if (dRunScheme != NULL) free(dRunScheme); if (insScheme != NULL) free(insScheme); if (mrgScheme != NULL) free(mrgScheme); if (subScheme != NULL) free(subScheme); if (sRunScheme != NULL) free(sRunScheme); EXIT(NULL); } // Write the encoding scheme 'coding' to 'output' void Write_QVcoding(FILE *output, QVcoding *coding) { // Write out the endian key, run chars, and prefix (if not NULL) { uint16 half; int len; half = 0x33cc; fwrite(&half,sizeof(uint16),1,output); if (coding->delChar < 0) half = 256; else half = (uint16) (coding->delChar); fwrite(&half,sizeof(uint16),1,output); if (coding->subChar < 0) half = 256; else half = (uint16) (coding->subChar); fwrite(&half,sizeof(uint16),1,output); len = strlen(coding->prefix); fwrite(&len,sizeof(int),1,output); fwrite(coding->prefix,1,len,output); } // Write out the scheme tables Write_Scheme(coding->delScheme,output); if (coding->delChar >= 0) 
Write_Scheme(coding->dRunScheme,output); Write_Scheme(coding->insScheme,output); Write_Scheme(coding->mrgScheme,output); Write_Scheme(coding->subScheme,output); if (coding->subChar >= 0) Write_Scheme(coding->sRunScheme,output); } // Read the encoding scheme 'coding' to 'output' QVcoding *Read_QVcoding(FILE *input) { static QVcoding coding; // Read endian key, run chars, and short name common to all headers { uint16 half; int len; if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n"); EXIT(NULL); } coding.flip = (half != 0x33cc); if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.delChar = half; if (coding.delChar >= 256) coding.delChar = -1; if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.subChar = half; if (coding.subChar >= 256) coding.subChar = -1; // Read the short name common to all headers if (fread(&len,sizeof(int),1,input) != 1) { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Long(&len); coding.prefix = (char *) Malloc(len+1,"Allocating header prefix"); if (coding.prefix == NULL) EXIT(NULL); if (len > 0) { if (fread(coding.prefix,len,1,input) != 1) { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n"); EXIT(NULL); } } coding.prefix[len] = '\0'; } // Setup endian handling Set_Endian(coding.flip); // Read the Huffman schemes used to compress the data coding.delScheme = NULL; coding.dRunScheme = NULL; coding.insScheme = NULL; coding.mrgScheme = NULL; coding.subScheme = NULL; coding.sRunScheme = NULL; coding.delScheme = Read_Scheme(input); if (coding.delScheme == NULL) goto error; if (coding.delChar >= 0) { coding.dRunScheme = Read_Scheme(input); if (coding.dRunScheme == NULL) goto error; } 
coding.insScheme = Read_Scheme(input); if (coding.insScheme == NULL) goto error; coding.mrgScheme = Read_Scheme(input); if (coding.mrgScheme == NULL) goto error; coding.subScheme = Read_Scheme(input); if (coding.subScheme == NULL) goto error; if (coding.subChar >= 0) { coding.sRunScheme = Read_Scheme(input); if (coding.sRunScheme == NULL) goto error; } return (&coding); error: if (coding.delScheme != NULL) free(coding.delScheme); if (coding.dRunScheme != NULL) free(coding.dRunScheme); if (coding.insScheme != NULL) free(coding.insScheme); if (coding.mrgScheme != NULL) free(coding.mrgScheme); if (coding.subScheme != NULL) free(coding.subScheme); if (coding.sRunScheme != NULL) free(coding.sRunScheme); EXIT(NULL); } // Free all the auxilliary storage associated with the encoding argument void Free_QVcoding(QVcoding *coding) { if (coding->subChar >= 0) free(coding->sRunScheme); free(coding->subScheme); free(coding->mrgScheme); free(coding->insScheme); if (coding->delChar >= 0) free(coding->dRunScheme); free(coding->delScheme); free(coding->prefix); } /******************************************************************************************* * * Encode/Decode (w.r.t. 
coding) next entry from input and write to output * ********************************************************************************************/ void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub, FILE *output, QVcoding *coding, int lossy) { int clen; if (coding->delChar < 0) { Encode(coding->delScheme, output, (uint8 *) del, rlen); clen = rlen; } else { Encode_Run(coding->delScheme, coding->dRunScheme, output, (uint8 *) del, rlen, coding->delChar); clen = Pack_Tag(tag,del,rlen,coding->delChar); } Number_Read(tag); Compress_Read(clen,tag); fwrite(tag,1,COMPRESSED_LEN(clen),output); if (lossy) { uint8 *insert = (uint8 *) ins; uint8 *merge = (uint8 *) mrg; int k; for (k = 0; k < rlen; k++) { insert[k] = (uint8) ((insert[k] >> 1) << 1); merge[k] = (uint8) (( merge[k] >> 2) << 2); } } Encode(coding->insScheme, output, (uint8 *) ins, rlen); Encode(coding->mrgScheme, output, (uint8 *) mrg, rlen); if (coding->subChar < 0) Encode(coding->subScheme, output, (uint8 *) sub, rlen); else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) sub, rlen, coding->subChar); return; } int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) { int rlen, clen; // Get all 5 streams, compress each with its scheme, and output rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT (-1); } if (coding->delChar < 0) { Encode(coding->delScheme, output, (uint8 *) Read, rlen); clen = rlen; } else { Encode_Run(coding->delScheme, coding->dRunScheme, output, (uint8 *) Read, rlen, coding->delChar); clen = Pack_Tag(Read+Rmax,Read,rlen,coding->delChar); } Number_Read(Read+Rmax); Compress_Read(clen,Read+Rmax); fwrite(Read+Rmax,1,COMPRESSED_LEN(clen),output); if (lossy) { uint8 *insert = (uint8 *) (Read+2*Rmax); uint8 *merge = (uint8 *) (Read+3*Rmax); int k; for (k = 0; k < rlen; k++) { insert[k] = (uint8) ((insert[k] >> 1) << 1); merge[k] = 
(uint8) (( merge[k] >> 2) << 2); } } Encode(coding->insScheme, output, (uint8 *) (Read+2*Rmax), rlen); Encode(coding->mrgScheme, output, (uint8 *) (Read+3*Rmax), rlen); if (coding->subChar < 0) Encode(coding->subScheme, output, (uint8 *) (Read+4*Rmax), rlen); else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) (Read+4*Rmax), rlen, coding->subChar); return (rlen); } int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) { int clen, tlen; // Decode each stream and write to output if (coding->delChar < 0) { if (Decode(coding->delScheme, input, entry[0], rlen)) EXIT(1); clen = rlen; tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); } else { if (Decode_Run(coding->delScheme, coding->dRunScheme, input, entry[0], rlen, coding->delChar)) EXIT(1); clen = Packed_Length(entry[0],rlen,coding->delChar); tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar); } if (Decode(coding->insScheme, input, entry[2], rlen)) EXIT(1); if (Decode(coding->mrgScheme, input, entry[3], rlen)) EXIT(1); if (coding->subChar < 0) { if (Decode(coding->subScheme, input, entry[4], rlen)) EXIT(1); } else { if (Decode_Run(coding->subScheme, coding->sRunScheme, input, entry[4], rlen, coding->subChar)) EXIT(1); } return (0); } DALIGNER-master/QV.h000066400000000000000000000115151322465224500142430ustar00rootroot00000000000000/******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given 
file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #ifndef _QV_COMPRESSOR #include #define _QV_COMPRESSOR // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or // batch version of the routines in this library are compiled. In batch mode, routines // print an error message and exit. In interactive mode, the routines place the error // message in EPLACE (also defined in DB.h) and return an error value, typically NULL // if the routine returns a pointer, and an unusual integer value if the routine returns // an integer. // Below when an error return is described, one should understand that this value is returned // only if the routine was compiled in INTERACTIVE mode. // A PacBio compression scheme typedef struct { void *delScheme; // Huffman scheme for deletion QVs void *insScheme; // Huffman scheme for insertion QVs void *mrgScheme; // Huffman scheme for merge QVs void *subScheme; // Huffman scheme for substitution QVs void *dRunScheme; // Huffman scheme for deletion run lengths (if delChar > 0) void *sRunScheme; // Huffman scheme for substitution run lengths (if subChar > 0) int delChar; // If > 0, run-encoded deletion value int subChar; // If > 0, run-encoded substitution value int flip; // Need to flip multi-byte integers char *prefix; // Header line prefix } QVcoding; // Read the next nlines of input, and QVentry returns a pointer to the first line if needed. // If end-of-input is encountered before any further input, -1 is returned. If there is // an error than -2 is returned. Otherwise the length of the line(s) read is returned. 
int Read_Lines(FILE *input, int nlines); char *QVentry(); // Get and set the line counter for error reporting void Set_QV_Line(int line); int Get_QV_Line(); // Read up to the next num entries or until eof from the .quiva file on input and record // frequency statistics. Copy these entries to the temporary file temp if != NULL. // If there is an error then -1 is returned, otherwise the number of entries read. int QVcoding_Scan(FILE *input, int num, FILE *temp); void QVcoding_Scan1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub); // Given QVcoding_Scan has been called at least once, create an encoding scheme based on // the accumulated statistics and return a pointer to it. The returned encoding object // is *statically allocated within the routine. If lossy is set then use a lossy scaling // for the insertion and merge streams. If there is an error, then NULL is returned. QVcoding *Create_QVcoding(int lossy); // Read/write a coding scheme to input/output. The encoding object returned by the reader // is *statically* allocated within the routine. If an error occurs while reading then // NULL is returned. QVcoding *Read_QVcoding(FILE *input); void Write_QVcoding(FILE *output, QVcoding *coding); // Free all the auxiliary storage associated with coding (but not the object itself!) void Free_QVcoding(QVcoding *coding); // Assuming the file pointer is positioned just beyond an entry header line, read the // next set of 5 QV lines, compress them according to 'coding', and output. If lossy // is set then the scheme is a lossy one. A negative value is returned if an error // occurred, and the sequence length otherwise. 
int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub, FILE *output, QVcoding *coding, int lossy); // Assuming the input is position just beyond the compressed encoding of an entry header, // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them, // and place their decompressed values into entry which is a 5 element array of character // pointers. The parameter rlen computed from the preceeding header line, critically // provides the length of each of the 5 vectors. A non-zero value is return only if an // error occured. int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen); #endif // _QV_COMPRESSOR DALIGNER-master/README.md000066400000000000000000000725731322465224500150360ustar00rootroot00000000000000# Daligner: The Dazzler "Overlap" Module ## _Author: Gene Myers_ ## _First: April 10, 2016_ For typeset documentation, examples of use, and design philosophy please go to my [blog](https://dazzlerblog.wordpress.com/command-guides/damasker-commands). The commands below permit one to find all significant local alignments between reads encoded in Dazzler database. The assumption is that the reads are from a PACBIO RS II long read sequencer. That is the reads are long and noisy, up to 15% on average. Recall that a database has a current partition that divides it into blocks of a size that can conveniently be handled by calling the "dalign" overlapper on all the pairs of blocks producing a collection of .las local alignment files that can then be sorted and merged into an ordered sequence of sorted files containing all alignments between reads in the data set. The alignment records are parsimonious in that they do not record an alignment but simply a set of trace points, typically every 100bp or so, that allow the efficient reconstruction of alignments on demand. ``` 1. 
daligner [-vbAI] [-k<int(14)>] [-w<int(6)>] [-h<int(35)>] [-t<int>] [-M<int>] [-P<dir(/tmp)>] [-e<double(.70)>] [-l<int(1000)>] [-s<int(100)>] [-H<int>] [-T<int(4)>] [-m<track>]+ <subject:db|dam> <target:db|dam> ... ``` Compare sequences in the trimmed \<subject\> block against those in the list of \<target\> blocks searching for local alignments involving at least -l base pairs (default 1000) or more, that have an average correlation rate of -e (default 70%). The local alignments found will be output in a sparse encoding where a trace point on the alignment is recorded every -s base pairs of the a-read (default 100bp). Reads are compared in both orientations and local alignments meeting the criteria are output to one of several created files described below. The -v option turns on a verbose reporting mode that gives statistics on each major step of the computation. The program runs with 4 threads by default, but this may be set to any power of 2 with the -T option. The options -k, -h, and -w control the initial filtration search for possible matches between reads. Specifically, our search code looks for a pair of diagonal bands of width 2^w (default 2^6 = 64) that contain a collection of exact matching k-mers (default 14) between the two reads, such that the total number of bases covered by the k-mer hits is h (default 35). k cannot be larger than 32 in the current implementation. If the -b option is set, then the daligner assumes the data has a strong compositional bias (e.g. >65% AT rich), and at the cost of a bit more time, dynamically adjusts k-mer sizes depending on compositional bias, so that the mers used have an effective specificity of 4^k. If there are one or more interval tracks specified with the -m option, then the reads of the DB or DB's to which the mask applies are soft masked with the union of the intervals of all the interval tracks that apply, that is any k-mers that contain any bases in any of the masked intervals are ignored for the purposes of seeding a match. An interval track is a track, such as the "dust" track created by DBdust, that encodes a set of intervals over either the untrimmed or trimmed DB. 
Invariably, some k-mers are significantly over-represented (e.g. homopolymer runs). These k-mers create an excessive number of matching k-mer pairs and left unaddressed would cause daligner to overflow the available physical memory. One way to deal with this is to explicitly set the -t parameter which suppresses the use of any k-mer that occurs more than t times in either the subject or target block. However, a better way to handle the situation is to let the program automatically select a value of t that meets a given memory usage limit specified (in Gb) by the -M parameter. By default daligner will use the amount of physical memory as the choice for -M. If you want to use less, say only 8Gb on a 24Gb HPC cluster node because you want to run 3 daligner jobs on the node, then specify -M8. Specifying -M0 basically indicates that you do not want daligner to self adjust k-mer suppression to fit within a given amount of memory. Each found alignment is recorded as -- a[ab,ae] x bo[bb,be] -- where a and b are the indices (in the trimmed DB) of the reads that overlap, o indicates whether the b-read is from the same or opposite strand, and [ab,ae] and [bb,be] are the intervals of a and bo, respectively, that align. For each subject, target pair of blocks, say X and Y, the program reports alignments where the a-read is in X and the b-read is in Y, or vice versa. However, if the -A option is set ("A" for "asymmetric") then just overlaps where the a-read is in X and the b-read is in Y are reported, and if X = Y then it further reports only those overlaps where the a-read index is less than the b-read index. In either case, if the -I option is set ("I" for "identity") then when X = Y, overlaps between different portions of the same read will also be found and reported. In summary, the command "daligner -A X Y" produces a single file X.Y..las and "daligner X Y" produces 2 files X.Y..las and Y.X.las (unless X=Y in which case only a single file, X.X.las, is produced). 
The overlap records in one of these files are sorted as described for LAsort. In order to produce the aforementioned .las file, several temporary .las files, two for each thread, are produced in the sub-directory /tmp by default. You can override this location by specifying the directory you would like this activity to take place in with the -P option. By default daligner compares all overlaps between reads in the database that are greater than the minimum cutoff set when the DB or DBs were split, typically 1 or 2 Kbp. However, the HGAP assembly pipeline only wants to correct large reads, say 8Kbp or over, and so needs only the overlaps where the a-read is one of the large reads. By setting the -H parameter to say N, one alters daligner so that it only reports overlaps where the a-read is over N base-pairs long. While the default parameter settings are good for raw Pacbio data, daligner can be used for efficiently finding alignments in corrected reads or other less noisy reads. For example, for mapping applications against .dams we run "daligner -k20 -h60 -e.85" and on corrected reads, we typically run "daligner -k25 -w5 -h60 -e.95 -s500" and at these settings it is very fast. ``` 2. LAsort [-va] <align:las> ... ``` Sort each .las alignment file specified on the command line. For each file it reads in all the overlaps in the file and sorts them in lexicographical order of (a,b,o,ab) assuming each alignment is recorded as a[ab,ae] x bo[bb,be]. It then writes them all to a file named \<align\>.S.las (assuming that the input file was \<align\>.las). With the -v option set then the program reports the number of records read and written. If the -a option is set then it sorts LAs in lexicographical order of (a,ab) alone, which is desired when sorting a mapping of reads to a reference. If the .las file was produced by damapper the local alignments are organized into chains where the LA segments of a chain are consecutive and ordered in the file. 
LAsort can detect that it has been passed such a file and if so treats the chains as a unit and sorts them on the basis of the first LA in the chain. ``` 3. LAmerge [-va] ... ``` Merge the .las files \ into a single sorted file \, where it is assumed that the input \ files are sorted. Due to operating system limits, the number of \ files must be ≤ 252. With the -v option set the program reports the # of records read and written. The -a option indicates the sort is as described for LAsort above. If the .las file was produced by damapper the local alignments are organized into chains where the LA segments of a chain are consecutive and ordered in the file. When merging such files, LAmerge treats the chains as a unit and orders them on the basis of the first LA in the chain. Used correctly, LAmerge and LAsort together allow one to perform an "external" sort that produces a collection of sorted files containing in aggregate all the local alignments found by the daligner, such that their concatenation is sorted in order of (a,b,o,ab) (or (a,ab) if the -a option is set). In particular, this means that all the alignments for a given a-read will be found consecutively in one of the files. So computations that need to look at all the alignments for a given read can operate in simple sequential scans of these sorted files. ``` 4. LAshow [-caroUF] [-i] [-w] [-b] [ ] [ | ... ] ``` LAshow produces a printed listing of the local alignments contained in the specified .las file, where the a- and b-reads come from src1 or from src1 and src2, respectively. If a file or list of read ranges is given then only the overlaps for which the a-read is in the set specified by the file or list are displayed. See DBshow for an explanation of how the file and list of read ranges are interpreted. If the -F option is set then the roles of the a- and b- reads are reversed in the display. 
If the -c option is given then a cartoon rendering is displayed, and if the -a or -r option is set then an alignment of the local alignment is displayed. The -a option puts exactly -w columns per segment of the display, whereas the -r option puts exactly -w a-read symbols in each segment of the display. The -r display mode is useful when one wants to visually compare two alignments involving the same a-read. If a combination of the -c, -a, and -r flags is set, then the cartoon comes first, then the -a alignment, and lastly the -r alignment. The -i option sets the indent for the cartoon and/or alignment displays, if they are requested. The -b option sets the number of symbols on either side of the aligned segments in an alignment display, and -U specifies that uppercase should be used for DNA sequence instead of the default lowercase. If the -o option is set then only alignments that are proper overlaps (a sequence end occurs at each end of the alignment) are displayed. If the -F option is given then the roles of the A- and B-reads are flipped. When examining LAshow output it is important to keep in mind that the coordinates describing an interval of a read are referring conceptually to positions between bases starting at 0 for the position to the left of the first base. That is, a coordinate c refers to the position between the c-1'st and c'th base, and the interval [b,e] captures the e-b bases from the b'th to the e-1'st, inclusive. 
We give an example with a cartoon and (part of an) alignment for which we will explain several additional important points: ``` 1 1,865 c [18,479..20,216] x [ 1,707..0> (24,451 x 7,283 bps, 19 trace pts) 18479 4235 A ========+----------+======> dif/(len1+len2) = 478/(1737+1707) = 27.76% B <======+----------- 5576 18469 agccgcctag[tgcctcgcaaacgc-t-cggggcggcgt-gaaagcgg-- ::::::::::[||||||||||||||*|*|||*|||*|||*||||||||** 1717 ctcttcttta[tgcctcgcaaacgccttcggcgcg-cgttgaaagcggtt 17.9% 18513 -ccggtgggtc--agtggcgagttctggcagtgcgctggg-ctgcgaaat *||||||*|||**|||||*||||*|*|*|||**|||||||*||*|||||| 1669 gccggtgcgtcgcagtgg-gagt-c-gtcag--cgctggggcttcgaaat 24.0% . . . ``` The display of an LA always begins with a line giving the A-read, then the B-read, then an indication of orientation (i.e. 'n' for same strand, and 'c' for the opposite strand) followed by the A-interval and B-interval that are aligned and in parentheses the lengths of the two reads and the number of tracepoints in the alignment between them. In particular, note carefully that when the B-read is in the complement orientation (c), then the B-interval gives the higher coordinate first, the idea being that one will align from the highest base down to the lowest base in the descending direction on B, complement the characters as you go. Further note that in the alignment display the coordinates at the start of each line follow this orientation convention and give the coordinate of the "tick mark" just left of the first character in each line. It is useful to know if an interval reaches the end of read, and to signal this we use an angle-bracket \<\> instead of a square bracket [], e.g. in the example the B-segment starts at the beginning of the read. Finally, observe that in the cartoon the numbers are not coordinates but rather indicate the lengths of the unaligned bits left and right of the two aligned intervals. 
Finally, observe that in the cartoon the numbers are not coordinates but rather indicate the lengths of the unaligned bits left and right of the two aligned intervals. With the introduction of damapper, .las files can now contain chains. If LAshow detects that it has been passed a file with chain information then it displays marks at the left that reveal the chain structure, e.g.: ``` > 117 37,630 c [ 253.. 7,980] x [ 331,430.. 324,027] ~ 10.5% + 117 37,628 n [ 253.. 7,983] x [21,493,673..21,501,079] ~ 10.6% + 117 57 c [ 253.. 1,086] x [ 2,008,164.. 2,007,369] ~ 9.8% - 117 57 c [ 1,300.. 7,982] x [ 2,007,351.. 2,000,945] ~ 10.7% > 117 15 c [ 7,992.. 8,716] x [ 242,529.. 241,822] ~ 7.8% - 117 15 c [ 8,752..14,299] x [ 241,824.. 236,425] ~ 10.7% - 117 15 c [14,133..14,832] x [ 236,630.. 235,953] ~ 12.1% + 117 37,628 n [ 7,992.. 8,716] x [19,202,357..19,203,064] ~ 7.7% - 117 37,628 n [ 8,752..14,832] x [19,203,062..19,208,974] ~ 10.9% ``` A chain begins with either a > or + character, where > indicates this is the highest scoring chain and + indicates an alternate near optimal chain (controlled by the -n parameter to damapper). Each additional LA of a chain is marked with a - character. ``` 5. LAdump [-cdtlo] [ ] [ | ... ] ``` Like LAshow, LAdump allows one to display the local alignments (LAs) of a subset of the piles in an .las file and select which information to show about them. The difference is that the information is written in a very simple "1-code" ASCII format that makes it easy for one to read and parse the information for further use. For each LA the pair of reads is output on a line. The -c option requests that the coordinates of the LA segments also be output. The -d option requests that the number of differences in the LA be output, -t requests that the tracepoint information be output, and -l requests that the lengths of the two reads be output. Finally, -o requests that only LAs that are proper overlaps be output. The format is very simple. 
Each requested piece of information occurs on a line. The first character of every line is a "1-code" character that tells you what information to expect on the line. The rest of the line contains information where each item is separated by a single blank space. The trace point line gives the number of trace point intervals in the LA and is immediately followed by that many lines containing a pair of integers giving the # of differences and b-displacement in each successive trace point interval. ``` P #a #b #o #c - (#a,#b^#o) have an LA between them where #o is 'n' or 'c' and #c is '>' (start of best chain), '+' (start of alternate chain), '-' (continuation of chain), or '.' (no chains in file). L #la #lb - #la is the length of the a-read and #lb that of the b-read C #ab #ae #bb #be - #a[#ab,#ae] aligns with #b^#o[#bb,#be] D # - there are # differences in the LA T #n - there are #n trace point intervals for the LA (#d #y )^#n - there are #d difference aligning the #y bp's of B with the next fixed-size interval of A + X # - Total amount of X (X = P or T) % X # - Maximum amount of X in any pile (X = P or T) @ T # - Maximum number of trace points in any trace ``` 1-code lines that begin with +, %, or @ are always the first lines in the output. They give size information about what is contained in the output. Specifically, '+ X #' gives the total number of LAs (X=P), or the total number of trace point intervals (X=T) in the file . '% X #' gives the maximum number of LAs (X=P) or the maximum number of trace point intervals (X=T) in a given *pile* (collection of LAs all with the same a-read (applies only to sorted .las files). Finally @ T # gives the maximum # of trace point intervals in any trace within the file. ``` 6. LAindex -v ... ``` LAindex takes a series of one or more sorted .las files and produces a "pile index" for each one. If the input file has name "X.las", then the name of its index file is ".X.las.idx". 
For each A-read pile encoded in the .las file, the index contains the offset to the first local alignment with A in the file. The index starts with four 64-bit integers that encode the numbers % P, + T, % T, and @ T described for LAdump above, and then an offset for each pile beginning with the first A-read in the file (which may not be read 0). The index is meant to allow programs that process piles to more efficiently read just the piles they need at any moment in time, as opposed to having to sequentially scan through the .las file. ``` 7. LAcat [-v] > .las ``` Given template name \ that contains a single #-sign somewhere within it, find all files that match it when the # is replaced by i for i in 1,2,3,... and a .las extension is added if not present. Then concatenate these files in order into a single .las file and pipe the result to the standard output. The -v option reports the files concatenated and the number of la's within them to standard error (as the standard output receives the concatenated file). ``` 8. LAsplit [-v] ( | ) < .las ``` If the second argument is an integer n, then divide the alignment file \, piped in through the standard input, as evenly as possible into n alignment files with the names specified by template \, subject to the restriction that all alignment records for a given a-read are in the same file. The name of the n files is the string \ where the single #-sign that occurs somewhere in it is replaced by i for i in [1,n] and a .las extension is added if necessary. If the second argument refers to a database \.db that has been partitioned, then divide the input alignment file into block .las files where all records whose a-read is in \.i.db are in the i'th file generated from the template \. The -v option reports the files produced and the number of la's within them to standard error. ``` 9. LAcheck [-vS] [ ] ... 
``` LAcheck checks each .las file for structural integrity, where the a- and b-sequences come from src1 or from src1 and src2, respectively. That is, it makes sure each file makes sense as a plausible .las file, e.g. values are not out of bounds, the number of records is correct, the number of trace points for a record is correct, and so on. If the -S option is set then it further checks that the alignments are in sorted order. If the -v option is set then a line is output for each .las file saying either the file is OK or reporting the first error. If the -v option is not set then the program runs silently. The exit status is 0 if every file is deemed good, and 1 if at least one of the files looks corrupted. With the introduction of damapper, LAcheck checks to see if a file has chain information, and if it does, then it checks the validity of chains and assumes that the chains were sorted with the -a option to LAsort and LAmerge. ``` 10. HPC.daligner [-vbad] [-t] [-w] [-l] [-M] [-B] [-D] [-T] [-f] ( [-k] [-h] [-e] [-k] [-h] [-e] ) [-m]+ [[-]] ``` HPC.daligner writes a UNIX shell script to the standard output or to a series of files beginning with the prefix \ if the -f option is set, that either performs an "overlap" computation on all the blocks in a single database, or a "comparison" computation on all pairs of blocks between two databases, depending on whether it is given one or two DB's as arguments (\ and \). We describe the overlap script and its effect first, and then later the comparison script. An Overlap Script: consists of a sequence of commands that effectively run daligner on all pairs of blocks of a split database and then externally sorts and merges them using LAsort and LAmerge into a collection of alignment files with names \.#.las where # ranges from 1 to the number of blocks the data base is split into. These sorted files if concatenated by say LAcat would contain all the alignments in sorted order (of a-read, then b-read, ...). 
Moreover, all overlaps for a given a-read are guaranteed to not be split across files, so one can run artifact analyzers or error correction on each sorted file in parallel. The data base must have been previously split by DBsplit and all the parameters, except -a, -d, -f, -B, and -D, are passed through to the calls to daligner. The defaults for these parameters are as for daligner. The -v and -a flags are passed to all calls to LAsort and LAmerge. All other options are described later. For a database divided into N sub-blocks, the calls to daligner will produce in total N^2 .las files, one per block pair. These are then merged in ceil(log_D N) phases where the number of files decreases geometrically in -D until there is 1 file per row of the N x N block matrix. So at the end one has N sorted .las files that when concatenated would give a single large sorted overlap file. The -B option (default 4) gives the desired number of block comparisons per call to daligner. Some must contain B-1 comparisons, and the first B-2 block comparisons even fewer, but the HPCdaligner "planner" does the best it can to give an average load of B block comparisons per command. The -D option (default 250) gives the maximum number of files that will be merged in a single LAmerge command. The planner performs D-way merges at all of the ceil(log_D N) levels save the last, so as to minimize the number of intermediate files. If the integers \ and \ are missing then the script produced is for every block in the database. If \ is present then HPCdaligner produces an incremental script that compares blocks \ through \ (\ = \ if not present) against each other and all previous blocks 1 through \-1, and then incrementally updates the .las files for blocks 1 through \-1, and creates the .las files for blocks \ through \. 
A Comparison Script: consists of a sequence of commands that effectively maps every read in the DB \ against a reference set of sequences in the DB \, recording all the found local alignments in the sequence of files \.1.\.las, \.2.\.las, ... where \.\.k.las contains the alignments between all of \ and the k'th block of \. The parameters are exactly the same as for the overlap script save that the -k, -h, and -e defaults are set more stringently for mapping, and the -A, -I , and -H options make no sense as \ and \ are expected to be distinct data sets. If the integers \ and \ are missing then the script produced is for every block in the database \. If \ is present then HPC.daligner produces a script that compares blocks \ through \ (\ = \ if not present) of \ against DAM \. The command scripts output by HPC.daligner and other HPC.\ programs consists of command blocks each of which begins with a comment line (begins with #) followed by a potentially long list of lines each containing a shell command. Command blocks whose comment mentions "jobs" and gives the number of said in parenthesis, we call parallel blocks because each command line in the block can be sent to a node in a cluster for independent execution, i.e. none of the commands in a block depend on another in the block. The remaining command blocks we call house-keeping blocks because they can be executed by the shell on the launch/server node and the commands are either checking the integrity of .las files with LAcheck, or removing intermediate files with rm. Each block should be performed in the order given and should complete before the next block is performed. 
If the -f option is set, then each command block is written to a file with a name of the form \.#.\ where \ is specified by the user in the -f option argument, # gives the order in which the command block in the given file is to be performed in relation to other command block files, and \ is a (very) short symbolic reminder of what the block is doing. For example, "HPC.daligner -fJOBS DB" would produce the files: ``` JOBS.01.OVL JOBS.02.CHECK.OPT JOBS.03.MERGE JOBS.04.CHECK.OPT JOBS.05.RM.OPT ``` The number of command blocks varies as it depends on the number of merging rounds required in the external sort of the .las files. The files with the suffix .OPT are optional and need not be executed albeit we highly recommend that one run all the CHECK blocks. A new -d option requests scripts that organize files into a collection of sub-directories so as not to overwhelm the underlying OS for large genomes. Recall that for a DB divided into N blocks, the daligner will produce N^2 .las-files. With the -d option set, N sub-directories (with respect to the directory HPC.daligner is called in) of the form "work\" for i from 1 to N are created in an initial command block, and then all work files are placed in those sub-directories, with a maximum of 2N files appearing in any sub-directory at any given point in the process. 
Example: ``` // Recall G.db from the example in DAZZ_DB/README > cat G.db files = 1 1862 G Sim blocks = 2 size = 11 cutoff = 0 all = 0 0 0 1024 1024 1862 1862 > HPCdaligner -mdust -t5 G | csh -v // Run the HPCdaligner script # Dazzler jobs (2) dazzler -d -t5 -mdust G.1 G.1 dazzler -d -t5 -mdust G.2 G.1 G.2 # Initial sort jobs (4) LAsort G.1.G.1.*.las && LAmerge G.L1.1.1 G.1.G.1.*.S.las && rm G.1.G.1.*.S.las LAsort G.1.G.2.*.las && LAmerge G.L1.1.2 G.1.G.2.*.S.las && rm G.1.G.2.*.S.las LAsort G.2.G.1.*.las && LAmerge G.L1.2.1 G.2.G.1.*.S.las && rm G.2.G.1.*.S.las LAsort G.2.G.2.*.las && LAmerge G.L1.2.2 G.2.G.2.*.S.las && rm G.2.G.2.*.S.las # Level 1 jobs (2) LAmerge G.1 G.L1.1.1 G.L1.1.2 && rm G.L1.1.1.las G.L1.1.2.las LAmerge G.2 G.L1.2.1 G.L1.2.2 && rm G.L1.2.1.las G.L1.2.2.las > LAshow -c -a:G -w50 G.1 | more // Take a look at the result ! G.1: 34,510 records 1 9 c [ 0.. 1,876] x [ 9,017..10,825] ( 18 trace pts) 12645 A ---------+====> dif/(len1+len2) = 398/(1876+1808) = 21.61% B <====+--------- 9017 1 ..........gtg-cggt--caggggtgcctgc-t-t-atcgcaatgtta |||*||||**||||||||*||||*|*|*||**|*|*|||| 9008 gagaggccaagtggcggtggcaggggtg-ctgcgtcttatatccaggtta 27.5% 35 ta-ctgggtggttaaacttagccaggaaacctgttgaaataa-acggtgg ||*|||||||||||||*|**|*||*|*||||||*|**|||||*|*||||| 9057 tagctgggtggttaaa-tctg-ca-g-aacctg-t--aataacatggtgg 24.0% 83 -ctagtggcttgccgtttacccaacagaagcataatgaaa-tttgaaagt *||||||||*||||||||*||**||||*|||**|||||||*||||*|||| 9100 gctagtggc-tgccgttt-ccgcacag-agc--aatgaaaatttg-aagt 20.0% 131 ggtaggttcctgctgtct-acatacagaacgacggagcgaaaaggtaccg ||*|||||||||||||*|*||||*|*|*||||||||||*||||||||||* 9144 gg-aggttcctgctgt-tcacat-c-ggacgacggagc-aaaaggtacc- 16.0% ... > LAcat G >G.las // Combine G.1.las & G.2.las into a single .las file > LAshow G G | more // Take another look, now at G.las G: 62,654 records 1 9 c [ 0.. 1,876] x [ 9,017..10,825] : < 398 diffs ( 18 trace pts) 1 38 c [ 0.. 7,107] x [ 5,381..12,330] : < 1,614 diffs ( 71 trace pts) 1 49 n [ 5,493..14,521] x [ 0.. 
9,065] : < 2,028 diffs ( 91 trace pts) 1 68 n [12,809..14,521] x [ 0.. 1,758] : < 373 diffs ( 17 trace pts) 1 147 c [ 0..13,352] x [ 854..14,069] : < 2,993 diffs (133 trace pts) 1 231 n [10,892..14,521] x [ 0.. 3,735] : < 816 diffs ( 37 trace pts) 1 292 c [ 3,835..14,521] x [ 0..10,702] : < 2,353 diffs (107 trace pts) 1 335 n [ 7,569..14,521] x [ 0.. 7,033] : < 1,544 diffs ( 70 trace pts) 1 377 c [ 9,602..14,521] x [ 0.. 5,009] : < 1,104 diffs ( 49 trace pts) 1 414 c [ 6,804..14,521] x [ 0.. 7,812] : < 1,745 diffs ( 77 trace pts) 1 415 c [ 0.. 3,613] x [ 7,685..11,224] : < 840 diffs ( 36 trace pts) 1 445 c [ 9,828..14,521] x [ 0.. 4,789] : < 1,036 diffs ( 47 trace pts) 1 464 n [ 0.. 1,942] x [12,416..14,281] : < 411 diffs ( 19 trace pts) ... ``` DALIGNER-master/align.c000066400000000000000000004065471322465224500150170ustar00rootroot00000000000000/******************************************************************************************* * * Fast alignment discovery and trace generation along with utilites for displaying alignments * Based on previously unpublished ideas from 2005, subsequently refined in 2013-14. Basic * idea is to keep a dynamically selected interval of the f.r. waves from my 1986 O(nd) paper. * A recent cool idea is to not record all the details of an alignment while discovering it * but simply record trace points through which the optimal alignment passes every 100bp, * allowing rapid recomputation of the alignment details between trace points. 
* * Author : Gene Myers * First : June 2013 * Current: June 1, 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #undef DEBUG_PASSES // Show forward / backward extension termini for Local_Alignment #undef DEBUG_POINTS // Show trace points #undef DEBUG_WAVE // Show waves of Local_Alignment #undef SHOW_MATCH_WAVE // For waves of Local_Alignment also show # of matches #undef SHOW_TRAIL // Show trace at the end of forward and reverse passes #undef SHOW_TPS // Show trace points as they are encountered in a wave #undef DEBUG_EXTEND // Show waves of Extend_Until_Overlap #undef DEBUG_ALIGN // Show division points of Compute_Trace #undef DEBUG_SCRIPT // Show trace additions for Compute_Trace #undef DEBUG_AWAVE // Show F/R waves of Compute_Trace #undef SHOW_TRACE // Show full trace for Print_Alignment #undef WAVE_STATS /****************************************************************************************\ * * * Working Storage Abstraction * * * \****************************************************************************************/ typedef struct // Hidden from the user, working space for each thread { int vecmax; void *vector; int celmax; void *cells; int pntmax; void *points; int tramax; void *trace; } _Work_Data; Work_Data *New_Work_Data() { _Work_Data *work; work = (_Work_Data *) Malloc(sizeof(_Work_Data),"Allocating work data block"); if (work == NULL) EXIT(NULL); work->vecmax = 0; work->vector = NULL; work->pntmax = 0; work->points = NULL; work->tramax = 0; work->trace = NULL; work->celmax = 0; work->cells = NULL; return ((Work_Data *) work); } static int enlarge_vector(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->vector,max,"Enlarging DP vector"); if (vec == NULL) EXIT(1); work->vecmax = max; work->vector = vec; return (0); } static int 
enlarge_points(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->points,max,"Enlarging point vector"); if (vec == NULL) EXIT(1); work->pntmax = max; work->points = vec; return (0); } static int enlarge_trace(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->trace,max,"Enlarging trace vector"); if (vec == NULL) EXIT(1); work->tramax = max; work->trace = vec; return (0); } void Free_Work_Data(Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; if (work->vector != NULL) free(work->vector); if (work->cells != NULL) free(work->cells); if (work->trace != NULL) free(work->trace); if (work->points != NULL) free(work->points); free(work); } /****************************************************************************************\ * * * ADAPTIVE PATH FINDING * * * \****************************************************************************************/ // Absolute/Fixed Parameters #define BVEC uint64 // Can be uint32 if PATH_LEN <= 32 #define TRIM_LEN 15 // Report as the tip, the last wave maximum for which the last // 2*TRIM_LEN edits are prefix-positive at rate ave_corr*f(bias) // (max value is 20) #define PATH_LEN 60 // Follow the last PATH_LEN columns/edges (max value is 63) // Derivative fixed parameters #define PATH_TOP 0x1000000000000000ll // Must be 1 << PATH_LEN #define PATH_INT 0x0fffffffffffffffll // Must be PATH_TOP-1 #define TRIM_MASK 0x7fff // Must be (1 << TRIM_LEN) - 1 #define TRIM_MLAG 200 // How far can last trim point be behind best point #define WAVE_LAG 30 // How far can worst point be behind the best point static double Bias_Factor[10] = { .690, .690, .690, .690, .780, .850, .900, .933, .966, 1.000 }; // Adjustable paramters typedef struct { double ave_corr; int trace_space; int reach; float freq[4]; int ave_path; int16 *score; int16 *table; } _Align_Spec; /* Fill in bit table: TABLE[x] = 1 iff the alignment modeled by x (1 = 
match, 0 = mismatch) has a non-negative score for every suffix of the alignment under the scoring scheme where match = MATCH and mismatch = -1. MATCH is set so that an alignment with TRIM_PCT matches has zero score ( (1-TRIM_PCT) / TRIM_PCT ). */ #define FRACTION 1000 // Implicit fractional part of scores, i.e. score = x/FRACTION typedef struct { int mscore; int dscore; int16 *table; int16 *score; } Table_Bits; static void set_table(int bit, int prefix, int score, int max, Table_Bits *parms) { if (bit >= TRIM_LEN) { parms->table[prefix] = (int16) (score-max); parms->score[prefix] = (int16) score; } else { if (score > max) max = score; set_table(bit+1,(prefix<<1),score - parms->dscore,max,parms); set_table(bit+1,(prefix<<1) | 1,score + parms->mscore,max,parms); } } /* Create an alignment specification record including path tip tables & values */ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq, int reach) { _Align_Spec *spec; Table_Bits parms; double match; int bias; spec = (_Align_Spec *) Malloc(sizeof(_Align_Spec),"Allocating alignment specification"); if (spec == NULL) EXIT(NULL); spec->ave_corr = ave_corr; spec->trace_space = trace_space; spec->reach = reach; spec->freq[0] = freq[0]; spec->freq[1] = freq[1]; spec->freq[2] = freq[2]; spec->freq[3] = freq[3]; match = freq[0] + freq[3]; if (match > .5) match = 1.-match; bias = (int) ((match+.025)*20.-1.); if (match < .2) { fprintf(stderr,"Warning: Base bias worse than 80/20%% ! (New_Align_Spec)\n"); fprintf(stderr," Capping bias at this ratio.\n"); bias = 3; } spec->ave_path = (int) (PATH_LEN * (1. - Bias_Factor[bias] * (1. - ave_corr))); parms.mscore = (int) (FRACTION * Bias_Factor[bias] * (1. 
- ave_corr)); parms.dscore = FRACTION - parms.mscore; parms.score = (int16 *) Malloc(sizeof(int16)*(TRIM_MASK+1)*2,"Allocating trim table"); if (parms.score == NULL) { free(spec); EXIT(NULL); } parms.table = parms.score + (TRIM_MASK+1); set_table(0,0,0,0,&parms); spec->table = parms.table; spec->score = parms.score; return ((Align_Spec *) spec); } void Free_Align_Spec(Align_Spec *espec) { _Align_Spec *spec = (_Align_Spec *) espec; free(spec->score); free(spec); } double Average_Correlation(Align_Spec *espec) { return (((_Align_Spec *) espec)->ave_corr); } int Trace_Spacing(Align_Spec *espec) { return (((_Align_Spec *) espec)->trace_space); } float *Base_Frequencies(Align_Spec *espec) { return (((_Align_Spec *) espec)->freq); } int Overlap_If_Possible(Align_Spec *espec) { return (((_Align_Spec *) espec)->reach); } /****************************************************************************************\ * * * LOCAL ALIGNMENT FINDER: forward_/reverse_wave and Local_Alignment * * * \****************************************************************************************/ #ifdef WAVE_STATS static int64 MAX, TOT, NWV; static int64 RESTARTS; void Init_Stats() { MAX = TOT = NWV = 0; RESTARTS = 0; } void Print_Stats() { printf("\nMax = %lld Ave = %.1f # = %lld\n",MAX,(1.*TOT)/NWV,NWV); printf("\nRestarts = %lld\n",RESTARTS); } #endif #ifdef DEBUG_WAVE static void print_wave(int *V, int *M, int low, int hgh, int besta) { int k, bestk; (void) M; printf(" [%6d,%6d]: ",low,hgh); for (k = low; k <= hgh; k++) { if (besta == V[k]) bestk = k; // printf(" %3d",(V[k]+k)/2); printf(" %3d",besta-V[k]); } printf(" : %d (%d,%d)\n",besta,(besta+bestk)/2,(besta-bestk)/2); #ifdef SHOW_MATCH_WAVE printf(" "); for (k = low; k <= hgh; k++) printf(" %3d",M[k]); printf("\n"); #endif fflush(stdout); } #endif /* At each furthest reaching point, keep a-coordinate of point (V), bitvector recording the last TRIM_LEN columns of the implied alignment (T), and the # of matches (1-bits) in the bitvector 
(M). */ typedef struct { int ptr; int diag; int diff; int mark; } Pebble; static int VectorEl = 6*sizeof(int) + sizeof(BVEC); static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath, int *mind, int maxd, int mida, int minp, int maxp, int aoff, int boff) { char *aseq = align->aseq; char *bseq = align->bseq; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *HB; int *_HA, *_HB; int *NA, *NB; int *_NA, *_NB; Pebble *cells; int avail, cmax; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int REACH = spec->reach; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha, trimhb; int morea, morey, mored; int moreha, morehb; int more, morem, lasta; int aclip, bclip; hgh = maxd; low = *mind; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEl; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; } /* Compute 0-wave starting from mid-line */ more = 1; aclip = INT32_MAX; bclip = -INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; trimhb = morehb = 1; morem = -1; { int k; char *a; a = aseq + hgh; for (k = hgh; k >= low; k--) { int y, c, d; int ha, hb; int na, nb; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = (((y+k)+(TRACE_SPACE-aoff))/TRACE_SPACE-1)*TRACE_SPACE+aoff; #ifdef 
SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; nb = ((y+(TRACE_SPACE-boff))/TRACE_SPACE-1)*TRACE_SPACE+boff; #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,-1,k,nb); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = nb; hb = avail++; nb += TRACE_SPACE; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; } c = (y << 1) + k; while (y+k >= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; } while (y >= nb) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = 0; pb->mark = nb; hb = avail++; nb += TRACE_SPACE; } if (c > besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; trimhb = hb; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; HB[k] = hb; NA[k] = na; NB[k] = nb; a -= 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; morehb = HB[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = 
M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } #ifdef DEBUG_WAVE printf("\nFORWARD WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif /* Compute successive waves until no furthest reaching points remain */ while (more && lasta >= besta - TRIM_MLAG) { int k, n; int ua, ub; BVEC t; int am, ac, ap; char *a; low -= 1; hgh += 1; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move; int64 vd, md, had, hbd, nad, nbd, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEl)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEl; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (hbd < 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (nbd < 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nbd > 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (nad > 0) 
memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (hbd > 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; } if (low >= minp) { NA[low] = NA[low+1]; NB[low] = NB[low+1]; V[low] = -1; } else low += 1; if (hgh <= maxp) { NA[hgh] = NA[hgh-1]; NB[hgh] = NB[hgh-1]; V[hgh] = am = -1; } else am = V[--hgh]; dif += 1; ac = V[hgh+1] = V[low-1] = -1; a = aseq + hgh; t = PATH_INT; n = PATH_LEN; ua = ub = -1; for (k = hgh; k >= low; k--) { int y, m; int ha, hb; int c, d; BVEC b; Pebble *pb; ap = ac; ac = am; am = V[d = k-1]; if (ac < am) if (am < ap) { c = ap+1; m = n; b = t; ha = ua; hb = ub; } else { c = am+1; m = M[d]; b = T[d]; ha = HA[d]; hb = HB[d]; } else if (ac < ap) { c = ap+1; m = n; b = t; ha = ua; hb = ub; } else { c = ac+2; m = M[k]; b = T[k]; ha = HA[k]; hb = HB[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k >= NA[k]) { if (cells[ha].mark < NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] += TRACE_SPACE; } while (y >= NB[k]) { if (cells[hb].mark < 
NB[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = dif; pb->mark = NB[k]; hb = avail++; } NB[k] += TRACE_SPACE; } if (c > besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; trimhb = hb; } } } t = T[k]; n = M[k]; ua = HA[k]; ub = HB[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; HB[k] = hb; a -= 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta-besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; morehb = HB[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } n = besta - WAVE_LAG; while (hgh >= low) if (V[hgh] < n) hgh -= 1; else { while (V[low] < n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; uint16 *btrace = (uint16 *) bpath->trace; int atlen, btlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0 && REACH) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; trimhb = morehb; } else trimx = trima-trimy; atlen = btlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = (mida-k)/2; e = 0; #ifdef SHOW_TRAIL printf(" A path = 
(%5d,%5d)\n",(mida+k)/2,b); fflush(stdout); #endif for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; atrace[atlen++] = (uint16) (d-e); atrace[atlen++] = (uint16) (a-b); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[atlen++] = (uint16) (trimd-e); atrace[atlen++] = (uint16) (trimy-b); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b)); atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } a = -1; for (h = trimhb; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = (mida+k)/2; e = 0; low = k; #ifdef SHOW_TRAIL printf(" B path = (%5d,%5d)\n",b,(mida-k)/2); fflush(stdout); #endif for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark + k; d = cells[h].diff; btrace[btlen++] = (uint16) (d-e); btrace[btlen++] = (uint16) (a-b); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,a-b); fflush(stdout); #endif b = a; e = d; } if (b-k != trimy) { btrace[btlen++] = (uint16) (trimd-e); btrace[btlen++] = (uint16) (trimx-b); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout); #endif } else if (b != trimx) { btrace[btlen-1] = (uint16) (btrace[btlen-1] + (trimx-b)); btrace[btlen-2] = (uint16) (btrace[btlen-2] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout); #endif } apath->aepos = trimx; apath->bepos = trimy; apath->diffs = trimd; apath->tlen = atlen; bpath->tlen = btlen; } *mind = low; return (0); } /*** Reverse Wave ***/ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment 
*align, Path *bpath, int mind, int maxd, int mida, int minp, int maxp, int aoff, int boff) { char *aseq = align->aseq - 1; char *bseq = align->bseq - 1; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *HB; int *_HA, *_HB; int *NA, *NB; int *_NA, *_NB; Pebble *cells; int avail, cmax; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int REACH = spec->reach; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha, trimhb; int morea, morey, mored; int moreha, morehb; int more, morem, lasta; int aclip, bclip; hgh = maxd; low = mind; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEl; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; } more = 1; aclip = -INT32_MAX; bclip = INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; trimhb = morehb = 1; morem = -1; { int k; char *a; a = aseq + low; for (k = low; k <= hgh; k++) { int y, c, d; int ha, hb; int na, nb; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = (((y+k)+(TRACE_SPACE-aoff)-1)/TRACE_SPACE-1)*TRACE_SPACE+aoff; #ifdef SHOW_TPS printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y+k; ha = avail++; nb = 
((y+(TRACE_SPACE-boff)-1)/TRACE_SPACE-1)*TRACE_SPACE+boff; #ifdef SHOW_TPS printf(" B %d: -1,%d,0,%d\n",avail,k,nb+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y; hb = avail++; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; } c = (y << 1) + k; while (y+k <= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na -= TRACE_SPACE; } while (y <= nb) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = 0; pb->mark = nb; hb = avail++; nb -= TRACE_SPACE; } if (c < besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; trimhb = hb; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; HB[k] = hb; NA[k] = na; NB[k] = nb; a += 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; morehb = HB[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } #ifdef DEBUG_WAVE printf("\nREVERSE WAVE:\n"); 
print_wave(V,M,low,hgh,besta); #endif while (more && lasta <= besta + TRIM_MLAG) { int k, n; int ua, ub; BVEC t; int am, ac, ap; char *a; low -= 1; hgh += 1; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move, vd, md, had, hbd, nad, nbd, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEl)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEl; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (hbd < 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (nbd < 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nbd > 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (hbd > 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - 
move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; } if (low >= minp) { NA[low] = NA[low+1]; NB[low] = NB[low+1]; V[low] = ap = INT32_MAX; } else ap = V[++low]; if (hgh <= maxp) { NA[hgh] = NA[hgh-1]; NB[hgh] = NB[hgh-1]; V[hgh] = INT32_MAX; } else hgh -= 1; dif += 1; ac = V[hgh+1] = V[low-1] = INT32_MAX; a = aseq + low; t = PATH_INT; n = PATH_LEN; ua = ub = -1; for (k = low; k <= hgh; k++) { int y, m; int ha, hb; int c, d; BVEC b; Pebble *pb; am = ac; ac = ap; ap = V[d = k+1]; if (ac > ap) if (ap > am) { c = am-1; m = n; b = t; ha = ua; hb = ub; } else { c = ap-1; m = M[d]; b = T[d]; ha = HA[d]; hb = HB[d]; } else if (ac > am) { c = am-1; m = n; b = t; ha = ua; hb = ub; } else { c = ac-2; m = M[k]; b = T[k]; ha = HA[k]; hb = HB[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k <= NA[k]) { if (cells[ha].mark > NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] -= TRACE_SPACE; } while (y <= NB[k]) { if (cells[hb].mark > NB[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS 
printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = dif; pb->mark = NB[k]; hb = avail++; } NB[k] -= TRACE_SPACE; } if (c < besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; trimhb = hb; } } } t = T[k]; n = M[k]; ua = HA[k]; ub = HB[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; HB[k] = hb; a += 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; morehb = HB[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } n = besta + WAVE_LAG; while (hgh >= low) if (V[hgh] > n) hgh -= 1; else { while (V[low] > n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; uint16 *btrace = (uint16 *) bpath->trace; int atlen, btlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0 && REACH) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; trimhb = morehb; } else trimx = trima-trimy; atlen = btlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark - k; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout); #endif if ((b+k)%TRACE_SPACE != aoff) { h = cells[h].ptr; if (h < 0) { a = trimy; d = trimd; } else { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): 
%3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif if (apath->tlen == 0) { atrace[--atlen] = (uint16) (b-a); atrace[--atlen] = (uint16) (d-e); } else { atrace[1] = (uint16) (atrace[1] + (b-a)); atrace[0] = (uint16) (atrace[0] + (d-e)); } b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; atrace[--atlen] = (uint16) (b-a); d = cells[h].diff; atrace[--atlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[--atlen] = (uint16) (b-trimy); atrace[--atlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy)); atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } } a = -1; for (h = trimhb; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark + k; e = 0; #ifdef SHOW_TRAIL printf(" B path = (%5d,%5d)\n",b,b-k); fflush(stdout); #endif if ((b-k)%TRACE_SPACE != boff) { h = cells[h].ptr; if (h < 0) { a = trimx; d = trimd; } else { k = cells[h].diag; a = cells[h].mark + k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); #endif if (bpath->tlen == 0) { btrace[--btlen] = (uint16) (b-a); btrace[--btlen] = (uint16) (b-a); } else { btrace[1] = (uint16) (btrace[1] + (b-a)); btrace[0] = (uint16) (btrace[0] + (d-e)); } b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark + k; btrace[--btlen] = (uint16) (b-a); d = cells[h].diff; btrace[--btlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); #endif b = a; e = d; } 
if (b-k != trimy) { btrace[--btlen] = (uint16) (b-trimx); btrace[--btlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); #endif } else if (b != trimx) { btrace[btlen+1] = (uint16) (btrace[btlen+1] + (b-trimx)); btrace[btlen] = (uint16) (btrace[btlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); #endif } } apath->abpos = trimx; apath->bbpos = trimy; apath->diffs = apath->diffs + trimd; apath->tlen = apath->tlen - atlen; apath->trace = atrace + atlen; bpath->tlen = bpath->tlen - btlen; bpath->trace = btrace + btlen; } return (0); } /* Find the longest local alignment between aseq and bseq through (xcnt,ycnt) See associated .h file for the precise definition of the interface. */ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec, int low, int hgh, int anti, int lbord, int hbord) { _Work_Data *work = ( _Work_Data *) ework; _Align_Spec *spec = (_Align_Spec *) espec; Path *apath, *bpath; int aoff, boff; int minp, maxp; int selfie; { int alen, blen; int maxtp, wsize; alen = align->alen; blen = align->blen; if (hgh-low >= 7500) wsize = VectorEl*(hgh-low+1); else wsize = VectorEl*10000; if (wsize >= work->vecmax) if (enlarge_vector(work,wsize)) EXIT(NULL); if (alen < blen) maxtp = 2*(blen/spec->trace_space+2); else maxtp = 2*(alen/spec->trace_space+2); wsize = 4*maxtp*sizeof(uint16) + sizeof(Path); if (wsize > work->pntmax) if (enlarge_points(work,wsize)) EXIT(NULL); apath = align->path; bpath = (Path *) work->points; apath->trace = ((uint16 *) (bpath+1)) + maxtp; bpath->trace = ((uint16 *) apath->trace) + 2*maxtp; } #ifdef DEBUG_PASSES printf("\n"); #endif selfie = (align->aseq == align->bseq); if (lbord < 0) { if (selfie && low >= 0) minp = 1; else minp = -INT32_MAX; } else minp = low-lbord; if (hbord < 0) { if (selfie && hgh <= 0) maxp = -1; else maxp = INT32_MAX; } else maxp = hgh+hbord; if 
(ACOMP(align->flags)) { aoff = align->alen % spec->trace_space; boff = 0; } else if (COMP(align->flags)) { aoff = 0; boff = align->blen % spec->trace_space; } else { aoff = 0; boff = 0; } if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp,aoff,boff)) EXIT(NULL); #ifdef DEBUG_PASSES printf("F1 (%d,%d) ~ %d => (%d,%d) %d\n", (2*anti+(low+hgh))/4,(anti-(low+hgh))/4,hgh-low, apath->aepos,apath->bepos,apath->diffs); #endif if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp,aoff,boff)) EXIT(NULL); #ifdef DEBUG_PASSES printf("R1 (%d,%d) => (%d,%d) %d\n", (anti+low)/2,(anti-low)/2,apath->abpos,apath->bbpos,apath->diffs); #endif bpath->diffs = apath->diffs; if (ACOMP(align->flags)) { uint16 *trace = (uint16 *) apath->trace; uint16 p; int i, j; bpath->aepos = apath->bepos; bpath->bepos = apath->aepos; bpath->abpos = apath->bbpos; bpath->bbpos = apath->abpos; apath->abpos = align->alen - bpath->bepos; apath->bbpos = align->blen - bpath->aepos; apath->aepos = align->alen - bpath->bbpos; apath->bepos = align->blen - bpath->abpos; i = apath->tlen-2; j = 0; while (j < i) { p = trace[i]; trace[i] = trace[j]; trace[j] = p; p = trace[i+1]; trace[i+1] = trace[j+1]; trace[j+1] = p; i -= 2; j += 2; } } else if (COMP(align->flags)) { uint16 *trace = (uint16 *) bpath->trace; uint16 p; int i, j; bpath->abpos = align->blen - apath->bepos; bpath->bbpos = align->alen - apath->aepos; bpath->aepos = align->blen - apath->bbpos; bpath->bepos = align->alen - apath->abpos; i = bpath->tlen-2; j = 0; while (j < i) { p = trace[i]; trace[i] = trace[j]; trace[j] = p; p = trace[i+1]; trace[i+1] = trace[j+1]; trace[j+1] = p; i -= 2; j += 2; } } else { bpath->aepos = apath->bepos; bpath->bepos = apath->aepos; bpath->abpos = apath->bbpos; bpath->bbpos = apath->abpos; } #ifdef DEBUG_POINTS { uint16 *trace = (uint16 *) apath->trace; int a, h; printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos); printf(" %c\n",((COMP(align->flags) || 
ACOMP(align->flags)) ? 'c' : 'n')); a = apath->bbpos; for (h = 1; h < apath->tlen; h += 2) { int dif = trace[h-1]; int del = trace[h]; a += del; printf(" %d / %d (%d)\n",dif,del,a); } } { uint16 *trace = (uint16 *) bpath->trace; int a, h; printf("\nB-path (%d,%d)->(%d,%d)",bpath->abpos,bpath->bbpos,bpath->aepos,bpath->bepos); printf(" %c [%d,%d]\n",((COMP(align->flags) || ACOMP(align->flags)) ? 'c' : 'n'), align->blen,align->alen); a = bpath->bbpos; for (h = 1; h < bpath->tlen; h += 2) { int dif = trace[h-1]; int del = trace[h]; a += del; printf(" %d / %d (%d)\n",dif,del,a); } } #endif return (bpath); } /****************************************************************************************\ * * * EXTENSION VERSION OF LOCAL ALIGNMENT * * * \****************************************************************************************/ static int VectorEn = 4*sizeof(int) + sizeof(BVEC); static int forward_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align, int midd, int mida, int minp, int maxp) { char *aseq = align->aseq; char *bseq = align->bseq; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *NA; int *_HA, *_NA; Pebble *cells; int avail, cmax; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha; int morea, morey, mored; int moreha; int more, morem, lasta; int aclip, bclip; hgh = midd; low = midd; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEn; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; } /* Compute 0-wave starting from mid-line */ more = 1; aclip = INT32_MAX; bclip = 
-INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; morem = -1; { int k; char *a; a = aseq + hgh; for (k = hgh; k >= low; k--) { int y, c, d; int ha, na; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k)/TRACE_SPACE)*TRACE_SPACE; #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; } c = (y << 1) + k; while (y+k >= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; } if (c > besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; NA[k] = na; a -= 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } #ifdef DEBUG_WAVE printf("\nFORWARD WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif /* Compute 
successive waves until no furthest reaching points remain */ while (more && lasta >= besta - TRIM_MLAG) { int k, n; int ua; BVEC t; int am, ac, ap; char *a; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move; int64 vd, md, had, nad, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEn)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEn; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; } if (low > minp) { low -= 1; NA[low] = NA[low+1]; V[low] = -1; } if (hgh < maxp) { hgh += 1; NA[hgh] = NA[hgh-1]; V[hgh] = am = -1; } else am = V[hgh]; dif += 1; ac = V[hgh+1] = V[low-1] = -1; a = aseq + hgh; t = PATH_INT; n = PATH_LEN; ua = -1; for (k = hgh; k >= 
low; k--) { int y, m; int ha; int c, d; BVEC b; Pebble *pb; ap = ac; ac = am; am = V[d = k-1]; if (ac < am) if (am < ap) { c = ap+1; m = n; b = t; ha = ua; } else { c = am+1; m = M[d]; b = T[d]; ha = HA[d]; } else if (ac < ap) { c = ap+1; m = n; b = t; ha = ua; } else { c = ac+2; m = M[k]; b = T[k]; ha = HA[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k >= NA[k]) { if (cells[ha].mark < NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] += TRACE_SPACE; } if (c > besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; } } } t = T[k]; n = M[k]; ua = HA[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; a -= 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta-besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } n = besta - WAVE_LAG; while (hgh >= low) if (V[hgh] < n) hgh -= 1; else { while (V[low] < n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) 
/* reverse_extend: furthest-reaching wave extension run in the backward direction.
 *
 *   Starting from the point on diagonal `midd` (k = a-b) and anti-diagonal `mida` (a+b),
 *   waves are advanced toward smaller sequence positions (the match loops decrement y)
 *   over diagonals kept within [minp,maxp].  The wave arrays V, M, HA, NA, T live in
 *   work->vector and the trace-point "pebbles" in work->cells; both are grown on demand.
 *
 *   On completion the routine fills in apath->abpos, apath->bbpos, apath->diffs,
 *   apath->tlen and apath->trace (the trace is written backwards in front of the
 *   incoming apath->trace pointer, which is then moved to its first entry) and returns 0.
 *   Allocation failures leave via EXIT(1).
 *
 *   NOTE(review): a base value of 4 is treated as a stop marker in both sequences --
 *   presumably the read boundary sentinel; confirm against the sequence loader.
 */
static int reverse_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align,
                          int midd, int mida, int minp, int maxp)
{ char *aseq  = align->aseq - 1;    //  Shifted by one: index p addresses the base before position p
  char *bseq  = align->bseq - 1;
  Path *apath = align->path;

  int     hgh, low, dif;            //  Current diagonal band [low,hgh] and wave number
  int     vlen, vmin, vmax;         //  Capacity of each wave array and the diagonals it spans
  int    *V, *M;                    //  Diagonal-indexed views (offset by -vmin) of ...
  int    *_V, *_M;                  //    ... the raw arrays carved out of work->vector
  BVEC   *T;
  BVEC   *_T;

  int    *HA, *NA;                  //  Per diagonal: last pebble index, next trace mark
  int    *_HA, *_NA;
  Pebble *cells;                    //  Pebble pool (work->cells), avail used of cmax
  int     avail, cmax;

  int     TRACE_SPACE = spec->trace_space;
  int     PATH_AVE    = spec->ave_path;
  int16  *SCORE       = spec->score;
  int16  *TABLE       = spec->table;

  int     besta, besty;             //  Smallest anti-diagonal value reached and its y
  int     trima, trimy, trimd;      //  Last point accepted by the TABLE/SCORE trim test
  int     trimha;
  int     morea, morey, mored;      //  Best point recorded on a clipped diagonal
  int     moreha;
  int     more, morem, lasta;
  int     aclip, bclip;             //  Diagonals on which a stop marker was met in a / b

  hgh = midd;
  low = midd;
  dif = 0;

  //  Carve the five wave arrays out of work->vector and center them on the start diagonal

  { int span, wing;

    span = (hgh-low)+1;
    vlen = work->vecmax/VectorEn;
    wing = (vlen - span)/2;
    vmin = low - wing;
    vmax = hgh + wing;

    _V  = ((int *) work->vector);
    _M  = _V + vlen;
    _HA = _M + vlen;
    _NA = _HA + vlen;
    _T  = ((BVEC *) (_NA + vlen));

    V  = _V-vmin;
    M  = _M-vmin;
    HA = _HA-vmin;
    NA = _NA-vmin;
    T  = _T-vmin;

    cells = (Pebble *) (work->cells);
    cmax  = work->celmax;
    avail = 0;
  }

  more  = 1;
  aclip = -INT32_MAX;
  bclip =  INT32_MAX;

  besta  = trima = morea = lasta = mida;
  besty  = trimy = morey = (mida-hgh) >> 1;
  trimd  = mored = 0;
  trimha = moreha = 0;
  morem  = -1;

  //  Wave 0: slide backwards along each starting diagonal while the bases match

  { int   k;
    char *a;

    a = aseq + low;
    for (k = low; k <= hgh; k++)
      { int     y, c, d;
        int     ha, na;
        Pebble *pb;

        y = (mida-k) >> 1;

        if (avail >= cmax-1)
          { cmax  = ((int) (avail*1.2)) + 10000;
            cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
            if (cells == NULL)
              EXIT(1);
            work->celmax = cmax;
            work->cells  = (void *) cells;
          }

        //  na = largest multiple of TRACE_SPACE strictly below y+k

        na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE;
#ifdef SHOW_TPS
        printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout);
#endif
        //  Root pebble of this diagonal's chain (ptr = -1) placed at the start point

        pb = cells+avail;
        pb->ptr  = -1;
        pb->diag = k;
        pb->diff = 0;
        pb->mark = y+k;
        ha  = avail++;

        while (1)
          { c = bseq[y];
            if (c == 4)
              { more = 0;
                if (bclip > k)
                  bclip = k;
                break;
              }
            d = a[y];
            if (c != d)
              { if (d == 4) 
                  { more  = 0;
                    aclip = k;
                  }
                break;
              }
            y -= 1;
          }
        c = (y << 1) + k;

        //  Drop a pebble for every trace mark the slide passed

        while (y+k <= na)
          { if (avail >= cmax)
              { cmax  = ((int) (avail*1.2)) + 10000;
                cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
                if (cells == NULL)
                  EXIT(1);
                work->celmax = cmax;
                work->cells  = (void *) cells;
              }
#ifdef SHOW_TPS
            printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout);
#endif
            pb = cells+avail;
            pb->ptr  = ha;
            pb->diag = k;
            pb->diff = 0;
            pb->mark = na;
            ha  = avail++;
            na -= TRACE_SPACE;
          }

        if (c < besta)
          { besta  = trima = lasta = c;
            besty  = trimy = y;
            trimha = ha;
          }

        V[k]  = c;
        T[k]  = PATH_INT;
        M[k]  = PATH_LEN;
        HA[k] = ha;
        NA[k] = na;

        a += 1;
      }
  }

  //  A stop marker was met: shrink the band past the clipped diagonals, remembering the
  //    best clipped point, and continue only if the best point itself is not at a marker

  if (more == 0)
    { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
        more = 1;
      if (low <= aclip)
        { low = aclip+1;
          if (morem <= M[aclip])
            { morem  = M[aclip];
              morea  = V[aclip];
              morey  = (morea - aclip)/2;
              moreha = HA[aclip];
            }
        }
      if (hgh >= bclip)
        { hgh = bclip-1;
          if (morem <= M[bclip])
            { morem  = M[bclip];
              morea  = V[bclip];
              morey  = (morea - bclip)/2;
              moreha = HA[bclip];
            }
        }
      aclip = -INT32_MAX;
      bclip =  INT32_MAX;
    }

#ifdef DEBUG_WAVE
  printf("\nREVERSE WAVE:\n");
  print_wave(V,M,low,hgh,besta);
#endif

  //  Main loop: one iteration per additional difference (dif)

  while (more && lasta <= besta + TRIM_MLAG)
    { int    k, n;
      int    ua;
      BVEC   t;
      int    am, ac, ap;
      char  *a;

      //  Band has reached the edge of the arrays: enlarge if mostly full, then re-center.
      //    The two memmove groups run in opposite orders depending on the shift direction.

      if (low <= vmin || hgh >= vmax)
        { int   span, wing;
          int64 move, vd, md, had, nad, td;

          span = (hgh-low)+1;
          if (.8*vlen < span)
            { if (enlarge_vector(work,vlen*VectorEn))
                EXIT(1);

              move = ((void *) _V) - work->vector;
              vlen = work->vecmax/VectorEn;

              _V  = (int *) work->vector;
              _M  = _V + vlen;
              _HA = _M + vlen;
              _NA = _HA + vlen;
              _T  = ((BVEC *) (_NA + vlen));
            }
          else
            move = 0;

          wing = (vlen - span)/2;

          vd  = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move);
          md  = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move);
          had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move);
          nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move);
          td  = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move);

          if (vd < 0)
            memmove( _V+wing,  ((void *) ( V+low)) - move, span*sizeof(int));
          if (md < 0)
            memmove( _M+wing,  ((void *) ( M+low)) - move, span*sizeof(int));
          if (had < 0)
            memmove(_HA+wing,  ((void *) (HA+low)) - move, span*sizeof(int));
          if (nad < 0)
            memmove(_NA+wing,  ((void *) (NA+low)) - move, span*sizeof(int));
          if (td < 0)
            memmove( _T+wing,  ((void *) ( T+low)) - move, span*sizeof(BVEC));

          if (td > 0)
            memmove( _T+wing,  ((void *) ( T+low)) - move, span*sizeof(BVEC));
          if (nad > 0)
            memmove(_NA+wing,  ((void *) (NA+low)) - move, span*sizeof(int));
          if (had > 0)
            memmove(_HA+wing,  ((void *) (HA+low)) - move, span*sizeof(int));
          if (md > 0)
            memmove( _M+wing,  ((void *) ( M+low)) - move, span*sizeof(int));
          if (vd > 0)
            memmove( _V+wing,  ((void *) ( V+low)) - move, span*sizeof(int));

          vmin = low-wing;
          vmax = hgh+wing;

          V  =  _V-vmin;
          M  =  _M-vmin;
          HA = _HA-vmin;
          NA = _NA-vmin;
          T  =  _T-vmin;
        }

      //  Widen the band by one diagonal on each side, unless limited by minp / maxp

      if (low > minp)
        { low -= 1;
          NA[low] = NA[low+1];
          V[low]  = ap = INT32_MAX;
        }
      else
        ap = V[low];
      if (hgh < maxp)
        { hgh += 1;
          NA[hgh] = NA[hgh-1];
          V[hgh] = INT32_MAX;
        }
      dif += 1;

      ac = V[hgh+1] = V[low-1] = INT32_MAX;
      a  = aseq + low;
      t  = PATH_INT;
      n  = PATH_LEN;
      ua = -1;
      for (k = low; k <= hgh; k++)
        { int     y, m;
          int     ha;
          int     c, d;
          BVEC    b;
          Pebble *pb;

          //  am/ac/ap = previous-wave values on diagonals k-1, k, k+1; the smallest of
          //    am-1, ac-2, ap-1 is taken together with its M, T and HA state

          am = ac;
          ac = ap;
          ap = V[d = k+1];

          if (ac > ap)
            if (ap > am)
              { c = am-1;
                m  = n;
                b  = t;
                ha = ua;
              }
            else
              { c  = ap-1;
                m  = M[d];
                b  = T[d];
                ha = HA[d];
              }
          else
            if (ac > am)
              { c  = am-1;
                m  = n;
                b  = t;
                ha = ua;
              }
            else
              { c  = ac-2;
                m  = M[k];
                b  = T[k];
                ha = HA[k];
              }

          //  Shift a 0 into the path bit-vector for the difference; m is lowered when
          //    the bit leaving at PATH_TOP was set

          if ((b & PATH_TOP) != 0)
            m -= 1;
          b <<= 1;

          y = (c-k) >> 1;
          while (1)
            { c = bseq[y];
              if (c == 4)
                { more = 0;
                  if (bclip > k)
                    bclip = k;
                  break;
                }
              d = a[y];
              if (c != d)
                { if (d == 4) 
                    { more  = 0;
                      aclip = k;
                    }
                  break;
                }
              y -= 1;
              if ((b & PATH_TOP) == 0)
                m += 1;
              b = (b << 1) | 1;
            }
          c = (y << 1) + k;

          //  Record a pebble for each trace mark crossed, skipping marks already
          //    covered by the inherited chain

          while (y+k <= NA[k])
            { if (cells[ha].mark > NA[k])
                { if (avail >= cmax)
                    { cmax  = ((int) (avail*1.2)) + 10000;
                      cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
                                                       "Reallocating trace cells");
                      if (cells == NULL)
                        EXIT(1);
                      work->celmax = cmax;
                      work->cells  = (void *) cells;
                    }
#ifdef SHOW_TPS
                  printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout);
#endif
                  pb = cells+avail;
                  pb->ptr  = ha;
                  pb->diag = k;
                  pb->diff = dif;
                  pb->mark = NA[k];
                  ha = avail++;
                }
              NA[k] -= TRACE_SPACE;
            }

          //  New furthest point; it becomes the trim point only if it passes the
          //    PATH_AVE and TABLE/SCORE tests below

          if (c < besta)
            { besta = c;
              besty = y;
              if (m >= PATH_AVE)
                { lasta = c;
                  if (TABLE[b & TRIM_MASK] >= 0)
                    if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0)
                      { trima  = c;
                        trimy  = y;
                        trimd  = dif;
                        trimha = ha;
                      }
                }
            }

          //  Save diagonal k's old state for the next diagonal's "am" candidate

          t  = T[k];
          n  = M[k];
          ua = HA[k];
          V[k]  = c;
          T[k]  = b;
          M[k]  = m;
          HA[k] = ha;

          a += 1;
        }

      if (more == 0)
        { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
            more = 1;
          if (low <= aclip)
            { low = aclip+1;
              if (morem <= M[aclip])
                { morem  = M[aclip];
                  morea  = V[aclip];
                  morey  = (morea - aclip)/2;
                  mored  = dif;
                  moreha = HA[aclip];
                }
            }
          if (hgh >= bclip)
            { hgh = bclip-1;
              if (morem <= M[bclip])
                { morem  = M[bclip];
                  morea  = V[bclip];
                  morey  = (morea - bclip)/2;
                  mored  = dif;
                  moreha = HA[bclip];
                }
            }
          aclip = -INT32_MAX;
          bclip =  INT32_MAX;
        }

      //  Trim band ends that lag more than WAVE_LAG behind the best point

      n = besta + WAVE_LAG;
      while (hgh >= low)
        if (V[hgh] > n)
          hgh -= 1;
        else
          { while (V[low] > n)
              low += 1;
            break;
          }

#ifdef WAVE_STATS
      k = (hgh-low)+1;
      if (k > MAX)
        MAX = k;
      TOT += k;
      NWV += 1;
#endif

#ifdef DEBUG_WAVE
      print_wave(V,M,low,hgh,besta);
#endif
    }

  //  Convert the pebble chain ending at the chosen end point into trace pairs

  { uint16 *atrace = (uint16 *) apath->trace;
    int     atlen;
    int     trimx;
    int     a, b, k, h;
    int     d, e;

    //  A point recorded on a clipped diagonal takes precedence over the trim point

    if (morem >= 0)
      { trimx  = morea-morey;
        trimy  = morey;
        trimd  = mored;
        trimha = moreha;
      }
    else
      trimx = trima-trimy;

    atlen = 0;

    //  Reverse the ptr links in place so the chain can be walked from its root

    a = -1;
    for (h = trimha; h >= 0; h = b)
      { b = cells[h].ptr;
        cells[h].ptr = a;
        a = h;
      }
    h = a;

    k = cells[h].diag;
    b = cells[h].mark - k;
    e = 0;
#ifdef SHOW_TRAIL
    printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout);
#endif
    //  Root not on a trace mark: emit the partial first segment

    if ((b+k)%TRACE_SPACE != 0)
      { h = cells[h].ptr;
        if (h < 0)
          { a = trimy;
            d = trimd;
          }
        else
          { k = cells[h].diag;
            a = cells[h].mark - k;
            d = cells[h].diff;
          }
#ifdef SHOW_TRAIL
        printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout);
#endif
        atrace[--atlen] = (uint16) (b-a);
        atrace[--atlen] = (uint16) (d-e);
        b = a;
        e = d;
      }
    if (h >= 0)
      { //  Entries are written at decreasing (negative) indices: b-delta then diff-delta

        for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
          { k = cells[h].diag;
            a = cells[h].mark - k;
            atrace[--atlen] = (uint16) (b-a);
            d = cells[h].diff;
            atrace[--atlen] = (uint16) (d-e);
#ifdef SHOW_TRAIL
            printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout);
#endif
            b = a;
            e = d;
          }
        //  Final stretch to the end point: a new pair, or folded into the last pair
        //    when the last pebble already sits at a-coordinate trimx

        if (b+k != trimx)
          { atrace[--atlen] = (uint16) (b-trimy);
            atrace[--atlen] = (uint16) (trimd-e);
#ifdef SHOW_TRAIL
            printf("           (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout);
#endif
          }
        else if (b != trimy)
          { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy));
            atrace[atlen]   = (uint16) (atrace[atlen]   + (trimd-e));
#ifdef SHOW_TRAIL
            printf("         @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout);
#endif
          }
      }

    apath->abpos = trimx;
    apath->bbpos = trimy;
    apath->diffs = trimd;
    apath->tlen  = - atlen;           //  atlen is <= 0 here
    apath->trace = atrace + atlen;    //  trace now starts at the first entry written
  }

  return (0);
}
'c' : 'n')); a = apath->bbpos; for (h = 1; h < apath->tlen; h += 2) { int dif = trace[h-1]; int del = trace[h]; a += del; printf(" %d / %d (%d)\n",dif,del,a); } } #endif return (0); } /****************************************************************************************\ * * * OVERLAP MANIPULATION * * * \****************************************************************************************/ static int64 PtrSize = sizeof(void *); static int64 OvlIOSize = sizeof(Overlap) - sizeof(void *); int Read_Overlap(FILE *input, Overlap *ovl) { if (fread( ((char *) ovl) + PtrSize, OvlIOSize, 1, input) != 1) return (1); return (0); } int Read_Trace(FILE *input, Overlap *ovl, int tbytes) { if (tbytes > 0 && ovl->path.tlen > 0) { if (fread(ovl->path.trace, tbytes*ovl->path.tlen, 1, input) != 1) return (1); } return (0); } int Write_Overlap(FILE *output, Overlap *ovl, int tbytes) { if (fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output) != 1) return (1); if (ovl->path.trace != NULL) if (fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output) != (size_t) ovl->path.tlen) return (1); return (0); } void Compress_TraceTo8(Overlap *ovl) { uint16 *t16 = (uint16 *) ovl->path.trace; uint8 *t8 = (uint8 *) ovl->path.trace; int j; for (j = 0; j < ovl->path.tlen; j++) t8[j] = (uint8) (t16[j]); } void Decompress_TraceTo16(Overlap *ovl) { uint16 *t16 = (uint16 *) ovl->path.trace; uint8 *t8 = (uint8 *) ovl->path.trace; int j; for (j = ovl->path.tlen-1; j >= 0; j--) t16[j] = t8[j]; } void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent) { int i; fprintf(output,"%*s%d vs. ",indent,"",ovl->aread); if (COMP(ovl->flags)) fprintf(output,"c(%d)\n",ovl->bread); else fprintf(output,"%d\n",ovl->bread); fprintf(output,"%*s [%d,%d] vs [%d,%d] w. 
%d diffs\n",indent,"", ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos,ovl->path.diffs); if (tbytes == 1) { uint8 *trace = (uint8 *) (ovl->path.trace); if (trace != NULL) { int p = ovl->path.bbpos + trace[1]; fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); for (i = 3; i < ovl->path.tlen; i += 2) { if (i%10 == 0) fprintf(output,"\n%*s",indent+6,""); p += trace[i]; fprintf(output," %3d/%5d",trace[i-1],p); } fprintf(output,"\n"); } } else { uint16 *trace = (uint16 *) (ovl->path.trace); if (trace != NULL) { int p = ovl->path.bbpos + trace[1]; fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); for (i = 3; i < ovl->path.tlen; i += 2) { if (i%10 == 0) fprintf(output,"\n%*s",indent+6,""); p += trace[i]; fprintf(output," %3d/%5d",trace[i-1],p); } fprintf(output,"\n"); } } } int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname) { int i, p, q; if (tspace != 0) { if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2) { if (verbose) EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname); return (1); } p = ovl->path.bbpos; if (tspace <= TRACE_XOVR) { uint8 *trace8 = (uint8 *) ovl->path.trace; for (i = 1; i < ovl->path.tlen; i += 2) p += trace8[i]; } else { uint16 *trace16 = (uint16 *) ovl->path.trace; for (i = 1; i < ovl->path.tlen; i += 2) p += trace16[i]; } if (p != ovl->path.bepos) { if (verbose) EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname); return (1); } } else { uint16 *trace16 = (uint16 *) ovl->path.trace; p = ovl->path.bbpos; q = ovl->path.abpos; for (i = 1; i < ovl->path.tlen; i += 2) { p += trace16[i]; q += trace16[i-1]; } if (p != ovl->path.bepos || q != ovl->path.aepos) { if (verbose) EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname); return (1); } } return (0); } void Flip_Alignment(Alignment *align, int full) { char *aseq = align->aseq; char *bseq = align->bseq; int alen = align->alen; int blen = align->blen; Path *path = align->path; int comp 
= COMP(align->flags); int *trace = (int *) path->trace; int tlen = path->tlen; int i, j, p; if (comp) { p = path->abpos; path->abpos = blen - path->bepos; path->bepos = alen - p; p = path->aepos; path->aepos = blen - path->bbpos; path->bbpos = alen - p; if (full) { alen += 2; blen += 2; for (i = 0; i < tlen; i++) if ((p = trace[i]) < 0) trace[i] = alen + p; else trace[i] = p - blen; i = tlen-1; j = 0; while (j < i) { p = trace[i]; trace[i] = trace[j]; trace[j] = p; i -= 1; j += 1; } alen -= 2; blen -= 2; } } else { p = path->abpos; path->abpos = path->bbpos; path->bbpos = p; p = path->aepos; path->aepos = path->bepos; path->bepos = p; if (full) for (i = 0; i < tlen; i++) trace[i] = - (trace[i]); } align->aseq = bseq; align->bseq = aseq; align->alen = blen; align->blen = alen; } /****************************************************************************************\ * * * ALIGNMENT PRINTING * * * \****************************************************************************************/ /* Complement the sequence in fragment aseq. The operation does the complementation/reversal in place. Calling it a second time on a given fragment restores it to its original state. */ void Complement_Seq(char *aseq, int len) { char *s, *t; int c; s = aseq; t = aseq + (len-1); while (s < t) { c = 3 - *s; *s++ = (char) (3 - *t); *t-- = (char) c; } if (s == t) *s = (char) (3 - *s); } /* Print an alignment to file between a and b given in trace (unpacked). Prefix gives the length of the initial prefix of a that is unaligned. 
/* Print an alignment to file between a and b given in trace (unpacked).
   Prefix gives the length of the initial prefix of a that is unaligned.  */

//  Code -> character tables: 0-3 are the bases, 4 prints as '.', 5/6 are the '[' / ']'
//    alignment brackets and 7 is the gap dash.

static char ToL[8] = { 'a', 'c', 'g', 't', '.', '[', ']', '-' };
static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' };

/* Print_Alignment: write a three-row (a / match line / b) picture of the alignment in
   align->path to file, `width` columns per row group, each group indented by `indent`.

     border : number of unaligned flanking columns shown on each side of the alignment
     upper  : non-zero selects upper-case bases
     coord  : if > 0, the field width of the coordinates printed in front of each row

   The trace is the unpacked form: a negative value -p means a dash is placed in a
   before a[p], a positive value p means a dash is placed in b before b[p].  Row groups
   that contain aligned columns end with the percentage of differences among them.

   Returns 0 (also when no trace is attached); EXIT(1) if the work vector cannot grow.

   match/diff are zeroed before the first column is buffered and the per-row percentage
   is guarded, so a row flushed while still in the unaligned prefix or suffix (possible
   when border >= width) neither reads an unset counter nor prints a NaN.               */

int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework,
                    int indent, int width, int border, int upper, int coord)
{ _Work_Data *work  = (_Work_Data *) ework;
  int        *trace = align->path->trace;
  int         tlen  = align->path->tlen;

  char *Abuf, *Bbuf, *Dbuf;
  int   i, j, o;
  char *a, *b;
  char  mtag, dtag;
  int   prefa, prefb;
  int   aend, bend;
  int   comp, blen;
  int   sa, sb;
  int   match, diff;
  char *N2A;

  if (trace == NULL) return (0);

#ifdef SHOW_TRACE
  fprintf(file,"\nTrace:\n");
  for (i = 0; i < tlen; i++)
    fprintf(file," %3d\n",trace[i]);
#endif

  //  Three row buffers of width+1 characters each are taken from the work vector

  o = sizeof(char)*3*(width+1);
  if (o > work->vecmax)
    if (enlarge_vector(work,o))
      EXIT(1);

  if (upper)
    N2A = ToU;
  else
    N2A = ToL;

  Abuf = (char *) work->vector;
  Bbuf = Abuf + (width+1);
  Dbuf = Bbuf + (width+1);

  aend = align->path->aepos;
  bend = align->path->bepos;

  comp = COMP(align->flags);
  blen = align->blen;

  Abuf[width] = Bbuf[width] = Dbuf[width] = '\0';

                                           /* buffer/output next column */
#define COLUMN(x,y)							\
{ int u, v;								\
  if (o >= width)							\
    { fprintf(file,"\n");						\
      fprintf(file,"%*s",indent,"");					\
      if (coord > 0)							\
        { if (sa < aend)						\
            fprintf(file," %*d",coord,sa);				\
          else								\
            fprintf(file," %*s",coord,"");				\
          fprintf(file," %s\n",Abuf);					\
          fprintf(file,"%*s %*s %s\n",indent,"",coord,"",Dbuf);		\
          fprintf(file,"%*s",indent,"");				\
          if (sb < bend)						\
            { if (comp)							\
                fprintf(file," %*d",coord,blen-sb);			\
              else							\
                fprintf(file," %*d",coord,sb);				\
            }								\
          else								\
            fprintf(file," %*s",coord,"");				\
          fprintf(file," %s",Bbuf);					\
        }								\
      else								\
        { fprintf(file," %s\n",Abuf);					\
          fprintf(file,"%*s %s\n",indent,"",Dbuf);			\
          fprintf(file,"%*s %s",indent,"",Bbuf);			\
        }								\
      if (diff+match > 0)						\
        fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match));		\
      else								\
        fprintf(file,"\n");						\
      o  = 0;								\
      sa = i-1;								\
      sb = j-1;								\
      match = diff = 0;							\
    }									\
  u = (x);								\
  v = (y);								\
  if (u == 4 || v == 4)							\
    Dbuf[o] = ' ';							\
  else if (u == v)							\
    Dbuf[o] = mtag;							\
  else									\
    Dbuf[o] = dtag;							\
  Abuf[o] = N2A[u];							\
  Bbuf[o] = N2A[v];							\
  o += 1;								\
}

  a = align->aseq - 1;      //  1-based views of the two sequences
  b = align->bseq - 1;

  o  = 0;
  i = j = 1;
  match = diff = 0;         //  Must be set before any COLUMN can flush a row

  //  At most `border` unaligned prefix columns are shown for each sequence

  prefa = align->path->abpos;
  prefb = align->path->bbpos;

  if (prefa > border)
    { i = prefa-(border-1);
      prefa = border;
    }
  if (prefb > border)
    { j = prefb-(border-1);
      prefb = border;
    }

  sa   = i-1;
  sb   = j-1;
  mtag = ':';
  dtag = ':';

  //  The longer prefix is padded against '.' columns until both have equal length

  while (prefa > prefb)
    { COLUMN(a[i],4)
      i += 1;
      prefa -= 1;
    }
  while (prefb > prefa)
    { COLUMN(4,b[j])
      j += 1;
      prefb -= 1;
    }
  while (prefa > 0)
    { COLUMN(a[i],b[j])
      i += 1;
      j += 1;
      prefa -= 1;
    }

  mtag = '[';
  if (prefb > 0)
    COLUMN(5,5)

  mtag  = '|';
  dtag  = '*';

  match = diff = 0;

  { int p, c;      /* Output columns of alignment til reach trace end */

    for (c = 0; c < tlen; c++)
      if ((p = trace[c]) < 0)
        { p = -p;
          while (i != p)
            { COLUMN(a[i],b[j])
              if (a[i] == b[j])
                match += 1;
              else
                diff += 1;
              i += 1;
              j += 1;
            }
          COLUMN(7,b[j])
          j += 1;
          diff += 1;
        }
      else
        { while (j != p)
            { COLUMN(a[i],b[j])
              if (a[i] == b[j])
                match += 1;
              else
                diff += 1;
              i += 1;
              j += 1;
            }
          COLUMN(a[i],7)
          i += 1;
          diff += 1;
        }
    p = align->path->aepos;
    while (i <= p)
      { COLUMN(a[i],b[j])
        if (a[i] == b[j])
          match += 1;
        else
          diff += 1;
        i += 1;
        j += 1;
      }
  }

  { int c;     /* Output remaining column including unaligned suffix */

    mtag = ']';
    if (a[i] != 4 && b[j] != 4 && border > 0)
      COLUMN(6,6)

    mtag = ':';
    dtag = ':';

    c = 0;
    while (c < border && (a[i] != 4 || b[j] != 4))
      { if (a[i] != 4)
          { if (b[j] != 4)
              { COLUMN(a[i],b[j])
                i += 1;
                j += 1;
              }
            else
              { COLUMN(a[i],4)
                i += 1;
              }
          }
        else
          { COLUMN(4,b[j])
            j += 1;
          }
        c += 1;
      }
  }

  /* Print remainder of buffered col.s */

  fprintf(file,"\n");
  fprintf(file,"%*s",indent,"");
  if (coord > 0)
    { if (sa < aend)
        fprintf(file," %*d",coord,sa);
      else
        fprintf(file," %*s",coord,"");
      fprintf(file," %.*s\n",o,Abuf);
      fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf);
      fprintf(file,"%*s",indent,"");
      if (sb < bend)
        { if (comp)
            fprintf(file," %*d",coord,blen-sb);
          else
            fprintf(file," %*d",coord,sb);
        }
      else
        fprintf(file," %*s",coord,"");
      fprintf(file," %.*s",o,Bbuf);
    }
  else
    { fprintf(file," %.*s\n",o,Abuf);
      fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf);
      fprintf(file,"%*s %.*s",indent,"",o,Bbuf);
    }
  if (diff+match > 0)
    fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match));
  else
    fprintf(file,"\n");

  fflush(file);
  return (0);
}
Abuf[o] = N2A[u]; \ Bbuf[o] = N2A[v]; \ o += 1; \ if (o >= vmax) \ { if (enlarge_vector(work,3*o)) \ EXIT(1); \ vmax = work->vecmax/3; \ memmove(work->vector+2*vmax,Dbuf,o); \ memmove(work->vector+vmax,Bbuf,o); \ memmove(work->vector,Abuf,o); \ Abuf = (char *) work->vector; \ Bbuf = Abuf + vmax; \ Dbuf = Bbuf + vmax; \ } \ } a = align->aseq - 1; b = align->bseq - 1; o = 0; i = j = 1; prefa = align->path->abpos; prefb = align->path->bbpos; if (prefa > border) { i = prefa-(border-1); prefa = border; } if (prefb > border) { j = prefb-(border-1); prefb = border; } s0 = i; sa = i-1; sb = j-1; mtag = ':'; dtag = ':'; while (prefa > prefb) { BLOCK(a[i],4) i += 1; prefa -= 1; } while (prefb > prefa) { BLOCK(4,b[j]) j += 1; prefb -= 1; } while (prefa > 0) { BLOCK(a[i],b[j]) i += 1; j += 1; prefa -= 1; } mtag = '['; if (prefb > 0) BLOCK(5,5) mtag = '|'; dtag = '*'; match = diff = 0; { int p, c; /* Output columns of alignment til reach trace end */ for (c = 0; c < tlen; c++) if ((p = trace[c]) < 0) { p = -p; while (i != p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } BLOCK(7,b[j]) j += 1; diff += 1; } else { while (j != p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } BLOCK(a[i],7) i += 1; diff += 1; } p = align->path->aepos; while (i <= p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } } { int c; /* Output remaining column including unaligned suffix */ mtag = ']'; if (a[i] != 4 && b[j] != 4 && border > 0) BLOCK(6,6) mtag = ':'; dtag = ':'; c = 0; while (c < border && (a[i] != 4 || b[j] != 4)) { if (a[i] != 4) if (b[j] != 4) { BLOCK(a[i],b[j]) i += 1; j += 1; } else { BLOCK(a[i],4) i += 1; } else { BLOCK(4,b[j]) j += 1; } c += 1; } } /* Print remainder of buffered col.s */ fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (coord > 0) { if (sa < aend) fprintf(file," %*d",coord,sa); else fprintf(file," %*s",coord,""); fprintf(file," %.*s\n",o,Abuf); 
/* Write the character `symbol` to file `rep` times; nothing is written when rep <= 0. */

static inline void repchar(FILE *file, int symbol, int rep)
{ int left;

  for (left = rep; left > 0; left--)
    fputc(symbol,file);
}
/****************************************************************************************\
*                                                                                        *
*  O(ND) trace algorithm                                                                 *
*                                                                                        *
\****************************************************************************************/

#ifdef DEBUG_AWAVE

/* Debug aid: print the band limits followed by the wave values V[low..hgh] on one line. */

static void print_awave(int *V, int low, int hgh)
{ int diag = low;

  printf("  [%6d,%6d]: ",low,hgh);
  while (diag <= hgh)
    { printf(" %3d",V[diag]);
      diag += 1;
    }
  printf("\n");
  fflush(stdout);
}

#endif

#ifdef DEBUG_ALIGN

static int depth = 0;     //  Indentation used by the DEBUG_ALIGN / DEBUG_SCRIPT print-outs

#endif

/* Shared state handed to the trace-computing routines. */

typedef struct
  { int   *Stop;     //  Ongoing stack of alignment indels
    char  *Aabs;     //  Absolute base of the A sequence
    char  *Babs;     //  Absolute base of the B sequence
    int  **PVF;      //  List of waves for iterative np algorithms
    int  **PHF;
    int    mida;     //  mid point division for mid-point algorithms
    int    midb;
    int   *VF;       //  Forward/Reverse waves for nd algorithms
    int   *VB;       //    (defunct: were used for O(nd) algorithms)
  } Trace_Waves;
(B-wave->Babs)+1; for (x = 1; x <= M; x++) { *wave->Stop++ = y; #ifdef DEBUG_SCRIPT printf("%*s *D %ld(%ld)\n",depth,"",x+(A-wave->Aabs),(B-wave->Babs)+1); #endif } return (M); } { int *VF = wave->VF; int *VB = wave->VB; int flow; // fhgh == D ! int blow, bhgh; char *a; y = 0; if (N < M) while (y < N && B[y] == A[y]) y += 1; else { while (y < M && B[y] == A[y]) y += 1; if (y >= M && N == M) return (0); } flow = 0; VF[0] = y; VF[-1] = -2; x = N-M; a = A-x; y = N-1; if (N > M) while (y >= x && B[y] == a[y]) y -= 1; else while (y >= 0 && B[y] == a[y]) y -= 1; blow = bhgh = -x; VB += x; VB[blow] = y; VB[blow-1] = N+1; for (D = 1; 1; D += 1) { int k, r; int am, ac, ap; // Forward wave flow -= 1; am = ac = VF[flow-1] = -2; a = A + D; x = M - D; for (k = D; k >= flow; k--) { ap = ac; ac = am+1; am = VF[k-1]; if (ac < am) if (ap < am) y = am; else y = ap; else if (ap < ac) y = ac; else y = ap; if (blow <= k && k <= bhgh) { r = VB[k]; if (y > r) { D = (D<<1)-1; if (ap > r) y = ap; else if (ac > r) y = ac; else y = r+1; x = k+y; goto OVERLAP2; } } if (N < x) while (y < N && B[y] == a[y]) y += 1; else while (y < x && B[y] == a[y]) y += 1; VF[k] = y; a -= 1; x += 1; } #ifdef DEBUG_AWAVE print_awave(VF,flow,D); #endif // Reverse Wave bhgh += 1; blow -= 1; am = ac = VB[blow-1] = N+1; a = A + bhgh; x = -bhgh; for (k = bhgh; k >= blow; k--) { ap = ac+1; ac = am; am = VB[k-1]; if (ac > am) if (ap > am) y = am; else y = ap; else if (ap > ac) y = ac; else y = ap; if (flow <= k && k <= D) { r = VF[k]; if (y <= r) { D = (D << 1); if (ap <= r) y = ap; else if (ac <= r) y = ac; else y = r; x = k+y; goto OVERLAP2; } } y -= 1; if (x > 0) while (y >= x && B[y] == a[y]) y -= 1; else while (y >= 0 && B[y] == a[y]) y -= 1; VB[k] = y; a -= 1; x += 1; } #ifdef DEBUG_AWAVE print_awave(VB,blow,bhgh); #endif } } OVERLAP2: #ifdef DEBUG_ALIGN printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D); fflush(stdout); #endif if (D > 1) { #ifdef DEBUG_ALIGN depth += 2; #endif dandc_nd(A,x,B,y,wave); 
dandc_nd(A+x,M-x,B+y,N-y,wave); #ifdef DEBUG_ALIGN depth -= 2; #endif } else if (D == 1) { if (M > N) { *wave->Stop++ = (B-wave->Babs)+y+1; #ifdef DEBUG_SCRIPT printf("%*s D %ld(%ld)\n",depth,"",(A-wave->Aabs)+x,(B-wave->Babs)+y+1); #endif } else if (M < N) { *wave->Stop++ = (wave->Aabs-A)-x-1; #ifdef DEBUG_SCRIPT printf("%*s I %ld(%ld)\n",depth,"",(B-wave->Babs)+y,(A-wave->Aabs)+x+1); #endif } #ifdef DEBUG_SCRIPT else printf("%*s %ld S %ld\n",depth,"",(wave->Aabs-A)+x,(B-wave->Babs)+y); #endif } return (D); } static int Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; int L, D; int asub, bsub; Path *path; int *trace; path = align->path; asub = path->aepos-path->abpos; bsub = path->bepos-path->bbpos; if (asub < bsub) L = bsub; else L = asub; L *= sizeof(int); if (L > work->tramax) if (enlarge_trace(work,L)) EXIT(1); trace = wave.Stop = ((int *) work->trace); D = 2*(path->diffs + 4)*sizeof(int); if (D > work->vecmax) if (enlarge_vector(work,D)) EXIT(1); D = (path->diffs+3)/2; wave.VF = ((int *) work->vector) + (D+1); wave.VB = wave.VF + (2*D+1); wave.Aabs = align->aseq; wave.Babs = align->bseq; path->diffs = dandc_nd(align->aseq+path->abpos,path->aepos-path->abpos, align->bseq+path->bbpos,path->bepos-path->bbpos,&wave); path->trace = trace; path->tlen = wave.Stop - trace; return (0); } /****************************************************************************************\ * * * O(NP) tracing algorithms * * * \****************************************************************************************/ /* Iterative O(np) algorithm for finding the alignment between two substrings (specified by a Path record). The variation includes handling substitutions and guarantees to find left-most alignments so that low complexity runs are always aligned in the same way. 
*/ #ifdef DEBUG_ALIGN static int ToA[4] = { 'a', 'c', 'g', 't' }; #endif static char *TP_Align = "Bad alignment between trace points (Compute_Trace), source DB likely incorrect"; static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode, int dmax) { int **PVF = wave->PVF; int **PHF = wave->PHF; int D; int del = M-N; { int *F0, *F1, *F2; int *HF; int low, hgh; int posl, posh; #ifdef DEBUG_ALIGN printf("\n BASE %ld,%ld: %d vs %d\n",A-wave->Aabs,B-wave->Babs,M,N); printf(" A = "); for (D = 0; D < M; D++) printf("%c",ToA[(int) A[D]]); printf("\n"); printf(" B = "); for (D = 0; D < N; D++) printf("%c",ToA[(int) B[D]]); printf("\n"); #endif if (del >= 0) { low = 0; hgh = del; } else { low = del; hgh = 0; } posl = -dmax; posh = dmax; if (wave->Aabs == wave->Babs) { if (B == A) { EPRINTF(EPLACE,"%s: self comparison starts on diagonal 0 (Compute_Trace)\n",Prog_Name); EXIT(-1); } else if (B < A) { if ((B-A)+1 > posl) posl = (B-A)+1; } else { if ((B-A)-1 < posh) posh = (B-A)-1; } } F1 = PVF[-2]; F0 = PVF[-1]; for (D = low-1; D <= hgh+1; D++) F1[D] = F0[D] = -2; F0[0] = -1; low += 1; hgh -= 1; for (D = 0; 1; D += 1) { int k, i, j; int am, ac, ap; char *a; if (D > dmax) { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Align); EXIT(-1); } F2 = F1; F1 = F0; F0 = PVF[D]; HF = PHF[D]; if ((D & 0x1) == 0) { if (low > posl) low -= 1; if (hgh < posh) hgh += 1; } F0[hgh+1] = F0[low-1] = -2; #define FS_MOVE(mdir,pdir) \ ac = F1[k]+1; \ if (ac < am) \ if (ap < am) \ { HF[k] = mdir; \ j = am; \ } \ else \ { HF[k] = pdir; \ j = ap; \ } \ else \ if (ap < ac) \ { HF[k] = 0; \ j = ac; \ } \ else \ { HF[k] = pdir; \ j = ap; \ } \ \ if (N < i) \ while (j < N && B[j] == a[j]) \ j += 1; \ else \ while (j < i && B[j] == a[j]) \ j += 1; \ F0[k] = j; j = -2; a = A + hgh; i = M - hgh; for (k = hgh; k > del; k--) { ap = j+1; am = F2[k-1]; FS_MOVE(-1,4) a -= 1; i += 1; } j = -2; a = A + low; i = M - low; for (k = low; k < del; k++) { ap = F2[k+1]+1; am = j; FS_MOVE(2,1) a += 1; i -= 1; } 
ap = F0[del+1]+1; am = j; FS_MOVE(2,4) #ifdef DEBUG_AWAVE print_awave(F0,low,hgh); print_awave(HF,low,hgh); #endif if (F0[del] >= N) break; } } { int k, h, m, e, c; int ap = (wave->Aabs-A)-1; int bp = (B-wave->Babs)+1; PHF[0][0] = 3; c = N; k = del; e = PHF[D][k]; PHF[D][k] = 3; if (mode == UPPERMOST) while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h < k) // => e = -1 or 2, UPPERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] <= c) c = PVF[D][h]-1; while (c >= m && a[c] == B[c]) c -= 1; if (e == -1) // => edge is 2, others are 1, and 0 { if (c <= PVF[D+2][k+1]) { e = 4; h = k+1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c+1; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c <= PVF[m][k+1]) { if (k == del) e = 4; else e = 1; h = k+1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c+1; } } m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } else if (mode == LOWERMOST) while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h > k) // => e = 1 or 4, LOWERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] < c) c = PVF[D][h]; while (c >= m && a[c] == B[c]) c -= 1; if (e == 1) // => edge is 2, others are 1, and 0 { if (c < PVF[D+2][k-1]) { e = 2; h = k-1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c--; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c < PVF[m][k-1]) { if (k == del) e = 2; else e = -1; h = k-1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c--; } } m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } else // mode == GREEDIEST while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } k = D = 0; e = PHF[D][k]; while 
(e != 3)
      //  Follow the PHF links forward from (D,k) = (0,0) until the terminal mark 3.
      //    A step to a higher diagonal appends the positive value bp+c to the trace,
      //    a step to a lower diagonal appends the negative value ap-(c+k), and a step
      //    that stays on the diagonal (a substitution) appends nothing.
      { h = k-e;
        c = PVF[D][k];
        if (e > 1)
          h += 3;
        else if (e == 0)
          D += 1;
        else
          D += 2;
#ifdef DEBUG_SCRIPT
        if (h > k)
          printf(" D %d(%d)\n",(c-k)-(ap-1),c+bp);
        else if (h < k)
          printf(" I %d(%d)\n",c+(bp-1),(c+k)-ap);
        else
          printf(" %d S %d\n",(c+k)-(ap+1),c+(bp-1));
#endif
        if (h > k)
          *wave->Stop++ = bp+c;
        else if (h < k)
          *wave->Stop++ = ap-(c+k);
        k = h;
        e = PHF[D][h];
      }
  }

  return (D + abs(del));
}

/* middle_np: run the same forward wave computation as iter_np on A[0..M) versus
     B[0..N), but instead of emitting a trace, walk the recorded moves back from the
     end for about half of the D + abs(del) steps and record the point reached in
     wave->mida / wave->midb as absolute A- and B-coordinates.  wave->Stop is not
     touched.

     Returns 0 on success.  If wave->Aabs == wave->Babs and B == A the error is
     reported and EXIT(1) is taken; if D exceeds dmax the TP_Align message is reported
     and EXIT(-1) is taken.  Callers therefore only test for a non-zero result.

     FS_MOVE is the macro #define'd inside iter_np.                                   */

static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode, int dmax)
{ int  **PVF = wave->PVF;
  int  **PHF = wave->PHF;
  int    D;
  int    del = M-N;

  { int  *F0, *F1, *F2;     //  Waves for rounds D, D-1, and D-2
    int  *HF;               //  Move record for round D
    int   low, hgh;         //  Current diagonal band
    int   posl, posh;       //  Limits on how far the band may widen

#ifdef DEBUG_ALIGN
    printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
    printf("%*s A = ",depth,"");
    for (D = 0; D < M; D++)
      printf("%c",ToA[(int) A[D]]);
    printf("\n");
    printf("%*s B = ",depth,"");
    for (D = 0; D < N; D++)
      printf("%c",ToA[(int) B[D]]);
    printf("\n");
#endif

    if (del >= 0)
      { low = 0;
        hgh = del;
      }
    else
      { low = del;
        hgh = 0;
      }

    posl = -dmax;
    posh =  dmax;
    if (wave->Aabs == wave->Babs)
      { if (B == A)
          { EPRINTF(EPLACE,"%s: self comparison starts on diagonal 0 (Compute_Trace)\n",Prog_Name);
            EXIT(1);
          }
        else if (B < A)
          { if ((B-A)+1 > posl)
              posl = (B-A)+1;
          }
        else
          { if ((B-A)-1 < posh)
              posh = (B-A)-1;
          }
      }

    F1 = PVF[-2];
    F0 = PVF[-1];

    for (D = low-1; D <= hgh+1; D++)
      F1[D] = F0[D] = -2;
    F0[0] = -1;

    low += 1;
    hgh -= 1;

    for (D = 0; 1; D += 1)
      { int   k, i, j;
        int   am, ac, ap;
        char *a;

        if (D > dmax)
          { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Align);
            EXIT(-1);
          }

        F2 = F1;
        F1 = F0;
        F0 = PVF[D];
        HF = PHF[D];

        if ((D & 0x1) == 0)
          { if (low > posl)
              low -= 1;
            if (hgh < posh)
              hgh += 1;
          }
        F0[hgh+1] = F0[low-1] = -2;

        //  Diagonals above del, swept from hgh downward

        j = -2;
        a = A + hgh;
        i = M - hgh;
        for (k = hgh; k > del; k--)
          { ap = j+1;
            am = F2[k-1];
            FS_MOVE(-1,4)
            a -= 1;
            i += 1;
          }

        //  Diagonals below del, swept from low upward

        j = -2;
        a = A + low;
        i = M - low;
        for (k = low; k < del; k++)
          { ap = F2[k+1]+1;
            am = j;
            FS_MOVE(2,1)
            a += 1;
            i -= 1;
          }

        //  Diagonal del itself is computed last

        ap = F0[del+1]+1;
        am = j;
        FS_MOVE(2,4)

#ifdef DEBUG_AWAVE
        print_awave(F0,low,hgh);
print_awave(HF,low,hgh); #endif if (F0[del] >= N) break; } } { int k, h, m, e, c; int d, f; d = D + abs(del); c = N; k = del; if (mode == UPPERMOST) for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h < k) // => e = -1 or 2, UPPERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] <= c) c = PVF[D][h]-1; while (c >= m && a[c] == B[c]) c -= 1; if (e == -1) // => edge is 2, others are 1, and 0 { if (c <= PVF[D+2][k+1]) { e = 4; h = k+1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c+1; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c <= PVF[m][k+1]) { if (k == del) e = 4; else e = 1; h = k+1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c+1; } } k = h; } else if (mode == LOWERMOST) for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h > k) // => e = 1 or 4, LOWERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] < c) c = PVF[D][h]; while (c >= m && a[c] == B[c]) c -= 1; if (e == 1) // => edge is 2, others are 1, and 0 { if (c < PVF[D+2][k-1]) { e = 2; h = k-1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c--; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c < PVF[m][k-1]) { if (k == del) e = 2; else e = -1; h = k-1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c--; } } k = h; } else // mode == GREEDIEST for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; k = h; } wave->midb = (B-wave->Babs) + PVF[D][k]; wave->mida = (A-wave->Aabs) + k + PVF[D][k]; } return (0); } /****************************************************************************************\ * * * COMPUTE_TRACE 
FLAVORS                                                                 *
*                                                                                        *
\****************************************************************************************/

static char *TP_Error = "Trace point out of bounds (Compute_Trace), source DB likely incorrect";

/* Compute_Trace_ALL: compute an exact trace for the whole extent of align->path with a
     single GREEDIEST call to iter_np, ignoring any trace points already in the path.

     The bound handed to iter_np is dmax = path->diffs - |M-N|, where M and N are the
     lengths of the aligned A- and B-intervals.  If the wave tables for that bound
     would need more than 256000000 bytes the work is handed to Compute_Trace_ND_ALL
     instead.

     On success path->diffs, path->trace (pointing into work->trace) and path->tlen
     are updated and 0 is returned.  EXIT(1) is taken if a buffer cannot be enlarged,
     if the path ends beyond alen/blen (TP_Error), or if iter_np reports an error.    */

int Compute_Trace_ALL(Alignment *align, Work_Data *ework)
{ _Work_Data *work = (_Work_Data *) ework;
  Trace_Waves wave;

  Path *path;
  char *aseq, *bseq;
  int   alen, blen;
  int   M, N, D;
  int   dmax;

  alen = align->alen;
  blen = align->blen;
  path = align->path;
  aseq = align->aseq;
  bseq = align->bseq;

  M = path->aepos-path->abpos;
  N = path->bepos-path->bbpos;

  { int64 s;
    int   d;
    int   **PVF, **PHF;

    //  Trace buffer: one int for each position of the longer interval

    if (M < N)
      s = N;
    else
      s = M;
    s *= sizeof(int);
    if (s > work->tramax)
      if (enlarge_trace(work,s))
        EXIT(1);

    //  Wave tables: PVF and PHF each get dmax+3 row pointers and rows of M+N+3 ints

    dmax = path->diffs - abs(M-N);

    s = (dmax+3)*2*((M+N+3)*sizeof(int) + sizeof(int *));

    if (s > 256000000)
      return (Compute_Trace_ND_ALL(align,ework));

    if (s > work->vecmax)
      if (enlarge_vector(work,s))
        EXIT(1);

    //  The row-pointer arrays come first (indexable from -2), then the rows.  Each row
    //    pointer is advanced N+1 ints into its row so that negative diagonals can be
    //    used as indices.

    wave.PVF = PVF = ((int **) (work->vector)) + 2;
    wave.PHF = PHF = PVF + (dmax+3);

    s = M+N+3;
    PVF[-2] = ((int *) (PHF + (dmax+1))) + (N+1);
    for (d = -1; d <= dmax; d++)
      PVF[d] = PVF[d-1] + s;
    PHF[-2] = PVF[dmax] + s;
    for (d = -1; d <= dmax; d++)
      PHF[d] = PHF[d-1] + s;
  }

  wave.Stop = ((int *) work->trace);
  wave.Aabs = aseq;
  wave.Babs = bseq;

  if (path->aepos > alen || path->bepos > blen)
    { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error);
      EXIT(1);
    }
  D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave,GREEDIEST,dmax);
  if (D < 0)
    EXIT(1);
  path->diffs = D;
  path->trace = work->trace;
  path->tlen  = wave.Stop - ((int *) path->trace);

  return (0);
}

/* Compute_Trace_PTS: compute an exact trace by aligning, with iter_np in the given
     'mode', each segment between successive trace points of the path.

     On entry path->trace is read as path->tlen uint16 values taken in pairs, where
     the second value of each pair is the B-length of a segment; the A-boundaries
     fall on multiples of trace_spacing.  The largest first value of any pair is used
     as the bound dmax for every iter_np call.

     On success path->trace is redirected to the int trace in work->trace, path->tlen
     and path->diffs (the sum over all segments) are updated, and 0 is returned.
     EXIT(1) is taken on a buffer failure, a trace point beyond alen/blen (TP_Error),
     or an iter_np error.                                                             */

int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int mode)
{ _Work_Data *work = (_Work_Data *) ework;
  Trace_Waves wave;

  Path   *path;
  char   *aseq, *bseq;
  int     alen, blen;
  uint16 *points;
  int     tlen;
  int     ab, bb;
  int     ae, be;
  int     diffs, dmax;

  alen   = align->alen;
  blen   = align->blen;
  path   = align->path;
  aseq   = align->aseq;
  bseq   = align->bseq;
  tlen   = path->tlen;
  points = (uint16 *) path->trace;

  { int64 s;
    int   d;
    int   M, N;
    int   nmax;
    int
**PVF, **PHF;

    M = path->aepos-path->abpos;
    N = path->bepos-path->bbpos;

    //  Trace buffer: one int for each position of the longer interval

    if (M < N)
      s = N*sizeof(int);
    else
      s = M*sizeof(int);
    if (s > work->tramax)
      if (enlarge_trace(work,s))
        EXIT(1);

    //  dmax = largest first value of any pair, nmax = largest second value (B-length)

    nmax = 0;
    dmax = 0;
    for (d = 1; d < tlen; d += 2)
      { if (points[d-1] > dmax)
          dmax = points[d-1];
        if (points[d] > nmax)
          nmax = points[d];
      }
    if (tlen <= 1)
      nmax = N;

    s = (dmax+3)*2*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *));

    if (s > work->vecmax)
      if (enlarge_vector(work,s))
        EXIT(1);

    //  Row-pointer arrays first (indexable from -2), then rows of trace_spacing+nmax+3
    //    ints, each row pointer advanced nmax+1 ints into its row

    wave.PVF = PVF = ((int **) (work->vector)) + 2;
    wave.PHF = PHF = PVF + (dmax+3);

    s = trace_spacing+nmax+3;
    PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1);
    for (d = -1; d <= dmax; d++)
      PVF[d] = PVF[d-1] + s;
    PHF[-2] = PVF[dmax] + s;
    for (d = -1; d <= dmax; d++)
      PHF[d] = PHF[d-1] + s;
  }

  wave.Stop = (int *) (work->trace);
  wave.Aabs = aseq;
  wave.Babs = bseq;

  { int i, d;

    diffs = 0;
    ab = path->abpos;
    ae = (ab/trace_spacing)*trace_spacing;
    bb = path->bbpos;
    tlen -= 2;

    //  Every segment but the last ends on the next multiple of trace_spacing in A

    for (i = 1; i < tlen; i += 2)
      { ae = ae + trace_spacing;
        be = bb + points[i];
        if (ae > alen || be > blen)
          { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error);
            EXIT(1);
          }
        d  = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax);
        if (d < 0)
          EXIT(1);
        diffs += d;
        ab = ae;
        bb = be;
      }

    //  The last segment ends at the end of the path

    ae = path->aepos;
    be = path->bepos;
    if (ae > alen || be > blen)
      { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error);
        EXIT(1);
      }
    d  = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax);
    if (d < 0)
      EXIT(1);
    diffs += d;
  }

  path->trace = work->trace;
  path->tlen  = wave.Stop - ((int *) path->trace);
  path->diffs = diffs;

  return (0);
}

/* Compute_Trace_MID: like Compute_Trace_PTS, but for each segment between successive
     trace points middle_np is first used to find a mid-point (wave.mida,wave.midb),
     and iter_np is then run between consecutive mid-points rather than between the
     trace points themselves.  The first piece starts at (abpos,bbpos) and the last
     one ends at (aepos,bepos).

     Inputs, outputs, and error behavior are as for Compute_Trace_PTS.                */

int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int mode)
{ _Work_Data *work = (_Work_Data *) ework;
  Trace_Waves wave;

  Path   *path;
  char   *aseq, *bseq;
  int     alen, blen;
  uint16 *points;
  int     tlen;
  int     ab, bb;
  int     ae, be;
  int     diffs, dmax;

  alen   = align->alen;
  blen   = align->blen;
  path   = align->path;
  aseq   = align->aseq;
  bseq   = align->bseq;
  tlen   = path->tlen;
  points = (uint16 *) path->trace;

  { int64 s;
    int   d;
    int   M, N;
    int   nmax;
    int
**PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = 0; dmax = 0; for (d = 1; d < tlen; d += 2) { if (points[d-1] > dmax) dmax = points[d-1]; if (points[d] > nmax) nmax = points[d]; } if (tlen <= 1) nmax = N; s = (dmax+3)*4*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = trace_spacing+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = ((int *) work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; int as, bs; int af, bf; diffs = 0; ab = as = af = path->abpos; ae = (ab/trace_spacing)*trace_spacing; bb = bs = bf = path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; if (ae > alen || be > blen) { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); EXIT(1); } if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax)) EXIT(1); af = wave.mida; bf = wave.midb; d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; as = af; bs = bf; } ae = path->aepos; be = path->bepos; if (ae > alen || be > blen) { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); EXIT(1); } if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax)) EXIT(1); af = wave.mida; bf = wave.midb; d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; as = af; bs = bf; d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode) { _Work_Data 
*work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; int alen, blen; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs, dmax; alen = align->alen; blen = align->blen; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int mmax, nmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = mmax = 0; for (d = 0; d < tlen; d += 2) { if (points[d] > mmax) mmax = points[d]; if (points[d+1] > nmax) nmax = points[d+1]; } if (tlen <= 1) { mmax = M; nmax = N; } if (mmax > nmax) dmax = nmax; else dmax = mmax; s = (dmax+3)*2*((mmax+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = mmax+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = (int *) (work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; diffs = 0; ab = path->abpos; bb = path->bbpos; for (i = 0; i < tlen; i += 2) { ae = ab + points[i]; be = bb + points[i+1]; if (ae > alen || be > blen) { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); EXIT(1); } d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; } } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } DALIGNER-master/align.h000066400000000000000000000456171322465224500150210ustar00rootroot00000000000000/******************************************************************************************* * * Local alignment module. Routines for finding local alignments given a seed position, * representing such an l.a. 
with its interval and a set of pass-thru points, so that * a detailed alignment can be efficiently computed on demand. * * All routines work on a numeric representation of DNA sequences, i.e. 0 for A, 1 for C, * 2 for G, and 3 for T. * * Author: Gene Myers * Date : July 2013 * ********************************************************************************************/ #ifndef _A_MODULE #define _A_MODULE #include "DB.h" #define TRACE_XOVR 125 // If the trace spacing is not more than this value, then can // and do compress traces pts to 8-bit unsigned ints /*** INTERACTIVE vs BATCH version The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or batch version of the routines in this library is compiled. In batch mode, routines print an error message and exit. In interactive mode, the routines place the error message in EPLACE (also defined in DB.h) and return an error value, typically NULL if the routine returns a pointer, and an unusual integer value if the routine returns an integer. Below when an error return is described, one should understand that this value is returned only if the routine was compiled in INTERACTIVE mode. ***/ /*** PATH ABSTRACTION: Coordinates are *between* characters where 0 is the tick just before the first char, 1 is the tick between the first and second character, and so on. Our data structure is called a Path referring to its conceptualization in an edit graph. A local alignment is specified by the point '(abpos,bbpos)' at which its path in the underlying edit graph starts, and the point '(aepos,bepos)' at which it ends. In other words, A[abpos+1..aepos] is aligned to B[bbpos+1..bepos] (assuming X[1] is the *first* character of X). There are 'diffs' differences in an optimal local alignment between the beginning and end points of the alignment (if computed by Compute_Trace), or nearly so (if computed by Local_Alignment). 
Optionally, a Path can have additional information about the exact nature of the aligned substrings if the field 'trace' is not NULL. Trace points to either an array of integers (if computed by a Compute_Trace routine), or an array of unsigned short integers (if computed by Local_Alignment). If computed by Local_Alignment 'trace' points at a list of 'tlen' (always even) short values: d_0, b_0, d_1, b_1, ... d_n-1, b_n-1, d_n, b_n to be interpreted as follows. The alignment from (abpos,bbpos) to (aepos,bepos) passes through the n trace points for i in [1,n]: (a_i,b_i) where a_i = floor(abpos/TS)*TS + i*TS and b_i = bbpos + (b_0 + b_1 + b_i-1) where also let a_0,b_0 = abpos,bbpos and a_(n+1),b_(n+1) = aepos,bepos. That is, the interior (i.e. i != 0 and i != n+1) trace points pass through every TS'th position of the aread where TS is the "trace spacing" employed when finding the alignment (see New_Align_Spec). Typically TS is 100. Then d_i is the number of differences in the portion of the alignment between (a_i,b_i) and (a_i+1,b_i+1). These trace points allow the Compute_Trace routines to efficiently compute the exact alignment between the two reads by efficiently computing exact alignments between consecutive pairs of trace points. Moreover, the diff values give one an idea of the quality of the alignment along every segment of TS symbols of the aread. If computed by a Compute_Trace routine, 'trace' points at a list of 'tlen' integers < i1, i2, ... in > that encodes an exact alignment as follows. A negative number j indicates that a dash should be placed before A[-j] and a positive number k indicates that a dash should be placed before B[k], where A and B are the two sequences of the overlap. The indels occur in the trace in the order in which they occur along the alignment. For a good example of how to "decode" a trace into an alignment, see the code for the routine Print_Alignment. 
***/ typedef struct { void *trace; int tlen; int diffs; int abpos, bbpos; int aepos, bepos; } Path; /*** ALIGNMENT ABSTRACTION: An alignment is modeled by an Alignment record, which in addition to a *pointer* to a 'path', gives pointers to the A and B sequences, their lengths, and indicates whether the B-sequence needs to be complemented ('comp' non-zero if so). The 'trace' pointer of the 'path' subrecord can be either NULL, a list of pass-through points, or an exact trace depending on what routines have been called on the record. One can (1) compute a trace, with Compute_Trace, either from scratch if 'path.trace' = NULL, or using the sequence of pass-through points in trace, (2) print an ASCII representation of an alignment, (3) reverse the roles of A and B, and (4) complement a sequence (which is a reversible process). If the alignment record shows the B sequence as complemented, *** THEN IT IS THE RESPONSIBILITY OF THE CALLER *** to make sure that bseq points at a complement of the sequence before calling Compute_Trace or Print_Alignment. Complement_Seq complements the sequence a of length n. The operation does the complementation/reversal in place. Calling it a second time on a given fragment restores it to its original state. With the introduction of the DAMAPPER, we need to code chains of alignments between a pair of sequences. The alignments of a chain are expected to be found in order either on a file or in memory, where the START_FLAG marks the first alignment and the NEXT_FLAG all subsequent alignments in a chain. A chain of a single LA is marked with the START_FLAG. The BEST_FLAG marks one of the best chains for a pair of sequences. The convention is that either every record has a START- or NEXT-flag, or none of them do (e.g. as produced by daligner), so one can always check the flags of the first alignment to see whether or not the chain concept applies to a given collection. 
***/ #define COMP_FLAG 0x1 #define ACOMP_FLAG 0x2 // A-sequence is complemented, not B ! Only Local_Alignment notices #define COMP(x) ((x) & COMP_FLAG) #define ACOMP(x) ((x) & ACOMP_FLAG) #define START_FLAG 0x4 // LA is the first of a chain of 1 or more la's #define NEXT_FLAG 0x8 // LA is the next segment of a chain. #define BEST_FLAG 0x10 // This is the start of the best chain #define CHAIN_START(x) ((x) & START_FLAG) #define CHAIN_NEXT(x) ((x) & NEXT_FLAG) #define BEST_CHAIN(x) ((x) & BEST_FLAG) #define ELIM_FLAG 0x20 // This LA should be ignored #define ELIM(x) ((x) & ELIM_FLAG) typedef struct { Path *path; uint32 flags; /* Pipeline status and complementation flags */ char *aseq; /* Pointer to A sequence */ char *bseq; /* Pointer to B sequence */ int alen; /* Length of A sequence */ int blen; /* Length of B sequence */ } Alignment; void Complement_Seq(char *a, int n); /* Many routines like Local_Alignment, Compute_Trace, and Print_Alignment need working storage that is more efficiently reused with each call, rather than being allocated anew with each call. Each *thread* can create a Work_Data object with New_Work_Data and this object holds and retains the working storage for routines of this module between calls to the routines. If enough memory for a Work_Data is not available then NULL is returned. Free_Work_Data frees a Work_Data object and all working storage held by it. */ typedef void Work_Data; Work_Data *New_Work_Data(); void Free_Work_Data(Work_Data *work); /* Local_Alignment seeks local alignments of a quality determined by a number of parameters. These are coded in an Align_Spec object that can be created with New_Align_Spec and freed with Free_Align_Spec when no longer needed. There are 4 essential parameters: ave_corr: the average correlation (1 - 2*error_rate) for the sought alignments. For Pacbio data we set this to .70 assuming an average of 15% error in each read. 
trace_space: the spacing interval for keeping trace points and segment differences (see description of 'trace' for Paths above) freq[4]: a 4-element vector where freq[0] = frequency of A, f(A), freq[1] = f(C), freq[2] = f(G), and freq[3] = f(T). This vector is part of the header of every DAZZ database (see DB.h). reach: a boolean, if set alignments extend to the boundary when reasonable, otherwise they terminate only at suffix-positive points. If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e. overlap), then the last/first 30 columns of the alignment are guaranteed to be suffix/prefix positive at correlation ave_corr * g(freq) where g is an empirically measured function that increases from 1 as the entropy of freq decreases. If memory is unavailable or the freq distribution is too skewed then NULL is returned. You can get back the original parameters used to create an Align_Spec with the simple utility functions below. */ typedef void Align_Spec; Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq, int reach); void Free_Align_Spec(Align_Spec *spec); int Trace_Spacing (Align_Spec *spec); double Average_Correlation(Align_Spec *spec); float *Base_Frequencies (Align_Spec *spec); int Overlap_If_Possible(Align_Spec *spec); /* Local_Alignment finds the longest significant local alignment between the sequences in 'align' subject to: (a) the alignment criterion given by the Align_Spec 'spec', (b) it passes through one of the points (anti+k)/2,(anti-k)/2 for k in [low,hgh] within the underlying dynamic programming matrix (i.e. the points on diagonals low to hgh on anti-diagonal anti or anti-1 (depending on whether the diagonal is odd or even)), (c) if lbord >= 0, then the alignment is always above diagonal low-lbord, and (d) if hbord >= 0, then the alignment is always below diagonal hgh+hbord. The path record of 'align' has its 'trace' filled from the point of view of an overlap between the aread and the bread. 
In addition a Path record from the point of view of the bread versus the aread is returned by the function, with this Path's 'trace' filled in appropriately. The space for the returned path and the two 'trace's are in the working storage supplied by the Work_Data packet and this space is reused with each call, so if one wants to retain the bread-path and the two trace point sequences, then they must be copied to user-allocated storage before calling the routine again. NULL is returned in the event of an error. Find_Extension is a variant of Local_Alignment that simply finds a local alignment that either ends (if prefix is non-zero) or begins (if prefix is zero) at the point (anti+diag)/2,(anti-diag)/2). All other parameters are as before. It returns a non-zero value only when INTERACTIVE is on and it cannot allocate the memory it needs. Only the path and trace with respect to the aread is returned. This routine is experimental and may not persist in later versions of the code. */ Path *Local_Alignment(Alignment *align, Work_Data *work, Align_Spec *spec, int low, int hgh, int anti, int lbord, int hbord); int Find_Extension(Alignment *align, Work_Data *work, Align_Spec *spec, // experimental !! int diag, int anti, int lbord, int hbord, int prefix); /* Given a legitimate Alignment object, Compute_Trace_X computes an exact trace for the alignment. If 'path.trace' is non-NULL, then it is assumed to be a sequence of pass-through points and diff levels computed by Local_Alignment. In either case 'path.trace' is set to point at an integer array within the storage of the Work_Data packet encoding an exact optimal trace from the start to end points. If the trace is needed beyond the next call to a routine that sets it, then it should be copied to an array allocated and managed by the caller. 
Compute_Trace_ALL does not require a sequence of pass-through points, as it computes the best alignment between (path->abpos,path->bbpos) and (path->aepos,path->bepos) in the edit graph between the sequences. Compute_Trace_PTS computes a trace by computing the trace between successive pass-through points. It is much, much faster than Compute_Trace_ALL but at the tradeoff of not necessarily being optimal as pass-through points are not all perfect. Compute_Trace_MID computes a trace by computing the trace between the mid-points of alignments between two adjacent pairs of pass-through points. It is generally twice as slow as Compute_Trace_PTS, but it produces nearer optimal alignments. All these routines return 1 if an error occurred and 0 otherwise. */ #define LOWERMOST -1 // Possible modes for "mode" parameter below #define GREEDIEST 0 #define UPPERMOST 1 int Compute_Trace_ALL(Alignment *align, Work_Data *work); int Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing, int mode); int Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing, int mode); /* Compute_Trace_IRR (IRR for IRRegular) computes a trace for the given alignment where it assumes the spacing between trace points between both the A and B read varies, and further assumes that the A-spacing is given in the short integers normally occupied by the differences in the alignment between the trace points. This routine is experimental and may not persist in later versions of the code. */ int Compute_Trace_IRR(Alignment *align, Work_Data *work, int mode); // experimental !! /* Alignment_Cartoon prints an ASCII representation of the overlap relationship between the two reads of 'align' to the given 'file' indented by 'indent' spaces. Coord controls the display width of numbers, it must be not less than the width of any number to be displayed. 
If the alignment trace is an exact trace, then one can ask Print_Alignment to print an ASCII representation of the alignment 'align' to the file 'file'. Indent the display by "indent" spaces and put "width" columns per line in the display. Show "border" characters of sequence on each side of the aligned region. If upper is non-zero then display bases in upper case. If coord is greater than 0, then the positions of the first character in A and B in the given row are displayed with a field width given by coord's value. Print_Reference is like Print_Alignment but rather than printing exactly "width" columns per segment, it prints "block" characters of the A sequence in each segment. This results in segments of different lengths, but is convenient when looking at two alignments involving A as segments are guaranteed to cover the same interval of A in a segment. Both Print routines return 1 if an error occurred (not enough memory), and 0 otherwise. Flip_Alignment modifies align so the roles of A and B are reversed. If full is off then the trace is ignored, otherwise the trace must be a full alignment trace and this trace is also appropriately inverted. */ void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord); int Print_Alignment(FILE *file, Alignment *align, Work_Data *work, int indent, int width, int border, int upper, int coord); int Print_Reference(FILE *file, Alignment *align, Work_Data *work, int indent, int block, int border, int upper, int coord); void Flip_Alignment(Alignment *align, int full); /*** OVERLAP ABSTRACTION: Externally, between modules an Alignment is modeled by an "Overlap" record, which (a) replaces the pointers to the two sequences with their ID's in the DAZZ data bases, (b) does not contain the length of the 2 sequences (must fetch from DB), and (c) contains its path as a subrecord rather than as a pointer (indeed, typically the corresponding Alignment record points at the Overlap's path sub-record). 
The trace pointer is always to a sequence of trace points and can be either compressed (uint8) or uncompressed (uint16). One can read and write binary records of an "Overlap". ***/ typedef struct { Path path; /* Path: begin- and end-point of alignment + diffs */ uint32 flags; /* Pipeline status and complementation flags */ int aread; /* Id # of A sequence */ int bread; /* Id # of B sequence */ } Overlap; /* Read_Overlap reads the next Overlap record from stream 'input', not including the trace (if any), and without modifying 'ovl's trace pointer. Read_Trace reads the ensuing trace into the memory pointed at by the trace field of 'ovl'. It is assumed to be big enough to accommodate the trace where each value takes 'tbytes' bytes (1 if uint8 or 2 if uint16). Write_Overlap writes 'ovl' to stream 'output' followed by its trace vector (if any) that occupies 'tbytes' bytes per value. It returns non-zero if there was an error writing. Print_Overlap prints an ASCII version of the contents of 'ovl' to stream 'output' where the trace occupies 'tbytes' per value and the print out is indented from the left margin by 'indent' spaces. Compress_TraceTo8 converts a trace of 16-bit values to 8-bit values in place, and Decompress_TraceTo16 does the reverse conversion. Check_Trace_Points checks that the number of trace points is correct and that the sum of the b-read displacements equals the b-read alignment interval, assuming the trace spacing is 'tspace'. It reports an error message if there is a problem and 'verbose' is non-zero. The 'ovl' came from the file named 'fname'. 
*/ int Read_Overlap(FILE *input, Overlap *ovl); int Read_Trace(FILE *innput, Overlap *ovl, int tbytes); int Write_Overlap(FILE *output, Overlap *ovl, int tbytes); void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent); void Compress_TraceTo8(Overlap *ovl); void Decompress_TraceTo16(Overlap *ovl); int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname); #endif // _A_MODULE DALIGNER-master/daligner.c000066400000000000000000000527501322465224500155030ustar00rootroot00000000000000/*********************************************************************************************\ * * Find all local alignment between long, noisy DNA reads: * Compare sequences in 'subject' database against those in the list of 'target' databases * searching for local alignments of 1000bp or more (defined constant MIN_OVERLAP in * filter.c). Subject is compared in both orientations againt each target. An output * stream of 'Overlap' records (see align.h) is written in binary to the standard output, * each encoding a given found local alignment between two of the sequences. The -v * option turns on a verbose reporting mode that gives statistics on each major stage. * * The filter operates by looking for a pair of diagonal bands of width 2^'s' that contain * a collection of exact matching 'k'-mers between the two sequences, such that the total * number of bases covered by 'k'-mer hits is 'h'. k cannot be larger than 32 in the * current implementation. * * Some k-mers are significantly over-represented (e.g. homopolymer runs). These are * suppressed as seed hits, with the parameter 't' -- any k-mer that occurs more than * 't' times in either the subject or target is not counted as a seed hit. If the -t * option is absent then no k-mer is suppressed. Alternatively, the option -M specifies * that 't' is dynamically set to the largest value such that less than -M memory is * used. 
* * For each subject, target pair, say XXX and YYY, the program outputs a file containing * overlaps of the form XXX.YYY.[C|N]#.las where C implies that the reads in XXX were * complemented and N implies they were not (both comparisons are performed), and # is * the thread that detected and wrote out the collection of overlaps. For example, if * NTHREAD in the program is 4, then 8 files are output for each subject, target pair. * * Author: Gene Myers * Date : June 1, 2014 * *********************************************************************************************/ #include #include #include #include #include #include #include #include #include #include #if defined(BSD) #include #endif #include "DB.h" #include "filter.h" static char *Usage[] = { "[-vbAI] [-k] [-w] [-h] [-t] [-M] [-P]", " [-e] [-s] [-H] [-T]", " [-m]+ ...", }; int VERBOSE; // Globally visible to filter.c char *SORT_PATH; int BIASED; int MINOVER; int HGAP_MIN; int SYMMETRIC; int IDENTITY; uint64 MEM_LIMIT; uint64 MEM_PHYSICAL; /* Adapted from code by David Robert Nadeau (http://NadeauSoftware.com) licensed under * "Creative Commons Attribution 3.0 Unported License" * (http://creativecommons.org/licenses/by/3.0/deed.en_US) * * I removed Windows options, reformated, and return int64 instead of size_t */ static int64 getMemorySize( ) { #if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64)) // OSX, NetBSD, OpenBSD int mib[2]; size_t size = 0; size_t len = sizeof( size ); mib[0] = CTL_HW; #if defined(HW_MEMSIZE) mib[1] = HW_MEMSIZE; // OSX #elif defined(HW_PHYSMEM64) mib[1] = HW_PHYSMEM64; // NetBSD, OpenBSD #endif if (sysctl(mib,2,&size,&len,NULL,0) == 0) return ((size_t) size); return (0); #elif defined(_SC_AIX_REALMEM) // AIX return ((size_t) sysconf( _SC_AIX_REALMEM ) * ((size_t) 1024L)); #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) // FreeBSD, Linux, OpenBSD, & Solaris size_t size = 0; size = (size_t) sysconf(_SC_PHYS_PAGES); return (size * ((size_t) 
sysconf(_SC_PAGESIZE))); #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE) // ? Legacy ? size_t size = 0; size = (size_t) sysconf(_SC_PHYS_PAGES); return (size * ((size_t) sysconf(_SC_PAGE_SIZE))); #elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM)) // DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX int mib[2]; unsigned int size = 0; size_t len = sizeof( size ); mib[0] = CTL_HW; #if defined(HW_REALMEM) mib[1] = HW_REALMEM; // FreeBSD #elif defined(HW_PYSMEM) mib[1] = HW_PHYSMEM; // Others #endif if (sysctl(mib,2,&size,&len,NULL,0) == 0) return (size_t)size; return (0); #else return (0); #endif } typedef struct { int *ano; int *end; int idx; int out; } Event; static void reheap(int s, Event **heap, int hsize) { int c, l, r; Event *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; hr = heap[r]; if (hr->idx > hl->idx) { if (hs->idx > hl->idx) { heap[c] = hl; c = l; } else break; } else { if (hs->idx > hr->idx) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } static int64 merge_size(DAZZ_DB *block, int mtop) { Event ev[mtop+1]; Event *heap[mtop+2]; int r, mhalf; int64 nsize; { DAZZ_TRACK *track; int i; track = block->tracks; for (i = 0; i < mtop; i++) { ev[i].ano = ((int *) (track->data)) + ((int64 *) (track->anno))[0]; ev[i].out = 1; heap[i+1] = ev+i; track = track->next; } ev[mtop].idx = INT32_MAX; heap[mtop+1] = ev+mtop; } mhalf = mtop/2; nsize = 0; for (r = 0; r < block->nreads; r++) { int i, level, hsize; DAZZ_TRACK *track; track = block->tracks; for (i = 0; i < mtop; i++) { ev[i].end = ((int *) (track->data)) + ((int64 *) (track->anno))[r+1]; if (ev[i].ano < ev[i].end) ev[i].idx = *(ev[i].ano); else ev[i].idx = INT32_MAX; track = track->next; } hsize = mtop; for (i = mhalf; i > 1; i--) reheap(i,heap,hsize); level = 0; while (1) { Event *p; reheap(1,heap,hsize); p = heap[1]; if (p->idx == INT32_MAX) break; p->out = 1-p->out; if (p->out) { level -= 1; if (level == 0) nsize += 1; } 
else { if (level == 0) nsize += 1; level += 1; } p->ano += 1; if (p->ano >= p->end) p->idx = INT32_MAX; else p->idx = *(p->ano); } } return (nsize); } static DAZZ_TRACK *merge_tracks(DAZZ_DB *block, int mtop, int64 nsize) { DAZZ_TRACK *ntrack; Event ev[mtop+1]; Event *heap[mtop+2]; int r, mhalf; int64 *anno; int *data; ntrack = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK),"Allocating merged track"); if (ntrack == NULL) exit (1); ntrack->name = Strdup("merge","Allocating merged track"); ntrack->anno = anno = (int64 *) Malloc(sizeof(int64)*(block->nreads+1),"Allocating merged track"); ntrack->data = data = (int *) Malloc(sizeof(int)*nsize,"Allocating merged track"); ntrack->size = sizeof(int); ntrack->next = NULL; if (anno == NULL || data == NULL || ntrack->name == NULL) exit (1); { DAZZ_TRACK *track; int i; track = block->tracks; for (i = 0; i < mtop; i++) { ev[i].ano = ((int *) (track->data)) + ((int64 *) (track->anno))[0]; ev[i].out = 1; heap[i+1] = ev+i; track = track->next; } ev[mtop].idx = INT32_MAX; heap[mtop+1] = ev+mtop; } mhalf = mtop/2; nsize = 0; for (r = 0; r < block->nreads; r++) { int i, level, hsize; DAZZ_TRACK *track; anno[r] = nsize; track = block->tracks; for (i = 0; i < mtop; i++) { ev[i].end = ((int *) (track->data)) + ((int64 *) (track->anno))[r+1]; if (ev[i].ano < ev[i].end) ev[i].idx = *(ev[i].ano); else ev[i].idx = INT32_MAX; track = track->next; } hsize = mtop; for (i = mhalf; i > 1; i--) reheap(i,heap,hsize); level = 0; while (1) { Event *p; reheap(1,heap,hsize); p = heap[1]; if (p->idx == INT32_MAX) break; p->out = 1-p->out; if (p->out) { level -= 1; if (level == 0) data[nsize++] = p->idx; } else { if (level == 0) data[nsize++] = p->idx; level += 1; } p->ano += 1; if (p->ano >= p->end) p->idx = INT32_MAX; else p->idx = *(p->ano); } } anno[r] = nsize; return (ntrack); } static int read_DB(DAZZ_DB *block, char *name, char **mask, int *mstat, int mtop, int kmer) { int i, isdam, status, kind, stop; isdam = Open_DB(name,block); if (isdam < 0) exit 
(1); for (i = 0; i < mtop; i++) { status = Check_Track(block,mask[i],&kind); if (status >= 0) if (kind == MASK_TRACK) mstat[i] = 0; else { if (mstat[i] != 0) mstat[i] = -3; } else { if (mstat[i] == -2) mstat[i] = status; } if (status == 0 && kind == MASK_TRACK) Load_Track(block,mask[i]); } Trim_DB(block); stop = 0; for (i = 0; i < mtop; i++) { DAZZ_TRACK *track; int64 *anno; int j; status = Check_Track(block,mask[i],&kind); if (status < 0 || kind != MASK_TRACK) continue; stop += 1; track = Load_Track(block,mask[i]); anno = (int64 *) (track->anno); for (j = 0; j <= block->nreads; j++) anno[j] /= sizeof(int); } if (stop > 1) { int64 nsize; DAZZ_TRACK *track; nsize = merge_size(block,stop); track = merge_tracks(block,stop,nsize); while (block->tracks != NULL) Close_Track(block,block->tracks->name); block->tracks = track; } if (block->cutoff < kmer) { for (i = 0; i < block->nreads; i++) if (block->reads[i].rlen < kmer) { fprintf(stderr,"%s: Block %s contains reads < %dbp long ! Run DBsplit.\n", Prog_Name,name,kmer); exit (1); } } Read_All_Sequences(block,0); return (isdam); } static void complement(char *s, int len) { char *t; int c; t = s + (len-1); while (s < t) { c = *s; *s = (char) (3-*t); *t = (char) (3-c); s += 1; t -= 1; } if (s == t) *s = (char) (3-*s); } static DAZZ_DB *complement_DB(DAZZ_DB *block, int inplace) { static DAZZ_DB _cblock, *cblock = &_cblock; int nreads; DAZZ_READ *reads; char *seq; nreads = block->nreads; reads = block->reads; if (inplace) { seq = (char *) block->bases; cblock = block; } else { seq = (char *) Malloc(block->reads[nreads].boff+1,"Allocating dazzler sequence block"); if (seq == NULL) exit (1); *seq++ = 4; memmove(seq,block->bases,block->reads[nreads].boff); *cblock = *block; cblock->bases = (void *) seq; cblock->tracks = NULL; } { int i; float x; x = cblock->freq[0]; cblock->freq[0] = cblock->freq[3]; cblock->freq[3] = x; x = cblock->freq[1]; cblock->freq[1] = cblock->freq[2]; cblock->freq[2] = x; for (i = 0; i < nreads; i++) 
complement(seq+reads[i].boff,reads[i].rlen); } { DAZZ_TRACK *src, *trg; int *data, *tata; int i, x, rlen; int64 *tano, *anno; int64 j, k; for (src = block->tracks; src != NULL; src = src->next) { tano = (int64 *) src->anno; tata = (int *) src->data; if (inplace) { data = tata; anno = tano; trg = src; } else { data = (int *) Malloc(sizeof(int)*tano[nreads], "Allocating dazzler interval track data"); anno = (int64 *) Malloc(sizeof(int64)*(nreads+1), "Allocating dazzler interval track index"); trg = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK), "Allocating dazzler interval track header"); if (data == NULL || trg == NULL || anno == NULL) exit (1); trg->name = Strdup(src->name,"Copying track name"); if (trg->name == NULL) exit (1); trg->size = 4; trg->anno = (void *) anno; trg->data = (void *) data; trg->next = cblock->tracks; cblock->tracks = trg; } for (i = 0; i < nreads; i++) { rlen = reads[i].rlen; anno[i] = tano[i]; j = tano[i+1]-1; k = tano[i]; while (k < j) { x = tata[j]; data[j--] = rlen - tata[k]; data[k++] = rlen - x; } if (k == j) data[k] = rlen - tata[k]; } anno[nreads] = tano[nreads]; } } return (cblock); } static char *CommandBuffer(char *aname, char *bname) { static char *cat = NULL; static int max = -1; int len; len = 2*(strlen(aname) + strlen(bname)) + 200; if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) { fprintf(stderr,"%s: Out of memory (Making path name)\n",Prog_Name); exit (1); } } return (cat); } int main(int argc, char *argv[]) { DAZZ_DB _ablock, _bblock; DAZZ_DB *ablock = &_ablock, *bblock = &_bblock; char *afile, *bfile; char *aroot, *broot; void *aindex, *bindex; int alen, blen; Align_Spec *asettings; int isdam; int MMAX, MTOP, *MSTAT; char **MASK; int KMER_LEN; int BIN_SHIFT; int MAX_REPS; int HIT_MIN; double AVE_ERROR; int SPACING; int NTHREADS; { int i, j, k; int flags[128]; char *eptr; DIR *dirp; ARG_INIT("daligner") KMER_LEN = 14; HIT_MIN = 35; BIN_SHIFT = 6; MAX_REPS = 0; HGAP_MIN = 0; 
AVE_ERROR = .70; SPACING = 100; MINOVER = 1000; // Globally visible to filter.c NTHREADS = 4; SORT_PATH = "/tmp"; MEM_PHYSICAL = getMemorySize(); MEM_LIMIT = MEM_PHYSICAL; if (MEM_PHYSICAL == 0) { fprintf(stderr,"\nWarning: Could not get physical memory size\n"); fflush(stderr); } MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); MSTAT = (int *) Malloc(MMAX*sizeof(int),"Allocating mask status array"); if (MASK == NULL || MSTAT == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vbAI") break; case 'k': ARG_POSITIVE(KMER_LEN,"K-mer length") if (KMER_LEN > 32) { fprintf(stderr,"%s: K-mer length must be 32 or less\n",Prog_Name); exit (1); } break; case 'w': ARG_POSITIVE(BIN_SHIFT,"Log of bin width") break; case 'h': ARG_POSITIVE(HIT_MIN,"Hit threshold (in bp.s)") break; case 't': ARG_POSITIVE(MAX_REPS,"Tuple supression frequency") break; case 'H': ARG_POSITIVE(HGAP_MIN,"HGAP threshold (in bp.s)") break; case 'e': ARG_REAL(AVE_ERROR) if (AVE_ERROR < .7 || AVE_ERROR >= 1.) { fprintf(stderr,"%s: Average correlation must be in [.7,1.) 
(%g)\n", Prog_Name,AVE_ERROR); exit (1); } break; case 'l': ARG_POSITIVE(MINOVER,"Minimum alignment length") break; case 's': ARG_POSITIVE(SPACING,"Trace spacing") break; case 'M': { int limit; ARG_NON_NEGATIVE(limit,"Memory allocation (in Gb)") MEM_LIMIT = limit * 0x40000000ll; break; } case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); MSTAT = (int *) Realloc(MSTAT,MMAX*sizeof(int),"Reallocating mask status array"); if (MASK == NULL || MSTAT == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; case 'P': SORT_PATH = argv[i]+2; if ((dirp = opendir(SORT_PATH)) == NULL) { fprintf(stderr,"%s: -P option: cannot open directory %s\n",Prog_Name,SORT_PATH); exit (1); } closedir(dirp); break; case 'T': ARG_POSITIVE(NTHREADS,"Number of threads") break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; // Globally declared in filter.h BIASED = flags['b']; // Globally declared in filter.h SYMMETRIC = 1-flags['A']; IDENTITY = flags['I']; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); exit (1); } for (j = 0; j < MTOP; j++) MSTAT[j] = -2; } MINOVER *= 2; if (Set_Filter_Params(KMER_LEN,BIN_SHIFT,MAX_REPS,HIT_MIN,NTHREADS)) { fprintf(stderr,"Illegal combination of filter parameters\n"); exit (1); } /* Read in the reads in A */ afile = argv[1]; isdam = read_DB(ablock,afile,MASK,MSTAT,MTOP,KMER_LEN); if (isdam) aroot = Root(afile,".dam"); else aroot = Root(afile,".db"); asettings = New_Align_Spec( AVE_ERROR, SPACING, ablock->freq, 1); /* Compare against reads in B in both orientations */ { int i, j; char *command; aindex = NULL; broot = NULL; for (i = 2; i < argc; i++) { bfile = argv[i]; if (strcmp(afile,bfile) != 0) { isdam = read_DB(bblock,bfile,MASK,MSTAT,MTOP,KMER_LEN); if (isdam) broot = Root(bfile,".dam"); else broot = 
Root(bfile,".db"); } else broot = aroot; if (i == 2) { for (j = 0; j < MTOP; j++) { if (MSTAT[j] == -2) printf("%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[j]); else if (MSTAT[j] == -1) printf("%s: Warning: %s track not sync'd with relevant db.\n",Prog_Name,MASK[j]); else if (MSTAT[j] == -3) printf("%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[j]); } if (VERBOSE) printf("\nBuilding index for %s\n",aroot); aindex = Sort_Kmers(ablock,&alen); } if (aroot != broot) { if (VERBOSE) printf("\nBuilding index for %s\n",broot); bindex = Sort_Kmers(bblock,&blen); Match_Filter(aroot,ablock,broot,bblock,aindex,alen,bindex,blen,0,asettings); bblock = complement_DB(bblock,1); if (VERBOSE) printf("\nBuilding index for c(%s)\n",broot); bindex = Sort_Kmers(bblock,&blen); Match_Filter(aroot,ablock,broot,bblock,aindex,alen,bindex,blen,1,asettings); } else { Match_Filter(aroot,ablock,aroot,ablock,aindex,alen,aindex,alen,0,asettings); bblock = complement_DB(ablock,0); if (VERBOSE) printf("\nBuilding index for c(%s)\n",aroot); bindex = Sort_Kmers(bblock,&blen); Match_Filter(aroot,ablock,aroot,bblock,aindex,alen,bindex,blen,1,asettings); bblock->reads = NULL; // ablock & bblock share "reads" vector, don't let Close_DB // free it ! 
} Close_DB(bblock); command = CommandBuffer(aroot,broot); sprintf(command,"LAsort %s/%s.%s.[CN]*.las",SORT_PATH,aroot,broot); if (VERBOSE) printf("\n%s\n",command); system(command); sprintf(command,"LAmerge %s.%s.las %s/%s.%s.[CN]*.S.las",aroot,broot,SORT_PATH,aroot,broot); if (VERBOSE) printf("%s\n",command); system(command); sprintf(command,"rm %s/%s.%s.[CN]*.las",SORT_PATH,aroot,broot); if (VERBOSE) printf("%s\n",command); system(command); if (aroot != broot && SYMMETRIC) { sprintf(command,"LAsort %s/%s.%s.[CN]*.las",SORT_PATH,broot,aroot); if (VERBOSE) printf("%s\n",command); system(command); sprintf(command,"LAmerge %s.%s.las %s/%s.%s.[CN]*.S.las",broot,aroot, SORT_PATH,broot,aroot); if (VERBOSE) printf("%s\n",command); system(command); sprintf(command,"rm %s/%s.%s.[CN]*.las",SORT_PATH,broot,aroot); if (VERBOSE) printf("%s\n",command); system(command); } if (aroot != broot) free(broot); } } exit (0); } DALIGNER-master/filter.c000066400000000000000000002004731322465224500152000ustar00rootroot00000000000000/******************************************************************************************* * * Fast local alignment filter for long, noisy reads based on "dumbing down" of my RECOMB 2005 * filter with Jens Stoye, and a "smarting up" of the k-mer matching by turning it into * a threaded sort and merge paradigm using a super cache coherent radix sort. Local * alignment is accomplised with dynamically-banded O(nd) algorithm that terminates when * it fails to find a e-matching patch for a significant distance, and polishes the match * to the last e-prefix-positive 32-mer. 
* * Author : Gene Myers * First : June 2013 * Current: June 1, 2014 * ********************************************************************************************/ // A complete threaded code for the filter #include #include #include #include #include #include #include #include "DB.h" #include "filter.h" #include "align.h" #undef FOR_PACBIO #define THREAD pthread_t #define MAX_BIAS 2 // In -b mode, don't consider tuples with specificity // <= 4 ^ -(kmer-MAX_BIAS) #define MAXGRAM 10000 // Cap on k-mer count histogram (in count_thread, merge_thread) #define PANEL_SIZE 50000 // Size to break up very long A-reads #define PANEL_OVERLAP 10000 // Overlap of A-panels #define MATCH_CHUNK 100 // Max expected number of hits between two reads #define TRACE_CHUNK 20000 // Max expected trace points in hits between two reads #undef TEST_LSORT #undef TEST_KSORT #undef TEST_PAIRS #undef TEST_CSORT #define HOW_MANY 3000 // Print first HOW_MANY items for each of the TEST options above #define DO_ALIGNMENT #undef TEST_GATHER #undef TEST_CONTAIN #undef SHOW_OVERLAP // Show the cartoon #undef SHOW_ALIGNMENT // Show the alignment #define ALIGN_WIDTH 80 // Parameters for alignment #define ALIGN_INDENT 20 #define ALIGN_BORDER 10 #ifdef SHOW_OVERLAP #define NOTHREAD #endif #ifdef TEST_GATHER #define NOTHREAD #endif #ifdef TEST_CONTAIN #define NOTHREAD #endif typedef struct { uint64 p1; // The lower half uint64 p2; } Double; #if __ORDER_LITTLE_ENDIAN__ == __BYTE_ORDER__ typedef struct { uint64 code; int rpos; int read; } KmerPos; typedef struct { int diag; int apos; int aread; int bread; } SeedPair; #else typedef struct { uint64 code; int read; int rpos; } KmerPos; typedef struct { int apos; int diag; int bread; int aread; } SeedPair; #endif /******************************************************************************************* * * PARAMETER SETUP * ********************************************************************************************/ static int Kmer; static int Hitmin; static 
int Binshift; static int Suppress; static int Kshift; // 2*Kmer static uint64 Kmask; // 4^Kmer-1 static int TooFrequent; // (Suppress != 0) ? Suppress : INT32_MAX static int NTHREADS; // Adjusted downward to nearest power of 2 static int NSHIFT; // NTHREADS = 1 << NSHIFT int Set_Filter_Params(int kmer, int binshift, int suppress, int hitmin, int nthread) { if (kmer <= 1) return (1); Kmer = kmer; Binshift = binshift; Suppress = suppress; Hitmin = hitmin; Kshift = 2*Kmer; if (Kmer == 32) Kmask = 0xffffffffffffffffllu; else Kmask = (0x1llu << Kshift) - 1; if (Suppress == 0) TooFrequent = INT32_MAX; else TooFrequent = Suppress; NTHREADS = 1; NSHIFT = 0; while (2*NTHREADS <= nthread) { NTHREADS *= 2; NSHIFT += 1; } return (0); } /******************************************************************************************* * * LEXICOGRAPHIC SORT * ********************************************************************************************/ #define BMER 4 #define BSHIFT 8 // = 2*BMER #define BPOWR 256 // = 2^BSHIFT #define BMASK 0xffllu // = BPOWR-1 static uint64 QMASK; // = BMASK << NSHIFT static int LEX_shift; static int64 LEX_zsize; static int LEX_last; static int LEX_next; static Double *LEX_src; static Double *LEX_trg; typedef struct { int64 beg; int64 end; int64 tptr[BPOWR]; int64 *sptr; } Lex_Arg; static void *lex_thread(void *arg) { Lex_Arg *data = (Lex_Arg *) arg; int64 *sptr = data->sptr; int64 *tptr = data->tptr; int shift = LEX_shift; // Must be a multiple of 8 in [0,120] int qshift = (LEX_next - LEX_shift) - NSHIFT; int64 zsize = LEX_zsize; Double *src = LEX_src; Double *trg = LEX_trg; int64 i, n, x; uint64 c, b; n = data->end; if (shift >= 64) { shift -= 64; if (LEX_last) for (i = data->beg; i < n; i++) { c = src[i].p2; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; } else for (i = data->beg; i < n; i++) { c = src[i].p2; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; sptr[((b >> qshift) & QMASK) + x/zsize] += 1; } } else if ( ! 
LEX_last && LEX_next >= 64) // && LEX_shift < 64 { qshift = (LEX_next - 64) - NSHIFT; if (qshift < 0) for (i = data->beg; i < n; i++) { c = src[i].p1; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; sptr[((src[i].p2 << NSHIFT) & QMASK) + x/zsize] += 1; } else for (i = data->beg; i < n; i++) { c = src[i].p1; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; sptr[((src[i].p2 >> qshift) & QMASK) + x/zsize] += 1; } } else // LEX_last || LEX_next < 64 if (LEX_last) if (shift == 0) for (i = data->beg; i < n; i++) { c = src[i].p1; x = tptr[c&BMASK]++; trg[x] = src[i]; } else for (i = data->beg; i < n; i++) { c = src[i].p1; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; } else if (shift == 0) for (i = data->beg; i < n; i++) { c = src[i].p1; x = tptr[c&BMASK]++; trg[x] = src[i]; sptr[((c >> qshift) & QMASK) + x/zsize] += 1; } else for (i = data->beg; i < n; i++) { c = src[i].p1; b = (c >> shift); x = tptr[b&BMASK]++; trg[x] = src[i]; sptr[((b >> qshift) & QMASK) + x/zsize] += 1; } return (NULL); } static Double *lex_sort(int bytes[16], Double *src, Double *trg, Lex_Arg *parmx) { THREAD threads[NTHREADS]; int64 len, x, y; Double *xch; int i, j, k, z; int b, c, fb; len = parmx[NTHREADS-1].end; LEX_zsize = (len-1)/NTHREADS + 1; LEX_src = src; LEX_trg = trg; QMASK = (BMASK << NSHIFT); for (c = 0; c < 16; c++) if (bytes[c]) break; fb = c; for (b = c; b < 16; b = c) { for (c = b+1; c < 16; c++) if (bytes[c]) break; LEX_last = (c >= 16); LEX_shift = (b << 3); LEX_next = (c << 3); if (b == fb) { for (i = 0; i < NTHREADS; i++) for (z = 0; z < NTHREADS*BPOWR; z++) parmx[i].sptr[z] = 0; } else { x = 0; for (i = 0; i < NTHREADS; i++) { parmx[i].beg = x; x = LEX_zsize*(i+1); if (x > len) x = len; parmx[i].end = x; for (j = 0; j < BPOWR; j++) parmx[i].tptr[j] = 0; } parmx[NTHREADS-1].end = len; for (j = 0; j < BPOWR; j++) { k = (j << NSHIFT); for (z = 0; z < NTHREADS; z++) for (i = 0; i < NTHREADS; i++) { parmx[i].tptr[j] += parmx[z].sptr[k+i]; parmx[z].sptr[k+i] = 
0; } } } x = 0; for (j = 0; j < BPOWR; j++) for (i = 0; i < NTHREADS; i++) { y = parmx[i].tptr[j]; parmx[i].tptr[j] = x; x += y; } for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,lex_thread,parmx+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); xch = LEX_src; LEX_src = LEX_trg; LEX_trg = xch; #ifdef TEST_LSORT printf("\nLSORT %d\n",LEX_shift); if (LEX_shift >= 64) { x = (1 << ((LEX_shift-64)+BSHIFT))-1; for (i = 0; i < len; i++) { printf("%6d: %8llx %8llx %8llx %8llx : %4llx", i,LEX_src[i].p2>>32,(LEX_src[i].p2)&0xffffffffll,LEX_src[i].p1>>32, LEX_src[i].p1&0xffffffffll,LEX_src[i].p2&x); if (i > 0 && (LEX_src[i].p1 < LEX_src[i].p1 || (LEX_src[i].p1 == LEX_src[i].p1 && (LEX_src[i].p2 & x) < (LEX_src[i-1].p2 & x)))) printf(" OO"); printf("\n"); } } else { x = (1 << (LEX_shift+BSHIFT))-1; for (i = 0; i < len; i++) { printf("%6d: %8llx %8llx %8llx %8llx : %4llx", i,LEX_src[i].p2>>32,(LEX_src[i].p2)&0xffffffffll,LEX_src[i].p1>>32, LEX_src[i].p1&0xffffffffll,LEX_src[i].p1&x); if (i > 0 && (LEX_src[i].p1 & x) < (LEX_src[i-1].p1 & x)) printf(" OO"); printf("\n"); } } #endif } return (LEX_src); } /******************************************************************************************* * * INDEX BUILD * ********************************************************************************************/ static int *NormShift = NULL; static int LogNorm, LogThresh; static int LogBase[4]; static DAZZ_DB *TA_block; static KmerPos *TA_list; static DAZZ_TRACK *TA_track; typedef struct { int tnum; int64 *kptr; int fill; } Tuple_Arg; static void *tuple_thread(void *arg) { Tuple_Arg *data = (Tuple_Arg *) arg; int tnum = data->tnum; int64 *kptr = data->kptr; KmerPos *list = TA_list; int i, m, n, x, p; uint64 c; char *s; c = TA_block->nreads; i = (c * tnum) >> NSHIFT; n = TA_block->reads[i].boff; s = ((char *) (TA_block->bases)) + n; n -= Kmer*i; if (TA_track != NULL) { DAZZ_READ *reads = TA_block->reads; int64 *anno1 = ((int64 *) (TA_track->anno)) + 1; int 
*point = (int *) (TA_track->data); int64 a, b, f; int q = 0; f = anno1[i-1]; for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++) { b = f; f = anno1[i]; for (a = b; a <= f; a += 2) { if (a == b) p = 0; else p = point[a-1]; if (a == f) q = reads[i].rlen; else q = point[a]; if (p+Kmer <= q) { c = 0; for (x = 1; x < Kmer; x++) c = (c << 2) | s[p++]; while (p < q) { x = s[p]; c = ((c << 2) | x) & Kmask; list[n].read = i; list[n].rpos = p++; list[n].code = c; n += 1; kptr[c & BMASK] += 1; } } } s += (q+1); } m = TA_block->reads[m].boff - Kmer*m; kptr[BMASK] += (data->fill = m-n); while (n < m) { list[n].code = 0xffffffffffffffffllu; list[n].read = -1; list[n].rpos = -1; n += 1; } } else for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++) { c = p = 0; for (x = 1; x < Kmer; x++) c = (c << 2) | s[p++]; while ((x = s[p]) != 4) { c = ((c << 2) | x) & Kmask; list[n].read = i; list[n].rpos = p++; list[n].code = c; n += 1; kptr[c & BMASK] += 1; } s += (p+1); } return (NULL); } static void *biased_tuple_thread(void *arg) { Tuple_Arg *data = (Tuple_Arg *) arg; int tnum = data->tnum; int64 *kptr = data->kptr; KmerPos *list = TA_list; int n, i, m; int x, a, k, p; uint64 d, c; char *s, *t; c = TA_block->nreads; i = (c * tnum) >> NSHIFT; n = TA_block->reads[i].boff; s = ((char *) (TA_block->bases)) + n; n -= Kmer*i; if (TA_track != NULL) { DAZZ_READ *reads = TA_block->reads; int64 *anno1 = ((int64 *) (TA_track->anno)) + 1; int *point = (int *) (TA_track->data); int64 j, b, f; int q = 0; f = anno1[i-1]; for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++) { b = f; f = anno1[i]; t = s+1; for (j = b; j <= f; j += 2) { if (j == b) p = 0; else p = point[j-1]; if (j == f) q = reads[i].rlen; else q = point[j]; if (p+Kmer <= q) { c = 0; a = 0; k = 1; while (p < q) { x = s[p]; a += LogBase[x]; c = ((c << 2) | x); while (a < LogNorm && k < Kmer) { if (++p >= q) break; k += 1; x = s[p]; a += LogBase[x]; c = ((c << 2) | x); } while (1) { int u = a-LogBase[(int) t[p-k]]; if (u < LogNorm) break; a = u; k -= 1; } 
if (a > LogThresh) { d = ((c << NormShift[k]) & Kmask); list[n].read = i; list[n].rpos = p; list[n].code = d; n += 1; kptr[d & BMASK] += 1; } p += 1; a -= LogBase[(int) s[p-k]]; } } } s += (q+1); } } else for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++) { t = s+1; c = 0; p = a = 0; k = 1; while ((x = s[p]) != 4) { a += LogBase[x]; c = ((c << 2) | x); while (a < LogNorm && k < Kmer) { if ((x = s[++p]) == 4) goto eoread2; k += 1; a += LogBase[x]; c = ((c << 2) | x); } while (1) { int u = a-LogBase[(int) t[p-k]]; if (u < LogNorm) break; a = u; k -= 1; } if (a > LogThresh) { d = ((c << NormShift[k]) & Kmask); list[n].read = i; list[n].rpos = p; list[n].code = d; n += 1; kptr[d & BMASK] += 1; } p += 1; a -= LogBase[(int) s[p-k]]; } eoread2: s += (p+1); } m = TA_block->reads[m].boff - Kmer*m; kptr[BMASK] += (data->fill = m-n); while (n < m) { list[n].code = 0xffffffffffffffffllu; list[n].read = -1; list[n].rpos = -1; n += 1; } return (NULL); } static KmerPos *FR_src; static KmerPos *FR_trg; typedef struct { int beg; int end; int kept; } Comp_Arg; static void *compsize_thread(void *arg) { Comp_Arg *data = (Comp_Arg *) arg; int end = data->end; KmerPos *src = FR_src; int n, i, c, p; uint64 h, g; i = data->beg; h = src[i].code; n = 0; while (i < end) { p = i++; while ((g = src[i].code) == h) i += 1; if ((c = (i-p)) < TooFrequent) n += c; h = g; } data->kept = n; return (NULL); } static void *compress_thread(void *arg) { Comp_Arg *data = (Comp_Arg *) arg; int end = data->end; KmerPos *src = FR_src; KmerPos *trg = FR_trg; int n, i, p; uint64 h, g; i = data->beg; h = src[i].code; n = data->kept; while (i < end) { p = i++; while ((g = src[i].code) == h) i += 1; if (i-p < TooFrequent) { while (p < i) trg[n++] = src[p++]; } h = g; } return (NULL); } void *Sort_Kmers(DAZZ_DB *block, int *len) { THREAD threads[NTHREADS]; Tuple_Arg parmt[NTHREADS]; Comp_Arg parmf[NTHREADS]; Lex_Arg parmx[NTHREADS]; int mersort[16]; KmerPos *src, *trg, *rez; int kmers, nreads; int i, j, x, z; uint64 h; 
for (i = 0; i < NTHREADS; i++) parmx[i].sptr = (int64 *) alloca(NTHREADS*BPOWR*sizeof(int64)); for (i = 0; i < 16; i++) mersort[i] = 0; for (i = 0; i < Kshift; i += 8) mersort[i>>3] = 1; if (NormShift == NULL && BIASED) { double scale; NormShift = (int *) Malloc(sizeof(int)*(Kmer+1),"Allocating Sort_Kmers bias shift"); if (NormShift == NULL) exit (1); for (i = 0; i <= Kmer; i++) NormShift[i] = Kshift - 2*i; LogNorm = 10000 * Kmer; LogThresh = 10000 * (Kmer-MAX_BIAS); scale = -10000. / log(4.); for (i = 0; i < 4; i++) LogBase[i] = (int) ceil( scale * log(block->freq[i]) ); } nreads = block->nreads; kmers = block->reads[nreads].boff - Kmer * nreads; if (kmers <= 0) goto no_mers; if (( (Kshift-1)/BSHIFT + (TooFrequent < INT32_MAX) ) & 0x1) { trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); } else { src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); } if (src == NULL || trg == NULL) exit (1); if (VERBOSE) { printf("\n Kmer count = "); Print_Number((int64) kmers,0,stdout); printf("\n Using %.2fGb of space\n",(1. 
* kmers) / 33554432); fflush(stdout); } TA_block = block; TA_list = src; TA_track = block->tracks; for (i = 0; i < NTHREADS; i++) { parmt[i].tnum = i; parmt[i].kptr = parmx[i].tptr; for (j = 0; j < BPOWR; j++) parmt[i].kptr[j] = 0; } if (BIASED) for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,biased_tuple_thread,parmt+i); else for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,tuple_thread,parmt+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); x = 0; for (i = 0; i < NTHREADS; i++) { parmx[i].beg = x; j = (int) ((((int64) nreads) * (i+1)) >> NSHIFT); parmx[i].end = x = block->reads[j].boff - j*Kmer; } rez = (KmerPos *) lex_sort(mersort,(Double *) src,(Double *) trg,parmx); if (BIASED || TA_track != NULL) { if (Kmer%4 == 0) { int wedge[NTHREADS]; for (j = 0; j < NTHREADS; j++) if (parmt[j].fill > 0) break; j += 1; if (j < NTHREADS) { x = kmers-1; for (i = NTHREADS-1; i >= j; i--) { x = x - parmt[i].fill; z = x; while (rez[x].read >= 0) x -= 1; wedge[i] = z-x; } x += 1; z = x-parmt[j-1].fill; for (i = j; i < NTHREADS; i++) { memmove(rez+z,rez+x,wedge[i]*sizeof(KmerPos)); x += wedge[i] + parmt[i].fill; z += wedge[i]; } } } for (i = 0; i < NTHREADS; i++) kmers -= parmt[i].fill; } if (TooFrequent < INT32_MAX && kmers > 0) { parmf[0].beg = 0; for (i = 1; i < NTHREADS; i++) { x = (((int64) i)*kmers) >> NSHIFT; h = rez[x-1].code; while (rez[x].code == h) x += 1; parmf[i-1].end = parmf[i].beg = x; } parmf[NTHREADS-1].end = kmers; if (rez[kmers-1].code == 0xffffffffffffffffllu) rez[kmers].code = 0; else rez[kmers].code = 0xffffffffffffffffllu; if (src == rez) { FR_src = src; FR_trg = rez = trg; } else { FR_src = trg; FR_trg = rez = src; } for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,compsize_thread,parmf+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); x = 0; for (i = 0; i < NTHREADS; i++) { z = parmf[i].kept; parmf[i].kept = x; x += z; } kmers = x; for (i = 0; i < NTHREADS; i++) 
pthread_create(threads+i,NULL,compress_thread,parmf+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); } rez[kmers].code = 0xffffffffffffffffllu; rez[kmers+1].code = 0; if (src != rez) free(src); else free(trg); #ifdef TEST_KSORT { int i; printf("\nKMER SORT:\n"); for (i = 0; i < HOW_MANY && i < kmers; i++) { KmerPos *c = rez+i; printf(" %9d: %6d / %6d / %16llx\n",i,c->read,c->rpos,c->code); } fflush(stdout); } #endif if (VERBOSE) { if (TooFrequent < INT32_MAX || BIASED || TA_track != NULL) { printf(" Revised kmer count = "); Print_Number((int64) kmers,0,stdout); printf("\n"); } printf(" Index occupies %.2fGb\n",(1. * kmers) / 67108864); fflush(stdout); } if (kmers <= 0) { free(rez); goto no_mers; } if (kmers > (int64) (MEM_LIMIT/(4*sizeof(KmerPos)))) { fprintf(stderr,"Warning: Block size too big, index occupies more than 1/4 of"); if (MEM_LIMIT == MEM_PHYSICAL) fprintf(stderr," physical memory (%.1fGb)\n",(1.*MEM_LIMIT)/0x40000000ll); else fprintf(stderr," desired memory allocation (%.1fGb)\n",(1.*MEM_LIMIT)/0x40000000ll); fflush(stderr); } *len = kmers; return (rez); no_mers: *len = 0; return (NULL); } /******************************************************************************************* * * FILTER MATCH * ********************************************************************************************/ static int find_tuple(uint64 x, KmerPos *a, int n) { int l, r, m; // smallest k s.t. 
a[k].code >= x (or n if does not exist) l = 0; r = n; while (l < r) { m = ((l+r) >> 1); if (a[m].code < x) l = m+1; else r = m; } return (l); } // Determine what *will* be the size of the merged list and histogram of sizes for given cutoffs static KmerPos *MG_alist; static KmerPos *MG_blist; static SeedPair *MG_hits; static int MG_comp; static int MG_self; typedef struct { int abeg, aend; int bbeg, bend; int64 *kptr; int64 nhits; int limit; int64 hitgram[MAXGRAM]; } Merge_Arg; static void *count_thread(void *arg) { Merge_Arg *data = (Merge_Arg *) arg; KmerPos *asort = MG_alist; KmerPos *bsort = MG_blist; int64 *gram = data->hitgram; int64 nhits = 0; int aend = data->aend; int64 ct; int ia, ib; int jb, ja; uint64 ca, cb; uint64 da, db; int ar, ap; int a, b; ia = data->abeg; ca = asort[ia].code; ib = data->bbeg; cb = bsort[ib].code; if (MG_self) { while (1) { while (cb < ca) cb = bsort[++ib].code; while (cb > ca) ca = asort[++ia].code; if (cb == ca) { ja = ia++; while ((da = asort[ia].code) == ca) ia += 1; jb = ib++; while ((db = bsort[ib].code) == cb) ib += 1; if (ia > aend) { if (ja >= aend) break; da = asort[ia = aend].code; db = bsort[ib = data->bend].code; } ct = 0; b = jb; if (IDENTITY) for (a = ja; a < ia; a++) { ar = asort[a].read; if (MG_comp) { while (b < ib && bsort[b].read <= ar) b += 1; } else { ap = asort[a].rpos; while (b < ib && bsort[b].read < ar) b += 1; while (b < ib && bsort[b].read == ar && bsort[b].rpos < ap) b += 1; } ct += (b-jb); } else for (a = ja; a < ia; a++) { ar = asort[a].read; while (b < ib && bsort[b].read < ar) b += 1; ct += (b-jb); } nhits += ct; ca = da; cb = db; if (ct < MAXGRAM) gram[ct] += 1; } } } else { while (1) { while (cb < ca) cb = bsort[++ib].code; while (cb > ca) ca = asort[++ia].code; if (cb == ca) { ja = ia++; while ((da = asort[ia].code) == ca) ia += 1; jb = ib++; while ((db = bsort[ib].code) == cb) ib += 1; if (ia > aend) { if (ja >= aend) break; da = asort[ia = aend].code; db = bsort[ib = data->bend].code; } ct = 
(ia-ja); ct *= (ib-jb); nhits += ct; ca = da; cb = db; if (ct < MAXGRAM) gram[ct] += 1; } } } data->nhits = nhits; return (NULL); } // Produce the merged list now that the list has been allocated and // the appropriate cutoff determined. static void *merge_thread(void *arg) { Merge_Arg *data = (Merge_Arg *) arg; int64 *kptr = data->kptr; KmerPos *asort = MG_alist; KmerPos *bsort = MG_blist; SeedPair *hits = MG_hits; int64 nhits = data->nhits; int aend = data->aend; int limit = data->limit; int64 ct; int ia, ib; int jb, ja; uint64 ca, cb; uint64 da, db; int ar, ap; int a, b, c; ia = data->abeg; ca = asort[ia].code; ib = data->bbeg; cb = bsort[ib].code; if (MG_self) { while (1) { while (cb < ca) cb = bsort[++ib].code; while (cb > ca) ca = asort[++ia].code; if (cb == ca) { ja = ia++; while ((da = asort[ia].code) == ca) ia += 1; jb = ib++; while ((db = bsort[ib].code) == cb) ib += 1; if (ia > aend) { if (ja >= aend) break; da = asort[ia = aend].code; db = bsort[ib = data->bend].code; } ct = 0; b = jb; if (IDENTITY) for (a = ja; a < ia; a++) { ar = asort[a].read; if (MG_comp) { while (b < ib && bsort[b].read <= ar) b += 1; } else { ap = asort[a].rpos; while (b < ib && bsort[b].read < ar) b += 1; while (b < ib && bsort[b].read == ar && bsort[b].rpos < ap) b += 1; } ct += (b-jb); } else for (a = ja; a < ia; a++) { ar = asort[a].read; while (b < ib && bsort[b].read < ar) b += 1; ct += (b-jb); } if (ct < limit) { b = jb; if (IDENTITY) for (a = ja; a < ia; a++) { ap = asort[a].rpos; ar = asort[a].read; if (MG_comp) { while (b < ib && bsort[b].read <= ar) b += 1; } else { while (b < ib && bsort[b].read < ar) b += 1; while (b < ib && bsort[b].read == ar && bsort[b].rpos < ap) b += 1; } if ((ct = b-jb) > 0) { kptr[ap & BMASK] += ct; for (c = jb; c < b; c++) { hits[nhits].bread = bsort[c].read; hits[nhits].aread = ar; hits[nhits].apos = ap; hits[nhits].diag = ap - bsort[c].rpos; nhits += 1; } } } else for (a = ja; a < ia; a++) { ap = asort[a].rpos; ar = asort[a].read; while (b < 
ib && bsort[b].read < ar) b += 1; if ((ct = b-jb) > 0) { kptr[ap & BMASK] += ct; for (c = jb; c < b; c++) { hits[nhits].bread = bsort[c].read; hits[nhits].aread = ar; hits[nhits].apos = ap; hits[nhits].diag = ap - bsort[c].rpos; nhits += 1; } } } } ca = da; cb = db; } } } else { while (1) { while (cb < ca) cb = bsort[++ib].code; while (cb > ca) ca = asort[++ia].code; if (cb == ca) { if (ia >= aend) break; ja = ia++; while ((da = asort[ia].code) == ca) ia += 1; jb = ib++; while ((db = bsort[ib].code) == cb) ib += 1; if (ia > aend) { if (ja >= aend) break; da = asort[ia = aend].code; db = bsort[ib = data->bend].code; } ct = ib-jb; if ((ia-ja)*ct < limit) { for (a = ja; a < ia; a++) { ap = asort[a].rpos; kptr[ap & BMASK] += ct; for (b = jb; b < ib; b++) { hits[nhits].bread = bsort[b].read; hits[nhits].aread = asort[a].read; hits[nhits].apos = ap; hits[nhits].diag = ap - bsort[b].rpos; nhits += 1; } } } ca = da; cb = db; } } } return (NULL); } // Report threads: given a segment of merged list, find all seeds and from them all alignments. 
static DAZZ_DB *MR_ablock; static DAZZ_DB *MR_bblock; static SeedPair *MR_hits; static int MR_two; static Align_Spec *MR_spec; static int MR_tspace; typedef struct { uint64 max; uint64 top; uint16 *trace; } Trace_Buffer; static int Entwine(Path *jpath, Path *kpath, Trace_Buffer *tbuf, int *where) { int ac, b2, y2, ae; int i, j, k; int num, den, min; #ifdef SEE_ENTWINE int strt = 1; int iflare, oflare; #endif uint16 *ktrace = tbuf->trace + (uint64) (kpath->trace); uint16 *jtrace = tbuf->trace + (uint64) (jpath->trace); min = 10000; num = 0; den = 0; #ifdef SEE_ENTWINE printf("\n"); #endif y2 = jpath->bbpos; j = jpath->abpos/MR_tspace; b2 = kpath->bbpos; k = kpath->abpos/MR_tspace; if (jpath->abpos == kpath->abpos) { min = abs(y2-b2); if (min == 0) *where = kpath->abpos; } if (j < k) { ac = k*MR_tspace; j = 1 + 2*(k-j); k = 1; for (i = 1; i < j; i += 2) y2 += jtrace[i]; } else { ac = j*MR_tspace; k = 1 + 2*(j-k); j = 1; for (i = 1; i < k; i += 2) b2 += ktrace[i]; } ae = jpath->aepos; if (ae > kpath->aepos) ae = kpath->aepos; while (1) { ac += MR_tspace; if (ac >= ae) break; y2 += jtrace[j]; b2 += ktrace[k]; j += 2; k += 2; #ifdef SEE_ENTWINE printf(" @ %5d : %5d %5d = %4d\n",ac,y2,b2,abs(b2-y2)); #endif i = abs(y2-b2); if (i <= min) { min = i; if (i == 0) *where = ac; } num += i; den += 1; #ifdef SEE_ENTWINE if (strt) { strt = 0; iflare = i; } oflare = i; #endif } if (jpath->aepos == kpath->aepos) { i = abs(jpath->bepos-kpath->bepos); if (i <= min) { min = i; if (i == 0) *where = kpath->aepos; } } #ifdef SEE_ENTWINE if (den == 0) printf("Nothing\n"); else printf("MINIM = %d AVERAGE = %d IFLARE = %d OFLARE = %d\n",min,num/den,iflare,oflare); #endif if (den == 0) return (-1); else return (min); } // Produce the concatentation of path1 and path2 where they are known to meet at // the trace point with coordinate ap. 
Place this result in a big growing buffer, // that gets reset when fusion is called with path1 = NULL static void Fusion(Path *path1, int ap, Path *path2, Trace_Buffer *tbuf) { int k, k1, k2; int len, diff; uint16 *trace; k1 = 2 * ((ap/MR_tspace) - (path1->abpos/MR_tspace)); k2 = 2 * ((ap/MR_tspace) - (path2->abpos/MR_tspace)); len = k1+(path2->tlen-k2); if (tbuf->top + len >= tbuf->max) { tbuf->max = 1.2*(tbuf->top+len) + 1000; tbuf->trace = (uint16 *) Realloc(tbuf->trace,sizeof(uint16)*tbuf->max,"Allocating paths"); if (tbuf->trace == NULL) exit (1); } trace = tbuf->trace + tbuf->top; tbuf->top += len; diff = 0; len = 0; if (k1 > 0) { uint16 *t = tbuf->trace + (uint64) (path1->trace); for (k = 0; k < k1; k += 2) { trace[len++] = t[k]; trace[len++] = t[k+1]; diff += t[k]; } } if (k2 < path2->tlen) { uint16 *t = tbuf->trace + (uint64) (path2->trace); for (k = k2; k < path2->tlen; k += 2) { trace[len++] = t[k]; trace[len++] = t[k+1]; diff += t[k]; } } path1->aepos = path2->aepos; path1->bepos = path2->bepos; path1->diffs = diff; path1->trace = (void *) (trace - tbuf->trace); path1->tlen = len; } static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buffer *tbuf) { Path *jpath, *kpath; int j, k, no; int dist; int awhen = 0, bwhen = 0; int hasB; #ifdef TEST_CONTAIN for (j = 0; j < novls; j++) printf(" %3d: [%5d,%5d] x [%5d,%5d]\n",j,amatch[j].abpos,amatch[j].aepos, amatch[j].bbpos,amatch[j].bepos); #endif hasB = (bmatch != NULL); for (j = 1; j < novls; j++) { jpath = amatch+j; for (k = j-1; k >= 0; k--) { kpath = amatch+k; if (kpath->abpos < 0) continue; if (jpath->abpos < kpath->abpos) { if (kpath->abpos <= jpath->aepos && kpath->bbpos <= jpath->bepos) { dist = Entwine(jpath,kpath,tbuf,&awhen); if (dist == 0) { if (kpath->aepos > jpath->aepos) { if (hasB) { if (MG_comp) { dist = Entwine(bmatch+k,bmatch+j,tbuf,&bwhen); if (dist != 0) continue; Fusion(jpath,awhen,kpath,tbuf); Fusion(bmatch+k,bwhen,bmatch+j,tbuf); bmatch[j] = bmatch[k]; #ifdef 
TEST_CONTAIN printf(" Really 1"); #endif } else { dist = Entwine(bmatch+j,bmatch+k,tbuf,&bwhen); if (dist != 0) continue; Fusion(jpath,awhen,kpath,tbuf); Fusion(bmatch+j,bwhen,bmatch+k,tbuf); #ifdef TEST_CONTAIN printf(" Really 2"); #endif } } else { Fusion(jpath,awhen,kpath,tbuf); #ifdef TEST_CONTAIN printf(" Really 3"); #endif } k = j; } kpath->abpos = -1; #ifdef TEST_CONTAIN printf(" Fuse! A %d %d\n",j,k); #endif } } } else // kpath->abpos <= jpath->abpos { if (jpath->abpos <= kpath->aepos && jpath->bbpos <= kpath->bepos) { dist = Entwine(kpath,jpath,tbuf,&awhen); if (dist == 0) { if (kpath->abpos == jpath->abpos) { if (kpath->aepos > jpath->aepos) { *jpath = *kpath; if (hasB) bmatch[j] = bmatch[k]; } } else if (jpath->aepos > kpath->aepos) { if (hasB) { if (MG_comp) { dist = Entwine(bmatch+j,bmatch+k,tbuf,&bwhen); if (dist != 0) continue; Fusion(kpath,awhen,jpath,tbuf); *jpath = *kpath; Fusion(bmatch+j,bwhen,bmatch+k,tbuf); #ifdef TEST_CONTAIN printf(" Really 4"); #endif } else { dist = Entwine(bmatch+k,bmatch+j,tbuf,&bwhen); if (dist != 0) continue; Fusion(kpath,awhen,jpath,tbuf); *jpath = *kpath; Fusion(bmatch+k,bwhen,bmatch+j,tbuf); bmatch[j] = bmatch[k]; #ifdef TEST_CONTAIN printf(" Really 5"); #endif } } else { Fusion(kpath,awhen,jpath,tbuf); *jpath = *kpath; #ifdef TEST_CONTAIN printf(" Really 6"); #endif } k = j; } else { *jpath = *kpath; if (hasB) bmatch[j] = bmatch[k]; } kpath->abpos = -1; #ifdef TEST_CONTAIN printf(" Fuse! 
B %d %d\n",j,k); #endif } } } } } no = 0; for (j = 0; j < novls; j++) if (amatch[j].abpos >= 0) { if (hasB) bmatch[no] = bmatch[j]; amatch[no++] = amatch[j]; } novls = no; #ifdef TEST_CONTAIN for (j = 0; j < novls; j++) printf(" %3d: [%5d,%5d] x [%5d,%5d]\n",j,amatch[j].abpos,amatch[j].aepos, amatch[j].bbpos,amatch[j].bepos); #endif return (novls); } void Diagonal_Span(Path *path, int *mind, int *maxd) { uint16 *points; int i, tlen; int dd, low, hgh; points = path->trace; tlen = path->tlen; dd = path->abpos - path->bbpos; low = hgh = dd; dd = path->aepos - path->bepos; if (dd < low) low = dd; else if (dd > hgh) hgh = dd; dd = (path->abpos/MR_tspace)*MR_tspace - path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { dd += MR_tspace - points[i]; if (dd < low) low = dd; else if (dd > hgh) hgh = dd; } *mind = (low >> Binshift)-1; *maxd = (hgh >> Binshift)+1; } typedef struct { int64 beg, end; int *score; int *lastp; int *lasta; Work_Data *work; FILE *ofile1; FILE *ofile2; int64 nfilt; int64 ncheck; } Report_Arg; static void *report_thread(void *arg) { Report_Arg *data = (Report_Arg *) arg; SeedPair *hits = MR_hits; Double *hitd = (Double *) MR_hits; char *aseq = (char *) (MR_ablock->bases); char *bseq = (char *) (MR_bblock->bases); DAZZ_READ *aread = MR_ablock->reads; DAZZ_READ *bread = MR_bblock->reads; int *score = data->score; int *scorp = data->score + 1; int *scorm = data->score - 1; int *lastp = data->lastp; int *lasta = data->lasta; Work_Data *work = data->work; FILE *ofile1 = data->ofile1; FILE *ofile2 = data->ofile2; int afirst = MR_ablock->tfirst; int bfirst = MR_bblock->tfirst; int maxdiag = ( MR_ablock->maxlen >> Binshift); int mindiag = (-MR_bblock->maxlen >> Binshift); Overlap _ovla, *ovla = &_ovla; Overlap _ovlb, *ovlb = &_ovlb; Alignment _align, *align = &_align; Path *apath = &(ovla->path); Path *bpath; int64 nfilt = 0; int64 ahits = 0; int64 bhits = 0; int small, tbytes; int AOmax, BOmax; int novla, novlb; Path *amatch, *bmatch; Trace_Buffer _tbuf, 
*tbuf = &_tbuf; Double *hitc; int minhit; uint64 cpair; uint64 npair = 0; int64 nidx, eidx; // In ovl and align roles of A and B are reversed, as the B sequence must be the // complemented sequence !! align->flags = ovla->flags = ovlb->flags = MG_comp; align->path = apath; if (MR_tspace <= TRACE_XOVR) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = sizeof(uint16); } AOmax = BOmax = MATCH_CHUNK; amatch = Malloc(sizeof(Path)*AOmax,"Allocating match vector"); bmatch = Malloc(sizeof(Path)*BOmax,"Allocating match vector"); tbuf->max = 2*TRACE_CHUNK; tbuf->trace = Malloc(sizeof(short)*tbuf->max,"Allocating trace vector"); if (amatch == NULL || bmatch == NULL || tbuf->trace == NULL) exit (1); fwrite(&ahits,sizeof(int64),1,ofile1); fwrite(&MR_tspace,sizeof(int),1,ofile1); if (MR_two) { fwrite(&bhits,sizeof(int64),1,ofile2); fwrite(&MR_tspace,sizeof(int),1,ofile2); } minhit = (Hitmin-1)/Kmer + 1; hitc = hitd + (minhit-1); eidx = data->end - minhit; nidx = data->beg; for (cpair = hitd[nidx].p2; nidx <= eidx; cpair = npair) if (hitc[nidx].p2 != cpair) { nidx += 1; while ((npair = hitd[nidx].p2) == cpair) nidx += 1; } else { int ar, br; int alen, blen; int doA, doB; int setaln, amark, amark2; int apos, bpos, diag; int64 lidx, sidx; int64 f, h2; ar = hits[nidx].aread; br = hits[nidx].bread; alen = aread[ar].rlen; blen = bread[br].rlen; if (alen < HGAP_MIN && blen < HGAP_MIN) { nidx += 1; while ((npair = hitd[nidx].p2) == cpair) nidx += 1; continue; } #ifdef TEST_GATHER printf("%5d vs %5d : %5d x %5d\n",br+bfirst,ar+afirst,blen,alen); #endif setaln = 1; doA = doB = 0; amark2 = 0; novla = novlb = 0; tbuf->top = 0; for (sidx = nidx; hitd[nidx].p2 == cpair; nidx = h2) { amark = amark2 + PANEL_SIZE; amark2 = amark - PANEL_OVERLAP; h2 = lidx = nidx; do { apos = hits[nidx].apos; npair = hitd[++nidx].p2; if (apos <= amark2) h2 = nidx; } while (npair == cpair && apos <= amark); if (nidx-lidx < minhit) continue; for (f = lidx; f < nidx; f++) { apos = hits[f].apos; diag 
= hits[f].diag >> Binshift; if (apos - lastp[diag] >= Kmer) score[diag] += Kmer; else score[diag] += apos - lastp[diag]; lastp[diag] = apos; } #ifdef TEST_GATHER printf(" %6lld upto %6d",nidx-lidx,amark); #endif for (f = lidx; f < nidx; f++) { apos = hits[f].apos; diag = hits[f].diag; bpos = apos - diag; diag = diag >> Binshift; if (apos > lasta[diag] && (score[diag] + scorp[diag] >= Hitmin || score[diag] + scorm[diag] >= Hitmin)) { if (setaln) { setaln = 0; align->aseq = aseq + aread[ar].boff; align->bseq = bseq + bread[br].boff; align->alen = alen; align->blen = blen; ovlb->bread = ovla->aread = ar + afirst; ovlb->aread = ovla->bread = br + bfirst; #ifdef FOR_PACBIO doA = 1; doB = (SYMMETRIC && (ar != br || !MG_self || !MG_comp)); #else doA = (alen >= HGAP_MIN); doB = (SYMMETRIC && blen >= HGAP_MIN && (ar != br || !MG_self || !MG_comp)); #endif } #ifdef TEST_GATHER else printf("\n "); if (scorm[diag] > scorp[diag]) printf(" %5d.. x %5d.. %5d (%3d)", bpos,apos,apos-bpos,score[diag]+scorm[diag]); else printf(" %5d.. x %5d.. 
%5d (%3d)", bpos,apos,apos-bpos,score[diag]+scorp[diag]); #endif nfilt += 1; #ifdef DO_ALIGNMENT bpath = Local_Alignment(align,work,MR_spec,apos-bpos,apos-bpos,apos+bpos,-1,-1); { int low, hgh, ae; Diagonal_Span(apath,&low,&hgh); if (diag < low) low = diag; else if (diag > hgh) hgh = diag; ae = apath->aepos; for (diag = low; diag <= hgh; diag++) if (ae > lasta[diag]) lasta[diag] = ae; #ifdef TEST_GATHER printf(" %d - %d @ %d",low,hgh,apath->aepos); #endif } if ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos) >= MINOVER) { if (doA) { if (novla >= AOmax) { AOmax = 1.2*novla + MATCH_CHUNK; amatch = Realloc(amatch,sizeof(Path)*AOmax, "Reallocating match vector"); if (amatch == NULL) exit (1); } if (tbuf->top + apath->tlen > tbuf->max) { tbuf->max = 1.2*(tbuf->top+apath->tlen) + TRACE_CHUNK; tbuf->trace = Realloc(tbuf->trace,sizeof(short)*tbuf->max, "Reallocating trace vector"); if (tbuf->trace == NULL) exit (1); } amatch[novla] = *apath; amatch[novla].trace = (void *) (tbuf->top); memmove(tbuf->trace+tbuf->top,apath->trace,sizeof(short)*apath->tlen); novla += 1; tbuf->top += apath->tlen; } if (doB) { if (novlb >= BOmax) { BOmax = 1.2*novlb + MATCH_CHUNK; bmatch = Realloc(bmatch,sizeof(Path)*BOmax, "Reallocating match vector"); if (bmatch == NULL) exit (1); } if (tbuf->top + bpath->tlen > tbuf->max) { tbuf->max = 1.2*(tbuf->top+bpath->tlen) + TRACE_CHUNK; tbuf->trace = Realloc(tbuf->trace,sizeof(short)*tbuf->max, "Reallocating trace vector"); if (tbuf->trace == NULL) exit (1); } bmatch[novlb] = *bpath; bmatch[novlb].trace = (void *) (tbuf->top); memmove(tbuf->trace+tbuf->top,bpath->trace,sizeof(short)*bpath->tlen); novlb += 1; tbuf->top += bpath->tlen; } #ifdef TEST_GATHER printf(" [%5d,%5d] x [%5d,%5d] = %4d", apath->abpos,apath->aepos,apath->bbpos,apath->bepos,apath->diffs); #endif #ifdef SHOW_OVERLAP printf("\n\n %d(%d) vs %d(%d)\n\n", ovla->aread,ovla->alen,ovla->bread,ovla->blen); Print_ACartoon(stdout,align,ALIGN_INDENT); #ifdef SHOW_ALIGNMENT 
Compute_Trace_ALL(align,work); printf("\n Diff = %d\n",align->path->diffs); Print_Alignment(stdout,align,work, ALIGN_INDENT,ALIGN_WIDTH,ALIGN_BORDER,0,5); #endif #endif // SHOW_OVERLAP } #ifdef TEST_GATHER else printf(" No alignment %d", ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos))/2); #endif #endif // DO_ALIGNMENT } } for (f = lidx; f < nidx; f++) { diag = hits[f].diag >> Binshift; score[diag] = lastp[diag] = 0; } #ifdef TEST_GATHER printf("\n"); #endif } for (f = sidx; f < nidx; f++) { int d; diag = hits[f].diag >> Binshift; for (d = diag; d <= maxdiag; d++) if (lasta[d] == 0) break; else lasta[d] = 0; for (d = diag-1; d >= mindiag; d--) if (lasta[d] == 0) break; else lasta[d] = 0; } { int i; #ifdef TEST_CONTAIN if (novla > 1 || novlb > 1) printf("\n%5d vs %5d:\n",ar,br); #endif if (novla > 1) { if (novlb > 1) novla = novlb = Handle_Redundancies(amatch,novla,bmatch,tbuf); else novla = Handle_Redundancies(amatch,novla,NULL,tbuf); } else if (novlb > 1) novlb = Handle_Redundancies(bmatch,novlb,NULL,tbuf); for (i = 0; i < novla; i++) { ovla->path = amatch[i]; ovla->path.trace = tbuf->trace + (uint64) (ovla->path.trace); if (small) Compress_TraceTo8(ovla); if (Write_Overlap(ofile1,ovla,tbytes)) { fprintf(stderr,"%s: Cannot write to %s too small?\n",SORT_PATH,Prog_Name); exit (1); } } for (i = 0; i < novlb; i++) { ovlb->path = bmatch[i]; ovlb->path.trace = tbuf->trace + (uint64) (ovlb->path.trace); if (small) Compress_TraceTo8(ovlb); if (Write_Overlap(ofile2,ovlb,tbytes)) { fprintf(stderr,"%s: Cannot write to %s, too small?\n",SORT_PATH,Prog_Name); exit (1); } } ahits += novla; bhits += novlb; } } free(tbuf->trace); free(bmatch); free(amatch); data->nfilt = nfilt; data->ncheck = ahits + bhits; if (MR_two) { rewind(ofile2); fwrite(&bhits,sizeof(int64),1,ofile2); fclose(ofile2); } else ahits += bhits; rewind(ofile1); fwrite(&ahits,sizeof(int64),1,ofile1); fclose(ofile1); return (NULL); } 
/******************************************************************************************* * * THE ALGORITHM * ********************************************************************************************/ static char *NameBuffer(char *aname, char *bname) { static char *cat = NULL; static int max = -1; int len; len = strlen(aname) + strlen(bname) + 100; if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) { fprintf(stderr,"%s: Out of memory (Making path name)\n",Prog_Name); exit (1); } } return (cat); } void Match_Filter(char *aname, DAZZ_DB *ablock, char *bname, DAZZ_DB *bblock, void *vasort, int alen, void *vbsort, int blen, int comp, Align_Spec *aspec) { THREAD threads[NTHREADS]; Merge_Arg parmm[NTHREADS]; Lex_Arg parmx[NTHREADS]; Report_Arg parmr[NTHREADS]; int pairsort[16]; char *fname; SeedPair *khit, *hhit; SeedPair *work1, *work2; int64 nhits; int64 nfilt, ncheck; KmerPos *asort, *bsort; int64 atot, btot; asort = (KmerPos *) vasort; bsort = (KmerPos *) vbsort; atot = ablock->totlen; btot = bblock->totlen; MR_tspace = Trace_Spacing(aspec); { int64 powr; int i, nbyte; for (i = 0; i < NTHREADS; i++) parmx[i].sptr = (int64 *) alloca(NTHREADS*BPOWR*sizeof(int64)); for (i = 0; i < 16; i++) pairsort[i] = 0; powr = 1; for (nbyte = 0; powr < ablock->maxlen; nbyte += 1) powr <<= 8; for (i = 4; i < 4+nbyte; i++) pairsort[i] = 1; powr = 1; for (nbyte = 0; powr < ablock->nreads; nbyte += 1) powr <<= 8; for (i = 8; i < 8+nbyte; i++) pairsort[i] = 1; powr = 1; for (nbyte = 0; powr < bblock->nreads; nbyte += 1) powr <<= 8; for (i = 12; i < 12+nbyte; i++) pairsort[i] = 1; } nfilt = ncheck = nhits = 0; if (VERBOSE) { if (comp) printf("\nComparing %s to c(%s)\n",aname,bname); else printf("\nComparing %s to %s\n",aname,bname); } if (alen == 0 || blen == 0) goto zerowork; { int i, j, p; uint64 c; int limit; MG_alist = asort; MG_blist = bsort; MG_self = (aname == bname); MG_comp = comp; parmm[0].abeg = parmm[0].bbeg = 0; for (i = 1; i 
< NTHREADS; i++) { p = (int) ((((int64) alen) * i) >> NSHIFT); if (p > 0) { c = asort[p-1].code; while (asort[p].code == c) p += 1; } parmm[i].abeg = parmm[i-1].aend = p; parmm[i].bbeg = parmm[i-1].bend = find_tuple(asort[p].code,bsort,blen); } parmm[NTHREADS-1].aend = alen; parmm[NTHREADS-1].bend = blen; for (i = 0; i < NTHREADS; i++) for (j = 0; j < MAXGRAM; j++) parmm[i].hitgram[j] = 0; for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,count_thread,parmm+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); if (VERBOSE) printf("\n"); if (MEM_LIMIT > 0) { int64 histo[MAXGRAM]; int64 tom, avail; for (j = 0; j < MAXGRAM; j++) histo[j] = parmm[0].hitgram[j]; for (i = 1; i < NTHREADS; i++) for (j = 0; j < MAXGRAM; j++) histo[j] += parmm[i].hitgram[j]; avail = (int64) (MEM_LIMIT - (sizeof_DB(ablock) + sizeof_DB(bblock))) / sizeof(Double); if (asort == bsort || avail > alen + 2*blen) avail = (avail - alen) / 2; else avail = avail - (alen + blen); avail *= .98; tom = 0; for (j = 0; j < MAXGRAM; j++) { tom += j*histo[j]; if (tom > avail) break; } limit = j; if (limit <= 1) { fprintf(stderr,"\nError: Insufficient "); if (MEM_LIMIT == MEM_PHYSICAL) fprintf(stderr," physical memory (%.1fGb), reduce block size\n", (1.*MEM_LIMIT)/0x40000000ll); else { fprintf(stderr," memory allocation (%.1fGb),",(1.*MEM_LIMIT)/0x40000000ll); fprintf(stderr," reduce block size or increase allocation\n"); } fflush(stderr); exit (1); } if (limit < 10) { fprintf(stderr,"\nWarning: Sensitivity hampered by low "); if (MEM_LIMIT == MEM_PHYSICAL) fprintf(stderr," physical memory (%.1fGb), reduce block size\n", (1.*MEM_LIMIT)/0x40000000ll); else { fprintf(stderr," memory allocation (%.1fGb),",(1.*MEM_LIMIT)/0x40000000ll); fprintf(stderr," reduce block size or increase allocation\n"); } fflush(stderr); } if (VERBOSE) { printf(" Capping mutual k-mer matches over %d (effectively -t%d)\n", limit,(int) sqrt(1.*limit)); fflush(stdout); } for (i = 0; i < NTHREADS; i++) { 
parmm[i].nhits = 0; for (j = 1; j < limit; j++) parmm[i].nhits += j * parmm[i].hitgram[j]; parmm[i].limit = limit; } } else for (i = 0; i < NTHREADS; i++) parmm[i].limit = INT32_MAX; nhits = parmm[0].nhits; for (i = 1; i < NTHREADS; i++) parmm[i].nhits = nhits += parmm[i].nhits; if (VERBOSE) { printf(" Hit count = "); Print_Number(nhits,0,stdout); if (asort == bsort || nhits >= blen) printf("\n Highwater of %.2fGb space\n", (1. * (alen + 2*nhits)) / 67108864); else printf("\n Highwater of %.2fGb space\n", (1. * (alen + blen + nhits)) / 67108864); fflush(stdout); } if (nhits == 0) goto zerowork; if (asort == bsort) hhit = work1 = (SeedPair *) Malloc(sizeof(SeedPair)*(nhits+1), "Allocating daligner hit vectors"); else { if (nhits >= blen) bsort = (KmerPos *) Realloc(bsort,sizeof(SeedPair)*(nhits+1), "Reallocating daligner sort vectors"); hhit = work1 = (SeedPair *) bsort; } khit = work2 = (SeedPair *) Malloc(sizeof(SeedPair)*(nhits+1), "Allocating daligner hit vectors"); if (hhit == NULL || khit == NULL || bsort == NULL) exit (1); MG_blist = bsort; MG_hits = khit; for (i = NTHREADS-1; i > 0; i--) parmm[i].nhits = parmm[i-1].nhits; parmm[0].nhits = 0; for (i = 0; i < NTHREADS; i++) { parmm[i].kptr = parmx[i].tptr; for (p = 0; p < BPOWR; p++) parmm[i].kptr[p] = 0; } for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,merge_thread,parmm+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); #ifdef TEST_PAIRS printf("\nSETUP SORT:\n"); for (i = 0; i < HOW_MANY && i < nhits; i++) { SeedPair *c = khit+i; printf(" %5d / %5d / %5d /%5d\n",c->aread,c->bread,c->apos,c->apos-c->diag); } #endif } { int i; int64 x; x = 0; for (i = 0; i < NTHREADS-1; i++) { parmx[i].beg = x; parmx[i].end = x = parmm[i+1].nhits; } parmx[NTHREADS-1].beg = x; parmx[NTHREADS-1].end = nhits; khit = (SeedPair *) lex_sort(pairsort,(Double *) khit,(Double *) hhit,parmx); khit[nhits].aread = 0x7fffffff; khit[nhits].bread = 0x7fffffff; khit[nhits].diag = 0x7fffffff; khit[nhits].apos = 0; 
#ifdef TEST_CSORT printf("\nCROSS SORT %lld:\n",nhits); for (i = 0; i < HOW_MANY && i <= nhits; i++) { SeedPair *c = khit+i; printf(" %5d / %5d / %5d /%5d\n",c->aread,c->bread,c->apos,c->apos-c->diag); } #endif } { int i, w; int64 p; int d; int *counters; MR_ablock = ablock; MR_bblock = bblock; MR_hits = khit; MR_two = ! MG_self && SYMMETRIC; MR_spec = aspec; parmr[0].beg = 0; for (i = 1; i < NTHREADS; i++) { p = (nhits * i) >> NSHIFT; if (p > 0) { d = khit[p-1].bread; while ((khit[p].bread) == d) p += 1; } parmr[i].beg = parmr[i-1].end = p; } parmr[NTHREADS-1].end = nhits; w = ((ablock->maxlen >> Binshift) - ((-bblock->maxlen) >> Binshift)) + 1; counters = (int *) Malloc(NTHREADS*3*w*sizeof(int),"Allocating diagonal buckets"); if (counters == NULL) exit (1); fname = NameBuffer(aname,bname); for (i = 0; i < 3*w*NTHREADS; i++) counters[i] = 0; for (i = 0; i < NTHREADS; i++) { if (i == 0) parmr[i].score = counters - ((-bblock->maxlen) >> Binshift); else parmr[i].score = parmr[i-1].lasta + w; parmr[i].lastp = parmr[i].score + w; parmr[i].lasta = parmr[i].lastp + w; parmr[i].work = New_Work_Data(); sprintf(fname,"%s/%s.%s.%c%d.las",SORT_PATH,aname,bname,(comp?'C':'N'),i+1); parmr[i].ofile1 = Fopen(fname,"w"); if (parmr[i].ofile1 == NULL) exit (1); if (MG_self) parmr[i].ofile2 = parmr[i].ofile1; else if (SYMMETRIC) { sprintf(fname,"%s/%s.%s.%c%d.las",SORT_PATH,bname,aname,(comp?'C':'N'),i+1); parmr[i].ofile2 = Fopen(fname,"w"); if (parmr[i].ofile2 == NULL) exit (1); } } #ifdef NOTHREAD for (i = 0; i < NTHREADS; i++) report_thread(parmr+i); #else for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,report_thread,parmr+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); #endif if (VERBOSE) for (i = 0; i < NTHREADS; i++) { nfilt += parmr[i].nfilt; ncheck += parmr[i].ncheck; } for (i = 0; i < NTHREADS; i++) Free_Work_Data(parmr[i].work); free(counters); } free(work2); free(work1); goto epilogue; zerowork: { FILE *ofile; int i; fname = 
NameBuffer(aname,bname); nhits = 0; for (i = 0; i < NTHREADS; i++) { sprintf(fname,"%s/%s.%s.%c%d.las",SORT_PATH,aname,bname,(comp?'C':'N'),i+1); ofile = Fopen(fname,"w"); fwrite(&nhits,sizeof(int64),1,ofile); fwrite(&MR_tspace,sizeof(int),1,ofile); fclose(ofile); if (! MG_self && SYMMETRIC) { sprintf(fname,"%s/%s.%s.%c%d.las",SORT_PATH,bname,aname,(comp?'C':'N'),i+1); ofile = Fopen(fname,"w"); fwrite(&nhits,sizeof(int64),1,ofile); fwrite(&MR_tspace,sizeof(int),1,ofile); fclose(ofile); } } } epilogue: if (VERBOSE) { int width; if (nhits <= 0) width = 1; else width = ((int) log10((double) nhits)) + 1; width += (width-1)/3; printf("\n "); Print_Number(nhits,width,stdout); printf(" %d-mers (%e of matrix)\n ",Kmer,(1.*nhits/atot)/btot); Print_Number(nfilt,width,stdout); printf(" seed hits (%e of matrix)\n ",(1.*nfilt/atot)/btot); Print_Number(ncheck,width,stdout); printf(" confirmed hits (%e of matrix)\n",(1.*ncheck/atot)/btot); fflush(stdout); } } DALIGNER-master/filter.h000066400000000000000000000016201322465224500151760ustar00rootroot00000000000000/******************************************************************************************* * * Filter interface for the dazzler. * * Author: Gene Myers * Date : July 2013 * ********************************************************************************************/ #ifndef _FILTER #define _FILTER #include "DB.h" #include "align.h" extern int BIASED; extern int VERBOSE; extern int MINOVER; extern int HGAP_MIN; extern int SYMMETRIC; extern int IDENTITY; extern char *SORT_PATH; extern uint64 MEM_LIMIT; extern uint64 MEM_PHYSICAL; int Set_Filter_Params(int kmer, int binshift, int suppress, int hitmin, int nthreads); void *Sort_Kmers(DAZZ_DB *block, int *len); void Match_Filter(char *aname, DAZZ_DB *ablock, char *bname, DAZZ_DB *bblock, void *atable, int alen, void *btable, int blen, int comp, Align_Spec *asettings); #endif