SIBsim4-0.20/0000755000551200011300000000000011361265654012033 5ustar chrisludwigSIBsim4-0.20/COPYRIGHT0000644000551200011300000000443610041504446013322 0ustar chrisludwigThis package (SIBsim4) implements a variation of the Sim4 algorithm, as described below, originally written by Liliana Florea and Scott Schwartz. The modifications were written by Claudio Lottaz and Christian Iseli, while working for the Swiss Institute of Bioinformatics and the Ludwig Institute for Cancer Research. The modifications are Copyright (c) Swiss Institute of Bioinformatics, and Ludwig Institute for Cancer Research (LICR), 2001-2004. For the purposes of this copyright, the Swiss Institute of Bioinformatics acts on behalf of its partner, LICR. The resulting, modified, package is licensed under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. Below is the original copyright notice. ----------------------------------------------------------------------- This package implements the Sim4 algorithm for aligning expressed DNA with genomic sequences, described in the paper: L. Florea, G. Hartzell, Z. Zhang, G. Rubin, and W. Miller (1998) "A computer program for aligning a cDNA sequence with a genomic DNA sequence." Genome Research 8, 967-974. Portions copyright by: Copyright (C) 1998-2001 Liliana Florea Copyright (C) 1998-2001 Scott Schwartz This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # The following files were written by Liliana Florea: Xtend1.c Xtend1.h align.c align.h sim4.h sim4.init.c sim4b1.c sim4b1.h splice.c splice.h # The following files were written by Scott Schwartz: args.c args.h charvec.c charvec.h discrim.c discrim.h dna.c dna.h encoding.c encoding.h libc.h misc.c misc.h prnt.c prnt.h psublast.h seq.c seq.h seq_read.c types.h SIBsim4-0.20/align.c0000644000551200011300000004003010774266223013266 0ustar chrisludwig/* $Id: align.c,v 1.21 2008/03/31 22:58:27 c4chris Exp $ * * Christian Iseli, LICR ITO, Christian.Iseli@licr.org * * Copyright (c) 2001-2006 Swiss Institute of Bioinformatics. * Copyright (C) 1998-2001 Liliana Florea. */ #include #include #include #include #include "sim4.h" #include "sim4b1.h" #include "align.h" #include "misc.h" static int snake(uchar *,uchar *,int, int, int, int); static int rsnake(uchar *,uchar *,int, int, int, int, int, int); void align_path(uchar *seq1, uchar *seq2, int i1, int j1, int i2, int j2, int dist, edit_script_p_t *head, edit_script_p_t *tail, int M, int N) { int *last_d, *temp_d, /* forward vectors */ *rlast_d, *rtemp_d; /* backward vectors */ edit_script_p_t head1, tail1, head2, tail2; int midc, rmidc; int start, lower, upper; int rstart, rlower, rupper; int c, k, row; int mi, mj, tmp, ll, uu; char flag; *head = *tail = NULL; /* Boundary cases */ if (i1 == i2) { if (j1 == j2) *head = NULL; else { head1 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); head1->op_type = INSERT; head1->num = j2-j1; head1->next = NULL; *head = *tail = head1; } return; } if (j1 == j2) { head1 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); head1->op_type = DELETE; head1->num = i2-i1; head1->next = NULL; *head = *tail = head1; return; } if (dist <= 1) { start = j1-i1; if (j2-i2 == j1-i1) { head1 = 
(edit_script_p_t) xmalloc(sizeof(edit_script_t)); head1->op_type = SUBSTITUTE; head1->num = i2-i1; head1->next = NULL; *head = *tail = head1; } else if (j2-j1 == i2-i1+1) { tmp = snake(seq1,seq2,start,i1,i2,j2); if (tmp>i1) { head1 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); head1->op_type = SUBSTITUTE; head1->num = tmp-i1; *head = head1; } head2 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); head2->op_type = INSERT; head2->num = 1; if (*head) head1->next = head2; else *head = head2; *tail = head2; head2->next = NULL; if (i2-tmp) { head1 = head2; *tail = head2 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); head2->op_type = SUBSTITUTE; head2->num = i2-tmp; head2->next = NULL; head1->next = head2; } } else if (j2-j1+1 == i2-i1) { tmp = snake(seq1,seq2,start,i1,i2,j2); if (tmp>i1) { head1 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); head1->op_type = SUBSTITUTE; head1->num = tmp-i1; *head = head1; } head2 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); head2->op_type = DELETE; head2->num = 1; if (*head) head1->next = head2; else *head = head2; *tail = head2; head2->next = NULL; if (i2>tmp+1) { head1 = head2; *tail = head2 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); head2->op_type = SUBSTITUTE; head2->num = i2-tmp-1; head2->next = NULL; head1->next = head2; } } else { fprintf(stderr, "align.c: warning: something wrong when aligning."); } return; } /* Divide the problem at the middle cost */ midc = dist/2; rmidc = dist - midc; /* Compute the boundary diagonals */ start = j1 - i1; lower = max(j1-i2, start-midc); upper = min(j2-i1, start+midc); rstart = j2-i2; rlower = max(j1-i2, rstart-rmidc); rupper = min(j2-i1, rstart+rmidc); /* Allocate space for forward vectors */ last_d = (int *)xmalloc((size_t) (upper-lower+1)*sizeof(int)) - lower; temp_d = (int *)xmalloc((size_t) (upper-lower+1)*sizeof(int)) - lower; for (k=lower; k<=upper; k++) last_d[k] = -1; last_d[start] = snake(seq1,seq2,start,i1,i2,j2); /* Forward computation */ for (c=1; 
c<=midc; ++c) { ll = max(lower,start-c); uu = min(upper,start+c); for (k=ll; k<=uu; ++k) { if (k == ll) { /* DELETE : down from (k+1,c-1) */ row = last_d[k+1]+1; } else if (k == uu) { /* INSERT : right from (k-1,c-1) */ row = last_d[k-1]; } else if ((last_d[k]>=last_d[k+1]) && (last_d[k]+1>=last_d[k-1])) { /* SUBSTITUTE */ row = last_d[k]+1; } else if ((last_d[k+1]+1>=last_d[k-1]) && (last_d[k+1]>=last_d[k])) { /* DELETE */ row = last_d[k+1]+1; } else { /* INSERT */ row = last_d[k-1]; } temp_d[k] = snake(seq1,seq2,k,row,i2,j2); } for (k=ll; k<=uu; ++k) last_d[k] = temp_d[k]; } /* Allocate space for backward vectors */ rlast_d = (int *)xmalloc((size_t) (rupper-rlower+1)*sizeof(int)) - rlower; rtemp_d = (int *)xmalloc((size_t) (rupper-rlower+1)*sizeof(int)) - rlower; for (k=rlower; k<=rupper; k++) rlast_d[k] = i2+1; rlast_d[rstart] = rsnake(seq1,seq2,rstart,i2,i1,j1,M,N); /* Backward computation */ for (c=1; c<=rmidc; ++c) { ll = max(rlower,rstart-c); uu = min(rupper,rstart+c); for (k=ll; k<=uu; ++k) { if (k == ll) { /* INSERT : left from (k+1,c-1) */ row = rlast_d[k+1]; } else if (k == uu) { /* DELETE : up from (k-1,c-1) */ row = rlast_d[k-1]-1; } else if ((rlast_d[k]-1<=rlast_d[k+1]) && (rlast_d[k]-1<=rlast_d[k-1]-1)) { /* SUBSTITUTE */ row = rlast_d[k]-1; } else if ((rlast_d[k-1]-1<=rlast_d[k+1]) && (rlast_d[k-1]-1<=rlast_d[k]-1)) { /* DELETE */ row = rlast_d[k-1]-1; } else { /* INSERT */ row = rlast_d[k+1]; } rtemp_d[k] = rsnake(seq1,seq2,k,row,i1,j1,M,N); } for (k=ll; k<=uu; ++k) rlast_d[k] = rtemp_d[k]; } /* Find (mi, mj) such that the distance from (i1, j1) to (mi, mj) is midc and the distance from (mi, mj) to (i2, j2) is rmidc. 
*/ flag = 0; mi = i1; mj = j1; ll = max(lower,rlower); uu = min(upper,rupper); for (k=ll; k<=uu; ++k) { if (last_d[k]>=rlast_d[k]) { if (last_d[k]-i1>=i2-rlast_d[k]) { mi = last_d[k]; mj = k+mi; } else { mi = rlast_d[k]; mj = k+mi; } flag = 1; break; } } free(last_d+lower); free(rlast_d+rlower); free(temp_d+lower); free(rtemp_d+rlower); if (flag) { /* Find a path from (i1,j1) to (mi,mj) */ align_path(seq1,seq2,i1,j1,mi,mj,midc,&head1,&tail1,M,N); /* Find a path from (mi,mj) to (i2,j2) */ align_path(seq1,seq2,mi,mj,i2,j2,rmidc,&head2,&tail2,M,N); /* Join these two paths together */ if (head1) tail1->next = head2; else head1 = head2; } else { fprintf(stderr, "align.c: warning: something wrong when dividing\n"); head1 = NULL; } *head = head1; if (head2) *tail = tail2; else *tail = tail1; } int align_get_dist(uchar *seq1, uchar *seq2, int i1, int j1, int i2, int j2, int limit) { int *last_d, *temp_d; int goal_diag, ll, uu; int c, k, row; int start, lower, upper; /* Compute the boundary diagonals */ start = j1 - i1; lower = max(j1-i2, start-limit); upper = min(j2-i1, start+limit); goal_diag = j2-i2; if (goal_diag > upper || goal_diag < lower) return -1; /* Allocate space for forward vectors */ last_d = (int *)xmalloc((size_t) (upper-lower+1)*sizeof(int)) - lower; temp_d = (int *)xmalloc((size_t) (upper-lower+1)*sizeof(int)) - lower; /* Initialization */ for (k=lower; k<=upper; ++k) last_d[k] = INT_MIN; last_d[start] = snake(seq1,seq2,start, i1, i2, j2); if (last_d[goal_diag] >= i2) { /* Free working vectors */ free(last_d+lower); free(temp_d+lower); return 0; } for (c=1; c<=limit; ++c) { ll = max(lower,start-c); uu = min(upper, start+c); for (k=ll; k<=uu; ++k) { if (k == ll) row = last_d[k+1]+1; /* DELETE */ else if (k == uu) row = last_d[k-1]; /* INSERT */ else if ((last_d[k]>=last_d[k+1]) && (last_d[k]+1>=last_d[k-1])) row = last_d[k]+1; /*SUBSTITUTE */ else if ((last_d[k+1]+1>=last_d[k-1]) && (last_d[k+1]>=last_d[k])) row = last_d[k+1]+1; /* DELETE */ else row = 
last_d[k-1]; /* INSERT */ temp_d[k] = snake(seq1,seq2,k,row,i2,j2); } for (k=ll; k<=uu; ++k) last_d[k] = temp_d[k]; if (last_d[goal_diag] >= i2) { /* Free working vectors */ free(last_d+lower); free(temp_d+lower); return c; } } /* Ran out of distance limit */ return -1; } /* Condense_both_Ends -- merge contiguous operations of the same type */ /* together; return both new ends of the chain. */ void Condense_both_Ends (edit_script_p_t *head, edit_script_p_t *tail, edit_script_p_t *prev) { edit_script_p_t tp, tp1; tp = *head; *prev = NULL; while (tp != NULL) { while (((tp1 = tp->next) != NULL) && (tp->op_type == tp1->op_type)) { tp->num = tp->num + tp1->num; tp->next = tp1->next; free(tp1); } if (tp->next) *prev = tp; else *tail = tp; tp = tp->next; } } void S2A(edit_script_p_t head, int *S) { edit_script_p_t tp; int *lastS, i; tp = head; lastS = S; while (tp != NULL) { /* printf("tp->op_type=%d, tp->num=%d\n",tp->op_type, tp->num); */ if (tp->op_type == SUBSTITUTE) { for (i=0; inum; ++i) *lastS++ = 0; } else if (tp->op_type == INSERT) { *lastS++ = tp->num; } else if (tp->op_type == CHIMERA) { *lastS++ = INT_MAX; *lastS++ = tp->num; } else { /* DELETE */ *lastS++ = 0 - tp->num; } tp = tp->next; } *(S-1) = (int) (lastS - S); } /* Alignment display routine */ static uchar ALINE[51], BLINE[51], CLINE[51]; static unsigned int get_pos_width(collec_p_t eCol) { unsigned int last = eCol->e.exon[eCol->nb - 1]->to1 + options.dnaOffset; unsigned int w = 1; while ((last = last / 10) > 0) w += 1; if (w < 7) w = 7; return w; } void IDISPLAY(uchar *A, uchar *B, unsigned int M, unsigned int N, int *S, unsigned int AP, unsigned int BP, collec_p_t eCol, int direction) { uchar *a, *b, *c, sign; int op, index, starti, shiftA = 0; unsigned int i, j, lines, ap, bp, pWidth; unsigned int ii = 0; exon_p_t ep; assert(eCol->nb > 0); pWidth = get_pos_width(eCol); /* find the starting exon for this alignment */ while (ii < eCol->nb && ((ep = eCol->e.exon[ii])->from1 != AP || ep->from2 != BP)) ii 
+= 1; if (ii >= eCol->nb) fatal("align.c: Alignment fragment not found.\n"); i = j = lines = 0; op = index = 0; sign = '*'; ap = AP; bp = BP; a = ALINE; b = BLINE; c = CLINE; starti = (ii < eCol->nb - 1) ? (int) ep->to1 + 1 : -1; while (i < M || j < N) { if (op == 0 && *S == 0) { op = *S++; *a = A[++i]; *b = B[++j]; *c++ = (unsigned char) ((*a++ == *b++) ? '|' : ' '); } else { if (op == 0) op = *S++; if (op > 0 && op != INT_MAX) { *a++ = ' '; *b++ = B[++j]; *c++ = '-'; op--; } else { if (op == INT_MAX) { shiftA = *S++; op = -10; } if ((int) (i + AP) == starti) { /* detected intron */ if (ep->type < 0 || direction == 0) sign = '='; else if (direction > 0) sign = '>'; else sign = '<'; ii += 1; ep = (ii < eCol->nb) ? eCol->e.exon[ii] : NULL; starti = (ii < eCol->nb - 1) ? (int) ep->to1 + 1 : -1; if (shiftA != 0) AP = ep->from1 - 3; index = 1; *c++ = sign; *a++ = A[++i]; *b++ = ' '; op++; } else if (!index) { *c++ = '-'; *a++ = A[++i]; *b++ = ' '; op++; } else { /* not the first deletion in the intron */ switch (index) { case 0: case 1: case 2: *a++ = A[++i]; *b++ = ' '; *c++ = sign; op++; index++; break; case 3: case 4: *a++ = '.'; *b++ = ' '; *c++ = '.'; i++; op++; index++; break; case 5: *a++ = '.'; *b++ = ' '; *c++ = '.'; if (shiftA != 0) { A += shiftA; A += i; A -= 8; shiftA = 0; i = 0; M = 0; } else i += (unsigned int) (-op) - 3; op = -3; index++; break; case 6: case 7: *a++ = A[++i]; *b++ = ' '; *c++ = sign; op++; index++; break; case 8: *a++ = A[++i]; *b++ = ' '; *c++ = sign; op++; index = 0; break; } } } } if (a >= ALINE + 50 || (i >= M && j >= N)) { *a = *b = *c = '\0'; printf("\n%*u ", pWidth, 50 * lines++); for (b = ALINE + 10; b <= a; b += 10) printf(" . 
:"); if (b <= a + 5) printf(" ."); printf("\n%*u %s\n%*s %s\n%*u %s\n", pWidth, ap + options.dnaOffset, ALINE, pWidth, " ", CLINE, pWidth, bp, BLINE); ap = AP + i; bp = BP + j; a = ALINE; b = BLINE; c = CLINE; } } } void Free_script(edit_script_p_t head) { edit_script_p_t tp, tp1; tp = head; while (tp != NULL) { tp1 = tp->next; free(tp); tp = tp1; } } static int snake(uchar *seq1, uchar *seq2, int k, int x, int endx, int endy) { int y; if (x<0) return x; y = x+k; while (x M) return x; if (startx < 0 || starty < 0) fprintf(stderr, "TROUBLE!!! startx: %5d, starty: %5d\n",startx, starty); if (x + k > N) fprintf(stderr, "TROUBLE!!! x: %5d, y: %5d\n",x,x+k); y = x + k; while (x > startx && y > starty && seq1[x - 1] == seq2[y - 1]) { x -= 1; y -= 1; } return x; } SIBsim4-0.20/sim4b1.h0000644000551200011300000000114110774266223013300 0ustar chrisludwig/* $Id: sim4b1.h,v 1.22 2008/03/31 22:58:27 c4chris Exp $ * * Christian Iseli, LICR ITO, Christian.Iseli@licr.org * * Copyright (c) 2001-2004 Swiss Institute of Bioinformatics. * Copyright (C) 1998-2001 Liliana Florea. */ #ifndef SIM4B1_H #define SIM4B1_H void free_align(edit_script_list_p_t); void print_exons(collec_p_t, int); void SIM4(hash_env_p_t, seq_p_t, collec_p_t); void init_encoding(void); void init_hash_env(hash_env_p_t, unsigned int, uchar *, unsigned int); void free_hash_env(hash_env_p_t); void bld_table(hash_env_p_t); void init_col(collec_p_t, unsigned int); #endif /* SIM4B1_H */ SIBsim4-0.20/Makefile0000644000551200011300000000201210774015411013455 0ustar chrisludwig# $Id: Makefile,v 1.23 2008/03/30 22:58:17 c4chris Exp $ # # Christian Iseli, LICR ITO, Christian.Iseli@licr.org # # Copyright (c) 2001-2005 Swiss Institute of Bioinformatics. # Copyright (C) 1998-2001 Liliana Florea. DEBUG = # For better performance, replace ``-O'' with whatever # the best optimization flag is for your computer. # For Sun's compilers under Solaris, ``-fast'' works well. # For gcc, ``-O2'' works well. 
OPT = -O # The default CFLAGS is meant for GCC. If you compile for Solaris, you need # to change it to this: # CFLAGS = -Xc CFLAGS = -std=gnu99 -W -Wall -Wconversion -pedantic $(DEBUG) $(OPT) # The default is GCC. On Solaris, you might put: # CC = /opt/SUNWspro/bin/cc CC = gcc # Depending on the compile flags you use, you might need to explicitly use the # math library: # LIBS = -lm LIBS = # There should be no need to change things below... OBJS = sim4b1.o align.o misc.o sim4.init.o sim4: $(OBJS) $(CC) -o SIBsim4 $(CFLAGS) $(OBJS) $(LIBS) clean: rm -f SIBsim4 *.o # DO NOT DELETE SIBsim4-0.20/align.h0000644000551200011300000000141410773315325013273 0ustar chrisludwig/* $Id: align.h,v 1.11 2008/03/29 01:26:45 c4chris Exp $ * * Christian Iseli, LICR ITO, Christian.Iseli@licr.org * * Copyright (c) 2001-2004 Swiss Institute of Bioinformatics. * Copyright (C) 1998-2001 Liliana Florea. */ #ifndef SCRIPTLIB_H #define SCRIPTLIB_H extern void align_path(uchar *,uchar *,int,int,int,int,int,edit_script_p_t*, edit_script_p_t*, int,int); extern int align_get_dist(uchar *,uchar *,int, int, int, int, int); extern void Condense_both_Ends(edit_script_p_t *, edit_script_p_t *, edit_script_p_t*); extern void S2A(edit_script_p_t, int *); extern void IDISPLAY(uchar *, uchar *, unsigned int, unsigned int, int *, unsigned int, unsigned int, collec_p_t, int); extern void Free_script(edit_script_p_t); #endif /* SCRIPTLIB_H */ SIBsim4-0.20/SIBsim4.10000644000551200011300000001150311246731115013317 0ustar chrisludwig.\" $Id: SIBsim4.1,v 1.9 2009/08/31 11:21:49 c4chris Exp $ .\" Christian Iseli, LICR ITO, Christian.Iseli@licr.org .\" .\" Copyright (c) 2004-2006 Swiss Institute of Bioinformatics. 
.\" .TH SIBsim4 1 "April 2007" Bioinformatics "User Manuals" .SH NAME SIBsim4 \- align RNA sequences with a DNA sequence, allowing for introns .SH SYNOPSIS .B SIBsim4 [ .I options .B ] .I dna rna_db .SH DESCRIPTION .B SIBsim4 is a similarity-based tool for aligning a collection of expressed sequences (EST, mRNA) with a genomic DNA sequence. Launching .B SIBsim4 without any arguments will print the options list, along with their default values. .B SIBsim4 employs a blast-based technique to first determine the basic matching blocks representing the "exon cores". In this first stage, it detects all possible exact matches of W-mers (i.e., DNA words of size W) between the two sequences and extends them to maximal scoring gap-free segments. In the second stage, the exon cores are extended into the adjacent as-yet-unmatched fragments using greedy alignment algorithms, and heuristics are used to favor configurations that conform to the splice-site recognition signals (e.g., GT-AG). If necessary, the process is repeated with less stringent parameters on the unmatched fragments. By default, .B SIBsim4 searches both strands and reports the best matches, measured by the number of matching nucleotides found in the alignment. The .B R command line option can be used to restrict the search to one orientation (strand) only. Currently, four major alignment display options are supported, controlled by the .B A option. By default, only the endpoints, overall similarity, and orientation of the introns are reported. An arrow sign ('->' or '<-') indicates the orientation of the intron. The sign `==' marks the absence from the alignment of a cDNA fragment starting at that position. In the description below, the term .B MSP denotes a maximal scoring pair, that is, a pair of highly similar fragments in the two sequences, obtained during the blast-like procedure by extending a W-mer hit by matches and perhaps a few mismatches. 
.SH OPTIONS .IP "-A " output format 0: exon endpoints only 1: alignment text 3: both exon endpoints and alignment text 4: both exon endpoints and alignment text with polyA info Note that 2 is unimplemented. Default value is 0. .IP "-C " MSP score threshold for the second pass. Default value is 12. .IP "-c " minimum score cutoff value. Alignments which have scores below this value are not reported. Default value is 50. .IP "-E " cutoff value. Default value is 3. .IP "-f " score filter in percent. When multiple hits are detected for the same RNA element, only those having a score within this percentage of the maximal score for that RNA element are reported. Setting this value to 0 disables filtering and all hits will be reported, provided their score is above the cutoff value specified through the .B c option. Default value is 75. .IP "-g " join exons when gap on genomic and RNA have lengths which differ at most by this percentage. Default value is 10. .IP "-H " report chimeric transcripts when the best score is lower than this percentage of the overall RNA coverage and the chimera score is greater than this percentage of the RNA length (0 disables this report) Default value is 75. .IP "-I " window width in which to search for intron splicing. Default value is 6. .IP "-K " MSP score threshold for the first pass. Default value is 16. .IP "-L " a comma separated list of forward splice-types. Default value is "GTAG,GCAG,GTAC,ATAC". .IP "-M " scoring splice sites, evaluate match within M nucleotides. Default value is 10. .IP "-o " when printing results, offset nt positions in dna sequence by this amount. Default value is 0. .IP "-q " penalty for a nucleotide mismatch. Default value is -5. .IP "-R " direction of search 0: search the '+' (direct) strand only 1: search the '-' strand only 2: search both strands Default value is 2. .IP "-r " reward for a nucleotide match. Default value is 1. .IP "-S " splice site indels search breadth. 
While determining the best position of a splice site, .B SIBsim4 will evaluate adding at most this number of insertions and deletions on the DNA strand on each side of the splice junction. Default value is 2. .IP "-s " split score in percent. While linking MSP, if two consecutive groups of exons appear like they could be part of two different copies of the same gene, they will be tested to see if the score of each individual group relative to the best overall score is greater than this value. If both groups have a relative score above this threshold they will be split. Default value is 75. .IP "-W " word size. Default value is 12. .IP "-X " value for terminating word extensions. Default value is 12. SIBsim4-0.20/sim4.h0000644000551200011300000000767711246731115013071 0ustar chrisludwig/* $Id: sim4.h,v 1.57 2009/08/31 11:21:49 c4chris Exp $ * * Christian Iseli, LICR ITO, Christian.Iseli@licr.org * * Copyright (c) 2001-2006 Swiss Institute of Bioinformatics. * Copyright (C) 1998-2001 Liliana Florea. */ #ifndef SIM4_H #define SIM4_H #define DIST_CUTOFF 3 #define DEFAULT_GAPPCT 10 #define MIN_INTRON 30 #define MAX_GRINIT 500 #define MATCH_CUTOFF 50 #define DEFAULT_SPLIT_SCORE 75 #define DEFAULT_FILTER 75 #define DEFAULT_W 12 #define DEFAULT_X 12 #define DEFAULT_K 15 #define DEFAULT_C 15 #define DEFAULT_CHIMERA 75 #define DEFAULT_SPLICE_I_D 2 #define P (.2) #define MATCH 1 #define MISMATCH (-5) #define DELETE 1 #define INSERT 2 #define SUBSTITUTE 3 #define CHIMERA 4 #define BUF_SIZE 4096 #define NACHARS 128 #define HASH_SIZE (1UL << 19) #define HASH_MASK (HASH_SIZE - 1) #define min(x, y) ((x > y) ? (y) : (x)) #define max(x, y) ((x < y) ? 
(y) : (x)) /* data structures */ typedef unsigned char uchar; typedef struct _hash_node_t { unsigned int ecode; /* integer encoding of the word */ unsigned int pos; /* positions where word hits query sequence */ } hash_node_t, *hash_node_p_t; typedef struct _hash_env_t { void **hashtab; uchar *seq; unsigned int len; int *next_pos; unsigned int mask; unsigned int W; } hash_env_t, *hash_env_p_t; typedef struct _read_buf_t { char *line; unsigned int lmax; unsigned int lc; unsigned int ic; char in[BUF_SIZE]; } read_buf_t, *read_buf_p_t; typedef struct _seq_t { const char *fName; char *header; unsigned char *seq; read_buf_t rb; int fd; unsigned int len; unsigned int maxHead; unsigned int max; } seq_t, *seq_p_t; typedef struct _exon_t { unsigned int from1, from2, to1, to2; unsigned int score; unsigned int Score; int prev; int direction; unsigned int splScore; unsigned int top : 1; unsigned int bot : 1; int type : 30; } exon_t, *exon_p_t; typedef struct _sim4_stats { unsigned int nmatches; int polyA_cut; int polyT_cut; } sim4_stats_t, *sim4_stats_p_t; /* used only in the alignment stage */ typedef struct _edit_script { struct _edit_script *next; int num; /* Number of operations */ char op_type; /* SUB, INS, or DEL */ } edit_script_t, *edit_script_p_t; typedef struct _edit_script_list { struct _edit_script_list *next_script; edit_script_p_t script; unsigned int offset1, offset2; unsigned int len1, len2; int score; } edit_script_list_t, *edit_script_list_p_t; typedef union _collec_elt_t { void **elt; exon_p_t *exon; struct _result_t **result; } collec_elt_t; typedef struct _collec_t { collec_elt_t e; unsigned int nb; unsigned int size; } collec_t, *collec_p_t; typedef struct _result_t { edit_script_list_p_t sList; collec_t eCol; unsigned int dStart; unsigned int dLen; int direction; int chimera; sim4_stats_t st; } result_t, *result_p_t; typedef struct _junction_t { uchar fwd[4]; uchar rev[4]; } junction_t, *junction_p_t; typedef struct _splice_score_t { unsigned int to1; 
unsigned int to2; unsigned int nFrom1; int type; unsigned int score; unsigned int splScore; int direction; } splice_score_t, *splice_score_p_t; typedef struct _options_t { unsigned char *splice_type_list; junction_p_t splice; unsigned int nbSplice; int ali_flag; unsigned int C; int cutoff; unsigned int gapPct; unsigned int intron_window; unsigned int K; unsigned int scoreSplice_window; int mismatchScore; unsigned int reverse; int matchScore; unsigned int W; unsigned int X; unsigned int dnaOffset; unsigned int filterPct; unsigned int minScore_cutoff; unsigned int splitScorePct; unsigned int huntChimera; unsigned int spliceInDel; } options_t; extern options_t options; extern char *argv0; #endif SIBsim4-0.20/sim4b1.c0000644000551200011300000022135611322656112013275 0ustar chrisludwig/* $Id: sim4b1.c,v 1.138 2010/01/11 17:13:46 c4chris Exp $ * * Christian Iseli, LICR ITO, Christian.Iseli@licr.org * * Copyright (c) 2001-2006,2010 Swiss Institute of Bioinformatics. * Copyright (C) 1998-2001 Liliana Florea. 
*/ #define _GNU_SOURCE 1 #include #include #include #include #include #include #include #include "sim4.h" #include "sim4b1.h" #include "align.h" #include "misc.h" static unsigned int encoding[NACHARS]; static void merge(collec_p_t, collec_p_t, unsigned int, unsigned int); static void slide_intron(result_p_t, uchar *, uchar *); static void compact_exons(collec_p_t, unsigned int); static unsigned int greedy(uchar *, uchar *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, collec_p_t); static int extend_bw(uchar *, uchar *, int, int, int, int, int *, int *, unsigned int); static int extend_fw(uchar *, uchar *, int, int, int, int, int *, int *, unsigned int); static int pluri_align(uchar *, uchar *, unsigned int *, collec_p_t, edit_script_list_p_t *, unsigned int, unsigned int); static exon_p_t new_exon(unsigned int, unsigned int, unsigned int,unsigned int); static void extend_hit(int, int, hash_env_p_t, const uchar * const, unsigned int, unsigned int, collec_p_t, int *); static int msp_compare(const void *, const void *); static int chimera_compare(const void *, const void *); static int msp_rna_compare(const void *, const void *); static void search(hash_env_p_t, uchar *, unsigned int, unsigned int, collec_p_t); static void trim_small_repeated_msps(collec_p_t); static void combine_msps(collec_p_t); static int link_msps(collec_p_t, unsigned int, unsigned int); static int link_chimera(collec_p_t, unsigned int, unsigned int); static void msp2exons(exon_p_t *, int, collec_p_t, int, int); static void exon_cores(hash_env_p_t, uchar *, unsigned int, unsigned int, unsigned int, unsigned int, collec_p_t, collec_p_t, collec_p_t); static int good_ratio(int, unsigned int); static void swap_seqs(collec_p_t); static unsigned int SWscore(uchar *, uchar *, unsigned int); #ifdef DEBUG static void debug_print_exons(collec_p_t, const char *, const unsigned char *, const unsigned char *); #endif static int is_polyAT_exon_p(exon_p_t e, const unsigned char *s) { 
unsigned int cntA = 0; unsigned int cntC = 0; unsigned int cntG = 0; unsigned int cntT = 0; unsigned int cntN = 0; unsigned int i; unsigned int len = e->to2 - e->from2 + 1; for (i = e->from2 - 1; i < e->to2; i++) switch (s[i]) { case 'A': cntA += 1; break; case 'C': cntC += 1; break; case 'G': cntG += 1; break; case 'T': cntT += 1; break; default: cntN += 1; } len -= cntN; if (len < MIN_INTRON) { if ((cntA * 10) / len >= 7 || ((cntA + cntG) * 10) / len >= 8 || (cntT * 10) / len >= 7 || ((cntT + cntC) * 10) / len >= 8) return 1; } else { if ((cntA * 10) / len >= 8 || ((cntA + cntG) * 100) / len >= 95 || (cntT * 10) / len >= 8 || ((cntT + cntC) * 100) / len >= 95) return 1; } return 0; } static void kill_polyA(result_p_t res, const unsigned char *s1, const unsigned char *s2) { unsigned int i; collec_p_t eCol = &res->eCol; /* Stupid initialization below to avoid spurious uninitialized warning * from GCC... */ struct {int score; unsigned int cnt; unsigned int d;} best = best; i = 0; while (i < eCol->nb && is_polyAT_exon_p(eCol->e.exon[i], s2)) i += 1; if (i > 0) { unsigned int j; for (j = 0; j < i; j++) free(eCol->e.exon[j]); memmove(eCol->e.elt, eCol->e.elt + i, (eCol->nb - i) * sizeof(void *)); eCol->nb -= i; } i = 0; while (i < eCol->nb && is_polyAT_exon_p(eCol->e.exon[eCol->nb - i - 1], s2)) i += 1; if (i > 0) { unsigned int j; for (j = eCol->nb - i; j < eCol->nb; j++) free(eCol->e.exon[j]); eCol->nb -= i; } if (eCol->nb > 0) { exon_p_t e = eCol->e.exon[eCol->nb - 1]; unsigned int cntAs1 = 0, cntAs2 = 0, j = 0; int score = 0; const unsigned char *s = s2 + e->to2; best.score = 0; while (*s && best.score - score < 10) { j += 1; switch (*s) { case 'A': cntAs2 += 1; score += 1; if (score > best.score) { best.score = score; best.cnt = cntAs2; best.d = j; } break; case 'N': break; default: score -= 2; } s += 1; } if (best.score > 0 && best.cnt >= 8 && (best.cnt * 10) / best.d >= 8) { s = s1 + e->to1; j = 0; while (*s && j < best.d) { j += 1; if (*s == 'A') cntAs1 += 1; s 
+= 1; } if (j > 0 && (cntAs1 * 10) / j < 8) { res->st.polyA_cut = 1; } } } if (eCol->nb > 0) { exon_p_t e = eCol->e.exon[0]; unsigned int cntTs1 = 0, cntTs2 = 0, j = 0; int score = 0; const unsigned char *s = s2 + e->from2 - 2; best.score = 0; while (s >= s2 && best.score - score < 10) { j += 1; switch (*s) { case 'T': cntTs2 += 1; score += 1; if (score > best.score) { best.score = score; best.cnt = cntTs2; best.d = j; } break; case 'N': break; default: score -= 2; } s -= 1; } if (best.score > 0 && best.cnt >= 8 && (best.cnt * 10) / best.d >= 8) { s = s1 + e->from1 - 2; j = 0; while (s >= s1 && j < best.d) { j += 1; if (*s == 'T') cntTs1 += 1; s -= 1; } if (j > 0 && (cntTs1 * 10) / j < 8) { res->st.polyT_cut = 1; } } } } static void grow_exon_left(exon_p_t e, uchar *s1, uchar *s2) { uchar *p1 = s1 + e->from1 - 2; uchar *p2 = s2 + e->from2 - 2; while (p1 >= s1 && p2 >= s2 && *p1 == *p2) { p1 -= 1; p2 -= 1; e->from1 -= 1; e->from2 -= 1; } } static void grow_exon_right(exon_p_t e, uchar *s1, unsigned int l1, uchar *s2, unsigned int l2) { while (e->to1 < l1 && e->to2 < l2 && s1[e->to1] == s2[e->to2]) { e->to1 += 1; e->to2 += 1; } } /* seq1 = genomic DNA (text); seq2 = cDNA */ void SIM4(hash_env_p_t he, seq_p_t seq2, collec_p_t res) { collec_t mCol; collec_t tem_eCol; int align_status; unsigned int curRes; if (he->len == 0 || seq2->len == 0) return; init_col(&mCol, 5); /* Compute the distance between two sequences A and B */ exon_cores(he, seq2->seq, seq2->len, 1, 1, options.K, &mCol, res, NULL); init_col(&tem_eCol, 0); for (curRes = 0; curRes < res->nb; curRes++) { result_p_t r = res->e.result[curRes]; collec_p_t eCol = &r->eCol; sim4_stats_p_t st = &r->st; #ifdef DEBUG if (r->chimera) fprintf(stderr, "Handling chimera here\n"); debug_print_exons(eCol, "LSIS", he->seq, seq2->seq); #endif /* Chase down polyA tails. 
*/ st->polyA_cut = 0; st->polyT_cut = 0; kill_polyA(r, he->seq, seq2->seq); #ifdef DEBUG debug_print_exons(eCol, "LSIS 2", he->seq, seq2->seq); #endif if (eCol->nb == 0) continue; /* Look at the first exon, and try to extend it backward. */ if (!st->polyT_cut && eCol->e.exon[0]->from2 > 1) { exon_p_t e = eCol->e.exon[0]; unsigned int i = 0; if (e->from2 - 1 > (MIN_INTRON << 1) && e->from1 - 1 > r->dStart) { hash_env_t tem_he; #ifdef DEBUG fprintf(stderr, "Find new exons (head) %d %d\n", e->from1, e->from2); #endif init_hash_env(&tem_he, min(10, he->W), seq2->seq, e->from2 - 1); bld_table(&tem_he); exon_cores(&tem_he, he->seq + r->dStart, e->from1 - r->dStart - 1, 1, r->dStart + 1, options.C, &mCol, NULL, &tem_eCol); free_hash_env(&tem_he); /* Insert new exons (merging if needed), swaping seqs. */ if (tem_eCol.nb > 0) { swap_seqs(&tem_eCol); grow_exon_right(tem_eCol.e.exon[tem_eCol.nb - 1], he->seq, he->len, seq2->seq, seq2->len); merge(eCol, &tem_eCol, 0, he->W); tem_eCol.nb = 0; e = eCol->e.exon[0]; } } while (i < eCol->nb && is_polyAT_exon_p(eCol->e.exon[i], seq2->seq)) i += 1; if (i > 0) { unsigned int j; for (j = 0; j < i; j++) free(eCol->e.exon[j]); memmove(eCol->e.elt, eCol->e.elt + i, (eCol->nb - i) * sizeof(void *)); eCol->nb -= i; if (eCol->nb == 0) continue; e = eCol->e.exon[0]; } if (e->from2 - 1 > 0) { int diff = (int) min(e->from2 - 1, MAX_GRINIT >> 1); int u = min(4 * diff, (int) e->from1 - 1); int I, J, cost; #ifdef DEBUG fprintf(stderr, "extend_bw from %d\n", e->from2); #endif cost = extend_bw(seq2->seq + e->from2 - 1 - diff, he->seq + e->from1 - 1 - u, diff, u, (int) e->from2 - 1 - diff, (int) e->from1 - 1 - u, &I, &J, he->W); #ifdef DEBUG fprintf(stderr, "extend_bw returned %d, I: %d J: %d\n", cost, I, J); #endif if (((int) e->from2 - 1 - I) * options.matchScore + cost * options.mismatchScore >= 0) { e->from2 = (unsigned int) I + 1; e->from1 = (unsigned int) J + 1; } } } /* Look at the last exon, and try to extend it forward. 
*/ if (!st->polyA_cut && eCol->e.exon[eCol->nb - 1]->to2 < seq2->len) { exon_p_t e = eCol->e.exon[eCol->nb - 1]; unsigned int i = 0; if (seq2->len - e->to2 > (MIN_INTRON << 1) && e->to1 < r->dStart + r->dLen) { hash_env_t tem_he; #ifdef DEBUG fprintf(stderr, "Find new exons (tail) %d %d\n", e->to1, e->to2); #endif init_hash_env(&tem_he, min(10, he->W), seq2->seq + e->to2, seq2->len - e->to2); bld_table(&tem_he); exon_cores(&tem_he, he->seq + e->to1, r->dStart + r->dLen - e->to1, e->to2 + 1, e->to1 + 1, options.C, &mCol, NULL, &tem_eCol); free_hash_env(&tem_he); /* Append new exons (merging if needed), swaping seqs. */ if (tem_eCol.nb > 0) { swap_seqs(&tem_eCol); grow_exon_left(tem_eCol.e.exon[0], he->seq, seq2->seq); merge(eCol, &tem_eCol, eCol->nb, he->W); tem_eCol.nb = 0; e = eCol->e.exon[eCol->nb - 1]; } } while (i < eCol->nb && is_polyAT_exon_p(eCol->e.exon[eCol->nb - i - 1], seq2->seq)) i += 1; if (i > 0) { unsigned int j; for (j = eCol->nb - i; j < eCol->nb; j++) free(eCol->e.exon[j]); eCol->nb -= i; if (eCol->nb == 0) continue; e = eCol->e.exon[eCol->nb - 1]; } if (seq2->len - e->to2 > 0) { int diff = (int) min(seq2->len - e->to2, MAX_GRINIT >> 1); int cost, I, J; #ifdef DEBUG fprintf(stderr, "extend_fw from %d (%d)\n", e->to2, diff); #endif cost = extend_fw(seq2->seq + e->to2, he->seq + e->to1, diff, min(4 * diff, (int) (he->len - e->to1)), (int) e->to2, (int) e->to1, &I, &J, he->W); #ifdef DEBUG fprintf(stderr, "extend_fw returned %d, I: %d J: %d\n", cost, I, J); #endif if ((I - (int) e->to2) * options.matchScore + cost * options.mismatchScore >= 0) { e->to2 = (unsigned int) I; e->to1 = (unsigned int) J; } } } /* Proceed in case of several exons. */ if (eCol->nb > 1) { unsigned int i; for (i = 1; i < eCol->nb; i++) { exon_p_t cur = eCol->e.exon[i - 1]; exon_p_t next = eCol->e.exon[i]; int diff = (int) (next->from2) - (int) (cur->to2) - 1; if (diff > 0) { /* bridge the gap (provided there is one...) 
*/ if (next->from1 - 1 > cur->to1) { hash_env_t tem_he; if (diff <= MAX_GRINIT) { unsigned int cost; #ifdef DEBUG fprintf(stderr, "Trying greedy %d %d\n", cur->to2, next->from2); #endif cost = greedy(seq2->seq + cur->to2, he->seq + cur->to1, (unsigned int) diff, next->from1 - cur->to1 - 1, cur->to2, cur->to1, he->W, &tem_eCol); if (tem_eCol.nb > 0 && cost <= max(he->W, P * diff + 1)) { grow_exon_left(tem_eCol.e.exon[0], he->seq, seq2->seq); grow_exon_right(tem_eCol.e.exon[tem_eCol.nb - 1], he->seq, he->len, seq2->seq, seq2->len); merge(eCol, &tem_eCol, i, he->W); tem_eCol.nb = 0; i -= 1; continue; } } #ifdef DEBUG fprintf(stderr, "Find new exons %d %d\n", cur->to1, next->from1); #endif init_hash_env(&tem_he, min(8, he->W), he->seq + cur->to1, next->from1 - cur->to1 - 1); bld_table(&tem_he); exon_cores(&tem_he, seq2->seq + cur->to2, (unsigned int) diff, cur->to1 + 1, cur->to2 + 1, options.C, &mCol, NULL, &tem_eCol); free_hash_env(&tem_he); if (tem_eCol.nb > 0) { grow_exon_left(tem_eCol.e.exon[0], he->seq, seq2->seq); grow_exon_right(tem_eCol.e.exon[tem_eCol.nb - 1], he->seq, he->len, seq2->seq, seq2->len); merge(eCol, &tem_eCol, i, he->W); tem_eCol.nb = 0; i -= 1; } } } } } /* Re-check for polyA. */ kill_polyA(r, he->seq, seq2->seq); /* just printing ... */ #ifdef DEBUG debug_print_exons(eCol, "EXTENSIONS", he->seq, seq2->seq); #endif /* compaction step; note: it resets the right end of the list to */ /* the last item in the block list */ if (!r->chimera) compact_exons(eCol, he->W); /* just printing ... 
*/ #ifdef DEBUG debug_print_exons(eCol, "NORMALIZATION", he->seq, seq2->seq); #endif /* eliminate marginal small blocks at the start of the sequence; */ if (eCol->nb > 0) { unsigned int i = 0; while (i < eCol->nb) { exon_p_t e = eCol->e.exon[i]; if (e->to2 - e->from2 + 1 >= he->W) break; free(e); i += 1; } if (i > 0) { memmove(eCol->e.elt, eCol->e.elt + i, (eCol->nb - i) * sizeof(void *)); eCol->nb -= i; } } /* eliminate marginal small blocks at the end of the sequence */ if (eCol->nb > 0) { int i = (int) (eCol->nb) - 1; while (i >= 0) { exon_p_t e = eCol->e.exon[i]; if (e->to2 - e->from2 + 1 >= he->W) break; free(e); i -= 1; eCol->nb -= 1; } } /* Slide exon boundaries for optimal intron signals */ slide_intron(r, he->seq, seq2->seq); /* */ align_status = pluri_align(he->seq, seq2->seq, &(st->nmatches), eCol, &r->sList, he->len, seq2->len); if (align_status != 0 || !options.ali_flag) { free_align(r->sList); r->sList = NULL; } } free(mCol.e.elt); free(tem_eCol.e.elt); } void init_col(collec_p_t c, unsigned int size) { c->size = size; c->nb = 0; if (size > 0) c->e.elt = (void **) xmalloc(size * sizeof(void *)); else c->e.elt = NULL; } static void add_col_elt(collec_p_t c, void *elt) { if (c->size <= c->nb) { c->size += 5; c->e.elt = (void **) xrealloc(c->e.elt, c->size * sizeof(void *)); } c->e.elt[c->nb++] = elt; } #ifdef DEBUG static void debug_msps(hash_env_p_t he, uchar *s2, collec_p_t mCol, char *title) { unsigned int j; fputs(title, stderr); for (j = 0; j < mCol->nb; ++j) { exon_p_t m = mCol->e.exon[j]; fprintf(stderr, "[%d] %d-%d %d-%d, %d %d\n", j, m->from1, m->to1, m->from2, m->to2, m->score, m->Score); } if (he == NULL) return; for (j = 0; j < mCol->nb; ++j) { exon_p_t m = mCol->e.exon[j]; fprintf(stderr, "%.10s %.*s %.10s\n%.10s %.*s %.10s\n", (m->from1 >= 10) ? he->seq + m->from1 - 10 : he->seq, m->to1 - m->from1 + 1, he->seq + m->from1, he->seq + m->to1 + 1, (m->from2 >= 10) ? 
/*
 * debug_organized_msps - dump a chain of linked MSPs to stderr.
 *
 * Prints `title', then follows the `prev' links starting at index
 * `last_msp' until a negative index is met, printing one line per MSP
 * (index, both coordinate ranges, score and Score).
 */
static void
debug_organized_msps(collec_p_t mCol, int last_msp, char *title)
{
  int idx = last_msp;

  fputs(title, stderr);
  while (idx >= 0) {
    exon_p_t cur = mCol->e.exon[idx];

    fprintf(stderr, "[%d] %d-%d %d-%d, %d %d\n",
	    idx, cur->from1, cur->to1, cur->from2, cur->to2,
	    cur->score, cur->Score);
    idx = cur->prev;
  }
}
*/ for (j = 0; j < mCol->nb; j++) { unsigned int k; exon_p_t m = mCol->e.exon[j]; if (m->from2 < minMPos) minMPos = m->from2; if (m->to2 > maxMPos) maxMPos = m->to2; for (k = m->from2; k <= m->to2; k++) coverage[k] += 1; } for (j = 0; j < len2; j++) { if (coverage[j] > 0) covR += 1; if (coverage[j] > 1) covM += 1; } cov = maxMPos - minMPos + 1; cov = cov / 4; minMPos += cov; if (maxMPos > cov) maxMPos -= cov; for (j = 0; j < mCol->nb; j++) { exon_p_t m = mCol->e.exon[j]; m->bot = m->top = 0; if (m->from2 < minMPos) m->bot = 1; if (m->to2 > maxMPos) m->top = 1; } #ifdef DEBUG fprintf(stderr, "==== top, max: %d\n", maxMPos); for (j = 0; j < mCol->nb; ++j) { exon_p_t m = mCol->e.exon[j]; fprintf(stderr, "[%d] %d-%d %d-%d, %d %d, %d\n", j, m->from1, m->to1, m->from2, m->to2, m->score, m->Score, m->top); } #endif last_msp = link_msps(mCol, 0, mCol->nb); if (last_msp < 0) { free(coverage); return; } minMPos = 0; maxMPos = 0; globScore = mCol->e.exon[last_msp]->Score; minPartScore = globScore * options.splitScorePct / 100; #ifdef DEBUG fprintf(stderr, "global score: %u, minPartScore: %u, length: %u," " covR: %u, covM: %u\n", globScore, minPartScore, len2, covR, covM); #endif if (options.huntChimera > 0 && (covR * options.huntChimera) / 100 > globScore) { /* sort in order of mp->pos2. */ qsort(mCol->e.exon, (size_t) mCol->nb, sizeof(exon_p_t), chimera_compare); #ifdef DEBUG debug_msps(NULL, NULL, mCol, "==== sorted, chimera MSPs\n"); #endif last_msp = link_chimera(mCol, 0, mCol->nb); add_col_elt(res, xcalloc(1UL, sizeof(result_t))); r = res->e.result[res->nb - 1]; r->dStart = 0; r->dLen = he->len; r->chimera = 1; eCol = &r->eCol; #ifdef DEBUG debug_organized_msps(mCol, last_msp, "==== organized chimera MSPs\n"); #endif init_col(eCol, mCol->nb); msp2exons(mCol->e.exon, last_msp, eCol, 0, 1); for (j = 0; j < eCol->nb; j++) { exon_p_t e = eCol->e.exon[j]; e->to1 += offset1; e->from1 += offset1; e->to2 += offset2; e->from2 += offset2; } /* sort in order of mp->pos1. 
*/ qsort(mCol->e.exon, (size_t) mCol->nb, sizeof(exon_p_t), msp_compare); last_msp = link_msps(mCol, 0, mCol->nb); } /* Only look for duplicates when the global score is higher than half the * length of the RNA sequence and when we have enough multiple coverage to * produce multiple high scores. */ if (globScore > (len2 >> 1) && covM >= minPartScore) { unsigned int *coverageL = (unsigned int *) xcalloc((size_t) len2, sizeof(unsigned int)); unsigned int covL = 0; /* Check that both pieces have good scores. */ /* See if we have split points, and if the parts have good scores. */ for (j = 1; j < mCol->nb; j++) { exon_p_t p = mCol->e.exon[j - 1]; exon_p_t m = mCol->e.exon[j]; unsigned int jj; /* Keep track of which parts are covered. */ for (jj = p->from2; jj <= p->to2; jj++) { if (coverageL[jj] == 0) covL += 1; coverageL[jj] += 1; coverage[jj] -= 1; if (coverage[jj] == 0) covR -= 1; } if (covL >= minPartScore && covR >= minPartScore && ((p->top && !m->top) || (!p->bot && m->bot) || (p->top && m->bot))) { /* We have a split. */ int lLast; unsigned int lScore, rScore; tested = 1; lLast = link_msps(mCol, minMPos, j); assert(lLast >= 0); lScore = mCol->e.exon[lLast]->Score; last_msp = link_msps(mCol, j, mCol->nb); assert(last_msp >= 0); rScore = mCol->e.exon[last_msp]->Score; #ifdef DEBUG fprintf(stderr, "glob: %d, l: %d, r: %d, minP: %d, maxP: %d, j: %d\n", minPartScore, lScore, rScore, minMPos, maxMPos, j); #endif if (lScore >= minPartScore && rScore >= minPartScore) { unsigned int k; /* Good split. Store it for processing. 
*/ add_col_elt(res, xcalloc(1UL, sizeof(result_t))); r = res->e.result[res->nb - 1]; r->dStart = maxMPos; r->dLen = m->from1 - maxMPos; eCol = &r->eCol; #ifdef DEBUG fprintf(stderr, "dStart: %u, dLen: %u\n", r->dStart, r->dLen); debug_organized_msps(mCol, lLast, "==== organized MSPs (part)\n"); #endif init_col(eCol, j - minMPos); msp2exons(mCol->e.exon, lLast, eCol, 0, 0); for (k = 0; k < eCol->nb; k++) { exon_p_t e = eCol->e.exon[k]; e->to1 += offset1; e->from1 += offset1; e->to2 += offset2; e->from2 += offset2; } /* Adjust coverage count tracking. */ memset(coverageL, 0, len2 * sizeof(unsigned int)); minMPos = j; maxMPos = mCol->e.exon[lLast]->to1; tested = 0; } } } free(coverageL); } free(coverage); if (tested) last_msp = link_msps(mCol, minMPos, mCol->nb); add_col_elt(res, xcalloc(1UL, sizeof(result_t))); r = res->e.result[res->nb - 1]; r->dStart = maxMPos; r->dLen = he->len - maxMPos; #ifdef DEBUG fprintf(stderr, "dStart: %u, dLen: %u\n", r->dStart, r->dLen); #endif eCol = &r->eCol; } else last_msp = link_msps(mCol, 0, mCol->nb); /* organize Blast hits (MSPs) into exons */ #ifdef DEBUG debug_organized_msps(mCol, last_msp, "==== organized MSPs\n"); #endif if (eCol->size == 0) init_col(eCol, mCol->nb); msp2exons(mCol->e.exon, last_msp, eCol, swapped, 0); for (j = 0; j < eCol->nb; j++) { exon_p_t e = eCol->e.exon[j]; e->to1 += offset1; e->from1 += offset1; e->to2 += offset2; e->from2 += offset2; } mCol->nb = 0; } static inline int lies_after_p(exon_p_t a, exon_p_t b) { /* When we have some overlap, make sure it is only a small part. 
*/ /* ------------------ --------------------- | p1 | p2 | p3 | */ if (b->from1 > a->to1) { unsigned int p1; unsigned int p2; unsigned int p3; if (b->from2 > a->to2) return 1; if (b->from2 < a->from2 || b->to2 < a->to2) return 0; p1 = b->from2 - a->from2; p2 = a->to2 - b->from2; p3 = b->to2 - a->to2; if (p1 > p2 && p3 > p2 && p1 > options.K && p3 > options.K) return 1; } else if (b->from2 > a->to2) { unsigned int p1; unsigned int p2; unsigned int p3; if (b->from1 < a->from1 || b->to1 < a->to1) return 0; p1 = b->from1 - a->from1; p2 = a->to1 - b->from1; p3 = b->to1 - a->to1; if (p1 > p2 && p3 > p2 && p1 > options.K && p3 > options.K) return 1; } return 0; } static inline int lies_after_chimera_p(exon_p_t a, exon_p_t b) { /* When we have some overlap, make sure it is only a small part. */ /* ------------------ --------------------- | p1 | p2 | p3 | */ unsigned int p1; unsigned int p2; unsigned int p3; if (b->from2 > a->to2) return 1; if (b->from2 < a->from2 || b->to2 < a->to2) return 0; p1 = b->from2 - a->from2; p2 = a->to2 - b->from2; p3 = b->to2 - a->to2; if (p1 > p2 && p3 > p2 && p1 > options.K && p3 > options.K) return 1; return 0; } #define SMALL_EXON 50 #define MIN_REPEAT 20 #define JITTER_FACTOR 5 static void trim_small_repeated_msps(collec_p_t mCol) { unsigned int i = 0; while (i < mCol->nb) { exon_p_t m = mCol->e.exon[i]; unsigned int j, k, end; if (m->to2 - m->from2 >= SMALL_EXON) { i += 1; continue; } end = m->to2 + JITTER_FACTOR; j = i + 1; while (j < mCol->nb && mCol->e.exon[j]->to2 <= end) j += 1; if (j - i < MIN_REPEAT) { i += 1; continue; } for (k = i; k < j; k++) free(mCol->e.exon[k]); memmove(mCol->e.exon + i, mCol->e.exon + j, (mCol->nb - j) * sizeof(exon_p_t)); mCol->nb -= (j - i); } } static void combine_msps(collec_p_t mCol) { unsigned int i = 0; while (i < mCol->nb) { exon_p_t m = mCol->e.exon[i]; unsigned int ovl = 0; unsigned int j; for (j = i + 1; j < mCol->nb; j++) { exon_p_t n = mCol->e.exon[j]; unsigned int o = 0; if (n->from2 <= m->to2 + 
1) ovl = m->to2 - n->from2 + 2; if (n->from1 > m->from1 && n->from1 <= m->to1 + 1) o = m->to1 - n->from1 + 2; if ((ovl == 0) == (o == 0) && abs((int) ovl - (int) o) <= 10) break; ovl = 0; } if (ovl != 0) { exon_p_t n = mCol->e.exon[j]; unsigned int nScore = m->score + n->score; if (nScore >= ovl + 1) nScore -= ovl + 1; else nScore = 0; m->from1 = min(m->from1, n->from1); m->from2 = min(m->from2, n->from2); m->to1 = max(m->to1, n->to1); m->to2 = max(m->to2, n->to2); if (nScore > m->score) m->score = nScore; mCol->nb -= 1; free(n); memmove(mCol->e.exon + j, mCol->e.exon + j + 1, (mCol->nb - j) * sizeof(exon_p_t)); } else i += 1; } } static int link_msps(collec_p_t mCol, unsigned int start, unsigned int stop) { struct { unsigned int elt; unsigned int score; } best; unsigned int i; if (start >= stop) return -1; memset(&best, 0, sizeof(best)); for (i = start; i < stop; i++) { exon_p_t m = mCol->e.exon[i]; m->Score = 0; m->prev = -1; } for (i = start; i < stop; i++) { exon_p_t m = mCol->e.exon[i]; unsigned int j; m->Score += m->score; if (m->Score > best.score) { best.score = m->Score; best.elt = i; } for (j = i + 1; j < stop; j++) { exon_p_t n = mCol->e.exon[j]; if (lies_after_p(m, n) && m->Score >= n->Score) { unsigned int penalty; penalty = (unsigned int) abs((int) (n->from1) - (int) (m->from1)) >> 15; penalty += (unsigned int) abs((int) (n->from2) - (int) (m->from2)) >> 15; if (penalty < m->Score) { n->Score = m->Score - penalty; n->prev = (int) i; } } } } return (int) best.elt; } static int link_chimera(collec_p_t mCol, unsigned int start, unsigned int stop) { struct { unsigned int elt; unsigned int score; } best; unsigned int i; if (start >= stop) return -1; memset(&best, 0, sizeof(best)); for (i = start; i < stop; i++) { exon_p_t m = mCol->e.exon[i]; m->Score = 0; m->prev = -1; } for (i = start; i < stop; i++) { exon_p_t m = mCol->e.exon[i]; unsigned int j; m->Score += m->score; if (m->Score > best.score) { best.score = m->Score; best.elt = i; } for (j = i + 1; j 
< stop; j++) { exon_p_t n = mCol->e.exon[j]; if (lies_after_chimera_p(m, n) && m->Score >= n->Score) { unsigned int penalty; penalty = (unsigned int) abs((int) (n->from1) - (int) (m->from1)) >> 15; penalty += (unsigned int) abs((int) (n->from2) - (int) (m->from2)) >> 15; /* Add some little penalty when unordered, to try to favor ordered * when available... */ if (n->from1 < m->from1) penalty += m->score / 10; if (penalty < m->Score) { n->Score = m->Score - penalty; n->prev = (int) i; } } } } return (int) best.elt; } void init_encoding(void) { unsigned int i; for (i = 0; i < NACHARS; i++) encoding[i] = 4; encoding['A'] = 0; encoding['C'] = 1; encoding['G'] = 2; encoding['T'] = 3; } void init_hash_env(hash_env_p_t he, unsigned int W, uchar *seq, unsigned int len) { he->W = W; he->seq = seq; he->len = len; he->mask = (1U << (W + W - 2)) - 1; he->next_pos = (int *) xmalloc((len + 1) * sizeof(int)); he->hashtab = (void **) xcalloc(HASH_SIZE, sizeof(void *)); } #ifndef __GLIBC__ void tdestroy(void *VROOT, void(*FREEFCT)(void *)) { } #endif void free_hash_env(hash_env_p_t he) { unsigned int hval; free(he->next_pos); for (hval = 0; hval < HASH_SIZE; hval++) { tdestroy(he->hashtab[hval], free); } free(he->hashtab); } static int hash_node_compare(const void *a, const void *b) { const hash_node_p_t ha = (hash_node_p_t) a, hb = (hash_node_p_t)b; if (ha->ecode < hb->ecode) return -1; if (ha->ecode > hb->ecode) return 1; return 0; } /* add_word - add a word to the table of critical words */ static inline void add_word(hash_env_p_t he, unsigned int ecode, unsigned int pos) { hash_node_p_t h = (hash_node_p_t) xmalloc(sizeof(hash_node_t)); hash_node_p_t *key; h->ecode = ecode; key = tsearch(h, he->hashtab + (ecode & HASH_MASK), hash_node_compare); assert(key != NULL); if (*key != h) { free(h); he->next_pos[pos] = (int) ((*key)->pos); } else { he->next_pos[pos] = -1; } (*key)->pos = pos; } /* ----------- build table of W-tuples in one of the sequences ------------*/ void 
bld_table(hash_env_p_t he) { unsigned int ecode; unsigned int i = 0; uchar *t; /* skip any word containing an N/X */ t = he->seq; while (i < he->len) { unsigned int j; restart: ecode = 0; for (j = 1; j < he->W && i < he->len; j++) { unsigned int tmp = encoding[*t++]; i += 1; if (tmp > 3) goto restart; ecode = (ecode << 2) + tmp; } while (i < he->len) { unsigned int tmp = encoding[*t++]; i += 1; if (tmp > 3) goto restart; ecode = ((ecode & he->mask) << 2) + tmp; add_word(he, ecode, i); } } } /* ----------------------- search the other sequence ---------------------*/ static void search(hash_env_p_t he, uchar *s2, unsigned int len2, unsigned int K, collec_p_t mCol) { uchar *t; unsigned int i = 0; int *allocated = xcalloc((size_t) (he->len + len2 + 1), sizeof(int)); int *diag_lev = allocated + he->len; t = s2; while (i < len2) { unsigned int j; hash_node_t hn; restart: hn.ecode = 0; for (j = 1; j < he->W && i < len2; j++) { unsigned int tmp = encoding[*t++]; i += 1; if (tmp > 3) goto restart; hn.ecode = (hn.ecode << 2) + tmp; } while (i < len2) { unsigned int tmp = encoding[*t++]; hash_node_p_t *key; i += 1; if (tmp > 3) goto restart; hn.ecode = ((hn.ecode & he->mask) << 2) + tmp; key = tfind(&hn, he->hashtab + (hn.ecode & HASH_MASK), hash_node_compare); if (key != NULL) { int p; for (p = (int) ((*key)->pos); p >= 0; p = he->next_pos[p]) extend_hit(p, (int) i, he, s2, len2, K, mCol, diag_lev); } } } free(allocated); } /* extend_hit - extend a word-sized hit to a longer match */ static void extend_hit(int pos1, int pos2, hash_env_p_t he, const uchar * const s2, unsigned int len2, unsigned int K, collec_p_t mCol, int *diag_lev) { const uchar *beg2, *beg1, *end1, *q, *s; int right_sum, left_sum, sum, diag, score; diag = pos2 - pos1; if (diag_lev[diag] > pos1) return; /* extend to the right */ left_sum = sum = 0; q = he->seq + pos1; s = s2 + pos2; end1 = q; while (s < s2 + len2 && q < he->seq + he->len && sum >= left_sum - (int) options.X) { sum += ((*s++ == *q++) ? 
options.matchScore : options.mismatchScore); if (sum > left_sum) { left_sum = sum; end1 = q; } } /* extend to the left */ right_sum = sum = 0; beg1 = q = (he->seq + pos1) - he->W; beg2 = s = (s2 + pos2) - he->W; while ((s > s2) && (q > he->seq) && sum >= right_sum - (int) options.X) { sum += ((*(--s) == *(--q)) ? options.matchScore : options.mismatchScore); if (sum > right_sum) { right_sum = sum; beg2 = s; beg1 = q; } } score = (int) (he->W) + left_sum + right_sum; if (score >= (int) K) { add_col_elt(mCol, new_exon((unsigned int) (beg1 - he->seq), (unsigned int) (beg2 - s2), (unsigned int) (end1 - he->seq) - 1, (unsigned int) (beg2 - s2) + (unsigned int) (end1 - beg1) - 1)); mCol->e.exon[mCol->nb - 1]->score = (unsigned int) score; } diag_lev[diag] = (int) ((end1 - he->seq) + he->W); } /* ---------------------------- sort the MSPs ----------------------------*/ /* msp_compare - determine ordering relationship between two MSPs */ static int msp_compare(const void *a, const void *b) { exon_p_t ki = * (exon_p_t *) a, kj = * (exon_p_t *) b; if (ki->from1 > kj->from1) return 1; if (ki->from1 < kj->from1) return -1; if (ki->from2 > kj->from2) return 1; if (ki->from2 < kj->from2) return -1; return 0; } /* chimera_compare - determine ordering relationship between two chimera MSPs */ static int chimera_compare(const void *a, const void *b) { exon_p_t ki = * (exon_p_t *) a, kj = * (exon_p_t *) b; if (ki->from2 > kj->from2) return 1; if (ki->from2 < kj->from2) return -1; if (ki->from1 > kj->from1) return 1; if (ki->from1 < kj->from1) return -1; return 0; } /* msp_rna_compare - determine RNA ordering relationship between two MSPs */ static int msp_rna_compare(const void *a, const void *b) { exon_p_t ki = * (exon_p_t *) a, kj = * (exon_p_t *) b; if (ki->from2 > kj->from2) return 1; if (ki->from2 < kj->from2) return -1; if (ki->to2 > kj->to2) return -1; if (ki->to2 < kj->to2) return 1; return 0; } /* --------------------- organize the MSPs into exons ---------------------*/ 
/* msp2exons - follow the chain that starts at msp[last_msp] and continues
 * through each element's `prev` index until that index is negative,
 * appending the elements to eCol, then reverse eCol in place.
 *
 * When `copy` is non-zero every element is duplicated and no merging is
 * done.  Otherwise (and only when `swapped` is zero) an MSP that lies close
 * before the last collected exon is merged into it and freed.  */
static void
msp2exons(exon_p_t *msp, int last_msp, collec_p_t eCol, int swapped, int copy)
{
  while (last_msp >= 0) {
    exon_p_t mp = msp[last_msp];

    if (eCol->nb > 0 && !copy) {
      /* See if we merge with next exon (we go in reverse).  */
      exon_p_t next = eCol->e.exon[eCol->nb - 1];
      if (!swapped
          && next->to1 > mp->to1
          && next->from1 < mp->to1 + MIN_INTRON
          && next->from2 > mp->to2 - 1
          && next->from2 < mp->to2 + MIN_INTRON) {
#ifdef DEBUG
        fprintf(stderr, "Merging %u %u (%u %u) with %u %u (%u %u)\n", mp->from1, mp->to1, mp->from2, mp->to2, next->from1, next->to1, next->from2, next->to2);
#endif
        next->to1 = max(next->to1, mp->to1);
        next->to2 = max(next->to2, mp->to2);
        next->from1 = min(next->from1, mp->from1);
        next->from2 = min(next->from2, mp->from2);
        /* Read the link before releasing the node.  */
        last_msp = mp->prev;
        free(mp);
        continue;
      }
    }
    if (copy) {
      exon_p_t c = xmalloc(sizeof(exon_t));
      memcpy(c, mp, sizeof(exon_t));
      mp = c;
    }
    add_col_elt(eCol, mp);
    last_msp = mp->prev;
  }
  /* Now, need to reverse the exons...  */
  if (eCol->nb > 1) {
    unsigned int i, j;
    for (i = 0, j = eCol->nb - 1; j > i; i++, j--) {
      exon_p_t e = eCol->e.exon[i];
      eCol->e.exon[i] = eCol->e.exon[j];
      eCol->e.exon[j] = e;
    }
  }
}

/* ---------------------- print endpoints of exons --------------------*/

/* print_exons - write one line per exon to stdout.  eCol must not be empty
 * (asserted).  Every exon but the last is followed either by "==" (when
 * direction is 0 or the exon has no splice type) or by an arrow, the two
 * halves of the splice signal and the splice score.  Sequence-1
 * coordinates are shifted by options.dnaOffset.  */
void
print_exons(collec_p_t eCol, int direction)
{
  unsigned int i;
  unsigned int last = eCol->nb - 1;
  exon_p_t cur;

  assert(eCol->nb > 0);
  for (i = 0; i < last; i++) {
    cur = eCol->e.exon[i];
    if (direction == 0 || cur->type < 0)
      printf("%u-%u (%u-%u) %u%% ==\n", cur->from1 + options.dnaOffset, cur->to1 + options.dnaOffset, cur->from2, cur->to2, cur->score);
    else
      printf("%u-%u (%u-%u) %u%% %s (%.2s/%.2s) %u\n", cur->from1 + options.dnaOffset, cur->to1 + options.dnaOffset, cur->from2, cur->to2, cur->score, direction > 0 ? "->" : "<-", options.splice[cur->type].fwd, options.splice[cur->type].fwd + 2, cur->splScore);
  }
  cur = eCol->e.exon[last];
  printf("%u-%u (%u-%u) %u%%\n", cur->from1 + options.dnaOffset, cur->to1 + options.dnaOffset, cur->from2, cur->to2, cur->score);
}

/* pluri_align - build the edit scripts for the exons in eCol, walking them
 * from last to first.  Scripts are chained into *Aligns, each exon's
 * percent score is filled in, and the number of matching positions is
 * stored in *num_matches.  Returns 0 on success, -1 when align_get_dist()
 * reports a negative distance or align_path() yields no script.  */
static int
pluri_align(uchar *seq1, uchar *seq2, unsigned int *num_matches,
            collec_p_t eCol, edit_script_list_p_t *Aligns,
            unsigned int M, unsigned int N)
{
  /* eFake stands in for "the exon after the last one"; its to1 == 0 is
   * what the code below tests to recognise it.  */
  exon_t eFake;
  exon_p_t cur = &eFake;
  int diff, ali_dist;
  unsigned int end1, end2;
  unsigned int nmatches = 0;
  edit_script_p_t head;
  int ii;

  head = NULL;
  *Aligns = NULL;
  ali_dist = 0;
  end1 = M;
  end2 = N;
  eFake.from1 = M + 1;
  eFake.from2 = N + 1;
  eFake.to1 = 0;
  eFake.to2 = 0;
  for (ii = (int) (eCol->nb) - 1; ii >= 0; ii--) {
    exon_p_t prev = eCol->e.exon[ii];
    edit_script_p_t left, right, prevE, tmp_script;
    uchar *a, *b;
    int tmpi, di_count, alen;

    if ((diff = (int) (cur->from2 - prev->to2) - 1) != 0) {
      /* Gap on sequence 2: close the script accumulated so far (unless cur
       * is still the fake sentinel) and start a new one.  */
      if (cur->to1) {
        edit_script_list_p_t enew = (edit_script_list_p_t) xmalloc(sizeof(edit_script_list_t));
        enew->next_script = *Aligns;
        *Aligns = enew;
        enew->script = head;
        enew->offset1 = cur->from1;
        enew->offset2 = cur->from2;
        enew->len1 = end1 - enew->offset1 + 1;
        enew->len2 = end2 - enew->offset2 + 1;
        enew->score = ali_dist;
        ali_dist = 0;
        head = NULL;
      }
      end1 = prev->to1;
      end2 = prev->to2;
    } else if ((diff = (int) (cur->from1 - prev->to1) - 1) != 0 && cur->to1) {
      /* Adjacent on sequence 2 but not on sequence 1: record the jump as
       * one script node (CHIMERA when it goes backwards).  */
      edit_script_p_t new = (edit_script_p_t) xmalloc(sizeof(edit_script_t));
      if (diff < 0)
        new->op_type = CHIMERA;
      else
        new->op_type = DELETE;
      new->num = diff;
      new->next = head;
      head = new;
    } else if (diff)
      end1 = prev->to1;
    diff = align_get_dist(seq1, seq2, (int) (prev->from1) - 1, (int) (prev->from2) - 1, (int) (prev->to1), (int) (prev->to2), max(1000, (int) (.2 * (prev->to2 - prev->from2 + 1))));
    if (diff < 0)
      return -1;
    align_path(seq1, seq2, (int) (prev->from1) - 1, (int) (prev->from2) - 1, (int) (prev->to1), (int) (prev->to2), diff, &left, &right, (int) M, (int) N);
    if (right == NULL)
      return -1;
Condense_both_Ends(&left, &right, &prevE);
    if (!cur->to1 && right->op_type == DELETE) {
      /* remove gaps at end of alignment */
      diff -= 0 + right->num; /* subtract GAP_OPEN = 0 */
      if (right->num > (int) (prev->to1)) {
        fprintf(stderr, "Trouble in DELETE alignment op.\n");
        prev->to1 = 0;
        end1 = 0;
      } else {
        prev->to1 -= (unsigned int) (right->num);
        end1 -= (unsigned int) (right->num);
      }
      if (head && (head->op_type == DELETE))
        head->num += right->num;
      free(right);
      /* NOTE(review): prevE is dereferenced unconditionally; presumably
       * Condense_both_Ends always sets it when the script has more than
       * one node -- confirm against its definition.  */
      prevE->next = NULL;
      right = prevE;
    }
    if (ii == 0 && left && (left->op_type == DELETE)) {
      /* Same trimming for a leading deletion of the first exon.  */
      diff -= 0 + left->num; /* subtract GAP_OPEN = 0 */
      prev->from1 += (unsigned int) left->num;
      tmp_script = left->next;
      if (right == left)
        right = tmp_script;
      free(left);
      left = tmp_script;
    }
    ali_dist += diff;
    a = seq1 + prev->from1 - 1;
    b = seq2 + prev->from2 - 1;
    /* Walk the script: tmpi counts differing columns (gaps and
     * mismatches), di_count counts gap columns only.  */
    tmpi = di_count = 0;
    tmp_script = left;
    while (tmp_script) {
      switch (tmp_script->op_type) {
      case DELETE:
        di_count += tmp_script->num;
        tmpi += tmp_script->num;
        a += tmp_script->num;
        break;
      case INSERT:
        di_count += tmp_script->num;
        tmpi += tmp_script->num;
        b += tmp_script->num;
        break;
      case SUBSTITUTE:
        {
          int j;
          for (j = 0; j < tmp_script->num; ++j, ++a, ++b)
            if (*a != *b)
              tmpi++;
            else
              nmatches++;
          break;
        }
      }
      tmp_script = tmp_script->next;
    }
    /* Alignment length: (len1 + len2 + gap columns) / 2.  */
    alen = (int) (((int) (prev->to1 - prev->from1 + 1 + prev->to2 - prev->from2 + 1) + di_count) / (double) 2);
    prev->score = (unsigned int) (((alen - tmpi) * 100) / alen);
    /* Prepend this exon's script to the one accumulated so far.  */
    right->next = head;
    head = left;
    cur = prev;
  }
  /* at the beginning of the sequences */
  if ((diff = (int) (cur->from2) - 1) != 0 && diff != (int) N) {
    edit_script_list_p_t enew = (edit_script_list_p_t) xmalloc(sizeof(edit_script_list_t));
    enew->next_script = *Aligns;
    *Aligns = enew;
    enew->offset1 = cur->from1;
    enew->offset2 = cur->from2;
    enew->len1 = end1 - enew->offset1 + 1;
    enew->len2 = end2 - enew->offset2 + 1;
    enew->script = head;
    enew->score = ali_dist;
  } else if (diff != (int) N) {
    /* modified to cut introns at the beginning of the sequence */
    edit_script_list_p_t enew = (edit_script_list_p_t) xmalloc(sizeof(edit_script_list_t));
    enew->next_script = *Aligns;
    *Aligns = enew;
    enew->offset1 = cur->from1;
    enew->offset2 = 1;
    enew->len1 = end1 - enew->offset1 + 1;
    enew->len2 = end2 - enew->offset2 + 1;
    enew->script = head;
    enew->score = ali_dist;
  }
  *num_matches = nmatches;
  return 0;
}

/* new_exon - allocate an exon with xmalloc() and set its four coordinates.
 * No other field is initialised here.  */
static exon_p_t
new_exon(unsigned int f1, unsigned int f2, unsigned int t1, unsigned int t2)
{
  exon_p_t e = (exon_p_t) xmalloc(sizeof(exon_t));
  e->from1 = f1;
  e->from2 = f2;
  e->to1 = t1;
  e->to2 = t2;
  return e;
}

/* FIXME: why are s1 and s2 reversed here, wrt SIM4 ??? */
/* greedy - compare s1 (m bytes, rows) with s2 (n bytes, columns) from both
 * ends at once and append up to two exons to eCol: one anchored at the
 * start of the region and one anchored at its end.  offset1/offset2 are
 * added to the row/column coordinates; in new_exon() the column coordinate
 * goes first.
 *
 * Returns 0 without adding anything when n >= 1000000, 0 after adding one
 * exon when a plain diagonal run already consumes all of s1, a value above
 * the edit bound when the search gives up, and back + forth otherwise.  */
static unsigned int
greedy(uchar *s1, uchar *s2, unsigned int m, unsigned int n,
       unsigned int offset1, unsigned int offset2, unsigned int W,
       collec_p_t eCol)
{
  int col,                      /* column number */
      k,                        /* current diagonal */
      blower,flower,            /* boundaries for searching diagonals */
      bupper,fupper,
      row,                      /* row number */
      DELTA,                    /* n-m */
      B_ORIGIN, F_ORIGIN;
  unsigned int d,               /* current distance */
               max_d,           /* bound on size of edit script */
               Cost,
               MAX_D,
               i;
  int back, forth;              /* backward and forward limits at exit */
  int *blast_d, *flast_d,       /* rows containing the last d (at crt step, d-1) */
      *btemp_d, *ftemp_d;       /* rows containing tmp values for the last d */
  int *min_row, *min_diag,      /* min (b)/ max (f) row (and diagonal) */
      *max_row, *max_diag;      /* reached for cost d=0, ... m. */

  /* No point trying to span megabase-sized holes... */
  if (n >= 1000000)
    return 0;
  DELTA = (int) n - (int) m;
  /*max_d = MAX_D = m+1; */
  max_d = MAX_D = max(W, (unsigned int) (P * m + 1));
  if (DELTA < 0) {
    /* s1 is longer than s2.  */
    if (m <= min(W, (1 + P) * n)) {
      add_col_elt(eCol, new_exon(offset2 + 1, offset1 + 1, offset2 + n, offset1 + m));
      return m - n + (unsigned int) (P * n + 1);
    } else {
      return max(W, (unsigned int) (P * m + 1)) + 1;
    }
  }
  /* Diagonal k is stored at index k + ORIGIN; the backward search is
   * shifted by DELTA so that its start diagonal is at B_ORIGIN + DELTA.  */
  F_ORIGIN = (int) MAX_D;
  B_ORIGIN = (int) MAX_D - DELTA;
  /* Distance 0, backward: slide up from the bottom-right corner.  */
  for (row = (int) m, col = (int) n; row > 0 && col > 0 && (s1[row - 1] == s2[col - 1]); row--,col--)
    /*LINTED empty loop body*/;
  if (row == 0) {
    /* hit last row; stop search */
    add_col_elt(eCol, new_exon(offset2 - m + n + 1, offset1 + 1, offset2 + n, offset1 + m));
    return 0;
  }
  blast_d = (int *) xmalloc((MAX_D + n + 1) * sizeof(int));
  btemp_d = (int *) xmalloc((MAX_D + n + 1) * sizeof(int));
  /* m + 1 means "nothing reached" for the backward arrays.  */
  for (i = 0; i <= MAX_D + n; ++i) {
    blast_d[i] = (int) m + 1;
    btemp_d[i] = (int) m + 1;
  }
  blast_d[B_ORIGIN + DELTA] = row;
  blower = B_ORIGIN + DELTA - 1;
  bupper = B_ORIGIN + DELTA + 1;
  /* Distance 0, forward: slide down from the top-left corner.  */
  for (row = 0; (unsigned int) row < n && (unsigned int) row < m && (s1[row] == s2[row]); row++)
    /*LINTED empty loop body*/;
  if ((unsigned int) row == m) {
    /* hit last row; stop search */
    add_col_elt(eCol, new_exon(offset2 + 1, offset1 + 1, offset2 + m, offset1 + m));
    free(blast_d);
    free(btemp_d);
    return 0;
  }
  flast_d = (int *) xmalloc((MAX_D + n + 1) * sizeof(int));
  ftemp_d = (int *) xmalloc((MAX_D + n + 1) * sizeof(int));
  /* -1 means "nothing reached" for the forward arrays.  */
  for (i = 0; i <= MAX_D + n; ++i) {
    flast_d[i] = -1;
    ftemp_d[i] = -1;
  }
  flast_d[F_ORIGIN] = row;
  flower = F_ORIGIN - 1;
  fupper = F_ORIGIN + 1;
  max_row = (int *) xmalloc((MAX_D + 1) * sizeof(int));
  min_row = (int *) xmalloc((MAX_D + 1) * sizeof(int));
  max_diag = (int *) xmalloc((MAX_D + 1) * sizeof(int));
  min_diag = (int *) xmalloc((MAX_D + 1) * sizeof(int));
  for (d = 1; d <= MAX_D; d++) {
    min_row[d] = (int) m + 1;
    max_row[d] = -1;
  }
  min_row[0] = blast_d[B_ORIGIN + DELTA];
  min_diag[0] = B_ORIGIN + DELTA;
  max_row[0] = flast_d[F_ORIGIN];
  max_diag[0] = F_ORIGIN;
  back =
forth = -1;
  d = 1;
  /* Main loop of greedy(): for each distance d, advance the backward
   * search one step, then the forward search one step, and stop as soon as
   * the two frontiers overlap (max_d is lowered to the combined cost).  */
  while (d <= max_d) {
    /* for each relevant diagonal ... */
    for (k = blower; k <= bupper; k++) {
      /* process the next edit instruction */
      /* find a d on diagonal k */
      if (k == -((int) d) + DELTA + B_ORIGIN) {
        /* move left from the last d-1 on diagonal k+1 */
        row = blast_d[k + 1];
        /* INSERT */
      } else if (k == (int) d + DELTA + B_ORIGIN) {
        /* move up from the last d-1 on diagonal k-1 */
        row = blast_d[k - 1] - 1;
        /* DELETE */
      } else if ((blast_d[k] <= blast_d[k + 1]) && (blast_d[k] - 1 <= blast_d[k - 1])) {
        /* substitution */
        row = blast_d[k] - 1;
        /* SUBSTITUTE */
      } else if ((blast_d[k - 1] <= blast_d[k + 1] - 1) && (blast_d[k - 1] <= blast_d[k] - 1)) {
        /* move right from the last d-1 on diagonal k-1 */
        row = blast_d[k - 1] - 1;
        /* DELETE */
      } else {
        /* move left from the last d-1 on diagonal k+1 */
        row = blast_d[k + 1];
        /* INSERT */
      }
      /* code common to the three cases */
      col = row + k - B_ORIGIN;
      /* slide up the diagonal */
      while (row > 0 && col > 0 && (s1[row - 1] == s2[col - 1])) {
        --row;
        --col;
      }
      btemp_d[k] = row;
      /* if (row == 0 || col == 0) max_d = d; */
    } /* for k */
    min_row[d] = btemp_d[DELTA + B_ORIGIN];
    min_diag[d] = DELTA + B_ORIGIN;
    /* Commit the step and remember the smallest row reached at cost d.  */
    for (k = blower; k <= bupper; ++k) {
      blast_d[k] = btemp_d[k];
      btemp_d[k] = (int) m + 1;
      if (blast_d[k] < min_row[d]) {
        min_row[d] = blast_d[k];
        min_diag[d] = k;
      }
    }
    /* record cell, if paths overlap with minimum combined cost */
    /* obs: it suffices to search up to Cost=min(d-1,(max_d-d)) */
    for (Cost = 0; Cost < d; Cost++) {
      if ((min_row[d] <= max_row[Cost]) && ((max_d > d + Cost) || (max_d == d + Cost && (forth < 0)))) {
        max_d = d + Cost;
        back = (int) d;
        forth = (int) Cost;
        break;
      }
    }
    --blower;
    ++bupper;
    /* for each relevant diagonal ... */
    for (k = flower; k <= fupper; k++) {
      /* process the next edit instruction */
      /* find a d on diagonal k */
      if (k == -((int) d) + F_ORIGIN) {
        /* move down from the last d-1 on diagonal k+1 */
        row = flast_d[k + 1] + 1;
        /* DELETE */
      } else if (k == (int) d + F_ORIGIN) {
        /* move right from the last d-1 on diagonal k-1 */
        row = flast_d[k - 1];
        /* INSERT */
      } else if ((flast_d[k] >= flast_d[k + 1]) && (flast_d[k] + 1 >= flast_d[k - 1])) {
        /* substitution */
        row = flast_d[k] + 1;
        /* SUBSTITUTE */
      } else if ((flast_d[k + 1] + 1 >= flast_d[k - 1]) && (flast_d[k + 1] >= flast_d[k])) {
        /* move left from the last d-1 on diagonal k+1 */
        row = flast_d[k + 1] + 1;
        /* DELETE */
      } else {
        /* move right from the last d-1 on diagonal k-1 */
        row = flast_d[k - 1];
        /* INSERT */
      }
      /* code common to the three cases */
      col = row + k - F_ORIGIN;
      /* slide down the diagonal */
      if (row >= 0)
        while ((unsigned int) row < m && (unsigned int) col < n && (s1[row] == s2[col])) {
          ++row;
          ++col;
        }
      ftemp_d[k] = row;
      /* if (row == m || col == n) max_d = d; */
    } /* for k */
    max_row[d] = ftemp_d[F_ORIGIN];
    max_diag[d] = F_ORIGIN;
    /* Commit the step and remember the largest row reached at cost d.  */
    for (k = flower; k <= fupper; ++k) {
      flast_d[k] = ftemp_d[k];
      ftemp_d[k] = -1;
      if (flast_d[k] > max_row[d]) {
        max_row[d] = flast_d[k];
        max_diag[d] = k;
      }
    }
    /* record backward and forward limits, if minimum combined
     * cost in overlapping. Note: it suffices to search up to
     * Cost=min(d,(max_d-d)).
     */
    for (Cost = 0; Cost <= d; Cost++) {
      if ((min_row[Cost] <= max_row[d]) && ((max_d > d + Cost) || (max_d == d + Cost && (forth < 0)))) {
        max_d = d + Cost;
        back = (int) Cost;
        forth = (int) d;
        break;
      }
    }
    --flower;
    ++fupper;
    ++d; /* for d */
  }
  /* No overlap found within the bound: give up, adding nothing.  */
  if (d > MAX_D) {
    free(blast_d);
    free(btemp_d);
    free(flast_d);
    free(ftemp_d);
    free(min_row);
    free(min_diag);
    free(max_row);
    free(max_diag);
    return d;
  }
  /*fin:*/
  {
    /* (p1,p2) ends the leading exon, (q1,q2) precedes the trailing one;
     * both share the same row, taken from whichever search got further.  */
    unsigned int p1, p2, q1, q2;
    if ((int) m - min_row[back] >= max_row[forth]) {
      p1 = (unsigned int) min_row[back];
      p2 = (unsigned int) (min_row[back] + max_diag[forth] - F_ORIGIN);
      q1 = (unsigned int) min_row[back];
      q2 = (unsigned int) (min_row[back] + min_diag[back] - B_ORIGIN);
    } else {
      p1 = (unsigned int) max_row[forth];
      p2 = (unsigned int) (max_row[forth] + max_diag[forth] - F_ORIGIN);
      q1 = (unsigned int) max_row[forth];
      q2 = (unsigned int) (max_row[forth] + min_diag[back] - B_ORIGIN);
    }
    assert(q1 > 0 || p1 < m);
    if (q1 > 0)
      add_col_elt(eCol, new_exon(offset2 + 1, offset1 + 1, offset2 + p2, offset1 + p1));
    if (p1 < m)
      add_col_elt(eCol, new_exon(offset2 + q2 + 1, offset1 + q1 + 1, offset2 + n, offset1 + m));
  }
  free(blast_d);
  free(btemp_d);
  free(flast_d);
  free(ftemp_d);
  free(min_row);
  free(min_diag);
  free(max_row);
  free(max_diag);
  assert(back + forth >= 0);
  return (unsigned int) (back + forth);
}

/* about_same_gap_p - non-zero when the gap between to1 and nFrom1 and the
 * gap between to2 and nFrom2 have about the same size: they differ by less
 * than MIN_INTRON, or by at most options.gapPct percent of the larger one.
 * Returns 0 when either pair overlaps or is out of order.  */
static int
about_same_gap_p(unsigned int to1, unsigned int nFrom1,
                 unsigned int to2, unsigned int nFrom2)
{
  unsigned int g1, g2, d;

  if (nFrom1 <= to1 || nFrom2 <= to2)
    return 0;
  g1 = nFrom1 - to1 - 1;
  g2 = nFrom2 - to2 - 1;
  /* Make g1 the larger gap.  */
  if (g2 > g1) {
    unsigned int tem = g1;
    g1 = g2;
    g2 = tem;
  }
  d = g1 - g2;
  if (d < MIN_INTRON || (d * 100) / g1 <= options.gapPct)
    return 1;
  return 0;
}

/* operates on a list sorted in increasing order of exon coordinates */
/* compact_exons - first drop one of any two consecutive exons whose from2
 * values are within options.intron_window of each other, then merge
 * consecutive exons that are close on both sequences or separated by gaps
 * of about the same size.  Removed exons are freed and eCol is compacted
 * in place.  */
static void
compact_exons(collec_p_t eCol, unsigned int W)
{
  unsigned int i = 1;

  /* Kill stupid overlapping exons. */
  while (i < eCol->nb) {
    exon_p_t cur = eCol->e.exon[i - 1];
    exon_p_t next = eCol->e.exon[i];
    unsigned int diff = next->from2 - cur->from2;
    if (diff <= options.intron_window) {
      eCol->nb -= 1;
      if (cur->to2 > next->to2) {
        /* Keep cur, drop next; i is not advanced so cur is re-examined
         * against its new neighbour.  */
        free(next);
        memmove(eCol->e.exon + i, eCol->e.exon + i + 1, (eCol->nb - i) * sizeof(exon_p_t));
        if (i < eCol->nb) {
          next = eCol->e.exon[i];
          cur->to1 += diff;
          cur->to2 += diff;
          next->from1 -= diff;
          next->from2 -= diff;
        }
      } else {
        /* Keep next, drop cur; nb was already decremented, hence the + 1
         * in the element count.  */
        free(cur);
        memmove(eCol->e.exon + i - 1, eCol->e.exon + i, (eCol->nb - i + 1) * sizeof(exon_p_t));
        if (i > 1) {
          cur = eCol->e.exon[i - 2];
          cur->to1 += diff;
          cur->to2 += diff;
          next->from1 -= diff;
          next->from2 -= diff;
        }
      }
    } else
      i += 1;
  }
  for (i = 1; i < eCol->nb; i++) {
    exon_p_t cur = eCol->e.exon[i - 1];
    exon_p_t next = eCol->e.exon[i];
    if ((next->from1 < cur->to1 + 1 + MIN_INTRON && next->from2 <= cur->to2 + 1 + W) || about_same_gap_p(cur->to1, next->from1, cur->to2, next->from2)) {
      /* merge blocks cur and next */
      cur->to1 = next->to1;
      cur->to2 = next->to2;
      free(next);
      eCol->nb -= 1;
      memmove(eCol->e.elt + i, eCol->e.elt + i + 1, (eCol->nb - i) * sizeof(void *));
      /* Stay on the same index after the loop increment.  */
      i -= 1;
    }
  }
}

/* good_ratio - edit budget allowed for an extension of length l (asserted
 * non-negative): 2 up to W/2, options.cutoff below 2*W, and
 * .75*P*length+1 beyond that.  */
static int
good_ratio(int l, unsigned int W)
{
  unsigned int length = (unsigned int) l;
  assert(l >= 0);
  if (length<=W/2)
    return 2;
  else if (length<2*W)
    return options.cutoff;
  else
    return (int)(.75*P*length+1);
}

/* extend_bw - extend an alignment backwards from the bottom-right corner
 * (m, n) of s1 x s2 for as long as the edit count stays within
 * good_ratio().  The stopping cell, shifted by offset1/offset2, is stored
 * in *line1 / *line2; the return value is the edit distance used.  */
static int
extend_bw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2,
          int *line1, int *line2, unsigned int W)
{
  int col,                      /* column number */
      row,                      /* row number */
      max_d,                    /* bound on the length of the edit script */
      d,                        /* current compressed distance */
      k,                        /* current diagonal */
      DELTA,                    /* n-m */
      ORIGIN,
      lower,
      upper;
  int *last_d, *temp_d;         /* column containing the last p */
  int *min_row, *min_diag;      /* min (b)/ max (f) row (and diagonal) */
                                /* reached for cost d=0, ... m. */

  DELTA = n-m;
  max_d = m+1;
  ORIGIN = m;
  for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--)
    /*LINTED empty loop body*/;
  if ((row == 0) || (col == 0)) {
    *line1 = row+offset1;
    *line2 = col+offset2;
    return 0;
  }
  last_d = (int *)xmalloc((size_t) (m+n+1) * sizeof(int));
  temp_d = (int *)xmalloc((size_t) (m+n+1) * sizeof(int));
  for (k=0; k<=m+n; ++k)
    last_d[k]=m+1;
  last_d[ORIGIN+DELTA] = row;
  lower = ORIGIN + DELTA - 1;
  upper = ORIGIN + DELTA + 1;
  /* NOTE(review): min_row/min_diag hold m+1 entries while max_d is m+1;
   * presumably the good_ratio() test ends the loop before d reaches m+1 --
   * confirm.  */
  min_row = (int *)xmalloc((size_t) (m+1) * sizeof(int));
  min_diag = (int *)xmalloc((size_t) (m+1) * sizeof(int));
  for (d=1; d<=m; d++)
    min_row[d] = m+1;
  min_row[0] = last_d[ORIGIN+DELTA];
  min_diag[0] = ORIGIN + DELTA;
  d = 0;
  /* Continue while either of the last two distances is still within the
   * budget for the length covered so far.  */
  while ((++d<=max_d) && ((d-1<=good_ratio(m-min_row[d-1], W)) || ((d>=2) && (d-2<=good_ratio(m-min_row[d-2], W))))) {
    /* for each relevant diagonal ... */
    for (k = lower; k <= upper; k++) {
      /* find a d on diagonal k */
      if (k==-d+DELTA+ORIGIN) {
        /* move down from the last d-1 on diagonal k+1 */
        row = last_d[k+1];
        /* op = INSERT; */
      } else if (k==d+DELTA+ORIGIN) {
        /* move right from the last d-1 on diagonal k-1 */
        row = last_d[k-1]-1;
        /* op = DELETE; */
      } else if ((last_d[k]-1<=last_d[k+1]) && (last_d[k]-1<=last_d[k-1]-1)) {
        /* substitution */
        row = last_d[k]-1;
        /* op = SUBSTITUTE; */
      } else if ((last_d[k-1]-1<=last_d[k+1]) && (last_d[k-1]-1<=last_d[k]-1)) {
        /* move right from the last d-1 on diagonal k-1 */
        row = last_d[k-1]-1;
        /* op = DELETE; */
      } else {
        /* move left from the last d-1 on diagonal k+1 */
        row = last_d[k+1];
        /* op = INSERT; */
      }
      /* code common to the three cases */
      /* slide down the diagonal */
      col = row+k-ORIGIN;
      while ((row > 0) && (col > 0) && (s1[row-1]==s2[col-1])) {
        row--;
        col--;
      }
      temp_d[k] = row;
      if ((row == 0) && (col == 0)) {
        /* hit southeast corner; have the answer */
        free(last_d);
        free(temp_d);
        free(min_row);
        free(min_diag);
        *line1 = row+offset1;
        *line2 = col+offset2;
        return d;
      }
      if (row == 0) {
        /* hit first row; don't look further */
        free(last_d);
        free(temp_d);
free(min_row);
        free(min_diag);
        *line1 = row+offset1;
        *line2 = col+offset2;
        return d;
      }
      if (col == 0) {
        /* hit last column; don't look further */
        free(last_d);
        free(temp_d);
        free(min_row);
        free(min_diag);
        *line1 = row+offset1;
        *line2 = col+offset2;
        return d;
      }
    }
    min_row[d] = last_d[ORIGIN+DELTA];
    min_diag[d] = ORIGIN+DELTA;
    /* NOTE(review): the next statement is damaged in this copy.  The text
     * between "temp_d[k]" and "0)" is missing (it looks like everything
     * from a '<' up to the next '>' was dropped), so the end of the
     * distance loop of extend_bw is not visible here.  Restore it from the
     * upstream sources before compiling.  */
    for (k=lower; k<=upper; ++k)
      if (temp_d[k]0) && (min_row[d-1]-min_row[d]<3))
    d--;
  *line1 = min_row[d]+offset1;
  *line2 = min_row[d]+min_diag[d]-ORIGIN+offset2;
  free(min_row);
  free(min_diag);
  free(last_d);
  free(temp_d);
  return d;
}

/* extend_fw - forward counterpart of extend_bw: extend from the top-left
 * corner of s1 x s2 while the edit count stays within good_ratio().  The
 * stopping cell, shifted by offset1/offset2, is stored in *line1 / *line2;
 * the return value is the edit distance used.  */
static int
extend_fw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2,
          int *line1, int *line2, unsigned int W)
{
  int col,                      /* column number */
      row,                      /* row number */
      max_d,                    /* bound on the length of the edit script */
      d,                        /* current compressed distance */
      k,                        /* current diagonal */
      ORIGIN,
      lower,
      upper;
  int *last_d, *temp_d;         /* column containing the last p */
  int *max_row, *max_diag;      /* min (b)/ max (f) row (and diagonal) */
                                /* reached for cost d=0, ... m. */

  max_d = m+1;
  ORIGIN = m;
  /* NOTE(review): damaged in this copy.  Everything between "col" and
   * "=2)" is missing, which presumably covered the initial diagonal run,
   * the allocations and the head of the distance loop.  Restore it from
   * the upstream sources before compiling.  */
  for (row=0, col=0; col=2) && (d-2<=good_ratio(max_row[d-2], W))))) {
    /* for each relevant diagonal ... */
    for (k = lower; k <= upper; k++) {
      /* find a d on diagonal k */
      if (k==-d+ORIGIN) {
        /* move down from the last d-1 on diagonal k+1 */
        row = last_d[k+1]+1;
        /* op = DELETE; */
      } else if (k==d+ORIGIN) {
        /* move right from the last d-1 on diagonal k-1 */
        row = last_d[k-1];
        /* op = INSERT; */
      } else if ((last_d[k]>=last_d[k+1]) && (last_d[k]+1>=last_d[k-1])) {
        /* substitution */
        row = last_d[k]+1;
        /* op = SUBSTITUTE; */
      } else if ((last_d[k+1]+1>=last_d[k-1]) && (last_d[k+1]>=last_d[k])) {
        /* move down from the last d-1 on diagonal k+1 */
        row = last_d[k+1]+1;
        /* op = DELETE; */
      } else {
        /* move right from the last d-1 on diagonal k-1 */
        row = last_d[k-1];
        /* op = INSERT; */
      }
      /* code common to the three cases */
      /* slide down the diagonal */
      col = row+k-ORIGIN;
      if (row>=0)
        while ((row < m) && (col < n) && (s1[row]==s2[col])) {
          row++;
          col++;
        }
      temp_d[k] = row;
      if ((row == m) && (col == n)) {
        /* hit southeast corner; have the answer */
        free(last_d);
        free(temp_d);
        free(max_row);
        free(max_diag);
        *line1 = row+offset1;
        *line2 = col+offset2;
        return d;
      }
      if (row == m) {
        /* hit last row; don't look further */
        free(temp_d);
        free(last_d);
        free(max_row);
        free(max_diag);
        *line1 = row+offset1;
        *line2 = col+offset2;
        return d;
      }
      if (col == n) {
        /* hit last column; don't look further */
        free(temp_d);
        free(last_d);
        free(max_row);
        free(max_diag);
        *line1 = row+offset1;
        *line2 = col+offset2;
        return d;
      }
    }
    max_row[d] = last_d[ORIGIN];
    max_diag[d] = ORIGIN;
    for (k=lower; k<=upper; ++k)
      if (temp_d[k]>max_row[d]) {
        max_row[d] = temp_d[k];
        max_diag[d] = k;
      }
    for (k=lower; k<=upper; k++) {
      last_d[k] = temp_d[k];
    }
    --lower;
    ++upper;
  }
  /* report here the previous maximal match, stored in max_diag and max_row */
  /* Back off over trailing steps that gained fewer than 3 rows.  */
  while ((d>0) && (max_row[d]-max_row[d-1]<3))
    d--;
  *line1 = max_row[d]+offset1;
  *line2 = max_row[d]+max_diag[d]-ORIGIN+offset2;
  free(max_row);
  free(max_diag);
  free(last_d);
  free(temp_d);
  return d;
  /* if ((d>2) && (max_row[d-1]-max_row[d-2]<3)) { *line1 = max_row[d-2]+offset1; *line2 = max_row[d-2]+max_diag[d-2]-ORIGIN+offset2; free(max_row); free(max_diag); free(last_d); free(temp_d); return d-2; } *line1 = max_row[d-1]+offset1; *line2 = max_row[d-1]+max_diag[d-1]-ORIGIN+offset2; free(max_row); free(max_diag); free(last_d); free(temp_d); return d-1; */
}

/* swap_seqs - exchange the sequence-1 and sequence-2 coordinates of every
 * exon in eCol.  */
static void
swap_seqs(collec_p_t eCol)
{
  unsigned int i;

  for (i = 0; i < eCol->nb; i++) {
    exon_p_t e = eCol->e.exon[i];
    unsigned int tem = e->from1;
    e->from1 = e->from2;
    e->from2 = tem;
    tem = e->to1;
    e->to1 = e->to2;
    e->to2 = tem;
  }
}

/* merge - insert the exons of aCol into eCol at index pos (the pointers
 * are copied, aCol itself is left untouched), then tidy up the affected
 * range: exons swallowed by a neighbour are freed, and neighbours that are
 * close on both sequences are fused.  */
static void
merge(collec_p_t eCol, collec_p_t aCol, unsigned int pos, unsigned int W)
{
  unsigned int last = pos + aCol->nb;
  unsigned int i;

  if (aCol->nb == 0)
    return;
  /* Make enough room. */
  if (eCol->nb + aCol->nb > eCol->size) {
    eCol->size = eCol->nb + aCol->nb;
    eCol->e.elt = (void **) xrealloc(eCol->e.elt, eCol->size * sizeof(void *));
  }
  /* Insert the new exons. */
  memmove(eCol->e.elt + last, eCol->e.elt + pos, (eCol->nb - pos) * sizeof(void *));
  memcpy(eCol->e.elt + pos, aCol->e.elt, aCol->nb * sizeof(void *));
  eCol->nb += aCol->nb;
  /* Widen the checked range by one on each side where a neighbour
   * exists.  */
  if (last < eCol->nb)
    last += 1;
  if (pos == 0)
    pos += 1;
  for (i = pos; i < last; i++) {
    exon_p_t cur = eCol->e.exon[i - 1];
    exon_p_t next = eCol->e.exon[i];
    /* Check for new exons that might have gobbled up existing ones. */
    if (next->from2 <= cur->from2) {
      free(cur);
      memmove(eCol->e.elt + i - 1, eCol->e.elt + i, (eCol->nb - i) * sizeof(void *));
      eCol->nb -= 1;
      last -= 1;
      i -= 1;
      continue;
    }
    if (cur->to2 >= next->to2) {
      free(next);
      eCol->nb -= 1;
      memmove(eCol->e.elt + i, eCol->e.elt + i + 1, (eCol->nb - i) * sizeof(void *));
      last -= 1;
      i -= 1;
      continue;
    }
    if (next->from1 < cur->to1 + 1 + MIN_INTRON && next->from2 <= cur->to2 + 1 + W) {
      /* merge blocks cur and next */
      cur->from1 = min(cur->from1, next->from1);
      cur->from2 = min(cur->from2, next->from2);
      cur->to1 = max(next->to1, cur->to1);
      cur->to2 = max(next->to2, cur->to2);
      free(next);
      eCol->nb -= 1;
      memmove(eCol->e.elt + i, eCol->e.elt + i + 1, (eCol->nb - i) * sizeof(void *));
      last -= 1;
      i -= 1;
    }
  }
}

/* free_align - release a list of edit scripts: each node's script goes to
 * Free_script(), then the node itself is freed.  A NULL list is fine.  */
void
free_align(edit_script_list_p_t aligns)
{
  edit_script_list_p_t head;

  head = aligns;
  while ((head=aligns)!=NULL) {
    aligns = aligns->next_script;
    Free_script(head->script);
    free(head);
  }
}

#ifdef DEBUG
/* debug_print_exons - dump the exon coordinates to stderr under `label`,
 * then, for each exon longer than one base on sequence 1, the matching
 * stretches of s1 and s2 with up to 10 characters of context.  */
static void
debug_print_exons(collec_p_t eCol, const char *label,
                  const unsigned char *s1, const unsigned char *s2)
{
  unsigned int i;

  fprintf(stderr, "\n====================%s:\n\n", label);
  for (i = 0; i < eCol->nb; i++) {
    exon_p_t e = eCol->e.exon[i];
    fprintf(stderr, " [ %u, %u, %u, %u ]\n", e->from1, e->from2, e->to1, e->to2);
  }
  for (i = 0; i < eCol->nb; i++) {
    exon_p_t e = eCol->e.exon[i];
    int len1 = (int) e->to1 - (int) e->from1 + 1;
    int len2 = (int) e->to2 - (int) e->from2 + 1;
    if (len1 > 1) {
      fprintf(stderr, "%.10s %.*s %.10s\n%.10s %.*s %.10s\n", (e->from1 > 10) ? s1 + e->from1 - 11 : s1 + e->from1 - 1, len1, s1 + e->from1 - 1, s1 + e->to1, (e->from2 > 10) ?
s2 + e->from2 - 11 : s2 + e->from2 - 1, len2, s2 + e->from2 - 1, s2 + e->to2);
      /* Flag exons that could have been extended by one more match.  */
      if (e->from1 > 1 && e->from2 > 1 && s1[e->from1 - 2] == s2[e->from2 - 2])
        fprintf(stderr, "WARNING: further left match: %c\n", s1[e->from1 - 2]);
      if (s1[e->to1] != 0 && s1[e->to1] == s2[e->to2])
        fprintf(stderr, "WARNING: further right match: %c\n", s1[e->to1]);
    }
  }
}
#endif

/* perfect_spl_p - test whether the junction described by splS (to1, to2,
 * nFrom1) is a perfect splice: the options.scoreSplice_window bases on
 * each side must score a full match under SWscore(), and the two bases
 * after to1 plus the two bases before nFrom1 on seq1 must equal one of the
 * configured signals.  On success splS->type and splS->direction (1 for
 * .fwd, -1 for .rev) are set and 1 is returned; otherwise 0.  */
static int
perfect_spl_p(uchar *seq1, uchar *seq2, splice_score_p_t splS)
{
  unsigned int score, j;
  uchar splice[4];

  /* Left flank: the window ending at to1 / to2.  */
  score = SWscore(seq1 + splS->to1 - options.scoreSplice_window, seq2 + splS->to2 - options.scoreSplice_window, options.scoreSplice_window);
  if (score < options.scoreSplice_window)
    return 0;
  /* Right flank: the window starting at nFrom1 / just after to2.  */
  score = SWscore(seq1 + splS->nFrom1 - 1, seq2 + splS->to2, options.scoreSplice_window);
  if (score < options.scoreSplice_window)
    return 0;
  memcpy(splice, seq1 + splS->to1, 2UL);
  memcpy(splice + 2, seq1 + splS->nFrom1 - 3, 2UL);
  for (j = 0; j < options.nbSplice; j++) {
    if (memcmp(splice, options.splice[j].fwd, 4UL) == 0) {
      splS->type = (int) j;
      splS->direction = 1;
      return 1;
    }
    if (memcmp(splice, options.splice[j].rev, 4UL) == 0) {
      splS->type = (int) j;
      splS->direction = -1;
      return 1;
    }
  }
  return 0;
}

/* splice_score_compare - order two splice_score_t records (a and b point
 * directly at the structs).  The main key is score + splScore, with one
 * bonus point for the record whose type value is smaller; ties are broken
 * by splScore, then by smaller type ranking higher.  Returns -1, 0 or 1.  */
static int
splice_score_compare(const void *a, const void *b)
{
  const splice_score_p_t sa = (splice_score_p_t) a;
  const splice_score_p_t sb = (splice_score_p_t) b;
  unsigned int adj_score_a = sa->score + sa->splScore;
  unsigned int adj_score_b = sb->score + sb->splScore;

  adj_score_a += (sa->type < sb->type) ? 1 : 0;
  adj_score_b += (sb->type < sa->type) ? 1 : 0;
  if (adj_score_a < adj_score_b) return -1;
  if (adj_score_a > adj_score_b) return 1;
  if (sa->splScore < sb->splScore) return -1;
  if (sa->splScore > sb->splScore) return 1;
  if (sa->type > sb->type) return -1;
  if (sa->type < sb->type) return 1;
  return 0;
}

/* compute_max_score_1 - try one splice signal `jct` (4 bases, index `type`,
 * orientation `dir`) around the junction to1/to2/nFrom1.  For every shift j
 * within options.intron_window the buffer `s` is filled with
 * [seq2 left window][jct][seq2 right window]; the best donor and acceptor
 * positions on seq1 within options.spliceInDel are then scored
 * independently.  *splS is overwritten whenever the combined result beats
 * it under splice_score_compare().  `s` must hold at least
 * 2 * options.scoreSplice_window + 4 bytes.  */
static void
compute_max_score_1(uchar *seq1, uchar *seq2, splice_score_p_t splS, int type,
                    unsigned int to1, unsigned int to2, unsigned int nFrom1,
                    uchar *s, uchar *jct, int dir)
{
  int j;

  memcpy(s + options.scoreSplice_window, jct, 4UL);
  for (j = - (int) options.intron_window; j <= (int) options.intron_window; j++) {
    splice_score_t curL, curR;
    int i;
    curL.type = curR.type = type;
    curL.splScore = curR.splScore = 0;
    curL.score = curR.score = 0;
    /* NOTE(review): curL.to1 and curR.nFrom1 are only assigned when some
     * candidate compares strictly greater than this all-zero start.  */
    memcpy(s, seq2 + to2 - options.scoreSplice_window + j, (size_t) options.scoreSplice_window);
    memcpy(s + options.scoreSplice_window + 4, seq2 + to2 + j, (size_t) options.scoreSplice_window);
    for (i = - (int) options.spliceInDel; i <= (int) options.spliceInDel; i++) {
      splice_score_t cur;
      cur.type = type;
      /* Donor side: left window plus the first two signal bases.  */
      cur.splScore = 0;
      if (seq1[(int) to1 + j + i] == jct[0])
        cur.splScore += 1;
      if (seq1[(int) to1 + j + i + 1] == jct[1])
        cur.splScore += 1;
      cur.score = SWscore(seq1 + to1 - options.scoreSplice_window + j + i, s, options.scoreSplice_window + 2);
#ifdef DEBUG
      fprintf(stderr, "%.*s %.2s\n%.*s %.2s\nL: %d %d\n", options.scoreSplice_window, seq1 + to1 - options.scoreSplice_window + j + i, seq1 + to1 + j + i, options.scoreSplice_window, s, s + options.scoreSplice_window, cur.score, cur.splScore);
#endif
      if (splice_score_compare(&cur, &curL) > 0) {
        curL.score = cur.score;
        curL.splScore = cur.splScore;
        curL.to1 = (unsigned int) ((int) to1 + j + i);
      }
      /* Acceptor side: last two signal bases plus the right window.  */
      cur.splScore = 0;
      if (seq1[(int) nFrom1 - 3 + j + i] == jct[2])
        cur.splScore += 1;
      if (seq1[(int) nFrom1 - 2 + j + i] == jct[3])
        cur.splScore += 1;
      cur.score = SWscore(seq1 + nFrom1 - 3 + j + i, s + options.scoreSplice_window + 2, options.scoreSplice_window + 2);
#ifdef DEBUG
      fprintf(stderr, "%.2s %.*s\n%.2s %.*s\nR: %d %d\n\n", seq1 +
nFrom1 - 3 + j + i, options.scoreSplice_window, seq1 + nFrom1 - 1 + j + i, s + options.scoreSplice_window + 2, options.scoreSplice_window, s + options.scoreSplice_window + 4, cur.score, cur.splScore);
#endif
      if (splice_score_compare(&cur, &curR) > 0) {
        curR.score = cur.score;
        curR.splScore = cur.splScore;
        curR.nFrom1 = (unsigned int) ((int) nFrom1 + j + i);
      }
    }
#ifdef DEBUG
    fprintf(stderr, "Best is %d %d %d %d\n", curL.score, curL.splScore, curR.score, curR.splScore);
#endif
    /* Combine both sides and keep the result if it beats *splS.  */
    curL.score += curR.score;
    curL.splScore += curR.splScore;
    if (splice_score_compare(&curL, splS) > 0) {
      splS->score = curL.score;
      splS->splScore = curL.splScore;
      splS->to1 = curL.to1;
      splS->to2 = (unsigned int) ((int) to2 + j);
      splS->nFrom1 = curR.nFrom1;
      splS->type = type;
      splS->direction = dir;
    }
#ifdef DEBUG
    fprintf(stderr, "Kept best is %d %d\n\n", splS->score, splS->splScore);
#endif
  }
}

/* FIXME : Frame shifts are a real pain.  Look at BM149342 for
 * example.  The scoring is not quite right in that case.  */
/* compute_max_score - find the best-scoring splice around the junction
 * given in splS (to1, to2, nFrom1 are read on entry).  Every configured
 * signal is tried in forward orientation when direction >= 0 and in
 * reverse orientation when direction <= 0.  score/splScore are reset to 0
 * and type to -1 first; the remaining fields are only written when some
 * candidate scores above that.  */
static void
compute_max_score(uchar *seq1, uchar *seq2, splice_score_p_t splS,
                  int direction)
{
  int k;
  unsigned int to1 = splS->to1;
  unsigned int to2 = splS->to2;
  unsigned int nFrom1 = splS->nFrom1;
  /* Scratch buffer: left window + 4 signal bases + right window.  */
  uchar *s = (uchar *) xmalloc((options.scoreSplice_window * 2 + 4) * sizeof(uchar));

  splS->score = 0;
  splS->splScore = 0;
  splS->type = -1;
  for (k = 0; k < (int) options.nbSplice; k++) {
#ifdef DEBUG
    fprintf(stderr, "\nChecking with %.4s\n\n", options.splice[k].fwd);
#endif
    if (direction >= 0)
      compute_max_score_1(seq1, seq2, splS, k, to1, to2, nFrom1, s, options.splice[k].fwd, 1);
    if (direction <= 0)
      compute_max_score_1(seq1, seq2, splS, k, to1, to2, nFrom1, s, options.splice[k].rev, -1);
  }
  free(s);
}

/* slide_intron - settle the exon boundaries of r->eCol on splice signals
 * and decide r->direction.  Four passes:
 *   1. abutting exons with a perfect splice vote for a direction;
 *   2. overlapping exons with exactly one perfect splice position are cut
 *      there;
 *   3. if the votes are tied, the direction with the larger summed score
 *      wins (forward on equality);
 *   4. every remaining boundary is moved to its best-scoring position for
 *      that direction, and exons emptied by the move are removed.  */
static void
slide_intron(result_p_t r, uchar *seq1, uchar *seq2)
{
  unsigned int i;

  /* First, try to get direction through perfect splices. */
  for (i = 1; i < r->eCol.nb; i++) {
    exon_p_t cur = r->eCol.e.exon[i - 1];
    exon_p_t next = r->eCol.e.exon[i];
    splice_score_t splS;
    cur->type = -1;
    cur->direction = 0;
    cur->splScore = 0;
    /* Only exons that abut exactly on sequence 2.  */
    if (next->from2 - cur->to2 != 1)
      continue;
    splS.to1 = cur->to1;
    splS.to2 = cur->to2;
    splS.nFrom1 = next->from1;
    if (perfect_spl_p(seq1, seq2, &splS)) {
      r->direction += splS.direction;
      cur->direction = splS.direction;
      cur->type = (char) splS.type;
      cur->splScore = 4 + options.scoreSplice_window * 2;
    }
  }
  /* Second, go through overlapping exons. */
  for (i = 1; i < r->eCol.nb; i++) {
    exon_p_t cur = r->eCol.e.exon[i - 1];
    exon_p_t next = r->eCol.e.exon[i];
    splice_score_p_t splS;
    unsigned int nb, j, nbP = 0;
    if (next->from2 > cur->to2)
      continue;
    /* One candidate per cut position inside the overlap.  */
    nb = cur->to2 - next->from2 + 2;
    splS = (splice_score_p_t) xmalloc(nb * sizeof(splice_score_t));
    for (j = 0; j < nb; j++) {
      splS[j].to1 = cur->to1 - nb + j + 1;
      splS[j].to2 = cur->to2 - nb + j + 1;
      splS[j].nFrom1 = next->from1 + j;
      if (perfect_spl_p(seq1, seq2, splS + j))
        nbP += 1;
      else
        splS[j].direction = 0;
    }
    /* Accept only an unambiguous perfect splice.  */
    if (nbP == 1)
      for (j = 0; j < nb; j++)
        if (splS[j].direction != 0) {
          r->direction += splS[j].direction;
          cur->direction = splS[j].direction;
          cur->type = (char) splS[j].type;
          cur->splScore = 4 + options.scoreSplice_window * 2;
          cur->to1 = splS[j].to1;
          cur->to2 = splS[j].to2;
          next->from2 = cur->to2 + 1;
          next->from1 = splS[j].nFrom1;
        }
    free(splS);
  }
  /* In case we are still undecided... */
  if (r->direction == 0) {
    unsigned int fwd = 0, rev = 0;
    for (i = 1; i < r->eCol.nb; i++) {
      exon_p_t cur = r->eCol.e.exon[i - 1];
      exon_p_t next = r->eCol.e.exon[i];
      splice_score_t max, cs;
      unsigned int nb, j;
      if (cur->to2 + 1 < next->from2)
        continue;
      if (cur->direction > 0) {
        fwd += cur->splScore;
        continue;
      }
      if (cur->direction < 0) {
        rev += cur->splScore;
        continue;
      }
      nb = cur->to2 - next->from2 + 2;
      max.type = -1;
      max.score = 0;
      max.splScore = 0;
      /* NOTE(review): max.direction is not initialised here; it is only
       * defined once some candidate replaces max below.  */
      for (j = 0; j < nb; j++) {
        cs.to1 = cur->to1 - nb + j + 1;
        cs.to2 = cur->to2 - nb + j + 1;
        cs.nFrom1 = next->from1 + j;
        compute_max_score(seq1, seq2, &cs, 0);
        if (splice_score_compare(&cs, &max) > 0)
          max = cs;
      }
      if (max.direction > 0)
        fwd += max.score;
      if (max.direction < 0)
        rev += max.score;
    }
    if (fwd >= rev)
      r->direction = 1;
    else
      r->direction = -1;
  }
  for (i = 1; i < r->eCol.nb; i++) {
    exon_p_t cur = r->eCol.e.exon[i - 1];
    exon_p_t next = r->eCol.e.exon[i];
    splice_score_t max, cs;
    unsigned int nb, j;
    /* Skip boundaries already settled in the chosen direction, and exons
     * that neither abut nor overlap on sequence 2.  */
    if ((cur->type >= 0 && cur->direction * r->direction > 0) || cur->to2 + 1 < next->from2)
      continue;
    nb = cur->to2 - next->from2 + 2;
    max.type = -1;
    max.score = 0;
    max.splScore = 0;
    /* NOTE(review): as above, max.direction/to1/to2/nFrom1 are only defined
     * once a candidate replaces max, but they are read unconditionally
     * after the loop.  */
    for (j = 0; j < nb; j++) {
      cs.to1 = cur->to1 - nb + j + 1;
      cs.to2 = cur->to2 - nb + j + 1;
      cs.nFrom1 = next->from1 + j;
      compute_max_score(seq1, seq2, &cs, r->direction);
      if (splice_score_compare(&cs, &max) > 0)
        max = cs;
    }
    cur->direction = max.direction;
    cur->type = (char) max.type;
    cur->splScore = max.score;
    cur->to1 = max.to1;
    cur->to2 = max.to2;
    next->from2 = cur->to2 + 1;
    next->from1 = max.nFrom1;
#ifdef DEBUG
    /* NOTE(review): the format literal below is split across a physical
     * line break in this copy; the break is kept exactly as found.  */
    fprintf(stderr, "Resolving intron (%d) %d\n" " %u..%u (%u..%u)" " %u..%u (%u..%u)" " scores: %u %u %u\n" "%.10s ... 
%.10s\n" "%.18s...%.18s\n", nb, r->direction, cur->from2, cur->to2, next->from2, next->to2, cur->from1, cur->to1, next->from1, next->to1, max.score, max.splScore, max.type, seq2 + cur->to2 - 10, seq2 + next->from2 - 1, seq1 + cur->to1 - 10, seq1 + next->from1 - 9);
#endif
    if (cur->to2 == cur->from2 || cur->to1 <= cur->from1) {
      /* Remove cur block, which has been absorbed into next. */
      free(cur);
      r->eCol.nb -= 1;
      i -= 1;
      memmove(r->eCol.e.exon + i, r->eCol.e.exon + i + 1, (r->eCol.nb - i) * sizeof(exon_p_t));
      if (i > 0) {
        /* Step back so the new neighbour pair gets examined again.  */
        i -= 1;
        cur = r->eCol.e.exon[i];
        next->from2 -= 1;
        next->from1 -= 1;
        cur->direction = 0;
        cur->type = 0;
        cur->splScore = 0;
      }
#ifdef DEBUG
      fprintf(stderr, "Removing absorbed cur exon %u %u\n", i, r->eCol.nb);
#endif
    }
    if (next->to2 <= next->from2 || next->to1 <= next->from1) {
      /* Remove next block, which has been absorbed into cur. */
      free(next);
      r->eCol.nb -= 1;
      memmove(r->eCol.e.exon + i, r->eCol.e.exon + i + 1, (r->eCol.nb - i) * sizeof(exon_p_t));
      i -= 1;
      cur->direction = 0;
      cur->type = 0;
      cur->splScore = 0;
#ifdef DEBUG
      fprintf(stderr, "Removing absorbed next exon %u %u\n", i, r->eCol.nb);
#endif
    }
  }
}

/* Compute some sort of score, using a Smith/Waterman style algorithm,
 * but allowing for only one gap.
 * We use a matrix of this form:
 * T C A G T ... * +---------------------- * A | * | +===+ * T | | 2 | * | +===+ | * C | | 0 1 | * | +=======+ * A | * | * T |
 *
 * Only three cells are kept per step: score[1] on the main diagonal,
 * score[0] and score[2] on the two neighbouring diagonals.  Leaving a
 * neighbouring diagonal for the main one costs 1.  Returns score[1] after
 * len positions.  */
static unsigned int
SWscore(uchar *s1, uchar *s2, unsigned int len)
{
  unsigned int i;
  int score[3];

  score[0] = score[2] = 0;
  score[1] = *s1 == *s2 ? 1 : 0;
  for (i = 1; i < len; i++) {
    /* The off-diagonals must be updated before score[1]: they read its
     * value from the previous step.  */
    score[0] = max(score[0] + (s1[i - 1] == s2[i] ? 1 : 0), score[1]);
    score[2] = max(score[2] + (s1[i] == s2[i - 1] ? 1 : 0), score[1]);
    score[1] = max(max(score[0] - 1, score[2] - 1), score[1] + (s1[i] == s2[i] ?
1 : 0)); } assert(score[1] >= 0); return (unsigned int) score[1]; } SIBsim4-0.20/misc.c0000644000551200011300000000270010220112463013107 0ustar chrisludwig/* $Id: misc.c,v 1.10 2005/03/22 21:59:15 c4chris Exp $ * * Christian Iseli, LICR ITO, Christian.Iseli@licr.org * * Copyright (c) 2001-2005 Swiss Institute of Bioinformatics. * Copyright (C) 1998-2001 Liliana Florea. * Copyright (C) 1998-2001 Scott Schwartz. */ #include #include #include #include #include #include "misc.h" /* format message, print it, and die */ void fatal(const char *fmt, ...) { extern char *argv0; extern char dna_seq_head[256]; extern char rna_seq_head[256]; va_list ap; va_start(ap, fmt); fflush(stdout); if (argv0) { char *p = strrchr(argv0, '/'); fprintf(stderr, "%s: ", p ? p+1 : argv0); } vfprintf(stderr, fmt, ap); va_end(ap); fprintf(stderr, "\n while processing:\n%.256s\n%.256s\n", dna_seq_head, rna_seq_head); #ifdef DEBUG abort(); #else exit(1); #endif } void * xmalloc(size_t size) { void *res = malloc(size); if (res == NULL) fatal("malloc of %zd failed: %s (%d)\n", size, strerror(errno), errno); return res; } void * xcalloc(size_t nmemb, size_t size) { void *res = calloc(nmemb, size); if (res == NULL) fatal("calloc of %zd, %zd failed: %s (%d)\n", nmemb, size, strerror(errno), errno); return res; } void * xrealloc(void *ptr, size_t size) { void *res = realloc(ptr, size); if (res == NULL) fatal("realloc of %p to %zd failed: %s (%d)\n", ptr, size, strerror(errno), errno); return res; } SIBsim4-0.20/misc.h0000644000551200011300000000111010041504447013116 0ustar chrisludwig/* $Id: misc.h,v 1.4 2004/04/21 14:45:59 chris Exp $ * * Christian Iseli, LICR ITO, Christian.Iseli@licr.org * * Copyright (c) 2001-2004 Swiss Institute of Bioinformatics. * Copyright (C) 1998-2001 Liliana Florea. * Copyright (C) 1998-2001 Scott Schwartz. */ #ifndef SIM_MISC_H #define SIM_MISC_H #ifdef __GNUC__ void fatal(const char *fmt, ...) 
__attribute__ ((format (printf, 1, 2) , __noreturn__)); #else void fatal(const char *fmt, ...); #endif void *xmalloc(size_t size); void *xcalloc(size_t nmemb, size_t size); void *xrealloc(void *ptr, size_t size); #endif SIBsim4-0.20/sim4.init.c0000644000551200011300000005571711322656623014031 0ustar chrisludwig/* $Id: sim4.init.c,v 1.91 2010/01/11 17:19:15 c4chris Exp $ * * Christian Iseli, LICR ITO, Christian.Iseli@licr.org * * Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2010 * Swiss Institute of Bioinformatics. * Copyright (C) 1998-2001 Liliana Florea. */ /* * TODO */ /* * sim4 - Align a cDNA sequence with a genomic sequence for it. * * The basic command syntax is * * sim4 [options] dna.seq rna.seq * * where dna.seq names a file containing a DNA sequence and rna.seq * names a file containing one or more RNA sequences. * The files are to be in FASTA format. Thus a typical sequence file * might begin: * * >BOVHBPP3I Bovine beta-globin psi-3 pseudogene, 5' end. * GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT * GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC */ #ifdef __sun #define _XOPEN_SOURCE /* tell sun we want getopt, etc. 
*/ #define _XPG5 /* and we want snprintf */ #endif #include #include #include #include #include #include #include #include #include #include #include #include "sim4.h" #include "align.h" #include "misc.h" #include "sim4b1.h" #if defined(DEBUG) && (DEBUG > 1) #include #endif static void init_seq(const char *, seq_p_t); static int get_next_seq(seq_p_t, unsigned int, int); static void seq_revcomp_inplace(seq_p_t); static void print_align_lat(uchar *, uchar *, result_p_t); static void print_polyA_info(seq_p_t, seq_p_t, collec_p_t, sim4_stats_p_t); static void print_res(result_p_t, int, seq_p_t, seq_p_t); static void init_splice_junctions(void); static void bug_handler(int); #ifdef DEBUG static void free_seq(seq_p_t); #endif static const char Usage[] = "%s [options] dna est_db\n\n" "This is SIBsim4 version 0.20.\n\n" #ifdef DEBUG "Debug version\n\n" #endif "Available options (default value in braces[]):\n" " -A output format\n" " 0: exon endpoints only\n" " 1: alignment text\n" " 3: both exon endpoints and alignment text\n" " 4: both exon endpoints and alignment text with polyA info\n" " 5: same as 4 plus display chimeric alignment text\n" " (which is currently problematic)\n" " Note that 2 is unimplemented [%d]\n" " -C MSP score threshold for the second pass [%d]\n" " -c minimum score cutoff [%d]\n" " -E cutoff value [%d]\n" " -f score filter in percent (0 to disable filtering) [%d]\n" " -g join exons when gap on genomic and RNA have lengths which\n" " differ at most by this percentage [%d]\n" " -H report chimeric transcripts when the best score is lower\n" " than this percentage of the overall RNA coverage and the\n" " chimera score is greater than this percentage of the\n" " RNA length (0 disables this report) [%u]\n" " -I window width in which to search for intron splicing [%d]\n" " -K MSP score threshold for the first pass [%d]\n" " -L a comma separated list of forward splice-types [%s]\n" " -M scoring splice sites, evaluate match within M nucleotides [%d]\n" " -o 
offset nt positions in dna sequence by this amount [%u]\n" " -q penalty for a nucleotide mismatch [%d]\n" " -R direction of search\n" " 0: search the '+' (direct) strand only\n" " 1: search the '-' strand only\n" " 2: search both strands and report the best match\n" " [%d]\n" " -r reward for a nucleotide match [%d]\n" " -S max number of insertions and/or deletions evaluated on the DNA\n" " strand when determining the best splice site positions [%d]\n" " -s split score in percent [%d]\n" " -W word size [%d]\n" " -X value for terminating word extensions [%d]\n"; options_t options; char *argv0; char dna_seq_head[256]; char rna_seq_head[256]; int main(int argc, char *argv[]) { int count; seq_t seq1, seq2; hash_env_t he; collec_t res, rev_res; #if defined(DEBUG) && (DEBUG > 1) mcheck(NULL); mtrace(); #endif argv0 = argv[0]; if (setlocale(LC_ALL, "POSIX") == NULL) fprintf(stderr, "%s: Warning: could not set locale to POSIX\n", argv[0]); signal(SIGSEGV, bug_handler); signal(SIGBUS, bug_handler); /* Default options. 
*/ options.C = DEFAULT_C; options.cutoff = DIST_CUTOFF; options.gapPct = DEFAULT_GAPPCT; options.intron_window = 6; options.K = DEFAULT_K; options.splice_type_list = (unsigned char *) "GTAG,GCAG,GTAC,ATAC"; options.nbSplice = 4; options.scoreSplice_window = 10; options.mismatchScore = MISMATCH; options.reverse = 2; options.matchScore = MATCH; options.W = DEFAULT_W; options.X = DEFAULT_X; options.filterPct = DEFAULT_FILTER; options.minScore_cutoff = MATCH_CUTOFF; options.splitScorePct = DEFAULT_SPLIT_SCORE; options.huntChimera = DEFAULT_CHIMERA; options.spliceInDel = DEFAULT_SPLICE_I_D; while (1) { int c = getopt(argc, argv, "A:C:c:E:f:g:H:I:K:L:M:o:q:R:r:S:s:W:X:"); if (c == -1) break; switch (c) { case 'A': options.ali_flag = atoi(optarg); if (options.ali_flag < 0 || options.ali_flag > 5) fatal("A must be one of 0, 1, 2, 3, 4, or 5.\n"); break; case 'C': { int val = atoi(optarg); if (val < 0) fatal("Value for option C must be non-negative.\n"); options.C = (unsigned int) val; break; } case 'c': { int val = atoi(optarg); if (val < 0) fatal("Value for option c must be non-negative.\n"); options.minScore_cutoff = (unsigned int) val; break; } case 'E': options.cutoff = atoi(optarg); if (options.cutoff < 3 || options.cutoff > 10) fatal("Cutoff (E) must be within [3,10].\n"); break; case 'f': options.filterPct = (unsigned int) atoi(optarg); if (options.filterPct > 100) fatal("Filter in percent (f) must be within [0,100].\n"); break; case 'g': options.gapPct = (unsigned int) atoi(optarg); if (options.gapPct > 100) fatal("Length difference in percent (g) must be within [0,100].\n"); break; case 'H': options.huntChimera = (unsigned int) atoi(optarg); if (options.huntChimera > 100) fatal("Chimera filter in percent (H) must be within [0,100].\n"); break; case 'I': { int val = atoi(optarg); if (val < 0) fatal("Value for option I must be non-negative.\n"); options.intron_window = (unsigned int) val; break; } case 'K': { int val = atoi(optarg); if (val < 0) fatal("Value for 
option K must be non-negative.\n"); options.K = (unsigned int) val; break; } case 'L': { size_t i; size_t len = strlen(optarg); options.splice_type_list = (unsigned char *) optarg; options.nbSplice = 1; if (len % 5 != 4) fatal("Splice types list has illegal length (%zu)\n", len); for (i = 0; i < len; i++) if (i % 5 == 4) { if (options.splice_type_list[i] != ',') fatal("Comma expected instead of %c at position %zu" "in splice types list.\n", options.splice_type_list[i], i); options.nbSplice += 1; } else { if (options.splice_type_list[i] != 'A' && options.splice_type_list[i] != 'C' && options.splice_type_list[i] != 'G' && options.splice_type_list[i] != 'T') fatal("Expected 'A', 'C', 'G' or 'T' instead of '%c' at" "position %zu in splice types list.\n", options.splice_type_list[i], i); } break; } case 'M': { int val = atoi(optarg); if (val < 0) fatal("Value for option M must be non-negative.\n"); options.scoreSplice_window = (unsigned int) val; break; } case 'o': { int val = atoi(optarg); if (val < 0) fatal("Value for option o must be non-negative.\n"); options.dnaOffset = (unsigned int) val; break; } case 'q': options.mismatchScore = atoi(optarg); break; case 'R': options.reverse = (unsigned int) atoi(optarg); if (options.reverse > 2) fatal("R must be one of 0, 1, or 2.\n"); break; case 'r': options.matchScore = atoi(optarg); break; case 'S': options.spliceInDel = (unsigned int) atoi(optarg); break; case 's': options.splitScorePct = (unsigned int) atoi(optarg); if (options.splitScorePct > 100) fatal("Split score in percent (s) must be within [0,100].\n"); break; case 'W': options.W = (unsigned int) atoi(optarg); if (options.W < 1 || options.W > 15) fatal("W must be within [1,15].\n"); break; case 'X': options.X = (unsigned int) atoi(optarg); if (options.X < 1) fatal("X must be positive.\n"); break; case '?': break; default: fprintf(stderr, "?? 
getopt returned character code 0%o ??\n", c); } } if (optind + 2 != argc) { fprintf(stderr, Usage, argv[0], options.ali_flag, options.C, options.minScore_cutoff, options.cutoff, options.filterPct, options.gapPct, options.huntChimera, options.intron_window, options.K, options.splice_type_list, options.scoreSplice_window, options.dnaOffset, options.mismatchScore, options.reverse, options.matchScore, options.spliceInDel, options.splitScorePct, options.W, options.X); return 1; } /* read seq1 */ init_seq(argv[optind], &seq1); if (get_next_seq(&seq1, options.dnaOffset, 1) != 0) fatal("Cannot read sequence from %s.\n", argv[optind]); strncpy(dna_seq_head, seq1.header, 256UL); /* read seq2 */ init_seq(argv[optind + 1], &seq2); if (get_next_seq(&seq2, 0, 0) != 0) fatal("Cannot read sequence from %s.\n", argv[optind + 1]); init_encoding(); init_hash_env(&he, options.W, seq1.seq, seq1.len); init_col(&res, 1); init_col(&rev_res, 1); bld_table(&he); init_splice_junctions(); count = 0; while (!count || get_next_seq(&seq2, 0, 0) == 0) { unsigned int curRes; strncpy(rna_seq_head, seq2.header, 256UL); ++count; switch (options.reverse) { case 0: SIM4(&he, &seq2, &res); break; case 2: SIM4(&he, &seq2, &res); case 1: seq_revcomp_inplace(&seq2); SIM4(&he, &seq2, &rev_res); break; default: fatal ("Unrecognized request for EST orientation.\n"); } /* Show chimeric result if requested. 
*/ if (options.huntChimera > 0) { unsigned int max_nmatches = 0; unsigned int max_cmatches = 0; for (curRes = 0; curRes < rev_res.nb; curRes++) { result_p_t r = rev_res.e.result[curRes]; if (r->chimera) { if (r->st.nmatches > max_cmatches) max_cmatches = r->st.nmatches; } else { if (r->st.nmatches > max_nmatches) max_nmatches = r->st.nmatches; } } for (curRes = 0; curRes < res.nb; curRes++) { result_p_t r = res.e.result[curRes]; if (r->chimera) { if (r->st.nmatches > max_cmatches) max_cmatches = r->st.nmatches; } else { if (r->st.nmatches > max_nmatches) max_nmatches = r->st.nmatches; } } if ((max_cmatches * options.huntChimera) / 100 > max_nmatches && max_cmatches > (seq2.len * options.huntChimera) / 100) { /* The chimera wins... */ for (curRes = 0; curRes < rev_res.nb; curRes++) { result_p_t r = rev_res.e.result[curRes]; if (r->chimera == 0) r->st.nmatches = 0; } for (curRes = 0; curRes < res.nb; curRes++) { result_p_t r = res.e.result[curRes]; if (r->chimera == 0) r->st.nmatches = 0; } } else { /* The chimera loses. */ for (curRes = 0; curRes < rev_res.nb; curRes++) { result_p_t r = rev_res.e.result[curRes]; if (r->chimera) r->st.nmatches = 0; } for (curRes = 0; curRes < res.nb; curRes++) { result_p_t r = res.e.result[curRes]; if (r->chimera) r->st.nmatches = 0; } } } /* Keep only the best matches, according to filterPct. 
*/ if (options.filterPct > 0) { unsigned int max_nmatches = 0; for (curRes = 0; curRes < rev_res.nb; curRes++) { result_p_t r = rev_res.e.result[curRes]; if (r->st.nmatches > max_nmatches) max_nmatches = r->st.nmatches; } for (curRes = 0; curRes < res.nb; curRes++) { result_p_t r = res.e.result[curRes]; if (r->st.nmatches > max_nmatches) max_nmatches = r->st.nmatches; } max_nmatches = (max_nmatches * options.filterPct) / 100; for (curRes = 0; curRes < rev_res.nb; curRes++) { result_p_t r = rev_res.e.result[curRes]; if (r->st.nmatches < max_nmatches) r->st.nmatches = 0; } for (curRes = 0; curRes < res.nb; curRes++) { result_p_t r = res.e.result[curRes]; if (r->st.nmatches < max_nmatches) r->st.nmatches = 0; } } /* Now, print results. */ for (curRes = 0; curRes < rev_res.nb; curRes++) print_res(rev_res.e.result[curRes], 1, &seq1, &seq2); rev_res.nb = 0; if (options.reverse && options.ali_flag) /* reverse-complement back seq2 for alignment */ seq_revcomp_inplace(&seq2); for (curRes = 0; curRes < res.nb; curRes++) print_res(res.e.result[curRes], 0, &seq1, &seq2); res.nb = 0; } #ifdef DEBUG fprintf(stderr, "DEBUG mode: freeing all memory...\n"); fflush(stdout); fflush(stderr); free_hash_env(&he); free_seq(&seq1); free_seq(&seq2); free(options.splice); free(res.e.elt); free(rev_res.e.elt); #endif return 0; } static const unsigned char dna_complement[256] = " " " TVGH CD M KN YSA BWXR tvgh cd m kn ysa bwxr " " " " "; /* ................................................................ */ /* @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~. */ /* ................................................................ */ /* ................................................................ 
*/ static void init_splice_junctions(void) { unsigned int i; options.splice = (junction_p_t) xmalloc(options.nbSplice * sizeof(junction_t)); for (i = 0; i < options.nbSplice; i++) { unsigned int j; for (j = 0; j < 4; j++) { uchar c = options.splice_type_list[i * 5 + j]; options.splice[i].fwd[j] = c; options.splice[i].rev[3 - j] = dna_complement[c]; } } #ifdef DEBUG for (i = 0; i < options.nbSplice; i++) fprintf(stderr, "Splice[%u]: %.4s %.4s\n", i, options.splice[i].fwd, options.splice[i].rev); #endif } static void print_res(result_p_t res, int rev, seq_p_t seq1, seq_p_t seq2) { unsigned int i; if (res->st.nmatches >= options.minScore_cutoff) { printf("\n%s%s\n", seq1->header, seq2->header); if (res->chimera) printf("Chimera\n\n"); if (rev) printf("(complement)\n\n"); switch (options.ali_flag) { case 0: print_exons(&res->eCol, res->direction); break; case 1: print_align_lat(seq1->seq, seq2->seq, res); break; case 3: print_exons(&res->eCol, res->direction); print_align_lat(seq1->seq, seq2->seq, res); break; case 4: case 5: print_exons(&res->eCol, res->direction); print_polyA_info(seq1, seq2, &res->eCol, &res->st); print_align_lat(seq1->seq, seq2->seq, res); break; default: fatal("Unrecognized option for alignment output.\n"); } printf("\n"); } for (i = 0; i < res->eCol.nb; i++) free(res->eCol.e.elt[i]); free(res->eCol.e.elt); if (res->sList) free_align(res->sList); free(res); } static void print_polyA_info(seq_p_t s1, seq_p_t s2, collec_p_t eCol, sim4_stats_p_t st) { if (st->polyA_cut) { unsigned int cnt = 0, cntDna = 0, pos, i, scanLen = 50; char *pSig, buf[51]; exon_p_t e = eCol->e.exon[eCol->nb - 1]; for (pos = 0; pos < 10 && e->to2 + pos < s2->len; pos++) if (s2->seq[e->to2 + pos] == 'A') cnt += 1; while (e->to2 + pos < s2->len && s2->seq[e->to2 + pos] == 'A') { pos += 1; cnt += 1; } for (i = 0; i < s1->len && i < pos; i++) if (s1->seq[e->to1 + i] == 'A') cntDna += 1; printf("\nPolyA site %u nt, %u/%u A's %u\n R %.*s %u\n D %*.*s %u\n", pos, cnt, cntDna, e->to1 
+ 1 + options.dnaOffset, pos, s2->seq + e->to2, e->to2 + 1, pos, i, s1->seq + e->to1, e->to1 + 1 + options.dnaOffset); if (e->to1 < scanLen) scanLen = e->to1; strncpy(buf, (char *) s1->seq + e->to1 - scanLen, (size_t) scanLen); buf[scanLen] = 0; pSig = strstr(buf, "AATAAA"); if (pSig == NULL) pSig = strstr(buf, "ATTAAA"); if (pSig != NULL) printf("PolyA signal %u\n", (unsigned int) (pSig - buf + e->to1 - scanLen + 1 + options.dnaOffset)); } if (st->polyT_cut) { unsigned int cnt = 0, cntDna = 0, pos, i; char *pSig, buf[51]; exon_p_t e = eCol->e.exon[0]; for (pos = 0; pos < 10 && pos < e->from2 - 1; pos++) if (s2->seq[e->from2 - 2 - pos] == 'T') cnt += 1; while (pos < e->from2 - 1 && s2->seq[e->from2 - 2 - pos] == 'T') { pos += 1; cnt += 1; } for (i = 0; i < e->from1 - 1 && i < pos; i++) if (s1->seq[e->from1 - 2 - i] == 'T') cntDna += 1; printf("\nPolyA site %u nt, %u/%u A's %u minus strand\n R %.*s %u\n D %*.*s %u\n", pos, cnt, cntDna, e->from1 - 1 + options.dnaOffset, pos, s2->seq + (e->from2 - 1 - pos), e->from2 - 1, pos, i, s1->seq + (e->from1 - 1 - i), e->from1 - 1 + options.dnaOffset); strncpy(buf, (char *) s1->seq + e->from1 - 1, 50UL); buf[50] = 0; pSig = strstr(buf, "TTTATT"); if (pSig == NULL) pSig = strstr(buf, "TTTAAT"); if (pSig != NULL) printf("PolyA signal %u minus strand\n", (unsigned int) (pSig - buf + e->from1 + 5 + options.dnaOffset)); } } static void print_align_lat(uchar *seq1, uchar *seq2, result_p_t r) { int *S; edit_script_list_p_t head, aligns; if (r->sList == NULL) return; if (r->chimera && options.ali_flag < 5) return; aligns = r->sList; while (aligns != NULL) { head = aligns; aligns = aligns->next_script; S = (int *) xmalloc((2 * head->len2 + 1 + 1) * sizeof(int)); S++; S2A(head->script, S); Free_script(head->script); IDISPLAY(seq1 + head->offset1 - 1 - 1, seq2 + head->offset2 - 1 - 1, head->len1, head->len2, S, head->offset1, head->offset2, &r->eCol, r->direction); free(S - 1); free(head); } r->sList = NULL; } #ifdef DEBUG static void 
free_buf(read_buf_p_t b) { free(b->line); } static void free_seq(seq_p_t sp) { free(sp->seq); free(sp->header); free_buf(&sp->rb); if (sp->fName != NULL) close(sp->fd); } #endif static void grow_read_buf(read_buf_p_t b) { b->lmax += BUF_SIZE; b->line = xrealloc(b->line, b->lmax * sizeof(char)); } static char * shuffle_line(read_buf_p_t b, size_t *cur) { if (b->ic == 0 || *cur >= b->ic) return NULL; /* Make sure we have enough room in line. */ if (b->lmax <= b->lc + (b->ic - *cur)) grow_read_buf(b); while (*cur < b->ic && b->in[*cur] != '\n') b->line[b->lc++] = b->in[(*cur)++]; if (*cur < b->ic) { /* Ok, we have our string. */ /* Copy the newline. */ b->line[b->lc++] = b->in[(*cur)++]; /* We should be fine, since we read BUF_SIZE -1 at most... */ b->line[b->lc] = 0; /* Adjust the input buffer. */ if (*cur < b->ic) { memmove(b->in, b->in + *cur, (b->ic - *cur) * sizeof(char)); b->ic -= (unsigned int) *cur; } else b->ic = 0; *cur = 0; return b->line; } /* Go read some more. */ b->ic = 0, *cur = 0; return NULL; } static char * read_line_buf(read_buf_p_t b, int fd) { char *s = NULL; ssize_t rc; size_t cur = 0; b->lc = 0; if ((s = shuffle_line(b, &cur)) != NULL) return s; do { if ((rc = read(fd, b->in + b->ic, (size_t) (BUF_SIZE - b->ic - 1))) == -1) { if (errno != EINTR) fatal("Could not read from %d: %s(%d)\n", fd, strerror(errno), errno); } else b->ic += (unsigned int) rc; s = shuffle_line(b, &cur); if (s == NULL && rc == 0) { /* Got to the EOF... 
*/ b->line[b->lc] = 0; s = b->line; } } while (s == NULL); return s; } static void init_buf(read_buf_p_t b) { b->line = xmalloc(BUF_SIZE * sizeof(char)); b->lmax = BUF_SIZE; b->lc = 0; b->ic = 0; } static void init_seq(const char *fName, seq_p_t sp) { sp->fName = fName; sp->header = NULL; sp->seq = NULL; init_buf(&sp->rb); if (fName != NULL) { sp->fd = open(fName, O_RDONLY); if (sp->fd == -1) fatal("Could not open file %s: %s(%d)\n", fName, strerror(errno), errno); } else sp->fd = 0; sp->len = 0; sp->maxHead = 0; sp->max = 0; read_line_buf(&sp->rb, sp->fd); } static int get_next_seq(seq_p_t sp, unsigned int offset, int warnMultiSeq) { const unsigned int lenStr = 24; unsigned int headerLen; char *buf = sp->rb.line; int res; while (sp->rb.lc > 0 && buf[0] != '>') buf = read_line_buf(&sp->rb, sp->fd); if (sp->rb.lc == 0) return -1; /* We have the FASTA header. */ if (sp->rb.lc + lenStr + 1 > sp->maxHead) { sp->maxHead = sp->rb.lc + lenStr + 1; sp->header = (char *) xrealloc(sp->header, sp->maxHead * sizeof(char)); } headerLen = sp->rb.lc; memcpy(sp->header, buf, (sp->rb.lc + 1) * sizeof(char)); sp->len = 0; buf = read_line_buf(&sp->rb, sp->fd); while (sp->rb.lc > 0 && buf[0] != '>') { unsigned char c; /* Make sure we have enough room for this additional line. */ if (sp->len + sp->rb.lc + 1 > sp->max) { sp->max = max(sp->len + sp->rb.lc + 1, sp->max + 0x40000); sp->seq = (unsigned char *) xrealloc(sp->seq, sp->max * sizeof(unsigned char)); } while ((c = (unsigned char) *buf++) != 0) { if (isupper(c)) { sp->seq[sp->len++] = c; } else if (islower(c)) { sp->seq[sp->len++] = (unsigned char) toupper(c); } } buf = read_line_buf(&sp->rb, sp->fd); } if (warnMultiSeq && sp->rb.lc > 0) fprintf(stderr, "\n" "*** WARNING ***\n" "*** there appears to be several sequences in the DNA ***\n" "*** sequence file. Only the first one will be used, ***\n" "*** which might not be what was intended. 
***\n" "\n"); sp->seq[sp->len] = 0; buf = strstr(sp->header, "; LEN="); if (buf) { char *s = buf + 6; headerLen -= 6; while (isdigit(*s)) { s += 1; headerLen -= 1; } while (*s) *buf++ = *s++; } buf = sp->header + headerLen - 1; while (iscntrl(*buf) || isspace(*buf)) buf -= 1; res = snprintf(buf + 1, (size_t) lenStr, "; LEN=%u\n", sp->len + offset); if (res < 0 || res >= (int) lenStr) fatal("Sequence too long: %u\n", sp->len); return 0; } static void seq_revcomp_inplace(seq_p_t seq) { unsigned char *s = seq->seq; unsigned char *t = seq->seq + seq->len; unsigned char c; while (s < t) { c = dna_complement[*--t]; *t = dna_complement[*s]; *s++ = c; } } static void bug_handler(int signum) { fflush(stdout); fflush(stderr); fprintf(stderr, "\nCaught signal %d while processing:\n%.256s\n%.256s\n", signum, dna_seq_head, rna_seq_head); abort(); }