sim4.2012-10-10/0000755000515200116500000000000012035303737012617 5ustar floreasalzbergsim4.2012-10-10/COPYING0000644000515200116500000004311007733353756013670 0ustar floreasalzberg GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. 
We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. 
The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. 
(Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Also add information on how to contact you by electronic and paper mail. 
If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. <signature of Ty Coon>, 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License. 
sim4.2012-10-10/Xtend1.c0000444000515200116500000004755707733353757014166 0ustar floreasalzberg#include
#include
#include
#include "psublast.h"
#include "sim4.h"
#include "Xtend1.h"
#include "sim4b1.h"
/* NOTE(review): the first line of this span starts with what looks like an
 * archive member header for Xtend1.c, and the three bare #include directives
 * carry no header name in this copy of the file. */

#ifndef __lint
/*@unused@*/
static const char rcsid[] = "$Id: Xtend1.c,v 1.9 2000/06/05 22:48:19 florea Exp $";
#endif

static void free_coords(coords ***,int);

/*
 * Xextend_bw - extend a match backward, starting at (row=m, col=n) and
 * moving toward row 0 / column 0.
 *
 * s1, s2            sequences; s1[row-1] is compared with s2[col-1]
 * m, n              starting row in s1 and starting column in s2
 * offset1, offset2  added to row / col when positions are reported
 * line1, line2      out: row+offset1 and col+offset2 of the reported end point
 *
 * While walking, every place where s2 holds "AG" or "AC" just before the
 * current column is recorded in trace_AG / trace_AC for the current (d, k)
 * cell.  Before returning, the records of the reported cell are copied into
 * last_AG and last_AC, which are not declared in this function.
 *
 * Returns 0 when the initial diagonal run already reaches row 0 or column 0,
 * otherwise the value of d at which the search stopped.  All scratch arrays
 * and all coords allocated here are released on every return path.
 */
int Xextend_bw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2, int *line1, int *line2)
{
    int col, /* column number */
        row, /* row number */
        max_d, /* bound on the length of the edit script */
        d, /* current compressed distance */
        k, /* current diagonal */
        DELTA, /* n-m */
        ORIGIN,
        lower,
        upper;
    int *last_d, *temp_d; /* column containing the last p */
    int *min_row, *min_diag; /* min (b)/ max (f) row (and diagonal) */
                             /* reached for cost d=0, ... m. */
    coords ***trace_AG, ***trace_AC;
    coords *AG_cell, *AC_cell, *new;
    ValNodePtr data_list = NULL, prev = NULL;

    DELTA = n-m;
    max_d = m+1;

    /* One row of m+n+1 cell pointers per value of d in 0..max_d. */
    trace_AG = (coords ***)ckalloc((max_d+1)*sizeof(coords **));
    trace_AC = (coords ***)ckalloc((max_d+1)*sizeof(coords **));
    for (d=0; d<=max_d; d++) {
        trace_AG[d] = (coords **)ckalloc((m+n+1)*sizeof(coords *));
        trace_AC[d] = (coords **)ckalloc((m+n+1)*sizeof(coords *));
    }

    /* The starting diagonal is stored at index ORIGIN+DELTA; it is seeded
     * with the current contents of last_AG / last_AC. */
    ORIGIN = m;
    trace_AG[0][ORIGIN+DELTA] = &last_AG;
    trace_AC[0][ORIGIN+DELTA] = &last_AC;

    /* Slide back along the starting diagonal while characters match. */
    for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--)
        /*LINTED empty loop body*/;

    /* Scan the columns covered by that run, from n downward; each later hit
     * overwrites the cell, so the hit with the smallest k is kept. */
    for (k=n; (k>=2) && (k>=col); k--)
        if (!strncmp((char *)(s2+k-2),"AG",2)) {
            new = (coords *)ckalloc(sizeof(coords));
            link_to_data_list((Pointer)new, &data_list, &prev);
            new->pos2 = k-DELTA+offset1 +1; /* to compensate for -1 */
            new->pos1 = k+offset2 +1; /* refer to sim4b1.c */
            trace_AG[0][ORIGIN+DELTA] = new;
        } else if (!strncmp((char *)(s2+k-2),"AC",2)) {
            new = (coords *)ckalloc(sizeof(coords));
            link_to_data_list((Pointer)new, &data_list, &prev);
            new->pos2 = k-DELTA+offset1 +1;
            new->pos1 = k+offset2 +1;
            trace_AC[0][ORIGIN+DELTA] = new;
        }

    /* The run reached an edge: report it with distance 0. */
    if ((row == 0) || (col == 0))
    {
        *line1 = row+offset1;
        *line2 = col+offset2;
        /* Copy out before ValNodeFreeData releases the recorded coords. */
        (void)memcpy(&last_AG,trace_AG[0][ORIGIN+DELTA],sizeof(coords));
        (void)memcpy(&last_AC,trace_AC[0][ORIGIN+DELTA],sizeof(coords));
        ValNodeFreeData(data_list);
        free_coords(trace_AG,max_d+1);
        free_coords(trace_AC,max_d+1);
        return 0;
    }

    last_d = (int *)ckalloc((m+n+1)*sizeof(int));
    temp_d = (int *)ckalloc((m+n+1)*sizeof(int));

    for (k=0; k<=m+n; ++k) last_d[k]=m+1;
    last_d[ORIGIN+DELTA] = row;

    lower = ORIGIN + DELTA - 1;
    upper = ORIGIN + DELTA + 1;

    /* NOTE(review): min_row and min_diag hold m+1 entries (indices 0..m),
     * while the loop below admits d up to max_d = m+1 and writes min_row[d].
     * Unless good_ratio always stops the loop earlier, d == m+1 would index
     * one past the end - confirm. */
    min_row = (int *)ckalloc((m+1)*sizeof(int));
    min_diag = (int *)ckalloc((m+1)*sizeof(int));

    for (d=1; d<=m; d++)
        min_row[d] = m+1;

    min_row[0] = last_d[ORIGIN+DELTA];
    min_diag[0] = ORIGIN + DELTA;

    d = 0;
    /* Keep going while the progress recorded for d-1 or d-2 is still
     * accepted by good_ratio (defined elsewhere). */
    while ((++d<=max_d) &&
           ((d-1<=good_ratio(m-min_row[d-1])) ||
            ((d>=2) && (d-2<=good_ratio(m-min_row[d-2]))))) {

        /* for each relevant diagonal ... */
        for (k = lower; k <= upper; k++) {

            /* find a d on diagonal k */
            if (k==-d+DELTA+ORIGIN) {
                /* move down from the last d-1 on diagonal k+1 */
                row = last_d[k+1];
                /* op = INSERT; */
                AG_cell = trace_AG[d-1][k+1];
                AC_cell = trace_AC[d-1][k+1];
            } else if (k==d+DELTA+ORIGIN) {
                /* move right from the last d-1 on diagonal k-1 */
                row = last_d[k-1]-1;
                /* op = DELETE; */
                AG_cell = trace_AG[d-1][k-1];
                AC_cell = trace_AC[d-1][k-1];
            } else if ((last_d[k]-1<=last_d[k+1]) &&
                       (last_d[k]-1<=last_d[k-1]-1)) {
                /* substitution */
                row = last_d[k]-1;
                /* op = SUBSTITUTE; */
                AG_cell = trace_AG[d-1][k];
                AC_cell = trace_AC[d-1][k];
            } else if ((last_d[k-1]-1<=last_d[k+1]) &&
                       (last_d[k-1]-1<=last_d[k]-1)) {
                /* move right from the last d-1 on diagonal k-1 */
                row = last_d[k-1]-1;
                /* op = DELETE; */
                AG_cell = trace_AG[d-1][k-1];
                AC_cell = trace_AC[d-1][k-1];
            } else {
                /* move left from the last d-1 on diagonal k+1 */
                row = last_d[k+1];
                /* op = INSERT; */
                AG_cell = trace_AG[d-1][k+1];
                AC_cell = trace_AC[d-1][k+1];
            }

            /* code common to the three cases */
            /* slide down the diagonal */
            col = row+k-ORIGIN;
            /* Inherit the records of the predecessor cell; they are replaced
             * below whenever a newer AG / AC is met on this diagonal. */
            trace_AG[d][k] = AG_cell;
            trace_AC[d][k] = AC_cell;

            while ((row > 0) &&
                   (col > 0) && (s1[row-1]==s2[col-1])) {
                if ((col>1) && !strncmp((char *)(s2+col-2),"AG",2)) {
                    new = (coords *)ckalloc(sizeof(coords));
                    link_to_data_list((Pointer)new, &data_list, &prev);
                    new->pos1 = row + k - ORIGIN + offset2 +1;
                    new->pos2 = row + offset1 +1;
                    trace_AG[d][k] = new;
                } else if ((col>1) && !strncmp((char *)(s2+col-2),"AC",2)) {
                    new = (coords *)ckalloc(sizeof(coords));
                    link_to_data_list((Pointer)new, &data_list, &prev);
                    new->pos1 = row + k - ORIGIN + offset2 +1;
                    new->pos2 = row + offset1 +1;
                    trace_AC[d][k] = new;
                }
                row--;
                col--;
            }

            /* Same test once more at the position where the slide stopped. */
            if ((col>1) && !strncmp((char *)(s2+col-2),"AG",2)) {
                new = (coords *)ckalloc(sizeof(coords));
                link_to_data_list((Pointer)new, &data_list, &prev);
                new->pos1 = row + k - ORIGIN + offset2 +1;
                new->pos2 = row + offset1 +1;
                trace_AG[d][k] = new;
            } else if ((col>1) && !strncmp((char *)(s2+col-2),"AC",2)) {
                new = (coords *)ckalloc(sizeof(coords));
                link_to_data_list((Pointer)new, &data_list, &prev);
                new->pos1 = row + k - ORIGIN + offset2 +1;
                new->pos2 = row + offset1 +1;
                trace_AC[d][k] = new;
            }

            temp_d[k] = row;

            if ((row == 0) && (col == 0)) {
                /* hit southeast corner; have the answer */
                (void)memcpy(&last_AG,trace_AG[d][k],sizeof(coords));
                (void)memcpy(&last_AC,trace_AC[d][k],sizeof(coords));
                free(last_d);
                free(temp_d);
                free(min_row);
                free(min_diag);
                ValNodeFreeData(data_list);
                free_coords(trace_AG,max_d+1);
                free_coords(trace_AC,max_d+1);
                *line1 = row+offset1;
                *line2 = col+offset2;
                return d;
            }
            if (row == 0) {
                /* hit first row; don't look further */
                (void)memcpy(&last_AG,trace_AG[d][k],sizeof(coords));
                (void)memcpy(&last_AC,trace_AC[d][k],sizeof(coords));
                free(last_d);
                free(temp_d);
                free(min_row);
                free(min_diag);
                ValNodeFreeData(data_list);
                free_coords(trace_AG,max_d+1);
                free_coords(trace_AC,max_d+1);
                *line1 = row+offset1;
                *line2 = col+offset2;
                return d;
            }
            if (col == 0) {
                /* hit last column; don't look further */
                (void)memcpy(&last_AG,trace_AG[d][k],sizeof(coords));
                (void)memcpy(&last_AC,trace_AC[d][k],sizeof(coords));
                free(last_d);
                free(temp_d);
                free(min_row);
                free(min_diag);
                ValNodeFreeData(data_list);
                free_coords(trace_AG,max_d+1);
                free_coords(trace_AC,max_d+1);
                *line1 = row+offset1;
                *line2 = col+offset2;
                return d;
            }
        }

        min_row[d] = last_d[ORIGIN+DELTA];
        min_diag[d] = ORIGIN+DELTA;
        /* NOTE(review): text is missing from this copy of the file inside
         * the next statement (after the temp_d[k] operand).  The lost part
         * presumably held the rest of this loop, the end of the enclosing
         * while loop and the start of the back-off loop - restore it from
         * the original Xtend1.c. */
        for (k=lower; k<=upper; ++k) if (temp_d[k]0) && (min_row[d-1]-min_row[d]<3)) d--;

    *line1 = min_row[d]+offset1;
    *line2 = min_row[d]+min_diag[d]-ORIGIN+offset2;
    (void)memcpy(&last_AG,trace_AG[d][min_diag[d]],sizeof(coords));
    (void)memcpy(&last_AC,trace_AC[d][min_diag[d]],sizeof(coords));

    free(min_row);
    free(min_diag);
    free(last_d);
    free(temp_d);
    ValNodeFreeData(data_list);
    free_coords(trace_AG,max_d+1);
    free_coords(trace_AC,max_d+1);

    return d;
}

/*
 * Xextend_fw - extend a match forward, starting at (row=0, col=0) and
 * moving toward row m / column n.
 *
 * s1, s2            sequences; s1[row] is compared with s2[col]
 * m, n              row and column limits of the search
 * offset1, offset2  added to row / col when positions are reported
 * line1, line2      out: row+offset1 and col+offset2 of the reported end point
 *
 * Positions are recorded per (d, k) cell in trace_GT / trace_CT; the visible
 * test looks for "CT" at s2+k, and the matching "GT" test presumably sits in
 * the text that is missing from this copy.  Before returning, the records of
 * the reported cell are copied into last_GT and last_CT, which are not
 * declared in this function.
 *
 * Returns 0 when the initial diagonal run already reaches row m or column n,
 * otherwise the value of d at which the search stopped.  All scratch arrays
 * and all coords allocated here are released on every return path.
 */
int Xextend_fw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2, int *line1, int *line2)
{
    int col, /* column number */
        row, /* row number */
        max_d, /* bound on the length of the edit script */
        d, /* current compressed distance */
        k, /* current diagonal */
        ORIGIN,
        lower,
        upper;
    int *last_d, *temp_d; /* column containing the last p */
    int *max_row, *max_diag; /* min (b)/ max (f) row (and diagonal) */
                             /* reached for cost d=0, ... m.
                              */
    coords ***trace_GT, ***trace_CT;
    coords *GT_cell, *CT_cell, *new;
    ValNodePtr data_list = NULL, prev = NULL;

    max_d = m+1;

    /* One row of m+n+1 cell pointers per value of d in 0..max_d. */
    trace_GT = (coords ***)ckalloc((max_d+1)*sizeof(coords **));
    trace_CT = (coords ***)ckalloc((max_d+1)*sizeof(coords **));
    for (d=0; d<=max_d; d++) {
        trace_GT[d] = (coords **)ckalloc((m+n+1)*sizeof(coords *));
        trace_CT[d] = (coords **)ckalloc((m+n+1)*sizeof(coords *));
    }

    /* The starting diagonal is stored at index ORIGIN; it is seeded with the
     * current contents of last_GT / last_CT. */
    ORIGIN = m;
    trace_GT[0][ORIGIN] = &last_GT;
    trace_CT[0][ORIGIN] = &last_CT;

    /* NOTE(review): text is missing from this copy of the file inside the
     * next statement (after the second col).  The lost part presumably held
     * the rest of the initial slide and the start of the GT / CT scan -
     * restore it from the original Xtend1.c. */
    for (row=0, col=0; colpos2 = k+offset1;
            new->pos1 = k+offset2;
            trace_GT[0][ORIGIN] = new;
        } else if (!strncmp((char *)(s2+k),"CT",2)) {
            new = (coords *)ckalloc(sizeof(coords));
            link_to_data_list((Pointer)new, &data_list, &prev);
            new->pos2 = k+offset1;
            new->pos1 = k+offset2;
            trace_CT[0][ORIGIN] = new;
        }

    /* The run reached an edge: report it with distance 0. */
    if ((row == m) || (col == n)){
        *line1 = row+offset1;
        *line2 = col+offset2;
        /* Copy out before ValNodeFreeData releases the recorded coords. */
        (void)memcpy(&last_GT,trace_GT[0][ORIGIN],sizeof(coords));
        (void)memcpy(&last_CT,trace_CT[0][ORIGIN],sizeof(coords));
        ValNodeFreeData(data_list);
        free_coords(trace_GT,max_d+1);
        free_coords(trace_CT,max_d+1);
        return 0;
    }

    last_d = (int *)ckalloc((m+n+1)*sizeof(int));
    temp_d = (int *)ckalloc((m+n+1)*sizeof(int));

    for (k=0; k<=m+n; ++k) last_d[k]=-1;
    last_d[ORIGIN] = row;

    lower = ORIGIN - 1;
    upper = ORIGIN + 1;

    /* NOTE(review): max_row and max_diag hold m+1 entries (indices 0..m),
     * while the loop below admits d up to max_d = m+1 and writes max_row[d];
     * the back-off loop after it also reads max_row[d] for the value of d
     * left by the failed loop test.  Unless good_ratio always stops the loop
     * early enough, these index past the end - confirm. */
    max_row = (int *)ckalloc((m+1)*sizeof(int));
    max_diag = (int *)ckalloc((m+1)*sizeof(int));

    for (d=1; d<=m; d++)
        max_row[d] = -1;

    max_row[0] = last_d[ORIGIN];
    max_diag[0] = ORIGIN;

    d = 0;
    /* Keep going while the progress recorded for d-1 or d-2 is still
     * accepted by good_ratio (defined elsewhere). */
    while ((++d<=max_d) &&
           ((d-1<=good_ratio(max_row[d-1])) ||
            ((d>=2) && (d-2<=good_ratio(max_row[d-2]))))) {

        /* for each relevant diagonal ...
         */
        for (k = lower; k <= upper; k++) {

            /* find a d on diagonal k */
            if (k==-d+ORIGIN) {
                /* move down from the last d-1 on diagonal k+1 */
                row = last_d[k+1]+1;
                /* op = DELETE; */
                GT_cell = trace_GT[d-1][k+1];
                CT_cell = trace_CT[d-1][k+1];
            } else if (k==d+ORIGIN) {
                /* move right from the last d-1 on diagonal k-1 */
                row = last_d[k-1];
                /* op = INSERT; */
                GT_cell = trace_GT[d-1][k-1];
                CT_cell = trace_CT[d-1][k-1];
            } else if ((last_d[k]>=last_d[k+1]) &&
                       (last_d[k]+1>=last_d[k-1])) {
                /* substitution */
                row = last_d[k]+1;
                /* op = SUBSTITUTE; */
                GT_cell = trace_GT[d-1][k];
                CT_cell = trace_CT[d-1][k];
            } else if ((last_d[k+1]+1>=last_d[k-1]) &&
                       (last_d[k+1]>=last_d[k])) {
                /* move down from the last d-1 on diagonal k+1 */
                row = last_d[k+1]+1;
                /* op = DELETE; */
                GT_cell = trace_GT[d-1][k+1];
                CT_cell = trace_CT[d-1][k+1];
            } else {
                /* move right from the last d-1 on diagonal k-1 */
                row = last_d[k-1];
                /* op = INSERT; */
                GT_cell = trace_GT[d-1][k-1];
                CT_cell = trace_CT[d-1][k-1];
            }

            /* code common to the three cases */
            /* slide down the diagonal */
            col = row+k-ORIGIN;
            /* Inherit the records of the predecessor cell; they are replaced
             * below whenever a newer hit is met on this diagonal. */
            trace_GT[d][k] = GT_cell;
            trace_CT[d][k] = CT_cell;

            /* NOTE(review): each of the four tests that start with
             * "if ((col" below has lost text in this copy of the file
             * (presumably the bounds check, the strncmp call and the coords
             * allocation) - restore them from the original Xtend1.c. */
            if (row>=0)
            while ((row < m) && (col < n) && (s1[row]==s2[col])) {
                if ((colpos1 = row + k - ORIGIN + offset2;
                    new->pos2 = row + offset1;
                    trace_GT[d][k] = new;
                } else if ((colpos1 = row + k - ORIGIN + offset2;
                    new->pos2 = row + offset1;
                    trace_CT[d][k] = new;
                }
                row++;
                col++;
            }

            if ((colpos1 = row + k - ORIGIN + offset2;
                new->pos2 = row + offset1;
                trace_GT[d][k] = new;
            } else if ((colpos1 = row + k - ORIGIN + offset2;
                new->pos2 = row + offset1;
                trace_CT[d][k] = new;
            }

            temp_d[k] = row;

            if ((row == m) && (col == n)) {
                /* hit southeast corner; have the answer */
                (void)memcpy(&last_GT,trace_GT[d][k],sizeof(coords));
                (void)memcpy(&last_CT,trace_CT[d][k],sizeof(coords));
                ValNodeFreeData(data_list);
                free_coords(trace_GT,max_d+1);
                free_coords(trace_CT,max_d+1);
                free(last_d);
                free(temp_d);
                free(max_row);
                free(max_diag);
                *line1 = row+offset1;
                *line2 = col+offset2;
                return d;
            }
            if
            (row == m) {
                /* hit last row; don't look further */
                (void)memcpy(&last_GT,trace_GT[d][k],sizeof(coords));
                (void)memcpy(&last_CT,trace_CT[d][k],sizeof(coords));
                ValNodeFreeData(data_list);
                free_coords(trace_GT,max_d+1);
                free_coords(trace_CT,max_d+1);
                free(temp_d);
                free(last_d);
                free(max_row);
                free(max_diag);
                *line1 = row+offset1;
                *line2 = col+offset2;
                return d;
            }
            if (col == n) {
                /* hit last column; don't look further */
                (void)memcpy(&last_GT,trace_GT[d][k],sizeof(coords));
                (void)memcpy(&last_CT,trace_CT[d][k],sizeof(coords));
                ValNodeFreeData(data_list);
                free_coords(trace_GT,max_d+1);
                free_coords(trace_CT,max_d+1);
                free(temp_d);
                free(last_d);
                free(max_row);
                free(max_diag);
                *line1 = row+offset1;
                *line2 = col+offset2;
                return d;
            }
        }

        /* Remember the furthest row reached at this d and its diagonal. */
        max_row[d] = last_d[ORIGIN];
        max_diag[d] = ORIGIN;
        for (k=lower; k<=upper; ++k)
            if (temp_d[k]>max_row[d]) {
                max_row[d] = temp_d[k];
                max_diag[d] = k;
            }

        for (k=lower; k<=upper; k++) {
            last_d[k] = temp_d[k];
        }

        /* Widen the band by one diagonal on each side for the next d. */
        --lower;
        ++upper;
    }

    /* report here the previous maximal match, stored in max_diag and max_row */
    /* Step d back while the step from d-1 to d gained fewer than 3 rows. */
    while ((d>0) && (max_row[d]-max_row[d-1]<3))
        d--;

    *line1 = max_row[d]+offset1;
    *line2 = max_row[d]+max_diag[d]-ORIGIN+offset2;
    (void)memcpy(&last_GT,trace_GT[d][max_diag[d]],sizeof(coords));
    (void)memcpy(&last_CT,trace_CT[d][max_diag[d]],sizeof(coords));

    free(max_row);
    free(max_diag);
    free(last_d);
    free(temp_d);
    ValNodeFreeData(data_list);
    free_coords(trace_GT,max_d+1);
    free_coords(trace_CT,max_d+1);

    return d;
}

/* NOTE(review): the body of free_coords is cut off in this copy of the file
 * right after the loop header begins; the rest of the function and the
 * beginning of the next source file (align.c, according to the rcsid string
 * below) are missing, and the bare #include directives carry no header name.
 * Restore this region from the original sources. */
static void free_coords(coords ***val, int size)
{
    int i;
    for(i=0; i #include
#include "psublast.h"
#include "sim4.h"
#include "sim4b1.h"
#include "Xtend1.h"
#include "align.h"

#ifndef __lint
/*@unused@*/
static const char rcsid[] = "$Id: align.c,v 1.20 2000/08/04 18:55:24 florea Exp $";
#endif

static int snake(int k, int x, int endx, int endy);
static int rsnake(int k, int x, int startx, int starty, int M);

/*
 * align_path - build the edit script for the path from (i1,j1) to (i2,j2).
 *
 * i1, j1   start point (row, column)
 * i2, j2   end point (row, column)
 * dist     cost available for the path; values above 1 are split into
 *          midc = dist/2 and rmidc = dist - midc and solved recursively
 * head     out: first node of the resulting edit_script list, or NULL
 * tail     out: last node of the resulting list, or NULL
 *
 * Each node carries an op_type (INSERT, DELETE or SUBSTITUTE) and a count
 * num.  Nodes are allocated with ckalloc and are not freed here.
 *
 * Diagnostics are written to stderr when dist <= 1 does not fit the end
 * points, or when no meeting point of the forward and backward searches is
 * found.
 */
void align_path(int i1, int j1, int i2, int j2, int dist, edit_script **head, edit_script **tail)
{
    int *last_d, *temp_d, /*
                           forward vectors */
        *rlast_d, *rtemp_d; /* backward vectors */

    edit_script *head1, *tail1, *head2, *tail2;
    int midc, rmidc;
    int start, lower, upper;
    int rstart, rlower, rupper;
    int c, k, row;
    int mi, mj, tmp, ll, uu;
    char flag;

    *head = *tail = NULL;

    /* Boundary cases */
    if (i1 == i2) {
        /* No rows to cover: nothing at all, or one INSERT of j2-j1. */
        if (j1 == j2) *head = NULL;
        else {
            head1 = (edit_script *) ckalloc(sizeof(edit_script));
            head1->op_type = INSERT;
            head1->num = j2-j1;
            head1->next = NULL;
            *head = *tail = head1;
        }
        return;
    }

    if (j1 == j2) {
        /* No columns to cover: one DELETE of i2-i1. */
        head1 = (edit_script *) ckalloc(sizeof(edit_script));
        head1->op_type = DELETE;
        head1->num = i2-i1;
        head1->next = NULL;
        *head = *tail = head1;
        return;
    }

    if (dist <= 1) {
        start = j1-i1;
        if (j2-i2 == j1-i1) {
            /* Both end points on one diagonal: a single SUBSTITUTE run. */
            head1 = (edit_script *) ckalloc(sizeof(edit_script));
            head1->op_type = SUBSTITUTE;
            head1->num = i2-i1;
            head1->next = NULL;
            *head = *tail = head1;
        } else if (j2-j1 == i2-i1+1) {
            /* One extra column: SUBSTITUTE up to the row returned by snake
             * (if it moved), INSERT 1, then SUBSTITUTE the remaining rows. */
            tmp = snake(start,i1,i2,j2);
            if (tmp>i1) {
                head1 = (edit_script *) ckalloc(sizeof(edit_script));
                head1->op_type = SUBSTITUTE;
                head1->num = tmp-i1;
                *head = head1;
            }
            head2 = (edit_script *) ckalloc(sizeof(edit_script));
            head2->op_type = INSERT;
            head2->num = 1;

            if (*head) head1->next = head2;
            else *head = head2;
            *tail = head2;
            head2->next = NULL;

            if (i2-tmp) {
                head1 = head2;
                *tail = head2 = (edit_script *)ckalloc(sizeof(edit_script));
                head2->op_type = SUBSTITUTE;
                head2->num = i2-tmp;
                head2->next = NULL;
                head1->next = head2;
            }
        } else if (j2-j1+1 == i2-i1) {
            /* One extra row: SUBSTITUTE up to the row returned by snake
             * (if it moved), DELETE 1, then SUBSTITUTE the remaining rows. */
            tmp = snake(start,i1,i2,j2);
            if (tmp>i1) {
                head1 = (edit_script *) ckalloc(sizeof(edit_script));
                head1->op_type = SUBSTITUTE;
                head1->num = tmp-i1;
                *head = head1;
            }
            head2 = (edit_script *) ckalloc(sizeof(edit_script));
            head2->op_type = DELETE;
            head2->num = 1;

            if (*head) head1->next = head2;
            else *head = head2;
            *tail = head2;
            head2->next = NULL;

            if (i2>tmp+1) {
                head1 = head2;
                *tail = head2 = (edit_script *)ckalloc(sizeof(edit_script));
                head2->op_type = SUBSTITUTE;
                head2->num = i2-tmp-1;
                head2->next = NULL;
                head1->next = head2;
            }
        } else {
            /* *head and *tail stay NULL in this case. */
            (void)fprintf(stderr,
                          "align.c: warning: something wrong when aligning.");
        }
        return;
    }

    /* Divide the problem at the middle cost */
    midc = dist/2;
    rmidc = dist - midc;

    /* Compute the boundary diagonals */
    start = j1 - i1;
    lower = max(j1-i2, start-midc);
    upper = min(j2-i1, start+midc);
    rstart = j2-i2;
    rlower = max(j1-i2, rstart-rmidc);
    rupper = min(j2-i1, rstart+rmidc);

    /* Allocate space for forward vectors */
    /* The returned pointer is shifted by -lower so the vectors can be
     * indexed directly with diagonal numbers lower..upper; it is shifted
     * back by +lower before free below. */
    last_d = (int *)ckalloc((upper-lower+1)*sizeof(int)) - lower;
    temp_d = (int *)ckalloc((upper-lower+1)*sizeof(int)) - lower;

    for (k=lower; k<=upper; k++) last_d[k] = -1;
    last_d[start] = snake(start,i1,i2,j2);

    /* Forward computation */
    for (c=1; c<=midc; ++c) {
        ll = max(lower,start-c);
        uu = min(upper,start+c);
        for (k=ll; k<=uu; ++k) {
            if (k == ll) {
                /* DELETE : down from (k+1,c-1) */
                row = last_d[k+1]+1;
            } else if (k == uu) {
                /* INSERT : right from (k-1,c-1) */
                row = last_d[k-1];
            } else if ((last_d[k]>=last_d[k+1]) &&
                       (last_d[k]+1>=last_d[k-1])) {
                /* SUBSTITUTE */
                row = last_d[k]+1;
            } else if ((last_d[k+1]+1>=last_d[k-1]) &&
                       (last_d[k+1]>=last_d[k])) {
                /* DELETE */
                row = last_d[k+1]+1;
            } else {
                /* INSERT */
                row = last_d[k-1];
            }

            temp_d[k] = snake(k,row,i2,j2);
        }
        for (k=ll; k<=uu; ++k)
            last_d[k] = temp_d[k];
    }

    /* Allocate space for backward vectors */
    /* Same index shift as for the forward vectors, here by -rlower. */
    rlast_d = (int *)ckalloc((rupper-rlower+1)*sizeof(int)) - rlower;
    rtemp_d = (int *)ckalloc((rupper-rlower+1)*sizeof(int)) - rlower;

    for (k=rlower; k<=rupper; k++) rlast_d[k] = i2+1;
    /* M is not declared in this function; presumably a variable or macro
     * defined elsewhere - confirm. */
    rlast_d[rstart] = rsnake(rstart,i2,i1,j1,M);

    /* Backward computation */
    for (c=1; c<=rmidc; ++c) {
        ll = max(rlower,rstart-c);
        uu = min(rupper,rstart+c);
        for (k=ll; k<=uu; ++k) {
            if (k == ll) {
                /* INSERT : left from (k+1,c-1) */
                row = rlast_d[k+1];
            } else if (k == uu) {
                /* DELETE : up from (k-1,c-1) */
                row = rlast_d[k-1]-1;
            } else if ((rlast_d[k]-1<=rlast_d[k+1]) &&
                       (rlast_d[k]-1<=rlast_d[k-1]-1)) {
                /* SUBSTITUTE */
                row = rlast_d[k]-1;
            } else if ((rlast_d[k-1]-1<=rlast_d[k+1]) &&
                       (rlast_d[k-1]-1<=rlast_d[k]-1)) {
                /* DELETE */
                row = rlast_d[k-1]-1;
            } else {
                /*
                 INSERT */
                row = rlast_d[k+1];
            }

            rtemp_d[k] = rsnake(k,row,i1,j1,M);
        }
        for (k=ll; k<=uu; ++k)
            rlast_d[k] = rtemp_d[k];
    }

    /* Find (mi, mj) such that the distance from (i1, j1) to (mi, mj) is
       midc and the distance from (mi, mj) to (i2, j2) is rmidc.
     */

    flag = FALSE;
    mi = i1; mj = j1;
    ll = max(lower,rlower);
    uu = min(upper,rupper);
    /* Take the first diagonal on which the forward search has reached or
     * passed the backward search. */
    for (k=ll; k<=uu; ++k) {
        if (last_d[k]>=rlast_d[k]) {
            if (last_d[k]-i1>=i2-rlast_d[k]) {
                mi = last_d[k]; mj = k+mi;
            } else {
                mi = rlast_d[k]; mj = k+mi;
            }
            flag = TRUE;
            break;
        }
    }
    /* Undo the index shift before releasing the vectors. */
    free(last_d+lower);
    free(rlast_d+rlower);
    free(temp_d+lower);
    free(rtemp_d+rlower);

    if (flag) {
        /* Find a path from (i1,j1) to (mi,mj) */
        align_path(i1,j1,mi,mj,midc,&head1,&tail1);

        /* Find a path from (mi,mj) to (i2,j2) */
        align_path(mi,mj,i2,j2,rmidc,&head2,&tail2);

        /* Join these two paths together */
        if (head1) tail1->next = head2;
        else head1 = head2;
    } else {
        (void)fprintf(stderr,
                      "align.c: warning: something wrong when dividing\n");
        head1 = NULL;
    }
    /* NOTE(review): when flag is FALSE, head2, tail1 and tail2 have never
     * been assigned on this path, yet head2 is tested below and tail1 or
     * tail2 is stored into *tail.  They should be set to NULL in that
     * branch (or at declaration). */
    *head = head1;
    if (head2) *tail = tail2;
    else *tail = tail1;
}

int align_get_dist(int i1, int j1, int i2, int j2, int limit)
{
    int *last_d, *temp_d;
    int goal_diag, ll, uu;
    int c, k, row;
    int start, lower, upper;

    /* Compute the boundary diagonals */
    start = j1 - i1;
    lower = max(j1-i2, start-limit);
    upper = min(j2-i1, start+limit);
    goal_diag = j2-i2;

    if (goal_diag > upper || goal_diag < lower) {
        (void)fprintf(stderr, "The two sequences are not really similar.\n");
        (void)fprintf(stderr, "Please try an exact aligning method.\n");
        exit(1);
    }

    /* Allocate space for forward vectors */
    last_d = (int *)ckalloc((upper-lower+1)*sizeof(int)) - lower;
    temp_d = (int *)ckalloc((upper-lower+1)*sizeof(int)) - lower;

    /* Initialization */
    for (k=lower; k<=upper; ++k) last_d[k] = MININT;
    last_d[start] = snake(start, i1, i2, j2);

    if (last_d[goal_diag] >= i2) {
        /* Free working vectors */
        free(last_d+lower);
        free(temp_d+lower);
        return 0;
    }

    for (c=1; c<=limit; ++c) {
        ll = max(lower,start-c);
        uu = min(upper, start+c);
        for (k=ll; k<=uu; ++k) {
if (k == ll) row = last_d[k+1]+1; /* DELETE */ else if (k == uu) row = last_d[k-1]; /* INSERT */ else if ((last_d[k]>=last_d[k+1]) && (last_d[k]+1>=last_d[k-1])) row = last_d[k]+1; /*SUBSTITUTE */ else if ((last_d[k+1]+1>=last_d[k-1]) && (last_d[k+1]>=last_d[k])) row = last_d[k+1]+1; /* DELETE */ else row = last_d[k-1]; /* INSERT */ temp_d[k] = snake(k,row,i2,j2); } for (k=ll; k<=uu; ++k) last_d[k] = temp_d[k]; if (last_d[goal_diag] >= i2) { #ifdef STATS (void)fprintf(stderr, "get_dist = %d\n",c); #endif /* Free working vectors */ free(last_d+lower); free(temp_d+lower); return c; } } /* Ran out of distance limit */ return -1; } /* Condense_script - merge contiguous operations of the same type together */ void Condense_script(edit_script *head) { edit_script *tp, *tp1; tp = head; while (tp != NULL) { while (((tp1 = tp->next) != NULL) && (tp->op_type == tp1->op_type)) { tp->num = tp->num + tp1->num; tp->next = tp1->next; free(tp1); } tp = tp->next; } } /* Condense_both_Ends -- merge contiguous operations of the same type */ /* together; return both new ends of the chain. 
*/ void Condense_both_Ends (edit_script **head, edit_script **tail, edit_script **prev) { edit_script *tp, *tp1; tp = *head; *prev = NULL; while (tp != NULL) { while (((tp1 = tp->next) != NULL) && (tp->op_type == tp1->op_type)) { tp->num = tp->num + tp1->num; tp->next = tp1->next; free(tp1); } if (tp->next) *prev = tp; else *tail = tp; tp = tp->next; } } /* Flip_script - reverse the script list */ void Flip_script(struct edit_script **script) { struct edit_script *ep, *ahead, *behind; ahead = *script; ep = NULL; while (ahead!=NULL) { behind = ep; ep = ahead; ahead = ahead->next; ep->next = behind; } *script = ep; } /* reverse the edit script */ #ifdef AUXUTILS void Reverse_script(edit_script *head) { edit_script *tp; tp = head; while (tp != NULL) { if (tp->op_type == INSERT) tp->op_type = DELETE; else if (tp->op_type == DELETE) tp->op_type = INSERT; tp = tp->next; } } #endif #ifdef AUXUTILS void Print_script(edit_script *head, int M, int N) { edit_script *tp; int i, j, k, count; i = j = 0; tp = head; while (tp != NULL) { if (tp->op_type == SUBSTITUTE) { k = 0; while (k < tp->num) { count = 0; while ((seq1[i] == seq2[j]) && (knum)) { ++i; ++j; ++count; ++k; } if (count > 0) (void)printf("copy %d\n", count); if (k < tp->num) { (void)printf("replace %c by %c\n", seq1[i], seq2[j]); ++i; ++j; ++k; } } /* if (tp->num > 1) (void)printf("%d substitutions\n", tp->num); else (void)printf("%d substitution\n", tp->num); */ } else if (tp->op_type == INSERT) { if ((tp==head || tp->next==NULL) && (M <= N)) (void)printf("skip (second sequence) %d\n", tp->num); else { /* (void)printf("insert %d\n", tp->num); */ (void)printf("insert "); if (tp->num<=10) for (k=j; knum; ++k) (void)printf("%c", seq2[k]); else (void)printf(" %d ",tp->num); (void)printf("\n"); } j += tp->num; } else { /* DEL */ if ((tp==head || tp->next==NULL) && (M > N)) (void)printf("skip (first sequence) %d\n", tp->num); else { /* (void)printf("delete %d\n", tp->num); */ (void)printf("delete "); if (tp->num <= 10) 
for (k=i; knum; ++k) (void)printf("%c", seq1[k]); else (void)printf("%d ",tp->num); (void)printf("\n"); } i += tp->num; } tp = tp->next; } } #endif void S2A(edit_script *head, int *S, int flag) { edit_script *tp; int *lastS, i; tp = head; lastS = S; while (tp != NULL) { /* (void)printf("tp->op_type=%d, tp->num=%d\n",tp->op_type, tp->num); */ if (tp->op_type == SUBSTITUTE) { for (i=0; inum; ++i) *lastS++ = 0; } else if (tp->op_type == INSERT) { *lastS++ = (!flag) ? tp->num : (0-tp->num); } else { /* DELETE */ *lastS++ = (!flag) ? (0 - tp->num) : tp->num; } tp = tp->next; } *(S-1) = lastS - S; } void align_reverse(int *S) { int auxi, *begi, *endi; begi = S; endi = S + *(S-1); while (begi < endi) { auxi = *begi; *begi = *--endi; *endi = auxi; begi++; } return; } /* Alignment display routine */ static uchar ALINE[51], BLINE[51], CLINE[51]; void IDISPLAY(uchar A[], uchar B[], int M, int N, int S[], int AP, int BP, int est_strand, Exon *exons) { Exon *t0; register uchar *a, *b, *c, sign; register int i, j, op, index; int lines, ap, bp, starti; if ((exons==NULL) || (!exons->to1 && (exons->next_exon==NULL))) fatal("align.c: Exon list cannot be empty."); /* find the starting exon for this alignment */ t0 = exons; while (t0 && (((est_strand==2) && ((t0->from1!=AP) || (t0->from2!=BP))) || ((est_strand==1) && ((t0->from1!=BP) || (t0->from2!=AP))))) t0 = t0->next_exon; if (!t0) fatal("align.c: Alignment fragment not found."); i = j = op = lines = index = 0; sign = '*'; ap = AP; bp = BP; a = ALINE; b = BLINE; c = CLINE; starti = (t0->next_exon && t0->next_exon->to1) ? (t0->to1+1):-1; while (i < M || j < N) { if (op == 0 && *S == 0) { op = *S++; *a = A[++i]; *b = B[++j]; *c++ = (*a++ == *b++) ? 
'|' : ' '; } else { if (op == 0) { op = *S++; } if (op > 0) { if (est_strand==2) { *a++ = ' '; *b++ = B[++j]; *c++ = '-'; op--; } else { if (j+BP==starti) { /* detected intron */ switch (t0->ori) { case 'C': sign = '<'; break; case 'G': sign = '>'; break; case 'N': sign = '='; break; default: fatal("align.c: Unrecognized intron type."); } t0 = t0->next_exon; starti=(t0->next_exon && t0->next_exon->to1)?(t0->to1+1):-1; index = 1; *c++ = sign; *a++ = ' '; *b++ = B[++j]; op--; } else if (!index) { *c++ = '-'; *a++ = ' '; *b++ = B[++j]; op--; } else { /* not the first deletion in the intron */ switch (index) { case 0: case 1: case 2: *a++ = ' '; *b++ = B[++j]; *c++ = sign; op--; index++; break; case 3: case 4: *a++ = ' '; *b++ = '.'; *c++ = '.'; j++; op--; index++; break; case 5: *a++ = ' '; *b++ = '.'; *c++ = '.'; j+= op-3; op = 3; index++; break; case 6: case 7: *a++ = ' '; *b++ = B[++j]; *c++ = sign; op--; index++; break; case 8: *a++ = ' '; *b++ = B[++j]; *c++ = sign; op--; index = 0; break; } } } } else { if (est_strand==1) { *a++ = A[++i]; *b++ = ' '; *c++ = '-'; op++; } else { if (i+AP==starti) { /* detected intron */ switch (t0->ori) { case 'C': sign = '<'; break; case 'G': sign = '>'; break; case 'N': sign = '='; break; default: fatal("align.c: Unrecognized intron type."); } t0 = t0->next_exon; starti=(t0->next_exon && t0->next_exon->to1)?(t0->to1+1):-1; index = 1; *c++ = sign; *a++ = A[++i]; *b++ = ' '; op++; } else if (!index) { *c++ = '-'; *a++ = A[++i]; *b++ = ' '; op++; } else { /* not the first deletion in the intron */ switch (index) { case 0: case 1: case 2: *a++ = A[++i]; *b++ = ' '; *c++ = sign; op++; index++; break; case 3: case 4: *a++ = '.'; *b++ = ' '; *c++ = '.'; i++; op++; index++; break; case 5: *a++ = '.'; *b++ = ' '; *c++ = '.'; i+=(-op)-3; op=-3; index++; break; case 6: case 7: *a++ = A[++i]; *b++ = ' '; *c++ = sign; op++; index++; break; case 8: *a++ = A[++i]; *b++ = ' '; *c++ = sign; op++; index = 0; break; } } } } } if ((a >= ALINE+50) 
|| ((i >= M) && (j >= N))) { *a = *b = *c = '\0'; (void)printf("\n%7d ",50*lines++); for (b = ALINE+10; b <= a; b += 10) (void)printf(" . :"); if (b <= a+5) (void)printf(" ."); (void)printf("\n%7d %s\n %s\n%7d %s\n",ap,ALINE,CLINE,bp,BLINE); ap = AP + i; bp = BP + j; a = ALINE; b = BLINE; c = CLINE; } } } void Free_script(edit_script *head) { edit_script *tp, *tp1; tp = head; while (tp != NULL) { tp1 = tp->next; free(tp); tp = tp1; } } static int snake(int k, int x, int endx, int endy) { int y; if (x<0) return x; y = x+k; while (xM) return x; if ((startx<0) || (starty<0)) (void)printf("TROUBLE!!! startx: %5d, starty: %5d\n",startx, starty); if ((x>M) || (x+k>N)) (void)printf("TROUBLE!!! x: %5d, y: %5d\n",x,x+k); y = x+k; while (x>startx && y>starty && seq1[x-1]==seq2[y-1]) { --x; --y; } return x; } sim4.2012-10-10/align.h0000444000515200116500000000135107733353757014100 0ustar floreasalzberg#ifndef SCRIPTLIB_H #define SCRIPTLIB_H /* $Id: align.h,v 1.9 2000/06/06 15:03:02 florea Exp $ */ extern void align_path(int,int,int,int,int,edit_script**,edit_script**); extern int align_get_dist(int, int, int, int, int); extern void Condense_script(edit_script *); extern void Condense_both_Ends(edit_script **, edit_script **, edit_script **); extern void S2A(edit_script *, int *, int); extern void align_reverse(int *); extern void IDISPLAY(uchar *, uchar *, int, int, int *, int, int,int, Exon *); extern void Free_script(edit_script *); extern void Flip_script(struct edit_script **); #ifdef AUXUTILS extern void Reverse_script(edit_script *); extern void Print_script(edit_script *head, int M, int N); #endif #endif /* SCRIPTLIB_H */ sim4.2012-10-10/sim4.h0000444000515200116500000000673707733353757013677 0ustar floreasalzberg#ifndef SIM4_H #define SIM4_H /* $Id: sim4.h,v 1.21 2002/01/24 00:56:23 schwartz Exp $ */ #define DEFAULT_NUM_I 3 #define DIST_CUTOFF 3 #define DEFAULT_MIN_COV (.8) #define MININT (-99999) #define MIN_INTRON 30 #define MAX_INTRON 20000 #define MAX_GRINIT 500 
#define MAX_INTERNAL_GAP 50 /* 50 */ #define LL 60 #define DEFAULT_DRANGE 10 #define DEFAULT_WEIGHT 100 #define DEFAULT_RELINK_H 500 #define DEFAULT_W 12 #define DEFAULT_X 12 #define DEFAULT_K 16 #define DEFAULT_C 12 #define LINK 0 #define RELINK 1 #define P (.2) #define min(x,y) ((x>y) ? (y):(x)) #define max(x,y) ((x= abs(C_score)) ? "GT" : "CT") #define END_SIG ((G_score >= abs(C_score)) ? "AG" : "AC") #define MATCH 1 #define MISMATCH (-5) #define L 8 #define DELETE 1 #define INSERT 2 #define SUBSTITUTE 3 #define INTRON 4 #define O_INTRON 5 #undef TRUE #undef FALSE enum { FALSE = 0, TRUE = 1}; enum { INIT = 0, PERM = 1, TEMP = 2}; enum { EST_GEN = 1, GEN_EST = 2 }; enum { FWD = 0, BWD = 1, BOTH = 2 }; enum { OK = 0, FREE_START = 1, FREE_END = 2, FREE_BOTH_ENDS = 3}; /* data structures */ /* used in select_path() */ typedef struct msp { int len, pos1, pos2; int score; int Score; int prev; struct msp *next_msp; } *Msp_ptr; typedef struct exon { int from1, from2, to1, to2; int min_diag, max_diag; int match; char ori; int length; int flag; int ematches; int nmatches; int edist; int alen; Msp_ptr msps; struct exon *next_exon; } *Exon_ptr; typedef struct intron { int from1, from2, to1, to2; int length; char orientation; struct intron *next_intron; } *Intron_ptr, Intron; typedef struct exon Exon; typedef struct coordinates { int pos1; int pos2; } coords; /* used only in the alignment stage */ typedef struct edit_script { char op_type; /* SUB, INS, or DEL */ int num; /* Number of operations */ struct edit_script *next; } edit_script; typedef struct edit_script_list { int offset1, offset2; int len1, len2; int score; struct edit_script *script; struct edit_script_list *next_script; } edit_script_list; struct edit { struct edit *link; /* previous edit instruction */ char type[2]; int accumulation; char op; /* INSERT, DELETE or INTRON */ int line1; /* line number in file1 */ int line2; /* line number in file2 */ }; typedef void *Pointer; typedef struct ValNode { Pointer 
data; struct ValNode *next; } *ValNodePtr; typedef int signal_t[5][5]; typedef struct spliced { int xs, xe, ys, ye, score; char type; struct spliced *next; } splice_t; typedef struct sim4_stats { int internal, icoverage, mult, nmatches; double fcoverage, marginals; } sim4_stats_t; typedef struct sim4_args { int ali_flag, poly_flag, acc_flag, reverse, DRANGE, weight, cutoff; int set_K, set_C, set_H; int W, K, C, X, B, CDS_from, CDS_to; char *S; } sim4_args_t; #endif sim4.2012-10-10/sim4b1.c0000444000515200116500000032022007733353757014077 0ustar floreasalzberg/* * sim4.c - A program to align a cDNA sequence with a genomic sequence * for the gene. */ #ifndef __lint /*@unused@*/ static const char rcsid[] = "$Id: sim4b1.c,v 1.64 2002/03/03 23:29:48 florea Exp $"; #endif #include #include #include #include #include #include "psublast.h" #include "sim4.h" #include "sim4b1.h" #include "Xtend1.h" #include "align.h" #include "splice.h" #include "poly.h" #define EXTEND_FW (rs.acc_flag?Xextend_fw:extend_fw) #define EXTEND_BW (rs.acc_flag?Xextend_bw:extend_bw) #define SLIDE_INTRON(x) (((x)==TRUE)?sync_slide_intron:slide_intron) uchar *seq1, *seq2; int M, N, encoding[NACHARS]; coords last_GT, last_CT, last_AG, last_AC; int file_type; sim4_args_t rs; static int numMSPs, K, W, X; static int G_score, C_score; static int *diag_lev; static Msp_ptr msp_list, *msp; static Exon_ptr exon_list; static void merge(Exon **,Exon **); static bool get_sync_flag(Exon *, Exon *, int); static void slide_intron(int w, Exon **,uchar *,uchar *); static void sync_slide_intron(int w, Exon **,uchar *,uchar *); static void wobble(Exon **,Exon **,const char *,const char *,uchar *seq1); static Exon *bmatch(uchar *,uchar *,int,int,int,int); static Exon *fmatch(uchar *,uchar *,int,int,int,int); static void compact_list(Exon **Lblock, Exon **Rblock); static int resolve_overlap(Exon *,Exon *,uchar *); static int greedy(uchar *,uchar *,int,int,int,int,Exon **, Exon **); static int extend_bw(uchar *,uchar 
*,int,int,int,int,int *,int *); static int extend_fw(uchar *,uchar *,int,int,int,int,int *,int *); static void pluri_align(int *,int *,Exon *,struct edit_script_list **); static void get_stats(Exon *,sim4_stats_t *); static int get_edist(int,int,int,int,uchar *,uchar *); static int get_msp_threshold(int len1, int len2); static int find_log_entry(long *log4s, int n, int len, int offset); static Exon *new_exon(int,int,int,int,int,int,int,Exon *); static void add_word(int,int); static void extend_hit(int,int,const uchar *const,const uchar * const,int,int,int); static void sort_msps(void); static void heapify(int,int); static int smaller(int,int); static void search(uchar *,uchar *,int,int,int); static int link_msps(Msp_ptr *msp,int,int,int); static int scale(int n); static void msp2exons(Msp_ptr *,int,uchar *,uchar *); static void free_msps(Msp_ptr **,int *); static void exon_cores(uchar*,uchar*,int,int,int,int,int,int,int,int); static void relink(Msp_ptr *,int,int,int,int,int,uchar *,uchar *); static int dispatch_find_ends(int,int,int *,int *,edit_script_list *,int,int,int); static int find_ends(edit_script_list *,int); static bool get_match_quality(Exon *,Exon *,sim4_stats_t *,int); static int check_consistency_intron_ori(Exon *,int,char *); Exon *find_previous(Exon *,Exon *); void script_flip_list(edit_script_list **); #ifdef DEBUG static void debug_print_exons(Exon *, const char *); #endif /* Not currently used: */ #ifdef AUXUTILS static void remove_polyA_tails(Exon *,uchar *,uchar *,int); static void find_introns(Exon *, Intron **); static void print_introns(Intron *); #endif /* seq1 = genomic DNA (text); seq2 = cDNA */ struct edit_script_list *SIM4(uchar *in_seq1, uchar *in_seq2, int in_M, int in_N, int in_W, int in_X, int in_K, int in_C, int in_H, int *dist_ptr, int *pT, int *pA, Exon **Exons, sim4_stats_t *st) { int cflag, diff, cost, rollbflag, sync_flag; int u, v, I, J; bool good_match; Exon *Lblock, *Rblock=NULL, *tmp_block, *last, *prev, *tmp_block1, 
*tmp_Lblock=NULL, *tmp_Rblock=NULL, *new; struct edit_script_list *Script_head=NULL; uchar tmp[50]; coords *sig; seq1 = in_seq1; seq2 = in_seq2; M = in_M; N = in_N; W = in_W; X = in_X; if (M<=0 || in_N<=0) { *Exons = NULL; return NULL; } if (rs.acc_flag) { last_AG.pos1 = last_AG.pos2 = last_AC.pos1 = last_AC.pos2 = 0; last_GT.pos1 = last_GT.pos2 = last_CT.pos1 = last_CT.pos2 = 0; } /* Compute the distance between two sequences A and B */ *dist_ptr = 0; exon_cores(seq1-1, seq2-1, M, N, 1, 1, 0, W, in_K, PERM); tmp_block = Lblock = exon_list; while (tmp_block) { if (tmp_block->next_exon==NULL) Rblock = tmp_block; tmp_block = tmp_block->next_exon; } if (Lblock && ((Lblock->from1>50000 && Lblock->from2>100) || ((M-Rblock->to1>50000) && (N-Rblock->to2>100)))) { free_list(exon_list); relink(msp,numMSPs,(in_H>0) ? in_H:DEFAULT_RELINK_H,1,1,0,seq1,seq2); tmp_block = Lblock = exon_list; while (tmp_block) { if (tmp_block->next_exon==NULL) Rblock = tmp_block; tmp_block = tmp_block->next_exon; } } free_msps(&msp, &numMSPs); tmp_block = Lblock = exon_list; while (tmp_block) { if (tmp_block->next_exon==NULL) Rblock = tmp_block; tmp_block = tmp_block->next_exon; } /* enclose the current path in the (0,0,0,0) and (M+1,N+1,0,0) brackets */ Lblock = new_exon (0,0,0,0,0,0,0,Lblock); if (Rblock == NULL) Rblock = Lblock; Rblock->next_exon = new_exon (M+1,N+1,0,0,0,0,0,NULL); /* compute current statistics */ good_match = get_match_quality(Lblock, Rblock, st, N); #ifdef DEBUG debug_print_exons(Lblock, "LSIS"); #endif tmp_block = Lblock; while ((tmp_block1 = tmp_block->next_exon)!=NULL) { rollbflag = 0; diff = (int)(tmp_block1->from2-tmp_block->to2-1); if (diff) { if (diff<0) { int best_u; best_u = resolve_overlap(tmp_block,tmp_block1,seq1); tmp_block1->from1 += best_u+1-tmp_block1->from2; tmp_block1->from2 = best_u+1; if (((u=tmp_block1->to2-tmp_block1->from2+1)<=0) || (u<8) || ((v=tmp_block1->to1-tmp_block1->from1+1)<=0) || (v<8)) { /* remove exon associated with tmp_block1 */ 
tmp_block->next_exon = tmp_block1->next_exon; tmp_block->flag = tmp_block1->flag; rollbflag = 1; free(tmp_block1); tmp_block1 = NULL; /* not necessary, just to keep it 'clean'*/ } tmp_block->to1 -= tmp_block->to2-best_u; tmp_block->to2 = best_u; if (((u=tmp_block->to2-tmp_block->from2+1)<=0) || (u<8) || ((v=tmp_block->to1-tmp_block->from1+1)<=0) || (v<8)) { /* remove exon defined by tmp_block */ prev = find_previous(Lblock,tmp_block); assert (prev!=NULL); prev->next_exon = tmp_block->next_exon; prev->flag = tmp_block->flag; if (u>0) rollbflag = 1; free(tmp_block); tmp_block = prev; } if (tmp_block->to1) tmp_block->length = tmp_block->to2-tmp_block->from2+1; if (tmp_block1 && tmp_block1->to1) tmp_block1->length = tmp_block1->to2-tmp_block1->from2+1; } else { /* bridge the gap */ cflag = (tmp_block1->to2 && tmp_block->to2) ? 0 : 1; if (diff && (tmp_block1->from1-tmp_block->to1-1>0)) { if (!cflag) { if (diff<=MAX_GRINIT) { cost = greedy(seq2+tmp_block->to2, seq1+tmp_block->to1, diff, tmp_block1->from1-tmp_block->to1-1, tmp_block->to2,tmp_block->to1, &tmp_Lblock, &tmp_Rblock); } else cost = max(W,(int)(P*diff+1))+1; if (cost>max(W,(int)(P*diff+1))) { if (!tmp_block->flag && !tmp_block1->flag) { exon_cores(seq1+tmp_block->to1-1, seq2+tmp_block->to2-1, tmp_block1->from1-tmp_block->to1-1, diff, tmp_block->to1+1, tmp_block->to2+1, 1, min(8,W), in_C, /* (min(8,W)==W) ? PERM : TEMP); */ TEMP); tmp_Lblock = tmp_Rblock = exon_list; while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL)) tmp_Rblock = tmp_Rblock->next_exon; if ((!tmp_Lblock && tmp_block1->from1-tmp_block->to1>50000) || (tmp_Lblock && (tmp_Lblock->from2-tmp_block->to2>100) && (tmp_Lblock->from1-tmp_block->from1>50000)) || (tmp_Lblock && (tmp_block1->from2-tmp_Rblock->to2>100) && (tmp_block1->from1-tmp_Rblock->from1>50000))) { /* possible large intron; increase the score weight */ free_list(tmp_Lblock); relink(msp, numMSPs, (in_H>0) ? 
in_H:DEFAULT_RELINK_H, tmp_block->to1+1, tmp_block->to2+1, 1, seq1, seq2); tmp_Lblock = tmp_Rblock = exon_list; while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL)) tmp_Rblock = tmp_Rblock->next_exon; } free_msps(&msp, &numMSPs); if (tmp_Lblock) rollbflag = 1; else rollbflag = 0; /* already 0 */ } else tmp_Lblock = tmp_Rblock = NULL; } } else if (tmp_block1->to1) { /* start of seq; find last_AG, last_AC */ if (rs.acc_flag) { for (v=tmp_block1->from1-1; v<=tmp_block1->to1-3; v++) if (!strncmp((char *)(seq1+v-2),"AG",(size_t)2)) { last_AG.pos1 = v+1; last_AG.pos2 = tmp_block1->from2+ (v-tmp_block1->from1)+1; break; } for (v=tmp_block1->from1-1; v<=tmp_block1->to1-3; v++) if (!strncmp((char *)(seq1+v-2),"AC",(size_t)2)) { last_AC.pos1 = v+1; last_AC.pos2 = tmp_block1->from2+ (v-tmp_block1->from1)+1; break; } } /* end acc_flag */ diff = (int)(min(diff,(int)(MAX_GRINIT/2))); u = min(4*diff,tmp_block1->from1-tmp_block->to1-1); cost = EXTEND_BW(seq2+tmp_block->to2+ (tmp_block1->from2-tmp_block->to2-1)-diff, seq1+tmp_block->to1+ (tmp_block1->from1-tmp_block->to1-1)-u, (int)diff, u, tmp_block->to2+ (tmp_block1->from2-tmp_block->to2-1)-diff, tmp_block->to1+ (tmp_block1->from1-tmp_block->to1-1)-u, &I, &J); if ((good_match==FALSE) || tmp_block->flag || (J==0) || (I==0)) { tmp_block1->from2 = I+1; tmp_block1->from1 = J+1; tmp_block1->edist += cost; tmp_block1->length = tmp_block1->to2-tmp_block1->from2+1; } /* use blast if marginal gap still exists, and this is first scan */ if (!(diff=(int)(tmp_block1->from2-tmp_block->to2-1)) || tmp_block->flag) { /* blast-treated region or no gap */ tmp_Rblock = tmp_Lblock = NULL; } else { exon_cores(seq1+tmp_block->to1-1, seq2+tmp_block->to2-1, tmp_block1->from1-tmp_block->to1-1, diff, tmp_block->to1+1, tmp_block->to2+1, 1, min(10,W), in_C, /* (min(10,W)==W) ? 
PERM : TEMP); */ TEMP); tmp_block -> flag = 1; tmp_Lblock = tmp_Rblock = exon_list; while (tmp_Rblock && tmp_Rblock->next_exon) tmp_Rblock = tmp_Rblock->next_exon; if ((!tmp_Lblock && tmp_block1->from1-tmp_block->to1>50000) || (tmp_Lblock && (tmp_Lblock->from2-tmp_block->to2>100) && (tmp_Lblock->from1-tmp_block->from1>50000)) || (tmp_Lblock && (tmp_block1->from2-tmp_Rblock->to2>100) && (tmp_block1->from1-tmp_Rblock->from1>50000))) { /* possible large intron; increase the score weight */ free_list(tmp_Lblock); relink(msp, numMSPs, (in_H>0) ? in_H:DEFAULT_RELINK_H, tmp_block->to1+1, tmp_block->to2+1, 1,seq1,seq2); tmp_Lblock = tmp_Rblock = exon_list; while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL)) tmp_Rblock = tmp_Rblock->next_exon; } free_msps(&msp, &numMSPs); if (tmp_Lblock) rollbflag = 1; else { tmp_block1->from2 = I+1; tmp_block1->from1 = J+1; tmp_block1->edist += cost; tmp_block1->length = tmp_block1->to2-tmp_block1->from2+1; } } } else { if (rs.acc_flag) { for (v=tmp_block->to1; v>=tmp_block->from1; v--) if (!strncmp((char *)(seq1+v),"GT",(size_t)2)) { last_GT.pos1 = v; last_GT.pos2 = tmp_block->to2-(tmp_block->to1-v); break; } for (v=tmp_block->to1; v>=tmp_block->from1; v--) if (!strncmp((char *)(seq1+v),"CT",(size_t)2)) { last_CT.pos1 = v; last_CT.pos2 = tmp_block->to2-(tmp_block->to1-v); break; } } diff = (int)(min(diff,(int)(MAX_GRINIT/2))); cost = EXTEND_FW(seq2+tmp_block->to2, seq1+tmp_block->to1, diff, min(4*diff,tmp_block1->from1-tmp_block->to1-1), tmp_block->to2,tmp_block->to1, &I, &J); if ((good_match==FALSE) || tmp_block1->flag || (I==M) || (J==N)) { if (tmp_block->to1) { tmp_block->to2 = I; tmp_block->to1 = J; tmp_block->edist += cost; tmp_block->length = tmp_block->to2-tmp_block->from2+1; tmp_Rblock = tmp_Lblock = NULL; } else /* special case: no initial exon */ tmp_Lblock = tmp_Rblock = NULL; } /* use blast if marginal gap still exists, and this is first scan */ if (!(diff=(int)(tmp_block1->from2-tmp_block->to2-1)) || tmp_block1->flag) 
{ /* blast-treated region or no gap */ tmp_Rblock = tmp_Lblock = NULL; } else { exon_cores(seq1+tmp_block->to1-1, seq2+tmp_block->to2-1, tmp_block1->from1-tmp_block->to1-1, diff, tmp_block->to1+1, tmp_block->to2+1, 1, min(10,W), in_C, /* (min(10,W)==W) ? PERM : TEMP); */ TEMP); tmp_Lblock = tmp_Rblock = exon_list; while (tmp_Rblock && tmp_Rblock->next_exon) tmp_Rblock = tmp_Rblock->next_exon; if ((!tmp_Lblock && tmp_block1->from1-tmp_block->to1>50000) || (tmp_Lblock && (tmp_Lblock->from2-tmp_block->to2>100) && (tmp_Lblock->from1-tmp_block->from1>50000)) || (tmp_Lblock && (tmp_block1->from2-tmp_Rblock->to2>100) && (tmp_block1->from1-tmp_Rblock->from1>50000))) { /* possible large intron; increase the score weight */ free_list(tmp_Lblock); relink(msp, numMSPs, (in_H>0) ? in_H:DEFAULT_RELINK_H, tmp_block->to1+1, tmp_block->to2+1, 1,seq1,seq2); tmp_Lblock = tmp_Rblock = exon_list; while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL)) tmp_Rblock = tmp_Rblock->next_exon; } free_msps(&msp, &numMSPs); tmp_block1->flag = 1; if (tmp_Lblock) rollbflag = 1; else { if (tmp_block->to1) { tmp_block->to2 = I; tmp_block->to1 = J; tmp_block->edist += cost; tmp_block->length = tmp_block->to2-tmp_block->from2+1; tmp_Rblock = tmp_Lblock = NULL; } else /* special case: no initial exon */ tmp_Lblock = tmp_Rblock = NULL; } } } } else if (diff) { tmp_Rblock = tmp_Lblock = NULL; } /* merge block in the exon list; make connections to the previous list of blocks; maintain increasing order */ if (tmp_Lblock) { tmp_block->next_exon = tmp_Lblock; tmp_Rblock->next_exon = tmp_block1; merge(&tmp_block,&tmp_block1); } } } /* diff!=0 */ if (!rollbflag) tmp_block = tmp_block1; } /* just printing ... */ #ifdef DEBUG debug_print_exons(Lblock, "EXTENSIONS"); #endif /* compaction step; note: it resets the right end of the list to */ /* the last item in the block list */ compact_list(&(Lblock->next_exon), &Rblock); /* just printing ... 
*/ #ifdef DEBUG debug_print_exons(Lblock, "NORMALIZATION"); #endif /* eliminate marginal small blocks at the start of the sequence; */ /* resets the empty alignment to one block (Lblock) only */ tmp_block = Lblock->next_exon; while ((tmp_block!=NULL) && (tmp_block->lengthto1) { tmp_block1 = tmp_block; /* free */ tmp_block = tmp_block->next_exon; free(tmp_block1); /* free */ } Lblock->next_exon = tmp_block; /* eliminate marginal small blocks at the end of the sequence */ last = Lblock->next_exon; tmp_block = last; while (tmp_block!=NULL) { if (tmp_block->length>=W) last = tmp_block; tmp_block = tmp_block->next_exon; } if (last && last->to1) last->next_exon = Rblock->next_exon; Rblock = last; /* if high accuracy requirement, adjust boundaries of marginal exons */ if (rs.acc_flag) { tmp_block = Lblock->next_exon; /* condition for non-signal */ if (tmp_block && tmp_block->to1 && (strncmp((char *)(seq1+tmp_block->from1-3), END_SIG, (size_t)2) || (tmp_block->from2!=1))) { sig = (G_score>=abs(C_score)) ? &last_AG : &last_AC; if (sig->pos1 && (sig->pos2<=20)) { /* generated in extend_bw */ assert(sig->pos2 > 1); (void)strcpy((char *)tmp,END_SIG); (void)strncpy((char *)(tmp+2),(char *)seq2,(size_t)sig->pos2-1); (void)strcpy((char *)(tmp+sig->pos2+1), START_SIG); new = bmatch(seq1,tmp,tmp_block->from1-3,sig->pos2+3,1,1); if (new) { Lblock->next_exon->from1 = sig->pos1; Lblock->next_exon->from2 = sig->pos2; Lblock->next_exon->length -= sig->pos2-1; new->next_exon = Lblock->next_exon; new->ori = (G_score>=abs(C_score)) ? 'G' : 'C'; Lblock->next_exon = new; } } } while (tmp_block && tmp_block->next_exon && tmp_block->next_exon->to1) tmp_block = tmp_block->next_exon; if (tmp_block && tmp_block->to1 && (strncmp((char *)(seq1+tmp_block->to1),START_SIG,(size_t)2) || (tmp_block->to2!=N))) { sig = (G_score>=abs(C_score)) ? 
&last_GT : &last_CT; if (sig->pos1 && (N-sig->pos2<=20)) { assert(N-sig->pos2 >= 0); (void)strcpy((char *)tmp,END_SIG); (void)strncpy((char *)(tmp+2),(char *)(seq2+sig->pos2), (size_t)N-sig->pos2); (void)strcpy((char *)(tmp+N-sig->pos2+2),START_SIG); new = fmatch(seq1+sig->pos1-1,tmp, M-sig->pos1+1,N-sig->pos2+4, sig->pos1-1,sig->pos2+1); if (new) { tmp_block->to1 = sig->pos1; tmp_block->to2 = sig->pos2; new->next_exon = tmp_block->next_exon; tmp_block->next_exon = new; tmp_block->ori = (G_score>=abs(C_score)) ? 'G' : 'C'; } } } } /* Slide exon boundaries for optimal intron signals */ sync_flag = get_sync_flag(Lblock, Rblock, 6); SLIDE_INTRON(sync_flag)(6,&Lblock,seq1,seq2); /* decreasingly; script will be in reverse order */ flip_list(&Lblock, &Rblock); pluri_align(dist_ptr,&(st->nmatches),Lblock,&Script_head); flip_list(&Lblock, &Rblock); /* increasingly */ if (rs.poly_flag) remove_poly(&Script_head,Lblock,seq1,seq2,N,pT,pA); else *pT = *pA = 0; get_stats(Lblock, st); *Exons = Lblock->next_exon; free(Lblock); if (!rs.ali_flag) { free_align(Script_head); return NULL; } else return Script_head; } struct hash_node { int ecode; /* integer encoding of the word */ int pos; /* positions where word hits query sequence */ struct hash_node *link; /* next word with same last 7.5 letters */ }; #define HASH_SIZE 32767 /* 2**15 - 1 */ #define GEN_LOG4_ENTRIES 45 #define CDNA_LOG4_ENTRIES 25 static struct hash_node *phashtab[HASH_SIZE+1]; static struct hash_node **hashtab; static int mask; static int *next_pos, *pnext_pos; /* The log4 arrays were computed to mimick the behaviour of the log formula for computing the msp threshold in exon_cores(). 
For genomic_log4s, entry i stores the value for the length of a genomic sequence for which the contribution to the msp threshold is i/2, i.e.: 1.4*log_4(3/4*len1) = i/2; Similarly, cDNA_log4s entries store lengths of the cDNA sequence for which the contribution to the msp threshold is i/2, i.e.: 1.4*log_4(len2) = i/2; Both arrays are sorted in increasing order, and can be searched with binary search. */ static long genomic_log4s[]= {1, 2, 3, 5, 9, 15, 26, 42, 70, 114, \ 188, 309, 507, 832, 1365, 1365, 2240, 2240, 3675, 6029,\ 9892, 16231, 26629, 43690, 71681, \ 117606, 192953, 316573, 519392, 852152, 1398101, 2293823, 3763409, 6174516, 10130347, \ 16620564, 27268873, 44739242, 73402365, 120429110, \ 197584514, 324171126, 531858072, 872603963, 1431655765 }; static long cDNA_log4s[]= {1, 1, 2, 4, 7, 11, 19, 32, 52, 86, \ 141, 231, 380, 624, 1024, 1680, 2756, 4522, 7419, 12173, \ 19972, 32768, 53761, 88204, 144715 }; static int get_msp_threshold(int len1, int len2) { int i, j; i = find_log_entry(genomic_log4s, GEN_LOG4_ENTRIES, len1, 0); j = find_log_entry(cDNA_log4s, CDNA_LOG4_ENTRIES, len2, 0); if (!(i % 2)) return (int)(i/2+j/2); else if (!(j % 2)) return (int)(i/2+j/2); else return (int)(i/2+j/2+1); } static int find_log_entry(long *log4s, int n, int len, int offset) { int a; a = n/2; if ((len=log4s[a-1]))) return max(0,(a-1))+offset; else if ((len>=log4s[a]) && ((a==n-1) || (lenlog4s[a]) return find_log_entry(log4s+a+1,n-a-1,len, offset+a+1); return -1; } /* -------------------- exon_cores() --------------------- */ static void exon_cores(uchar *s1, uchar *s2, int len1, int len2, int offset1, int offset2, int flag, int in_W, int in_K, int type) { int i, W, last_msp, lower, upper; int *allocated; Exon *tmp_block; if (in_K<=0) { /* compute expected length of longest exact match .. */ /* K = (int) (log(.75*(double)len1)+log((double)len2))/log(4.0); */ /* .. 
and throw in a fudge factor */ /* K *= 1.4; */ K = get_msp_threshold(len1, len2); if (K>=0) K--; /* compensate for the rounding in the log formula */ /* commented this to avoid fragmentation if (flag) K = min(K, DEFAULT_C); second pass */ } else K = in_K; numMSPs = 0; exon_list = NULL; allocated = ckalloc((len1+len2+1)*sizeof(int)); lower = ((file_type==EST_GEN) || (file_type==GEN_EST && type==TEMP)) ? -len1 : -len2; upper = ((file_type==EST_GEN) || (file_type==GEN_EST && type==TEMP)) ? len2 : len1; diag_lev = allocated - lower; for (i=lower; i<=upper; ++i) diag_lev[i]=0; W = min(in_W,len2); switch (file_type) { case EST_GEN: bld_table(s2,len2,W, type); /* use longer sequence for permanent tables */ search(s1,s2,len1,len2,W); break; case GEN_EST: if (type!=TEMP) { uchar *aux; int auxi; aux = s1; s1 = s2; s2 = aux; auxi = len1; len1 = len2; len2 = auxi; } bld_table(s2,len2,W, type); /* use longer sequence for permanent tables */ search(s1,s2,len1,len2,W); if (type!=TEMP) { register int auxi; uchar *aux; Msp_ptr mp; /* change s1 and s2 back */ aux = s1; s1 = s2; s2 = aux; auxi = len1; len1 = len2; len2 = auxi; for (mp=msp_list, i=0; ipos1; mp->pos1 = mp->pos2; mp->pos2 = auxi; mp = mp->next_msp; } } break; default: fatal("sim4b1.c: Invalid file type code."); } free(allocated); if (type==TEMP) { register struct hash_node *hptr, *tptr; register int hval; free(next_pos); for (hval=0; hvallink; free(tptr); } } free(hashtab); } msp = (Msp_ptr *) ckalloc(numMSPs*sizeof(Msp_ptr)); { Msp_ptr mp = msp_list; for (i = 0; i < numMSPs; ++i) { msp[i] = mp; mp = mp->next_msp; } } sort_msps(); /* sort in order of mp->pos2, in the shorter seq */ /* organize Blast hits (MSPs) into exons */ last_msp = link_msps(msp, numMSPs, DEFAULT_WEIGHT, LINK); #ifdef DEBUG for (i = last_msp; i >= 0; i = msp[i]->prev) (void)printf("%d-%d\n", msp[i]->pos1, msp[i]->pos1 + msp[i]->len - 1); #endif msp2exons(msp,last_msp,s1,s2); /* now free msp[]? 
No - may need to re-link */ /* for (i=0; ilength = tmp_block->to2-tmp_block->from2+1; tmp_block->to1 += offset1; tmp_block->from1 += offset1; tmp_block->to2 += offset2; tmp_block->from2 += offset2; tmp_block->flag = flag; tmp_block = tmp_block->next_exon; } return ; } static void relink(Msp_ptr *in_msp, int in_numMSPs, int H, int offset1, int offset2, int flag, uchar *s1, uchar *s2) { int last_msp; Exon *tmp_block; exon_list = NULL; last_msp = link_msps(in_msp, in_numMSPs, H, RELINK); msp2exons(in_msp,last_msp,s1,s2); tmp_block = exon_list; while (tmp_block!=NULL) { tmp_block->length = tmp_block->to2-tmp_block->from2+1; tmp_block->to1 += offset1; tmp_block->from1 += offset1; tmp_block->to2 += offset2; tmp_block->from2 += offset2; tmp_block->flag = flag; tmp_block = tmp_block->next_exon; } return ; } static void free_msps(Msp_ptr **in_msp, int *in_numMSPs) { int i; for (i=0; i<*in_numMSPs; ++i) free((*in_msp)[i]); free(*in_msp); *in_msp = NULL; *in_numMSPs = 0; } static int scale(int n) { return (n<=100000) ? n : (100000+(int)(10*log((double)(n-100000)))); } static int link_msps(Msp_ptr *msp, int numMSPs, int H, int flag) { int i, j, f1, f2, best, diag, diff_diag, best_sc, try; for (best = -1, best_sc = MININT, i = 0; i < numMSPs; ++i) { f1 = msp[i]->pos1; /* start position in seq1 */ f2 = msp[i]->pos2; /* start position in seq2 */ diag = f1 - f2; msp[i]->prev = -1; msp[i]->Score = 0; for (j = 0; j < i; ++j) { int var_L = ((msp[i]->pos2+msp[i]->len-msp[j]->pos2-msp[j]->len>2*W) && (msp[i]->pos2-msp[j]->pos2>2*W)) ? 2*L : L; diff_diag = diag - msp[j]->pos1 + msp[j]->pos2; if (diff_diag < -rs.DRANGE || (diff_diag > rs.DRANGE && diff_diag < MIN_INTRON) || (msp[j]->pos2+msp[j]->len-1-f2>var_L) || (msp[j]->pos1+msp[j]->len-1-f1>var_L)) continue; try = msp[j]->Score - ((flag==RELINK) ? 
scale(abs(diff_diag)) : abs(diff_diag)); if (try > msp[i]->Score) { msp[i]->Score = try; msp[i]->prev = j; } } msp[i]->Score += (H*msp[i]->score); if (msp[i]->Score > best_sc) { best = i; best_sc = msp[i]->Score; } } return best; } /* ----------- build table of W-tuples in one of the sequences ------------*/ void bld_table(uchar *s, int len, int in_W, int type) { int ecode; int i, j; uchar *t; if (type == PERM) { mask = (1 << (in_W+in_W-2)) - 1; next_pos = pnext_pos; hashtab = phashtab; return; } /* perform initializations */ if (type == INIT) { /* perform initializations */ for (i=0; ilink) if (h->ecode == ecode) break; if (h == NULL) { h = (struct hash_node *) ckalloc (sizeof(struct hash_node)); h->link = hashtab[hval]; hashtab[hval] = h; h->ecode = ecode; h->pos = -1; } next_pos[pos] = h->pos; h->pos = pos; } /* ----------------------- search the other sequence ---------------------*/ static void search(uchar *s1, uchar *s2, int len1, int len2, int in_W) { register struct hash_node *h; register uchar *t; register int ecode, hval; int i, j, p; t = s1+1; for (i=1; (i<=len1) && *t; ) { restart: ecode = 0L; for (j=1; (jlink) if (h->ecode == ecode) { for (p = h->pos; p >= 0; p = next_pos[p]) extend_hit((int)(t-s1-1),p,s1,s2,len1,len2, in_W); break; } } } /* previous version: allow 'N's in the table -------- t = s1; ecode = 0L; for (i = 1; i < in_W; ++i) ecode = (ecode << 2) + encoding[*++t]; for (; (i<=len1) && (*++t); i++) { ecode = ((ecode & mask) << 2) + encoding[*t]; hval = ecode & HASH_SIZE; for (h = hashtab[hval]; h; h = h->link) if (h->ecode == ecode) { for (p = h->pos; p >= 0; p = next_pos[p]) extend_hit((int)(t-s1),p,s1,s2,len1,len2, in_W); break; } } ---------------------------------------------------*/ } /* extend_hit - extend a word-sized hit to a longer match */ static void extend_hit(int pos1, int pos2, const uchar * const s1, const uchar * const s2, int len1, int len2, int in_W) { const uchar *beg2, *beg1, *end1, *q, *s; int right_sum, left_sum, sum, 
diag, score;

    /* A diagonal whose recorded level (stored at the bottom of this
       function) is already beyond pos1 is not extended again. */
    diag = pos2 - pos1;
    if (diag_lev[diag] > pos1) return;

    /* extend to the right */
    /* (the best running sum of this rightward pass is kept in left_sum) */
    left_sum = sum = 0;
    q = s1+1+pos1;
    s = s2+1+pos2;
    end1 = q;
    while ((*s != '\0') && (*q != '\0') && (s<=s2+len2) && (q<=s1+len1) &&
           sum >= left_sum - X) {
        sum += ((*s++ == *q++) ? MATCH : MISMATCH);
        if (sum > left_sum) {
            left_sum = sum;
            end1 = q;
        }
    }

    /* extend to the left */
    /* (starts in_W characters before the hit end; the best running sum of
       this leftward pass is kept in right_sum) */
    right_sum = sum = 0;
    beg1 = q = (s1+1+pos1) - in_W;
    beg2 = s = (s2+1+pos2) - in_W;
    while ((s>s2+1) && (q>s1+1) && sum >= right_sum - X) {
        sum += ((*(--s) == *(--q)) ? MATCH : MISMATCH);
        if (sum > right_sum) {
            right_sum = sum;
            beg2 = s;
            beg1 = q;
        }
    }

    score = in_W + left_sum + right_sum;
    if (score >= K) {
        /* good enough: record the segment at the front of msp_list */
        Msp_ptr mp = (Msp_ptr)ckalloc(sizeof(*mp));

        mp->len = end1 - beg1;
        mp->score = score;
        mp->pos1 = beg1 - (s1+1);
        mp->pos2 = beg2 - (s2+1);
        mp->next_msp = msp_list;
        msp_list = mp;
        ++numMSPs;
    }
    /*diag_lev[diag] = (end1 - (s1+1)) + in_W; */
    diag_lev[diag] = (end1 - s1) - 1 + in_W;
}

/* ---------------------------- sort the MSPs ----------------------------*/

/* sort_msps - order database sequence for printing */
/* In-place heapsort of the file-scope array msp[0..numMSPs-1].  Because
   smaller() returns the index of the entry with the LARGER key, the heap
   keeps the largest key at msp[0]; swapping it to the end on every pass
   leaves the array in increasing (pos2, then pos1) order. */
static void sort_msps(void)
{
    int i;
    Msp_ptr mp;

    /* build the heap bottom-up; the loop does not run for numMSPs < 2 */
    for (i = (numMSPs/2) - 1; i >= 0; --i)
        heapify(i, (int) numMSPs-1);
    /* move the current root behind the shrinking heap */
    for (i = numMSPs-1; i > 0; --i) {
        mp = msp[0];
        msp[0] = msp[i];
        msp[i] = mp;
        if (i > 1)      /* heapify is never called with last == 0 */
            heapify(0, i-1);
    }
}

/* heapify - core procedure for heapsort */
/* Sifts msp[i] down within msp[0..last]: while the son picked by smaller()
   also wins against msp[i], the two are swapped and the walk continues
   from the son's slot. */
static void heapify(i, last)
int i, last;
{
    int lim = (last-1)/2, left_son, small_son;
    Msp_ptr mp;

    while (i <= lim) {
        left_son = 2*i + 1;
        if (left_son == last)
            small_son = left_son;   /* only one son exists */
        else
            small_son = smaller(left_son, left_son+1);
        if (smaller(i, small_son) == small_son) {
            mp = msp[i];
            msp[i] = msp[small_son];
            msp[small_son] = mp;
            i = small_son;
        } else
            break;
    }
}

/* smaller - determine ordering relationship between two MSPs */
/* Returns whichever of the indices i, j refers to the entry with the larger
   pos2; equal pos2 values are decided by pos1 (i is returned when its pos1
   is greater or equal). */
static int smaller(i, j)
int i, j;
{
    Msp_ptr ki = msp[i], kj = msp[j];

    if (ki->pos2 > kj->pos2) return i;
    if (ki->pos2 < kj->pos2) return j;

    return ((ki->pos1 >= kj->pos1) ? 
i : j); } /* --------------------- organize the MSPs into exons ---------------------*/ static void msp2exons(Msp_ptr *msp, int last_msp, uchar *s1, uchar *s2) { Msp_ptr mp; int diag_dist, diff; exon_list = NULL; if (last_msp<0) return; /* Note: in new_exon, the 'flag' and 'length' fields need not be computed */ mp = msp[last_msp]; exon_list = new_exon (mp->pos1, mp->pos2, mp->pos1+mp->len-1, mp->pos2+mp->len-1, -1, (mp->len*MATCH-mp->score)/(MATCH-MISMATCH), 0, exon_list); last_msp = mp->prev; while (last_msp>=0) { mp = msp[last_msp]; if (((diag_dist=abs((exon_list->from2-exon_list->from1)-(mp->pos2-mp->pos1)))<=L) && (exon_list->from2-(mp->pos2+mp->len-1))edist += diag_dist; exon_list->edist += (mp->len*MATCH-mp->score)/(MATCH-MISMATCH); if ((diff=mp->pos2+mp->len-exon_list->from2)>0) { /* overlap */ int dist1, dist2; dist1 = get_edist(exon_list->from1,mp->pos2+mp->len-diff, exon_list->from1+diff-1,mp->pos2+mp->len-1,s1,s2); dist2 = get_edist(mp->pos1+mp->len-diff,mp->pos2+mp->len-diff, mp->pos1+mp->len-1,mp->pos2+mp->len-1,s1,s2); exon_list->edist -= max(dist1,dist2); } else if (diff<0) { /* gap */ exon_list->edist += 0.5*P*(-1)*diff; } exon_list->to1 = max(exon_list->to1,mp->pos1+mp->len-1); exon_list->to2 = max(exon_list->to2,mp->pos2+mp->len-1); exon_list->from1 = min(exon_list->from1,mp->pos1); exon_list->from2 = min(exon_list->from2,mp->pos2); } else { /* new exon */ exon_list = new_exon (mp->pos1, mp->pos2, mp->pos1+mp->len-1, mp->pos2+mp->len-1, -1, (mp->len*MATCH-mp->score)/(MATCH-MISMATCH), 0, exon_list); } last_msp = mp->prev; } } static int get_edist(int f1, int f2, int t1, int t2, uchar *seq1, uchar *seq2) { uchar *s1, *s2, *q1, *q2; int dist=0; s1 = seq1+f1+1; /* bc at this stage, the msp pos do not have added +1 */ s2 = seq2+f2+1; q1 = seq1+t1+1; q2 = seq2+t2+1; while (s1<=q1 && s2<=q2) { dist += (*s1!=*s2); s1++; s2++; } return dist; } /* ---------------------- print endpoints of exons --------------------*/ #ifdef AUXUTILS static void 
find_introns(Exon *eleft, Intron **Ilist)
{
    /* Builds in *Ilist a singly linked list with one Intron per pair of
       consecutive exons following the list head eleft.  The walk stops at
       the first successor whose to1 is 0.  fatal() is called when eleft is
       NULL. */
    Exon *tmp_exon, *tmp_exon1;
    Intron *new, *tail;
    int GTAG_score, CTAC_score;

    *Ilist = tail = NULL;
    if (!eleft) fatal("sim4b1.c: Something wrong in the exon list.\n");

    tmp_exon = eleft->next_exon;
    while ((tmp_exon!=NULL) && (tmp_exon1=tmp_exon->next_exon) && tmp_exon1->to1) {
        new = (Intron *)ckalloc(sizeof(Intron));
        /* in sequence 1 the intron is the stretch strictly between the exons */
        new->from1 = tmp_exon->to1+1;
        new->to1 = tmp_exon1->from1-1;
        /* in sequence 2 the flanking exon ends themselves are stored */
        new->from2 = tmp_exon->to2;
        new->to2 = tmp_exon1->from2;
        new->length = new->to1-new->from1+1;
        new->next_intron = NULL;

        /* append at the tail */
        if (!tail) *Ilist = new;
        else tail->next_intron = new;
        tail = new;

        /* find orientation */
        /* NOTE(review): the 'G'/'C' and 'T' tests read the same character
           (index from1-1), and the 'A' and 'G'/'C' tests both read index
           to1-1.  The 'T' and 'A' hits raise both scores equally, so they
           cannot change which score is larger -- confirm whether the second
           intron base and the next-to-last one were meant. */
        GTAG_score = CTAC_score = 0;
        if (*(seq1+new->from1-1)=='G') GTAG_score++;
        else if (*(seq1+new->from1-1)=='C') CTAC_score++;

        if (*(seq1+new->from1-1)=='T') {
            GTAG_score++;
            CTAC_score++;
        }

        if (*(seq1+new->to1-1)=='A') {
            GTAG_score++;
            CTAC_score++;
        }

        if (*(seq1+new->to1-1)=='G') GTAG_score++;
        else if (*(seq1+new->to1-1)=='C') CTAC_score++;

        /* ties are reported as '+' */
        if (GTAG_score>=CTAC_score) new->orientation = '+';
        else new->orientation = 'c';

        tmp_exon = tmp_exon1;
    }
}
#endif

/* should only be called when (file_type==EST_GEN) && (match_ori==BWD) */
/* Rewrites the exon list *left in place:
     - every exon with to1 != 0 has its coordinates mirrored, (from1,to1)
       against M+1 and (from2,to2) against N+1;
     - the ori letters are shifted by one exon and swapped ('G' <-> 'C',
       'N' and 'E' unchanged): each exon receives the swapped ori of the exon
       before it, the first one receives 'U';
     - finally the list is reversed with flip_list(), which also writes the
       local `right`.
   fatal() is called for any other ori letter on an exon that is followed by
   an exon with to1 != 0. */
void complement_exons(Exon **left, int M, int N)
{
    Exon *tmp_block, *right;
    char prev, ch;

    prev = 'U'; /* unknown; should trigger error */
    tmp_block = *left;
    while (tmp_block) {
        if (tmp_block->to1) {
            register int aux;

            if (tmp_block->next_exon && tmp_block->next_exon->to1) {
                ch = tmp_block->ori;
                tmp_block->ori = prev;
                switch (ch) {
                    case 'C': prev = 'G'; break;
                    case 'G': prev = 'C'; break;
                    case 'N': prev = 'N'; break;
                    case 'E': prev = 'E'; break;
                    default : fatal("sim4b1.c: Inconsistency. Check exon orientation at complementation.");
                }
            } else
                tmp_block->ori = prev;

            /* mirror both coordinate pairs; aux holds the old `from` */
            aux = tmp_block->from1;
            tmp_block->from1 = M+1-tmp_block->to1;
            tmp_block->to1 = M+1-aux;
            aux = tmp_block->from2;
            tmp_block->from2 = N+1-tmp_block->to2;
            tmp_block->to2 = N+1-aux;
        }
        tmp_block = tmp_block->next_exon;
        if (tmp_block && tmp_block->to1)
            right = tmp_block;
    }
    flip_list(left,&right);
}

/* Prints one "a-b (c-d) p%" entry to stdout for every exon with to1 != 0.
   When file_type is EST_GEN the sequence-2 range comes first, otherwise the
   sequence-1 range does.  An orientation arrow and a newline are appended
   only when the next list element exists and has to1 != 0, so the last
   entry is left without a newline.  fatal() is called for an ori letter
   other than 'C', 'E', 'G' or 'N'. */
void print_exons(Exon *left)
{
    Exon *tmp_block, *tmp_block1;

    tmp_block = left;
    while (tmp_block!=NULL) {
        if (tmp_block->to1) {
            if (file_type==EST_GEN)
                (void)fprintf(stdout,"%d-%d (%d-%d) %d%%",
                              tmp_block->from2, tmp_block->to2,
                              tmp_block->from1, tmp_block->to1,
                              tmp_block->match);
            else /* file_type==GEN_EST */
                (void)fprintf(stdout,"%d-%d (%d-%d) %d%%",
                              tmp_block->from1, tmp_block->to1,
                              tmp_block->from2, tmp_block->to2,
                              tmp_block->match);
            if (((tmp_block1=tmp_block->next_exon)!=NULL) && tmp_block1->to1)
                switch (tmp_block->ori) {
                    case 'C': (void)fprintf(stdout," <-\n"); break;
                    case 'E': (void)fprintf(stdout," ==\n"); break;
                    case 'G': (void)fprintf(stdout," ->\n"); break;
                    case 'N': (void)fprintf(stdout," --\n"); break;
                    default : fatal("sim4b1.c: Inconsistency. Check exon orientations.");
                }
        }
        tmp_block = tmp_block->next_exon;
    }
}

/* to and from are in the original cDNA sequence */
void print_pipmaker_exons(Exon *exons, edit_script_list *aligns, char *gene,
                          int from, int to, int M, int N,
                          uchar *seq1, uchar *seq2, int match_ori)
{
    Exon *tmp_block, *left, *right;
    int From, To, cov, ori;

    /* print the first line in the record */
    /* NOTE(review): the last two tests are the same test written twice
       (next_exon==NULL and !next_exon); possibly !exons->next_exon->to1 was
       intended -- confirm before changing. */
    if ((exons==NULL) || (!exons->to1 && ((exons->next_exon==NULL) || !exons->next_exon)))
        return;

    /* start at the first element when it has to1 != 0, else at its successor */
    left = right = tmp_block = (exons->to1) ? 
exons : exons->next_exon; while (tmp_block) { if (tmp_block->to1) right = tmp_block; tmp_block = tmp_block->next_exon; } /* report any inconsistencies between the intron orientations, as well as between the mRNA (CDS) match strand and the introns orientations */ ori = check_consistency_intron_ori(exons, match_ori, gene); /* determine the matching coordinates for the CDS */ if ((from>0) && (to>0) && aligns) { cov = dispatch_find_ends(from, to, &From, &To, aligns, M, N, match_ori); switch (cov) { case OK: if ((match_ori==FWD) && strncmp((char *)(seq1+From-1),"ATG",3)) fprintf(stderr, "Warning: No start codon at location %d in the genomic sequence (%s).\n", From, gene); else if ((match_ori==BWD) && strncmp((char *)(seq1+To-3),"CAT",3)) fprintf(stderr, "Warning: No (complement) start codon at location %d in the genomic sequence (%s).\n", To-2, gene); if ((match_ori==FWD) && strncmp((char *)(seq1+To-3),"TAA",3) && strncmp((char *)(seq1+To-3),"TAG",3) && strncmp((char *)(seq1+To-3),"TGA",3)) fprintf(stderr, "Warning: No stop codon at location %d in the genomic sequence (%s).\n", To-2, gene); else if ((match_ori==BWD) && strncmp((char *)(seq1+From-1),"TTA",3) && strncmp((char *)(seq1+From-1),"CTA",3) && strncmp((char *)(seq1+From-1),"TCA",3)) fprintf(stderr, "Warning: No (complement) stop codon at location %d in the genomic sequence (%s).\n", From, gene); break; case FREE_START: fprintf(stderr, "Warning: Start of CDS does not match (%s).\n", gene); if ((match_ori==FWD) && strncmp((char *)(seq1+To-3),"TAA",3) && strncmp((char *)(seq1+To-3),"TAG",3) && strncmp((char *)(seq1+To-3),"TGA",3)) fprintf(stderr, "Warning: No stop codon at location %d in the genomic sequence (%s).\n", To-2, gene); else if ((match_ori==BWD) && strncmp((char *)(seq1+From-1),"TTA",3) && strncmp((char *)(seq1+From-1),"CTA",3) && strncmp((char *)(seq1+From-1),"TCA",3)) fprintf(stderr, "Warning: No (complement) stop codon at location %d in the genomic sequence (%s).\n", From, gene); break; case FREE_END: 
fprintf(stderr, "Warning: End of CDS does not match (%s).\n", gene); if ((match_ori==FWD) && strncmp((char *)(seq1+From-1),"ATG",3)) fprintf(stderr, "Warning: No start codon at location %d in the genomic sequence (%s).\n", From, gene); else if ((match_ori==BWD) && strncmp((char *)(seq1+To-3),"CAT",3)) fprintf(stderr, "Warning: No (complement) start codon at location %d in the genomic sequence (%s).\n", To-2, gene); break; case FREE_BOTH_ENDS: fprintf(stderr, "Warning: Start of CDS does not match (%s).\n", gene); fprintf(stderr, "Warning: End of CDS does not match (%s).\n", gene); break; default: fatal("Unrecognized warning code."); } } /* report codon inconsistencies in the cDNA */ if (to>0 && from>0) { if (strncmp((char *)(seq2+from-1),"ATG",3)) fprintf(stderr, "Warning: No start codon at location %d in the mRNA (%s).\n", from, gene); if (strncmp((char *)(seq2+to-3),"TAA",3) && strncmp((char *)(seq2+to-3),"TAG",3) && strncmp((char *)(seq2+to-3),"TGA",3)) fprintf(stderr, "Warning: No end codon at location %d in the mRNA (%s).\n", to-2, gene); } printf("%c %d %d %s%s\n", (ori==FWD) ? '>':'<', left->from1, right->to1, gene ? gene:"", (match_ori==FWD) ? 
"":" (complement)"); if ((from>0) && (to>0) && aligns) printf("+ %d %d\n", From, To); /* now print the exons */ /* if (match_ori==BWD) flip_list(&left,&right); not accepted by PipMaker */ tmp_block = left; while (tmp_block!=NULL) { if (tmp_block->to1) (void)fprintf(stdout,"%d %d\n", tmp_block->from1, tmp_block->to1); tmp_block = tmp_block->next_exon; } if (match_ori==BWD) flip_list(&left, &right); return; } static int check_consistency_intron_ori(Exon *exons, int match_ori, char *gene) { Exon *t=exons; int numG, numC, numE, numN; numG = numC = numE = numN = 0; if (!t->to1) t = t->next_exon; while (t && t->to1) { if (t->next_exon && t->next_exon->to1) { switch (t->ori) { case 'G': numG++; break; case 'C': numC++; break; case 'N': numN++; break; case 'E': numE++; break; default : fatal("sim4b1.c: Unrecognized intron orientation."); } } t = t->next_exon; } if (numG && numC) fprintf(stderr, "Warning: Introns reported on both strands (%s).\n", gene); /* Note: a match can be reverse complemented either b/c the complement genomic sequence was used as input, while the mRNA was actually transcribed in forward orientation (CT_AC), or b/c the forward strand was given for the genomic sequence, but the mRNA was transcribed in the reverse direction (GT_AG); hence there is no relevant test for this else if ((numG && (match_ori==BWD)) || (numC && (match_ori==FWD))) fprintf(stderr, "Warning: Introns orientations inconsistent with the reported match strand (%s).\n", gene); */ if (numN) fprintf(stderr, "Warning: Ambiguous intron orientation (%s).\n", gene); if (numE) fprintf(stderr, "Warning: Internal gap in the mRNA (%s).\n", gene); return (numG>=numC) ? 
FWD:BWD; } /* from and to are given in the original cDNA sequence */ static int dispatch_find_ends(int from, int to, int *From, int *To, edit_script_list *aligns, int M, int N, int match_ori) { int f1, f2, t1, t2, ot1, ot2, xto, xfrom; int free_start, free_end; free_start = free_end = 1; if (aligns->next_script && (aligns->offset2 > aligns->next_script->offset2)) script_flip_list(&aligns); if (match_ori==FWD) { xto = to; xfrom = from; } else if (file_type == EST_GEN) { xto = to; xfrom = from; } else { xto = N-from+1; xfrom = N-to+1; } *From = *To = 0; t1 = t2 = 0; while (aligns) { ot2 = t2; ot1 = t1; f1 = aligns->offset1; f2 = aligns->offset2; t1 = f1+aligns->len1-1; t2 = f2+aligns->len2-1; if (ot2 < xfrom && xfrom < f2) { *From = f1; break; } if (f2 <= xfrom && xfrom <= t2) { *From = find_ends(aligns, xfrom); free_start = 0; break; } aligns = aligns->next_script; } if (*From == 0) return FREE_BOTH_ENDS; if (ot2 < xto && xto < f2) *To = ot1; else if (xto <= t2) { *To = find_ends(aligns, xto); free_end = 0; } else { *To = 0; while (aligns && ((aligns=aligns->next_script)!=NULL)) { ot2 = t2; ot1 = t1; f1 = aligns->offset1; f2 = aligns->offset2; t1 = f1+aligns->len1-1; t2 = f2+aligns->len2-1; if (ot2 < xto && xto < f2) { *To = ot1; break; } if (t2 < xto) *To = t1; else if (f2 <= xto && xto <=t2) { *To = find_ends(aligns, xto); free_end = 0; break; } } if (*To==0) *To = t1; } if (*To == 0) { *From = 0; free_start = 1; } if (*To && *From && match_ori==BWD && file_type==EST_GEN) { int aux = M-(*From)+1; *From = M-(*To)+1; *To = aux; } if (free_start && free_end) return FREE_BOTH_ENDS; else if (free_start) return FREE_START; else if (free_end) return FREE_END; return OK; } static int find_ends(edit_script_list *head, int j0) { int i, j, e1, e2; edit_script *tp; i = head->offset1; e1 = i+head->len1-1; j = head->offset2; e2 = j+head->len2-1; tp = head->script; i--; j--; while (i<=e1 && j<=e2 && tp) { if (j==j0) return i; switch (tp->op_type) { case DELETE: i += tp->num; 
break; case INSERT: j += tp->num; if (j>=j0) return i; break; case SUBSTITUTE: i += tp->num; j += tp->num; if (j>=j0) return (i-(j-j0)); break; default: fatal("Illegal opcode in script."); } tp = tp->next; } /* not found: failure */ fatal("Inconsistency in script."); } #ifdef AUXUTILS static void print_introns(Intron_ptr intron_list) { Intron_ptr ep=intron_list; (void)printf("\nIntrons {\n\n"); while (ep!=NULL) { (void)printf("genome: %6ld - %-6ld cDNA: %5ld - %-5ld, l: %-5d, o: %c \n", ep->from1, ep->to1, ep->from2, ep->to2, ep->length, ep->orientation); ep = ep->next_intron; } (void)printf("}\n\n"); } #endif static void pluri_align(int *dist_ptr,int *num_matches,Exon *lblock,struct edit_script_list **Aligns) { int tmpi, di_count, i, end1, end2, diff, ali_dist, nmatches; uchar *a, *b; Exon *tmp_block=lblock, *tmp_block1; struct edit_script_list *enew; struct edit_script *head, *tmp_script, *new, *left, *right, *prev; nmatches = 0; head = NULL; *Aligns = NULL; *dist_ptr = ali_dist = 0; end1 = M; end2 = N; while (((tmp_block1=tmp_block->next_exon)!=NULL) && tmp_block1->to1) { if ((diff=tmp_block->from2-tmp_block1->to2-1)!=0) { if (tmp_block->to1) { enew = (edit_script_list *)ckalloc(sizeof(edit_script_list)); enew->next_script = *Aligns; *Aligns = enew; (*Aligns)->script = head; (*Aligns)->offset1 = tmp_block->from1; (*Aligns)->offset2 = tmp_block->from2; (*Aligns)->len1 = end1-(*Aligns)->offset1+1; (*Aligns)->len2 = end2-(*Aligns)->offset2+1; (*Aligns)->score = ali_dist; ali_dist = 0; head = NULL; } end1 = tmp_block1->to1; end2 = tmp_block1->to2; } else if (((diff=tmp_block->from1-tmp_block1->to1-1)!=0) && tmp_block->to1) { new = (edit_script *) ckalloc(sizeof(edit_script)); new->op_type = DELETE; new->num = diff; new->next = head; head = new; } else if (diff) end1 = tmp_block1->to1; diff = align_get_dist(tmp_block1->from1-1, tmp_block1->from2-1, tmp_block1->to1, tmp_block1->to2, max(1000,.2*(tmp_block1->to2-tmp_block1->from2+1))); if (diff<0) { (void)printf("The 
two sequences are not really similar.\n"); (void)printf("Please try an exact method.\n"); exit(1); } #ifdef STATS if (diff>P*(tmp_block1->to2-tmp_block1->from2+1)) (void)printf("Warning: Distance threshold on segment exceeded.\n"); #endif align_path(tmp_block1->from1-1, tmp_block1->from2-1, tmp_block1->to1, tmp_block1->to2, diff, &left, &right); Condense_both_Ends(&left, &right, &prev); if (!tmp_block->to1 && right->op_type == DELETE) { /* remove gaps at end of alignment */ diff -= 0+right->num; /* subtract GAP_OPEN = 0 */ tmp_block1->to1 -= right->num; end1 -= right->num; if (head && (head->op_type == DELETE)) head->num += right->num; free(right); prev->next = NULL; right = prev; } if ((!tmp_block1->next_exon || !tmp_block1->next_exon->to1) && left && (left->op_type == DELETE)) { diff -= 0+left->num; /* subtract GAP_OPEN = 0 */ tmp_block1->from1 += left->num; tmp_script = left->next; if (right == left) right = tmp_script; free(left); left = tmp_script; } *dist_ptr += diff; ali_dist += diff; a = seq1+tmp_block1->from1-1; b = seq2+tmp_block1->from2-1; tmpi = di_count = 0; tmp_script = left; while (tmp_script) { switch (tmp_script->op_type) { case DELETE: di_count += tmp_script->num; tmpi += tmp_script->num; a += tmp_script->num; break; case INSERT: di_count += tmp_script->num; tmpi += tmp_script->num; b += tmp_script->num; break; case SUBSTITUTE: for (i=0; inum; ++i, ++a, ++b) if (*a!=*b) tmpi++; else nmatches++; break; } tmp_script = tmp_script->next; } tmp_block1->alen = (int)((tmp_block1->to1-tmp_block1->from1+1+ tmp_block1->to2-tmp_block1->from2+1+di_count)/(double)2); tmp_block1->nmatches = tmp_block1->alen - tmpi; tmp_block1->match = (int)floor(100*(1-tmpi/(double)tmp_block1->alen)); right->next = head; head = left; tmp_block = tmp_block1; } /* at the beginning of the sequences */ if (tmp_block1!=NULL) { if ((diff=tmp_block->from2-tmp_block1->to2-1)!=0 && (diff!=N)) { enew = (edit_script_list *)ckalloc(sizeof(edit_script_list)); enew->next_script = *Aligns; 
*Aligns = enew; (*Aligns)->offset1 = tmp_block->from1; (*Aligns)->offset2 = tmp_block->from2; (*Aligns)->len1 = end1-(*Aligns)->offset1+1; (*Aligns)->len2 = end2-(*Aligns)->offset2+1; (*Aligns)->script = head; (*Aligns)->score = ali_dist; } else if (diff!=N) { /* modified to cut introns at the beginning of the sequence */ enew = (edit_script_list *)ckalloc(sizeof(edit_script_list)); enew->next_script = *Aligns; *Aligns = enew; (*Aligns)->offset1 = tmp_block->from1; (*Aligns)->offset2 = 1; (*Aligns)->len1 = end1-(*Aligns)->offset1+1; (*Aligns)->len2 = end2-(*Aligns)->offset2+1; (*Aligns)->script = head; (*Aligns)->score = ali_dist; } } *num_matches = nmatches; } static Exon *new_exon(int f1, int f2, int t1, int t2, int len, int edist, int flag, Exon *next) { Exon *new = (Exon *)ckalloc(sizeof(Exon)); new->from1 = f1; new->from2 = f2; new->to1 = t1; new->to2 = t2; new->length = (len < 0) ? (t2-f2+1) : len; new->edist = edist; new->flag = flag; new->next_exon = next; return new; } static void get_stats(Exon *lblock, sim4_stats_t *st) { Exon *t, *t1; #ifdef _STATS t = lblock; if (!t->next_exon) { /* no alignment found */ st->marginals = 2.0; } else { while (t) { if ((t1 = t->next_exon)!=NULL) { if (!t->to1 && t1->to1) st->marginals += (float)(t1->from2-1)/t1->to2; else if (t->to1 && !t1->to1) st->marginals += (float)(N-t->to2)/ (N-t->from2+1); else if (!t->to1 && !t1->to1) st->marginals = 2.0; } t = t1; } } st->marginals = st->marginals/2; #endif st->icoverage = 0; st->internal = 1; st->mult = 0; t = lblock->next_exon; while (t) { st->icoverage += t->length; if (t->length) st->mult++; t = t->next_exon; } st->fcoverage = ((float)st->icoverage)/N; t = lblock; if ((t->next_exon==NULL) || !t->next_exon->to1) st->internal = 0; while (t) { if ((t->to1) && ((t1=t->next_exon)!=NULL) && (t1->from2-t->to2-1>0) && t1->to1) st->internal = 0; t = t->next_exon; } } static int resolve_overlap(Exon *tmp_block, Exon *tmp_block1, uchar *seq1) { int diff, best_u, l0, l1, u, cost; int 
GTAG_score, CTAC_score; uchar *s1, *s2, *e1; diff = tmp_block1->from2-tmp_block->to2-1; if (diff>=0) return (tmp_block1->from2-1); /* resolve overlap using the GT-AG criterion */ /* u-1 = actual position in the sequence */ l0 = tmp_block->length-diff; l1 = tmp_block1->length; best_u = u = tmp_block1->from2-1; s1 = seq1+tmp_block->to1-(tmp_block->to2-u); s2 = seq1-2+tmp_block1->from1+u-tmp_block1->from2; cost = 0; e1 = seq1+tmp_block->to1; while (s1<=e1) { GTAG_score = CTAC_score = 0; GTAG_score += ((char)(*s1)=='G') ? 1 : 0; GTAG_score += ((char)(*(s1+1))=='T') ? 1 : 0; GTAG_score += ((char)(*s2)=='A') ? 1 : 0; GTAG_score += ((char)(*(s2+1))=='G') ? 1 : 0; if (GTAG_score > abs(cost) && ((l0>=8) || (l1>=8))) { cost = GTAG_score; best_u = u; if (cost == 4) break; } CTAC_score += ((char)(*s1)=='C') ? 1 : 0; CTAC_score += ((char)(*(s1+1))=='T') ? 1 : 0; CTAC_score += ((char)(*s2)=='A') ? 1 : 0; CTAC_score += ((char)(*(s2+1))=='C') ? 1 : 0; if (CTAC_score > abs(cost)) { cost = -CTAC_score; best_u = u; if (cost == 4) break; } u++; s1++; s2++; l0++; l1--; } return best_u; } static int greedy(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2, Exon **lblock, Exon **rblock) { int col, /* column number */ d, /* current distance */ k, /* current diagonal */ max_d, /* bound on size of edit script */ Cost, blower,flower, /* boundaries for searching diagonals */ bupper,fupper, row, /* row number */ DELTA, /* n-m */ MAX_D, B_ORIGIN, F_ORIGIN; int back, forth; /* backward and forward limits at exit */ int *blast_d, *flast_d, /* rows containing the last d (at crt step, d-1) */ *btemp_d, *ftemp_d; /* rows containing tmp values for the last d */ int *min_row, *min_diag, /* min (b)/ max (f) row (and diagonal) */ *max_row, *max_diag; /* reached for cost d=0, ... m. 
*/ DELTA = n-m; /*max_d = MAX_D = m+1; */ max_d = MAX_D = max(W,(int)(P*m+1)); if (DELTA<0) { if (m<=min(W,(1+P)*n)) { *lblock = *rblock = new_exon(offset2+1,offset1+1,offset2+n,offset1+m, m,n-m+(int)(P*m+1),0,NULL); return m-n+(int)(P*n+1); } else { *lblock = *rblock = NULL; return max(W,(int)(P*m+1))+1; } } F_ORIGIN = MAX_D; B_ORIGIN = MAX_D-DELTA; for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--) /*LINTED empty loop body*/; if (row == 0) { /* hit last row; stop search */ *lblock = *rblock = new_exon(offset2-m+n+1,offset1+1,offset2+n, offset1+m,m,0,0,NULL); return 0; } blast_d = (int *)ckalloc((MAX_D+n+1)*sizeof(int)); btemp_d = (int *)ckalloc((MAX_D+n+1)*sizeof(int)); for (k=0; k<=MAX_D+n; ++k) { blast_d[k]=m+1; btemp_d[k]=m+1; } blast_d[B_ORIGIN+DELTA] = row; blower = B_ORIGIN + DELTA - 1; bupper = B_ORIGIN + DELTA + 1; for (row=0; row 0 && col > 0 && (s1[row-1]==s2[col-1])) { --row; --col; } btemp_d[k] = row; /* if (row == 0 || col == 0) max_d = d; */ } /* for k */ min_row[d] = btemp_d[DELTA+B_ORIGIN]; min_diag[d] = DELTA+B_ORIGIN; for (k=blower; k<=bupper; ++k) { blast_d[k] = btemp_d[k]; btemp_d[k] = m+1; if (blast_d[k] d+Cost) || (max_d==d+Cost && (forth<0)))) { max_d = d+Cost; back = d; forth = Cost; break; } } --blower; ++bupper; /* for each relevant diagonal ... 
*/ for (k = flower; k <= fupper; k++) { /* process the next edit instruction */ /* find a d on diagonal k */ if (k==-d+F_ORIGIN) { /* move down from the last d-1 on diagonal k+1 */ row = flast_d[k+1]+1; /* DELETE */ } else if (k==d+F_ORIGIN) { /* move right from the last d-1 on diagonal k-1 */ row = flast_d[k-1]; /* INSERT */ } else if ((flast_d[k]>=flast_d[k+1]) && (flast_d[k]+1>=flast_d[k-1])) { /* substitution */ row = flast_d[k]+1; /* SUBSTITUTE */ } else if ((flast_d[k+1]+1>=flast_d[k-1]) && (flast_d[k+1]>=flast_d[k])) { /* move left from the last d-1 on diagonal k+1 */ row = flast_d[k+1]+1; /* DELETE */ } else { /* move right from the last d-1 on diagonal k-1 */ row = flast_d[k-1]; /* INSERT */ } /* code common to the three cases */ col = row + k - F_ORIGIN; /* slide down the diagonal */ if (row>=0) while (row < m && col < n && (s1[row]==s2[col])) { ++row; ++col; } ftemp_d[k] = row; /* if (row == m || col == n) max_d = d; */ } /* for k */ max_row[d] = ftemp_d[F_ORIGIN]; max_diag[d] = F_ORIGIN; for (k=flower; k<=fupper; ++k) { flast_d[k] = ftemp_d[k]; ftemp_d[k] = -1; if (flast_d[k]>max_row[d]) { max_row[d] = flast_d[k]; max_diag[d] = k; } } /* record backward and forward limits, if minimum combined * cost in overlapping. Note: it suffices to search up to * Cost=min(d,(max_d-d)). 
*/
        for (Cost=0; Cost<=d; Cost++) {
            if ((min_row[Cost]<=max_row[d]) &&
                ((max_d>d+Cost) || (max_d==d+Cost && (forth<0)))) {
                max_d = d+Cost;
                back = Cost;
                forth = d;
                break;
            }
        }
        --flower; ++fupper;

        ++d; /* for d */
    }

    /* no split found within the distance bound: release everything and
       report failure through NULL blocks */
    if (d>MAX_D) {
        *lblock = *rblock = NULL;

        free(blast_d); free(btemp_d);
        free(flast_d); free(ftemp_d);
        free(min_row); free(min_diag);
        free(max_row); free(max_diag);

        return d;
    }

/*fin:*/
    /* build the two blocks, splitting at min_row[back] or at
       max_row[forth], depending on the comparison below */
    if (m-min_row[back]>=max_row[forth]) {
        *rblock = new_exon(offset2+1+min_row[back]+min_diag[back]-B_ORIGIN,
                           offset1+1+min_row[back],
                           offset2+n,offset1+m,
                           m-min_row[back],back,0,NULL);
        *lblock = new_exon(offset2+1,offset1+1,
                           offset2+min_row[back]+max_diag[forth]-F_ORIGIN,
                           offset1+min_row[back],
                           min_row[back],forth,0,*rblock);
    } else {
        *rblock = new_exon(offset2+1+max_row[forth]+min_diag[back]-B_ORIGIN,
                           offset1+1+max_row[forth],
                           offset2+n,offset1+m,m-max_row[forth],back,0,NULL);
        *lblock = new_exon(offset2+1,offset1+1,
                           offset2+max_row[forth]+max_diag[forth]-F_ORIGIN,
                           offset1+max_row[forth],max_row[forth],forth,0,*rblock);
    }

    free(blast_d); free(btemp_d);
    free(flast_d); free(ftemp_d);
    free(min_row); free(min_diag);
    free(max_row); free(max_diag);

    return back+forth;
}

/* Reverses the exon list in place.  On return *left is the new first
   element (the old last one) and *right is the new last element (the old
   first one); any value passed in through *right is ignored.  An empty list
   leaves both NULL. */
void flip_list(Exon **left, Exon **right)
{
    Exon *ep, *ahead, *behind;

    *right = *left;
    ahead = *left;
    ep = NULL;
    while (ahead!=NULL) {
        /* re-point the current node at its former predecessor */
        behind = ep;
        ep = ahead;
        ahead = ahead->next_exon;
        ep->next_exon = behind;
    }
    *left = ep;
}

/* reverse a list of edit script chains */
/* Same pointer reversal as flip_list, over next_script; only the head
   pointer *left is updated. */
void script_flip_list(edit_script_list **left)
{
    edit_script_list *ep, *ahead, *behind;

    ahead = *left;
    ep = NULL;
    while (ahead!=NULL) {
        behind = ep;
        ep = ahead;
        ahead = ahead->next_script;
        ep->next_script = behind;
    }
    *left = ep;
}

/* operates on a list sorted in increasing order of exon coordinates */
/* Merges an exon into its predecessor when their diagonals (position in
   sequence 2 minus position in sequence 1, taken at the facing ends) differ
   by at most W and the gap between them in sequence 2 is at most
   MAX_INTERNAL_GAP.  The merged-in node is freed.  The walk stops at the
   first successor with to1 == 0; *Rblock is set to the element reached at
   that point.  *Lblock is only read. */
static void compact_list(Exon **Lblock, Exon **Rblock)
{
    Exon *tmp_block=*Lblock, *tmp_block1;
    int diff;

    while ((tmp_block!=NULL) &&
           ((tmp_block1=tmp_block->next_exon)!=NULL) &&
           tmp_block1->to1) {
        if ((abs((tmp_block1->from2-tmp_block1->from1) -
                 (tmp_block->to2-tmp_block->to1))<=W) &&
            ((diff=tmp_block1->from2-tmp_block->to2-1)<=MAX_INTERNAL_GAP)) {
            /* merge blocks */
            /* tmp_block is not advanced, so it is compared with its new
               successor on the next pass */
            tmp_block->to1 = tmp_block1->to1;
            tmp_block->to2 = tmp_block1->to2;
            tmp_block->length = tmp_block->to2-tmp_block->from2+1;
            tmp_block->edist += tmp_block1->edist;
            tmp_block->edist -= P*diff;
            tmp_block->next_exon = tmp_block1->next_exon;

            free(tmp_block1);
        } else
            tmp_block = tmp_block1;
    }
    /* reset right end of the list */
    *Rblock = tmp_block;
}

/* ------------------ memory management routines --------------- */

/* Appends a new node holding `data` to a ValNode list.  *head receives the
   node when *prev is NULL (first insertion); otherwise the node is linked
   after *prev.  *prev is then advanced to the new node.  The node comes
   from ckalloc. */
void link_to_data_list(Pointer data, ValNodePtr *head, ValNodePtr *prev)
{
    ValNodePtr curr;

    curr = (ValNodePtr)ckalloc(sizeof(struct ValNode));
    curr->data = data;
    curr->next = NULL;

    if(*prev == NULL)
        *head = curr;
    else
        (*prev)->next = curr;
    *prev = curr;
}

/* Frees every node of data_list together with the data pointer it holds. */
void ValNodeFreeData(ValNodePtr data_list)
{
    ValNodePtr tmp_node;

    while ((tmp_node=data_list)!=NULL) {
        free(tmp_node->data);
        data_list = data_list->next;    /* step on before the node is freed */
        free(tmp_node);
    }
}

/* Returns an integer bound that depends on `length`: 2 up to W/2,
   rs.cutoff below 2*W, and (int)(.75*P*length+1) from there on.  In the
   visible part of extend_bw the bound is compared against the current
   distance d. */
int good_ratio(int length)
{
    if (length<=W/2) return 2;
    else if (length<2*W) return rs.cutoff;
    else return (int)(.75*P*length+1);
}

static int extend_bw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2, int *line1, int *line2)
{
    int col,                    /* column number */
        row,                    /* row number */
        max_d,                  /* bound on the length of the edit script */
        d,                      /* current compressed distance */
        k,                      /* current diagonal */
        DELTA,                  /* n-m */
        ORIGIN,
        lower,
        upper;
    int *last_d, *temp_d;       /* column containing the last p */
    int *min_row, *min_diag;    /* min (b)/ max (f) row (and diagonal) */
                                /* reached for cost d=0, ... m. 
*/ DELTA = n-m; max_d = m+1; ORIGIN = m; for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--) /*LINTED empty loop body*/; if ((row == 0) || (col == 0)) { *line1 = row+offset1; *line2 = col+offset2; return 0; } last_d = (int *)ckalloc((m+n+1)*sizeof(int)); temp_d = (int *)ckalloc((m+n+1)*sizeof(int)); for (k=0; k<=m+n; ++k) last_d[k]=m+1; last_d[ORIGIN+DELTA] = row; lower = ORIGIN + DELTA - 1; upper = ORIGIN + DELTA + 1; min_row = (int *)ckalloc((m+1)*sizeof(int)); min_diag = (int *)ckalloc((m+1)*sizeof(int)); for (d=1; d<=m; d++) min_row[d] = m+1; min_row[0] = last_d[ORIGIN+DELTA]; min_diag[0] = ORIGIN + DELTA; d = 0; while ((++d<=max_d) && ((d-1<=good_ratio(m-min_row[d-1])) || ((d>=2) && (d-2<=good_ratio(m-min_row[d-2]))))) { /* for each relevant diagonal ... */ for (k = lower; k <= upper; k++) { /* find a d on diagonal k */ if (k==-d+DELTA+ORIGIN) { /* move down from the last d-1 on diagonal k+1 */ row = last_d[k+1]; /* op = INSERT; */ } else if (k==d+DELTA+ORIGIN) { /* move right from the last d-1 on diagonal k-1 */ row = last_d[k-1]-1; /* op = DELETE; */ } else if ((last_d[k]-1<=last_d[k+1]) && (last_d[k]-1<=last_d[k-1]-1)) { /* substitution */ row = last_d[k]-1; /* op = SUBSTITUTE; */ } else if ((last_d[k-1]-1<=last_d[k+1]) && (last_d[k-1]-1<=last_d[k]-1)) { /* move right from the last d-1 on diagonal k-1 */ row = last_d[k-1]-1; /* op = DELETE; */ } else { /* move left from the last d-1 on diagonal k+1 */ row = last_d[k+1]; /* op = INSERT; */ } /* code common to the three cases */ /* slide down the diagonal */ col = row+k-ORIGIN; while ((row > 0) && (col > 0) && (s1[row-1]==s2[col-1])) { row--; col--; } temp_d[k] = row; if ((row == 0) && (col == 0)) { /* hit southeast corner; have the answer */ free(last_d); free(temp_d); free(min_row); free(min_diag); *line1 = row+offset1; *line2 = col+offset2; return d; } if (row == 0) { /* hit first row; don't look further */ free(last_d); free(temp_d); free(min_row); free(min_diag); *line1 = 
row+offset1; *line2 = col+offset2; return d; } if (col == 0) { /* hit last column; don't look further */ free(last_d); free(temp_d); free(min_row); free(min_diag); *line1 = row+offset1; *line2 = col+offset2; return d; } } min_row[d] = last_d[ORIGIN+DELTA]; min_diag[d] = ORIGIN+DELTA; for (k=lower; k<=upper; ++k) if (temp_d[k]0) && (min_row[d-1]-min_row[d]<3)) d--; *line1 = min_row[d]+offset1; *line2 = min_row[d]+min_diag[d]-ORIGIN+offset2; free(min_row); free(min_diag); free(last_d); free(temp_d); return d; } static int extend_fw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2, int *line1, int *line2) { int col, /* column number */ row, /* row number */ max_d, /* bound on the length of the edit script */ d, /* current compressed distance */ k, /* current diagonal */ ORIGIN, lower, upper; int *last_d, *temp_d; /* column containing the last p */ int *max_row, *max_diag; /* min (b)/ max (f) row (and diagonal) */ /* reached for cost d=0, ... m. */ max_d = m+1; ORIGIN = m; for (row=0, col=0; col=2) && (d-2<=good_ratio(max_row[d-2]))))) { /* for each relevant diagonal ... 
*/ for (k = lower; k <= upper; k++) { /* find a d on diagonal k */ if (k==-d+ORIGIN) { /* move down from the last d-1 on diagonal k+1 */ row = last_d[k+1]+1; /* op = DELETE; */ } else if (k==d+ORIGIN) { /* move right from the last d-1 on diagonal k-1 */ row = last_d[k-1]; /* op = INSERT; */ } else if ((last_d[k]>=last_d[k+1]) && (last_d[k]+1>=last_d[k-1])) { /* substitution */ row = last_d[k]+1; /* op = SUBSTITUTE; */ } else if ((last_d[k+1]+1>=last_d[k-1]) && (last_d[k+1]>=last_d[k])) { /* move down from the last d-1 on diagonal k+1 */ row = last_d[k+1]+1; /* op = DELETE; */ } else { /* move right from the last d-1 on diagonal k-1 */ row = last_d[k-1]; /* op = INSERT; */ } /* code common to the three cases */ /* slide down the diagonal */ col = row+k-ORIGIN; if (row>=0) while ((row < m) && (col < n) && (s1[row]==s2[col])) { row++; col++; } temp_d[k] = row; if ((row == m) && (col == n)) { /* hit southeast corner; have the answer */ free(last_d); free(temp_d); free(max_row); free(max_diag); *line1 = row+offset1; *line2 = col+offset2; return d; } if (row == m) { /* hit last row; don't look further */ free(temp_d); free(last_d); free(max_row); free(max_diag); *line1 = row+offset1; *line2 = col+offset2; return d; } if (col == n) { /* hit last column; don't look further */ free(temp_d); free(last_d); free(max_row); free(max_diag); *line1 = row+offset1; *line2 = col+offset2; return d; } } max_row[d] = last_d[ORIGIN]; max_diag[d] = ORIGIN; for (k=lower; k<=upper; ++k) if (temp_d[k]>max_row[d]) { max_row[d] = temp_d[k]; max_diag[d] = k; } for (k=lower; k<=upper; k++) { last_d[k] = temp_d[k]; } --lower; ++upper; } /* report here the previous maximal match, stored in max_diag and max_row */ while ((d>0) && (max_row[d]-max_row[d-1]<3)) d--; *line1 = max_row[d]+offset1; *line2 = max_row[d]+max_diag[d]-ORIGIN+offset2; free(max_row); free(max_diag); free(last_d); free(temp_d); return d; /* if ((d>2) && (max_row[d-1]-max_row[d-2]<3)) { *line1 = max_row[d-2]+offset1; *line2 = 
max_row[d-2]+max_diag[d-2]-ORIGIN+offset2; free(max_row); free(max_diag); free(last_d); free(temp_d); return d-2; } *line1 = max_row[d-1]+offset1; *line2 = max_row[d-1]+max_diag[d-1]-ORIGIN+offset2; free(max_row); free(max_diag); free(last_d); free(temp_d); return d-1; */ } static void merge(Exon **t0, Exon **t1) { Exon *tmp0, *tmp1; int diff; if ((*t0) && !(*t0)->to1) tmp0 = (*t0)->next_exon; else tmp0 = *t0; while (tmp0 && (tmp0!=*t1)) { tmp1 = tmp0->next_exon; assert(tmp1!=NULL); if (tmp1 && tmp1->to1 && tmp0->to1 && (abs((tmp1->from2-tmp1->from1)-(tmp0->to2-tmp0->to1))<=W) && ((diff=tmp1->from2-tmp0->to2-1<=W))) { /* merge blocks tmp0 and tmp1 */ tmp0->from1 = min(tmp0->from1, tmp1->from1); tmp0->from2 = min(tmp0->from2, tmp1->from2); tmp0->to1 = max(tmp1->to1, tmp0->to1); tmp0->to2 = max(tmp1->to2, tmp0->to2); tmp0->length = tmp0->to2-tmp0->from2+1; tmp0->flag = tmp1->flag; tmp0->edist += tmp1->edist; tmp0->edist -= P*diff; if (tmp1==*t1) { /* tmp0->flag = (*t1)->flag; */ *t1 = tmp0; } tmp0->next_exon = tmp1->next_exon; free(tmp1); } else tmp0 = tmp0->next_exon; } } void free_align(edit_script_list *aligns) { edit_script_list *head; head = aligns; while ((head=aligns)!=NULL) { aligns = aligns->next_script; Free_script(head->script); free(head); } } void free_list(Exon *left) { Exon *tmp_block; while ((tmp_block=left)!=NULL) { left = left->next_exon; free(tmp_block); } } void free_table(void) { register struct hash_node *hptr, *tptr; register int hval; free(pnext_pos); for (hval=0; hvallink; free(tptr); } } } static Exon *bmatch (uchar *s1, uchar *s2, int len1, int len2, int offset1, int offset2) { int i, j, i1, score; Exon *new=NULL; for (i1=i=len1-3; i>=len2-3; i--, i1=i) { for (j=len2-3; j>=2; j--, i1--) if (*(s1+i1)!=*(s2+j)) break; if (j<2) { /* exact match for CDS found; check signals */ score = 0; if (*(s1+(i1--))==*(s2+(j--))) score++; if (*(s1+(i1--))==*(s2+(j--))) score++; if (*(s1+i1+len2-1)==*(s2+j+len2-1)) score++; if (*(s1+i1+len2)==*(s2+j+len2)) 
score++; if (score>=3) { new = new_exon(i1+3+offset1, offset2, i1+3+offset1+len2-5, offset2+len2-5, len2-4, 0, 0, NULL); new->ori = (G_score >= abs(C_score)) ? 'G' : 'C'; return new; } } } return NULL; } static Exon *fmatch (uchar *s1, uchar *s2, int len1, int len2, int offset1, int offset2) { int i, j, i1, score; Exon *new=NULL; for (i1=i=2; i=len2-2) { /* exact match found for internal part, look for signals */ score = 0; if (*(s1+(i1++))==*(s2+(j++))) score++; if (*(s1+(i1++))==*(s2+(j++))) score++; if (*(s1+i1-len2)==*s2) score++; if (*(s1+i1-len2+1)==*(s2+1)) score++; if (score>=3) { new = new_exon(i+offset1,offset2,i1+offset1-2,offset2+len2-5, len2-4,0,0,NULL); new->ori = (G_score >= abs(C_score)) ? 'G' : 'C'; return new; } } } return NULL; } #ifdef DEBUG static void debug_print_exons (Exon *lblock, const char *label) { Exon *tmp_block = lblock; (void)fprintf(stderr,"\n====================%s:\n\n", label); while (tmp_block) { (void)fprintf(stderr," [ %d, %d, %d, %d, l: %d ]\n ", tmp_block->from1, tmp_block->from2, tmp_block->to1, tmp_block->to2, tmp_block->length); tmp_block = tmp_block->next_exon; } } #endif /* -------------------- to be added to psublast ---------------------- */ void seq_toupper(uchar *seq, int len, char *filename) { int i=0, flag = 0; uchar *s=seq; for (; *s && (inext_exon)==NULL) || !t->to1) return FALSE; numx++; e2 = t->to2; while (((t=t->next_exon)!=NULL) && t->to1) { ++numx; if ((t->from2-e2>1) || (t!=rblock && ((t->to2-t->from2+1<2*w+2) || (t->to1-t->from1+1<2*w+2)))) return FALSE; e2 = t->to2; } return ((numx<3) ? 
FALSE:TRUE); } static void sync_slide_intron(int in_w, Exon **lblock, uchar *seq1, uchar *seq2) { Exon *t0=NULL, *t1=NULL, *head = *lblock; splice_t *g=NULL, *c=NULL, *cell=NULL; splice_t *Glist[500], *Clist[500]; int Gscore=0, Cscore=0; char oris[500]; int w1, w2, ni, i, numC, numG; memset(Glist, 0, 200*sizeof(splice_t *)); memset(Clist, 0, 200*sizeof(splice_t *)); ni = 0; numG = numC = 0; /* assume forward orientation */ t0 = head->next_exon; while (t0 && (t1=t0->next_exon) && t1->to1) { g = c = NULL; if (t1->from2-t0->to2-1==0) { if (!strncmp((char *)(seq1+t0->to1),"GT",2) && !strncmp((char *)(seq1+t1->from1-3),"AG",2)) { g = new_splice('G',t0->to1,t1->from1,t0->to2,t1->from2,-1,NULL); t0->ori = 'G'; oris[ni] = 'G'; numG++; } else if (!strncmp((char *)(seq1+t0->to1),"CT",2) && !strncmp((char *)(seq1+t1->from1-3),"AC",2)) { c = new_splice('C',t0->to1,t1->from1,t0->to2,t1->from2,-1,NULL); t0->ori = 'C'; oris[ni] = 'C'; numC++; } else { w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1)); w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1)); splice(seq1, t0->to1-w1, t0->to1+w1, t1->from1-w2, t1->from1+w2, seq2, t0->to2-w1, t1->from2+w2, &g, &c, BOTH); Gscore += g->score; Cscore += c->score; cell = NULL; oris[ni] = '*'; if (g->score>c->score) { numG++; cell = g; oris[ni] = 'G'; } else if (c->score>g->score) { numC++; cell = c; oris[ni] = 'C'; } else if (c->score==g->score) { numG++; numC++; cell = g; oris[ni] = 'G'; } t0->ori = oris[ni]; t0->to1 = cell->xs; t0->to2 = cell->ys; t1->from1 = cell->xe; t1->from2 = cell->ye; t0->length = t0->to2-t0->from2+1; t1->length = t1->to2-t1->from2+1; } Clist[ni] = c; Glist[ni] = g; } else { t0->ori = 'E'; oris[ni] = 'E'; } ni++; t0 = t1; } /* if (numG==ni) { for (i=0; i=numC) { /* revisit all previous assignments that are inconsistent */ for (i=0, t0=head->next_exon; inext_exon; switch (oris[i]) { case 'G': break; case 'C': if (Glist[i]==NULL) { /* compute the values for C */ w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1)); 
w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1)); splice(seq1, t0->to1-w1, t0->to1+w1, t1->from1-w2, t1->from1+w2, seq2, t0->to2-w1, t1->from2+w2, &g, &c, FWD); } else g = Glist[i]; t0->ori = 'G'; t0->to1 = g->xs; t0->to2 = g->ys; t1->from1 = g->xe; t1->from2 = g->ye; t0->length = t0->to2-t0->from2+1; t1->length = t1->to2-t1->from2+1; break; case 'E': break; default : fatal("sim4b1.c: intron orientation not initialized."); } if (oris[i]!='E') wobble(&t0,&t1,"GT","AG",seq1); } } else { /* analyze all assignments for consistency */ for (i=0, t0=head->next_exon; inext_exon; switch (oris[i]) { case 'C': break; case 'G': if (Clist[i]==NULL) { /* compute the values for C */ w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1)); w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1)); splice(seq1, t0->to1-w1, t0->to1+w1, t1->from1-w2, t1->from1+w2, seq2, t0->to2-w1, t1->from2+w2, &g, &c, BWD); } else c = Clist[i]; t0->ori = 'C'; t0->to1 = c->xs; t0->to2 = c->ys; t1->from1 = c->xe; t1->from2 = c->ye; t0->length = t0->to2-t0->from2+1; t1->length = t1->to2-t1->from2+1; break; case 'E': break; default : fatal("sim4b1.c: intron orientation not initialized."); } if (oris[i]!='E') wobble(&t0,&t1,"CT","AC",seq1); } } /* now free all memory allocated */ free_all: for (i=0; ito1; /* first nt of donor */ uchar *q = seq1+(*t1)->from1-3; /* first nt of acceptor */ if (!strncmp((char *)(s), donor, 2)) { /* match in place */ if (!strncmp((char *)(q), acceptor, 2)) { return; } else if (!strncmp((char *)(q-1), acceptor, 2)) { (*t1)->from1--; return; } else if (!strncmp((char *)(q+1), acceptor, 2)) { (*t1)->from1++; return; } } else if (!strncmp((char *)(s-1), donor, 2)) { /* match is 1 off to the left */ if (!strncmp((char *)(q), acceptor, 2)) { (*t0)->to1--; return; } else if (!strncmp((char *)(q-1), acceptor, 2)) { (*t0)->to1--; (*t1)->from1--; (*t0)->to2--; (*t1)->from2--; (*t0)->length++; (*t1)->length--; return; } else if (!strncmp((char *)(q+1), acceptor, 2)) { (*t0)->to1--; 
(*t1)->from1++; return; } } else if (!strncmp((char *)(s+1), donor, 2)) { /* match is 1 off to the right */ if (!strncmp((char *)(q), acceptor, 2)) { (*t0)->to1++; return; } else if (!strncmp((char *)(q-1), acceptor, 2)) { (*t0)->to1++; (*t1)->from1--; return; } else if (!strncmp((char *)(q+1), acceptor, 2)) { (*t0)->to1++; (*t1)->from1++; (*t0)->to2++; (*t1)->from2++; (*t0)->length--; (*t1)->length++; return; } } else if (!strncmp((char *)(q-1), acceptor, 2)) { /* match is 1 off to the left */ (*t1)->from1--; return; } else if (!strncmp((char *)(q+1), acceptor, 2)) { /* match is 1 off to the right */ (*t1)->from1++; return; } return; } static void slide_intron(int in_w, Exon **lblock, uchar *seq1, uchar *seq2) { Exon *t0, *t1, *head = *lblock; splice_t *g, *c, *cell; char type; int w1, w2; t0 = head->next_exon; while (t0 && (t1=t0->next_exon) && t1->to1) { g = c = NULL; if (t1->from2-t0->to2-1==0) { if (!strncmp((char *)(seq1+t0->to1),"GT",2) && !strncmp((char *)(seq1+t1->from1-3),"AG",2)) t0->ori = 'G'; else if (!strncmp((char *)(seq1+t0->to1),"CT",2) && !strncmp((char *)(seq1+t1->from1-3),"AC",2)) t0->ori = 'C'; else { int gtag=0, ctac=0; uchar *s; w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1)); w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1)); splice(seq1, t0->to1-w1, t0->to1+w1, t1->from1-w2, t1->from1+w2, seq2, t0->to2-w1, t1->from2+w2, &g, &c, BOTH); if (g->score>c->score) { cell = g; type = 'G'; } else if (c->score>g->score) { cell = c; type = 'C'; } else { cell = g; type = 'G'; } t0->to1 = cell->xs; t0->to2 = cell->ys; t1->from1 = cell->xe; t1->from2 = cell->ye; t0->length = t0->to2-t0->from2+1; t1->length = t1->to2-t1->from2+1; wobble(&t0,&t1,(type=='G')? "GT":"CT",(type=='G')? 
"AG":"AC",seq1); free(g); free(c); /* determine the type, based on the # matches w/ GT-AG (CT-AC) */ s = seq1+t0->to1; if (*s=='G') gtag++; else if (*s=='C') ctac++; ++s; if (*s=='T') { gtag++; ctac++;} s = seq1+t1->from1-3; if (*s=='A') { gtag++; ctac++; } ++s; if (*s=='G') gtag++; else if (*s=='C') ctac++; if (gtag>ctac) type = 'G'; else if (ctac>gtag) type = 'C'; else type = 'N'; t0->ori = type; } } else t0->ori = 'E'; t0 = t1; } } #ifdef AUXUTILS static void remove_polyA_tails(Exon *lblock, uchar *seq1, uchar *seq2, int len2) { Exon *t, *prev; uchar *s, *q; int xcut, diff, u, tmp, I, J, first = 1; t = lblock->next_exon; prev = lblock; s = seq2; q = seq2+len2-1; while (s<=q && *s=='T') s++; if (s>=seq2+t->from2-1) { while (t && t->to1) { s = seq2+t->from2-1; q = seq2+t->to2-1; if (first && strncmp((char *)s,"TTTTT",5)) break; first = 0; while ((s<=q) && (*s=='T')) s++; diff = t->to2-(s-seq2); u = min(diff,12); xcut = t->to2-t->from2+1-diff; if (diff>6) { tmp = (int)((1+P)*u); /* was diff */ /* EXTEND_BW(s,seq1+t->to1-(diff-u)-tmp,u,tmp, s-seq2,t->to1-(diff-u)-tmp,&I,&J); */ EXTEND_BW(seq2+t->from2+xcut-1, seq1+t->from1+(int)((1-P)*xcut)-1, u,(int)(P*xcut+(1+P)*u)+1, /* 1 is for looser margin */ t->from2+xcut-1,t->from1+(int)((1-P)*xcut)-1, &I,&J); t->from2 = I+1; t->from1 = J+1; t->length = t->to2-t->from2+1; break; } else if (diff>0) { prev->next_exon = t->next_exon; free(t); t = prev->next_exon; tmp = (int)((1+P)*diff); EXTEND_BW(s,seq1+t->from1-tmp,diff,tmp, s-seq2,t->from1-tmp,&I,&J); t->from2 = I+1; t->from1 = J+1; t->length = t->to2-t->from2+1; break; } else { /* remove entire exon and repeat the process */ prev->next_exon = t->next_exon; free(t); t = prev->next_exon; continue; } } } while (t && t->next_exon && t->next_exon->to1) { prev = t; t = t->next_exon; } first = 1; s = seq2; q = seq2+len2-1; while (q>=s && *q=='A') q--; if (t && t->to1 && qto2-1) { while (t && t->to1) { s = seq2+t->to2-1; q = seq2+t->from2-1; if (first && strncmp((char *)(s-4), 
"AAAAA", 5)) break; first = 0; while ((s>=q) && (*s=='A')) s--; diff = (int)(s-seq2+1)-t->from2+1; u = min(diff, 12); xcut = t->to2-t->from2+1-diff; if (diff>6) { /* EXTEND_FW(seq2+t->from2+(diff-u)-1, seq1+t->from1+(diff-u)-1, u, (int)((1+P)*u), t->from2+(diff-u)-1, t->from1+(diff-u)-1, &I, &J); */ EXTEND_FW(s-u+1, seq1+t->to1-(int)((1+P)*xcut)-(int)((1+P)*u), u,(int)((1+P)*u+P*xcut)+1, /*+1 is for looser margin */ t->from2+(diff-u)-1, t->to1-(int)((1+P)*xcut)-(int)((1+P)*u), &I, &J); t->to1 = J; t->to2 = I; t->length = t->to2-t->from2+1; break; } else if (diff>0) { if (prev==NULL) prev = find_previous(lblock, t); assert(prev!=NULL); prev->next_exon = t->next_exon; free(t); t = prev->next_exon; prev = NULL; EXTEND_FW(seq2+t->to2, seq1+t->to1, diff, (int)((1+P)*diff), t->to2, t->to1, &I, &J); t->to1 = J; t->to2 = I; t->length = t->to2-t->from2+1; break; } else { if (prev==NULL) prev = find_previous(lblock, t); assert(prev!=NULL); prev->next_exon = t->next_exon; free(t); t = prev->next_exon; prev = NULL; continue; } } } return; } #endif Exon *find_previous(Exon *lblock, Exon *tgt) { Exon *t=lblock; while (t && (t->next_exon!=tgt)) t = t->next_exon; if (t==NULL) fatal("sim4b1.c: Corrupted exon list: could not find previous."); return t; } static bool get_match_quality(Exon *lblock, Exon *rblock, sim4_stats_t *st, int N) { int tcov; bool good_match; Exon *t; good_match = TRUE; st->icoverage = 0; t = lblock->next_exon; while (t->to1) { st->icoverage += t->to2-t->from2+1; if (100*t->edist>=5*(t->to2-t->from2+1)) { good_match = FALSE; break; } t = t->next_exon; } tcov = rblock->to2-lblock->next_exon->from2+1; if (lblock->next_exon->from2>=.5*N && tcov>=.8*(N-lblock->next_exon->from2) && st->icoverage>=max(.95*tcov,100)) ; else if (rblock->to2<=.5*N && tcov>=.8*rblock->to2 && st->icoverage>=max(.95*tcov,100)) ; else if ((tcov<.8*N) || (st->icoverage<.9*tcov)) good_match = FALSE; return good_match; } sim4.2012-10-10/sim4b1.h0000444000515200116500000000165407733353757014113 
0ustar floreasalzberg#ifndef SIM4B1_H #define SIM4B1_H /* $Id: sim4b1.h,v 1.16 2001/10/12 03:33:12 florea Exp $ */ extern uchar *seq1, *seq2; extern int M, N, encoding[NACHARS]; extern int file_type; extern coords last_GT, last_CT, last_AG, last_AC; extern sim4_args_t rs; Exon *find_previous(Exon *,Exon *); void script_flip_list(edit_script_list **); void link_to_data_list(Pointer,ValNodePtr *,ValNodePtr *); void ValNodeFreeData(ValNodePtr); int good_ratio(int); void flip_list(Exon **,Exon **); void free_list(Exon *); void free_table(void); void free_align(edit_script_list *); void seq_toupper(uchar *, int, char *); void complement_exons(Exon **,int,int); void print_exons(Exon *); void print_pipmaker_exons(Exon *,edit_script_list *,char *,int,int,int,int,uchar *,uchar*,int); void bld_table(uchar *,int,int,int); struct edit_script_list *SIM4(uchar*,uchar*,int,int,int,int,int,int,int,int*,int*,int*,Exon**,sim4_stats_t *); #endif /* SIM4B1_H */ sim4.2012-10-10/splice.c0000444000515200116500000002661607733353757014273 0ustar floreasalzberg#include "libc.h" #include "sim4.h" #include "psublast.h" #include "splice.h" #ifndef __lint /*@unused@*/ static const char rcsid[] = "$Id: splice.c,v 1.15 2002/03/03 23:29:48 florea Exp $"; #endif static int spl_encode[NACHARS]; static int encodeInit; signal_t gt = {{0, 0, 0, 2, 0},{0, 0, 0, 2, 0},{2, 3, 2, 5, 2},{0, 0, 0, 2, 0},{0, 0, 0, 2, 0}}; signal_t ct = {{0, 0, 0, 2, 0},{2, 2, 2, 5, 2},{0, 0, 0, 2, 0},{0, 0, 0, 2, 0},{0, 0, 0, 2, 0}}; signal_t ag = {{2, 2, 5, 2, 2},{0, 0, 2, 0, 0},{0, 0, 2, 0, 0},{0, 0, 2, 0, 0},{0, 0, 2, 0, 0}}; signal_t ac = {{2, 5, 2, 2, 2},{0, 2, 0, 0, 0},{0, 3, 0, 0, 0},{0, 2, 0, 0, 0},{0, 2, 0, 0, 0}}; #ifdef DEBUG static void print_splice(splice_t *); #endif static void splice_donor(uchar *xseq, uchar *yseq, int M, int N, int *gt_score, int *ct_score, int **max_Gf, int **max_Cf, int **start_Gi, int **start_Ci); static void splice_donor_uni(uchar *xseq, uchar *yseq, int M, int N, int *It_score, int 
**max_IF, int **end_Ii); static void splice_acceptor(uchar *xseq, uchar *yseq, int M, int N, int *ag_score, int *ac_score, int **max_Gb, int **max_Cb, int **end_Gi, int **end_Ci); static void splice_acceptor_uni(uchar *xseq, uchar *yseq, int M, int N, int *aI_score, int **max_Ib, int **end_Ii); static int stepct(int n); void splice(uchar *in_seqx, int ls, int us, int le, int ue, uchar *in_seqy, int ys, int ye, splice_t **gcell, splice_t **ccell, int ori) { int p, q, *gtscore=NULL, *ctscore=NULL, *agscore=NULL, *acscore=NULL; int i, tmp; int maxCscore, maxGscore, Gxs, Gxe, Gy, Cxs, Cxe, Cy, keep_Ci, keep_Gi; int *max_Cf=NULL, *max_Gf=NULL, *max_Cb=NULL, *max_Gb=NULL; int *start_Gi=NULL, *start_Ci=NULL, *end_Gi=NULL, *end_Ci=NULL; uchar *s; if (!encodeInit) { for (i=0; i=0; q--, s--) agscore[q] = ag[spl_encode[*(s-1)]][spl_encode[*s]]; } if (ori==BWD || ori==BOTH) { ctscore = (int *)ckalloc(((us-ls+2)+(ue-le+2))*sizeof(int)); acscore = ctscore+(us-ls+2); for (p=0, s=in_seqx+ls-1; p<=us-ls+1; p++, s++) ctscore[p] = ct[spl_encode[*s]][spl_encode[*(s+1)]]; for (q=ue-le+1, s=in_seqx+ue-1; q>=0; q--, s--) acscore[q] = ac[spl_encode[*(s-1)]][spl_encode[*s]]; } if (ori==FWD) { splice_donor_uni(in_seqx+ls-1, in_seqy+ys-1, us-ls+1, ye-ys+1, gtscore, &max_Gf, &start_Gi); splice_acceptor_uni(in_seqx+le-1, in_seqy+ys-1, ue-le+1, ye-ys+1, agscore, &max_Gb, &end_Gi); free(gtscore); /* free(agscore); */ } else if (ori==BWD) { splice_donor_uni(in_seqx+ls-1, in_seqy+ys-1, us-ls+1, ye-ys+1, ctscore, &max_Cf, &start_Ci); splice_acceptor_uni(in_seqx+le-1, in_seqy+ys-1, ue-le+1, ye-ys+1, acscore, &max_Cb, &end_Ci); free(ctscore); /* free(acscore); */ } else { splice_donor(in_seqx+ls-1, in_seqy+ys-1, us-ls+1, ye-ys+1, gtscore, ctscore, &max_Gf, &max_Cf, &start_Gi, &start_Ci); splice_acceptor(in_seqx+le-1, in_seqy+ys-1, ue-le+1, ye-ys+1, agscore, acscore, &max_Gb, &max_Cb, &end_Gi, &end_Ci); free(gtscore); /* free(agscore); */ free(ctscore); /* free(acscore); */ } maxCscore = -999999; 
maxGscore = -999999; Gxs = Gxe = Gy = Cxs = Cxe = Cy = -1; if (ori==FWD || ori==BOTH) { for (i=0; i<=ye-ys+1; i++) { if ((tmp=max_Gf[i]+max_Gb[i])>maxGscore) { maxGscore = tmp; /* save (i, start_Gi[i], end_Gi[i]); */ Gxs = ls+start_Gi[i]-1; Gxe = le+end_Gi[i]-1; Gy = ys+i-1; keep_Gi = i; } } free(max_Gf); free(max_Gb); /* free(start_Gi); free(end_Gi); */ } if (ori==BWD || ori==BOTH) { for (i=0; i<=ye-ys+1; i++) { if ((tmp=max_Cf[i]+max_Cb[i])>maxCscore) { maxCscore = tmp; /* save (i, start_Ci[i], end_Ci[i]); */ Cxs = ls+start_Ci[i]-1; Cxe = le+end_Ci[i]-1; Cy = ys+i-1; keep_Ci = i; } } free(max_Cf); free(max_Cb); /* free(start_Ci); free(end_Ci); */ } *gcell = new_splice('G', Gxs, Gxe, Gy, Gy+1, maxGscore, NULL); *ccell = new_splice('C', Cxs, Cxe, Cy, Cy+1, maxCscore, NULL); #ifdef DEBUG print_splice(*gcell); print_splice(*ccell); #endif return; } splice_t *new_splice(char c, int xs, int xe, int ys, int ye, int score, splice_t *next) { splice_t *sp = ckalloc(sizeof(splice_t)); sp->type = c; sp->xs = xs; sp->xe = xe; sp->ys = ys; sp->ye = ye; sp->score = score; sp->next = next; return sp; } #ifdef DEBUG static void print_splice(splice_t *g) { printf("Type: %c sx: %d se: %d ys: %d score: %d\n", g->type, g->xs, g->xe, g->ys, g->score); } #endif static void splice_donor(uchar *xseq, uchar *yseq, int M, int N, int *gt_score, int *ct_score, int **max_Gf, int **max_Cf, int **start_Gi, int **start_Ci) { int *CCf, *mG, *mC, *sC, *sG, *X; int i, j, tmp, ss, ssx, cx, c; uchar *s, *t; CCf = (int *)ckalloc((M+1)*sizeof(int)); X = (int *)ckalloc((M+1)*sizeof(int)); mG = *max_Gf = (int *)ckalloc((2*N+2)*sizeof(int)); sG = *start_Gi = mG+(N+1); mC = *max_Cf = (int *)ckalloc((2*N+2)*sizeof(int)); sC = *start_Ci = mC+(N+1); t = yseq; X[0] = CCf[0] = 0; for (j=1; j<=M; j++) { CCf[j] = j; X[j] = 0; } mG[0] = mC[0] = -999999; for (j=0; j<=M; j++) { if ((100*gt_score[j])>mG[0]) { mG[0] = 100*gt_score[j]; sG[0] = j; } if ((100*ct_score[j])>mC[0]) { mC[0] = 100*ct_score[j]; sC[0] = j; } } 
for (i=1; i<=N; i++, t++) { s = xseq; ss = CCf[0]; ssx = X[0]; c = ++CCf[0]; cx = X[0]; for (j=1; j<=M; j++, s++) { tmp=min(min(CCf[j]+1, ss+(*t!=*s)),c+1); if (tmp==c+1); else if (tmp==CCf[j]+1) cx = X[j]; else cx = ssx + (*t==*s); c = tmp; ss = CCf[j]; CCf[j] = c; ssx = X[j]; X[j] = cx; } /* compute max_Gf and max_Cf */ mG[i] = mC[i] = -999999; for (j=0; j<=M; j++) { assert(X[j]+CCf[j]!=0); tmp = (int)(stepct(j)*X[j]/(double)(X[j]+CCf[j])*100); if ((tmp+100*gt_score[j])>mG[i]) { mG[i] = tmp+100*gt_score[j]; sG[i] = j; } if ((tmp+100*ct_score[j])>mC[i]) { mC[i] = tmp+100*ct_score[j]; sC[i] = j; } } } free(CCf); free(X); } static void splice_donor_uni(uchar *xseq, uchar *yseq, int M, int N, int *It_score, int **max_If, int **start_Ii) { int *CCf, *mI, *sI, *X; int i, j, tmp, ss, ssx, cx, c; uchar *s, *t; CCf = (int *)ckalloc((M+1)*sizeof(int)); X = (int *)ckalloc((M+1)*sizeof(int)); mI = *max_If = (int *)ckalloc((2*N+2)*sizeof(int)); sI = *start_Ii = mI+(N+1); t = yseq; X[0] = CCf[0] = 0; for (j=1; j<=M; j++) { CCf[j] = j; X[j] = 0; } mI[0] = -999999; for (j=0; j<=M; j++) if ((100*It_score[j])>mI[0]) { mI[0] = 100*It_score[j]; sI[0] = j; } for (i=1; i<=N; i++, t++) { s = xseq; ss = CCf[0]; ssx = X[0]; c = ++CCf[0]; cx = X[0]; for (j=1; j<=M; j++, s++) { tmp=min(min(CCf[j]+1, ss+(*t!=*s)),c+1); if (tmp==c+1); else if (tmp==CCf[j]+1) cx = X[j]; else cx = ssx + (*t==*s); c = tmp; ss = CCf[j]; CCf[j] = c; ssx = X[j]; X[j] = cx; } /* compute max_If */ mI[i] = -999999; for (j=0; j<=M; j++) { assert(X[j]+CCf[j]!=0); tmp = (int)(stepct(j)*X[j]/(double)(X[j]+CCf[j])*100)+100*It_score[j]; if (tmp>mI[i]) { mI[i] = tmp; sI[i] = j; } } } free(CCf); free(X); } static void splice_acceptor(uchar *xseq, uchar *yseq, int M, int N, int *ag_score, int *ac_score, int **max_Gb, int **max_Cb, int **end_Gi, int **end_Ci) { int *CCb, *X, *mC, *mG, *eC, *eG; int tmp, i, j, ss, ssx, cx, c; uchar *t, *s; CCb = (int *)ckalloc((M+1)*sizeof(int)); X = (int *)ckalloc((M+1)*sizeof(int)); mG = 
*max_Gb = (int *)ckalloc((2*N+2)*sizeof(int)); eG = *end_Gi = mG+(N+1); mC = *max_Cb = (int *)ckalloc((2*N+2)*sizeof(int)); eC = *end_Ci = mC+(N+1); t = yseq+N-1; CCb[M] = X[M] = 0; for (j=M-1; j>=0; j--) { CCb[j] = M-j; X[j] = 0; } mG[N] = mC[N] = -999999; for (j=M; j>=0; j--) { if ((100*ag_score[j])>mG[N]) { mG[N] = 100*ag_score[j]; eG[N] = j+1; } if ((100*ac_score[j])>mC[N]) { mC[N] = 100*ac_score[j]; eC[N] = j+1; } } for (i=N-1; i>=0; i--, t--) { s = xseq+M-1; ss = CCb[M]; ssx = X[M]; c = ++CCb[M]; cx = X[M]; for (j=M-1; j>=0; j--, s--) { tmp=min(min(CCb[j]+1, ss+(*t!=*s)),c+1); if (tmp==c+1) ; else if (tmp==CCb[j]+1) cx = X[j]; else cx = ssx + (*t==*s); c = tmp; ss = CCb[j]; CCb[j] = c; ssx = X[j]; X[j] = cx; } /* compute max_Gb and max_Cb */ mG[i] = -999999; mC[i] = -999999; for (j=M; j>=0; j--) { assert(CCb[j]+X[j]!=0); tmp = (int)(stepct(M-j)*X[j]/(double)(CCb[j]+X[j])*100); if ((tmp+100*ag_score[j])>mG[i]) { mG[i] = tmp+100*ag_score[j]; eG[i] = j+1; } if ((tmp+100*ac_score[j])>mC[i]) { mC[i] = tmp+100*ac_score[j]; eC[i] = j+1; } } } free(CCb); free(X); } static void splice_acceptor_uni(uchar *xseq, uchar *yseq, int M, int N, int *aI_score, int **max_Ib, int **end_Ii) { int *CCb, *X, *mI, *eI; int tmp, i, j, ss, ssx, cx, c; uchar *t, *s; CCb = (int *)ckalloc((M+1)*sizeof(int)); X = (int *)ckalloc((M+1)*sizeof(int)); mI = *max_Ib = (int *)ckalloc((2*N+2)*sizeof(int)); eI = *end_Ii = mI+(N+1); t = yseq+N-1; CCb[M] = X[M] = 0; for (j=M-1; j>=0; j--) { CCb[j] = M-j; X[j] = 0; } mI[N] = -999999; for (j=M; j>=0; j--) if ((100*aI_score[j])>mI[N]) { mI[N] = 100*aI_score[j]; eI[N] = j+1; } for (i=N-1; i>=0; i--, t--) { s = xseq+M-1; ss = CCb[M]; ssx = X[M]; c = ++CCb[M]; cx = X[M]; for (j=M-1; j>=0; j--, s--) { tmp=min(min(CCb[j]+1, ss+(*t!=*s)),c+1); if (tmp==c+1) ; else if (tmp==CCb[j]+1) cx = X[j]; else cx = ssx + (*t==*s); c = tmp; ss = CCb[j]; CCb[j] = c; ssx = X[j]; X[j] = cx; } /* compute max_Ib */ mI[i] = -999999; for (j=M; j>=0; j--) { 
assert(CCb[j]+X[j]!=0); tmp = (int)(stepct(M-j)*X[j]/(double)(CCb[j]+X[j])*100)+100*aI_score[j]; if (tmp>mI[i]) { mI[i] = tmp; eI[i] = j+1; } } } free(CCb); free(X); } static int stepct(int n) { if (n<0) fatal("splice.c: Negative value in stepct()."); if (n<=4) return 9; if (n<=8) return 10; if (n<=12) return 12; return 12; } sim4.2012-10-10/splice.h0000444000515200116500000000063607733353757014272 0ustar floreasalzberg#ifndef SPLICE_H #define SPLICE_H /* "$Id: splice.h,v 1.6 2001/12/23 05:29:44 florea Exp $"; */ extern signal_t gt; extern signal_t ct; extern signal_t ag; extern signal_t ac; splice_t *new_splice(char,int,int,int,int,int,splice_t *); void splice(uchar *in_seqx, int ls, int us, int le, int ue, uchar *in_seqy, int ys, int ye, splice_t **g, splice_t **c, int ori); #endif /* SPLICE_H */ sim4.2012-10-10/args.c0000444000515200116500000000461607733353757013744 0ustar floreasalzberg#include "libc.h" #include "types.h" #include "misc.h" #include "args.h" #ifndef __lint static const char rcsid[] = "$Id: args.c,v 1.3 2000/09/15 17:57:02 florea Exp $"; #endif static int argc; static char **argv; char *argv0; /* ckargs -- check that only certain parameters are set on the command line */ void ckargs(const char *options, int argcx, char **argvx, int non_options) { int i; argc = argcx; argv = argvx; argv0 = argv0 ? 
argv0 : argv[0]; for (i = non_options+1; i < argc; ++i) if (argv[i][1] != '=') fatalf("Improper command option: '%s'.", argv[i]); else if (!strchr(options, argv[i][0])) fatalf("Available options: %s\n", options); } /* get_argval --------------------- get the value of a command-line argument */ bool get_argval(int c, int *val_ptr) { int i; ck_argc("get_argval"); for (i = 0; i < argc; ++i) if (argv[i][0] == c && argv[i][1] == '=') { *val_ptr = atoi(argv[i]+2); return 1; } return 0; } /* get_fargval --------------- get the float value of a command-line argument */ bool get_fargval(int c, double *val_ptr) { int i; ck_argc("get_fargval"); for (i = 0; i < argc; ++i) if (argv[i][0] == c && argv[i][1] == '=') { *val_ptr = atof(argv[i]+2); return 1; } return 0; } /* get_strargval ---------- get the string value of a command-line argument */ bool get_strargval(int c, char **val_ptr) { int i; ck_argc("get_strargval"); for (i = 0; i < argc; ++i) if (argv[i][0] == c && argv[i][1] == '=') { *val_ptr = (char *) ckalloc(strlen(argv[i]+2)+1); strcpy(*val_ptr, argv[i]+2); return 1; } return 0; } bool get_cargval(int c, char **valp) { int i; ck_argc("get_cargval"); for (i = 0; i < argc; ++i) if (argv[i][0] == c && argv[i][1] == '=') { *valp = argv[i]+2; return 1; } return 0; } void fprintf_argv(FILE* fp) { int i; fprintf(fp, "%s", argv0); for (i = 1; i < argc; ++i) (void)fprintf(fp, " %s", argv[i]); } /* ck_argc - die if argc is unknown */ void ck_argc(const char *proc_name) { if (argc == 0) fatalf("Call ckargs() before %s.\n", proc_name); } sim4.2012-10-10/args.h0000444000515200116500000000067707733353757013754 0ustar floreasalzberg#ifndef SIM_ARGS_H #define SIM_ARGS_H /* $Id: args.h,v 1.2 2000/09/15 17:57:02 florea Exp $ */ typedef struct argv_scores { double E; int I; int M; int O; int V; } argv_scores_t; bool get_argval(int, int *); bool get_fargval(int, double *); bool get_strargval(int, char **); bool get_cargval(int, char **); void ckargs(const char *, int , char **, int ); 
void fprintf_argv(FILE* fp); void ck_argc(const char *); extern char *argv0; #endif sim4.2012-10-10/charvec.c0000444000515200116500000000335407733353757014421 0ustar floreasalzberg/* genvec charvec char ; 1999-10-05 22:59:09 */ #include "charvec.h" charvec_t* charvec_new(void* ((*ra)(void*,size_t)), void (*fr)(void*)) { charvec_t *vec = ra(0, sizeof(*vec)); if (vec) { if (charvec_init(vec, ra, fr)) return vec; fr(vec); } return 0; } charvec_t* charvec_free(charvec_t *t) { charvec_fini(t); t->free(t); return 0; } int charvec_init(charvec_t *t, void* ((*a)(void*,size_t)), void (*f)(void*)) { assert(t); t->a = 0; t->len = 0; t->max = 0; t->alloc = a; t->free = f; return charvec_need(t, 0); } int charvec_fini(charvec_t *t) { assert(t); if (t->a && t->free) { t->free(t->a); t->a = 0; t->max = 0; } t->len = 0; return 1; } #ifndef BASE_ALLOC #define BASE_ALLOC 30 #endif enum { BASE = BASE_ALLOC }; int charvec_need(charvec_t *t, unsigned int n) { assert(t); if (t->a == 0) { assert(t->alloc); t->len = 0; t->max = n; t->a = t->alloc(0, n * sizeof(char)); return t->a != 0; } if (n > t->max) { unsigned int i = BASE + n + (n >> 3); void *p = t->alloc(t->a, i * sizeof(char)); if (!p) return 0; t->max = i; t->a = p; } return 1; } int charvec_more(charvec_t *t, unsigned int n) { assert(t); return charvec_need(t, n + t->len); } int charvec_append(charvec_t *t, char e) { assert(t); if (!charvec_more(t, 1)) return 0; t->a[t->len++] = e; return 1; } int charvec_fit(charvec_t *t) { assert(t); assert(t->alloc); { unsigned int i = t->len; void *p = t->alloc(t->a, i * sizeof(char)); if (!p) return 0; t->max = i; t->a = p; return 1; } } sim4.2012-10-10/charvec.h0000444000515200116500000000165707733353757014432 0ustar floreasalzberg/* genvec charvec char ; 1999-10-05 22:59:09 */ #ifndef HAS_GEN_charvec_H #define HAS_GEN_charvec_H #include #include #include typedef struct charvec { char *a; unsigned int len; unsigned int max; void *((*alloc)(void*, size_t)); void (*free)(void*); } charvec_t; 
#define charvec_INIT(a,f) {0, 0, 0, a, f} charvec_t* charvec_new(void* ((*)(void*, size_t)), void ((*)(void*))); charvec_t* charvec_free(charvec_t *t); int charvec_init(charvec_t *t, void* ((*)(void*,size_t)), void((*)(void*))); int charvec_fini(charvec_t *t); int charvec_need(charvec_t *t, unsigned int n); int charvec_more(charvec_t *t, unsigned int n); int charvec_append(charvec_t *t, char e); int charvec_fit(charvec_t *t); #ifndef GENVEC_INBOUNDS #define GENVEC_INBOUNDS(t,n) ((0<=(n))&&((n)<(t)->len)) #endif #ifndef GENVEC_GET #define GENVEC_GET(t,n) (assert(GENVEC_INBOUNDS(t,n)) , (t)->a[n]) #endif #endif sim4.2012-10-10/discrim.c0000444000515200116500000000136707733353757014442 0ustar floreasalzberg#include "libc.h" #include "types.h" #include "misc.h" #include "args.h" #include "seq.h" #include "dna.h" #include "discrim.h" #ifndef __lint static const char rcsid[] = "$Id: discrim.c,v 1.2 2000/06/05 22:48:19 florea Exp $"; #endif /* DNA characters */ const uchar dchars[] = "ABCDGHKMNRSTVWXY"; static int is_dchar(int ch); bool is_DNA(uchar *s, int len) { int ACGT, i; for (ACGT = i = 0; i < len; ++i) if (strchr("ACGTNXacgtnx", s[i])) ++ACGT; if (10*ACGT < 9*len) /* ACGT < 90% of len */ return 0; for (i = 0; i < len; ++i) if (!is_dchar(s[i])) { fatalf("Illegal character '%c' in sequence file.\n", s[i]); exit(1); } return 1; } static int is_dchar(int ch) { return !!strchr((const char*)dchars, toupper(ch)); } sim4.2012-10-10/discrim.h0000444000515200116500000000022107733353757014433 0ustar floreasalzberg#ifndef SIM_DISCRIM_H #define SIM_DISCRIM_H /* $Id: discrim.h,v 1.1 2000/06/05 22:23:15 florea Exp $ */ bool is_DNA(uchar *s, int len); #endif sim4.2012-10-10/dna.c0000444000515200116500000000264607733353757013553 0ustar floreasalzberg#include "libc.h" #include "types.h" #include "seq.h" #include "misc.h" #include "args.h" #include "dna.h" #ifndef __lint static const char rcsid[] = "$Id: dna.c,v 1.2 2000/06/05 22:48:19 florea Exp $"; #endif static const argv_scores_t 
EIMOV = { DEFAULT_E, DEFAULT_I, DEFAULT_M, DEFAULT_O, DEFAULT_V }; static void set_argv_scores(argv_scores_t *s, const argv_scores_t *const dflt) { *s = *dflt; } /* DNA_scores ----------------------------------- set scoring matrix for DNA */ void DNA_scores_dflt(argv_scores_t *ds, ss_t ss, const argv_scores_t *dflt) { int i, j, bad; ck_argc("DNA_scores"); set_argv_scores(ds, dflt); for (i = 0; i < NACHARS; ++i) for (j = 0; j < NACHARS; ++j) ss[i][j] = ds->V; bad = -100*ds->M; for (i = 0; i < NACHARS; ++i) ss['X'][i] = ss[i]['X'] = bad; ss['a']['a'] = ss['c']['c'] = ss['g']['g'] = ss['t']['t'] = ds->M; ss['a']['A'] = ss['c']['C'] = ss['g']['G'] = ss['t']['T'] = ds->M; ss['A']['a'] = ss['C']['c'] = ss['G']['g'] = ss['T']['t'] = ds->M; ss['A']['A'] = ss['C']['C'] = ss['G']['G'] = ss['T']['T'] = ds->M; ss['a']['g'] = ss['g']['a'] = ss['c']['t'] = ss['t']['c'] = ds->I; ss['a']['G'] = ss['g']['A'] = ss['c']['T'] = ss['t']['C'] = ds->I; ss['A']['g'] = ss['G']['a'] = ss['C']['t'] = ss['T']['c'] = ds->I; ss['A']['G'] = ss['G']['A'] = ss['C']['T'] = ss['T']['C'] = ds->I; } void DNA_scores(argv_scores_t *ds, ss_t ss) { DNA_scores_dflt(ds, ss, &EIMOV); } sim4.2012-10-10/dna.h0000444000515200116500000000054307733353757013552 0ustar floreasalzberg#ifndef SIM_DNA_H #define SIM_DNA_H /* $Id: dna.h,v 1.1 2000/06/05 22:23:59 florea Exp $ */ #define DEFAULT_E 1 #define DEFAULT_I 1 #define DEFAULT_M 0 #define DEFAULT_O 0 #define DEFAULT_V 1 void DNA_scores(argv_scores_t *ds, ss_t ss); void DNA_scores_dflt(argv_scores_t *ds, ss_t ss, const argv_scores_t *dflt); #endif sim4.2012-10-10/encoding.h0000444000515200116500000000034507733353757014576 0ustar floreasalzberg/* mechanically generated; do not edit. 
Mon Jan 24 17:08:16 EST 2000 */ extern const unsigned char nfasta_ctype[256]; extern const unsigned char dna_complement[256]; enum { Nfasta_bad=0, Nfasta_nt=1, Nfasta_ws=2, Nfasta_amb=3 }; sim4.2012-10-10/libc.h0000444000515200116500000000046507733353757013724 0ustar floreasalzberg#ifndef LIBC_H #define LIBC_H /*$Id: libc.h,v 1.2 2000/10/19 22:17:03 schwartz Exp $*/ #include #include #include #include #include #include #include #include #include #include #include #endif sim4.2012-10-10/misc.c0000444000515200116500000000656307733353757013746 0ustar floreasalzberg#define _XOPEN_SOURCE /* tell sun we want popen, etc */ #include "libc.h" #include "types.h" #include "misc.h" #include "args.h" #ifndef __lint static const char rcsid[] = "$Id: misc.c,v 1.2 2000/06/05 22:48:19 florea Exp $"; #endif /* fatal ---------------------------------------------- print message and die */ void fatal(const char *msg) { fflush(stdout); fatalf("%s", msg); exit(1); } /* fatalf --------------------------------- format message, print it, and die */ static void print_argv0(void) { if (argv0) { char *p = strrchr(argv0, '/'); (void)fprintf(stderr, "%s: ", p ? p+1 : argv0); } } void fatalf(const char *fmt, ...) { va_list ap; va_start(ap, fmt); fflush(stdout); print_argv0(); (void)vfprintf(stderr, fmt, ap); (void)fputc('\n', stderr); va_end(ap); exit(1); } void fatalfr(const char *fmt, ...) { va_list ap; va_start(ap, fmt); fflush(stdout); print_argv0(); (void)vfprintf(stderr, fmt, ap); (void)fprintf(stderr, ": %s\n", strerror(errno)); va_end(ap); exit(1); } int psublast_debug = 0; void debugf(const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (psublast_debug) { fflush(stdout); print_argv0(); if (vfprintf(stderr, fmt, ap) < 0) exit(1); } va_end(ap); } void debugff(const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); if (psublast_debug) { fflush(stdout); print_argv0(); if (vfprintf(stderr, fmt, ap) < 0) exit(1); if (fflush(stderr) != 0) exit(1); } va_end(ap); } /* ckopen -------------------------------------- open file; check for success */ FILE *ckopen(const char *name, const char *mode) { FILE *fp; if ((fp = fopen(name, mode)) == NULL) fatalfr("Cannot open %s.", name); return fp; } /* ckalloc -------------------------------- allocate space; check for success */ void *ckalloc(size_t amount) { void *p; assert((long)amount >= 0); if (amount == 0) amount = 1; /* ANSI portability hack */ if ((p = malloc(amount)) == NULL) fatalf("Ran out of memory trying to allocate %lu.", (unsigned long)amount); #if 0 memset(p, 0, amount); /* XXX */ #endif return p; } /* ckallocz -------------------- allocate space; zero fill; check for success */ void *ckallocz(size_t amount) { void *p = ckalloc(amount); memset(p, 0, amount); return p; } void *ckfree(void *p) { free(p); return 0; } /* strsame --------------------------- tell whether two strings are identical */ bool same_string(const char *s, const char *t) { return (strcmp(s, t) == 0); } /* strsave -------------------------- save string s somewhere; return address */ char *copy_string(const char *s) { char *p; p = ckalloc(strlen(s)+1); /* +1 to hold '\0' */ return strcpy(p, s); } char *copy_substring(const char *s, int n) { char *p = ckalloc((size_t)n+1); /* +1 to hold '\0' */ memcpy(p, s, (size_t)n); p[n] = 0; return p; } long ckftell(FILE *f) { long r = ftell(f); if (r < 0) fatalfr("bad ftell: %s"); return r; } int ckfseek(FILE *f, long i, int m) { int r = fseek(f, i, m); if (r < 0) fatalfr("bad fseek: %s"); return r; } void *ckrealloc(void * p, size_t size) { p = p ? 
realloc(p, size) : malloc(size); if (!p) fatal("ckrealloc failed"); return p; } FILE *ckpopen(const char *name, const char *mode) { FILE *fp; if ((fp = popen(name, mode)) == NULL) fatalfr("Cannot open %s.", name); return fp; } void ckpclose(FILE *fp) { int r = pclose(fp); if (r != 0) fatalfr("pclose failed (status %d)", r); } sim4.2012-10-10/misc.h0000444000515200116500000000230707733353757013743 0ustar floreasalzberg#ifndef SIM_MISC_H #define SIM_MISC_H /* $Id: misc.h,v 1.1 2000/06/05 22:25:12 florea Exp $ */ #ifdef __GNUC__ #define NORETURN __attribute__((__noreturn__)) #else #define NORETURN /* */ #endif #define CLEN(s) (sizeof((s))-1) /*@exits@*/ void fatal(const char *msg) NORETURN; /*@exits@*/ void fatalf(const char *fmt, ...) NORETURN; /*@exits@*/ void fatalfr(const char *fmt, ...) NORETURN; void debugf(const char *fmt, ...); void debugff(const char *fmt, ...); FILE *ckpopen(const char *name, const char *mode); void ckpclose(FILE*); FILE *ckopen(const char *name, const char *mode); /*@only@*/ void *ckalloc(size_t amount); /*@only@*/ void *ckallocz(size_t amount); void *ckfree(void *p); bool same_string(const char *s, const char *t); char *copy_string(const char *s); char *copy_substring(const char *s, int n); long ckftell(FILE*); int ckfseek(FILE*,long,int); void *ckrealloc(void *, size_t); #define ZFREE(p) /*CONSTCOND*/do{free(p);(p)=0;}while(0) #ifndef RCSID #define RCSID(id) static const char rcsid[] = id #endif #undef MAX #define MAX(x,y) ((x) > (y) ? (x) : (y)) #undef MIN #define MIN(x,y) ((x) < (y) ? 
(x) : (y)) #undef ICEIL #define ICEIL(x,y) ((((x)-1)/(y))+1) extern int psublast_debug; #endif sim4.2012-10-10/poly.c0000444000515200116500000003215107733353757013766 0ustar floreasalzberg#include "psublast.h" #include "sim4.h" #include "sim4b1.h" #include "align.h" #include "poly.h" #ifndef __lint /*@unused@*/ static const char rcsid[] = "$Id: poly.c,v 1.5 2002/03/03 23:29:48 florea Exp $"; #endif static void remove_polyT_front(struct edit_script_list **,Exon *,uchar *,uchar*,int *); static void remove_polyA_back(struct edit_script_list **,Exon *,uchar *,uchar*,int,int *); static void trim_polyT_align(struct edit_script_list **,Exon **,const int,int *,uchar *,uchar *); static void trim_polyA_align(struct edit_script_list **,Exon *,Exon **,const int,int *,uchar *,uchar *); void get_polyAT(uchar *seq, int len, int *pT, int *pA, int flag) { register int i, sum10, sum20; register uchar *s, *t, *v; int last10; static char encodingA[128]; static char encodingT[128]; const int MAX10 = 2; const int MAX20 = 5; if (flag!=T_ONLY) { memset(encodingA, (char)1, 128); encodingA['A'] = encodingA['X'] = encodingA['N'] = 0; for (i=0, s=seq+len, sum10=0, last10=len+1; i<10 && s>seq && sum10<=MAX20; i++) { sum10 += encodingA[*(--s)]; /* if (!encodingA[*s] && sum10<=MAX10) last10 = s-seq+1; */ } t = v = seq+len; sum20 = sum10; for ( ; s>=seq && (sum10<=MAX10 || sum20<=MAX20); ) { if (!encodingA[*s] && sum10<=MAX10 && (seq+len>=s+20 || sum20seq) { sum10 += encodingA[*s] - encodingA[*(--t)]; sum20 += encodingA[*s] -(((seq+len)-s>20) ? encodingA[*(--v)] : 0); } } if (last10>len-10) *pA = len+1; else { s = seq+last10+8; while (*s && !encodingA[*s]) s--; if ((s-seq+1)-last10+1<=5) *pA = s-seq+2; else *pA = last10; } } else *pA = len+1; *pA = len-(*pA)+1; if (flag!=A_ONLY) { memset(encodingT, (char)1, 128); encodingT['T'] = encodingT['X'] = encodingT['N'] = 0; for (i=0, s=seq-1, sum10=0, last10=0; i<10 && i=19 || sum20=20) ? 
encodingT[*(++v)] : 0); } } if (last10<=10) *pT = 0; else { s = seq+last10-10; while (*s && !encodingT[*s]) s++; if (last10-(s-seq)+1<=5) *pT = s-seq; else *pT = last10; } } else *pT = 0; } void remove_poly(struct edit_script_list **Script, Exon *Exons, uchar *s1, uchar *s2, int len2, int *pT, int *pA) { remove_polyT_front(Script, Exons, s1, s2, pT); remove_polyA_back(Script, Exons, s1, s2, len2, pA); *pA = len2-(*pA)+1; /* printf("pT: %d pA: %d\n", *pT, *pA); */ return; } static void remove_polyA_back(struct edit_script_list **Sptr, Exon *Exons, uchar *s1, uchar *s2, int len2, int *lastA) { Exon *t, *exons_tail, *prev; /* start from Lblock */ uchar *b, *end; int numA, pA, dummy, trim_p, reverse_script=0; *lastA = len2+1; pA = 0; if (!Exons || ! Exons->next_exon || ! Exons->next_exon->to1) return; if ((*Sptr)->next_script && (*Sptr)->offset1<(*Sptr)->next_script->offset1) { reverse_script = 1; script_flip_list(Sptr); } exons_tail = Exons->next_exon; prev = Exons; for ( ; exons_tail->next_exon && exons_tail->next_exon->to1; prev=exons_tail, exons_tail=exons_tail->next_exon); trim_p = TRUE; while ((t=exons_tail)!=NULL && t->to1 && trim_p) { /* compute the 'A' contents of the exon */ b = s2 + t->to2-1; end = s2+t->from2-1; numA = 0; while (b>=end && numA+(b-s2)>=.60*t->length) { numA += (*b--=='A'); } if (numA>=.60*t->length) { /* remove the entire exon */ trim_polyA_align(Sptr,Exons,&exons_tail,t->from2,lastA,s1,s2); /* assert(*lastA==t->from2); t was removed */ } else { get_polyAT(s2+(*Sptr)->offset2-1,(*Sptr)->len2,&dummy,&pA,A_ONLY); if (pA) { int ct_pA; /* first position to be removed */ ct_pA = t->to2-pA+1; ct_pA = (ct_pA-t->from2>=MIN_EXON) ? 
ct_pA : t->from2; /* note: pA is the last (innermost) position in the tail */ trim_polyA_align(Sptr,Exons,&exons_tail,ct_pA,lastA,s1,s2); } if (t==exons_tail) trim_p = FALSE; } } if (reverse_script) script_flip_list(Sptr); } static void trim_polyA_align(struct edit_script_list **Sptr, Exon *lblock, Exon **exons, const int bc, int *pA, uchar *s1,uchar *s2) { edit_script_list *head = *Sptr; edit_script *tp; int tmpi = 0, num, idents = 0; uchar *a, *b; Exon *prev; int i, j; /* i index in the cDNA */ if (bc>head->offset2+head->len2-1) { *pA = bc; return; } if (bc==head->offset2) { /* cDNA gap: remove the entire script; this should be properly sorted */ *Sptr = head->next_script; Free_script(head->script); free(head); while ((*exons)->from2>=bc) { prev = find_previous(lblock,*exons); prev->next_exon = (*exons)->next_exon; free(*exons); *exons = prev; } *pA = bc; return; } Flip_script(&(head->script)); i = head->offset2 + head->len2 -1; j = head->offset1 + head->len1 -1; tp = head->script; while (i>=bc && tp) { num = tp->num; switch (tp->op_type) { case INSERT: if (i>=bc && bc>i-num+1) { tmpi += i-bc+1; tp->num -= i-bc+1; i = bc-1; } else { i -= num; tmpi += num; head->script = tp->next; free(tp); tp = head->script; } break; case DELETE: j -= num; tmpi += num; head->script = tp->next; free(tp); tp = head->script; break; case SUBSTITUTE: if (i>=bc && bc>i-num+1) { a = s2+i-1; b = s1+j-1; while (a>=s2+bc-1) { if (*a--!=*b--) tmpi++; else idents++; } j -= i-bc+1; tp->num -= i-bc+1; i = bc-1; } else { /* at most 1 nt remaining */ a = s2+i-1; b = s1+j-1; while (a>=s2+i-num) { if (*a--!=*b--) tmpi++; else idents++; } i -= num; j -= num; head->script = tp->next; free(tp); tp = head->script; } break; default: fatalf("Unrecognized opcode %d.\n",tp->op_type); } /* indel walk */ } assert(i==bc-1); while (tp->op_type!=SUBSTITUTE && j+1>=(*exons)->from1) { if (tp->op_type==INSERT) { i -= tp->num; tmpi += tp->num; } else if (j<(*exons)->from1 && i<(*exons)->from2) { j -= tp->num; } 
else { j -= tp->num; tmpi += tp->num; } head->script = tp->next; free(tp); tp = head->script; } if (head->script==NULL) { *Sptr = head->next_script; free(head); } else { head->len1 = j-head->offset1+1; head->len2 = i-head->offset2+1; head->score -= tmpi; Flip_script(&(head->script)); } if ((*exons)->from2>i) { prev = find_previous(lblock,*exons); prev->next_exon = (*exons)->next_exon; free(*exons); *exons = prev; } else { double tmp_matches; (*exons)->to2 = i; (*exons)->to1 = j; (*exons)->length = (*exons)->to2-(*exons)->from2+1; tmp_matches = (*exons)->nmatches - idents; (*exons)->alen -= tmpi+idents; (*exons)->match = (int)(100*tmp_matches/(*exons)->alen); } *pA = i+1; return; } static void remove_polyT_front(struct edit_script_list **Sptr, Exon *Exons, uchar *s1, uchar *s2, int *lastT) { Exon *t, *exons_head; /* start from Lblock */ uchar *b, *end; int numT, dummy, trim_p, reverse_script=0, pT; *lastT = pT = 0; if (!Exons || !Exons->next_exon || !Exons->next_exon->to1) return; if ((*Sptr)->next_script && (*Sptr)->offset1>(*Sptr)->next_script->offset1) { script_flip_list(Sptr); reverse_script = 1; } exons_head = Exons->next_exon; trim_p = TRUE; while ((t=exons_head)!=NULL && t->to1 && trim_p) { /* compute the 'T' contents of the exon */ b = s2 + t->from2-1; end = s2+t->to2; numT = 0; while (bto2-(b-s2-t->from2+1)>=.60*t->length)) { numT += (*b++=='T'); } if (numT>=.60*t->length) { /* remove the entire exon */ trim_polyT_align(Sptr,&exons_head,t->to2,lastT,s1,s2); /* assert(*lastT==t->to2); t was removed */ } else { get_polyAT(s2+(*Sptr)->offset2-1,(*Sptr)->len2,&pT,&dummy,T_ONLY); if (pT) { int ct_pT; ct_pT = pT + (*Sptr)->offset2-1; ct_pT = (t->to2-ct_pT>=MIN_EXON) ? 
ct_pT : t->to2; trim_polyT_align(Sptr,&exons_head,ct_pT,lastT,s1,s2); } if (t==exons_head) trim_p = FALSE; } } Exons->next_exon = exons_head; if (reverse_script) script_flip_list(Sptr); } /* s2 is the cdna */ static void trim_polyT_align(struct edit_script_list **Sptr, Exon **exons, const int ec, int *pT, uchar *s1, uchar *s2) { edit_script_list *head = *Sptr; edit_script *tp; int tmpi = 0, num, idents = 0; uchar *a, *b; Exon *t; int i, j; /* i index in the cDNA */ if (ecoffset2) { *pT = ec; return; } if (ec==head->offset2+head->len2-1) { /* cDNA gap: remove the entire script */ *Sptr = head->next_script; Free_script(head->script); free(head); while ((*exons)->from2next_exon; free(t); } *pT = ec; return; } i = head->offset2; j = head->offset1; tp = head->script; while (i<=ec && tp) { num = tp->num; switch (tp->op_type) { case INSERT: if (i<=ec && ecnum -= ec-i+1; i = ec+1; } else { i += num; tmpi += num; head->script = tp->next; free(tp); tp = head->script; } break; case DELETE: j += num; tmpi += num; head->script = tp->next; free(tp); tp = head->script; break; case SUBSTITUTE: if (i<=ec && ecnum -= ec-i+1; i = ec+1; } else { /* at most 1 nt remaining */ a = s2+i-1; b = s1+j-1; while (anum-1) { if (*a++!=*b++) tmpi++; else idents++; } i +=num; j += num; head->script = tp->next; free(tp); tp = head->script; } break; default: fatalf("Unrecognized opcode %d.\n",tp->op_type); } /* indel walk */ } assert(i==ec+1); while (tp->op_type!=SUBSTITUTE && j-1<=(*exons)->to1) { if (tp->op_type==INSERT) { i += tp->num; tmpi += tp->num; } else if (j>=(*exons)->to1 && i>=(*exons)->to2) { j += tp->num; } else { j += tp->num; tmpi += tp->num; } head->script = tp->next; free(tp); tp = head->script; } if (head->script==NULL) { *Sptr = head->next_script; free(head); } else { head->len1 -= j-head->offset1; head->len2 -= i-head->offset2; head->offset2 = i; head->offset1 = j; head->score -= tmpi; } if ((*exons)->to2next_exon; free(t); } else { double tmp_matches; (*exons)->from2 = i; 
(*exons)->from1 = j; (*exons)->length = (*exons)->to2-(*exons)->from2+1; tmp_matches = (*exons)->nmatches - idents; (*exons)->alen -= tmpi+idents; (*exons)->match = (int)(100*tmp_matches/(*exons)->alen); } *pT = i-1; return; } sim4.2012-10-10/poly.h0000444000515200116500000000037507733353760013770 0ustar floreasalzberg#ifndef POLY_H #define POLY_H #define MIN_EXON 12 #define T_ONLY 1 #define A_ONLY 2 #define BOTH_AT 3 void get_polyAT(uchar *,int,int *,int *,int); void remove_poly(struct edit_script_list **,Exon *,uchar *,uchar *,int,int *,int *); #endif sim4.2012-10-10/prnt.c0000444000515200116500000000725207733353760013764 0ustar floreasalzberg#include "libc.h" #include "types.h" #include "args.h" #include "seq.h" #include "dna.h" #include "misc.h" #include "prnt.h" #ifndef __lint static const char rcsid[] = "$Id: prnt.c,v 1.2 2000/06/05 22:48:19 florea Exp $"; #endif /* XXX */ static int offset1; static int offset2; enum { BUFSIZE=128 }; #define ckprintf (void)printf /* XXX */ static char *subseq_label(char *buf, unsigned int size, int n); static const char* revflag(SEQ *s); static const char* revlabel(SEQ *s); static void print_align_header_n(SEQ *seq1, SEQ *seq2, argv_scores_t *ds, int n) { int f, t, F, T; char buf[BUFSIZE]; ckprintf("#:lav\n\nd {\n \""); ck_argc("print_align_header"); fprintf_argv(stdout); ckprintf("\n M = %d, I = %d, V = %d", ds->M, ds->I, ds->V); ckprintf(", O = %d, E = %g", ds->O, ds->E); ckprintf("\"\n}\n"); if (get_argval('f', &f)) { if (!get_argval('t',&t) || !get_argval('F',&F) || !get_argval('T',&T)) fatal("Inconsistent use of `f`, `t`, `F', `T' args."); offset1 = SEQ_FROM(seq1) - f; offset2 = SEQ_FROM(seq2) - F; } else { f = SEQ_FROM(seq1); t = SEQ_TO(seq1); F = SEQ_FROM(seq2); T = SEQ_TO(seq2); offset1 = offset2 = 0; } ckprintf("s {\n \"%s%s\" %d %d\n \"%s%s\" %d %d\n}\n", SEQ_NAME(seq1), revflag(seq1), f, t, SEQ_NAME(seq2), revflag(seq2), F, T); ckprintf("h {\n \"%s%s\"\n \"%s%s%s\"\n}\n", SEQ_HEAD(seq1), revlabel(seq1), 
SEQ_HEAD(seq2), revlabel(seq2), subseq_label(buf, sizeof buf, n)); } /* print_align_header ------------- print the top part of an alignment file */ void print_align_header(SEQ *seq1, SEQ *seq2, argv_scores_t *ds) { print_align_header_n(seq1, seq2, ds, 0); } static char *subseq_label(char *buf, unsigned int size, int n) { assert(size > 0); buf[0] = 0; if (n > 0) snprintf(buf, size, " (subsequence #%d)", n); return buf; } static const char* revflag(SEQ *s) { return (s->flags & SEQ_IS_REVCOMP) ? "-" : ""; } static const char* revlabel(SEQ *s) { return (s->flags & SEQ_IS_REVCOMP) ? " (reverse complement)" : ""; } /* print_align ----------------------------------- print a general alignment */ void print_align(int score, uchar *seq1, uchar *seq2, int beg1, int end1, int beg2, int end2,int *S) { int M, N, i, j, op, start_i, start_j, match, run, pct; uchar *P, *p, *q; beg1 += offset1; end1 += offset1; beg2 += offset2; end2 += offset2; M = end1 - beg1 + 1; N = end2 - beg2 + 1; ckprintf("a {\n s %d\n b %d %d\n e %d %d\n", score, beg1, beg2, end1, end2); for (i = j = 0; i < M || j < N; ) { start_i = i; start_j = j; match = 0; P= p = seq1 + beg1 + i - 1; q = seq2 + beg2 + j - 1; while (i < M && j < N && *S == 0) { if (*p++ == *q++) ++match; ++i; ++j; ++S; } run = p - P; pct = (run > 0) ? 
((100*match + run/2)/run) : 0; /* round */ ckprintf(" l %d %d %d %d %d\n", beg1+start_i, beg2+start_j, beg1+i-1, beg2+j-1, pct); if (i < M || j < N) { if ((op = *S++) > 0) j += op; else i -= op; } } ckprintf("}\n"); } sim4.2012-10-10/prnt.h0000444000515200116500000000050107733353760013757 0ustar floreasalzberg#ifndef SIM_PRNT_H #define SIM_PRNT_H /* $Id: prnt.h,v 1.1 2000/06/05 22:27:00 florea Exp $ */ typedef unsigned int edit_op_t; /* 32 bits */ void print_align_header(SEQ *seq1, SEQ *seq2, argv_scores_t *ds); void print_align(int score, uchar *seq1, uchar *seq2, int beg1, int end1, int beg2, int end2, int *S); #endif sim4.2012-10-10/psublast.h0000444000515200116500000000045507733353760014641 0ustar floreasalzberg#ifndef PSUBLASTLIB_H #define PSUBLASTLIB_H /* $Id: psublast.h,v 1.2 2000/06/05 22:48:19 florea Exp $ */ #include "libc.h" #include "types.h" #include "seq.h" #include "args.h" #include "dna.h" #include "prnt.h" #include "misc.h" #include "discrim.h" #include "charvec.h" #endif /* PSUBLASTLIB_H */ sim4.2012-10-10/seq.c0000444000515200116500000001103407733353760013562 0ustar floreasalzberg #include "libc.h" #include "types.h" #include "misc.h" #include "seq.h" #include "encoding.h" static const char rcsid[] = "$Id: seq.c,v 1.2 2000/06/05 22:48:19 florea Exp $"; static int parse_fname(const char* arg, char **fname, int *from, int *len, char **maskfile) { char *p = 0; int flags = 0; /* "seqfile{maskfile}[from,to]-" */ *fname = copy_string(arg); p = (*fname)+strlen(*fname)-1; if (*p == '-') { *p = 0; flags |= SEQ_DO_REVCOMP; } if ((p = strchr(*fname, '['))) { int to; if (sscanf(p+1, "%d,%d", from, &to) != 2) return -1; if (*from <= 0 || *from > to) return -1; *p = '\0'; *len = to - *from + 1; flags |= (SEQ_DO_SUBRANGE|SEQ_IS_SUBRANGE); } else { *from = 1; *len = -1; } if ((p = strchr(*fname, '{'))) { char *q = strchr(p+1, '}'); if (q) { *p = *q = 0; if (maskfile) { *maskfile = copy_string(p+1); flags |= SEQ_DO_MASK; } } } else { *maskfile = copy_string(""); 
/* XXX ugh */ } return flags; } static int check_flags(int flags) { switch (flags & (SEQ_DISALLOW_AMB|SEQ_ALLOW_AMB)) { case 0: /* default is to allow ambiguious */ flags |= SEQ_ALLOW_AMB; break; case SEQ_ALLOW_AMB: case SEQ_DISALLOW_AMB: break; case SEQ_DISALLOW_AMB|SEQ_ALLOW_AMB: fatalf("seq_open: contradictory flags: SEQ_DISALLOW_AMB|SEQ_ALLOW_AMB"); } return flags; } SEQ* seq_open(const char *fname, const char *mode, int flags) { SEQ *s = ckallocz(sizeof(SEQ)); int r; mode = 0; r = parse_fname(fname, &(s->fname), &(s->from), &(s->slen), &(s->maskname)); if (r == -1) fatalf("improper positions specification: %s", fname); s->flags = check_flags(r|flags); s->fp = ckopen(s->fname, "r"); s->count = 0; s->offset = 0; return s; } SEQ *seq_copy(const SEQ *s) { SEQ *ss = ckallocz(sizeof(SEQ)); *ss = *s; ss->seq = (uchar*)copy_string((const char*)s->seq); ss->header = copy_string(s->header); ss->fname = copy_string(s->fname); ss->maskname = copy_string(s->fname); ss->fp = 0; /* XXX - no subsequent seq_read operations allowed */ ss->offset = 0; /* XXX - no subsequent seq_read operations allowed */ /* alternatively, ss->fp = ckopen(ss->fname, "r"); ckfseek(ss->fp, ckftell(s->fp), SEEK_SET); but this is more expensive. */ return ss; } SEQ *seq_subseq(const SEQ *s, int origin, int length) { SEQ *ss; /* XXX - probably should do reference counting. 
*/ /* 1-indexing is ugly */ if (origin < 1 || length < 0) return 0; if (SEQ_LEN(s) < origin+length-1) return 0; ss = ckallocz(sizeof(SEQ)); *ss = *s; ss->flags = s->flags|SEQ_IS_SUBSEQ; ss->fp = 0; ss->offset = 0; ss->from = 1; ss->seq = s->seq + origin - 1; ss->slen = length; return ss; } SEQ* seq_from_chars(unsigned char *chrs, unsigned int len) { SEQ *s = ckallocz(sizeof(SEQ)); s->fname = 0; s->header = 0; s->hlen = 0; s->seq = chrs; s->from = 1; s->slen = len; s->maskname = 0; s->flags = SEQ_IS_SUBSEQ; s->count = 0; s->fp = 0; s->offset = 0; return s; } const char *seq_set_header(SEQ *s, const char *h) { if (s && s->header) { free(s->header); s->header = copy_string(h); return s->header; } return 0; } SEQ* seq_close(SEQ *s) { if (s) { if (!(s->flags & SEQ_IS_SUBSEQ)) { if (s->fp) { if (s->flags & SEQ_HAS_PIPE) ckpclose(s->fp); else fclose(s->fp); } if (s->fname) free(s->fname); if (s->header) free(s->header); if (s->seq) free(s->seq); if (s->maskname) free(s->maskname); } memset(s, 0, sizeof(SEQ)); free(s); } return 0; } uchar dna_cmpl(uchar ch) { /* XXX - assumes ascii, returns space on error. */ return dna_complement[ch]; } static SEQ *seq_revcomp_helper(SEQ *seq) { uchar *s, *p; /* assert(SEQ_CHARS in dcomp-' '); */ /* seq_read should check this. 
*/ s = SEQ_CHARS(seq); p = s+SEQ_LEN(seq)-1; while (s<=p) { uchar c; c = dna_cmpl(*s); *s = dna_cmpl(*p); *p = c; ++s, --p; } return seq; } SEQ *seq_revcomp_inplace(SEQ *seq) { seq_revcomp_helper(seq); seq->flags ^= SEQ_IS_REVCOMP; return seq; } SEQ *seq_get(const char *fname, const char *mode, int flags) { SEQ *s = seq_open(fname, mode, flags); int r = seq_read(s); if (r < 0) fatalfr("could not read from %s", fname); else if (r == 0) return 0; else return s; /*NOTREACHED*/ return 0; } int seq_count(SEQ *s) { return s->count; } int seq_revisit(SEQ *s, long offset) { return fseek(s->fp, offset, SEEK_SET); } long seq_offset(SEQ *s) { return s->offset; } #ifdef TEST int main() { printf("%d\n", sizeof(dna_complement)); exit(0); } #endif sim4.2012-10-10/seq.h0000444000515200116500000000403607733353760013573 0ustar floreasalzberg#ifndef SIM_SEQ_H #define SIM_SEQ_H /* $Id: seq.h,v 1.1 2000/06/05 22:25:54 florea Exp $ */ typedef struct seq_data { uchar *seq; int slen; /* bytes in seq, not including '\0' */ int origin; } seq_data_t; typedef struct seq_file { FILE *fp; int flags; int count; /* how many contigs we have already read */ long offset; /* the starting offset of the contig we just read */ char *maskname; char *fname; int from; /* 1 based */ char *header; int hlen; /* bytes in header, not including '\0' */ uchar *seq; int slen; /* bytes in seq, not including '\0' */ } SEQ; #define SEQ_NAME(s) ((s)->fname) #define SEQ_LEN(s) ((s)->slen) #define SEQ_TO(s) ((s)->slen + (s)->from - 1) #define SEQ_FROM(s) ((s)->from) #define SEQ_AT(s,i) ((s)->seq[i]) /* #define SEQ_LEN(s) ((s)->to - (s)->from + 1) */ #define SEQ_HEAD(s) ((s)->header) #define SEQ_HLEN(s) ((s)->hlen) #define SEQ_CHARS(s) ((s)->seq) #define SEQ_SAME(a,b) ((a)==(b)) enum /* powerset */ { SEQ_IS_SUBRANGE = (1<<0) /* seq is a subrange of a file */ , SEQ_IS_REVCOMP = (1<<1) /* seq is reverse compliment of a file */ , SEQ_IS_SUBSEQ = (1<<2) /* seq is a reference to another seq */ , SEQ_HAS_MASK = (1<<3) /* seq 
has a mask applied */ , SEQ_HAS_PIPE = (1<<4) /* input fd is a pipe */ , SEQ_DO_REVCOMP = (1<<5) /* make it so after open */ , SEQ_DO_SUBRANGE = (1<<6) /* make it so after open */ , SEQ_DO_MASK = (1<<7) /* make it so after open */ , SEQ_ALLOW_AMB = (1<<8) /* checked while reading */ , SEQ_DISALLOW_AMB = (1<<9) /* checked while reading */ }; SEQ* seq_open(const char *fname, const char *mode, int flags); SEQ* seq_close(SEQ *s); int seq_read(SEQ *seq); const char *seq_set_header(SEQ *s, const char *h); SEQ *seq_copy(const SEQ *s); SEQ *seq_subseq(const SEQ *s, int origin, int length); SEQ *seq_revcomp_inplace(SEQ *seq); SEQ *seq_get(const char *fname, const char *mode, int flags); SEQ* seq_from_chars(unsigned char *chrs, unsigned int len); uchar dna_cmpl(unsigned char); int seq_count(SEQ *s); int seq_revisit(SEQ *s, long offset); long seq_offset(SEQ *s); #endif sim4.2012-10-10/seq_read.c0000444000515200116500000001002507733353760014554 0ustar floreasalzberg#include "libc.h" #include "types.h" #include "misc.h" #include "seq.h" #include "encoding.h" #include "charvec.h" #ifndef __lint static const char rcsid[] = "$Id: seq_read.c,v 1.2 2000/06/05 22:48:19 florea Exp $"; #endif static SEQ *seq_mask_inplace(SEQ *seq); static int getpair(FILE *fp, int *a, int *b); static char *byte_fill_range(uchar *p, int l, int c, int a, int b); static int ws(int c); static int getnwc(FILE *fp); static void un_getc(int c, FILE *fp); static void char_append(charvec_t *s, int c); static int ws(int c) { return (c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r'); } static int getnwc(FILE *fp) { int c = EOF; if (!feof(fp)) do c = getc(fp); while (c != EOF && ws(c)); return c; } static void un_getc(int c, FILE *fp) { if (c != EOF) if (ungetc(c, fp) == EOF) fatalf("cannot ungetc '%c'", c); } static void char_append(charvec_t *s, int c) { if (!charvec_append(s, c)) fatal("cannot append"); } int seq_read(SEQ *seq) { int b, c; charvec_t shdr = charvec_INIT(ckrealloc,free); charvec_t 
sseq = charvec_INIT(ckrealloc,free); if (feof(seq->fp)) return 0; if (seq->count > 0) { if (seq->flags & SEQ_IS_SUBRANGE) { return 0; } else { seq->from = 1; seq->slen = -1; /* will be computed below */ } } if (seq->header) ZFREE(seq->header); if (seq->seq) ZFREE(seq->seq); seq->offset = ftell(seq->fp); /* --- header --- */ c = getnwc(seq->fp); if (c == '>') { while (c != '\n' && c != EOF) { char_append(&shdr, c); c = getc(seq->fp); } } else { un_getc(c, seq->fp); } char_append(&shdr, 0); seq->header = shdr.a; seq->hlen = shdr.len; /* --- seq --- */ b = '\n'; c = getnwc(seq->fp); while ((c != EOF) && !(b == '\n' && c == '>')) { switch (nfasta_ctype[c]) { case Nfasta_nt: char_append(&sseq, c); break; case Nfasta_ws: /* skip space */ break; case Nfasta_amb: if (seq->flags & SEQ_ALLOW_AMB) { char_append(&sseq, c); break; } /* FALLTHRU */ default: fatalf("non-DNA character '%c' in sequence '%s'", c, seq->fname); break; } b = c; c = getc(seq->fp); } un_getc(c, seq->fp); /* check conformance */ if (SEQ_LEN(seq) == -1) { char_append(&sseq, 0); charvec_fit(&sseq); seq->seq = (uchar*)sseq.a; seq->slen = sseq.len; if (seq->slen > 0) --seq->slen; /* don't include '\0' */ } else { charvec_t ssub = charvec_INIT(ckrealloc,free); int i; if (SEQ_FROM(seq) < 1 || (int)sseq.len < SEQ_FROM(seq) || SEQ_TO(seq) < 1 || (int)sseq.len < SEQ_TO(seq) || SEQ_TO(seq) < SEQ_FROM(seq)) fatalf("range [%d,%d] incommensurate with sequence [%d,%d]", SEQ_FROM(seq), SEQ_TO(seq), 1, sseq.len); for (i = SEQ_FROM(seq); i <= SEQ_TO(seq); ++i) char_append(&ssub, sseq.a[i-1]); char_append(&ssub, 0); charvec_fini(&sseq); seq->seq = (uchar*)ssub.a; } seq->flags = seq->flags &~ SEQ_IS_REVCOMP; if (seq->flags & SEQ_DO_REVCOMP) { (void)seq_revcomp_inplace(seq); } if (seq->flags & SEQ_HAS_MASK) { (void)seq_mask_inplace(seq); } seq->count++; return 1; } static SEQ *seq_mask_inplace(SEQ *seq) { int a, b; FILE *fp = fopen(seq->maskname, "r"); if (fp == 0) { fatalfr("cannot open '%s'", seq->maskname); return 0; } 
else { while (getpair(fp, &a, &b)) byte_fill_range(SEQ_CHARS(seq),SEQ_LEN(seq),'X',a,b); fclose(fp); seq->flags |= SEQ_HAS_MASK; return seq; } } #define BUF 128 static int getpair(FILE *fp, int *a, int *b) { char buf[BUF]; /* XXX - should handle comments, etc */ if (fgets(buf, (int)sizeof(buf), fp) == 0) return 0; if (sscanf(buf, "%d%d", a, b) != 2) return 0; return 1; } static char *byte_fill_range(uchar *p, int l, int c, int a, int b) { /* fill [a,b] (1-indexed) in p with c */ a--; b = b-a; /* make it into a 0-indexed, open interval */ return (b < 0 || l < b) ? 0 : memset(p+a, c, (size_t)b); } sim4.2012-10-10/types.h0000444000515200116500000000032307733353760014142 0ustar floreasalzberg#ifndef SIM_TYPES_H #define SIM_TYPES_H /* $Id: types.h,v 1.1 2000/06/05 22:26:45 florea Exp $ */ #define NACHARS 128 typedef int bool; typedef int ss_t[NACHARS][NACHARS]; typedef unsigned char uchar; #endif sim4.2012-10-10/README.sim40000444000515200116500000000152312035277607014357 0ustar floreasalzbergsim4 -- a program to align cDNA and genomic DNA Copyright (C) 1998-2012 Liliana Florea psublast -- a library for sequence alignment Copyright (C) 1998-2012 Scott Schwartz This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA sim4.2012-10-10/README.psublast0000444000515200116500000000140012035277630015326 0ustar floreasalzbergpsublast -- support library for alignment programs Copyright (C) 1998-2012 Scott Schwartz This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA sim4.2012-10-10/INSTALL0000444000515200116500000000043012035277650013647 0ustar floreasalzbergEach distributed version of sim4 will be unpacked into a directory whose name contains that version string. For example, sim4.2000-03-13. Make a note of this version when you fetch the tar file. 
To compile: gunzip < sim4.[version].tar.gz | tar -xf - cd sim4.[version] make sim4.2012-10-10/encoding.c0000644000515200116500000000547312035302757014563 0ustar floreasalzberg#define _ (-1) static const char rcsid[] = "$Id: encoding.c,v 1.3 2003/09/21 16:41:37 schwartz Exp $"; enum { Nfasta_bad=0, Nfasta_nt=1, Nfasta_ws=2, Nfasta_amb=3 }; extern const unsigned char nfasta_ctype[256]; const unsigned char nfasta_ctype[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 3, 0, 0, 1, 3, 0, 0, 3, 0, 3, 1, 0, 0, 0, 3, 3, 1, 0, 3, 3, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 3, 0, 0, 1, 3, 0, 0, 3, 0, 3, 1, 0, 0, 0, 3, 3, 1, 0, 3, 3, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; extern const unsigned char dna_complement[256]; const unsigned char dna_complement[] = { ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', ' ','T','V','G','H',' ',' ','C','D',' ',' ','M',' ','K','N',' ',' ',' ','Y','S','A',' ','B','W','X','R',' ',' ',' ',' ',' ',' ', ' ','t','v','g','h',' ',' ','c','d',' ',' ','m',' ','k','n',' ',' ',' ','y','s','a',' ','b','w','x','r',' ',' ',' ',' ',' ',' ', ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' 
',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', }; /* Previously, but then the array would in fact have held 257 chars: Changed 10/10/2012. " " " TVGH CD M KN YSA BWXR tvgh cd m kn ysa bwxr " " " " "; */ /* ................................................................ */ /* @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~. */ /* ................................................................ */ /* ................................................................ */ static const char *osx_linker_bug = ""; sim4.2012-10-10/COPYRIGHT0000644000515200116500000000264212035303021014100 0ustar floreasalzbergThis package implements the Sim4 algorithm for aligning expressed DNA with genomic sequences, described in the paper: L. Florea, G. Hartzell, Z. Zhang, G. Rubin, and W. Miller (1998) "A computer program for aligning a cDNA sequence with a genomic DNA sequence." Genome Research 8, 967-974. Portions copyright by: Copyright (C) 1998-2012 Liliana Florea Copyright (C) 1998-2012 Scott Schwartz This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # The following files were written by Liliana Florea: Xtend1.c Xtend1.h align.c align.h sim4.h sim4.init.c sim4b1.c sim4b1.h splice.c splice.h # The following files were written by Scott Schwartz: args.c args.h charvec.c charvec.h discrim.c discrim.h dna.c dna.h encoding.c encoding.h libc.h misc.c misc.h prnt.c prnt.h psublast.h seq.c seq.h seq_read.c types.h sim4.2012-10-10/sim4.init.c0000444000515200116500000007540207733353757014627 0ustar floreasalzberg/* * sim4 - Align a cDNA sequence with a genomic sequence for it. * * The basic command syntax is * * sim4 seq1 seq2 [[AXWRKCDHEPNBS]=] * * where sequence1 and sequence2 name files containing DNA sequences; a name * like "foo[300,400]" refers to sequence entries 300-400 in file "foo". The * files are to be in FASTA format. Thus a typical sequence file might begin: * * >BOVHBPP3I Bovine beta-globin psi-3 pseudogene, 5' end. * GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT * GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC * * Typically, sequence1 name file contains one DNA sequence, and sequence2 * name file contains a database of sequences in FASTA format. Sim4 can * compare a genomic sequence with a database of cDNAs or a cDNA sequence * with a genomic database. If highly accurate sequences are being compared, * specify option N=1. * * The command permits optional additional arguments, e.g. "A=1" and "W=8", * that reset certain internal parameters. The available parameters are: * * W gives the word size. * X gives the value for terminating word extensions. * K gives the MSP score threshold for the first pass. * C gives the MSP score threshold for the second pass. 
* R direction of search; 0 - search the '+' strand only; * 1 - search the '-' strand only; 2 - search both strands and * report the best match. (R=2) * D adjusts the range of diagonals in building the exons. * H adjusts the re-linking weight factor * A specifies the output format: exon endpoints only (A=0), * alignment text (A=1), alignment in lav format (A=2) or both * exon endpoints and alignment text (A=3, A=4). For A=3, positions * in sequence 1 are given in the original sequence, and for A=4 in * its reverse complement. A=5 prints the exon and CDS coordinates * (the latter, if known) in the `exon file' format required by PipMaker. * N if !=0, highly accurate exon detection is expected, for highly * accurate sequence data. * P remove polyA tails; if match on complementary strand, change * coordinates in sequence 1 according to the '+' strand and print * #lav alignment header for all alignment options. * B control the presence of ambiguity codes in sequence data. If * 1 (default), allow ambiguity codes (ABCDGKMNRCTVWXY); if 0, * restrict the set of accepted characters to ACGTNX. * S specifies a known coding region (to be used only with A=5, and for * comparing full-mRNA sequences). */ #include "psublast.h" #include "sim4.h" #include "align.h" #include "poly.h" #ifndef __lint /*@unused@*/ static const char rcsid[] = "$Id: sim4.init.c,v 1.40 2001/12/23 05:29:44 florea Exp $"; #endif static void init_stats(sim4_stats_t *); static void sim4_argvals(sim4_args_t *); static void cds_range(char *,int *,int *); static char *extract_tok(char *); static void add_offset_exons(Exon *,int); static void add_offset_aligns(edit_script_list *,int); static void print_align_blk(uchar *,uchar *,int,int,edit_script_list **,int,int); static void print_align_lat(uchar *,uchar *,int,int,edit_script_list **,Exon *,int,int); static const char Usage[] = "%s seq1 seq2_db [[WXKCRDHAPNBS]=]\n\n\ W - word size. (W=12)\n\ X - value for terminating word extensions. 
(X=12)\n\ K - MSP score threshold for the first pass. (e.g., K=16)\n\ C - MSP score threshold for the second pass. (e.g., C=12)\n\ R - direction of search; 0 - search the '+' (direct) strand only; \n\ 1 - search the '-' strand only; 2 - search both strands and \n\ report the best match. (R=2)\n\ D - bound for the range of diagonals within consecutive msps in an\n\ exon. (D=10)\n\ H - weight factor for MSP scores in relinking. (H=500)\n\ A - output format: exon endpoints only (A=0), alignment text (A=1),\n\ alignment in lav (block) format (A=2), or both exon endpoints\n\ and alignment text (A=3, A=4). If complement match, A=0,1,2,3\n\ give direct positions in the long sequence and complement \n\ positions in the short sequence. A=4 gives direct positions in \n\ the first sequence, regardless of the relative lengths.\n\ A=5 prints the exon and CDS coordinates (the latter, if known)\n\ in the `exon file' format required by PipMaker. To be used\n\ with full-length mRNA sequences.\n\ P - if not 0, remove poly-A tails; report coordinates in the \n\ '+' (direct) strand for complement matches; use lav alignment \n\ headers in all display options. (P=0) \n\ N - accuracy of sequences (non-zero for highly accurate). (N=0)\n\ B - if 0, dis-allow ambiguity codes (other than N and X) in the\n\ sequence data. 
(B=1)\n\ S - coding region specification (available only with A=5);\n\ format: S=n1..n2\n"; #ifdef _STATS static void print_stats(sim4_stats_t, char *, int); static void print_exon_stats(Exon *, char *, int); #endif #ifdef DEBUG static void polystats(int,int,int,int,int,int,int,int); #endif #include "sim4b1.h" int main(int argc, char *argv[]) { uchar *revseq1=NULL; int len1, len2, count, dist, match_ori, in_K, in_C, in_H; int pA, pT, xpT, xpA, rev_xpT, rev_xpA; int cds_from, cds_to; uchar *seq1, *seq2; char *h2, *h1, *cds_gene=NULL, *line; SEQ *sf1, *sf2, *rf1=NULL; argv_scores_t ds; ss_t ss; Exon *Exons=NULL, *rev_Exons=NULL; edit_script_list *Aligns=NULL, *rev_Aligns=NULL; if (argc < 3) fatalf(Usage, argv[0]); ckargs("AXWRKCDHEPNBST", argc, argv, 2); sim4_argvals(&rs); DNA_scores(&ds, ss); /* read seq1 */ sf1 = seq_open(argv[1], 0, (rs.B ? SEQ_ALLOW_AMB : 0)); if (!seq_read(sf1)) fatalf("Cannot read sequence from %s.", argv[1]); seq1 = SEQ_CHARS(sf1); len1 = SEQ_LEN(sf1); h1 = SEQ_HEAD(sf1); if (!is_DNA(seq1, len1)) fatal("The first sequence is not a DNA sequence."); seq_toupper(seq1, len1, argv[1]); /* read seq2 */ sf2 = seq_open(argv[2], 0, (rs.B ? SEQ_ALLOW_AMB : 0)); if (!seq_read(sf2)) fatalf("Cannot read sequence from %s.", argv[2]); seq2 = SEQ_CHARS(sf2); len2 = SEQ_LEN(sf2); h2 = SEQ_HEAD(sf2); if (!is_DNA(seq2, len2)) fatal("The first sequence is not a DNA sequence."); seq_toupper(seq2, len2, argv[2]); /* determine the type of comparison */ file_type = (len2<=len1) ? 
GEN_EST : EST_GEN; if (file_type== EST_GEN) { rf1 = seq_copy(sf1); rf1 = seq_revcomp_inplace(rf1); revseq1 = SEQ_CHARS(rf1); if (rs.ali_flag==5) { if (rs.CDS_to>len1) fatal("Command line CDS endpoint exceeds sequence length."); cds_gene = extract_tok(h1); if (cds_gene==NULL) { /* no FastaA header */ cds_from = rs.CDS_from; cds_to = rs.CDS_to; } else { line = strstr(h1, "CDS="); if (line && rs.S) { fprintf(stderr, "Warning: Command line CDS specification overrides header CDS specification."); cds_from = rs.CDS_from; cds_to = rs.CDS_to; } else if (line) { cds_range(line+4, &cds_from, &cds_to); } else if (rs.S) { cds_from = rs.CDS_from; cds_to = rs.CDS_to; } else { cds_from = cds_to = 0; } } if (cds_to>len1) fatal("CDS endpoints exceed sequence length."); } } if (rs.poly_flag && file_type==EST_GEN) { get_polyAT(seq1,len1,&pT,&pA,BOTH_AT); } else pT = pA = 0; bld_table(seq1-1+pT, len1-pA-pT, rs.W, INIT); count = 0; while (!count || (seq_read(sf2)>0)) { sim4_stats_t st, rev_st; char *tok; if (count) { /* skip the first seq2, already in memory */ h2 = SEQ_HEAD(sf2); seq2 = SEQ_CHARS(sf2); len2 = SEQ_LEN(sf2); tok = extract_tok(h2); if (!is_DNA(seq2, len2)) { char tmp[200]; (void)sprintf(tmp,"%s sequence is not a DNA sequence.", tok); perror(tmp); continue; } seq_toupper(seq2, len2, argv[2]); } else { /* first sequence in the file, seq2 is already in memory */ tok = extract_tok(h2); if (tok==NULL) { tok = ckalloc(strlen("(no header)")+1); strcpy(tok, "(no header)"); } } if ((rs.ali_flag==5) && (file_type==GEN_EST)) { cds_gene = tok; if (!cds_gene && !count) { cds_from = rs.CDS_from; cds_to = rs.CDS_to; } else if (!count) { line = strstr(h2, "CDS="); if (rs.S) { if (line) fprintf(stderr, "Warning: Command line CDS specification overrides header CDS specification."); cds_from = rs.CDS_from; cds_to = rs.CDS_to; } else if (line) { cds_range(line+4, &cds_from, &cds_to); } } else if (count) { line = strstr(h2, "CDS="); if (line) { cds_range(line+4, &cds_from, &cds_to); } else { 
cds_from = cds_to = 0; } } if (cds_to>len2) fatal("CDS endpoints exceed sequence length."); } if (rs.poly_flag && file_type==GEN_EST) { get_polyAT(seq2, len2, &pT, &pA, BOTH_AT); } ++count; init_stats(&st); init_stats(&rev_st); in_K = (rs.set_K==TRUE) ? rs.K:-1; in_C = (rs.set_C==TRUE) ? rs.C:-1; in_H = (rs.set_H==TRUE) ? rs.weight:-1; switch (rs.reverse) { case 0: Aligns = (file_type==EST_GEN) ? SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C, in_H,&dist,&xpT,&xpA,&Exons,&st): SIM4(seq1,seq2+pT,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C, in_H,&dist,&xpT,&xpA,&Exons,&st); break; case 1: sf2 = seq_revcomp_inplace(sf2); seq2 = SEQ_CHARS(sf2); rev_Aligns = (file_type==EST_GEN) ? SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C, in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st): SIM4(seq1,seq2+pA,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C, in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st); break; case 2: Aligns = (file_type==EST_GEN) ? SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C, in_H,&dist,&xpT,&xpA,&Exons,&st): SIM4(seq1,seq2+pT,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C, in_H,&dist,&xpT,&xpA,&Exons,&st); sf2 = seq_revcomp_inplace(sf2); seq2 = SEQ_CHARS(sf2); rev_Aligns = (file_type==EST_GEN) ? 
SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C, in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st): SIM4(seq1,seq2+pA,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C, in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st); break; default: fatal ("Unrecognized request for EST orientation."); } if (st.nmatches>=rev_st.nmatches) { /* forward ('+') strand match */ match_ori = FWD; if (rs.reverse && rs.ali_flag) { /* reverse-complement back seq2 for alignment */ sf2 = seq_revcomp_inplace(sf2); seq2 = SEQ_CHARS(sf2); } if (rev_Exons) { free_list(rev_Exons); rev_Exons = NULL; } if (rev_Aligns) { free_align(rev_Aligns); rev_Aligns = NULL; } } else { match_ori = BWD; if (Exons) { free_list(Exons); Exons = NULL; } if (Aligns) { free_align(Aligns); Aligns = NULL; } } if (rs.poly_flag) { if (match_ori==FWD) { add_offset_exons(Exons, pT); add_offset_aligns(Aligns, pT); } else { add_offset_exons(rev_Exons,(file_type==EST_GEN)?pT:pA); add_offset_aligns(rev_Aligns, (file_type==EST_GEN)?pT:pA); } } #ifdef DEBUG polystats(pT,pA,xpT,xpA,rev_xpT,rev_xpA,file_type,match_ori); #endif switch (rs.ali_flag) { case 0: (void)printf("\nseq1 = %s, %d bp\n", argv[1], len1); (void)printf("seq2 = %s (%s), %d bp\n",argv[2],tok,len2); if (match_ori==FWD) { if (Exons) (void)printf("\n"); print_exons(Exons); } else { (void)printf("\n(complement)"); if (file_type==EST_GEN) complement_exons(&rev_Exons, len2, len1); if (rev_Exons) (void)printf("\n"); print_exons(rev_Exons); } break; case 1: (void)printf("\nseq1 = %s, %d bp\n", argv[1], len1); (void)printf("seq2 = %s (%s), %d bp\n",argv[2],tok,len2); (void)printf("\n"); if (match_ori==FWD) { (void)printf("\n"); print_align_lat(seq1, seq2, len1, len2, &Aligns, Exons, file_type, FWD); } else { (void)printf("\n(complement)\n"); if (file_type==EST_GEN) { complement_exons(&rev_Exons, len2, len1); sf2 = seq_revcomp_inplace(sf2); seq2 = SEQ_CHARS(sf2); } print_align_lat((file_type==EST_GEN) ? 
revseq1:seq1, seq2,len1,len2,&rev_Aligns,rev_Exons,file_type,BWD); } break; case 2: print_align_header(sf1, sf2, &ds); if (match_ori==FWD) { (void)printf("\n"); print_align_blk(seq1,seq2,len1,len2,&Aligns, file_type,FWD); } else { if (file_type==EST_GEN) { complement_exons(&rev_Exons, len2, len1); sf2 = seq_revcomp_inplace(sf2); seq2 = SEQ_CHARS(sf2); } (void)printf("\n(complement)\n"); print_align_blk((file_type==EST_GEN) ? revseq1:seq1, seq2,len1,len2,&rev_Aligns,file_type,BWD); } break; case 3: (void)printf("\nseq1 = %s, %d bp\n", argv[1], len1); (void)printf("seq2 = %s (%s), %d bp\n",argv[2],tok,len2); (void)printf("\n"); if (match_ori==FWD) { if (Exons) (void)printf("\n"); print_exons(Exons); (void)printf("\n"); print_align_lat(seq1, seq2, len1, len2, &Aligns, Exons, file_type, FWD); } else { if (file_type==EST_GEN) { complement_exons(&rev_Exons, len2, len1); sf2 = seq_revcomp_inplace(sf2); seq2 = SEQ_CHARS(sf2); } (void)printf("\n(complement)"); if (rev_Exons) (void)printf("\n"); print_exons(rev_Exons); (void)printf("\n"); print_align_lat((file_type==EST_GEN) ? 
revseq1:seq1, seq2,len1,len2,&rev_Aligns,rev_Exons,file_type,BWD); } break; case 4: (void)printf("\nseq1 = %s, %d bp\n",argv[1],len1); (void)printf("seq2 = %s (%s), %d bp\n",argv[2],tok,len2); (void)printf("\n%s\n", SEQ_HEAD(sf1)); (void)printf("%s", SEQ_HEAD(sf2)); (void)printf("\n"); if (match_ori==FWD) { if (Exons) (void)printf("\n"); print_exons(Exons); (void)printf("\n"); print_align_lat(seq1,seq2,len1,len2,&Aligns,Exons,file_type,FWD); } else { /* seq2 is already reversed, need only interchange from1<->from2, etc, in Exons; the alignment should be read in the forward orientation */ (void)printf("\n(complement)\n"); if (rev_Exons) (void)printf("\n"); print_exons(rev_Exons); (void)printf("\n"); print_align_lat(seq1,seq2,len1,len2,&rev_Aligns,rev_Exons,file_type,FWD); } break; case 5: if (match_ori==FWD) { (void)printf("\n"); if (file_type==EST_GEN) { print_pipmaker_exons(Exons,Aligns,cds_gene, cds_from,cds_to,len2,len1,seq2,seq1,FWD); } else { print_pipmaker_exons(Exons, Aligns, cds_gene, cds_from,cds_to,len1,len2,seq1,seq2,FWD); } } else { (void)printf("\n"); /* give it the "real" sequences */ sf2 = seq_revcomp_inplace(sf2); seq2 = SEQ_CHARS(sf2); if (file_type==EST_GEN) { complement_exons(&rev_Exons, len2, len1); print_pipmaker_exons(rev_Exons,rev_Aligns,cds_gene, cds_from,cds_to,len2,len1,seq2,seq1,BWD); } else { print_pipmaker_exons(rev_Exons,rev_Aligns,cds_gene, cds_from,cds_to,len1,len2,seq1,seq2,BWD); } } break; default:fatal("Unrecognized option for alignment output."); } #ifdef _STATS print_exon_stats((match_ori==FWD) ? Exons:rev_Exons, (file_type==EST_GEN) ? argv[1]:tok+1, (file_type==EST_GEN) ? len1-pA-pT:len2-pA-pT); print_stats((match_ori==FWD) ? st:rev_st, (file_type==EST_GEN) ? argv[1]:tok+1, (file_type==EST_GEN) ? 
len1-pA-pT:len2-pA-pT); #endif (void)printf("\n"); if (Aligns) { free_align(Aligns); Aligns = NULL; } if (rev_Aligns) { free_align(rev_Aligns); rev_Aligns = NULL; } if (Exons) { free_list(Exons); Exons = NULL; } if (rev_Exons) { free_list(rev_Exons); rev_Exons = NULL; } if (tok) { free(tok); tok = NULL; } } if ((count==1) && (file_type==GEN_EST)) (void)fprintf(stderr,"Try shorter sequence first for better performance.\n"); free_table(); if (file_type==EST_GEN) seq_close(rf1); seq_close(sf1); seq_close(sf2); return 0; } static void print_align_blk(uchar *seq1, uchar *seq2, int len1, int len2, edit_script_list **Aligns, int file_type, int match_ori) { int *S; edit_script_list *head, *aligns; if (*Aligns==NULL) return; aligns = *Aligns; while (aligns!=NULL) { head = aligns; aligns = aligns->next_script; S = (int *)ckalloc((2*head->len2+1+1)*sizeof(int)); S++; S2A(head->script, S, (file_type==EST_GEN) ? 1:0); Free_script(head->script); if (file_type==EST_GEN) { if (match_ori==FWD) { print_align(head->score,seq1,seq2, head->offset2, head->offset2+head->len2-1, head->offset1, head->offset1+head->len1-1,S); } else { align_reverse(S); print_align(head->score,seq1,seq2, len1+1-(head->offset2+head->len2-1), len1+1-head->offset2, len2+1-(head->offset1+head->len1-1), len2+1-head->offset1,S); } } else { /* file_type==GEN_EST */ print_align(head->score, seq1, seq2, head->offset1, head->offset1+head->len1-1, head->offset2, head->offset2+head->len2-1,S); } free(S-1); free(head); } *Aligns = NULL; return; } static void print_align_lat(uchar *seq1, uchar *seq2, int len1, int len2, edit_script_list **Aligns, Exon *Exons, int file_type, int match_ori) { int *S; edit_script_list *head, *aligns; if (*Aligns==NULL) return; aligns = *Aligns; while (aligns!=NULL) { head = aligns; aligns = aligns->next_script; S = (int *)ckalloc((2*head->len2+1+1)*sizeof(int)); S++; S2A(head->script, S, (file_type==1) ? 
1:0); Free_script(head->script); if (file_type==EST_GEN) { if (match_ori==FWD) { IDISPLAY(seq1+ head->offset2-1-1, seq2+ head->offset1-1-1, head->len2, head->len1, S, head->offset2, head->offset1, 1, Exons); } else { align_reverse(S); IDISPLAY(seq1+len1+1-(head->offset2+head->len2-1)-1-1, seq2+len2+1-(head->offset1+head->len1-1)-1-1, head->len2, head->len1, S, len1+1-(head->offset2+head->len2-1), len2+1-(head->offset1+head->len1-1), 1, Exons); } } else { /* file_type==GEN_EST */ IDISPLAY(seq1+ head->offset1-1-1, seq2+ head->offset2-1-1, head->len1, head->len2, S, head->offset1, head->offset2, 2, Exons); } free(S-1); free(head); } *Aligns = NULL; return; } static void sim4_argvals(sim4_args_t *args) { if (!get_argval('A', &(args->ali_flag))) args->ali_flag = 0; if ((args->ali_flag>5) || (args->ali_flag<0)) fatal("A options: 0, 1, 2, 3, 4, 5."); if (!get_argval('P', &(args->poly_flag))) args->poly_flag = 0; if (get_argval('R',&(args->reverse))) { if ((args->reverse<0) || (args->reverse>2)) fatal("Direction R must be 0, 1, or 2."); } else args->reverse = 2; if (get_argval('E', &(args->cutoff))) { if ((args->cutoff < 3) || (args->cutoff > 10)) fatal("Cutoff must be within [3,10]."); } else args->cutoff = DIST_CUTOFF; if (get_argval('D', &(args->DRANGE))) { if (args->DRANGE < 0) fatal("Positive number required for D."); } else args->DRANGE = DEFAULT_DRANGE; if (get_argval('H', &(args->weight))) { if (args->weight<0) fatal("Positive number required for H."); args->set_H = TRUE; } else { /* args->weight = DEFAULT_WEIGHT; */ args->set_H = FALSE; } /* if (get_fargval('v',&V)) { if ((V<.7) || (V>1.0)) fatal("V must be in the range [0.7,1.0]."); } else V = DEFAULT_MIN_COV; */ if (get_argval('W', &(args->W))) { if (args->W < 1) fatal("W must be positive."); if (args->W > 15) fatal("W must be <= 15."); } else args->W = DEFAULT_W; if (get_argval('X', &(args->X))) { if (args->X < 1) fatal("X must be positive."); } else args->X = DEFAULT_X; if (get_argval('K',&(args->K))) { if 
(args->K<0) fatal("K must be positive."); args->set_K = TRUE; } else { args->K = DEFAULT_K; args->set_K = FALSE; } if (get_argval('C',&(args->C))) { if (args->C<0) fatal("C must be positive."); args->set_C = TRUE; } else { args->C = DEFAULT_C; args->set_C = FALSE; } if (!get_argval('N', &(args->acc_flag))) args->acc_flag = 0; if (get_argval('B', &(args->B))) { if (args->B && (args->B!=1)) fatal("B must be either 0 or 1."); } else args->B = 1; if (get_strargval('S', &(args->S))) { cds_range(args->S, &(args->CDS_from), &(args->CDS_to)); if ((args->CDS_from<=0) || (args->CDS_to<=0) || (args->CDS_from>args->CDS_to)) fatal("Illegal endpoints for the CDS region."); } else args->S = NULL; if (args->S && (args->ali_flag!=5)) fatal ("A=5 must accompany CDS specification."); return; } /* extract the CDS endpoints from the command line specification .. */ static void cds_range(char *line, int *from, int *to) { char *s = line; if (line == NULL) fatal ("NULL CDS specification."); if (!isdigit((int)(*s))) fatal("Non-numerical value in the CDS specification."); while (*s && isdigit((int)(*s))) s++; if (*s!='.') fatal ("Illegal CDS specification."); s++; if (*s!='.') fatal ("Illegal CDS specification."); s++; if (!isdigit((int)(*s))) fatal ("Non-numerical value in the CDS specification."); while (*s && isdigit((int)(*s))) s++; if (*s && !isspace((int)(*s))) fatal ("Garbage at the end of the CDS numerical specification."); /* now extract the CDS elements */ if (sscanf(line, "%d..%d", from, to)!=2) fatal ("Error when reading the CDS endpoints."); return; } static void add_offset_exons(Exon *exons, int offset) { Exon *t; if (!offset || !(exons)) return; t = exons; while (t) { if (t->to1) { t->from2 += offset; t->to2 += offset; } t = t->next_exon; } } static void add_offset_aligns(edit_script_list *aligns, int offset) { edit_script_list *head; if (!offset || !aligns) return; head = aligns; while (head) { head->offset2 += offset; head = head->next_script; } return; } static char 
*extract_tok(char *h2) { char *s, *tmp, *q; if ((h2==NULL) || (*h2=='\0')) return NULL; if (*h2!='>') fatal("Not a FASTA header."); s = h2+1; while (isspace((int)(*s)) && *s!='\n') s++; if (*s=='\n') return NULL; q = s; while (*s && !isspace((int)(*s))) s++; tmp = (char *) ckalloc((unsigned int)(s-q+1)); strncpy(tmp, q, (int)(s-q)); tmp[s-q] = '\0'; return tmp; } /* ---------- utilities for collecting and reporting statistics ---------- */ static void init_stats(sim4_stats_t *st) { (void)memset(st,0,sizeof(sim4_stats_t)); } #ifdef _STATS static void print_exon_stats(Exon *exons, char *seq_name, int len) { FILE *xp; Exon *t; xp = ckopen("EXONS","a"); (void)fprintf(xp,"%s: ", seq_name); (void)fprintf(xp,"{\n %3d\n",len); t = exons; while (t!=NULL) { if (t->to1) (void)fprintf(xp," %6d %6d %3d %3d\n", t->from1,t->to1,t->from2,t->to2); t = t->next_exon; } (void)fprintf(xp,"}\n"); if (fclose(xp)==EOF) perror("sim4.init.c: fclose failed."); return; } #endif #ifdef _STATS static void print_stats(sim4_stats_t st, char *seq_name, int len) { FILE *fp; fp = ckopen("SCORES","a"); (void)fprintf(fp,"%s:\t %5.4f (%1d / %1d) %d %f %d", seq_name, st.fcoverage, st.icoverage, len, st.internal, st.marginals, st.nmatches); if (st.mult>1) (void)fprintf(fp," (%d)\n",st.mult); else if (!st.mult) (void)fprintf(fp,"\t\t*\n"); else (void)fprintf(fp,"\n"); if (fclose(fp)==EOF) perror("sim4.init.c: fclose failed."); return; } #endif #ifdef DEBUG static void polystats(int pT,int pA,int xpT,int xpA,int rev_xpT,int rev_xpA, int file_type,int match_ori) { int tmp; if (file_type==EST_GEN) { printf("Poly(EG): %d %d (%d) %d %d (%d)\n", pT, tmp=((match_ori==FWD) ? xpT:rev_xpT), pT+tmp, pA, tmp=((match_ori==FWD) ? xpA:rev_xpA), pA+tmp); } else { printf("Poly(GE): %d %d (%d) %d %d (%d)\n", pT, tmp=((match_ori==FWD) ? xpT:rev_xpA), pT+tmp, pA, tmp=((match_ori==FWD) ? 
xpA:rev_xpT), pA+tmp); } } #endif sim4.2012-10-10/CHANGELOG0000644000515200116500000000011412035303737014025 0ustar floreasalzberg10-10-2012 Changed dna_complement[] in encoding.c to avoid 257 chars array.