poaV2/0040775000765400076540000000000010062157342010215 5ustar poapoapoaV2/align_lpo.c0100644000765400076540000002207210024254524012321 0ustar poapoa
#include "default.h"
#include "poa.h"
#include "seq_util.h"
#include "lpo.h"


/** One traceback cell of the DP matrix (8 bits of bit-fields in total).
    Read back by trace_back_lpo_alignment(). */
typedef struct {
  unsigned char x:6;          /* 1-BASED INDEX OF THE LEFT-LINK OF seq_x TO FOLLOW; 0 = NO MOVE ON X (6 BITS: 0..63) */
  unsigned char y:1;          /* NONZERO = STEP ONE POSITION LEFT ON seq_y */
  unsigned char is_aligned:1; /* NONZERO = THIS (x,y) CELL IS AN ALIGNED PAIR */
} LPOMove_T;

/** Gap length stored per DP cell; an unsigned char, so callers must cap
    the value before storing it. */
typedef unsigned char LPOGapLength_T;


/** Walks the move[][] matrix backwards from (best_x,best_y) and builds the
    two reciprocal alignment maps.

    len_x, seq_x   number of letters and letter array of the partial order
    len_y, seq_y   number of letters and letter array of the second
                   sequence (seq_y itself is not read here; the y step is
                   taken with SEQ_Y_LEFT)
    move           traceback matrix, indexed move[x][y]
    best_x,best_y  cell at which the traceback starts
    x_to_y,y_to_x  if non-NULL, receive the newly allocated maps (this
                   function does not free them in that case); if NULL,
                   the corresponding map is freed before returning

    Both maps are first filled with INVALID_LETTER_POSITION; only cells
    whose is_aligned flag is set overwrite an entry.
    CALLOC, LOOP and SEQ_Y_LEFT are project macros defined outside this
    file -- presumably calloc-with-check, a 0..n-1 index loop and "j-1";
    TODO confirm. */
void trace_back_lpo_alignment(int len_x,LPOLetter_T seq_x[],
			      int len_y,LPOLetter_T seq_y[],
			      LPOMove_T **move,
			      LPOLetterRef_T best_x,
			      LPOLetterRef_T best_y,
			      LPOLetterRef_T **x_to_y,
			      LPOLetterRef_T **y_to_x)
{
  int i;
  LPOLetterRef_T new_x,*x_al=NULL,*y_al=NULL;
  LPOLetterLink_T *left;

  CALLOC(x_al,len_x,LPOLetterRef_T);
  CALLOC(y_al,len_y,LPOLetterRef_T);
  LOOP (i,len_x) x_al[i]= INVALID_LETTER_POSITION;
  LOOP (i,len_y) y_al[i]= INVALID_LETTER_POSITION;

  /* STOPS WHEN EITHER COORDINATE GOES NEGATIVE OR A [0,0] MOVE IS REACHED */
  while (best_x>=0 && best_y>=0) {
    if (move[best_x][best_y].is_aligned) {/* ALIGNED! MAP best_x <--> best_y */
      x_al[best_x]=best_y;
      y_al[best_y]=best_x;
    }

    if (0==move[best_x][best_y].x /* HIT START OF THE ALIGNMENT, SO QUIT */
	&& 0==move[best_x][best_y].y)
      break;

    if ((i=move[best_x][best_y].x)>0) { /* TRACE BACK ON X */
      /* WALK i LINKS ALONG THE ->more CHAIN; new_x ENDS AS THE ipos OF THE
	 iTH LINK.  left IS ADVANCED ONCE MORE AFTER THE LAST ITERATION BUT
	 IS NOT DEREFERENCED AGAIN */
      for (left= &seq_x[best_x].left;i-- >0;left=left->more) /* USE iTH MOVE*/
	new_x = left->ipos;
    }
    else /* NO MOVE ON X */
      new_x=best_x;

    /* best_y IS UPDATED BEFORE best_x, BUT BOTH TESTS ABOVE USED THE OLD
       CELL; new_x HOLDS THE X RESULT UNTIL THE CELL IS NO LONGER NEEDED */
    if (move[best_x][best_y].y>0) /* ASSUMING seq_y A LINEAR SEQUENCE*/
      best_y=SEQ_Y_LEFT(best_y); /* TRACE BACK ON Y */

    best_x=new_x;
  }

  if (x_to_y) /* HAND BACK ALIGNMENT RECIPROCAL MAPPINGS */
    *x_to_y = x_al;
  else
    free(x_al);
  if (y_to_x)
    *y_to_x = y_al;
  else
    free(y_al);

  return;
}


/* FOR CONTROLLING THE FREEING OF score[] ARRAYS */
typedef struct {
  LPOLetterRef_T last_right; /* LAST POSITION WHICH REFERENCES THIS POSITION*/
  LPOLetterRef_T my_pos; /* INDEX OF THIS POSITION */
} LastRightList_T;


/* SORT IN ASCENDING ORDER BY last_right */
/** qsort() comparator over LastRightList_T: returns -1, 0 or 1 according to
    the last_right fields only; my_pos is not consulted, so the relative
    order of entries with equal last_right is left to qsort(). */
int last_right_qsort_cmp(const void *void_a,const void *void_b)
{
  const LastRightList_T *a=(const LastRightList_T *)void_a,
    *b=(const LastRightList_T *)void_b;
  if (a->last_right < b->last_right)
    return -1;
  if (a->last_right == b->last_right)
    return 0;
  else
    return 1;
}


/** Builds, for each of the len positions of seq[], the largest position
    index found on its right-link chain (or the position itself when no
    right link exceeds it), then returns the list sorted ascending by that
    value.  The list is allocated here with CALLOC and returned to the
    caller; nothing in this function frees it. */
LastRightList_T *last_right_list(int len,LPOLetter_T seq[])
{
  int i;
  LastRightList_T *list=NULL;
  LPOLetterLink_T *right;

  CALLOC(list,len,LastRightList_T);
  LOOP (i,len) {
    list[i].last_right=list[i].my_pos=i; /* DEFAULT: NOTHING RIGHT OF THIS*/
    /* SCAN STOPS AT THE END OF THE CHAIN OR AT THE FIRST LINK WITH ipos<0 */
    for (right= &seq[i].right;right && right->ipos>=0;right=right->more)
      if (right->ipos>list[i].last_right)
	list[i].last_right=right->ipos;
  }
  qsort(list,len,sizeof(LastRightList_T),last_right_qsort_cmp);
  return list;
}


/** One recyclable DP row: the score and gap_length buffers that belong
    together.  free_row_list() passes both pointers straight to free(). */
typedef struct {
  LPOScore_T *score;
  LPOGapLength_T *gap_length;
} LPORowFreeList_T;


/** Frees the score and gap_length buffers of entries [0,nfree_list) of
    free_list[], from the last entry down to the first.  The free_list[]
    array itself is not freed here. */
void free_row_list(int nfree_list,LPORowFreeList_T free_list[])
{
  /*  fprintf(stderr,"Maximum #rows allocated was %d\n\n",nfree_list);*/
  while (nfree_list-- >0) { /* DUMP EVERYTHING STORED ON THE FREE LIST */
    free(free_list[nfree_list].score);
    free(free_list[nfree_list].gap_length);
  }
}


/** performs partial order alignment:
    seq_x[] may be a partial order;
    seq_y[] is assumed to be a linear order (regular sequence);
    returns the alignment in x_to_y[] and y_to_x, and also
    returns the alignment score as the return value */
LPOScore_T align_lpo (LPOSequence_T *lposeq_x,
		      LPOSequence_T *lposeq_y,
		      ResidueScoreMatrix_T *m,
		      LPOLetterRef_T **x_to_y,
		      LPOLetterRef_T **y_to_x,
		      int use_global_alignment)
{
  int len_x = lposeq_x->length;
  int len_y = lposeq_y->length;
  LPOLetter_T *seq_x = lposeq_x->letter;
  LPOLetter_T *seq_y = lposeq_y->letter;
  /* best_x, best_y ARE DECLARED WITHOUT AN INITIAL VALUE;
     best_score STARTS AT -999999 */
  int i,j,j_left,best_x,best_y,nfree_list=0,ilast_right=0;
  LPOScore_T **score=NULL,*my_score,*my_matrix;
  LPOMove_T **move=NULL,*move_base=NULL,*my_move;
  LPOGapLength_T **gap_length=NULL,*my_gap_length;
  LPOLetterLink_T *left,*my_left;
  int new_gap_len,insert_x,previous_x,previous_y,
    i_x,new_score,new_x,new_y,current_gap_length;
  LPOScore_T match_score,previous_score,
    insert_x_try,insert_x_score,insert_y_score,best_score= -999999;
  LPORowFreeList_T *free_list=NULL;
  LastRightList_T *last_right=NULL;
  int max_gap_length;
  LPOScore_T *gap_penalty_x,
*gap_penalty_y; max_gap_length = m->max_gap_length; gap_penalty_x = m->gap_penalty_x; gap_penalty_y = m->gap_penalty_y; CALLOC(score,len_x,LPOScore_T *); /* ALLOCATE MATRIX STORAGE: ROW POINTERS */ CALLOC(move,len_x,LPOMove_T *); CALLOC(gap_length,len_x,LPOGapLength_T *); CALLOC(free_list,len_x,LPORowFreeList_T); CALLOC(move_base,len_x*(len_y+1),LPOMove_T); /*ALLOCATE MATRIX RECTANGLE */ last_right=last_right_list(len_x,seq_x); /* GET SORTED LIST TO CONTROL FREE*/ LOOPF (i,len_x) {/* BUILD UP DP MATRIX, ROW BY ROW */ if (nfree_list>0) { /* TAKE THE NEW ROW FROM THE FREE LIST */ nfree_list--; /* MOVE BACK TO LAST FREE LIST ENTRY */ score[i]=free_list[nfree_list].score+1; /* LEAVE SPACE FOR [-1] ENTRY */ gap_length[i]=free_list[nfree_list].gap_length+1; } else { /* NEED TO ALLOCATE A NEW ROW */ CALLOC(score[i],len_y+1,LPOScore_T); score[i]++; /* LEAVE SPACE FOR [-1] ENTRY */ CALLOC(gap_length[i],len_y+1,LPOGapLength_T); gap_length[i]++; /* LEAVE SPACE FOR [-1] ENTRY */ } move[i]=move_base+i*(len_y+1)+1; /* LEAVE SPACE FOR [-1] ENTRY */ my_move=move[i]; /* USED TO SPEED UP MATRIX ACCESS INSIDE INNER LOOP*/ my_score=score[i]; my_gap_length=gap_length[i]; score[i][-1]= -999999; /* UNACCEPTABLE SCORE ENSURES -1 NEVER CHOSEN*/ my_matrix=m->score[seq_x[i].letter]; if (seq_x[i].left.ipos>=0) /* AT LEAST ONE VALID POSITION TO THE LEFT */ my_left= &seq_x[i].left; else /* THERE IS NO POSITION TO THE LEFT */ my_left=NULL; LOOPF (j,len_y) { j_left=SEQ_Y_LEFT(j); /* POSITION TO THE LEFT OF j */ previous_score=previous_x=previous_y=0; i_x=1; insert_x_score= -999999; for (left=my_left;left;left=left->more) { if (move[left->ipos][j].x>0) /* COULD BE [X,0] GAP CONTINUATION */ current_gap_length=gap_length[left->ipos][j]; else/*NOT AN EXTENSION OF A [X,0] GAP, SO TREAT AS START OF NEW GAP*/ current_gap_length=0; insert_x_try=score[left->ipos][j] /* FIND BEST [X,0] MOVE */ + left->score /* INCLUDE WEIGHTING FROM THIS EDGE */ - gap_penalty_x[current_gap_length]; if 
(insert_x_try>insert_x_score) { /* IF BEST insert_x MOVE, SAVE*/ insert_x=i_x; insert_x_score=insert_x_try; new_gap_len=current_gap_length+1; if (new_gap_len>max_gap_length) /* PREVENT OVERFLOW */ new_gap_len=max_gap_length; } /* FIND BEST [X,1] MOVE */ if (score[left->ipos][j_left]+left->score>previous_score) { previous_score=score[left->ipos][j_left]+left->score; previous_x=i_x; previous_y=1; /* ASSUMING seq_y JUST LINEAR SEQUENCE */ } i_x++; /* ADVANCE X MOVE INDEX */ } /* DONE SCANNING PREDECESSORS ON X */ match_score = previous_score /* TAKE BEST PREDECESSOR */ + my_matrix[seq_y[j].letter]; if (match_score>insert_x_score) { /* PREFER [X,1] MOVE */ new_score=match_score; new_x=previous_x; new_y=previous_y; new_gap_len=0; } else { /* PREFER [X,0] MOVE */ new_score=insert_x_score; new_x=insert_x; new_y=0; } /* [0,1] MOVE */ if (my_move[j_left].y==1) /* COULD BE [0,1] GAP CONTINUATION */ current_gap_length=my_gap_length[j_left]; else /* NOT AN EXTENSION OF A [0,1] GAP, SO TREAT AS START OF NEW GAP*/ current_gap_length=0; insert_y_score=my_score[j_left]-gap_penalty_y[current_gap_length]; if (insert_y_scoremax_gap_length) /* PREVENT OVERFLOW */ my_gap_length[j]=max_gap_length; } if (my_score[j]>best_score) { /* RECORD BEST MOVE */ best_score=my_score[j]; best_x=i; best_y=j; } } while (ilast_right=len_x || best_y>=len_y,1.1,(ERRTXT,"Bounds exceeded!\nbest_x,best_y:%d,%d\tlen:%d,%d\n",best_x,best_y,len_x,len_y),CRASH); trace_back_lpo_alignment(len_x,seq_x,len_y,seq_y,move,best_x,best_y, x_to_y,y_to_x); free_row_list(nfree_list,free_list); FREE(move_base); /* DUMP ALLOCATED MATRIX */ FREE(score); FREE(move); FREE(gap_length); FREE(free_list); FREE(last_right); return best_score; } poaV2/align_lpo2.c0100644000765400076540000002374210024254530012405 0ustar poapoa #include "default.h" #include "poa.h" #include "seq_util.h" #include "lpo.h" /** set nonzero for old scoring (gap-opening penalty for X-Y transition) */ #define DOUBLE_GAP_SCORING (0) typedef struct { unsigned 
char x:7; unsigned char y:1; } DPMove_T; typedef struct { LPOScore_T score; short gap_x, gap_y; } DPScore_T; /** is node 'i' the first node in any sequence in lposeq? */ static int is_initial_node (int i, LPOSequence_T *lposeq) { LPOLetterSource_T *src = &((lposeq->letter[i]).source); while (src != NULL && src->iseq >= 0) { if (src->ipos == 0) { return 1; } src = src->more; } return 0; } /** is node 'i' the last node in any sequence in lposeq? */ static int is_final_node (int i, LPOSequence_T *lposeq) { LPOLetterSource_T *src = &((lposeq->letter[i]).source); while (src != NULL && src->iseq >= 0) { if (src->ipos == (lposeq->source_seq[src->iseq]).length - 1) { return 1; } src = src->more; } return 0; } static void get_seq_left_and_final (LPOSequence_T *lposeq_x, LPOLetterLink_T ***x_left_ptr, int **is_final_node_ptr) { int i, len_x = lposeq_x->length; LPOLetter_T *seq_x = lposeq_x->letter; int *is_final_node_x = NULL; LPOLetterLink_T **x_left = NULL; CALLOC (is_final_node_x, len_x, int); for (i=0; iipos = -1; x_left[i]->score = 0; x_left[i]->more = &seq_x[i].left; } else { x_left[i] = &seq_x[i].left; } } (*is_final_node_ptr) = is_final_node_x; (*x_left_ptr) = x_left; } static void trace_back_lpo_alignment (int len_x, int len_y, DPMove_T **move, LPOLetterLink_T **x_left, LPOLetterRef_T best_x, LPOLetterRef_T best_y, LPOLetterRef_T **x_to_y, LPOLetterRef_T **y_to_x) { int i, xmove, ymove; LPOLetterRef_T *x_al = NULL, *y_al = NULL; LPOLetterLink_T *left; CALLOC (x_al, len_x, LPOLetterRef_T); CALLOC (y_al, len_y, LPOLetterRef_T); LOOP (i,len_x) x_al[i] = INVALID_LETTER_POSITION; LOOP (i,len_y) y_al[i] = INVALID_LETTER_POSITION; while (best_x >= 0 && best_y >= 0) { xmove = move[best_y][best_x].x; ymove = move[best_y][best_x].y; if (xmove>0 && ymove>0) { /* ALIGNED! 
MAP best_x <--> best_y */ x_al[best_x]=best_y; y_al[best_y]=best_x; } if (xmove == 0 && ymove == 0) { /* FIRST ALIGNED PAIR */ x_al[best_x]=best_y; y_al[best_y]=best_x; break; /* FOUND START OF ALIGNED REGION, SO WE'RE DONE */ } if (xmove>0) { /* TRACE BACK ON X */ left = x_left[best_x]; while ((--xmove)>0) { left = left->more; } best_x = left->ipos; } if (ymove>0) { /* TRACE BACK ON Y */ best_y--; } } if (x_to_y) /* HAND BACK ALIGNMENT RECIPROCAL MAPPINGS */ *x_to_y = x_al; else free(x_al); if (y_to_x) *y_to_x = y_al; else free(y_al); return; } /** performs partial order alignment: lposeq_x may be a partial order; lposeq_y is assumed to be a linear order (regular sequence); returns the alignment in x_to_y[] and y_to_x[], and also returns the alignment score as the return value. */ LPOScore_T align_lpo (LPOSequence_T *lposeq_x, LPOSequence_T *lposeq_y, ResidueScoreMatrix_T *m, LPOLetterRef_T **x_to_y, LPOLetterRef_T **y_to_x, int use_global_alignment) { int len_x = lposeq_x->length; int len_y = lposeq_y->length; LPOLetter_T *seq_x = lposeq_x->letter; LPOLetter_T *seq_y = lposeq_y->letter; int i, j, xcount, prev_gap, next_gap; int best_x = -2, best_y = -2; LPOScore_T best_score = -999999; int *is_final_node_x; LPOLetterLink_T **x_left = NULL, *xl; DPMove_T **move = NULL, *my_move; DPScore_T *curr_score = NULL, *prev_score = NULL, *init_col_score = NULL, *my_score, *swap; int max_gap_length; LPOScore_T *gap_penalty_x, *gap_penalty_y; int *next_gap_array, *next_perp_gap_array; LPOScore_T try_score, insert_x_score, insert_y_score, match_score; int insert_x_x, insert_x_gap; int insert_y_y, insert_y_gap; int match_x, match_y; long n_edges = 0; max_gap_length = m->max_gap_length; gap_penalty_x = m->gap_penalty_x; gap_penalty_y = m->gap_penalty_y; CALLOC (next_gap_array, max_gap_length + 2, int); CALLOC (next_perp_gap_array, max_gap_length + 2, int); /* GAP LENGTH EXTENSION RULE: */ /* 0->1, 1->2, 2->3, ..., M-1->M, M->M, M+1->M+1. 
*/ for (i=0; imore) { prev_gap = curr_score[xl->ipos].gap_x; try_score = curr_score[xl->ipos].score + xl->score - gap_penalty_x[prev_gap]; if (xcount == 1 || try_score > curr_score[i].score) { curr_score[i].score = try_score; curr_score[i].gap_x = next_gap_array[prev_gap]; curr_score[i].gap_y = next_perp_gap_array[prev_gap]; } } } /* FILL INITIAL COLUMN. */ init_col_score[-1] = curr_score[-1]; for (i=0; imore) { /* IMPROVE XY-MATCH?: trace back to (i-1, j'=xl->ipos) */ try_score = prev_score[xl->ipos].score + xl->score; if (xcount == 1 || try_score > match_score) { match_score = try_score; match_x = xcount; } /* IMPROVE X-INSERTION?: trace back to (i, j'=xl->ipos) */ prev_gap = curr_score[xl->ipos].gap_x; try_score = curr_score[xl->ipos].score + xl->score - gap_penalty_x[prev_gap]; if (xcount == 1 || try_score > insert_x_score) { insert_x_score = try_score; insert_x_x = xcount; insert_x_gap = prev_gap; } } if (0 == use_global_alignment && match_score <= 0) { match_score = 0; match_x = match_y = 0; /* FIRST ALIGNED PAIR */ } n_edges += (xcount-1); match_score += m->score[(int)seq_x[j].letter][(int)seq_y[i].letter]; my_score = &curr_score[j]; my_move = &move[i][j]; if (match_score > insert_y_score && match_score > insert_x_score) { /* XY-MATCH */ my_score->score = match_score; my_score->gap_x = 0; my_score->gap_y = 0; my_move->x = match_x; my_move->y = match_y; } else if (insert_x_score > insert_y_score) { /* X-INSERTION */ my_score->score = insert_x_score; my_score->gap_x = next_gap_array[insert_x_gap]; my_score->gap_y = next_perp_gap_array[insert_x_gap]; my_move->x = insert_x_x; my_move->y = 0; } else { /* Y-INSERTION */ my_score->score = insert_y_score; my_score->gap_x = next_perp_gap_array[insert_y_gap]; my_score->gap_y = next_gap_array[insert_y_gap]; my_move->x = 0; my_move->y = insert_y_y; } /* RECORD BEST START FOR TRACEBACK */ /* KEEPING ONLY FINAL-FINAL BESTS FOR GLOBAL ALIGNMENT */ if (my_score->score >= best_score && (0 == use_global_alignment || 
(is_final_node_x[j] && i==len_y-1))) { if (my_score->score > best_score || (j == best_x && i < best_y) || j < best_x) { best_score = my_score->score; best_x = j; best_y = i; } } } } IF_GUARD(best_x>=len_x || best_y>=len_y,1.1,(ERRTXT,"Bounds exceeded!\nbest_x,best_y:%d,%d\tlen:%d,%d\n",best_x,best_y,len_x,len_y),CRASH); /* fprintf (stderr, "aligned (%d nodes, %ld edges) to (%d nodes)\n", len_x, n_edges/len_y, len_y); fprintf (stderr, "best score %d @ (%d %d)\n", best_score, best_x, best_y); */ /* DYNAMIC PROGRAMING MATRIX COMPLETE, NOW TRACE BACK FROM best_x, best_y */ trace_back_lpo_alignment (len_x, len_y, move, x_left, best_x, best_y, x_to_y, y_to_x); FREE (next_gap_array); FREE (next_perp_gap_array); prev_score = &(prev_score[-1]); FREE (prev_score); curr_score = &(curr_score[-1]); FREE (curr_score); init_col_score = &(init_col_score[-1]); FREE (init_col_score); FREE (is_final_node_x); for (i=0; i=0 && best_y>=0) { if (move[best_x][best_y].is_aligned) {/* ALIGNED! MAP best_x <--> best_y */ x_al[best_x]=best_y; y_al[best_y]=best_x; } if (0==move[best_x][best_y].x /* HIT START OF THE ALIGNMENT, SO QUIT */ && 0==move[best_x][best_y].y) break; if ((i=move[best_x][best_y].x)>0) { /* TRACE BACK ON X */ for (left= &seq_x[best_x].left;--i >0;left=left->more); /* USE iTH MOVE*/ new_x = left->ipos; } else new_x=best_x; if ((i=move[best_x][best_y].y)>0) { /* TRACE BACK ON Y */ for (left= &seq_y[best_y].left;--i >0;left=left->more); /* USE iTH MOVE*/ best_y = left->ipos; } best_x=new_x; } if (x_to_y) /* HAND BACK ALIGNMENT RECIPROCAL MAPPINGS */ *x_to_y = x_al; else free(x_al); if (y_to_x) *y_to_x = y_al; else free(y_al); return; } /** performs partial order alignment: seq_x[] may be a partial order; seq_y[] may be a partial order; returns the alignment in x_to_y[] and y_to_x, and also returns the alignment score as the return value */ LPOScore_T align_lpo_po (LPOSequence_T *lposeq_x, LPOSequence_T *lposeq_y, ResidueScoreMatrix_T *m, LPOLetterRef_T **x_to_y, LPOLetterRef_T 
**y_to_x, LPOScore_T (*scoring_function) (int,int,LPOLetter_T [],LPOLetter_T [],ResidueScoreMatrix_T *), int use_global_alignment) { int len_x = lposeq_x->length; int len_y = lposeq_y->length; LPOLetter_T *seq_x = lposeq_x->letter; LPOLetter_T *seq_y = lposeq_y->letter; int i,j,j_left,best_x,best_y,nfree_list=0,ilast_right=0; LPOScore_T **score=NULL,*my_score,*my_matrix,*score_base=NULL; LPOMove_T **move=NULL,*move_base=NULL,*my_move; LPOGapLength_T **gap_length=NULL,*my_gap_length,*gap_length_base=NULL; LPOLetterLink_T *left,*my_left,*y_left; int new_gap_len,insert_x,previous_x,previous_y, i_x,new_score,new_x,new_y,current_gap_length,i_y,insert_y; LPOScore_T match_score,previous_score, insert_x_try,insert_x_score,insert_y_score,best_score= -999999; int max_gap_length; LPOScore_T *gap_penalty_x, *gap_penalty_y; max_gap_length = m->max_gap_length; gap_penalty_x = m->gap_penalty_x; gap_penalty_y = m->gap_penalty_y; CALLOC(score,len_x,LPOScore_T *); /* ALLOCATE MATRIX STORAGE: ROW POINTERS */ CALLOC(move,len_x,LPOMove_T *); CALLOC(gap_length,len_x,LPOGapLength_T *); CALLOC(score_base,len_x*(len_y+1),LPOScore_T); /*ALLOCATE MATRIX RECTANGLE */ CALLOC(move_base,len_x*(len_y+1),LPOMove_T); /*ALLOCATE MATRIX RECTANGLE */ CALLOC(gap_length_base,len_x*(len_y+1),LPOGapLength_T); /*ALLOCATE MATRIX RECTANGLE */ LOOPF (i,len_x) {/* BUILD UP DP MATRIX, ROW BY ROW */ score[i]=score_base+i*(len_y+1)+1; /* LEAVE SPACE FOR [-1] ENTRY */ move[i]=move_base+i*(len_y+1)+1; /* LEAVE SPACE FOR [-1] ENTRY */ gap_length[i]=gap_length_base+i*(len_y+1)+1; /*LEAVE SPACE FOR [-1] ENTRY*/ my_move=move[i]; /* USED TO SPEED UP MATRIX ACCESS INSIDE INNER LOOP*/ my_score=score[i]; my_gap_length=gap_length[i]; score[i][-1]= -999999; /* UNACCEPTABLE SCORE ENSURES -1 NEVER CHOSEN*/ /* my_matrix=m->score[seq_x[i].letter]; NOT USED */ if (seq_x[i].left.ipos>=0) /* AT LEAST ONE VALID POSITION TO THE LEFT */ my_left= &seq_x[i].left; else /* THERE IS NO POSITION TO THE LEFT */ my_left=NULL; LOOPF (j,len_y) 
{ j_left=SEQ_Y_LEFT(j); /* POSITION TO THE LEFT OF j */ previous_score=previous_x=previous_y=0; i_x=1; insert_x_score= -999999; for (left=my_left;left;left=left->more) { if (move[left->ipos][j].x>0) /* COULD BE [X,0] GAP CONTINUATION */ current_gap_length=gap_length[left->ipos][j]; else/*NOT AN EXTENSION OF A [X,0] GAP, SO TREAT AS START OF NEW GAP*/ current_gap_length=0; insert_x_try=score[left->ipos][j] /* FIND BEST [X,0] MOVE */ + left->score /* INCLUDE WEIGHTING FROM THIS EDGE */ - gap_penalty_x[current_gap_length]; if (insert_x_try>insert_x_score) { /* IF BEST insert_x MOVE, SAVE*/ insert_x=i_x; insert_x_score=insert_x_try; new_gap_len=current_gap_length+1; if (new_gap_len>max_gap_length) /* PREVENT OVERFLOW */ new_gap_len=max_gap_length; } /* FIND BEST [X,Y] MOVE */ if (seq_y[j].left.ipos>=0){/*AT LEAST ONE VALID POSITION TO THE LEFT*/ i_y=1; for (y_left= &seq_y[j].left;y_left;y_left=y_left->more) { if (score[left->ipos][y_left->ipos] + left->score + y_left->score >previous_score) { previous_score=score[left->ipos][y_left->ipos] + left->score + y_left->score; previous_x=i_x; previous_y=i_y; } i_y++; } } i_x++; /* ADVANCE X MOVE INDEX */ } /* DONE SCANNING PREDECESSORS ON X */ match_score = previous_score /* TAKE BEST PREDECESSOR */ + scoring_function(i,j,seq_x,seq_y,m); #ifdef USE_LOCAL_NEUTRALITY_CORRECTION /* NO LONGER USED */ if (seq_x[i].scoreinsert_x_score) { /* PREFER [X,Y] MOVE */ new_score=match_score; new_x=previous_x; new_y=previous_y; new_gap_len=0; } else { /* PREFER [X,0] MOVE */ new_score=insert_x_score; new_x=insert_x; new_y=0; } /* [0,Y] MOVE */ insert_y_score= -999999; if (seq_y[j].left.ipos>=0){/*AT LEAST ONE VALID POSITION TO THE LEFT*/ i_y=1; for (y_left= &seq_y[j].left;y_left;y_left=y_left->more) { if (my_move[y_left->ipos].y>0) /* COULD BE [0,1] GAP CONTINUATION */ current_gap_length=my_gap_length[y_left->ipos]; else/*NOT AN EXTENSION OF A [0,1] GAP, SO TREAT AS START OF NEW GAP*/ current_gap_length=0; if (insert_y_score < 
my_score[y_left->ipos]-gap_penalty_y[current_gap_length]) { insert_y_score=my_score[y_left->ipos]-gap_penalty_y[current_gap_length]; insert_y=i_y; } i_y++; } } if (insert_y_scoremax_gap_length) /* PREVENT OVERFLOW */ my_gap_length[j]=max_gap_length; } if (my_score[j]>best_score) { /* RECORD BEST MOVE */ best_score=my_score[j]; best_x=i; best_y=j; } } } /* DYNAMIC PROGRAMING MATRIX COMPLETE, NOW TRACE BACK FROM best_x,best_y*/ IF_GUARD(best_x>=len_x || best_y>=len_y,1.1,(ERRTXT,"Bounds exceeded!\nbest_x,best_y:%d,%d\tlen:%d,%d\n",best_x,best_y,len_x,len_y),CRASH); trace_back_lpo_po_alignment(len_x,seq_x,len_y,seq_y,move,best_x,best_y, x_to_y,y_to_x); FREE(score_base); /* FREE MEMORY */ FREE(move_base); FREE(gap_length_base); FREE(score); FREE(move); FREE(gap_length); return best_score; } poaV2/align_lpo_po2.c0100644000765400076540000003462210024270031013074 0ustar poapoa #include "default.h" #include "poa.h" #include "seq_util.h" #include "lpo.h" /** set nonzero for old scoring (gap-opening penalty for X-Y transition) */ #define DOUBLE_GAP_SCORING (0) typedef struct { unsigned char x; unsigned char y; } DPMove_T; typedef struct { LPOScore_T score; short gap_x, gap_y; } DPScore_T; #define LPO_INITIAL_NODE 1 #define LPO_FINAL_NODE 2 static void get_lpo_stats (LPOSequence_T *lposeq, int *n_nodes_ptr, int *n_edges_ptr, int **node_type_ptr, int **refs_from_right_ptr, int *max_rows_alloced_ptr, LPOLetterLink_T ***left_links_ptr) { int i, j, rows_alloced = 0, max_rows_alloced = 0, n_edges = 0, len = lposeq->length; LPOLetter_T *seq = lposeq->letter; int *node_type; int *refs_from_right, *tmp; LPOLetterSource_T *src; LPOLetterLink_T *lnk, **left_links; CALLOC (node_type, len, int); CALLOC (refs_from_right, len, int); CALLOC (tmp, len, int); CALLOC (left_links, len, LPOLetterLink_T *); for (i=0; iiseq >= 0; src = src->more) { if (src->ipos == 0) { node_type[i] = (node_type[i] | LPO_INITIAL_NODE); } if (src->ipos == (lposeq->source_seq[src->iseq]).length - 1) { node_type[i] = 
(node_type[i] | LPO_FINAL_NODE); } } /* COUNTING THE LEFT-LINKS BACK TO EACH NODE ALLOWS FOR EFFICIENT */ /* MEMORY MANAGEMENT OF 'SCORE' ROWS (in align_lpo_po). */ for (lnk = &(seq[i].left); lnk != NULL && lnk->ipos >= 0; lnk = lnk->more) { refs_from_right[lnk->ipos]++; n_edges++; } } /* ALL 'INITIAL' NODES (1st in some seq) MUST BE LEFT-LINKED TO -1. */ /* THIS ALLOWS FREE ALIGNMENT TO ANY 'BRANCH' IN GLOBAL ALIGNMENT. */ for (i=0; iipos = -1; left_links[i]->score = 0; left_links[i]->more = &seq[i].left; } else { left_links[i] = &seq[i].left; } } for (i=0; i max_rows_alloced) { max_rows_alloced = rows_alloced; } for (lnk = &(seq[i].left); lnk != NULL && lnk->ipos >= 0; lnk = lnk->more) { if ((--tmp[lnk->ipos]) == 0) { rows_alloced--; } } } FREE (tmp); (*n_nodes_ptr) = len; (*n_edges_ptr) = n_edges; (*node_type_ptr) = node_type; (*refs_from_right_ptr) = refs_from_right; (*max_rows_alloced_ptr) = max_rows_alloced; (*left_links_ptr) = left_links; } static void trace_back_lpo_alignment (int len_x, int len_y, DPMove_T **move, LPOLetterLink_T **x_left, LPOLetterLink_T **y_left, LPOLetterRef_T best_x, LPOLetterRef_T best_y, LPOLetterRef_T **x_to_y, LPOLetterRef_T **y_to_x) { int i, xmove, ymove; LPOLetterRef_T *x_al = NULL, *y_al = NULL; LPOLetterLink_T *left; CALLOC (x_al, len_x, LPOLetterRef_T); CALLOC (y_al, len_y, LPOLetterRef_T); LOOP (i,len_x) x_al[i] = INVALID_LETTER_POSITION; LOOP (i,len_y) y_al[i] = INVALID_LETTER_POSITION; while (best_x >= 0 && best_y >= 0) { xmove = move[best_y][best_x].x; ymove = move[best_y][best_x].y; if (xmove>0 && ymove>0) { /* ALIGNED! 
MAP best_x <--> best_y */ x_al[best_x]=best_y; y_al[best_y]=best_x; } if (xmove == 0 && ymove == 0) { /* FIRST ALIGNED PAIR */ x_al[best_x]=best_y; y_al[best_y]=best_x; break; /* FOUND START OF ALIGNED REGION, SO WE'RE DONE */ } if (xmove>0) { /* TRACE BACK ON X */ left = x_left[best_x]; while ((--xmove)>0) { left = left->more; } best_x = left->ipos; } if (ymove>0) { /* TRACE BACK ON Y */ left = y_left[best_y]; while ((--ymove)>0) { left = left->more; } best_y = left->ipos; } } if (x_to_y) /* HAND BACK ALIGNMENT RECIPROCAL MAPPINGS */ *x_to_y = x_al; else free(x_al); if (y_to_x) *y_to_x = y_al; else free(y_al); return; } /** (align_lpo_po:) performs partial order alignment: lposeq_x and lposeq_y are partial orders; returns the alignment in x_to_y[] and y_to_x[], and also returns the alignment score as the return value. */ LPOScore_T align_lpo_po (LPOSequence_T *lposeq_x, LPOSequence_T *lposeq_y, ResidueScoreMatrix_T *m, LPOLetterRef_T **x_to_y, LPOLetterRef_T **y_to_x, LPOScore_T (*scoring_function) (int, int, LPOLetter_T *, LPOLetter_T *, ResidueScoreMatrix_T *), int use_global_alignment) { LPOLetter_T *seq_x = lposeq_x->letter; LPOLetter_T *seq_y = lposeq_y->letter; int len_x, len_y; int n_edges_x, n_edges_y; int *node_type_x, *node_type_y; int *refs_from_right_x, *refs_from_right_y; int max_rows_alloced_x, max_rows_alloced_y, n_score_rows_alloced = 0; int i, j, xcount, ycount, prev_gap; int best_x = -1, best_y = -1; LPOScore_T min_score = -999999, best_score = -999999; int possible_end_square; LPOLetterLink_T **x_left = NULL, **y_left = NULL, *xl, *yl; DPMove_T **move = NULL, *my_move; DPScore_T *curr_score = NULL, *prev_score = NULL, *init_col_score = NULL, *my_score; DPScore_T **score_rows = NULL; int max_gap_length; LPOScore_T *gap_penalty_x, *gap_penalty_y; int *next_gap_array, *next_perp_gap_array; LPOScore_T try_score, insert_x_score, insert_y_score, match_score; int insert_x_x, insert_x_gap; int insert_y_y, insert_y_gap; int match_x, match_y; 
get_lpo_stats (lposeq_x, &len_x, &n_edges_x, &node_type_x, &refs_from_right_x, &max_rows_alloced_x, &x_left); get_lpo_stats (lposeq_y, &len_y, &n_edges_y, &node_type_y, &refs_from_right_y, &max_rows_alloced_y, &y_left); /* fprintf (stdout, "sequence x: %ld nodes, %ld edges, %ld rows at most --> %ld mem\n", len_x, n_edges_x, max_rows_alloced_x, max_rows_alloced_x * len_y); fprintf (stdout, "sequence y: %ld nodes, %ld edges, %ld rows at most --> %ld mem\n", len_y, n_edges_y, max_rows_alloced_y, max_rows_alloced_y * len_x); */ /* INITIALIZE GAP PENALTIES: */ max_gap_length = m->max_gap_length; gap_penalty_x = m->gap_penalty_x; gap_penalty_y = m->gap_penalty_y; CALLOC (next_gap_array, max_gap_length + 2, int); CALLOC (next_perp_gap_array, max_gap_length + 2, int); for (i=0; i1, 1->2, 2->3, ..., M-1->M; but M->M. */ next_gap_array[i] = (imore) { prev_gap = curr_score[xl->ipos].gap_x; try_score = curr_score[xl->ipos].score + xl->score - gap_penalty_x[prev_gap]; if (try_score > curr_score[i].score) { curr_score[i].score = try_score; curr_score[i].gap_x = next_gap_array[prev_gap]; curr_score[i].gap_y = next_perp_gap_array[prev_gap]; } } } /* FILL INITIAL COLUMN (-1). 
*/ init_col_score[-1] = curr_score[-1]; for (i=0; imore) { prev_gap = init_col_score[yl->ipos].gap_y; try_score = init_col_score[yl->ipos].score + yl->score - gap_penalty_y[prev_gap]; if (try_score > init_col_score[i].score) { init_col_score[i].score = try_score; init_col_score[i].gap_x = next_perp_gap_array[prev_gap]; init_col_score[i].gap_y = next_gap_array[prev_gap]; } } } /** MAIN DYNAMIC PROGRAMMING LOOP **/ /* OUTER LOOP (i-th position in LPO y): */ for (i=0; imore) { prev_score = score_rows[yl->ipos]; /* IMPROVE Y-INSERTION?: trace back to (i'=yl->ipos, j) */ prev_gap = prev_score[j].gap_y; try_score = prev_score[j].score + yl->score - gap_penalty_y[prev_gap]; if (try_score > insert_y_score) { insert_y_score = try_score; insert_y_y = ycount; insert_y_gap = prev_gap; } /* LOOP OVER x-predecessors (INSIDE y-predecessor LOOP): */ for (xcount = 1, xl = x_left[j]; xl != NULL; xcount++, xl = xl->more) { /* IMPROVE XY-MATCH?: trace back to (i'=yl->ipos, j'=xl->ipos) */ try_score = prev_score[xl->ipos].score + xl->score + yl->score; if (try_score > match_score) { match_score = try_score; match_x = xcount; match_y = ycount; } } } /* LOOP OVER x-predecessors (OUTSIDE y-predecessor LOOP): */ for (xcount = 1, xl = x_left[j]; xl != NULL; xcount++, xl = xl->more) { /* IMPROVE X-INSERTION?: trace back to (i, j'=xl->ipos) */ prev_gap = curr_score[xl->ipos].gap_x; try_score = curr_score[xl->ipos].score + xl->score - gap_penalty_x[prev_gap]; if (try_score > insert_x_score) { insert_x_score = try_score; insert_x_x = xcount; insert_x_gap = prev_gap; } } /* USE CUSTOM OR DEFAULT SCORING FUNCTION: */ if (scoring_function != NULL) { match_score += scoring_function (j, i, seq_x, seq_y, m); } else { match_score += m->score[seq_x[i].letter][seq_y[j].letter]; } my_score = &curr_score[j]; my_move = &move[i][j]; if (match_score > insert_y_score && match_score > insert_x_score) { /* XY-MATCH */ my_score->score = match_score; my_score->gap_x = 0; my_score->gap_y = 0; my_move->x = match_x; 
my_move->y = match_y; } else if (insert_x_score > insert_y_score) { /* X-INSERTION */ my_score->score = insert_x_score; my_score->gap_x = next_gap_array[insert_x_gap]; my_score->gap_y = next_perp_gap_array[insert_x_gap]; my_move->x = insert_x_x; my_move->y = 0; } else { /* Y-INSERTION */ my_score->score = insert_y_score; my_score->gap_x = next_perp_gap_array[insert_y_gap]; my_score->gap_y = next_gap_array[insert_y_gap]; my_move->x = 0; my_move->y = insert_y_y; } /* RECORD BEST ALIGNMENT END FOR TRACEBACK: */ if (possible_end_square && my_score->score >= best_score) { /* BREAK TIES BY CHOOSING MINIMUM (x,y): */ if (my_score->score > best_score || (j == best_x && i < best_y) || j < best_x) { best_score = my_score->score; best_x = j; best_y = i; } } } /* UPDATE # OF REFS TO 'SCORE' ROWS; FREE MEMORY WHEN POSSIBLE: */ for (yl = y_left[i]; yl != NULL; yl = yl->more) if ((j = yl->ipos) >= 0) { if ((--refs_from_right_y[j]) == 0) { score_rows[j] = &(score_rows[j][-1]); FREE (score_rows[j]); n_score_rows_alloced--; } } if (refs_from_right_y[i] == 0) { score_rows[i] = &(score_rows[i][-1]); FREE (score_rows[i]); n_score_rows_alloced--; } } IF_GUARD(best_x>=len_x || best_y>=len_y,1.1,(ERRTXT,"Bounds exceeded!\nbest_x,best_y:%d,%d\tlen:%d,%d\n",best_x,best_y,len_x,len_y),CRASH); /**/ fprintf (stderr, "aligned (%d nodes, %ld edges) to (%d nodes, %ld edges): ", len_x, n_edges_x, len_y, n_edges_y); fprintf (stderr, "best %s score = %d @ (%d %d)\n", (use_global_alignment ? 
"global" : "local"), best_score, best_x, best_y); /**/ /* DYNAMIC PROGRAMING MATRIX COMPLETE, NOW TRACE BACK FROM best_x, best_y */ trace_back_lpo_alignment (len_x, len_y, move, x_left, y_left, best_x, best_y, x_to_y, y_to_x); /* CLEAN UP AND RETURN: */ FREE (node_type_x); FREE (node_type_y); FREE (refs_from_right_x); FREE (refs_from_right_y); FREE (next_gap_array); FREE (next_perp_gap_array); score_rows[-1] = &(score_rows[-1][-1]); FREE (score_rows[-1]); score_rows = &(score_rows[-1]); FREE (score_rows); init_col_score = &(init_col_score[-1]); FREE (init_col_score); for (i=0; iscore[seq_x[i].letter][seq_y[j].letter]; /*TRIVIAL SCORING FUNC: JUST USE MATRIX VALUE*/ } poaV2/align_score.h0100644000765400076540000000055510024245152012646 0ustar poapoa#ifndef ALIGN_SCORE_HEADER_INCLUDED #define ALIGN_SCORE_HEADER_INCLUDED #include #include #include /*********************************************************** align_score.c */ LPOScore_T matrix_scoring_function(int i, int j, LPOLetter_T seq_x[], LPOLetter_T seq_y[], ResidueScoreMatrix_T *m); #endif poaV2/black_flag.c0100644000765400076540000000564210024245152012423 0ustar poapoa #include "default.h" /* ~~I */ char ERRTXT[1024] =""; char *Program_name="black_flag"; char *Program_version="unknown"; int Already_reported_crash=0; int black_flag(int bug_level, char sourcefile[], int sourceline, char sourcefile_revision[]) { static int last_line= -1; char *error_names[max_black_flag_type] ={"CRASH","DIED","EXCEPTION","BAD_DATA","WARNING","DEBUG"}; switch (bug_level) { #ifndef DEBUG_USER_VERSION case DEBUG_black_flag_type: /* IN DEV'T VERSION JUST CRASH!!! */ #ifdef DEBUG_VERSION case TRAP_black_flag_type: /* FOR DEBUG VERSION AND TRAP, CAUSE A CORE DUMP*/ #endif #endif case CRASH_black_flag_type: /* COULD INCLUDE MECHANISMS TO SEND EMAIL? */ /* print message at level 1, i.e. 
if we are printing anything at all */
    /* NOTE(review): bug_level indexes error_names[] without a range check */
    PRINT_DEBUG(1,(DBOUT,"black_flag: %s  %s:%s  %s %s,%d\n%s\nend_black_flag\n",
		   error_names[bug_level],
		   Program_name,Program_version,
		   sourcefile,sourcefile_revision,sourceline,
		   ERRTXT[0]? ERRTXT:""));
    /* A CRASH WAS ALREADY REPORTED (handle_crash() SETS THIS FLAG BEFORE
       CALLING US): RETURN INSTEAD OF ABORTING AGAIN.  NOTE THAT THIS EARLY
       RETURN SKIPS THE ERRTXT RESET BELOW */
    if (Already_reported_crash) return 1;
    Already_reported_crash=1;
    abort(); /* FORCE A CORE DUMP */

  default: /* JUST PRINT THE ERROR */
    PRINT_DEBUG(1,(DBOUT,"black_flag: %s  %s:%s  %s %s,%d\n%s\nend_black_flag\n",
		   error_names[bug_level],
		   Program_name,Program_version,
		   sourcefile,sourcefile_revision,sourceline,
		   ERRTXT[0]? ERRTXT:""));
    break;
  }

  ERRTXT[0]='\0'; /* RESET THE ERROR TEXT */
  last_line=sourceline; /* STORED BUT NOT READ ANYWHERE IN THIS FUNCTION */
  return 1; /* SEND SIGNAL TO HANDLER CLAUSE TO DEAL WITH THIS ERROR */
}


/** Signal handler installed by black_flag_init_args() (through
    handle_crash_init()).  sigcode is not used.

    First entry: marks the crash as reported and calls black_flag() at CRASH
    level with empty file/revision strings; because the flag is already set,
    black_flag() prints and returns rather than calling abort().  Then, if
    the environment variable HANDLE_CRASH equals "NOCORE", the process exits
    with status -1; otherwise the handler simply returns.
    Re-entry (flag already set): exits with status -1 immediately. */
void handle_crash(int sigcode)
{
  char *crash_mode;
  if (Already_reported_crash) exit(-1); /* IN ENDLESS LOOP REPORTING CRASH OVER & OVER? */
  Already_reported_crash=1;
  black_flag(CRASH_black_flag_type,"",0,"");
  if ((crash_mode=getenv("HANDLE_CRASH")) && 0==strcmp(crash_mode,"NOCORE"))
    exit(-1);
  else
    return; /*ENVIRONMENT SETTING ASKS US TO DUMP A CORE IMMEDIATELY */
}


/** Installs crash_fun as the handler for SIGSEGV, SIGBUS (when the platform
    defines it; otherwise SIGSEGV is listed a second time), SIGABRT, SIGFPE
    and SIGTRAP.  A NULL crash_fun installs SIG_DFL instead.
    Always returns 0; the return values of signal() are not examined.
    HANDLE_CRASH_MAX is #defined inside the function and never #undef'd, so
    it stays visible for the rest of the translation unit. */
int handle_crash_init(void (*crash_fun)())
{
#define HANDLE_CRASH_MAX 5
  int i,signal_type[HANDLE_CRASH_MAX]
    ={SIGSEGV,
#ifdef SIGBUS /* LINUX DOESN'T HAVE BUS ERRORS? */
      SIGBUS,
#else
      SIGSEGV, /* REUSE SEGV AS DUMMY ENTRY */
#endif
      SIGABRT,SIGFPE,SIGTRAP}; /*LIST OF SIGNALS TO HANDLE*/

  if (!crash_fun) /* NO CRASH FUNCTION SUPPLIED, SO RESET TO DEFAULT */
    crash_fun = SIG_DFL; /* RESET TO STANDARD CRASH BEHAVIOR */

  LOOP (i,HANDLE_CRASH_MAX) /* SET THIS HANDLER FOR ALL OUR SIGNALS */
    signal(signal_type[i],crash_fun);
  return 0;
}


/** Records the program identity used in black_flag() reports and installs
    handle_crash() as the crash handler.

    narg, arg     argument vector; Program_name becomes every arg[i] followed
                  by one space, concatenated in forward order
    progversion   if non-NULL, stored (pointer only, not copied) in
                  Program_version

    The buffer is sized strlen(arg[i])+2 per argument, which covers each
    argument, its trailing space and the final terminator.
    NOTE(review): the strcat() calls need Program_name to start out as an
    empty string -- this assumes CALLOC returns zero-filled memory; confirm
    in default.h.  The previous value of Program_name is not freed here. */
void black_flag_init_args(int narg,char *arg[],char progversion[])
{
  int i,len=0;
  LOOP (i,narg)
    len+=strlen(arg[i])+2;
  CALLOC(Program_name,len,char);
  LOOPF (i,narg) {
    strcat(Program_name,arg[i]);
    strcat(Program_name," ");
  }
  if (progversion)
    Program_version=progversion;
  handle_crash_init(handle_crash);
}


/** Convenience form of black_flag_init_args() for a single program name:
    passes the address of the progname parameter as a one-element argument
    vector. */
void black_flag_init(char progname[],char progversion[])
{
  black_flag_init_args(1,&progname,progversion);
}
poaV2/black_flag.h0100644000765400076540000001620310024245152012423 0ustar poapoa

#ifndef BLACK_FLAG_HEADER_INCLUDED
#define BLACK_FLAG_HEADER_INCLUDED 1

/* NOTE(review): the header name after the #include below is missing in this
   copy of the file; restore it from the original distribution */
#include

/* BUFFER THAT HOLDS THE TEXT OF THE ERROR CURRENTLY BEING REPORTED */
extern char ERRTXT[];

/* STREAM NAMED IN THE PRINT_DEBUG CALLS OF black_flag() */
#define DBOUT stderr

/* ERROR LEVELS.  THE ORDER MATTERS: black_flag() USES THE VALUE AS AN INDEX
   INTO ITS error_names[] TABLE ("CRASH","DIED","EXCEPTION","BAD_DATA",
   "WARNING","DEBUG"), AND max_black_flag_type IS THAT TABLE'S SIZE */
enum {
  CRASH_black_flag_type,
  TRAP_black_flag_type,
  COPE_black_flag_type,
  USERR_black_flag_type,
  WARN_black_flag_type,
  DEBUG_black_flag_type,
  max_black_flag_type
};

/* EMPTY MESSAGE TUPLE -- presumably passed as the MESSAGE argument of the
   IF_* macros, in the same (ERRTXT,"...") form the IF_GUARD calls use;
   TODO confirm against the macro definitions */
#define NOERRMSG (ERRTXT,"")

/********************************************************************
 *
 * IF_PARANOID:
 * IF_PARANOID(CONDITION,REVISION,MESSAGE)
 *
 * error checking that will significantly slow execution or
 * otherwise is desirable only in situations of extreme
 * unction, e.g. during our debugging!!!
 *
 * The beauty of the IF_PARANOID idea is that you should sprinkle
 * it liberally EVERYWHERE in your code without thought for
 * whether it is necessary or might hurt performance,
 * because it will NOT even be compiled into the program in
 * the production version!!!!!!
 *
 * use IF_PARANOID checks EVERYWHERE you can think of
 * definite error signals, even if you think "That shouldn't
 * EVER happen".
* * if the IF_PARANOID CONDITIONAL is TRUE, the program will abort(); * * * * Also, our emacs custom highlighting system has been programmed * to show IF_PARANOID and IF_DEBUG lines in a dim gray, so * that these extensive error checks will not visually obscure * the layout & organization of your algorithms. * * black_flag() is smart about printing both version information * such as the vmake version name the executable was created by, * and also the exact revision number of the file in which the * error occured. * * *------------------------------------------------- * * IF_DEBUG: * IF_DEBUG(CONDITION,REVISION,MESSAGE) * * also not expected to be included in a final production * release, but should not impact performance so significantly that * it's unpleasant to test such a version in regular use patterns. * Classic examples would be fairly pedantic checks at the entry * and exit of all functions, testing for conditions "that shouldn't * ever happen." * * if the IF_DEBUG CONDITIONAL is TRUE, the program will abort(); * * *------------------------------------------------- * * IF_GUARD: * IF_GUARD(CONDITION,REVISION,MESSAGE,LEVEL) * * checks INCLUDED in final production versions, but taking * advantage of the black_flag system to allow us to easily * control what will be done in response to an error * in any given executable, via compile-time flags: * * e.g. * print error info on stderr, or to special log files, * * force a core dump, * * run dbx via an auto script to generate a stack frame, * and mail the results to develop-support@mag.com, * etc. * * IF_GUARD differs from IF_PARANOID and IF_DEBUG in that it * requires an error_level argument, which must be one of * * CRASH ... the error is fatal, abort() * * TRAP ... the error is being trapped, but * not handled. e.g. quiting from * a function because some essential * file was missing... * * COPE ... the error is being handled nicely. * the handler code following IF_GUARD * knows how to correct for the situation. 
* * USERR ... the user appears to have done something * that makes no sense; they must be * confused by our interface. * * WARN ... suspicious data or input, not * definitely an error. * * * *------------------------------------------------- * * Examples: * * IF_PARANOID((iatom<0),4.6,(ERRTXT,"wacky iatom=%d",iatom)); * * IF_DEBUG((iatom<0),4.6,(ERRTXT,"wacky iatom=%d",iatom)); * * IF_GUARD((iatom<0),4.6,(ERRTXT,"wacky iatom=%d",iatom),CRASH); * * IF_GUARD((iatom<0),4.6,(ERRTXT,"wacky iatom=%d",iatom),COPE) { * put some code to handle the error condition here; * } * * USE THE 4.6 KEYWORD TO PUT IN VERSION NUMBERS AUTOMATICALLY. * *******************************************************************/ #if defined(PARANOID_VERSION) /*???????????????????*/ #ifndef DEBUG_VERSION #define DEBUG_VERSION #endif #define IF_PARANOID(CONDITION,REVISION,MESSAGE) \ if (CONDITION) {\ sprintf MESSAGE ;\ black_flag(DEBUG_black_flag_type,__FILE__,__LINE__,STRINGIFY(REVISION));\ } #else /*????????????????????????????????????????????*/ #define IF_PARANOID(CONDITION,REVISION,MESSAGE) #endif /* !PARANOID_VERSION ??????????????????????*/ #if defined(DEBUG_VERSION) /*??????????*/ #define IF_DEBUG(CONDITION,REVISION,MESSAGE) \ if (CONDITION) {\ sprintf MESSAGE ;\ black_flag(DEBUG_black_flag_type,__FILE__,__LINE__,STRINGIFY(REVISION));\ } #else /*???????????????????????????????????????????????????????????*/ #define IF_DEBUG(CONDITION,REVISION,MESSAGE) #endif /* !DEBUG_VERSION ????????????????????????????????????????*/ /********************************************************************* * * IF_GUARD: * the basic black_flag macro, for production version trapping * and handling of errors. * * Essentially adds the flexibility of handling / recording * errors however you like in black_flag(). Also, the programmer * can provide, or omit, a clause following this macro that * will only be executed if the CONDITION is true, allowing * any kind of handling you wish. 
* ********************************************************************/ #define IF_GUARD(CONDITION,REVISION,MESSAGE,LEVEL) \ if ((CONDITION) ? \ (sprintf MESSAGE,\ black_flag(CONCAT_MACRO(LEVEL,_black_flag_type),__FILE__,__LINE__,\ STRINGIFY(REVISION))) : 0) #define WARN_DEBUG(REVISION,MESSAGE,LEVEL) \ (sprintf MESSAGE,\ black_flag(CONCAT_MACRO(LEVEL,_black_flag_type),__FILE__,__LINE__,\ STRINGIFY(REVISION))) #define WARN_MSG(LEVEL,MESSAGE,REVISION) \ (sprintf MESSAGE,\ black_flag(CONCAT_MACRO(LEVEL,_black_flag_type),__FILE__,__LINE__,\ REVISION)) /********************************************************************* * * OUT_OF_BOUNDS: * checks if * MIN <= INDEX < MAX * * e.g. * OUT_OF_BOUNDS(ivar,0,nvar) * ********************************************************************/ #define OUT_OF_BOUNDS(INDEX,MINIMUM_BOUND,MAXIMUM_BOUND) \ ((INDEX)<(MINIMUM_BOUND) || (INDEX)>=(MAXIMUM_BOUND)) void handle_crash(int sigcode); int handle_crash_init(void (*crash_fun)()); int black_flag(int bug_level, char sourcefile[], int sourceline, char sourcefile_revision[]); char *Program_name; char *Program_version; void black_flag_init(char progname[],char progversion[]); void black_flag_init_args(int narg,char *arg[],char progversion[]); #endif poaV2/blosum80.mat0100644000765400076540000000637110024270140012361 0ustar poapoa# Blosum80 # Matrix made by matblas from blosum80.iij # * column uses minimum score # BLOSUM Clustered Scoring Matrix in 1/3 Bit Units # Blocks Database = /data/blocks_5.0/blocks.dat # Cluster Percentage: >= 80 # Entropy = 0.9868, Expected = -0.7442 GAP-PENALTIES=12 6 6 A R N D C Q E G H I L K M F P S T W Y V B Z X ? 
a g t c u ] n A 7 -3 -3 -3 -1 -2 -2 0 -3 -3 -3 -1 -2 -4 -1 2 0 -5 -4 -1 -3 -2 -1 -9 -9 -9 -9 -9 -9 -9 -9 R -3 9 -1 -3 -6 1 -1 -4 0 -5 -4 3 -3 -5 -3 -2 -2 -5 -4 -4 -2 0 -2 -9 -9 -9 -9 -9 -9 -9 -9 N -3 -1 9 2 -5 0 -1 -1 1 -6 -6 0 -4 -6 -4 1 0 -7 -4 -5 5 -1 -2 -9 -9 -9 -9 -9 -9 -9 -9 D -3 -3 2 10 -7 -1 2 -3 -2 -7 -7 -2 -6 -6 -3 -1 -2 -8 -6 -6 6 1 -3 -9 -9 -9 -9 -9 -9 -9 -9 C -1 -6 -5 -7 13 -5 -7 -6 -7 -2 -3 -6 -3 -4 -6 -2 -2 -5 -5 -2 -6 -7 -4 -9 -9 -9 -9 -9 -9 -9 -9 Q -2 1 0 -1 -5 9 3 -4 1 -5 -4 2 -1 -5 -3 -1 -1 -4 -3 -4 -1 5 -2 -9 -9 -9 -9 -9 -9 -9 -9 E -2 -1 -1 2 -7 3 8 -4 0 -6 -6 1 -4 -6 -2 -1 -2 -6 -5 -4 1 6 -2 -9 -9 -9 -9 -9 -9 -9 -9 G 0 -4 -1 -3 -6 -4 -4 9 -4 -7 -7 -3 -5 -6 -5 -1 -3 -6 -6 -6 -2 -4 -3 -9 -9 -9 -9 -9 -9 -9 -9 H -3 0 1 -2 -7 1 0 -4 12 -6 -5 -1 -4 -2 -4 -2 -3 -4 3 -5 -1 0 -2 -9 -9 -9 -9 -9 -9 -9 -9 I -3 -5 -6 -7 -2 -5 -6 -7 -6 7 2 -5 2 -1 -5 -4 -2 -5 -3 4 -6 -6 -2 -9 -9 -9 -9 -9 -9 -9 -9 L -3 -4 -6 -7 -3 -4 -6 -7 -5 2 6 -4 3 0 -5 -4 -3 -4 -2 1 -7 -5 -2 -9 -9 -9 -9 -9 -9 -9 -9 K -1 3 0 -2 -6 2 1 -3 -1 -5 -4 8 -3 -5 -2 -1 -1 -6 -4 -4 -1 1 -2 -9 -9 -9 -9 -9 -9 -9 -9 M -2 -3 -4 -6 -3 -1 -4 -5 -4 2 3 -3 9 0 -4 -3 -1 -3 -3 1 -5 -3 -2 -9 -9 -9 -9 -9 -9 -9 -9 F -4 -5 -6 -6 -4 -5 -6 -6 -2 -1 0 -5 0 10 -6 -4 -4 0 4 -2 -6 -6 -3 -9 -9 -9 -9 -9 -9 -9 -9 P -1 -3 -4 -3 -6 -3 -2 -5 -4 -5 -5 -2 -4 -6 12 -2 -3 -7 -6 -4 -4 -2 -3 -9 -9 -9 -9 -9 -9 -9 -9 S 2 -2 1 -1 -2 -1 -1 -1 -2 -4 -4 -1 -3 -4 -2 7 2 -6 -3 -3 0 -1 -1 -9 -9 -9 -9 -9 -9 -9 -9 T 0 -2 0 -2 -2 -1 -2 -3 -3 -2 -3 -1 -1 -4 -3 2 8 -5 -3 0 -1 -2 -1 -9 -9 -9 -9 -9 -9 -9 -9 W -5 -5 -7 -8 -5 -4 -6 -6 -4 -5 -4 -6 -3 0 -7 -6 -5 16 3 -5 -8 -5 -5 -9 -9 -9 -9 -9 -9 -9 -9 Y -4 -4 -4 -6 -5 -3 -5 -6 3 -3 -2 -4 -3 4 -6 -3 -3 3 11 -3 -5 -4 -3 -9 -9 -9 -9 -9 -9 -9 -9 V -1 -4 -5 -6 -2 -4 -4 -6 -5 4 1 -4 1 -2 -4 -3 0 -5 -3 7 -6 -4 -2 -9 -9 -9 -9 -9 -9 -9 -9 B -3 -2 5 6 -6 -1 1 -2 -1 -6 -7 -1 -5 -6 -4 0 -1 -8 -5 -6 6 0 -3 -9 -9 -9 -9 -9 -9 -9 -9 Z -2 0 -1 1 -7 5 6 -4 0 -6 -5 1 -3 -6 -2 -1 -2 -5 -4 -4 0 6 -1 -9 -9 -9 -9 -9 -9 -9 
-9 X -1 -2 -2 -3 -4 -2 -2 -3 -2 -2 -2 -2 -2 -3 -3 -1 -1 -5 -3 -2 -3 -1 -2 -9 -9 -9 -9 -9 -9 -9 -9 ? -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 a -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 4 -2 -2 -2 -2 -9 0 g -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 4 -2 -2 -2 -9 0 t -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 -2 4 -2 4 -9 0 c -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 -2 -2 4 -2 -9 0 u -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 -2 4 -2 4 -9 0 ] -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 n -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 0 0 0 0 0 -9 0 poaV2/blosum80_trunc.mat0100644000765400076540000000644510024251716013606 0ustar poapoa# Blosum80 # Matrix made by matblas from blosum80.iij # * column uses minimum score # BLOSUM Clustered Scoring Matrix in 1/3 Bit Units # Blocks Database = /data/blocks_5.0/blocks.dat # Cluster Percentage: >= 80 # Entropy = 0.9868, Expected = -0.7442 GAP-TRUNCATION-LENGTH=10 GAP-DECAY-LENGTH=5 GAP-PENALTIES=12 6 0 A R N D C Q E G H I L K M F P S T W Y V B Z X ? 
a g t c u ] n A 7 -3 -3 -3 -1 -2 -2 0 -3 -3 -3 -1 -2 -4 -1 2 0 -5 -4 -1 -3 -2 -1 -9 -9 -9 -9 -9 -9 -9 -9 R -3 9 -1 -3 -6 1 -1 -4 0 -5 -4 3 -3 -5 -3 -2 -2 -5 -4 -4 -2 0 -2 -9 -9 -9 -9 -9 -9 -9 -9 N -3 -1 9 2 -5 0 -1 -1 1 -6 -6 0 -4 -6 -4 1 0 -7 -4 -5 5 -1 -2 -9 -9 -9 -9 -9 -9 -9 -9 D -3 -3 2 10 -7 -1 2 -3 -2 -7 -7 -2 -6 -6 -3 -1 -2 -8 -6 -6 6 1 -3 -9 -9 -9 -9 -9 -9 -9 -9 C -1 -6 -5 -7 13 -5 -7 -6 -7 -2 -3 -6 -3 -4 -6 -2 -2 -5 -5 -2 -6 -7 -4 -9 -9 -9 -9 -9 -9 -9 -9 Q -2 1 0 -1 -5 9 3 -4 1 -5 -4 2 -1 -5 -3 -1 -1 -4 -3 -4 -1 5 -2 -9 -9 -9 -9 -9 -9 -9 -9 E -2 -1 -1 2 -7 3 8 -4 0 -6 -6 1 -4 -6 -2 -1 -2 -6 -5 -4 1 6 -2 -9 -9 -9 -9 -9 -9 -9 -9 G 0 -4 -1 -3 -6 -4 -4 9 -4 -7 -7 -3 -5 -6 -5 -1 -3 -6 -6 -6 -2 -4 -3 -9 -9 -9 -9 -9 -9 -9 -9 H -3 0 1 -2 -7 1 0 -4 12 -6 -5 -1 -4 -2 -4 -2 -3 -4 3 -5 -1 0 -2 -9 -9 -9 -9 -9 -9 -9 -9 I -3 -5 -6 -7 -2 -5 -6 -7 -6 7 2 -5 2 -1 -5 -4 -2 -5 -3 4 -6 -6 -2 -9 -9 -9 -9 -9 -9 -9 -9 L -3 -4 -6 -7 -3 -4 -6 -7 -5 2 6 -4 3 0 -5 -4 -3 -4 -2 1 -7 -5 -2 -9 -9 -9 -9 -9 -9 -9 -9 K -1 3 0 -2 -6 2 1 -3 -1 -5 -4 8 -3 -5 -2 -1 -1 -6 -4 -4 -1 1 -2 -9 -9 -9 -9 -9 -9 -9 -9 M -2 -3 -4 -6 -3 -1 -4 -5 -4 2 3 -3 9 0 -4 -3 -1 -3 -3 1 -5 -3 -2 -9 -9 -9 -9 -9 -9 -9 -9 F -4 -5 -6 -6 -4 -5 -6 -6 -2 -1 0 -5 0 10 -6 -4 -4 0 4 -2 -6 -6 -3 -9 -9 -9 -9 -9 -9 -9 -9 P -1 -3 -4 -3 -6 -3 -2 -5 -4 -5 -5 -2 -4 -6 12 -2 -3 -7 -6 -4 -4 -2 -3 -9 -9 -9 -9 -9 -9 -9 -9 S 2 -2 1 -1 -2 -1 -1 -1 -2 -4 -4 -1 -3 -4 -2 7 2 -6 -3 -3 0 -1 -1 -9 -9 -9 -9 -9 -9 -9 -9 T 0 -2 0 -2 -2 -1 -2 -3 -3 -2 -3 -1 -1 -4 -3 2 8 -5 -3 0 -1 -2 -1 -9 -9 -9 -9 -9 -9 -9 -9 W -5 -5 -7 -8 -5 -4 -6 -6 -4 -5 -4 -6 -3 0 -7 -6 -5 16 3 -5 -8 -5 -5 -9 -9 -9 -9 -9 -9 -9 -9 Y -4 -4 -4 -6 -5 -3 -5 -6 3 -3 -2 -4 -3 4 -6 -3 -3 3 11 -3 -5 -4 -3 -9 -9 -9 -9 -9 -9 -9 -9 V -1 -4 -5 -6 -2 -4 -4 -6 -5 4 1 -4 1 -2 -4 -3 0 -5 -3 7 -6 -4 -2 -9 -9 -9 -9 -9 -9 -9 -9 B -3 -2 5 6 -6 -1 1 -2 -1 -6 -7 -1 -5 -6 -4 0 -1 -8 -5 -6 6 0 -3 -9 -9 -9 -9 -9 -9 -9 -9 Z -2 0 -1 1 -7 5 6 -4 0 -6 -5 1 -3 -6 -2 -1 -2 -5 -4 -4 0 6 -1 -9 -9 -9 -9 -9 -9 -9 
-9 X -1 -2 -2 -3 -4 -2 -2 -3 -2 -2 -2 -2 -2 -3 -3 -1 -1 -5 -3 -2 -3 -1 -2 -9 -9 -9 -9 -9 -9 -9 -9 ? -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 a -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 4 -2 -2 -2 -2 -9 0 g -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 4 -2 -2 -2 -9 0 t -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 -2 4 -2 4 -9 0 c -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 -2 -2 4 -2 -9 0 u -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 -2 4 -2 4 -9 0 ] -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 n -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 0 0 0 0 0 -9 0 poaV2/buildup_lpo.c0100644000765400076540000004613510024245152012676 0ustar poapoa #include "default.h" #include "poa.h" #include "seq_util.h" #include "lpo.h" /** if two align-rings are aligned to each other, make sure that the (single) aligned residue pair consists of identical residues, if possible. : ((a,-),(b,-),(c,d),(-,a),(-b)) ==> ((a,a),(b,-),(c,-),(-,d),(-b)) OR ((a,-),(b,b),(c,-),(-,d)). : ((a,-),(c,d),(-,b)) ==> self. */ void fuse_ring_identities(int len_x,LPOLetter_T seq_x[], int len_y,LPOLetter_T seq_y[], LPOLetterRef_T al_x[], LPOLetterRef_T al_y[]) { int i,j; LOOP (i,len_y) { if (al_y[i]<0 || seq_x[al_y[i]].letter == seq_y[i].letter) continue; /* NOT ALIGNED, OR ALREADY IDENTICAL, SO SKIP */ for (j=seq_x[al_y[i]].align_ring;j!=al_y[i];j=seq_x[j].align_ring) { if (seq_x[j].letter == seq_y[i].letter) { /* IDENTICAL! SO FUSE! */ al_x[al_y[i]]= INVALID_LETTER_POSITION; /* DISCONNECT FROM OLD */ al_y[i]=j; /* CONNECT TO NEW IDENTITY */ al_x[j]=i; break; /* SEARCH YE NO FURTHER */ } } } } /** if two align-rings are aligned to each other, make sure that as many identical-residue pairs are fused as possible. 
: ((a,-),(b,-),(c,d),(-,a),(-b)) ==> ((a,a),(b,b),(c,-),(-,d)). : ((a,-),(c,d),(-,b)) ==> self. NB: THIS DOES NOT WORK with the current fuse_lpo function. The fusion */ void full_fuse_ring_identities (int len_x, LPOLetter_T *seq_x, int len_y, LPOLetter_T *seq_y, LPOLetterRef_T *al_x, LPOLetterRef_T *al_y) { int i, ip, j, jp; /* i LABELS POS IN seq_x, j LABELS POS IN seq_y */ int ck; /* WAS IDENTICAL-RESIDUE PAIR FOUND? */ for (i=0; i= 0) { al_x[i] = al_y[j] = INVALID_LETTER_POSITION; /* DISCONNECT FROM OLD */ /* i,j ARE AN ALIGNED PAIR. WALK THROUGH RESPECTIVE RINGS: */ ip=i; jp=j; ck=0; do /* while (ip!=i) */ { do /* while (jp!=j) */ { if (seq_x[ip].letter == seq_y[jp].letter) { /* IDENTICAL! SO FUSE! */ al_x[ip] = jp; al_y[jp] = ip; ck=1; jp=j; /* EXIT TO OUTER LOOP... AT MOST ONE FUSED TO EACH POS IN seq_y. */ } else { jp = seq_y[jp].align_ring; } } while (jp!=j); ip = seq_x[ip].align_ring; } while (ip!=i); if (ck==0) { /* NO IDENTICAL-RESIDUE PAIR FOUND, SO RECONNECT ORIGINAL */ al_x[i] = j; al_y[j] = i; } } } /** aligns the sequences in seq[] to the sequence or partial order in new_seq; seq[] must be linear orders (regular sequences); the alignment is built up by iterative partial order alignment, and the resulting partial order is returned in new_seq */ LPOSequence_T *buildup_lpo(LPOSequence_T *new_seq, int nseq,LPOSequence_T seq[], ResidueScoreMatrix_T *score_matrix, int use_aggressive_fusion, int use_global_alignment) { int i,max_alloc=0,total_alloc; LPOLetterRef_T *al1=NULL,*al2=NULL; lpo_index_symbols(new_seq,score_matrix); /* MAKE SURE LPO IS TRANSLATED */ for (i=0;ilength*seq[i].length + sizeof(LPOLetter_T)*new_seq->length; if (total_alloc>max_alloc) { /* DP RECTANGLE ARRAY SIZE */ max_alloc=total_alloc; #ifdef REPORT_MAX_ALLOC fprintf(stderr,"max_alloc: %d bytes\n",max_alloc); #endif if (max_alloc>POA_MAX_ALLOC) { WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",max_alloc),"$Revision: 1.2.2.9 $"); break; /* JUST RETURN AND FINISH */ } } 
align_lpo_po (new_seq,&seq[i], score_matrix,&al1,&al2,NULL,use_global_alignment); /* ALIGN ONE MORE SEQ */ if (use_aggressive_fusion) fuse_ring_identities(new_seq->length,new_seq->letter, seq[i].length,seq[i].letter,al1,al2); fuse_lpo(new_seq,seq+i,al1,al2); /* BUILD COMPOSITE LPO */ free_lpo_letters(seq[i].length,seq[i].letter,TRUE);/*NO NEED TO KEEP*/ seq[i].letter=NULL; /* MARK AS FREED... DON'T LEAVE DANGLING POINTER! */ FREE(al1); /* DUMP TEMPORARY MAPPING ARRAYS */ FREE(al2); } return new_seq; } /**@memo example: aligning a set of sequences to a partial order: lpo_out=buildup_lpo(lpo_in,nseq,seq,&score_matrix); */ /** CLIPS seq->letter[] TO JUST THE SEGMENT ALIGNED TO letter_x[] via al_x[] DOES *NOT* FREE existing seq->letter[]; YOU MUST KEEP IT OR FREE IT YOURSELF*/ int clip_unaligned_ends(LPOSequence_T *seq, LPOLetterRef_T al[], int len_x,LPOLetter_T letter_x[], LPOLetterRef_T al_x[],int *offset,int *match_length) { int i,j=0,start,end,new_length,allow_end_length=0,nidentity=0; LPOLetter_T *temp=NULL; CALLOC(temp,seq->length,LPOLetter_T); /* ALLOCATE NEW letter[] COPY */ for (start=0;startlength;start++) /* FIND 1ST ALIGNED POS */ if (al[start]>=0) break; for (end=seq->length -1;end>=0;end--) /* FIND LAST ALIGNED POS */ if (al[end]>=0) break; for (i=start;i<=end;i++) /* COUNT IDENTITIES TO letter_x[] */ if (al[i]>=0 && seq->letter[i].letter==letter_x[al[i]].letter) nidentity++; if (match_length) /* RETURN THE MATCH LENGTH TO THE CALLER */ *match_length = end-start+1; if (start>allow_end_length) /* ALLOW EXTRA RESIDUES ON EITHER END*/ start-=allow_end_length; else /* KEEP IN BOUNDS */ start=0; if (end+allow_end_lengthlength) end+=allow_end_length; else /* KEEP IN BOUNDS */ end=seq->length-1; LOOP (i,len_x) /* WE ARE SHIFTING al TO THE RIGHT BY start POSITIONS */ if (al_x[i]>=0) /* SO WE HAVE TO TRANSLATE al_x CORRESPONDINGLY */ al_x[i]-= start; seq->length=end-start+1; /* NOW TRANSLATE left, right, align_ring, ring_id*/ 
memcpy(temp,seq->letter+start,sizeof(LPOLetter_T)*(seq->length)); LOOP (i,seq->length) { /* THIS *ONLY* WORKS FOR PURE LINEAR SEQUENCE!!! */ temp[i].left.ipos -= start; /*IF <0, BECOMES INVALID BY DEFINITION, OK*/ temp[i].right.ipos -= start; if (temp[i].right.ipos>=seq->length) /* PAST THE NEW, CLIPPED END */ temp[i].right.ipos= INVALID_LETTER_POSITION; temp[i].ring_id=temp[i].align_ring=i; } if (offset) /* RETURN THE OFFSET TO THE CALLER */ *offset = start; seq->letter=temp; /* NEW START: FIRST ALIGNED POSITION */ return nidentity; /* NEW LENGTH: FROM 1ST TO LAST ALIGNED POS*/ } void restore_lpo_size(LPOSequence_T *seq,int length,LPOLetter_T *letter) { free_lpo_letters(seq->length,seq->letter,TRUE); /* DUMP CLIPPED VERSION*/ seq->length=length; /* RESTORE ORIGINAL length AND letter[] */ seq->letter=letter; } /** BUILDS UP ALIGNMENT, BUT CLIPS UNALIGNED ENDS OF EACH NEW SEQUENCE ADDED ------------------------------------------------------- --------------------------------------------------------------------------- */ LPOSequence_T *buildup_clipped_lpo(LPOSequence_T *new_seq, int nseq,LPOSequence_T seq[], ResidueScoreMatrix_T *score_matrix, int use_global_alignment) { int i,ntemp,offset=0,nidentity,length_max=0,match_length=0; int total_alloc,max_alloc=0; LPOLetterRef_T *al1=NULL,*al2=NULL; LPOLetter_T *temp; float identity_max=0.,f; lpo_index_symbols(new_seq,score_matrix); /* MAKE SURE LPO IS TRANSLATED */ for (i=0;ilength*seq[i].length + sizeof(LPOLetter_T)*new_seq->length; if (total_alloc>max_alloc) { /* DP RECTANGLE ARRAY SIZE */ max_alloc=total_alloc; #ifdef REPORT_MAX_ALLOC fprintf(stderr,"max_alloc: %d bytes (%d x %d)\n",max_alloc, new_seq->length,seq[i].length); #endif if (max_alloc>POA_MAX_ALLOC) { WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",max_alloc),"$Revision: 1.2.2.9 $"); break; /* JUST RETURN AND FINISH */ } } align_lpo_po (new_seq, &seq[i], score_matrix,&al1,&al2,NULL,use_global_alignment); /* ALIGN ONE MORE SEQ */ 
ntemp=seq[i].length; /* SAVE letter[] BEFORE CLIPPING IT TO ALIGNED AREA*/ temp=seq[i].letter; if ((nidentity=clip_unaligned_ends(seq+i,al2,/*THERE IS AN ALIGNED REGION*/ new_seq->length,new_seq->letter,al1,&offset, &match_length))>0) { f=nidentity/(float)match_length; /* CALCULATE IDENTITY FRACTION */ if (0==i /*f>identity_max*/) { /* REPORT IDENTITY OF TOP HIT */ identity_max=nidentity; length_max=match_length; } fuse_lpo(new_seq,seq+i,al1,al2+offset); /*ADD CLIPPED REGION TO LPO*/ } restore_lpo_size(seq+i,ntemp,temp); /* REVERT FROM CLIPPED TO ORIGINAL*/ FREE(al1); /* DUMP TEMPORARY MAPPING ARRAYS FROM align_lpo() */ FREE(al2); } fprintf(stderr,"%s\tmaximum identity\t%3.1f%%\t%.0f/%d\n",new_seq->name, 100*identity_max/length_max,identity_max,length_max); return new_seq; } /** which LPOSeq is called, or holds a sequence called, `name'? */ int find_seq_name (int nseq, LPOSequence_T **seq, char name[]) { int i,j; for (i=0;iname,name)) return i; for (j=0;jnsource_seq;j++) { if (0==strcmp(seq[i]->source_seq[j].name,name)) return i; } } return -1; } typedef struct { double score; int i; int j; } SeqPairScore_T; /* SORT IN DESCENDING ORDER BY score (SO HIGH SIMILARITY SCORES MERGE FIRST). */ /* FOR TIES, USE ITERATIVE MERGE ORDER (1-2, then 1-3, then 1-4, etc.) 
*/ int seqpair_score_qsort_cmp (const void *void_a, const void *void_b) { const SeqPairScore_T *a = (const SeqPairScore_T *)void_a; const SeqPairScore_T *b = (const SeqPairScore_T *)void_b; if (a->score > b->score) return -1; else if (a->score < b->score) return 1; if (a->i > b->i) return 1; else if (a->i < b->i) return -1; if (a->j > b->j) return 1; else if (a->j < b->j) return -1; return 0; } SeqPairScore_T *read_seqpair_scorefile (int nseq, LPOSequence_T **seq, ResidueScoreMatrix_T *score_matrix, LPOScore_T (*scoring_function) (int,int,LPOLetter_T [],LPOLetter_T [], ResidueScoreMatrix_T *), int use_global_alignment, int do_progressive, FILE *ifile, int *p_nscore) { int i,j,nscore=0,max_nscore=0; int *adj_score = NULL; SeqPairScore_T *score_list=NULL; LPOLetterRef_T *al1=NULL,*al2=NULL; double x, min_score=0.0; char name1[256],name2[256]; CALLOC (adj_score, nseq, int); max_nscore = nseq*nseq; CALLOC (score_list, max_nscore, SeqPairScore_T); if (ifile) { /* IF PAIR SCORE FILE (PROGRESSIVE ASSUMED) */ while (fscanf(ifile," %s %s %lf",name1,name2,&x)==3) { /* READ SCORE FILE */ i=find_seq_name(nseq,seq,name1); j=find_seq_name(nseq,seq,name2); if (i<0 || j<0) { WARN_MSG(USERR,(ERRTXT,"invalid sequence pair, not found: %s,%s",name1,name2),"$Revision: 1.2.2.9 $"); FREE (score_list); FREE (adj_score); return NULL; } /* fprintf(stderr,"i=%d,j=%d,x=%.2f\n",i,j,x); */ fprintf(stderr,"Saving score from file %d (%s), %d (%s) : %.2f\n",i,name1,j,name2,x); if (iname,j,seq[j]->name,x); score_list[nscore].i = i; score_list[nscore].j = j; score_list[nscore].score = x; if (xletter == NULL) { initialize_seqs_as_lpo(1,all_seqs[i],score_matrix); } lpo_index_symbols(all_seqs[i],score_matrix); /* MAKE SURE LPO IS TRANSLATED */ } /* RETURN IF NOTHING TO ALIGN */ if (nseq<=0) return NULL; else if (nseq==1) return all_seqs[0]; new_seq = all_seqs[0]; CALLOC(seq_cluster,nseq,int); /* MAPS SEQS (or CLUSTERS) TO CLUSTER THEY'RE IN */ CALLOC(seq_id_in_cluster,nseq,int); /* INDEXES SEQS (or 
CLUSTERS) WITHIN EACH CLUSTER */ CALLOC(cluster_size,nseq,int); /* COUNTS SEQS IN SAME CLUSTER (updated w/ merges) */ CALLOC(initial_nseq,nseq,int); /* COUNTS SEQS INITIALLY IN EACH CLUSTER (not updated w/ merges) */ for (i=nseq_tot=0;insource_seq; initial_nseq[i] = cluster_size[i]; nseq_tot += cluster_size[i]; } if (score_file) { ifile=fopen(score_file,"r"); if (ifile==NULL) { WARN_MSG(USERR,(ERRTXT,"Error reading pair score file %s.\nExiting", score_file),"$Revision: 1.2.2.9 $"); goto free_and_exit; } } else { ifile = NULL; } score = read_seqpair_scorefile(nseq,all_seqs,score_matrix,scoring_function,use_global_alignment, do_progressive,ifile,&nscore); if (score==NULL) { WARN_MSG(USERR,(ERRTXT,"Error generating pair scores (file %s).\nExiting", score_file ? score_file : "unspecified"),"$Revision: 1.2.2.9 $"); goto free_and_exit; } if (ifile) fclose (ifile); for (iscore=0;iscore %d (%s, nseq=%d)... score %.2f\n", cluster_j,all_seqs[cluster_j]->name,all_seqs[cluster_j]->nsource_seq, cluster_i,all_seqs[cluster_i]->name,all_seqs[cluster_i]->nsource_seq, score[iscore].score); new_seq = all_seqs[cluster_i]; total_alloc = new_seq->length * (sizeof(LPOLetter_T) + all_seqs[cluster_j]->length); if (total_alloc>max_alloc) { /* DP RECTANGLE ARRAY SIZE */ max_alloc=total_alloc; #ifdef REPORT_MAX_ALLOC fprintf(stderr,"max_alloc: %d bytes\n",max_alloc); #endif if (max_alloc>POA_MAX_ALLOC) { WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",max_alloc),"$Revision: 1.2.2.9 $"); break; /* JUST RETURN AND FINISH */ } } #ifdef USE_LOCAL_NEUTRALITY_CORRECTION /* NO LONGER USED */ if (score_matrix->nfreq>0) { /* CALCULATE BALANCED SCORING ON EACH PO */ balance_matrix_score(new_seq->length,new_seq->letter,score_matrix); balance_matrix_score(all_seqs[cluster_j]->length,all_seqs[cluster_j]->letter, score_matrix); } #endif buildup_pairwise_lpo(new_seq,all_seqs[cluster_j],score_matrix, use_aggressive_fusion, scoring_function,use_global_alignment); LOOP (i,nseq) { /* APPEND ALL 
MEMBERS OF cluster_j TO cluster_i */ if (seq_cluster[i] == cluster_j) { seq_cluster[i] = cluster_i; seq_id_in_cluster[i] += cluster_size[cluster_i]; } } cluster_size[cluster_i] += cluster_size[cluster_j]; cluster_size[cluster_j] = 0; } if (preserve_sequence_order) { /* PUT SEQUENCES WITHIN LPO BACK IN THEIR ORIGINAL ORDER: */ int *perm; CALLOC (perm, nseq_tot, int); for (i=nseq_tot=0; ilength,seq1->letter, seq2->length,seq2->letter,al1,al2); fuse_lpo(seq1,seq2,al1,al2); /* BUILD COMPOSITE LPO */ /* FREE LETTERS IN SECOND LPO */ free_lpo_letters(seq2->length,seq2->letter,TRUE); seq2->letter=NULL; /*MARK AS FREED. DON'T LEAVE DANGLING POINTER*/ FREE(al1); /* DUMP TEMPORARY MAPPING ARRAYS */ FREE(al2); return seq1; /* RETURN THE FINAL LPO */ } poaV2/create_seq.c0100644000765400076540000000224110024245152012461 0ustar poapoa #include "default.h" #include "seq_util.h" void save_sequence_fields(Sequence_T *seq, char seq_name[],char seq_title[],int length) { STRNCPY(seq->name,seq_name,SEQUENCE_NAME_MAX); if (seq_title) seq->title=strdup(seq_title); else seq->title=strdup("untitled"); seq->length=length; /* SAVE LENGTH */ } int create_seq(int nseq,Sequence_T **p_seq, char seq_name[],char seq_title[],char tmp_seq[], int do_switch_case) { int i,j; Sequence_T *seq; REBUFF(*p_seq,nseq,SEQUENCE_BUFFER_CHUNK,Sequence_T); /* ALLOCATE MEMORY*/ seq= (*p_seq)+nseq; /* SET POINTER TO NEWLY ALLOCATED ELEMENT */ for (i=j=0;tmp_seq[i];i++) /* ELIMINATE WHITE SPACE */ if (!isspace(tmp_seq[i])) tmp_seq[j++]=tmp_seq[i]; tmp_seq[j]='\0'; /* TERMINATE COMPRESSED STRING*/ seq->sequence=strdup(tmp_seq); /* SAVE A DYNAMIC COPY */ save_sequence_fields(seq,seq_name,seq_title,j); switch (do_switch_case) { case switch_case_to_lower: LOOP (i,seq->length) seq->sequence[i]=tolower(tmp_seq[i]); break; case switch_case_to_upper: LOOP (i,seq->length) seq->sequence[i]=toupper(tmp_seq[i]); break; } return 1; } poaV2/default.h0100644000765400076540000001640710024245152012010 0ustar poapoa#ifndef 
DEFAULT_HEADER_INCLUDED #define DEFAULT_HEADER_INCLUDED 1 #ifndef MODULE_NAME #define MODULE_NAME "main" #endif #include #include #include #include #include #include #include "black_flag.h" typedef void *voidptr; /* ~~e: should be moved out to generic typing header --- */ typedef int (*funptr)(); #define LOOPB(i,size) for ((i)=(size);(i)-- >0;) #define LOOP(i,size) for ((i)=(size);(i)-- >0;) #define LOOPF(i,size) for ((i)=0;(i)<(size);(i)++) #define LOOP_FINISHED(i,size) ((i)<0 || (i)>=(size)) /**@memo example usage of argument reading macros FOR_ARGS(i,argc) { ARGMATCH_VAL("-tolower",do_switch_case,switch_case_to_lower); ARGMATCH("-seq_err_report",count_sequence_errors); ARGGET("-printmatrix",print_matrix_letters); NEXTARG(matrix_filename); } */ #define FOR_ARGS(INDEX,ARGC) for (INDEX=1;INDEX(LAST)) { \ REALLOC(memptr,((NUM)+1)-((NUM)+1)%(BUF)+(BUF),TYPE);\ (LAST)=((NUM)+1)-((NUM)+1)%(BUF)+(BUF);\ } #endif poaV2/fasta_format.c0100644000765400076540000000512710024245152013022 0ustar poapoa #include "default.h" #include "seq_util.h" /** reads FASTA formatted sequence file, and saves the sequences to the array seq[]; any comment line preceded by a hash-mark will be saved to comment */ int read_fasta(FILE *seq_file,Sequence_T **seq, int do_switch_case,char **comment) { int c,nseq=0,length=0; char seq_name[FASTA_NAME_MAX]="", line[SEQ_LENGTH_MAX],seq_title[FASTA_NAME_MAX]=""; char *p; stringptr tmp_seq=STRINGPTR_EMPTY_INIT; /* read in sequences */ while (fgets(line,sizeof(line)-1,seq_file)) { if ((p=strrchr(line,'\n'))) /* REMOVE NEWLINE FROM END OF LINE */ *p= '\0'; /* TRUNCATE THE STRING */ switch (line[0]) { case '#': /* SEQUENCE COMMENT, SAVE IT */ if (comment) /* SAVE COMMENT FOR CALLER TO USE */ *comment = strdup(line+1); break; case '>': /* SEQUENCE HEADER LINE */ if (seq_name[0] && tmp_seq.p && tmp_seq.p[0]) { /* WE HAVE A SEQUENCE, SO SAVE IT! 
*/ if (create_seq(nseq,seq,seq_name,seq_title,tmp_seq.p,do_switch_case)) nseq++; } seq_name[0]='\0'; if (sscanf(line+1,"%s %[^\n]", /* SKIP PAST > TO READ SEQ NAME*/ seq_name,seq_title)<2) strcpy(seq_title,"untitled"); /* PROTECT AGAINST MISSING NAME */ if (tmp_seq.p) tmp_seq.p[0]='\0'; /* RESET TO EMPTY SEQUENCE */ length=0; break; case '*': /* IGNORE LINES STARTING WITH *... DON'T TREAT AS SEQUENCE! */ break; default: /* READ AS ACTUAL SEQUENCE DATA, ADD TO OUR SEQUENCE */ if (seq_name[0]) /* IF WE'RE CURRENTLY READING A SEQUENCE, SAVE IT */ stringptr_cat_pos(&tmp_seq,line,&length); } c=getc(seq_file); /* ?FIRST CHARACTER IS UNIGENE CLUSTER TERMINATOR? */ if (c==EOF) break; else { ungetc(c,seq_file); /* PUT THE CHARACTER BACK */ if (c=='#' && nseq>0) /* UNIGENE CLUSTER TERMINATOR, SO DONE!*/ break; } } if (seq_name[0] && tmp_seq.p && tmp_seq.p[0]) { /* WE HAVE A SEQUENCE, SO SAVE IT! */ if (create_seq(nseq,seq,seq_name,seq_title,tmp_seq.p,do_switch_case)) nseq++; } stringptr_free(&tmp_seq); return nseq; /* TOTAL NUMBER OF SEQUENCES CREATED */ } /**@memo example: reading FASTA format file: seq_ifile=fopen(seq_filename,"r"); if (seq_ifile) { nseq=read_fasta(seq_ifile,&seq,do_switch_case,&comment); fclose(seq_ifile); } */ /** writes a FASTA formatted file, saving the sequence given in seq[] */ void write_fasta(FILE *ifile,char name[],char title[],char seq[]) { int j; fprintf(ifile,">%s %s\n",name,title? 
title : "untitled"); for (j=0;jiseq].weight>0) /* EXCLUDE ZERO WEIGHT SEQS */ contains_pos[source->iseq]=source->ipos+1; /* right MUST BE ADJACENT*/ while (source=source->more); /* KEEP COUNTING TILL NO more */ right_score=right_overlap=0; /*DEFAULT MOVE: NOTHING TO THE RIGHT*/ best_right= INVALID_LETTER_POSITION; for (right= &seq[i].right;right && right->ipos>=0;right=right->more) { my_overlap=0; /* OVERLAP CALCULATION */ source= &seq[right->ipos].source;/*COUNT SEQS SHARED IN i AND right*/ do /* BIAS OVERLAP CALCULATION BY SEQUENCE WEIGHTING */ if (contains_pos[source->iseq]==source->ipos) /* YES, ADJACENT! */ my_overlap += source_seq[source->iseq].weight; while (source=source->more); /* KEEP COUNTING TILL NO more */ if (my_overlap>right_overlap /* FIND BEST RIGHT MOVE: BEST OVERLAP */ || (my_overlap==right_overlap && score[right->ipos]>right_score)) { right_overlap=my_overlap; right_score=score[right->ipos]; best_right=right->ipos; } } path[i]=best_right; /* SAVE THE BEST PATH FOUND */ score[i]=right_score+right_overlap; /* SAVE THE SCORE */ if (score[i]>best_score) { /* RECORD BEST SCORE IN WHOLE LPO */ ibest=i; best_score=score[i]; } } CALLOC(best_path,len,LPOLetterRef_T); /* MEMORY FOR STORING BEST PATH */ for (;ibest>=0;ibest=path[ibest]) /* BACK TRACK THE BEST PATH */ best_path[best_len++]=ibest; FREE(path); /* DUMP SCRATCH MEMORY */ FREE(score); FREE(contains_pos); if (p_best_len) /* RETURN best_path AND ITS LENGTH */ *p_best_len = best_len; return best_path; } int assign_sequence_bundle_id(int path_length,LPOLetterRef_T path[], LPOSequence_T *seq,int bundle_id, float minimum_fraction) { int i,*bundle_count=NULL,nseq_in_bundle=0; LPOLetterSource_T *source; CALLOC(bundle_count,seq->nsource_seq,int); LOOP (i,path_length) /* COUNT #POSITIONS OF EACH SEQ ARE IN path */ for (source= &seq->letter[path[i]].source;source;source=source->more) bundle_count[source->iseq]++; LOOP (i,seq->nsource_seq) {/* FOR EACH SEQ OVER THRESHOLD, ASSIGN bundle_id*/ /* 
printf("bundle %d:\t%s\t%d/%d %d",bundle_id,seq->source_seq[i].name, bundle_count[i],seq->source_seq[i].length,seq->source_seq[i].weight);*/ if (seq->source_seq[i].bundle_id<0 /* NOT YET BUNDLED */ && seq->source_seq[i].length*minimum_fraction <= bundle_count[i]) { /* printf(" +++++++++++++++++");*/ seq->source_seq[i].bundle_id = bundle_id; /* ASSIGN TO THIS BUNDLE */ seq->source_seq[i].weight = 0; /* REMOVE FROM FUTURE heaviest_bundle */ nseq_in_bundle++; } /* printf("\n");*/ } FREE(bundle_count); return nseq_in_bundle; /* RETURN COUNT OF SEQUENCES IN BUNDLE */ } /** assigns weights for bundling based upon /hb_weight arguments in source_seq titles */ void assign_hb_weights(int nsource_seq,LPOSourceInfo_T source_seq[]) { int i,weight; char *p; LOOP (i,nsource_seq) { if (source_seq[i].title && (p=strstr(source_seq[i].title,"/hb_weight="))) { weight=atoi(p+11); if (weight!=0){ /* 0 COULD MEAN atoi FAILED TO PARSE ARG. IGNORE IT*/ source_seq[i].weight = weight; fprintf(stderr,"assigned weight=%d to %s\n",source_seq[i].weight,source_seq[i].name); } else WARN_MSG(USERR,(ERRTXT,"hb_weight zero or unreadable: %s\nIgnored",p),"$Revision: 1.2 $"); } } } /** generates the complete set of heaviest_bundle traversals of the the LPO seq, using iterative heaviest_bundle() and requiring that at least minimum_fraction of the positions in a sequence match the heaviest bundle path, for that sequence to be assigned to that bundle --------------------------------------------------------------- ------------------------------------------------------------*/ void generate_lpo_bundles(LPOSequence_T *seq,float minimum_fraction) { int nbundled=0,ibundle=0,path_length,iseq,count; LPOLetterRef_T *path=NULL; char name[256],title[1024]; /* assign_hb_weights(seq->nsource_seq,seq->source_seq); TURN THIS ON!!*/ while (nbundled < seq->nsource_seq) {/* PULL OUT BUNDLES ONE BY ONE */ path=heaviest_bundle(seq->length,seq->letter,/*GET NEXT HEAVIEST BUNDLE*/ 
seq->nsource_seq,seq->source_seq,&path_length); if (!path || path_length<10) /* ??!? FAILED TO FIND A BUNDLE ??? */ goto premature_warning; sprintf(name,"CONSENS%d",ibundle); /* NEXT, MARK SEQUENCES THAT FIT THIS BUNDLE ADEQUATELY */ count=assign_sequence_bundle_id(path_length,path,seq,ibundle, minimum_fraction); sprintf(title,"consensus produced by heaviest_bundle, containing %d seqs", count); /* DON'T INCLUDE CONSENSUS ITSELF IN THE COUNT! */ iseq=add_path_sequence(path_length,path,seq,name,title);/*BUILD CONSENSUS*/ seq->source_seq[iseq].bundle_id=ibundle++; /* INCREMENT BUNDLE ID */ nbundled+=count; /* KEEP TRACK OF TOTAL SEQUENCES BUNDLED */ if (count<1) { premature_warning: fprintf(stderr,"*** WARNING: bundling ended prematurely after %d bundles.\nNo sequences fit inside this last bundle.\nA total of %d sequences incuding consensus were bundled.\n\n",ibundle,nbundled); break; } } } poaV2/lpo.c0100644000765400076540000006402410024245153011150 0ustar poapoa #include "default.h" #include "poa.h" #include "seq_util.h" #include "lpo.h" /** INITIALIZES LINEARIZED PARTIAL ORDER DATA STRUCTURES FOR A LINEAR SEQUENCE */ void lpo_init(LPOSequence_T *seq) { int i; CALLOC(seq->letter,seq->length,LPOLetter_T); LOOP (i,seq->length) { seq->letter[i].left.ipos = SEQ_Y_LEFT(i); /* JUST A LINEAR SEQ */ seq->letter[i].right.ipos= SEQ_Y_RIGHT(i); seq->letter[i].source.iseq=0; /* TRIVIAL SOURCE: POINT TO SELF */ seq->letter[i].source.ipos=i; seq->letter[i].align_ring = seq->letter[i].ring_id=i; /* POINT AT SELF */ seq->letter[i].letter = seq->sequence[i]; /* COPY OUR AA LETTER */ } seq->letter[seq->length -1].right.ipos= INVALID_LETTER_POSITION; /* NB: letter[0].left.ipos IS INVALID_LETTER_POSITION THANKS TO SEQ_Y_LEFT() ABOVE */ /* BECAUSE PO CAN CONTAIN MULTIPLE SEQUENCES, WE ALSO KEEP A SOURCE LIST. 
FOR PURE LINEAR SEQUENCE, THE LIST IS JUST OUR STARTING SEQUENCE */ save_lpo_source(seq,seq->name,seq->title,seq->length,1,NO_BUNDLE,0,NULL); return; } /**@memo initialize one or more regular sequences (linear orders) to LPO form. This step is REQUIRED before running partial order alignment. This routine processes each sequence with limit_residues() and index_symbols(), then builds a linear LPO using lpo_init(). */ void initialize_seqs_as_lpo(int nseq, Sequence_T seq[],ResidueScoreMatrix_T *m) { int i; LOOP (i,nseq) {/* EXCLUDE LETTERS THAT AREN'T IN MATRIX */ limit_residues(seq[i].sequence,m->symbol); /* TRANSLATE FROM ASCII LETTERS TO COMPACTED NUMBERICAL INDEX*/ index_symbols(seq[i].length,seq[i].sequence,seq[i].sequence, m->nsymbol,m->symbol); lpo_init(seq+i); /* CREATE TRIVIAL, LINEAR SEQUENCE LPO */ } } /** translates letter symbols on lpo.letter[i] to indexes used in scoring matrix*/ void lpo_index_symbols(Sequence_T *lpo,ResidueScoreMatrix_T *m) { int i; if (lpo->letter == NULL) { /* HMM. HASN'T BEEN INITIALIZED AT ALL YET */ initialize_seqs_as_lpo(1,lpo,m); return; } if (lpo->letter[0].letter < m->nsymbol) return; /* LOOKS LIKE IT'S ALREADY TRANSLATED TO INDEXES */ LOOP (i,lpo->length) /* READ FROM FILE, MAY NOT BE TRANSLATED YET */ index_symbols(1,&lpo->letter[i].letter,&lpo->letter[i].letter, m->nsymbol,m->symbol); } /** finds ipos for the designated sequence in the designated letter, or INVALID_LETTER_POSITION if iseq is not found. */ int find_letter_source(LPOLetter_T *letter,int iseq) { LPOLetterSource_T *source; for (source= &letter->source;source;source=source->more)/*SCAN SOURCES*/ if (source->iseq == iseq) /*MATCH! */ return source->ipos; /* RETURN ITS SEQUENCE POSITION */ return INVALID_LETTER_POSITION; } /**@memo finds links to iseq, starting from letter[ipos] and proceeding in the specified direction up to max_step links. Optionally, the search can be constrained to only match node iseq:match_ipos. 
If iseq is found, the number of links connecting its letter to letter[ipos] is reported. Otherwise it returns max_step+1 to indicate iseq was not found within the given distance. If p_ipos or p_letter_pos are non-NULL, it will return the ipos of the position found in iseq, or the index of the found letter[], respectively. */ int find_sequence_link(int iseq, int match_ipos, int start_pos, LPOLetter_T letter[], int max_step, int please_go_right, int *p_ipos, int *p_letter_pos) { int nstep,ipos; LPOLetterLink_T *link,*link0; if (please_go_right) /* CHOOSE THE DESIRED DIRECTION */ link0= &letter[start_pos].right; else link0= &letter[start_pos].left; for (link=link0;link && link->ipos>=0;link=link->more) /*SCAN ALL LINKS*/ if ((ipos=find_letter_source(letter+link->ipos,iseq))>=0 && (match_ipos<0 /* NO CONSTRAINT ON match_ipos */ || match_ipos==ipos)) { /* POSITION MATCHES */ if (p_ipos) /* HAND BACK THE SEQUENCE ipos TO CALLER */ *p_ipos = ipos; if (p_letter_pos) /* HAND BACK THE letter[] INDEX TO CALLER */ *p_letter_pos = link->ipos; return 1; /* FOUND SEQUENCE ONLY ONE LINK AWAY FROM HERE! */ } if (max_step>1) { /* OK TO TRY ANOTHER LAYER OF RECURSION */ for (link=link0;link && link->ipos>=0;link=link->more) { /*SCAN ALL LINKS*/ nstep=find_sequence_link(iseq,match_ipos,link->ipos,letter,max_step-1, please_go_right,p_ipos,p_letter_pos); if (nstep iletter and back. This is saved as an index seq_to_po[ipos]==>iletter and po_to_seq[iletter]==>ipos . At the same time, it also constructs the actual sequence of the individual source sequences from the data stored in the LPO. 
*/ void build_seq_to_po_index(LPOSequence_T *seq) { int i,j; LPOLetterSource_T *source; LOOP (i,seq->nsource_seq) { /* DUMP EXISTING INDEX, ALLOC NEW*/ FREE(seq->source_seq[i].seq_to_po); FREE(seq->source_seq[i].po_to_seq); FREE(seq->source_seq[i].sequence); CALLOC(seq->source_seq[i].seq_to_po,seq->source_seq[i].length,int); CALLOC(seq->source_seq[i].po_to_seq,seq->length,int); CALLOC(seq->source_seq[i].sequence,seq->source_seq[i].length+1,char); LOOP (j,seq->length) /*DEFAULT: PO LETTERS DON'T MAP TO ANY LETTER IN SEQ*/ seq->source_seq[i].po_to_seq[j]= INVALID_LETTER_POSITION; } LOOP (i,seq->length) { /* MAP EVERY LETTER ONTO SOURCE INDEXES */ for (source= &seq->letter[i].source;source;source=source->more) { seq->source_seq[source->iseq].seq_to_po[source->ipos]=i;/*INVERSE*/ seq->source_seq[source->iseq].po_to_seq[i]=source->ipos;/*MAPPING*/ seq->source_seq[source->iseq].sequence[source->ipos]=seq->letter[i].letter; } } } /** CREATES A NEW SOURCEINFO ENTRY ON seq->source_seq[], SAVING THE FIELDS PASSED BY THE CALLER */ int save_lpo_source(LPOSequence_T *seq, char name[], char title[], int length, int weight, int bundle_id, int ndata, LPONumericData_T data[]) { int i,j; REBUFF(seq->source_seq,seq->nsource_seq,SOURCE_SEQ_BUFFER_CHUNK, LPOSourceInfo_T); i=seq->nsource_seq; seq->source_seq[i].title=strdup(title? 
title:"untitled"); seq->source_seq[i].length=length; seq->source_seq[i].weight=weight; /* DEFAULT WEIGHTING */ seq->source_seq[i].bundle_id= bundle_id; STRNCPY(seq->source_seq[i].name,name,SEQUENCE_NAME_MAX); LOOPF(j,ndata) /* SAVE NUMERIC DATA FOR THIS SOURCE */ cp_numeric_data(seq->source_seq+i,data+j); return seq->nsource_seq++; /* INCREMENT SOURCE SEQUENCE LIST COUNT */ } /** SAVE source_seq[] ENTRIES FROM ONE LPO TO ANOTHER */ int *save_lpo_source_list(LPOSequence_T *seq, int nsource_seq, LPOSourceInfo_T source_seq[]) { int i,*list=NULL; CALLOC(list,nsource_seq,int); LOOPF (i,nsource_seq) list[i]=save_lpo_source(seq,source_seq[i].name,source_seq[i].title, source_seq[i].length,source_seq[i].weight, source_seq[i].bundle_id, source_seq[i].ndata,source_seq[i].data); return list; /*HAND BACK LIST OF NEW INDICES ASSIGNED TO THESE source_seq*/ } /** ADDS A LINK TO ipos TO THE LINKED LIST STORED IN list, ALLOCATING A NEW LPOLetterLink IF NEEDED */ LPOLetterLink_T *add_lpo_link(LPOLetterLink_T *list,LPOLetterRef_T ipos) { if (list->ipos <0) { /* FIRST ENTRY IS EMPTY, SO USE IT */ list->ipos = ipos; return list;/*RETURNS PTR TO LINK IN WHICH ipos STORED */ } do { /* SCAN LIST CHECKING IF ALREADY STORED */ if (list->ipos == ipos) /* ALREADY STORED, NO NEED TO STORE AGAIN */ return list;/*RETURNS PTR TO LINK IN WHICH ipos STORED */ } while (list->more? 
(list=list->more):0); CALLOC(list->more,1,LPOLetterLink_T); /* ADD ENTRY TO LINKED LIST */ list->more->ipos=ipos; /* SAVE THE LETTER REFERENCE */ return list->more;/*RETURNS PTR TO LINK IN WHICH ipos STORED */ } void add_lpo_sources(LPOLetterSource_T *new_s,LPOLetterSource_T *old_s, int iseq_new[])/*TRANSLATION TO NEW source_seq[] INDEX*/ { for (;new_s->more;new_s=new_s->more);/* GO TO END*/ for (;old_s;old_s=old_s->more) {/*SAVE SOURCES*/ if (new_s->ipos>=0) { /* ALREADY A SOURCE HERE, SO CREATE NEW ENTRY */ CALLOC(new_s->more,1,LPOLetterSource_T); new_s=new_s->more; } new_s->iseq=iseq_new[old_s->iseq]; /* SAVE SEQUENCE ID, POSITION */ new_s->ipos=old_s->ipos; } } void reindex_lpo_source_seqs (LPOSequence_T *seq, int *perm) { int i, j, len, nseq, *map, *invmap; LPOSourceInfo_T tmp; LPOLetterSource_T *src; len = seq->length; nseq = seq->nsource_seq; CALLOC (map, len, int); CALLOC (invmap, len, int); /* BUILD INITIAL MAP AND INVERSE MAP: */ for (i=0; i=nseq || map[i]<0 || invmap[map[i]]!=-1, 1.1, (ERRTXT,"Bad argument! 'perm' must be a permutation of [0,%d]\n",nseq-1), CRASH); invmap[map[i]] = i; } /* RENUMBER SEQS IN 'source_info' ENTRIES FOR ALL LETTERS: */ for (i=0; iletter[i].source); src!=NULL && src->ipos>=0; src=src->more) { src->iseq = map[src->iseq]; } } /* SHUFFLE 'source_seq' ENTRIES, IN-PLACE, TO NEW ORDER: */ /* (THIS DESTROYS map AND invmap.) 
*/ for (i=0; isource_seq[i]; seq->source_seq[i] = seq->source_seq[j]; seq->source_seq[j] = tmp; /* UPDATE map AND invmap: */ map[j] = map[i]; invmap[map[i]] = j; map[i] = invmap[i] = i; } FREE (map); FREE (invmap); } void copy_lpo_letter(LPOLetter_T *new,LPOLetter_T *old, LPOLetterRef_T old_to_new[], int iseq_new[]) { LPOLetterLink_T *link; new->letter=old->letter; /* SAVE ITS SEQUENCE LETTER */ add_lpo_sources(&new->source,&old->source,iseq_new); /* SAVE SOURCES */ for (link= &old->left;link && link->ipos>=0;link=link->more) /*SAVE left*/ add_lpo_link(&new->left,old_to_new[link->ipos]); for (link= &old->right;link && link->ipos>=0;link=link->more)/*SAVE right*/ add_lpo_link(&new->right,old_to_new[link->ipos]); return; } /** FUSES RING a AND RING b BY CROSSLINK OPERATION */ void crosslink_rings(LPOLetterRef_T a,LPOLetterRef_T b,LPOLetter_T seq[]) { LPOLetterRef_T align_ring; if (seq[a].ring_id==seq[b].ring_id) /* ALREADY ON SAME RING, DO NOTHING! */ return; else if (seq[a].ring_idb SO RESET a.ring_id */ align_ring=a; /* TRAVERSE RING a */ do seq[align_ring].ring_id=seq[b].ring_id; /* ENFORCE a.ring_id == b.ring_id*/ while ((align_ring=seq[align_ring].align_ring) != a); } align_ring=seq[a].align_ring; /* JUST LIKE A SWAP */ seq[a].align_ring=seq[b].align_ring; /* CROSSLINK THE TWO RINGS */ seq[b].align_ring=align_ring; return; } void copy_old_ring_to_new(LPOLetterRef_T start, /* START OF RING TO COPY*/ LPOLetter_T old_lpo[], LPOLetter_T new_lpo[], LPOLetterRef_T old_to_new[]) /* MAPPING */ { LPOLetterRef_T ipos,next_pos; for (ipos=start;(next_pos=old_lpo[ipos].align_ring) != start;ipos=next_pos) crosslink_rings(old_to_new[ipos],old_to_new[next_pos],new_lpo); } /* TEMPORARY: THESE CONSTANTS CONTROL SEGMENT FUSION TESTING THE FOLLOWING NUMBERS ON DNA ASSEMBLY*/ #define FISSION_BREAK 5 /* LENGTH OF MISMATCH THAT SPLITS INTO SEGMENTS */ #define MINIMUM_FUSION 10 /* MINIMUM #IDENTITIES FOR SEGMENT TO FUSE */ #define FUSION_PERCENT 0.8 /* MINIMUM OVERALL IDENTITY FOR 
SEGMENT TO FUSE*/ void mark_fusion_segments(int len_x,LPOLetter_T seq_x[], int len_y,LPOLetter_T seq_y[], LPOLetterRef_T y_to_x[], int fission_break, int minimum_fusion_length, float minimum_fusion_identity, char do_fuse[]) { int i,i_x,i_y,mismatch_length=0,identity_count=0,fission_break_point= -1; LOOP (i_y,len_y) { if ((i_x=y_to_x[i_y])>=0 && seq_x[i_x].letter==seq_y[i_y].letter) do_fuse[i_y]=1; } #ifdef SOURCE_EXCLUDED LOOPF (i_y,len_y) { if ((i_x=y_to_x[i_y])<0 /* NOT ALIGNED AT ALL */ || seq_x[i_x].letter!=seq_y[i_y].letter /* MISMATCH */ || i_y==len_y-1) { /* END OF SEQ, MUST CHECK LAST SEGMENT! */ if (++mismatch_length>=fission_break /* BREAK POINT */ || i_y==len_y-1) { /* END OF SEQ, MUST CHECK LAST SEGMENT! */ if (identity_count>=minimum_fusion_length /* END OF A FUSION SEGMENT?*/ && minimum_fusion_identity *(i_y-mismatch_length-fission_break_point)<= identity_count) { /* YES, MARK PRECEEDING SEGMENT FOR FUSION*/ for (i=i_y-fission_break;i>fission_break_point;i--)/*MARK SEGMENT!*/ if ((i_x=y_to_x[i])>=0 && seq_x[i_x].letter==seq_y[i].letter) do_fuse[i]=1; /* IDENTITY! 
MARK THIS POSITION TO BE FUSED */ } fission_break_point=i_y; /* NO FUSION SEGMENT CAN EXTEND PAST HERE */ identity_count=0; /* RESET IDENTITY COUNTER FOR STARTING NEXT SEGMENT*/ } } else { /* PERFECT IDENTITY */ mismatch_length=0; identity_count++; } } #endif } int reindex_lpo_fusion(int len_x,LPOLetter_T seq_x[], int len_y,LPOLetter_T seq_y[], LPOLetterRef_T x_to_y[], LPOLetterRef_T y_to_x[], LPOLetterRef_T new_x[], LPOLetterRef_T new_y[], int fission_break, int minimum_fusion_length, float minimum_fusion_identity) { int new_len=0; LPOLetterRef_T i_x,i_y,i_ring,end_of_ring= -1; char *do_fuse=NULL; CALLOC(do_fuse,len_y,char); /* MARK POSITIONS TO FUSE seq_y TO seq_x */ mark_fusion_segments(len_x,seq_x,len_y,seq_y,y_to_x,fission_break, minimum_fusion_length,minimum_fusion_identity,do_fuse); for (i_x=i_y=new_len=0;i_x=0) { /* IF SO, INSERT Y NOW TO KEEP X RING TOGETHER*/ while (i_y=0 /* ALIGNED, SO FIRST INSERT PRECEEDING FROM seq_y */ && i_yend_of_ring) /* FIND MAXIMUM INDEX ON THIS RING */ end_of_ring=i_ring; /* RING GUARANTEED TO BE ONE CONTIGUOUS BLOCK*/ if (do_fuse[i_y]) /* THIS POSITION MEETS OUR FUSION CRITERIA, SO FUSE*/ new_y[i_y++]=new_len; /* USE SAME INDEX AS WILL BE USED FOR i_x */ else /* NOT IDENTICAL, SO GIVE IT ITS OWN LETTER */ new_y[i_y++]=new_len++; /* ADD TO new_lpo */ } new_x[i_x]=new_len++; /* ADD TO new_lpo */ while (i_y<=end_of_ring) /* CONCATENATE LETTERS ALIGNED TO i_y */ new_y[i_y++]=new_len++; /*KEEP ALIGNED LETTERS AS ONE CONTIGUOUS BLOCK!*/ } while (i_y NEW */ { int i; LOOP (i,nremap_x) { if (remap_x[i]>=0 && remap_x[i]length; /* GET letter ARRAY FROM BOTH x AND y */ seq_x=holder_x->letter; len_y=holder_y->length; seq_y=holder_y->letter; CALLOC(new_seq,1,LPOSequence_T); /* CREATE A NEW HOLDER */ CALLOC(new_x,len_x,LPOLetterRef_T); CALLOC(new_y,len_y,LPOLetterRef_T); new_len=reindex_lpo_fusion(len_x,seq_x,len_y,seq_y,x_to_y,y_to_x,new_x,new_y, FISSION_BREAK,MINIMUM_FUSION,FUSION_PERCENT); CALLOC(new_lpo,new_len,LPOLetter_T); /* ALLOCATE 
NEW LINEARIZED PO */ LOOP (i,new_len) { /* INITIALIZE ALL LINKS TO INVALID */ new_lpo[i].left.ipos=new_lpo[i].right.ipos=new_lpo[i].source.ipos = INVALID_LETTER_POSITION; new_lpo[i].align_ring=new_lpo[i].ring_id=i; /* POINT TO SELF */ } new_seq->length=new_len; /* SAVE NEW LPO ARRAY IN NEW HOLDER */ new_seq->letter=new_lpo; iseq_new=save_lpo_source_list(new_seq,holder_x->nsource_seq,/*COPY x SOURCE*/ holder_x->source_seq); LOOP (i_x,len_x) /* COPY LETTER DATA TO CORRESPONDING LETTERS OF NEW LPO */ copy_lpo_letter(new_lpo+new_x[i_x],seq_x+i_x,new_x,iseq_new); FREE(iseq_new); iseq_new=save_lpo_source_list(new_seq,holder_y->nsource_seq,/*COPY y SOURCE*/ holder_y->source_seq); LOOP (i_y,len_y) copy_lpo_letter(new_lpo+new_y[i_y],seq_y+i_y,new_y,iseq_new); FREE(iseq_new); LOOP (i_x,len_x) /* COPY OLD ALIGNMENT RINGS TO THE NEW LPO */ copy_old_ring_to_new(i_x,seq_x,new_lpo,new_x); LOOP (i_y,len_y) copy_old_ring_to_new(i_y,seq_y,new_lpo,new_y); LOOP (i_x,len_x) /* SAVE ALIGNMENT OF seq_x AND seq_y TO new_lpo.align_ring*/ if (x_to_y[i_x]>=0) /* seq_x[i_x] ALIGNED TO seq_y, SO SAVE! */ crosslink_rings(new_x[i_x],new_y[x_to_y[i_x]],new_lpo); if (remap_x) /* CONVERT OLD INDEX TABLE TO NEW REFERENCE SYSTEM */ remap_x_to_new(nremap_x,remap_x,len_x,new_x); FREE(new_x); /* DUMP SCRATCH MEMORY AND RETURN */ FREE(new_y); return new_seq; /* HAND BACK THE NEW LPO CONTAINING THE FUSION */ } /** fuses the two partial orders holder_x and holder_y, based upon the letter_x <--> letter_y mapping specified by x_to_y and y_to_x (which must be consistent!) 
A new LPO is created to store the result; neither holder_x or holder_y are changed */ LPOSequence_T *copy_fuse_lpo(LPOSequence_T *holder_x, /*WRAPPER: NO REMAPPING*/ LPOSequence_T *holder_y, LPOLetterRef_T x_to_y[], LPOLetterRef_T y_to_x[]) { return copy_fuse_lpo_remap(holder_x,holder_y,x_to_y,y_to_x,0,NULL); } LPOSequence_T *copy_lpo(LPOSequence_T *holder_x) { int i; LPOSequence_T dummy,*new_copy; LPOLetterRef_T *x_to_y=NULL; memset(&dummy,0,sizeof(dummy)); /* BLANK ALL FIELDS */ CALLOC(x_to_y,holder_x->length,LPOLetterRef_T); LOOP (i,holder_x->length) x_to_y[i]= INVALID_LETTER_POSITION; new_copy=copy_fuse_lpo(holder_x,&dummy,x_to_y,x_to_y); FREE(x_to_y); return new_copy; } void translate_lpo(int old_len,LPOLetterRef_T old_to_new[], LPOLetter_T seq[]) { int i,block_end; LPOLetterLink_T *link; block_end=old_len; LOOPB (i,old_len) { /* TRANSLATE AND SHIFT THE WHOLE LPO */ seq[i].align_ring=old_to_new[seq[i].align_ring]; /* TRANSLATE INDICES*/ seq[i].ring_id=old_to_new[seq[i].ring_id]; for (link= &seq[i].left;link && link->ipos>=0;link=link->more) link->ipos = old_to_new[link->ipos]; for (link= &seq[i].right;link && link->ipos>=0;link=link->more) link->ipos = old_to_new[link->ipos]; if ((0==i || /* HIT SEQ START, SO COPY THE BLOCK NOW!*/ old_to_new[i] != old_to_new[i-1]+1) /*BLOCK BOUNDARY*/ && old_to_new[i] != i) { /* ACTUALLY REQUIRES A SHIFT */ memmove(seq+old_to_new[i],seq+i,(block_end-i)*sizeof(LPOLetter_T)); block_end=i; /* RESET TO POINT TO END OF NEXT BLOCK TO COPY */ } /* BLOCK NOW SHIFTED TO ITS NEW LOCATION */ } } LPOSequence_T *fuse_lpo_remap(LPOSequence_T *holder_x, LPOSequence_T *holder_y, LPOLetterRef_T x_to_y[], LPOLetterRef_T y_to_x[], int nremap_x, LPOLetterRef_T remap_x[]) { int i,new_len,*iseq_new=NULL,len_x,len_y; LPOLetterRef_T i_x,i_y,*new_x=NULL,*new_y=NULL; LPOLetter_T *new_lpo=NULL,*seq_x,*seq_y; len_x=holder_x->length; /* GET letter ARRAY FROM BOTH x AND y */ seq_x=holder_x->letter; len_y=holder_y->length; seq_y=holder_y->letter; 
CALLOC(new_x,len_x,LPOLetterRef_T); CALLOC(new_y,len_y,LPOLetterRef_T); new_len=reindex_lpo_fusion(len_x,seq_x,len_y,seq_y,x_to_y,y_to_x,new_x,new_y, FISSION_BREAK,MINIMUM_FUSION,FUSION_PERCENT); REALLOC(seq_x,new_len,LPOLetter_T); /* EXPAND seq_x TO HOLD FUSED LPO */ translate_lpo(len_x,new_x,seq_x); /* SHIFT ALL THE LETTERS TO NEW LOCATIONS*/ new_lpo=seq_x; /* THE EXPANDED VERSION OF seq_x IS OUR NEW LPO */ holder_x->length=new_len; /* SAVE NEW LPO ARRAY IN NEW HOLDER */ holder_x->letter=new_lpo; LOOP (i_y,len_y) { /* INITIALIZE ALL LETTERS FOR STORING seq_y TO BLANK */ if (y_to_x[i_y]>=0 && new_x[y_to_x[i_y]]==new_y[i_y])/*i_y FUSED TO i_x*/ continue; /* THIS LETTER IS ALREADY PART OF seq_x SO DON'T OVERWRITE!!*/ i=new_y[i_y]; /* TRANSLATE TO NEW INDEXING */ memset(new_lpo+i,0,sizeof(LPOLetter_T)); /* NULL INITIALIZE IT! */ new_lpo[i].left.ipos=new_lpo[i].right.ipos=new_lpo[i].source.ipos = INVALID_LETTER_POSITION; /* RESET TO UNLINKED STATE */ new_lpo[i].align_ring=new_lpo[i].ring_id=i; /* POINT TO SELF */ } iseq_new=save_lpo_source_list(holder_x,holder_y->nsource_seq,/*COPY y SRC*/ holder_y->source_seq); LOOP (i_y,len_y) /* COPY LETTER DATA TO CORRESPONDING LETTERS OF NEW LPO */ copy_lpo_letter(new_lpo+new_y[i_y],seq_y+i_y,new_y,iseq_new); FREE(iseq_new); LOOP (i_y,len_y) /* COPY OLD ALIGNMENT RINGS TO THE NEW LPO */ copy_old_ring_to_new(i_y,seq_y,new_lpo,new_y); LOOP (i_x,len_x) /* SAVE ALIGNMENT OF seq_x AND seq_y TO new_lpo.align_ring*/ if (x_to_y[i_x]>=0) /* seq_x[i_x] ALIGNED TO seq_y, SO SAVE! */ crosslink_rings(new_x[i_x],new_y[x_to_y[i_x]],new_lpo); if (remap_x) /* CONVERT OLD INDEX TABLE TO NEW REFERENCE SYSTEM */ remap_x_to_new(nremap_x,remap_x,len_x,new_x); FREE(new_x); /* DUMP SCRATCH MEMORY AND RETURN */ FREE(new_y); return holder_x; /* HAND BACK x LPO CONTAINING THE FUSION */ } /** fuses the two partial orders holder_x and holder_y, based upon the letter_x <--> letter_y mapping specified by x_to_y and y_to_x (which must be consistent!) 
The result is returned in holder_x */ LPOSequence_T *fuse_lpo(LPOSequence_T *holder_x, /*WRAPPER: NO REMAPPING*/ LPOSequence_T *holder_y, LPOLetterRef_T x_to_y[], LPOLetterRef_T y_to_x[]) { return fuse_lpo_remap(holder_x,holder_y,x_to_y,y_to_x,0,NULL); } /** FREES the linked list link including all nodes beneath it; NB: link itself is freed, so DO NOT pass a static LPOLetterLink */ void free_lpo_link_list(LPOLetterLink_T *link) { LPOLetterLink_T *next; for (;link;link=next) { /* DUMP ALL THE LINKS */ next=link->more; free(link); } } /** FREES the linked list source including all nodes beneath it; NB: source itself is freed, so DO NOT pass a static LPOLetterSource */ void free_lpo_source_list(LPOLetterSource_T *source) { LPOLetterSource_T *next; for (;source;source=next) { /* DUMP ALL THE SOURCE ENTRIES */ next=source->more; free(source); } } /** FREES ALL DATA ASSOCIATED WITH letter[], AND OPTIONALLY letter ITSELF*/ void free_lpo_letters(int nletter,LPOLetter_T *letter,int please_free_block) { int i; if (!letter) /* NOTHING TO FREE... */ return; LOOP (i,nletter) { /* DUMP ALL LINKED LISTS */ if (letter[i].left.more) free_lpo_link_list(letter[i].left.more); if (letter[i].right.more) free_lpo_link_list(letter[i].right.more); if (letter[i].source.more) free_lpo_source_list(letter[i].source.more); } if (please_free_block) /*DON'T ALWAYS WANT TO FREE... MIGHT BE IN AN ARRAY*/ free(letter); } void free_lpo_sourceinfo(int nsource_seq,LPOSourceInfo_T *source_seq, int please_free_block) { int i; LOOP (i,nsource_seq) { FREE(source_seq[i].title); FREE(source_seq[i].sequence); FREE(source_seq[i].seq_to_po); FREE(source_seq[i].po_to_seq); free_lpo_numeric_data(source_seq[i].ndata,source_seq[i].data,TRUE); source_seq[i].data=NULL; /* DON'T LEAVE DANGLING POINTER! */ } if (please_free_block) free(source_seq); } /** FREES ALL DATA FROM seq, AND OPTIONALLY seq ITSELF */ void free_lpo_sequence(LPOSequence_T *seq,int please_free_holder) { int i; if (!seq) /* NOTHING TO FREE... 
*/ return; free_lpo_letters(seq->length,seq->letter,TRUE); seq->letter=NULL; /* MARK AS FREED... DON'T LEAVE DANGLING POINTER! */ FREE(seq->title); FREE(seq->sequence); if (seq->source_seq) { free_lpo_sourceinfo(seq->nsource_seq,seq->source_seq,TRUE); seq->source_seq=NULL; /* MARK AS FREED... DON'T LEAVE DANGLING POINTER! */ } if (please_free_holder) /*DON'T ALWAYS WANT TO FREE... MIGHT BE IN AN ARRAY*/ free(seq); } /**@memo {\bfEXAMPLE}: dump the LPO dna_lpo, and all its associated data: \begin{verbatim} if (dna_lpo) free_lpo_sequence(dna_lpo,TRUE); \end{verbatim} */ /** creates a new sequence which follows path[] through the LPO seq, and gives it the specified name and title */ int add_path_sequence(int path_length, LPOLetterRef_T path[], LPOSequence_T *seq, char name[], char title[]) { int i,iseq_new; LPOSourceInfo_T *new_seq; LPOLetterSource_T save_source={0,0,NULL}; /* CREATE SOURCE ENTRY FOR THIS SEQUENCE */ iseq_new=save_lpo_source(seq,name,title,path_length,0,NO_BUNDLE,0,NULL); LOOP (i,path_length) { /* ADD THIS AS SOURCE TO ALL POSITIONS IN path */ save_source.ipos=i; add_lpo_sources(&seq->letter[path[i]].source,&save_source,&iseq_new); } /* NB: THIS DOESN'T CHECK THAT path IS A VALID WALK THRU THE PARTIAL ORDER MIGHT BE A GOOD IDEA TO CATCH POSSIBLE ERRORS IN path */ return iseq_new; /* RETURN INDEX OF NEWLY CREATED ENTRY */ } /**@memo EXAMPLE: create a consensus sequence from a path: iseq=add_path_sequence(path_length,path,seq,name,title); ------------------------------------------------------- ------------------------------------------------- */ poaV2/lpo.h0100644000765400076540000001372610024245153011160 0ustar poapoa #ifndef LPO_HEADER_INCLUDED #define LPO_HEADER_INCLUDED #include #include #include /*********************************************************** lpo.c */ void lpo_init(LPOSequence_T *seq); void initialize_seqs_as_lpo(int nseq, Sequence_T seq[],ResidueScoreMatrix_T *m); void lpo_index_symbols(LPOSequence_T *lpo,ResidueScoreMatrix_T *m); /** 
reindex source sequences in `seq' so that the sequence in position i ends up in position perm[i]. `perm' must be a permutation of the integers [0,seq->nsource_seq-1]. */ void reindex_lpo_source_seqs (LPOSequence_T *seq, int *perm); int save_lpo_source(LPOSequence_T *seq, char name[], char title[], int length, int weight, int bundle_id, int ndata, LPONumericData_T data[]); int *save_lpo_source_list(LPOSequence_T *seq, int nsource_seq, LPOSourceInfo_T source_seq[]); LPOLetterLink_T *add_lpo_link(LPOLetterLink_T *list,LPOLetterRef_T ipos); void add_lpo_sources(LPOLetterSource_T *new_s,LPOLetterSource_T *old_s, int iseq_new[]); void crosslink_rings(LPOLetterRef_T a,LPOLetterRef_T b,LPOLetter_T seq[]); LPOSequence_T *copy_fuse_lpo(LPOSequence_T *holder_x, LPOSequence_T *holder_y, LPOLetterRef_T x_to_y[], LPOLetterRef_T y_to_x[]); LPOSequence_T *copy_lpo(LPOSequence_T *holder_x); LPOSequence_T *fuse_lpo(LPOSequence_T *holder_x, LPOSequence_T *holder_y, LPOLetterRef_T x_to_y[], LPOLetterRef_T y_to_x[]); void free_lpo_letters(int nletter,LPOLetter_T *letter,int please_free_block); void free_lpo_sequence(LPOSequence_T *seq,int please_free_holder); int add_path_sequence(int path_length, LPOLetterRef_T path[], LPOSequence_T *seq, char name[], char title[]); /************************************************** FROM align_lpo.c */ int align_lpo(LPOSequence_T *lposeq_x, LPOSequence_T *lposeq_y, ResidueScoreMatrix_T *m, LPOLetterRef_T **x_to_y, LPOLetterRef_T **y_to_x, int use_global_alignment); /************************************************** FROM align_lpo_po.c */ LPOScore_T align_lpo_po(LPOSequence_T *lposeq_x, LPOSequence_T *lposeq_y, ResidueScoreMatrix_T *m, LPOLetterRef_T **x_to_y, LPOLetterRef_T **y_to_x, LPOScore_T (*scoring_function) (int,int,LPOLetter_T [],LPOLetter_T [], ResidueScoreMatrix_T *), int use_global_alignment); /************************************************** FROM buildup_lpo.c */ LPOSequence_T *buildup_lpo(LPOSequence_T *new_seq, int nseq,LPOSequence_T 
seq[], ResidueScoreMatrix_T *score_matrix, int use_aggressive_fusion, int use_global_alignment); LPOSequence_T *buildup_clipped_lpo(LPOSequence_T *new_seq, int nseq,LPOSequence_T seq[], ResidueScoreMatrix_T *score_matrix, int use_global_alignment); LPOSequence_T *buildup_progressive_lpo(int nseq, LPOSequence_T **seqs, ResidueScoreMatrix_T *score_matrix, int use_aggressive_fusion, int do_progressive, char score_file[], LPOScore_T (*scoring_function) (int,int,LPOLetter_T [],LPOLetter_T [], ResidueScoreMatrix_T *), int use_global_alignment, int preserve_sequence_order); LPOSequence_T *buildup_pairwise_lpo(LPOSequence_T seq1[],LPOSequence_T seq2[], ResidueScoreMatrix_T *score_matrix, int use_aggressive_fusion, LPOScore_T (*scoring_function) (int,int,LPOLetter_T [],LPOLetter_T [], ResidueScoreMatrix_T *), int use_global_alignment); /**************************************************** lpo_format.c */ void write_lpo(FILE *ifile,LPOSequence_T *seq, ResidueScoreMatrix_T *score_matrix); LPOSequence_T *read_lpo(FILE *ifile); LPOSequence_T *read_lpo_select(FILE *ifile,FILE *select_ifile, int keep_all_links,int remove_listed_sequences); void write_lpo_as_fasta(FILE *ifile,LPOSequence_T *seq, int nsymbol,char symbol[]); void write_lpo_bundle_as_fasta(FILE *ifile,LPOSequence_T *seq, int nsymbol,char symbol[],int ibundle); void export_clustal_seqal(FILE *ifile, LPOSequence_T *seq, int nsymbol,char symbol[]); /****************************************************** heaviest_bundle.c */ void generate_lpo_bundles(LPOSequence_T *seq,float minimum_fraction); /****************************************************** make_frame.c */ LPOSequence_T *build_3_frames(char dna_seq[],char name[],char title[], LPOScore_T frameshift_score, ResidueScoreMatrix_T *m); LPOSequence_T *map_protein_to_dna(char dna_name[], LPOSequence_T *lpo_dna, int nseq, Sequence_T seq[], ResidueScoreMatrix_T *score_matrix, int use_aggressive_fusion); /****************************************************** 
remove_bundle.c */ int remove_bundle(LPOSequence_T *seq,int ibundle,int delete_all_others); /******************************************************* numeric_data.c */ LPONumericData_T *new_numeric_data(LPOSourceInfo_T *source_seq, char name[], char title[], double initial_value); LPONumericData_T *find_numeric_data(LPOSourceInfo_T *source_seq, char name[]); void free_lpo_numeric_data(int ndata,LPONumericData_T *data, int please_free_block); void new_numeric_data_sets(LPOSourceInfo_T *source_seq, int nset,char *set_names[], char source_name_fmt[], char target_name_fmt[], char title_fmt[]); void read_numeric_data(int nsource_seq, LPOSourceInfo_T source_seq[], FILE *ifile); LPONumericData_T *cp_numeric_data(LPOSourceInfo_T *source_seq, LPONumericData_T *data); /******************************************************* balance_matrix.c */ int read_aa_frequencies(char filename[],ResidueScoreMatrix_T *score_matrix) ; void balance_matrix_score(int nletter,LPOLetter_T letter[], ResidueScoreMatrix_T *score_matrix); #endif poaV2/lpo_format.c0100644000765400076540000004350010024245153012514 0ustar poapoa #include "default.h" #include "poa.h" #include "seq_util.h" #include "lpo.h" /** writes the LPO in seq to the stream ifile; optionally a symbol table may be given for translating the letters in the LPO to text */ void write_lpo(FILE *ifile,LPOSequence_T *seq, ResidueScoreMatrix_T *score_matrix) { int i; LPOLetterLink_T *link; LPOLetterSource_T *source; fprintf(ifile,"VERSION=LPO.0.1\n"); fprintf(ifile,"NAME=%s\nTITLE=%s\nLENGTH=%d\nSOURCECOUNT=%d\n", seq->name,seq->title,seq->length,seq->nsource_seq); LOOPF (i,seq->nsource_seq) fprintf(ifile,"SOURCENAME=%s\nSOURCEINFO=%d %d %d %d %s\n", seq->source_seq[i].name,seq->source_seq[i].length, seq->source_seq[i].istart,seq->source_seq[i].weight, seq->source_seq[i].bundle_id,seq->source_seq[i].title); LOOPF (i,seq->length) { fprintf(ifile,"%c:", seq->letter[i].letter < score_matrix->nsymbol ? 
score_matrix->symbol[seq->letter[i].letter] : seq->letter[i].letter); for (link= &seq->letter[i].left;link && link->ipos>=0;link=link->more) fprintf(ifile,"L%d",link->ipos); for (source= &seq->letter[i].source;source;source=source->more) fprintf(ifile,"S%d",source->iseq); /* SOURCE ID */ if (seq->letter[i].align_ring!=i) /* ALIGNED TO SOMETHING ELSE */ fprintf(ifile,"A%d",seq->letter[i].align_ring); fputc('\n',ifile); } } /**@memo example: writing a PO file: if (lpo_file_out) write_lpo(lpo_file_out,lpo_out,score_matrix.symbol); */ /** reads an LPO from the stream ifile, dynamically allocates memory for it, and returns a pointer to the LPO */ LPOSequence_T *read_lpo(FILE *ifile) { int i,j,length,nsource_seq,istart,field_id,*pos_count=NULL,value; int weight,bundle_id,last_alloc=0; LPOSequence_T *seq=NULL; char c,name[1024]="",title[4096]="",version[256]=""; LPOLetterSource_T save_source={0,0,NULL}; CALLOC(seq,1,LPOSequence_T); fscanf(ifile,"VERSION=%s",version); fscanf(ifile," NAME=%[^\n]",name); fscanf(ifile," TITLE=%[^\n]",title); if (fscanf(ifile," LENGTH=%d SOURCECOUNT=%d", &length,&nsource_seq)!=2) return NULL; STRNCPY(seq->name,name,SEQUENCE_NAME_MAX); seq->title=strdup(title); seq->length=length; seq->nsource_seq=nsource_seq; CALLOC(seq->letter,length,LPOLetter_T); GETMEM(seq->source_seq,nsource_seq,last_alloc,SOURCE_SEQ_BUFFER_CHUNK,LPOSourceInfo_T); CALLOC(pos_count,nsource_seq,int); LOOP (i,length) { /* INITIALIZE ALL LINKS TO INVALID */ seq->letter[i].align_ring=i; /* POINT TO SELF */ seq->letter[i].ring_id= INVALID_LETTER_POSITION; /* BLANK! */ seq->letter[i].left.ipos=seq->letter[i].right.ipos= seq->letter[i].source.ipos= INVALID_LETTER_POSITION; } LOOPF(i,nsource_seq) { /* SAVE SOURCE INFO LIST */ if (fscanf(ifile," SOURCENAME=%[^\n] SOURCEINFO=%d %d %d %d", name,&length,&istart,&weight,&bundle_id)!=5) return NULL; /* SKIP WHITESPACE BEFORE TITLE; ALLOW EMPTY TITLE. 
*/ fscanf(ifile,"%*[ \t]"); if (fscanf(ifile,"%[^\n]",title)!=1) title[0] = '\0'; STRNCPY(seq->source_seq[i].name,name,SEQUENCE_NAME_MAX); seq->source_seq[i].length=length; seq->source_seq[i].istart=istart; seq->source_seq[i].weight=weight; seq->source_seq[i].bundle_id=bundle_id; seq->source_seq[i].title=strdup(title); } LOOPF (i,seq->length) { /* NOW READ THE ACTUAL PARTIAL ORDER */ if (fscanf(ifile," %c:",&c)!=1) /* READ SEQUENCE LETTER */ return NULL; seq->letter[i].letter=c; while ((field_id=getc(ifile))!=EOF && '\n'!=field_id) {/* READ FIELDS*/ if (1!=fscanf(ifile,"%d",&value)) return NULL; switch (field_id) { case 'L': add_lpo_link(&seq->letter[i].left,value); /* ADD LEFT-RIGHT LINKS*/ add_lpo_link(&seq->letter[value].right,i); break; case 'S': /* SAVE THE SOURCE ID */ save_source.ipos=pos_count[value]++; add_lpo_sources(&seq->letter[i].source,&save_source,&value); break; case 'A': /* SAVE THE ALIGN RING POINTER */ seq->letter[i].align_ring=value; break; } } } LOOPF (i,seq->length) { /* SET ring_id TO MINIMUM VALUE ON EACH RING */ if (seq->letter[i].ring_id<0) {/* NEW RING, UPDATE IT! 
*/ j=i; /* GO AROUND THE ENTIRE RING, SETTING ring_id TO i */ do seq->letter[j].ring_id=i; /* i IS MINIMUM VALUE ON THIS RING */ while ((j=seq->letter[j].align_ring)!=i); } } FREE(pos_count); return seq; } #define INVALID_LPO_LINK (-99) enum { default_retention_mode, default_no_retention_mode }; /** reads an LPO from the stream ifile, dynamically allocates memory for it, and returns a pointer to the LPO */ LPOSequence_T *read_lpo_select(FILE *ifile,FILE *select_file, int keep_all_links,int remove_listed_sequences) { int i,j,k,length,nsource_seq,istart,field_id,*pos_count=NULL,value; int weight,bundle_id,last_alloc=0,*iseq_compact=NULL,*last_pos=NULL; int nlink,*link_list=NULL,*match_pos=NULL,*ring_old=NULL; int *pos_compact=NULL,npos_compact=0,keep_this_letter,retention_mode; LPOSequence_T *seq=NULL; char c,name[1024]="",title[4096]="",version[256]=""; LPOLetterSource_T save_source={0,0,NULL},*source=NULL; if (remove_listed_sequences) retention_mode=default_retention_mode;/*KEEP SEQS AS DFLT, SKIP IF LISTED*/ else /* SKIP SEQS UNLESS LISTED IN select_file */ retention_mode=default_no_retention_mode; CALLOC(seq,1,LPOSequence_T); fscanf(ifile,"VERSION=%s",version); fscanf(ifile," NAME=%[^\n]",name); fscanf(ifile," TITLE=%[^\n]",title); if (fscanf(ifile," LENGTH=%d SOURCECOUNT=%d", &length,&nsource_seq)!=2) return NULL; STRNCPY(seq->name,name,SEQUENCE_NAME_MAX); seq->title=strdup(title); seq->length=length; seq->nsource_seq=nsource_seq; CALLOC(seq->letter,length,LPOLetter_T); GETMEM(seq->source_seq,nsource_seq,last_alloc,SOURCE_SEQ_BUFFER_CHUNK,LPOSourceInfo_T); CALLOC(pos_count,nsource_seq,int); LOOP (i,length) { /* INITIALIZE ALL LINKS TO INVALID */ seq->letter[i].align_ring=i; /* POINT TO SELF */ seq->letter[i].ring_id= INVALID_LETTER_POSITION; /* BLANK! 
*/ seq->letter[i].left.ipos=seq->letter[i].right.ipos= seq->letter[i].source.ipos= INVALID_LETTER_POSITION; } LOOPF(i,nsource_seq) { /* SAVE SOURCE INFO LIST */ if (fscanf(ifile," SOURCENAME=%[^\n] SOURCEINFO=%d %d %d %d", name,&length,&istart,&weight,&bundle_id)!=5) return NULL; /* SKIP WHITESPACE BEFORE TITLE; ALLOW EMPTY TITLE. */ fscanf(ifile,"%*[ \t]"); if (fscanf(ifile,"%[^\n]",title)!=1) title[0] = '\0'; STRNCPY(seq->source_seq[i].name,name,SEQUENCE_NAME_MAX); seq->source_seq[i].length=length; seq->source_seq[i].istart=istart; seq->source_seq[i].weight=weight; seq->source_seq[i].bundle_id=bundle_id; seq->source_seq[i].title=strdup(title); } CALLOC(iseq_compact,nsource_seq,int); CALLOC(last_pos,nsource_seq,int); CALLOC(match_pos,nsource_seq,int); if (select_file) { LOOP (i,nsource_seq) /* DEFAULT: MARKED AS INVALID */ iseq_compact[i]= -retention_mode; while (fscanf(select_file,"SOURCENAME=%[^\n]\n",name)==1) { LOOP (i,nsource_seq) if (strcmp(seq->source_seq[i].name,name)==0) { iseq_compact[i]= retention_mode-1; break; } } } j=0; LOOPF (i,nsource_seq) { last_pos[i]= -1; /* DEFAULT: INVALID */ if (iseq_compact[i]>=0) { iseq_compact[i]=j; if (i>j) memcpy(seq->source_seq+j,seq->source_seq+i,sizeof(LPOSourceInfo_T)); match_pos[j]= INVALID_LPO_LINK; /* DEFAULT: NO VALID LINK! 
*/ j++; } else { /* EXCLUDE THIS SEQUENCE FROM THE FILTERED LPO */ iseq_compact[i]= INVALID_LETTER_POSITION; if (seq->source_seq[i].title) free(seq->source_seq[i].title); } } seq->nsource_seq=nsource_seq=j; /* COMPACTED COUNT OF SEQUENCES TO KEEP*/ CALLOC(link_list,seq->length,int); /*TEMPORARY DATA FOR COMPACTION MAPPING */ CALLOC(pos_compact,seq->length,int); CALLOC(ring_old,seq->length,int); npos_compact=0; LOOPF (i,seq->length) { /* NOW READ THE ACTUAL PARTIAL ORDER */ if (fscanf(ifile," %c:",&c)!=1) /* READ SEQUENCE LETTER */ return NULL; seq->letter[npos_compact].letter=c; nlink=0; keep_this_letter=0; /*DEFAULT */ while ((field_id=getc(ifile))!=EOF && '\n'!=field_id) {/* READ FIELDS*/ if (1!=fscanf(ifile,"%d",&value)) return NULL; switch (field_id) { case 'L': if (pos_compact[value]>=0) /*COULD BE VALID LINK: WAIT TO CHECK SRCs*/ link_list[nlink++]=pos_compact[value]; /* SAVE IT TEMPORARILY */ break; case 'S': /* SAVE THE SOURCE ID */ if (iseq_compact[value]>=0) { /*KEEP THIS SOURCE, SO KEEP THIS LETTER*/ keep_this_letter=1; value=iseq_compact[value]; /* TRANSLATE TO ITS COMPACTED INDEX*/ if (last_pos[value]>=0) { /* MAKE SURE WE HAVE LINK TO LAST POSITION */ LOOP (j,nlink) /* CHECK TO SEE IF LINK ALREADY SAVED */ if (link_list[j]==last_pos[value]) break; if (LOOP_FINISHED(j,nlink)) { /* NO LINK??? ADD IT!!! 
*/ link_list[nlink++]=last_pos[value]; } } last_pos[value]=npos_compact; /* THIS SEQ POS IS AT THIS NODE */ save_source.ipos=pos_count[value]++;/* COUNT LENGTH OF THIS SEQ */ add_lpo_sources(&seq->letter[npos_compact].source, &save_source,&value); seq->letter[npos_compact].ring_id= INVALID_LETTER_POSITION; seq->letter[npos_compact].align_ring=i; ring_old[i]=i; /* DEFAULT: SELF-RING OF ONE LETTER*/ } break; case 'A': /* SAVE THE ALIGN RING POINTER */ ring_old[i]=value; /* SAVE OLD ALIGN RING INDICES */ if (keep_this_letter) /* TEMP'Y: SAVE REVERSE MAPPING TO A-R INDICES*/ seq->letter[npos_compact].align_ring=i; break; } } if (keep_this_letter) { for (source= &seq->letter[npos_compact].source;source;source=source->more) match_pos[source->iseq]=source->ipos - 1; /*VALID LINK MUST MATCH m_p*/ LOOPF (j,nlink) { /*ADD LEFT-RIGHT LINKS*/ for (source= &seq->letter[link_list[j]].source;source;source=source->more) if (keep_all_links /* KEEP LINKS EVEN IF NOT FROM SELECTED SEQS*/ || source->ipos == match_pos[source->iseq]) { /*VALID LINK! */ add_lpo_link(&seq->letter[npos_compact].left,link_list[j]); add_lpo_link(&seq->letter[link_list[j]].right,npos_compact); break; /* SAVED THIS LINK! */ } } for (source= &seq->letter[npos_compact].source;source;source=source->more) match_pos[source->iseq]= INVALID_LPO_LINK;/*DFLT:NO VALID LINK*/ pos_compact[i]=npos_compact++; } else pos_compact[i]= INVALID_LETTER_POSITION; } seq->length=npos_compact; LOOPF (i,seq->length) { /* SET ring_id TO MINIMUM VALUE ON EACH RING */ if (seq->letter[i].ring_id<0) {/* NEW RING, UPDATE IT! 
*/ j=seq->letter[i].align_ring; /* GO AROUND THE ENTIRE RING, SETTING ring_id TO i */ do { /*printf("i=%d\tj=%d\tpos_compact[j]=%d\tring_old[j]=%d\n",i,j,pos_compact[j],ring_old[j]);*/ if (pos_compact[j]>=0) { k=pos_compact[j]; seq->letter[k].ring_id=i; /* i IS MINIMUM VALUE ON THIS RING */ } j=ring_old[j]; /* ADVANCE TO NEXT LETTER ON THE RING */ if (pos_compact[j]>=0) /* IF VALID, POINT IT BACK TO PREVIOUS LETTER*/ seq->letter[pos_compact[j]].align_ring=k; } while (pos_compact[j]!=i); /* STOP WHEN WE'VE COMPLETED THE RING, BACK TO START*/ } } FREE(pos_count); FREE(pos_compact); FREE(iseq_compact); FREE(last_pos); FREE(link_list); FREE(ring_old); FREE(match_pos); return seq; } /* LPOSequence_T *read_lpo(FILE *ifile) { return read_lpo_select(ifile,NULL); } */ #define FASTA_GAP_CHARACTER '.' int xlate_lpo_to_al(LPOSequence_T *seq, int nsymbol,char symbol[],int ibundle, char gap_character, char ***p_seq_pos,char **p_p,char **p_include) { int i,j,iring=0,nring=0,current_ring=0,iprint; char **seq_pos=NULL,*p=NULL,*include_in_save=NULL; LPOLetterSource_T *source; LOOPF (i,seq->length) /* COUNT TOTAL #ALIGNMENT RINGS IN THE LPO */ if (seq->letter[i].ring_id != current_ring) { /* NEXT RING */ current_ring=seq->letter[i].ring_id; nring++; } nring++; /* DON'T FORGET TO COUNT THE LAST RING!!! 
*/ CALLOC(seq_pos,seq->nsource_seq,char *); /* ALLOCATE MAP ARRAY*/ CALLOC(p,seq->nsource_seq*nring,char); LOOP (i,seq->nsource_seq) /* BUILD POINTER ARRAY INTO MAP ARRAY */ seq_pos[i]=p+i*nring; memset(p,gap_character,seq->nsource_seq*nring); /* DEFAULT IS NO SEQUENCE PRESENT AT THIS POSITION */ current_ring=0; /* RESET TO BEGINNING */ LOOPF (i,seq->length) { /* NOW MAP THE LPO TO A FLAT LINEAR ORDER */ if (seq->letter[i].ring_id != current_ring) { /* NEXT RING */ current_ring=seq->letter[i].ring_id; iring++; } /* MAP EACH SOURCE SEQ ONTO LINEAR ORDER INDEXED BY iring */ for (source= &seq->letter[i].source;source;source=source->more) if (symbol && seq->letter[i].letteriseq][iring]= symbol[seq->letter[i].letter]; else /* NO NEED TO TRANSLATE */ seq_pos[source->iseq][iring]= seq->letter[i].letter; } if (ibundle>=0) { /* ONLY SAVE SEQS THAT ARE IN THIS BUNDLE */ CALLOC(include_in_save,nring,char); /* BLANK FLAGS: WHAT RINGS TO SHOW*/ LOOP (iring,nring) { /* CHECK EACH RING TO SEE IF IT'S IN BUNDLE */ LOOP (i,seq->nsource_seq) { if (seq_pos[i][iring]!=gap_character /* ALIGNED HERE! 
*/ && seq->source_seq[i].bundle_id == ibundle) { /* PART OF BUNDLE!*/ include_in_save[iring]=1; /* SO INCLUDE THIS RING */ break; } } } } if (p_seq_pos) *p_seq_pos = seq_pos; return nring; } /** writes the LPO in FASTA format, including all sequences in the specified bundle */ void write_lpo_bundle_as_fasta(FILE *ifile,LPOSequence_T *seq, int nsymbol,char symbol[],int ibundle) { int i,j,nring=0,iprint; char **seq_pos=NULL,*p=NULL,*include_in_save=NULL; nring=xlate_lpo_to_al(seq,nsymbol,symbol,ibundle, /* TRANSLATE TO */ FASTA_GAP_CHARACTER, /* RC-MSA FMT */ &seq_pos,&p,&include_in_save); LOOPF (i,seq->nsource_seq) { /* NOW WRITE OUT FASTA FORMAT */ if (ibundle<0 /* PRINT ALL BUNDLES */ || seq->source_seq[i].bundle_id == ibundle) { /* OR JUST THIS BUNDLE*/ fprintf(ifile,">%s %s",seq->source_seq[i].name,seq->source_seq[i].title); iprint=0; LOOPF (j,nring) { /* WRITE OUT 60 CHARACTER SEQUENCE LINES */ if (NULL==include_in_save || include_in_save[j]) { fprintf(ifile,"%s%c",iprint%60? "":"\n", seq_pos[i][j]); iprint++; /* KEEP COUNT OF PRINTED CHARACTERS */ } } fputc('\n',ifile); } } FREE(p); /* DUMP TEMPORARY MEMORY */ FREE(include_in_save); FREE(seq_pos); } /**@memo example: writing FASTA format file: if (seq_ifile=fopen(fasta_out,"w")) { write_lpo_bundle_as_fasta(seq_ifile,lpo_out, score_matrix.nsymbol,score_matrix.symbol,ibundle); fclose(seq_ifile); } */ /** writes the LPO in FASTA format, including all sequences in all bundles ------------------------------------------------------- --------------------------------------------------------------------------- */ void write_lpo_as_fasta(FILE *ifile,LPOSequence_T *seq, int nsymbol,char symbol[]) { /* WRAPPER FUNCTION FOR SAVING ALL BUNDLES!! */ write_lpo_bundle_as_fasta(ifile,seq,nsymbol,symbol,ALL_BUNDLES); } /**************************************************************** * * WRITE_SEQUENCES * * This function writes out sequences. 
Each line of sequnces * will be written out in blocks with spacing in between * each block. * ***************************************************************/ int write_sequences(FILE *ifile, LPOSequence_T *seq, int indent, int nblock, /* # OF BLOCKS */ int block_size, /* SIZE OF BLOCKS */ int block_spacing, /* SPACING BETWEEN BLOCKS */ int paragraph_spacing, /* SPACING BETWEEN PRAGRAPHS */ int names, /* BOOLEAN: PRINT NAMES AFTER 1ST PARAGRAPH? */ char gap_char, int nsymbol,char symbol[]) { int i,ip,ial,iseq,iblock,nparagraph,remainder,ipos,ipos_local,broken; int len,nring; char **seq_pos=NULL,*p=NULL,*include_in_save=NULL; nring=xlate_lpo_to_al(seq,nsymbol,symbol,ALL_BUNDLES, /* TRANSLATE TO */ gap_char, /* RC-MSA FMT */ &seq_pos,&p,&include_in_save); nparagraph = nring/(nblock*block_size);/*# OF FULL PARAGRAPHS */ remainder = nring%(nblock*block_size); if (remainder != 0) nparagraph++; LOOPF(ip,nparagraph){ /* FOR EACH PARAGRAPH */ LOOPF(iseq,seq->nsource_seq){ /* FOR EACH SEQUENCE */ if (ip == 0 || names){ if ((len=strlen(seq->source_seq[iseq].name))>indent-1){ /*MUST WE TRUNCATE NAME? 
*/ LOOPF(i,indent-1) /* PRINT AS MUCH OF NAME AS POSSIBLE */ putc(seq->source_seq[iseq].name[i],ifile); putc(' ',ifile); /* LEAVE A SPACE */ } else{ /* PRINT WHOLE NAME */ fprintf(ifile,"%s",seq->source_seq[iseq].name); /* PRINT NAME */ LOOP(i,indent-len) /* INDENT LINE */ putc(' ',ifile); } } else{ LOOP(i,indent) /* INDENT LINE */ putc(' ',ifile); } LOOPF(iblock,nblock){ /* FOR EACH BLOCK */ broken=0; LOOPF(i,block_size){ ipos = i + (iblock*block_size) + (ip*nblock*block_size); if (ipos>=nring){ broken=1; break; } putc(seq_pos[iseq][ipos],ifile); /* APPROP SYMBOL FOR THIS POS*/ } if (broken) break; LOOP(i,block_spacing) /* ADD SPACING BETWEEN BLOCKS */ putc(' ',ifile); } /* END OF BLOCK LOOP */ putc('\n',ifile); /* NEW LINE AT END OF SEQUENCE LINE */ } /* END OF SEQ LOOP */ LOOP(i,paragraph_spacing) /* ADD SPACING BETWEEN PARAGRAPHS */ putc('\n',ifile); } /* END OF PARAGRAPH LOOP */ done: FREE(p); /* DUMP TEMPORARY MEMORY */ FREE(include_in_save); FREE(seq_pos); return 0; } void export_clustal_seqal(FILE *ifile, LPOSequence_T *seq, int nsymbol,char symbol[]) { fprintf(ifile,"CLUSTAL W (1.74) multiple sequence alignment\n\n\n"); /* WRITE OUT SEQUNENCES: INDENT 36, 1 BLOCK OF 50 CHARS 0 CHARS BLOCK SPACING, 2 LINES PARAGRAPH SPACING, PRINT NAMES ON ALL LINES. 
*/ write_sequences(ifile,seq,36,1,50,0,2,1,'-',nsymbol,symbol); } poaV2/main.c0100644000765400076540000003726310024266006011307 0ustar poapoa #include "lpo.h" #include "msa_format.h" #include "align_score.h" static LPOSequence_T *read_partial_order_file (char *po_filename, char *subset_filename, int remove_listed_seqs, int keep_all_links, int do_switch_case, ResidueScoreMatrix_T *mat); int main(int argc,char *argv[]) { int i,j,ibundle=ALL_BUNDLES,nframe_seq=0,use_reverse_complement=0; int nseq=0,do_switch_case=dont_switch_case,do_analyze_bundles=0; int nseq_in_list=0,n_input_seqs=0,max_input_seqs=0; char score_file[256],seq_file[256],po_list_entry_filename[256],*comment=NULL,*al_name="test align"; ResidueScoreMatrix_T score_matrix; /* DEFAULT GAP PENALTIES*/ LPOSequence_T *seq=NULL,*lpo_out=NULL,*frame_seq=NULL,*dna_lpo=NULL,*lpo_in=NULL; LPOSequence_T **input_seqs=NULL; FILE *errfile=stderr,*logfile=NULL,*lpo_file_out=NULL,*po_list_file=NULL,*seq_ifile=NULL; char *print_matrix_letters=NULL,*fasta_out=NULL,*po_out=NULL,*matrix_filename=NULL, *seq_filename=NULL,*frame_dna_filename=NULL,*po_filename=NULL,*po2_filename=NULL, *po_list_filename=NULL, *hbmin=NULL,*numeric_data=NULL,*numeric_data_name="Nmiscall", *dna_to_aa=NULL,*pair_score_file=NULL,*aafreq_file=NULL,*termval_file=NULL, *bold_seq_name=NULL,*subset_file=NULL,*subset2_file=NULL,*rm_subset_file=NULL, *rm_subset2_file=NULL; float bundling_threshold=0.9; int exit_code=0,count_sequence_errors=0,please_print_snps=0, report_consensus_seqs=0,report_major_allele=0,use_aggressive_fusion=0; int show_allele_evidence=0,please_collapse_lines=0,keep_all_links=0; int remove_listed_seqs=0,remove_listed_seqs2=0,please_report_similarity; int do_global=0, do_progressive=0, do_preserve_sequence_order=0; char *reference_seq_name="CONSENS%d",*clustal_out=NULL; black_flag_init(argv[0],PROGRAM_VERSION); if (argc<2) { fprintf(stderr,"\nUsage: %s [OPTIONS] MATRIXFILE\n" "Align a set of sequences or alignments using the scores in 
MATRIXFILE.\n" "Example: %s -read_fasta multidom.seq -clustal m.aln blosum80.mat\n\n" "INPUT:\n" " -read_fasta FILE Read in FASTA sequence file.\n" " -read_msa FILE Read in MSA alignment file.\n" " -read_msa2 FILE Read in second MSA file. \n" " -subset FILE Filter MSA to include list of seqs in file.\n" " -subset2 FILE Filter second MSA to include list of seqs in file.\n" " -remove FILE Filter MSA to exclude list of seqs in file.\n" " -remove2 FILE Filter second MSA to exclude list of seqs in file.\n" " -read_msa_list FILE Read an MSA from each filename listed in file.\n" " -tolower Force FASTA/MSA sequences to lowercase\n" " (nucleotides in our matrix files)\n" " -toupper Force FASTA/MSA sequences to UPPERCASE\n" " (amino acids in our matrix files)\n" "\nALIGNMENT:\n" " -do_global Do global alignment.\n" " -do_progressive Perform progressive alignment using a guide tree\n" " built by neighbor joining from a set of\n" " sequence-sequence similarity scores.\n" " -read_pairscores FILE Read tab-delimited file of similarity scores.\n" " (If not provided, scores are constructed\n" " using pairwise sequence alignment.)\n" " -fuse_all Fuse identical letters on align rings.\n" "\nANALYSIS:\n" " -hb Perform heaviest bundling to generate consensi.\n" " -hbmin VALUE Include in heaviest bundle sequences with\n" " percent ID (as a fraction) >= value.\n" "\nOUTPUT:\n" " -pir FILE Write out MSA in PIR format.\n" " -clustal FILE Write out MSA in CLUSTAL format.\n" " -po FILE Write out MSA in PO format.\n" " -preserve_seqorder Write out MSA with sequences in their input order.\n" " -printmatrix LETTERS Print score matrix to stdout.\n" " -best Restrict MSA output to heaviest bundles (PIR only).\n" " -v Run in verbose mode (e.g. 
output gap penalties).\n\n" " NOTE: One of the -read_fasta, -read_msa, or -read_msa_list arguments\n" " must be used, since a sequence or alignment file is required.\n\n" "For more information, see http://www.bioinformatics.ucla.edu/poa.\n\n" ,argv[0],argv[0]); exit(-1); } FOR_ARGS(i,argc) { /* READ ALL THE ARGUMENTS */ ARGMATCH_VAL("-tolower",do_switch_case,switch_case_to_lower); ARGMATCH_VAL("-toupper",do_switch_case,switch_case_to_upper); ARGMATCH_VAL("-v",logfile,stdout); ARGMATCH_VAL("-best",ibundle,0); /*RESTRICT FASTA OUTPUT TO HB */ ARGMATCH_VAL("-hb",do_analyze_bundles,1);/*CALCULATE HEAVIEST BUNDLING*/ ARGGET("-printmatrix",print_matrix_letters); ARGGET("-read_msa",po_filename); /* READ A MSA FILE FOR ALIGNMENT/ANALYSIS*/ ARGGET("-read_msa2",po2_filename); /* READ A SECOND MSA FILE FOR ALIGNMENT/ANALYSIS*/ ARGGET("-read_msa_list",po_list_filename); /* READ A LIST OF MSAs FOR ALIGNMENT/ANALYSIS */ ARGGET("-pir",fasta_out); /* SAVE FASTA-PIR FORMAT ALIGNMENT FILE */ ARGGET("-clustal",clustal_out); /* SAVE CLUSTAL FORMAT ALIGNMENT FILE */ ARGGET("-po",po_out); /* SAVE PO FORMAT ALIGNMENT FILE */ ARGMATCH("-preserve_seqorder",do_preserve_sequence_order); /* DO PRESERVE SEQUENCE ORDER */ ARGGET("-hbmin",hbmin); /* SET THRESHOLD FOR BUNDLING */ ARGMATCH("-fuse_all",use_aggressive_fusion); ARGMATCH("-do_global",do_global); /* DO GLOBAL */ ARGGET("-read_pairscores",pair_score_file); /* FILENAME TO READ PAIR SCORES*/ ARGMATCH("-do_progressive", do_progressive); /* DO PROGRESSIVE ALIGNMENT */ ARGGET("-subset",subset_file); /* FILENAME TO READ SEQ SUBSET LIST*/ ARGGET("-subset2",subset2_file); /* FILENAME TO READ SEQ SUBSET LIST*/ ARGGET("-remove",rm_subset_file); /* FILENAME TO READ SEQ REMOVAL LIST*/ ARGGET("-remove2",rm_subset2_file); /* FILENAME TO READ SEQ REMOVAL LIST*/ ARGGET("-read_fasta",seq_filename); /* READ FASTA FILE FOR ALIGNMENT */ NEXTARG(matrix_filename); /* NON-FLAG ARG SHOULD BE MATRIX FILE */ } /** CHECK FOR CONFLICTING FLAGS **/ if 
(po_list_filename && (po_filename || po2_filename)) { WARN_MSG(USERR,(ERRTXT, "Error: The -read_po_list and -read_po flags cannot be used at the same time.\nExiting."), "$Revision: 1.2.2.9 $"); exit_code = 1; goto free_memory_and_exit; } if (((subset_file || rm_subset_file) && !po_filename) || ((subset2_file || rm_subset2_file) && !po2_filename)) { WARN_MSG(USERR,(ERRTXT, "Error: Each -subset/-remove flag must have a corresponding -read_po flag.\nExiting."),"$Revision: 1.2.2.9 $"); exit_code = 1; goto free_memory_and_exit; } if ((subset_file && rm_subset_file) || (subset2_file && rm_subset2_file)) { WARN_MSG(USERR,(ERRTXT, "Error: The -subset and -remove flags cannot be used at the same time.\nExiting."),"$Revision: 1.2.2.9 $"); exit_code = 1; goto free_memory_and_exit; } if (rm_subset_file) { subset_file = rm_subset_file; remove_listed_seqs = 1; } if (rm_subset2_file) { subset2_file = rm_subset2_file; remove_listed_seqs2 = 1; } if (hbmin) bundling_threshold=atof(hbmin); if (!matrix_filename || read_score_matrix(matrix_filename,&score_matrix)<=0){/* READ MATRIX */ WARN_MSG(USERR,(ERRTXT,"Error reading matrix file %s.\nExiting", matrix_filename ? matrix_filename: "because none specified"),"$Revision: 1.2.2.9 $"); exit_code=1; /* SIGNAL ERROR CONDITION */ goto free_memory_and_exit; } if (logfile) { fprintf(logfile,"X-Gap Penalties (Open, Aff1, Aff2; LTrunc, LDecay): %d %d %d %d %d\n", score_matrix.gap_penalty_set[0][0], score_matrix.gap_penalty_set[0][1], score_matrix.gap_penalty_set[0][2], score_matrix.trunc_gap_length, score_matrix.decay_gap_length); fprintf(logfile,"X-Gap Penalties (0, 1, 2, ...): "); for (i=0; i<=score_matrix.max_gap_length; i++) { fprintf (logfile, "%d ", score_matrix.gap_penalty_x[i]); } fprintf(logfile,"... 
\n"); fprintf(logfile,"Y-Gap Penalties (Open, Aff1, Aff2; LTrunc, LDecay): %d %d %d %d %d\n", score_matrix.gap_penalty_set[1][0], score_matrix.gap_penalty_set[1][1], score_matrix.gap_penalty_set[1][2], score_matrix.trunc_gap_length, score_matrix.decay_gap_length); fprintf(logfile,"Y-Gap Penalties (0, 1, 2, ...): "); for (i=0; i<=score_matrix.max_gap_length; i++) { fprintf (logfile, "%d ", score_matrix.gap_penalty_y[i]); } fprintf(logfile,"... \n"); } if (print_matrix_letters) /* USER WANTS US TO PRINT A MATRIX */ print_score_matrix(stdout,&score_matrix,print_matrix_letters /*"ARNDCQEGHILKMFPSTWYV"*/); /** READ INPUT FILES **/ n_input_seqs = 0; max_input_seqs = 10; CALLOC (input_seqs, max_input_seqs, LPOSequence_T *); if (po_filename) { lpo_in = read_partial_order_file (po_filename, subset_file, remove_listed_seqs, keep_all_links, do_switch_case, &score_matrix); if (lpo_in == NULL) { exit_code = 1; goto free_memory_and_exit; } fprintf(errfile,"...Read %d sequences from MSA file %s...\n",lpo_in->nsource_seq,po_filename); input_seqs[n_input_seqs++] = lpo_in; lpo_in = NULL; } if (po2_filename) { lpo_in = read_partial_order_file (po2_filename, subset2_file, remove_listed_seqs2, keep_all_links, do_switch_case, &score_matrix); if (lpo_in == NULL) { exit_code = 1; goto free_memory_and_exit; } fprintf(errfile,"...Read %d sequences from second MSA file %s...\n",lpo_in->nsource_seq,po2_filename); input_seqs[n_input_seqs++] = lpo_in; lpo_in = NULL; } if (po_list_filename) { po_list_file = fopen (po_list_filename, "r"); while (po_list_file && fscanf (po_list_file, " %s", po_list_entry_filename) == 1) { lpo_in = read_partial_order_file (po_list_entry_filename, NULL, 0, 0, do_switch_case, &score_matrix); if (lpo_in == NULL) { exit_code = 1; goto free_memory_and_exit; } fprintf(errfile,"...Read %d sequences from PO list entry %s...\n",lpo_in->nsource_seq,po_list_entry_filename); nseq_in_list += lpo_in->nsource_seq; input_seqs[n_input_seqs++] = lpo_in; lpo_in = NULL; if 
(n_input_seqs == max_input_seqs) { max_input_seqs *= 2; REALLOC (input_seqs, max_input_seqs, LPOSequence_T *); } } if (nseq_in_list==0) { WARN_MSG(USERR,(ERRTXT,"Error reading PO list file %s.\nExiting", po_list_file),"$Revision: 1.2.2.9 $"); exit_code=1; /* SIGNAL ERROR CONDITION */ goto free_memory_and_exit; } } if (seq_filename) { seq_ifile = fopen (seq_filename, "r"); if (seq_ifile == NULL) { WARN_MSG(USERR,(ERRTXT,"Couldn't open sequence file %s.\nExiting", seq_filename),"$Revision: 1.2.2.9 $"); exit_code=1; /* SIGNAL ERROR CONDITION */ goto free_memory_and_exit; } nseq = read_fasta (seq_ifile, &seq, do_switch_case, &comment); fclose (seq_ifile); if (nseq == 0) { WARN_MSG(USERR,(ERRTXT,"Error reading sequence file %s.\nExiting", seq_filename),"$Revision: 1.2.2.9 $"); exit_code=1; /* SIGNAL ERROR CONDITION */ goto free_memory_and_exit; } fprintf(errfile,"...Read %d sequences from sequence file %s...\n",nseq,seq_filename); for (i=0; ititle); lpo_out->title=strdup(comment); } /* DIVIDE INTO BUNDLES W/ CONSENSUS USING PERCENT ID */ if (do_analyze_bundles) generate_lpo_bundles(lpo_out,bundling_threshold); if (po_out) { /* WRITE FINAL PARTIAL ORDER ALIGNMENT TO OUTPUT */ if (lpo_file_out=fopen(po_out, "w")) { write_lpo(lpo_file_out,lpo_out,&score_matrix); fclose(lpo_file_out); fprintf(errfile,"...Wrote %d sequences to PO file %s...\n",lpo_out->nsource_seq,po_out); } else { WARN_MSG(USERR,(ERRTXT,"*** Could not save PO file %s. Exiting.", po_out),"$Revision: 1.2.2.9 $"); exit_code=1; /* SIGNAL ERROR CONDITION */ } } if (fasta_out) { /* WRITE FINAL ALIGNMENT IN FASTA-PIR FORMAT */ if (seq_ifile=fopen(fasta_out,"w")) { /* FASTA-PIR ALIGNMENT*/ write_lpo_bundle_as_fasta(seq_ifile,lpo_out,score_matrix.nsymbol, score_matrix.symbol,ibundle); fclose(seq_ifile); fprintf(errfile,"...Wrote %d sequences to FASTA-PIR file %s...\n",lpo_out->nsource_seq,fasta_out); } else { WARN_MSG(USERR,(ERRTXT,"*** Could not save FASTA-PIR file %s. 
Exiting.", fasta_out),"$Revision: 1.2.2.9 $"); exit_code=1; /* SIGNAL ERROR CONDITION */ } } if (clustal_out) { /* WRITE FINAL ALIGNMENT IN CLUSTAL FORMAT */ if (seq_ifile=fopen(clustal_out,"w")) { /* CLUSTAL ALIGNMENT*/ export_clustal_seqal(seq_ifile,lpo_out,score_matrix.nsymbol, score_matrix.symbol); fclose(seq_ifile); fprintf(errfile,"...Wrote %d sequences to CLUSTAL file %s...\n",lpo_out->nsource_seq,clustal_out); } else { WARN_MSG(USERR,(ERRTXT,"*** Could not save CLUSTAL file %s. Exiting.", clustal_out),"$Revision: 1.2.2.9 $"); exit_code=1; /* SIGNAL ERROR CONDITION */ } } free_memory_and_exit: /* FREE ALL DYNAMICALLY ALLOCATED DATA!!!! */ if (dna_lpo) free_lpo_sequence(dna_lpo,TRUE); for (i=0;i0) FREE (seq); exit (exit_code); } static LPOSequence_T *read_partial_order_file (char *po_filename, char *subset_filename, int remove_listed_seqs, int keep_all_links, int do_switch_case, ResidueScoreMatrix_T *mat) { LPOSequence_T *lpo_in; FILE *po_file=NULL, *subset_file=NULL; if (!po_filename) return NULL; po_file = fopen (po_filename, "r"); if (!po_file) { WARN_MSG (USERR, (ERRTXT,"Couldn't open MSA file %s.\nExiting.",po_filename), "$Revision: 1.2.2.9 $"); return NULL; } if (subset_filename) { subset_file = fopen (subset_filename, "r"); if (!subset_file) { WARN_MSG (USERR, (ERRTXT,"Couldn't open subset file %s.\nExiting.",subset_filename), "$Revision: 1.2.2.9 $"); return NULL; } } if (subset_file) { lpo_in = read_msa_select (po_file, UNKNOWN_MSA, subset_file, keep_all_links, remove_listed_seqs, do_switch_case, mat); fclose (subset_file); fclose (po_file); if (lpo_in==NULL || lpo_in->nsource_seq == 0) { WARN_MSG (USERR, (ERRTXT,"MSA file %s, filtered with subset file %s, couldn't be read or contains no sequences.\nExiting.", po_filename, subset_filename), "$Revision: 1.2.2.9 $"); return NULL; } } else { lpo_in = read_msa (po_file, UNKNOWN_MSA, do_switch_case, mat); fclose (po_file); if (lpo_in==NULL || lpo_in->nsource_seq == 0) { WARN_MSG (USERR, (ERRTXT,"MSA file 
%s couldn't be read or contains no sequences.\nExiting.", po_filename), "$Revision: 1.2.2.9 $"); return NULL; } } return lpo_in; } poaV2/make_pscores.pl0100755000765400076540000000240410027642411013220 0ustar poapoa#!/usr/bin/perl # # Usage: make_pscores.pl SEQFILE SCOREFILE # # Runs BLAST and writes output to a file "SEQFILE.out". # This file is parsed into lines of the form "seqname1 \t seqname2 \t bitscore" # (suitable for input to POA), which are written to SCOREFILE. # # NB: The bitscore increases with increasing sequence similarity. # $seq_file = $ARGV[0]; $pscore_file = $ARGV[1]; open(PSCORE_OUT, ">$pscore_file"); system("./formatdb -i $seq_file -p T"); system("./blastall -p blastp -d $seq_file -i $seq_file -M BLOSUM80 -o $seq_file.out"); open(BLAST_OUT, "<$seq_file.out"); while(){ @my_parse = split(/\s/, $_); if ($my_parse[0] =~ /^Query=/){ $seq_name1 = $my_parse[1]; while(){ if ($_ =~ />/){ last; } if ($_ =~ /bits/){ while(){ if ($_ =~ />/){ last; } @other_parse = split(/\s+/, $_); $seq_name2 = $other_parse[0]; $bit_score = $other_parse[1]; if ($seq_name2 ne ""){ printf PSCORE_OUT "$seq_name1\t$seq_name2\t$bit_score.0\n"; } } if ($_ =~ />/){ last; } } } } } close(BLAST_OUT); close(PSCORE_OUT); poaV2/msa_format.c0100644000765400076540000004401310062157011012477 0ustar poapoa /****************/ /* msa_format.c */ /****************/ /* --- Implementation of "msa_format.h": Functions for reading CLUSTAL- and FASTA-PIR-formatted files into the LPOSequence_T data structure, and for determining file type from the initial line(s) of a file. --- */ #include "msa_format.h" /** is `ch' an allowed residue? (a-z OR A-Z OR ? OR [ OR ]) */ static int is_residue_char (char ch); /** is `ch' an allowed gap? (. OR -) */ static int is_gap_char (char ch); /** could `ch' be the first character of a sequence name? 
(NOT # AND NOT * AND NOT whitespace) */ static int is_name_first_char (char ch); /** decide which sequences in an RC-MSA alignment to keep based on a filter file `select_ifile'. remove discarded sequences and reindex the remaining ones. returns the NEW number of sequences. */ static int filter_sequence_set (int n_seqs, FILE *select_ifile, int remove_listed_sequences, char **seq_names, char **seq_titles, char **aln_mat, int *aln_lengths); /** remove sequence bad_seq_id from each letter containing it. ASSUMES THAT NO LETTER CONTAINS _ONLY_ bad_seq_id... e.g. bad_seq_id IS A CONSENSUS. either keep or don't keep links present only in the removed sequence. */ static void strip_seq_from_lpo (LPOSequence_T *lposeq, int bad_seq_id, int keep_all_links); static LPOSequence_T *read_clustal (FILE *ifile, const char *first_line, FILE *select_ifile, int remove_listed_sequences, int do_switch_case, ResidueScoreMatrix_T *score_matrix); static LPOSequence_T *read_pir (FILE *ifile, const char *first_line, FILE *select_ifile, int remove_listed_sequences, int do_switch_case, ResidueScoreMatrix_T *score_matrix); /** Reads an MSA from a file. If `format' is UNKNOWN_MSA, the file format is determined from the first line(s) of the file. Uses `select_ifile', if non-NULL, to filter the sequence set. Uppercases or lowercases sequence characters according to `do_switch_case'. Indexes LPO symbols using `score_matrix'. */ LPOSequence_T *read_msa_select (FILE *ifile, msa_file_format format, FILE *select_ifile, int keep_all_links, int remove_listed_sequences, int do_switch_case, ResidueScoreMatrix_T *score_matrix) { static char line[512]=""; /* USE format TO FIX FILE FORMAT IF POSSIBLE. */ /* OTHERWISE, PO FILES START WITH 'VERSION=' (and this line is discarded), FASTA-PIR FILES START WITH '>', AND CLUSTAL FILES WITH 'CLUSTAL' OR SIMPLY WITH THE FIRST ALIGNMENT LINE. LINES STARTING WITH whitespace OR '#' OR '*' ARE IGNORED. 
*/ while (format!=UNKNOWN_MSA || fgets (line, sizeof(line)-1, ifile)) { if (format==PIR_MSA || line[0] == '>') { return read_pir (ifile, line, select_ifile, remove_listed_sequences, do_switch_case, score_matrix); } else if (format==PO_MSA || 0==strncmp(line,"VERSION=",8)) { if (select_ifile != NULL) { return read_lpo_select (ifile, select_ifile, keep_all_links, remove_listed_sequences); } else { return read_lpo (ifile); } } else if (format==CLUSTAL_MSA || 0==strncmp(line,"CLUSTAL",7)) { return read_clustal (ifile, line, select_ifile, remove_listed_sequences, do_switch_case, score_matrix); } else if (line[0] == '#' || line[0] == '*' || line[0] == ' ' || line[0] == '\t' || line[0] == '\n' || line[0] == '\r') { continue; } else { WARN_MSG(USERR,(ERRTXT, "Unable to determine MSA file type... trying CLUSTAL.\n"),"$Revision: 1.1.2.3 $"); return read_clustal (ifile, line, select_ifile, remove_listed_sequences, do_switch_case, score_matrix); } } WARN_MSG(USERR,(ERRTXT, "No data in MSA file.\n"),"$Revision: 1.1.2.3 $"); return NULL; } /** Reads an MSA from a file (cf. read_msa_select). */ LPOSequence_T *read_msa (FILE *ifile, msa_file_format format, int do_switch_case, ResidueScoreMatrix_T *score_matrix) { return read_msa_select (ifile, format, NULL, 0, 0, do_switch_case, score_matrix); } /** Reads a CLUSTAL-formatted alignment file. 
*/ LPOSequence_T *read_clustal (FILE *fp, const char *first_line, FILE *select_ifile, int remove_listed_sequences, int do_switch_case, ResidueScoreMatrix_T *score_matrix) { int i, j, n_seqs=0, curr_seq=0, expect_repeats=0, expect_header=1, line_num=0; char ch; char **seq_names=NULL, **seq_titles=NULL, **aln_mat=NULL; int *aln_lengths=NULL; char line[512]="", name[512]="", aln[512]=""; LPOSequence_T *lposeq = NULL; while ((first_line!=NULL && line_num==0) || fgets (line, sizeof(line)-1, fp)) { if (first_line!=NULL && line_num==0) { strcpy (line, first_line); } line_num++; if (expect_header) { /* LOOKING FOR 'CLUSTAL' HEADER LINE */ if (0 == strncmp(line,"CLUSTAL",7)) { /* FOUND IT; STOP LOOKING */ expect_header=0; continue; } else if (0 == is_name_first_char(line[0])) { /* COMMENT LINE; KEEP LOOKING */ continue; } else { expect_header=0; /* FOUND SOMETHING ELSE; NO HEADER, SO CONTINUE */ } } if (0 == is_name_first_char(line[0])) { /* BLOCK SEPARATOR? */ curr_seq = 0; if (n_seqs>0) { expect_repeats=1; /* YES; NOW EXPECT SAME SEQUENCES OVER AGAIN IN EACH BLOCK */ } continue; } if (sscanf (line, "%s %[^\n\r]", name, aln) < 2) { WARN_MSG(USERR,(ERRTXT, "Error: Trouble reading CLUSTAL-formatted file near line %d: \n>>>\n%s<<<\nBailing out.\n",line_num,line), "$Revision: 1.1.2.3 $"); goto free_memory_and_exit; } if (0 == expect_repeats) { /* FIRST BLOCK STILL, SO MAKE ROOM FOR NEW SEQ */ n_seqs++; REALLOC (seq_names, n_seqs, char *); seq_names[n_seqs-1] = strdup(name); REALLOC (seq_titles, n_seqs, char *); seq_titles[n_seqs-1] = strdup(""); REALLOC (aln_mat, n_seqs, char *); aln_mat[n_seqs-1] = strdup(" "); REALLOC (aln_lengths, n_seqs, int); aln_lengths[n_seqs-1] = 0; } else if (curr_seq>=n_seqs || strcmp(name,seq_names[curr_seq])) { /* NAME SHOULD BE A REPEAT */ WARN_MSG(USERR,(ERRTXT, "Error: Trouble reading CLUSTAL-formatted file at line %d: \n>>>\n%s<<<\nSequence name (%s) does not match expected sequence name (%s). 
Bailing out.\n",line_num,line,name,seq_names[curr_seq]), "$Revision: 1.1.2.3 $"); goto free_memory_and_exit; } REALLOC (aln_mat[curr_seq], aln_lengths[curr_seq] + strlen(aln), char); for (i=0; i0) { strcpy (lposeq->name, lposeq->source_seq[0].name); FREE (lposeq->title); lposeq->title = strdup (lposeq->source_seq[0].title); } return lposeq; } /** Reads a FASTA-PIR-formatted alignment file. */ LPOSequence_T *read_pir (FILE *fp, const char *first_line, FILE *select_ifile, int remove_listed_sequences, int do_switch_case, ResidueScoreMatrix_T *score_matrix) { int i, j, n_seqs=0, curr_seq=0, line_num=0; char ch; char **seq_names=NULL, **seq_titles=NULL, **aln_mat=NULL; int *aln_lengths=NULL, *keep_seq=NULL; char line[512]="", name[512]="", title[512]="", aln[512]=""; LPOSequence_T *lposeq = NULL; while ((first_line!=NULL && line_num==0) || fgets (line, sizeof(line)-1, fp)) { if (first_line!=NULL && line_num==0) { strcpy (line, first_line); } line_num++; if (line[0] == '>') { /* HEADER LINE FOR NEW SEQUENCE */ title[0] = '\0'; if (sscanf (line, ">%s %[^\n\r]", name, title) < 1) { WARN_MSG(USERR,(ERRTXT, "Error: Trouble reading PIR-formatted file near line %d (no sequence name?):\n>>>\n%s<<<\nBailing out.\n",line_num,line), "$Revision: 1.1.2.3 $"); goto free_memory_and_exit; } n_seqs++; curr_seq=n_seqs-1; REALLOC (seq_names, n_seqs, char *); seq_names[n_seqs-1] = strdup(name); REALLOC (seq_titles, n_seqs, char *); seq_titles[n_seqs-1] = strdup(title); REALLOC (aln_mat, n_seqs, char *); aln_mat[n_seqs-1] = strdup(" "); REALLOC (aln_lengths, n_seqs, int); aln_lengths[n_seqs-1] = 0; } else if (line[0]=='#' || line[0]=='*') { /* COMMENT LINE */ continue; } else { /* ALIGNMENT ROW FOR CURRENT SEQUENCE */ sscanf (line, "%[^\n\r]", aln); if (n_seqs==0) { WARN_MSG(USERR,(ERRTXT, "Error: Trouble reading PIR-formatted file near line %d (no preceding '>seqname' line?):\n>>>\n%s<<<\nBailing out.\n",line_num,line), "$Revision: 1.1.2.3 $"); goto free_memory_and_exit; } REALLOC 
(aln_mat[curr_seq], aln_lengths[curr_seq] + strlen(aln), char); for (i=0; i0) { strcpy (lposeq->name, lposeq->source_seq[0].name); FREE (lposeq->title); lposeq->title = strdup (lposeq->source_seq[0].title); } return lposeq; } /** Creates an LPO from an RC-MSA alignment matrix. */ LPOSequence_T *lpo_from_aln_mat (int n_seqs, char **seq_names, char **seq_titles, char **aln_mat, int *aln_lengths, int do_switch_case, ResidueScoreMatrix_T *score_matrix) { int i, j, k, len, col, res_id, letter_id; char ch; LPOSequence_T *curr_seq, *lposeq; int **column_ids; /** which column contains each residue */ int **res_ids; /** which residue is in each column */ int *al_x, *al_y; int max_aln_length = 0; char *consens_row; if (n_seqs==0) return NULL; CALLOC (column_ids, n_seqs+1, int *); CALLOC (res_ids, n_seqs+1, int *); for (i=0; i max_aln_length) { max_aln_length = aln_lengths[i]; } } for (i=0; iname, "consens_row"); lposeq->title = strdup (""); CALLOC (lposeq->sequence, max_aln_length+1, char); for (len=col=0; colsequence[len] = ch; column_ids[0][len] = col; res_ids[0][col] = len; len++; } } lposeq->length = len; lposeq->sequence[col] = '\0'; REALLOC (lposeq->sequence, len+1, char); initialize_seqs_as_lpo (1, lposeq, score_matrix); for (i=0; iname, seq_names[i]); curr_seq->title = strdup (seq_titles[i]); /* READ CHARACTERS FROM ALIGNMENT ROW INTO SEQUENCE: */ CALLOC (curr_seq->sequence, aln_lengths[i]+1, char); for (len=col=0; colsequence[len] = ch; column_ids[i+1][len] = col; res_ids[i+1][col] = len; len++; } } curr_seq->length = len; curr_seq->sequence[len] = '\0'; REALLOC (curr_seq->sequence, len+1, char); initialize_seqs_as_lpo (1, curr_seq, score_matrix); /* ALIGN THIS SEQUENCE TO EXISTING ALIGNMENT USING CONSENSUS ROW */ build_seq_to_po_index (lposeq); CALLOC (al_x, len, int); CALLOC (al_y, lposeq->length, int); for (j=0; jlength; j++) { al_y[j] = INVALID_LETTER_POSITION; } /* FOR EACH RESIDUE IN THIS SEQUENCE: */ for (j=0; jsource_seq[0].seq_to_po[res_id]; al_x[j] = 
letter_id; al_y[letter_id] = j; } /* DO THE FUSION-BASED BUILDUP */ fuse_ring_identities (lposeq->length, lposeq->letter, curr_seq->length, curr_seq->letter, al_y, al_x); fuse_lpo (lposeq, curr_seq, al_y, al_x); free_lpo_sequence (curr_seq, 0); FREE (al_x); FREE (al_y); FREE (curr_seq); } /* STRIP OUT CONSENSUS SEQUENCE */ strip_seq_from_lpo (lposeq, /*remove #*/ 0, /*0=don't keep all links*/ 0); for (i=0; i='a' && ch<='z') return 1; if (ch>='A' && ch<='Z') return 1; if (ch=='?' || ch=='[' || ch==']') return 1; return 0; } static int is_gap_char (char ch) { if (ch=='.' || ch=='-') return 1; return 0; } static int is_name_first_char (char ch) { if (ch=='\n' || ch=='\r' || ch==' ' || ch=='\t' || ch=='#' || ch=='*') return 0; return 1; } static int filter_sequence_set (int n_seqs, FILE *select_ifile, int remove_listed_sequences, char **seq_names, char **seq_titles, char **aln_mat, int *aln_lengths) { int i, j; int *keep_seq; char name[512]=""; /* DECIDE WHICH SEQUENCES TO KEEP: */ CALLOC (keep_seq, n_seqs, int); for (i=0; i= 1) { for (i=0; insource_seq - 1; /* FREE MEM ASSOCIATED WITH BAD SEQ */ FREE (lposeq->source_seq[bad_seq_id].title); FREE (lposeq->source_seq[bad_seq_id].sequence); FREE (lposeq->source_seq[bad_seq_id].seq_to_po); FREE (lposeq->source_seq[bad_seq_id].po_to_seq); /* COMPACT SOURCE LIST */ for (i=0; i=bad_seq_id) { lposeq->source_seq[i] = lposeq->source_seq[i+1]; } /* This de-allocation causes a bug in conjunction with later REBUFF calls: */ /* REALLOC (lposeq->source_seq, n_seqs, LPOSourceInfo_T); */ lposeq->nsource_seq = n_seqs; /* RENUMBER SOURCES IN EACH LETTER */ for (i=0; ilength; i++) { lett = &(lposeq->letter[i]); prev = NULL; src = &(lett->source); while (src != NULL && src->iseq >= 0) { if (src->iseq == bad_seq_id) { if (prev) { /* NOT IN LIST HEAD, SO RELINK FROM PREV AND FREE */ prev->more = src->more; FREE (src); src = prev->more; } else { /* IN LIST HEAD, SO REASSIGN HEAD DATA, RELINK, FREE */ if (src->more) { tmp_src = src->more; 
src->ipos = tmp_src->ipos; src->iseq = tmp_src->iseq; src->more = tmp_src->more; FREE (tmp_src); } else { src->ipos = -1; src->iseq = -1; } } } else { if (src->iseq>=bad_seq_id) src->iseq--; prev = src; src = src->more; } } } if (0==keep_all_links) { build_seq_to_po_index (lposeq); /* FREE ALL LINK INFO */ for (i=0; ilength; i++) { lett = &(lposeq->letter[i]); for (lnk = &(lett->left); lnk->more != NULL; ) { tmp_lnk = lnk->more; lnk->more = tmp_lnk->more; FREE (tmp_lnk); } lnk->ipos = -1; for (lnk = &(lett->right); lnk->more != NULL; ) { tmp_lnk = lnk->more; lnk->more = tmp_lnk->more; FREE (tmp_lnk); } lnk->ipos = -1; } /* REBUILD LINK INFO FROM REMAINING SEQS */ for (i=0; isource_seq[i].length; for (j=0; jsource_seq[i].seq_to_po[j]; id_right = lposeq->source_seq[i].seq_to_po[j+1]; add_lpo_link (&(lposeq->letter[id_left].right), id_right); add_lpo_link (&(lposeq->letter[id_right].left), id_left); } } } } poaV2/msa_format.h0100644000765400076540000000272210024245153012510 0ustar poapoa /****************/ /* msa_format.h */ /****************/ /* --- Functions for reading CLUSTAL- and FASTA-PIR-formatted files into the LPOSequence_T data structure, and for determining file type from the initial line(s) of a file. --- */ #ifndef MSA_FORMAT_HEADER_INCLUDED #define MSA_FORMAT_HEADER_INCLUDED #include "default.h" #include "seq_util.h" #include "lpo.h" /** types of MSA files supported */ typedef enum { UNKNOWN_MSA, CLUSTAL_MSA, PIR_MSA, PO_MSA } msa_file_format; /** Reads an MSA from a file (cf. read_msa_select). */ LPOSequence_T *read_msa (FILE *ifile, msa_file_format format, int do_switch_case, ResidueScoreMatrix_T *score_matrix); /** Reads an MSA from a file. If `format' is UNKNOWN_MSA, the file format is determined from the first line(s) of the file. Uses `select_ifile', if non-NULL, to filter the sequence set. Uppercases or lowercases sequence characters according to `do_switch_case'. Indexes LPO symbols using `score_matrix'. 
*/ LPOSequence_T *read_msa_select (FILE *ifile, msa_file_format format, FILE *select_ifile, int keep_all_links, int remove_listed_sequences, int do_switch_case, ResidueScoreMatrix_T *score_matrix); /** Creates an LPO from an RC-MSA alignment matrix. */ LPOSequence_T *lpo_from_aln_mat (int n_seqs, char **seq_names, char **seq_titles, char **aln_mat, int *aln_lengths, int do_switch_case, ResidueScoreMatrix_T *score_matrix); #endif /* MSA_FORMAT_HEADER_INCLUDED */ poaV2/multidom.pscore0100644000765400076540000000067210027642422013263 0ustar poapoaABL1_HUMAN ABL1_HUMAN 2475.0 ABL1_HUMAN MATK_HUMAN 260.0 ABL1_HUMAN GRB2_HUMAN 100.0 ABL1_HUMAN CRKL_HUMAN 63.0 CRKL_HUMAN CRKL_HUMAN 736.0 CRKL_HUMAN GRB2_HUMAN 70.0 CRKL_HUMAN ABL1_HUMAN 60.0 CRKL_HUMAN MATK_HUMAN 55.0 GRB2_HUMAN GRB2_HUMAN 551.0 GRB2_HUMAN ABL1_HUMAN 100.0 GRB2_HUMAN MATK_HUMAN 77.0 GRB2_HUMAN CRKL_HUMAN 70.0 MATK_HUMAN MATK_HUMAN 1215.0 MATK_HUMAN ABL1_HUMAN 279.0 MATK_HUMAN GRB2_HUMAN 69.0 MATK_HUMAN CRKL_HUMAN 55.0 poaV2/multidom.seq0100644000765400076540000000527310024245153012557 0ustar poapoa>ABL1_HUMAN PROTO-ONCOGENE TYROSINE-PROTEIN KINASE ABL (EC 2.7.1.112) (P150) (C-ABL). 
MLEICLKLVG CKSKKGLSSS SSCYLEEALQ RPVASDFEPQ GLSEAARWNS KENLLAGPSE NDPNLFVALY DFVASGDNTL SITKGEKLRV LGYNHNGEWC EAQTKNGQGW VPSNYITPVN SLEKHSWYHG PVSRNAAEYL LSSGINGSFL VRESESSPGQ RSISLRYEGR VYHYRINTAS DGKLYVSSES RFNTLAELVH HHSTVADGLI TTLHYPAPKR NKPTVYGVSP NYDKWEMERT DITMKHKLGG GQYGEVYEGV WKKYSLTVAV KTLKEDTMEV EEFLKEAAVM KEIKHPNLVQ LLGVCTREPP FYIITEFMTY GNLLDYLREC NRQEVNAVVL LYMATQISSA MEYLEKKNFI HRDLAARNCL VGENHLVKVA DFGLSRLMTG DTYTAHAGAK FPIKWTAPES LAYNKFSIKS DVWAFGVLLW EIATYGMSPY PGIDLSQVYE LLEKDYRMER PEGCPEKVYE LMRACWQWNP SDRPSFAEIH QAFETMFQES SISDEVEKEL GKQGVRGAVS TLLQAPELPT KTRTSRRAAE HRDTTDVPEM PHSKGQGESD PLDHEPAVSP LLPRKERGPP EGGLNEDERL LPKDKKTNLF SALIKKKKKT APTPPKRSSS FREMDGQPER RGAGEEEGRD ISNGALAFTP LDTADPAKSP KPSNGAGVPN GALRESGGSG FRSPHLWKKS STLTSSRLAT GEEEGGGSSS KRFLRSCSAS CVPHGAKDTE WRSVTLPRDL QSTGRQFDSS TFGGHKSEKP ALPRKRAGEN RSDQVTRGTV TPPPRLVKKN EEAADEVFKD IMESSPGSSP PNLTPKPLRR QVTVAPASGL PHKEEAEKGS ALGTPAAAEP VTPTSKAGSG APGGTSKGPA EESRVRRHKH SSESPGRDKG KLSRLKPAPP PPPAASAGKA GGKPSQSPSQ EAAGEAVLGA KTKATSLVDA VNSDAAKPSQ PGEGLKKPVL PATPKPQSAK PSGTPISPAP VPSTLPSASS ALAGDQPSST AFIPLISTRV SLRKTRQPPE RIASGAITKG VVLDSTEALC LAISRNSEQM ASHSAVLEAG KNLYTFCVSY VDSIQQMRNK FAFREAINKL ENNLRELQIC PATAGSGPAA TQDFSKLLSS VKEISDIVQR >CRKL_HUMAN CRK-LIKE PROTEIN. 
MSSARFDSSD RSAWYMGPVS RQEAQTRLQG QRHGMFLVRD SSTCPGDYVL SVSENSRVSH YIINSLPNRR FKIGDQEFDH LPALLEFYKI HYLDTTTLIE PAPRYPSPPM GSVSAPNLPT AEDNLEYVRT LYDFPGNDAE DLPFKKGEIL VIIEKPEEQW WSARNKDGRV GMIPVPYVEK LVRSSPHGKH GNRNSNSYGI PEPAHAYAQP QTTTPLPAVS GSPGAAITPL PSTQNGPVFA KAIQKRVPCA YDKTALALEV GDIVKVTRMN INGQWEGEVN GRKGLFPFTH VKIFDPQNPD ENE >GRB2_HUMAN GROWTH FACTOR RECEPTOR-BOUND PROTEIN 2 (GRB2 ADAPTOR PROTEIN)(SH2) MEAIAKYDFK ATADDELSFK RGDILKVLNE ECDQNWYKAE LNGKDGFIPK NYIEMKPHPW FFGKIPRAKA EEMLSKQRHD GAFLIRESES APGDFSLSVK FGNDVQHFKV LRDGAGKYFL WVVKFNSLNE LVDYHRSTSV SRNQQIFLRD IEQVPQQPTY VQALFDFDPQ EDGELGFRRG DFIHVMDNSD PNWWKGACHG QTGMFPRNYV TPVNRNV >MATK_HUMAN MEGAKARYOCYTE-ASSOCIATED TYROSINE-PROTEIN KINASE (EC 2.7.1.112) (TYROSINE-PROTEIN KINASE CTK) (PROTEIN KINASE HYL) (HEMATOPOIETIC CONSENSUS TYROSINE-LACKING KINASE). MAGRGSLVSW RAFHGCDSAE ELPRVSPRFL RAWHPPPVSA RMPTRRWAPG TQCITKCEHT RPKPGELAFR KGDVVTILEA CENKSWYRVK HHTSGQEGLL AAGALREREA LSADPKLSLM PWFHGKISGQ EAVQQLQPPE DGLFLVRESA RHPGDYVLCV SFGRDVIHYR VLHRDGHLTI DEAVFFCNLM DMVEHYSKDK GAICTKLVRP KRKHGTKSAE EELARAGWLL NLQHLTLGAQ IGEGEFGAVL QGEYLGQKVA VKNIKCDVTA QAFLDETAVM TKMQHENLVR LLGVILHQGL YIVMEHVSKG NLVNFLRTRG RALVNTAQLL QFSLHVAEGM EYLESKKLVH RDLAARNILV SEDLVAKVSD FGLAKAERKG LDSSRLPVKW TAPEALKHGK FTSKSDVWSF GVLLWEVFSY GRAPYPKMSL KEVSEAVEKG YRMEPPEGCP GPVHVLMSSC WEAEPARRPP FRKLAEKLAR ELRSAGAPAS VSGQDADGST SPRSQEP poaV2/numeric_data.c0100644000765400076540000001135210024245153013005 0ustar poapoa #include /**@memo create a new LPONumericData_T record for the designated source_seq. Dynamically allocates data storage array equal in length to the length of the source_seq. Initializes the array values to initial_value if non-zero. Returns a pointer to the new LPONumericData_T and also adds it to the source_seq->data[] list. 
*/ LPONumericData_T *new_numeric_data(LPOSourceInfo_T *source_seq, char name[], char title[], double initial_value) { int i; LPONumericData_T *data=NULL; REBUFF(source_seq->data,source_seq->ndata,NUMDATA_BUFFER_CHUNK,LPONumericData_T); data=source_seq->data + source_seq->ndata++; STRNCPY(data->name,name,SEQUENCE_NAME_MAX); /* COPY NAME AND TITLE */ if (title) data->title=strdup(title); CALLOC(data->data,source_seq->length,double); /* ALLOCATE THE ARRAY */ if (initial_value) /* INITIALIZE VALUES IF DESIRED */ LOOP (i,source_seq->length) data->data[i]=initial_value; return data; /* RETURN POINTER TO THE NEW DATA HOLDER */ } LPONumericData_T *cp_numeric_data(LPOSourceInfo_T *source_seq, LPONumericData_T *data) { int i; LPONumericData_T *new_data; new_data=new_numeric_data(source_seq,data->name,data->title,0); LOOP (i,source_seq->length) /* COPY ALL THE VALUES */ new_data->data[i]=data->data[i]; return new_data; } /**@memo finds LPONumericData from the source_seq, matching the specified name, or returns NULL if not found. */ LPONumericData_T *find_numeric_data(LPOSourceInfo_T *source_seq, char name[]) { int i; LOOP (i,source_seq->ndata) if (0==strcmp(source_seq->data[i].name,name)) /*FOUND IT. RETURN POINTER*/ return source_seq->data+i; return NULL; /* NOT FOUND! */ } /**@memo frees the set of numeric_data passed as arguments. If requested, will also free the block of memory for the array of entries data[]. */ void free_lpo_numeric_data(int ndata,LPONumericData_T *data, int please_free_block) { int i; LOOP (i,ndata) { /* DUMP ASSOCIATED ARRAYS */ FREE(data[i].title); FREE(data[i].data); } if (please_free_block) /* DUMP THE BLOCK ITSELF */ free(data); } /**@memo creates one or more new numeric_data for a given sequence, based on the presence of corresponding named numeric_data. Specifically, the list of set_names[] is processed one by one, creating a source_name according to the source_name_fmt string, and finding a numeric_data entry with that name. 
If it is not found, the routine calls exit(-1). If it is found, a new set of numeric_data is created, with a name created according to target_name_fmt, and titled according to the title_fmt string and source_data->title. */ void new_numeric_data_sets(LPOSourceInfo_T *source_seq, int nset,char *set_names[], char source_name_fmt[], char target_name_fmt[], char title_fmt[]) { int j; LPONumericData_T *data; char name[256],title[4096]; LOOPF (j,nset) { sprintf(name,source_name_fmt,set_names[j]); /* GENERATE NAME TO MATCH*/ data=find_numeric_data(source_seq,name); /*FIND SOURCE DATA*/ if (!data) { WARN_MSG(USERR,(ERRTXT,"*** could not find dataset %s for seq %s.\nExiting\n\n", name,source_seq->name),"$Revision: 1.2 $"); exit(-1); } sprintf(name,target_name_fmt,set_names[j]); /* GENERATE NEW NAME */ sprintf(title,title_fmt,data->title); new_numeric_data(source_seq,name,title,0.); /* CREATE NEW ARRAY */ } } /**@memo reads a stream of FASTA-formatted numeric data, and stores them in the corresponding set of source_seq, matching the sequences by name. Multiple numeric data entries can be read from a single stream. */ void read_numeric_data(int nsource_seq, LPOSourceInfo_T source_seq[], FILE *ifile) { int i,j; char line[4096],seq_name[128],data_name[1024],title[2048]; LPONumericData_T *data; while (fgets(line,sizeof(line),ifile)) { title[0]='\0'; if (sscanf(line,">%s NUMERIC_DATA=%s %s",seq_name,data_name,title)>=2) { LOOP (i,nsource_seq) /* FIND THE MATCHING SEQUENCE */ if (0==strcmp(seq_name,source_seq[i].name)) /* MATCH */ break; if (LOOP_FINISHED(i,nsource_seq)) /* SEQ NOT FOUND!! */ WARN_MSG(USERR,(ERRTXT,"Error! NUMERIC_DATA %s, sequence %s does not exist. Skipping.\n\n",data_name,seq_name),"$Revision: 1.2 $"); else { /* FOUND THE SEQ, SAVE THE DATA */ if (data=find_numeric_data(source_seq+i,data_name)) /*REUSE EXISTING*/ WARN_MSG(WARN,(ERRTXT,"NUMERIC_DATA %s already exists on sequence %s. 
Overwriting.\n",data_name,seq_name),"$Revision: 1.2 $"); else /* CREATE A NEW DATA HOLDER */ data=new_numeric_data(source_seq+i,data_name,title,0.); LOOPF (j,source_seq[i].length) /* READ IN THE VALUES */ fscanf(ifile," %lf",data->data+j); } } } } poaV2/poa.h0100644000765400076540000001356610024245153011147 0ustar poapoa #ifndef POA_HEADER_INCLUDED #define POA_HEADER_INCLUDED /** MAXIMUM GAP LENGTH TRACKED IN align_lpo; LARGER THAN THIS WILL BE CAPPED (DEFAULT VALUE) */ #ifndef TRUNCATE_GAP_LENGTH #define TRUNCATE_GAP_LENGTH 16 #endif /** LENGTH OVER WHICH GAP PENALTY DECAYS IN align_lpo (DEFAULT VALUE) */ #ifndef DECAY_GAP_LENGTH #define DECAY_GAP_LENGTH 0 #endif #define REV_COMP_STRING "/rev_comp" /** THE NULL LETTER-REFERENCE */ #define INVALID_LETTER_POSITION (-1) typedef int LPOLetterRef_T; typedef int LPOScore_T; /** NEEDED FOR seq_util.h */ typedef LPOScore_T ResidueScore_T; #define RESIDUE_SCORE_DEFINED /** linked list for storing source origin (sequence position) from which this letter was derived */ struct LPOLetterSource_S { /** index of the sequence, referencing the source_seq[] array*/ int iseq; /** index of the corresponding position in that sequence */ LPOLetterRef_T ipos; /** next node in the linked list */ struct LPOLetterSource_S *more; } ; typedef struct LPOLetterSource_S LPOLetterSource_T; /** linked list for connecting an LPOLetter to either right or left */ struct LPOLetterLink_S { /** ADJACENT LETTER LINKED TO THIS LETTER */ LPOLetterRef_T ipos; #ifdef USE_WEIGHTED_LINKS /** transition cost for traversing this link */ LPOScore_T score; #endif /** next node in the linked list */ struct LPOLetterLink_S *more; } ; typedef struct LPOLetterLink_S LPOLetterLink_T; /** the chunk size for allocating additional letters in an LPOLetter_T array */ #define LPO_LETTER_BUFFER_CHUNK 64 /** Structure for storing individual LPO Letters*/ struct LPOLetter_S { /** ADJACENT LETTER(S) TO THE LEFT */ LPOLetterLink_T left; /** ADJACENT LETTER(S) TO THE RIGHT */ 
LPOLetterLink_T right; /** SOURCE SEQ POSITION(S) */ LPOLetterSource_T source; /** CIRCULAR LIST OF ALIGNED POSITIONS */ LPOLetterRef_T align_ring; /** MINIMUM INDEX OF ALL POSITIONS ON THE RING */ LPOLetterRef_T ring_id; /** SCORE FOR BALANCING PARTIAL ORDER EFFECTS ON MATRIX NEUTRALITY */ float score; /** THE ACTUAL RESIDUE CODE! */ char letter; } ; typedef struct LPOLetter_S LPOLetter_T; /** maximum length of a sequence name */ #define SEQUENCE_NAME_MAX 32 /** buffer chunk size for expanding a block of seq storage */ #define SEQUENCE_BUFFER_CHUNK 8 /** buffer chunk size for expanding a source_seq[] array */ #define SOURCE_SEQ_BUFFER_CHUNK 16 #define NUMDATA_BUFFER_CHUNK 4 /** storage for quantitative data attached to a sequence */ struct LPONumericData_S { /** */ char name[SEQUENCE_NAME_MAX]; /** */ char *title; /** */ double *data; }; typedef struct LPONumericData_S LPONumericData_T; /** Structure for storing individual source sequence information, stuff like name, title etc. */ struct LPOSourceInfo_S { /** */ char name[SEQUENCE_NAME_MAX]; /** */ char *title; /** */ char *sequence; /** */ int *seq_to_po; /** */ int *po_to_seq; /** */ LPONumericData_T *data; /** */ int ndata; /** */ int length; /** */ int istart; /** FOR PURPOSES OF HEAVIEST BUNDLE CALCULATION */ int weight; /** WHAT BUNDLE IS THIS A MEMBER OF? 
*/ int bundle_id; }; typedef struct LPOSourceInfo_S LPOSourceInfo_T; /** the NULL bundle-reference */ #define NO_BUNDLE (-1) /** bundle-reference meaning "include all bundles" */ #define ALL_BUNDLES (-1) /** holder for an LPO sequence, its letters, and associated information */ struct LPOSequence_S {/** */ int length;/** */ LPOLetter_T *letter;/** */ char *title;/** */ char *sequence;/** */ char name[SEQUENCE_NAME_MAX];/** */ int nsource_seq;/** */ LPOSourceInfo_T *source_seq; }; typedef struct LPOSequence_S LPOSequence_T; typedef LPOSequence_T Sequence_T; /**@memo GENERAL FORM IS seq_y[j].left.ipos */ #define SEQ_Y_LEFT(j) (j-1) #define SEQ_Y_RIGHT(j) (j+1) /**@memo Data structure for analyzing sequence differences in MSA*/ struct LPOLetterCount_S { unsigned int is_error:2; unsigned int meets_criteria:1; unsigned int seq_count:29; }; typedef struct LPOLetterCount_S LPOLetterCount_T; /** classification of sequence differences */ enum { no_error, substitution_error, insertion_error, deletion_error, max_error_states }; /** DON'T ALLOCATE MORE THAN THIS TOTAL AMOUNT OF MEMORY --------------------------------------------------------------- --------------------------------------------------------------- */ #define POA_MAX_ALLOC 300000000 #endif /**@name The lpo library*/ /*@{*/ /**@memo This set of web pages documents the functionality of the lpo function library. This is a set of C functions for reading, writing, creating, manipulating, and aligning partial order sequences. These functions divide into several groups: \begin{itemize} \item \URL[File utilities]{General.html#read_fasta}: reading and writing FASTA and po files \item \URL[lpo utilities]{General.html#add_lpo_link}: creating, fusing, freeing, manipulating lpo data \item \URL[alignment]{General.html#align_lpo}: aligning one or more linear sequences to an lpo \item analysis: analyzing lpo structure, e.g. 
to find consensus \end{itemize} */ /**@memo Click \URL[here]{../poa} for more information about partial order alignment.*/ /*@}*/ /**@name linking to the lpo library */ /*@{*/ /**@memo To use function from this library in your code, you must do two things. First you must include in your source files, to access the prototypes. Second, when you compile, you must tell the compiler where the lpo header and library files are located. {\bfNB it appears gcc loads libraries in reverse order of the command line arguments, so you have to specify your source files BEFORE the -llpo library argument on the command line, or the linker will give you unresolved reference errors}. e.g. \begin{verbatim} gcc -o myprog myfile.c -I~leec/lib/include -L~leec/lib -llpo \end{verbatim} */ /**@memo Click \URL[here]{../poa} for more information about partial order alignment.*/ /*@}*/ poaV2/project.h0100644000765400076540000000021410024245153012020 0ustar poapoa #ifndef PROGRAM_NAME #define PROGRAM_NAME "poa" #endif #ifndef PROGRAM_VERSION #define PROGRAM_VERSION "v1.0.0" #endif #include "poa.h" poaV2/remove_bundle.c0100644000765400076540000001056110024245153013201 0ustar poapoa #include "default.h" #include "poa.h" #include "seq_util.h" #include "lpo.h"

/** Rewrites the link list headed by `list' for a renumbered letter array.
    old_to_new[] maps each old letter index to its new index, with a
    negative entry meaning the letter no longer exists.  Links to deleted
    letters are freed; surviving links get their ipos remapped.  The head
    node is embedded in the letter, so the list is processed through a
    heap copy of the head and the new first survivor is copied back into
    `list' at the end.
    Returns 1 if at least one link survives, 0 if the list is now empty
    (in which case `list' is blanked: more=NULL, ipos=INVALID_LETTER_POSITION). */
int compact_links(LPOLetterLink_T *list,int old_to_new[])
{
  LPOLetterLink_T *link=NULL,*link_last=NULL,*next_link,*link_head=NULL;

  /* WORK ON A HEAP COPY OF THE EMBEDDED HEAD NODE, SO EVERY NODE IN THE
     CHAIN CAN BE free()D UNIFORMLY */
  CALLOC(link,1,LPOLetterLink_T);
  memcpy(link,list,sizeof(LPOLetterLink_T));

  /* WALK THE CHAIN; A NEGATIVE ipos STOPS THE WALK (EMPTY ENTRY) */
  for (;link && link->ipos>=0;link=next_link){
    next_link=link->more; /* SAVE SUCCESSOR BEFORE link MAY BE FREED */
    if (old_to_new[link->ipos]<0) /* THIS POSITION NO LONGER EXISTS! */
      free(link); /* DELETE THIS LINK ENTRY */
    else { /* COPY THIS BACK TO PREVIOUS LINK ENTRY: COMPACT THE LIST*/
      link->ipos = old_to_new[link->ipos]; /* REMAP TO NEW INDEX SYSTEM */
      if (link_last) /* CONNECT TO PREVIOUS NODE IN LIST */
	link_last->more=link;
      else /* THIS IS THE NEW HEAD OF THE LIST */
	link_head=link;
      link_last=link;
    }
  }

  if (link) /* AN EMPTY LINK I.E. link->ipos<0 ... JUNK IT */
    free(link);

  if (link_last) /* TERMINATE LAST NODE IN LIST */
    link_last->more=NULL;
  if (link_head) {
    /* COPY THE FIRST SURVIVOR BACK INTO THE EMBEDDED HEAD, THEN RELEASE
       ITS HEAP NODE; THE REST OF THE CHAIN STAYS ON THE HEAP */
    memcpy(list,link_head,sizeof(LPOLetterLink_T));
    free(link_head);
    return 1; /* COMPACTED LINK LIST IS NON-EMPTY */
  }
  else { /* NOTHING LEFT IN LIST, SO BLANK IT */
    list->more=NULL;
    list->ipos= INVALID_LETTER_POSITION;
    return 0; /* COMPACTED LINK LIST IS EMPTY */
  }
}



/** Removes from the source list headed by `list' every entry whose
    sequence (source_seq[entry->iseq]) has bundle_id == ibundle_delete.
    Uses the same heap-copy-of-the-head technique as compact_links().
    Unlike compact_links(), every node in the chain is visited and its
    iseq is used to index source_seq[] without a range check.
    NOTE(review): this assumes every node, including the head, has a
    valid iseq -- confirm that callers never pass an empty list.
    Returns 1 if at least one source entry survives, 0 if the list is now
    empty (in which case `list' gets more=NULL and
    ipos=INVALID_LETTER_POSITION; iseq is left unchanged). */
int compact_sources(LPOLetterSource_T *list,int ibundle_delete,
		    LPOSourceInfo_T source_seq[])
{
  LPOLetterSource_T *source=NULL,*source_last=NULL,*next_source,*source_head=NULL;

  /* HEAP COPY OF THE EMBEDDED HEAD NODE */
  CALLOC(source,1,LPOLetterSource_T);
  memcpy(source,list,sizeof(LPOLetterSource_T));

  for (;source;source=next_source){
    next_source=source->more; /* SAVE SUCCESSOR BEFORE source MAY BE FREED */
    if (source_seq[source->iseq].bundle_id == ibundle_delete)
      free(source); /* DELETE THIS SOURCE ENTRY */
    else { /* COPY THIS BACK TO PREVIOUS SOURCE ENTRY: COMPACT THE LIST*/
      if (source_last) /* CONNECT TO PREVIOUS NODE IN LIST */
	source_last->more=source;
      else /* THIS IS THE NEW HEAD OF THE LIST */
	source_head=source;
      source_last=source;
    }
  }

  if (source_last) /* TERMINATE LAST NODE IN LIST */
    source_last->more=NULL;
  if (source_head) {
    /* FIRST SURVIVOR BECOMES THE EMBEDDED HEAD; ITS HEAP NODE IS RELEASED */
    memcpy(list,source_head,sizeof(LPOLetterSource_T));
    free(source_head);
    return 1; /* COMPACTED SOURCE LIST IS NON-EMPTY */
  }
  else { /* NOTHING LEFT IN LIST, SO BLANK IT */
    list->more=NULL;
    list->ipos= INVALID_LETTER_POSITION;
    return 0; /* COMPACTED SOURCE LIST IS EMPTY */
  }
}



/** Rebuilds ring_id and align_ring for every letter of `seq' after the
    letter array has been compacted.  Consecutive letters that carry the
    same (old) ring_id are treated as one ring: each gets ring_id set to
    the index of the first letter of the run, each non-first letter points
    its align_ring at the letter to its left, and the first letter's
    align_ring is pointed at the last letter of the run, closing the cycle.
    NOTE(review): this relies on members of a ring being adjacent in the
    array, and on LOOPF leaving i equal to seq->length after the loop --
    confirm against the LOOPF definition. */
void reindex_compact_rings(LPOSequence_T *seq)
{
  int i,iring= -1,ring_start= -1;
  LOOPF (i,seq->length) { /* REMOVE ALL LINKS TO OLD, DELETED POSITIONS */
    if (iring==seq->letter[i].ring_id) { /* STILL INSIDE THE CURRENT RING */
      seq->letter[i].ring_id=ring_start; /* USE FIRST INDEX ON RING */
      seq->letter[i].align_ring= i-1; /* LINK TO LETTER TO LEFT */
    }
    else { /* START OF A NEW RING */
      if (ring_start>=0) /* CLOSE PREVIOUS RING: FIRST LETTER -> ITS LAST LETTER */
	seq->letter[ring_start].align_ring = i-1;
      iring=seq->letter[i].ring_id; /* REMEMBER OLD ID TO DETECT RING MEMBERS */
      seq->letter[i].ring_id=seq->letter[i].align_ring=ring_start=i;
    }
  }
  if (ring_start>=0) /* CLOSE THE FINAL RING */
    seq->letter[ring_start].align_ring = i-1;
}



#define
DELETE_THIS_BUNDLE (-2) int remove_bundle(LPOSequence_T *seq,int ibundle,int delete_all_others) { int i,j=0,*old_to_new=NULL,new_length; if (delete_all_others) { /* INSTEAD OF DELETING THIS BUNDLE, */ LOOP (i,seq->nsource_seq) /*MARK ALL OTHERS TO BE DELETED! */ if (seq->source_seq[i].bundle_id!=ibundle) seq->source_seq[i].bundle_id = DELETE_THIS_BUNDLE; ibundle=DELETE_THIS_BUNDLE; } CALLOC(old_to_new,seq->length,int); /* CREATE MAPPING ARRAY */ LOOPF (i,seq->length) { if (compact_sources(&seq->letter[i].source,ibundle,seq->source_seq)){ if (i>j) /* COPY LETTER TO COMPACTED POSITION */ memcpy(seq->letter+j,seq->letter+i,sizeof(LPOLetter_T)); old_to_new[i]=j++; /* SAVE MAPPING FROM OLD TO NEW, COMPACTED POSITION*/ } else { free_lpo_letters(1,seq->letter+i,FALSE); /* DUMP DATA FOR THIS LETTER */ old_to_new[i]= INVALID_LETTER_POSITION; } } new_length=j; if (new_lengthlength) /* ERASE UNUSED PORTIONS AFTER COMPACTED ARRAY*/ memset(seq->letter+new_length,0,(seq->length - new_length)*sizeof(LPOLetter_T)); LOOP (i,new_length) { /* REMOVE ALL LINKS TO OLD, DELETED POSITIONS */ compact_links(&seq->letter[i].left,old_to_new); compact_links(&seq->letter[i].right,old_to_new); } seq->length=new_length; reindex_compact_rings(seq); FREE(old_to_new); return new_length; } poaV2/seq_util.c0100644000765400076540000002057710024266007012211 0ustar poapoa #include "default.h" #include "seq_util.h" /** randomizes seq[] by shuffling, and places the result in randseq[]; if randseq[] and seq[] are distinct, seq[] is left unchanged */ void shuffle_seq(int len, char seq[], char randseq[]) { int i,j; char c; for (i=0;insymbol,m->symbol); */ /** converts characters in seq[] to the INDEX of the matching character in symbols[], and returns the result in out[] */ void index_symbols(int nseq,char seq[],char out[], int nsymbs,char symbols[]) { int i,j,k; LOOP (i,nseq) { k=nsymbs-1; /* DEFAULT: UNMATCHABLE SYMBOL */ LOOP (j,nsymbs) { /* FIND MATCHING SYMBOL */ if (symbols[j]==seq[i]) { /* FOUND IT! 
*/ k=j; break; } } out[i]=k; /* SAVE THE TRANSLATED CODE */ } return; } int *Score_matrix_row=NULL; int best_match_qsort_cmp(const void *void_a,const void *void_b) { int *a=(int *)void_a,*b=(int *)void_b; if (Score_matrix_row[*a]>Score_matrix_row[*b]) return -1; else if (Score_matrix_row[*a]gap_penalty_set[0][0]=m->gap_penalty_set[1][0]=12; /*SAVE PENALTIES*/ m->gap_penalty_set[0][1]=m->gap_penalty_set[1][1]=2; m->gap_penalty_set[0][2]=m->gap_penalty_set[1][2]=0; m->trunc_gap_length = TRUNCATE_GAP_LENGTH; m->decay_gap_length = DECAY_GAP_LENGTH; ifile=fopen(filename,"r"); if (!ifile) { WARN_MSG(USERR,(ERRTXT,"Can't open alignment matrix from %s\n",filename),"$Revision: 1.2.2.2 $"); return -2; /* FAILED TO FIND FILE TO READ */ } while (fgets(line,1023,ifile)) { if ('#'==line[0] || '\n'==line[0]) /* SKIP COMMENT OR BLANK LINES */ continue; else if (1==sscanf(line,"GAP-TRUNCATION-LENGTH=%d",&i)) { m->trunc_gap_length = i; } else if (1==sscanf(line,"GAP-DECAY-LENGTH=%d",&i)) { m->decay_gap_length = i; } else if (3==sscanf(line,"GAP-PENALTIES=%d %d %d",&i,&j,&k)) { m->gap_penalty_set[0][0]=m->gap_penalty_set[1][0]=i; /*SAVE PENALTIES*/ m->gap_penalty_set[0][1]=m->gap_penalty_set[1][1]=j; m->gap_penalty_set[0][2]=m->gap_penalty_set[1][2]=k; } else if (3==sscanf(line,"GAP-PENALTIES-X=%d %d %d",&i,&j,&k)) { m->gap_penalty_set[1][0]=i; /*SAVE PENALTIES ONLY FOR X DIRECTION*/ m->gap_penalty_set[1][1]=j; m->gap_penalty_set[1][2]=k; } #ifdef SOURCE_EXCLUDED else if (1==sscanf(line,"DNACODES=%99s",dna_codes)) { /* READ DNACODES*/ strcpy(DNA_symbols,dna_codes);/*SYMBOLS COUNTED AS DNA FOR AUTORECOG*/ } else if (1==sscanf(line,"DNASCALE=%f",&DNA_rescale_score)) continue; #endif else if (!found_symbol_line) { /* READ THIS LINE AS LIST OF SEQ SYMBOLS*/ for (i=0;'\0'!=line[i];i++) if (!isspace(line[i])) /* IGNORE WHITESPACE */ m->symbol[nsymb++]=line[i]; /* SAVE TO LIST OF SYMBOLS */ found_symbol_line=1; /* SET FLAG SO WE NOW READ MATRIX SCORE VALUES */ } else { /* READ SCORING 
MATRIX LINES */ found_symbol_line=0; /* DEFAULT: FAILED TO FIND MATCHING SYMBOL IN LIST*/ LOOP (isymb,nsymb) /* FIND MATCH TO THIS SYMBOL */ if (m->symbol[isymb]==line[0]) { found_symbol_line=1; /* SIGNAL THAT WE SUCCESFULLY FOUND MATCH */ j=1; /* SKIP FIRST CHARACTER: OUR SEQUENCE SYMBOL */ LOOPF (i,nsymb) { /* READ ALL THE SCORE VALUES ON THIS LINE */ if (1==sscanf(line+j,"%d%n",&(m->score[isymb][i]),&k)) j+=k; /* ADVANCE THE READING POSITION */ else { /* MISSING SCORE DATA: ERROR! */ IF_GUARD(1,5.23,(ERRTXT,"Missing score value for pair %c:%c", m->symbol[isymb],m->symbol[i]),TRAP) ; fclose(ifile); /* CLOSE OUR STREAM */ return -1; } } break; } IF_GUARD(!found_symbol_line,1.5,(ERRTXT,"Missing or unknown sequence symbol: %c",line[0]),TRAP) { /* ERROR: AN INVALID SYMBOL, NOT IN LIST */ fclose(ifile); /* CLOSE OUR STREAM */ return -1; } } } fclose(ifile); /* CONSTRUCT GAP PENALTY ARRAYS FROM GAP PARAMETERS: */ m->max_gap_length = m->trunc_gap_length + m->decay_gap_length; CALLOC (m->gap_penalty_x, m->max_gap_length+2, LPOScore_T); CALLOC (m->gap_penalty_y, m->max_gap_length+2, LPOScore_T); /*** GAP OPENING PENALTY @ L=0->1 */ m->gap_penalty_x[0] = m->gap_penalty_set[0][0]; m->gap_penalty_y[0] = m->gap_penalty_set[1][0]; /*** 1st AFFINE EXTENSION PENALTY (A1) @ L=1->2,2->3,...T-1->T */ for (i=1;itrunc_gap_length;i++) { m->gap_penalty_x[i] = m->gap_penalty_set[0][1]; m->gap_penalty_y[i] = m->gap_penalty_set[1][1]; } /*** DECAYING EXTENSION PENALTY (A1-->A2; skipped if D=0) @ L=T->T+1,...T+D-1->T+D */ for (i=0;idecay_gap_length;i++) { double dec_x = (m->gap_penalty_set[0][1] - m->gap_penalty_set[0][2]) / ((double)(m->decay_gap_length + 1)); double dec_y = (m->gap_penalty_set[1][1] - m->gap_penalty_set[1][2]) / ((double)(m->decay_gap_length + 1)); m->gap_penalty_x[i+m->trunc_gap_length] = m->gap_penalty_set[0][1] - (i+1) * dec_x; m->gap_penalty_y[i+m->trunc_gap_length] = m->gap_penalty_set[1][1] - (i+1) * dec_y; } /*** 2nd AFFINE EXTENSION PENALTY (A2) @ L>=T+D */ 
m->gap_penalty_x[m->max_gap_length] = m->gap_penalty_set[0][2]; m->gap_penalty_y[m->max_gap_length] = m->gap_penalty_set[1][2]; m->gap_penalty_x[m->max_gap_length+1] = 0; /* DON'T REMOVE THIS!... SPECIAL STATE USED IN align_lpo. */ m->gap_penalty_y[m->max_gap_length+1] = 0; /* DON'T REMOVE THIS!... SPECIAL STATE USED IN align_lpo. */ LOOPF (i,nsymb) { Score_matrix_row= m->score[i]; /* ROW TO USE FOR SORTING best_match */ LOOP (j,nsymb) m->best_match[i][j] = j; qsort(m->best_match[i],nsymb,sizeof(m->best_match[0][0]), best_match_qsort_cmp); #ifdef SOURCE_EXCLUDED printf("%c SORT",m->symbol[i]); /* TEST: PRINT OUT SORTED TABLE */ LOOPF (j,nsymb) printf("\t%c:%d",m->symbol[m->best_match[i][j]], m->score[i][m->best_match[i][j]]); printf("\n"); #endif } m->symbol[nsymb]='\0'; /* TERMINATE THE SYMBOL STRING */ m->nsymbol=nsymb; return nsymb; } /** prints a scoring matrix, only including those symbols in subset[] */ void print_score_matrix(FILE *ifile,ResidueScoreMatrix_T *m,char subset[]) { int i,i_m,j,j_m,nsubset; nsubset=strlen(subset); printf(" "); LOOPF (i,nsubset) printf(" %c",subset[i]); printf("\n"); LOOPF (i,nsubset) { LOOP (i_m,m->nsymbol) if (m->symbol[i_m]==subset[i]) break; printf("%c",subset[i]); LOOPF (j,nsubset) { LOOP (j_m,m->nsymbol) if (m->symbol[j_m]==subset[j]) break; printf("%3d",m->score[i_m][j_m]); } printf("\n"); } return; } /** restricts seq[] to the set of allowed characters given by symbol[]; other characters will be replaced by the default symbol[0] */ int limit_residues(char seq[],char symbol[]) { int i,len,nreplace=0; len=strlen(seq); for (i=strspn(seq,symbol);ip; if (0 == s1->last_alloc) /* SAVE STATIC STRING */ stringptr_cat_temp_p = s1->p; /* SAVE OLD */ total_len=s2_len=strlen(s2)+1; CALLOC(s2_temp,s2_len,char); /* ALLOCATE TEMP STORAGE */ memcpy(s2_temp,s2,s2_len); /* COPY THE STRING */ if (s1->p) { /* CALCULATE ADDITIONAL SPACE NEEDED FOR ORIGINAL STRING */ if (pos) /* USE THE CALLER-SUPPLIED STRING LENGTH */ total_len += *pos; else /* 
OTHERWISE MEASURE IT */ total_len += strlen(s1->p); } GETMEM(s1->p,total_len, s1->last_alloc,STRINGPTR_BUFFER_CHUNK,char); if (stringptr_cat_temp_p) /* PUT OLD STATIC STRING BACK IN */ strcpy(s1->p,stringptr_cat_temp_p); if (pos) { /* ATTACH NEW STRING AT CALLER-SUPPLIED END-POSITION */ strcpy(s1->p + *pos,s2_temp); *pos = total_len-1; /* EXCLUDE TERMINATOR */ } else /* OTHERWISE strcat AS USUAL */ strcat(s1->p,s2_temp); FREE(s2_temp); /* FREE THE TEMP STORAGE */ return s1->p; } char *stringptr_cat(stringptr *s1,const char s2[]) /* ~~g --- */ { return stringptr_cat_pos(s1,s2,NULL); } char *stringptr_cpy(stringptr *s1,const char s2[]) /* ~~g --- */ { GETMEM(s1->p,strlen(s2)+1,s1->last_alloc,STRINGPTR_BUFFER_CHUNK,char); strcpy(s1->p,s2); return s1->p; } /**stringptr_free************************************************* * * stringptr_free: * AUTHOR: tal * Wed Jul 27 03:04:41 PDT 1994 * * Frees stringptr type. * ***************************************************************/ int stringptr_free(stringptr *s) /* ~~g --- */ { FREE(s->p); s->last_alloc=0; return 0; } poaV2/README0100644000765400076540000003363010027643067011102 0ustar poapoa -- POA INSTALLATION NOTES -- September 2001, updated March 2004. Chris Lee Dept. of Chemistry & Biochemistry UCLA I. COMPILATION To compile this program, simply type 'make poa'. This produces an executable for sequence alignment (poa) and also a linkable library liblpo.a. The software has been compiled and tested on LINUX and Mac OS X. II. RUNNING POA POA has a variety of command line options. Running POA without any arguments will print a list of the possible command line arguments. POA may be used to construct a PO-MSA, or to analyze a PO-MSA. A. Constructing a PO-MSA ------------------------- 1. Required Input: i. An Alignment Score Matrix File: A score matrix file is required, because POA uses it to get the residue alphabet and indexing. 
 Even if POA is not being used to perform multiple sequence alignment, this file must be provided. Any basic alignment matrix which may be used with BLAST may be used here. This file must be the first command line argument without a flag in order to be interpreted by POA as the score matrix file. Two example score matrix files, blosum80.mat and blosum80_trunc.mat, are provided in this directory. They include scores for matching nucleotides, as well as amino acids. Header lines may be used to specify gap parameters, as in the examples: GAP-PENALTIES=A B C GAP-TRUNCATION-LENGTH=T GAP-DECAY-LENGTH=D means that the gap opening penalty is A; the gap extension penalty is B until the gap length reaches T; the gap extension penalty decreases linearly from B to C for gap lengths between T and T+D; and the gap extension penalty is C for all longer gaps. An additional line, "GAP-PENALTIES-X=Ax Bx Cx", can be inserted after the GAP-PENALTIES line to specify the opening and extension penalties for gaps in the first sequence relative to the second sequence in an alignment, i.e., for asymmetric gap scoring. Use the -v flag to see what gap penalties POA is using for a given run. NOTE: POA is case-sensitive. In order to distinguish amino acid residues from nucleic acid residues, POA is case-sensitive. Residues that are uppercase are interpreted as amino acids, while residues that are lowercase are interpreted as nucleotides. POA can handle mixed score matrices containing both amino acid and nucleotide scores, as long as the column and row labels of the matrix are case-sensitive. The blosum80.mat file is an example. ii. A FASTA file, or MSA Files in PO, CLUSTAL or PIR Format A FASTA file is required only if POA is being used to construct a new PO-MSA from a list of sequences, or to align a list of sequences to an already existing PO-MSA (see Analyzing a PO-MSA below). This FASTA file should contain sequences to be aligned by POA. 
 The command line argument to get POA to accept a FASTA file as input is '-read_fasta FILENAME'. POA will interpret FILENAME as the FASTA sequence file. An example file, multidom.seq, is provided in this directory. POA is case-sensitive (see NOTE above). All residues in the FASTA file must be uppercase to be interpreted as amino acids by POA, or lowercase to be interpreted as nucleotides. To switch the case of all of the letters in the FASTA file to uppercase, use the '-toupper' command line argument. To switch the case of all the letters in the FASTA file to lowercase, use the '-tolower' command line argument. POA will also read in a set of MSA files to be aligned (see below). 2. MSA Construction Options: i. Global vs. Local Alignment POA will build alignments using local or global alignment. The default is set to local alignment. To call global alignment, use the '-do_global' flag. ii. Iterative vs. Progressive Alignment POA will build the alignment iteratively, aligning sequences and MSAs in the order they are provided. It will also align sequences and MSAs in the order dictated by a guide tree built from a matrix of pairwise similarity scores. To call progressive alignment, use the '-do_progressive' flag. To provide POA with a set of pairwise similarity scores, use the '-read_pairscores' flag followed by the name of the text file containing the list of pairwise similarity scores. This file should be a tab-delimited file, where each row contains two sequence names followed by the pairwise similarity score. The included file "multidom.pscore" shows an example. Example row in pairscore file: ABL1_HUMAN MATK_HUMAN 260.0 To quickly compute a set of pairwise similarity scores, run BLAST, and set the similarity scores to the set of BLAST bitscores. A simple BLAST driver/parser is provided as "make_pscores.pl"; you may need to modify this script for your particular configuration. 
If the '-do_progressive' flag is specified without a corresponding pairscore file, POA will compute pairwise similarity scores itself, by performing all pairwise alignments. This is very slow, and we do not recommend it. iii. Aggressive Fusion: By default, during the building up of a PO-MSA, if a node i with label 'A' is aligned to a node j with label 'B' that belongs to an align ring containing a third node k with label 'A', POA simply adds node i to the j-k align ring. It is possible to force POA to do aggressive fusion, so that node i is instead fused to node k. Use the '-fuse_all' flag to accomplish this. 3. MSA Output Formats: POA can output a PO-MSA in several formats simultaneously, including CLUSTAL, PIR, and PO. The PO format is the best format since it contains all of the information in the PO-MSA. The other formats accurately represent the MSA, but since they are RC-MSA formats, they may lose some of the information in the full PO-MSA. i. CLUSTAL format: This format is the standard CLUSTAL format. The command line argument to get the MSA output in this format is '-clustal FILENAME'. ii. PIR format: This format is the standard PIR format, which is like FASTA with a '.' character representing gaps. The command line argument to get the MSA output in this format is '-pir FILENAME'. iii. PO format: This format is the standard PO format. It is described below in the section PO format. The command line argument to get the MSA output in this format is '-po FILENAME'. EXAMPLE: Constructing an MSA of Four Protein Sequences Running POA with the following statement will take the FASTA-formatted sequences in the multidom.seq file, construct a PO-MSA using the scoring matrix in the file blosum80.mat, and then output the PO-MSA in CLUSTAL format to the file multidom.aln. poa -read_fasta multidom.seq -clustal multidom.aln blosum80.mat 4. Other Output: i. Score Matrix POA will also print to stdout the score matrix stored in the '.mat' file. 
The command line argument to get POA to do this is '-printmatrix LETTERSET', where LETTERSET is a string of letters to be printed with the score matrix. For example, if the score matrix is designed for protein alignment the letter set might be 'ARNDCQEGHILKMFPSTWYV'. ii. Verbose Mode POA will run in verbose mode, printing additional information generated during the run (such as the set of gap scores used) to stdout. The command line argument for verbose mode is '-v'. B. Analyzing a PO-MSA ----------------------- POA can also take as input an MSA in PO, CLUSTAL or PIR file format and rebuild the PO-MSA data structure. Once this data structure has been rebuilt, it may be analyzed for features. In "liblpo.a", the linkable POA library, we have included the functions necessary to do heaviest bundling and thereby find consensus sequences in the PO-MSA (the details of the heaviest bundling algorithm are described elsewhere). POA has been written so that users may create their own functions for analyzing a PO-MSA. We have not included in the "liblpo.a" library the functions that we wrote to analyze PO-MSAs constructed with ESTs and genome sequence to find snps and alternative splice sites. However, it is possible to design modular library functions that will look for highly specific biological features in any PO-MSA data structure. 1. Required Input: Before the PO-MSA data structure can be analyzed it must be built. It can be built iteratively or using a guide tree, or converted from another file type. POA can align a set of FASTA-formatted sequences to each other or to an existing PO-MSA. It can align two PO-MSAs. It can also align an arbitrary set of PO-MSAs. Note: POA Requires Either An MSA File or a FASTA File If neither type of file is read in by POA it will terminate early, since it has not received any sequence data. i. An MSA file in PO, CLUSTAL or PIR format: POA will read in an MSA file in PO, CLUSTAL or PIR format. 
The command line argument to get poa to read in an MSA file and rebuild the PO-MSA data structure is '-read_msa FILENAME'. POA automatically determines whether the MSA file is in PO, CLUSTAL, or PIR format. POA will read in a second MSA file when the '-read_msa2 FILENAME' flag is used. POA will read in a set of MSA files using the '-read_msa_list FILENAME' flag. The file should contain a list of names of MSA files. It is possible to filter the PO-MSA data structure as it is being rebuilt. In order to filter the PO-MSA in the MSA file to include only a subset of sequences use the command line argument '-subset FILENAME', where the file named FILENAME contains the list of sequence names to be included in the new PO-MSA. In order to filter the PO-MSA in the MSA file to exclude a subset of sequences, use the command line argument '-remove FILENAME', where the file named FILENAME contains the list of sequences to be excluded from the new PO-MSA. The names of sequences to be included or excluded should be in the format "SOURCENAME=*", as they are in a PO file. Lists of sequence source names can be created by using the unix grep utility on the PO file. Each line in the list of sequences to be filtered should read, "SOURCENAME=" followed by the name of the sequence, e.g. "SOURCENAME=ABL1_HUMAN". To filter the second PO-MSA read in using the '-read_msa2 FILENAME', use the '-subset2' and '-remove2' flags. ii. A FASTA File: The FASTA file should contain sequences to be aligned by POA. The command line argument to get POA to accept a FASTA file as input is '-read_fasta FILENAME'. POA will interpret FILENAME as the FASTA sequence file. An example file, "multidom.seq", is provided in this directory. (See note above on case-sensitivity). 
 NOTE: POA Can Take Both An MSA File And A FASTA File As Input If both the '-read_msa FILENAME' argument and the '-read_fasta FILENAME' argument are given to POA on the command line, then POA will first rebuild the PO-MSA in the MSA file, and then it will align the sequences in the FASTA file to this PO-MSA. Similarly, if both the '-read_msa_list FILENAME' flag and the '-read_fasta FILENAME' flag are given to POA, then POA will rebuild all of the PO-MSAs and will align them to each other and to the sequences in the FASTA file. 2. Additional PO Utilities: i. Consensus Generation Via Heaviest Bundling Algorithm: The heaviest bundling algorithm finds consensus sequences in the PO-MSA. The command line argument for heaviest bundling is '-hb'. This function adds the new consensus sequences to the PO-MSA by storing new consensus sequence indices in the PO-MSA nodes corresponding to the consensus sequence paths. The sequence source names for consensus sequences generated by heaviest bundling are CONSENS'i' where 'i' is the index of the bundle corresponding to the consensus sequence. The heaviest bundling algorithm can also take as input a bundling threshold value. The command line argument for setting a bundling threshold value for heaviest bundling is '-hbmin VALUE'. This threshold is used during the process of associating sequences with bundles. If a sequence has a percentage of nodes shared with bundle 'i' greater than this threshold value, it is associated with bundle 'i'. Iterative heaviest bundling can also be affected by the bundling threshold. A detailed description of heaviest bundling and heaviest bundling thresholds is given elsewhere. The consensus sequences corresponding to bundles generated by heaviest bundling are listed in the sequence source list. Additionally, in the SOURCEINFO line for each sequence the index of the bundle to which that sequence belongs is given. 
Finally, using the command line argument '-best' restricts the MSA output to the consensus sequences generated by heaviest bundling (NB: this applies to PIR output only). III. PO FILE FORMAT ****************************HEADER**************************************** VERSION= ~Current version of POA,e.g. LPO.1.0~ NAME= ~Name of PO-MSA. Defaults to name of 1st sequence in PO-MSA~ TITLE= ~Title of PO-MSA. Defaults to title of 1st sequence in PO-MSA~ LENGTH= ~Number of nodes in PO-MSA~ SOURCECOUNT= ~Number of sequences in PO-MSA~ *********************SEQUENCE SOURCE LIST********************************* /* For each sequence in the PO-MSA: */ SOURCENAME= ~Name of sequence taken from FASTA sequence header~ SOURCEINFO= ~Number of nodes in sequence~ ~Index of first node containing sequence~ ~Sequence weight~ ~Index of bundle containing sequence~ ~Title of sequence taken from FASTA sequence header~ /* Example: */ SOURCENAME=GRB2_HUMAN SOURCEINFO=217 10 0 3 GROWTH FACTOR RECEPTOR-BOUND PROTEIN 2 (GRB2 ADAPTOR PROTEIN)(SH2) ********************PO-MSA DATA STRUCTURE********************************* /* For each node in the PO-MSA: */ ~Residue label~:~'L' delimited index list of other nodes with edges into node~ ~'S' delimited index list of sequences stored in each node~ ~'A' index of next node in same align ring~ NB: align ring indices must form a cycle. e.g. if two nodes 121 and 122 are aligned, then the line for node 121 indicates "A122", and the line for node 122 indicates "A121". /* Example: */ F:L156L155L22S2S3S7A158 ********************END*************************************************** For more information, see http://www.bioinformatics.ucla.edu/poa. 
poaV2/Makefile0100644000765400076540000000173610024245652011657 0ustar poapoa AR=ar rc TARGETS=poa liblpo.a poa_doc libbflag.a # align_score.c CAN BE USED TO ADD CUSTOMIZED SCORING FUNCTIONS OBJECTS= \ align_score.o \ main.o LIBOBJECTS= \ black_flag.o \ seq_util.o \ fasta_format.o \ msa_format.o \ align_lpo2.o \ align_lpo_po2.o \ buildup_lpo.o \ lpo.o \ heaviest_bundle.o \ lpo_format.o \ create_seq.o \ remove_bundle.o \ numeric_data.o \ stringptr.o CC = gcc #CFLAGS= -g -ansi-strict -W -Wall -DUSE_WEIGHTED_LINKS -DUSE_PROJECT_HEADER -I. CFLAGS= -g -DUSE_WEIGHTED_LINKS -DUSE_PROJECT_HEADER -I. # -I$(HOME)/lib/include # -DREPORT_MAX_ALLOC clean: rm -f $(OBJECTS) $(LIBOBJECTS) $(TARGETS) liblpo.a: $(LIBOBJECTS) rm -f $@ $(AR) $@ $(LIBOBJECTS) ranlib $@ # NB: LIBRARY MUST FOLLOW OBJECTS OR LINK FAILS WITH UNRESOLVED REFERENCES!! poa: $(OBJECTS) liblpo.a $(CC) -o $@ $(OBJECTS) -lm liblpo.a what: @echo poa: partial-order based sequence alignment program @echo liblpo.a: partial-order alignment and utilities function library