lagan20/0000755000076500007650000000000010502546662013126 5ustar brudnobrudno00000000000000lagan20/anal_gloc.pl0000755000076500007650000000605210502337064015402 0ustar brudnobrudno00000000000000#!/usr/bin/env perl $savname1 = ""; $savname2 = ""; $skip = 0; $endblock = 0; $score = 0; $strand = ""; $initstrnd; $s1s = 999999999; $s2s = 999999999; $first = 1; $plus_sc = 0; $minus_sc = 0; while ($line = ) { if ($line =~ /^>/) { if (!$first) { if ($strand eq "+") { print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n"; } else { print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n"; } if ($strand ne $initstrnd) { print STDOUT "INV\n" } if ($strand eq "+") { $plus_sc += $score; } else { $minus_sc += $score; } if ($plus_sc > $minus_sc) { print STDOUT "Main score (+) $plus_sc; Inverted $minus_sc\n"; } else { print STDOUT "Main score (-) $minus_sc; Inverted $plus_sc\n"; } $plus_sc = 0; $minus_sc = 0; $score = 0; $s1s = 999999999; $s2s = 999999999; $strand = ""; } $first = 1; $name1 = $line; chomp $name1; $line = ; if ($line !~ /^>/) { print STDERR "Expecting a name, but got $line"; exit (1); } $name2 = $line; chomp $name2; $inblock = 1; $skip = 0; if (($name1 eq $savname1) && ($name2 eq $savname2)) { $skip = 1; } else { print STDOUT "$name1 $name2\n"; } $savname1 = $name1; $savname2 = $name2; } elsif (!$skip) { $endblock = 0; $line =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) ([0-9\.]*) (.) 
(.*)/; if ($1 == 0 || $3 == 0) { next; } # print STDOUT "strand $strand $s2s $4\n"; if (($strand eq "+") && ($6 eq "+") && ($s2s + 20 < $4) ) { $endblock += 2; } if (($strand eq "-") && ($6 eq "-") && ($s2s > $4 + 20) ) { $endblock += 2; } if ($strand eq "") { $strand = $6; } if ($6 ne $strand) { $endblock += 1; } if (!$endblock) { $s2s = $3; $s1s = $1; $s1e = $2; $s2e = $4; $score += $5; if ($first) { print STDOUT " "; print STDOUT " "; $initstrnd = $strand; $reg1s = $2; $reg2s = $4; $first = 0; } } else { if ($strand eq "+") { print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n"; } else { print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n"; } if ($strand eq "+") { $plus_sc += $score; } else { $minus_sc += $score; } if ($endblock %2) { print STDOUT "INV "; } else {print STDOUT " "; } if ($endblock > 1) { print STDOUT "TRL "; } else {print STDOUT " "; } $s2s = $3; $s1s = $1; $s1e = $2; $s2e = $4; $reg1s = $s1e; $reg2s = $s2e; $score = $5; $strand = $6; # print STDOUT "strand $strand\n"; } } } if (!$first){ if ($strand eq "+") { print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n"; } else { print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n"; } if ($strand eq "+") { $plus_sc += $score; } else { $minus_sc += $score; } } if ($plus_sc > $minus_sc) { print STDOUT "Main score (+) $plus_sc; Inverted $minus_sc\n"; } else { print STDOUT "Main score (-) $minus_sc; Inverted $plus_sc\n"; } lagan20/blosum62.txt0000644000076500007650000000355310502337063015337 0ustar brudnobrudno00000000000000 A R N D C Q E G H I L K M F P S T W Y V B Z X * A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 
0 3 -1 -4 E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 lagan20/blosum62s.txt0000644000076500007650000000567310502337063015527 0ustar brudnobrudno00000000000000 A R N D C Q E G H I L K M F P S T W Y V B Z X * A 223 -55 -111 -111 0 -55 -55 0 -111 -55 -55 -55 -55 -111 -55 55 0 -167 -111 0 -111 -55 0 -223 R -55 278 0 -111 -167 55 0 -111 0 -167 -111 111 -55 -167 -111 -55 -55 -167 -111 -167 -55 0 -55 -223 N -111 0 334 55 -167 0 0 0 55 -167 -167 0 -111 -167 -111 55 0 -223 -111 -167 167 0 -55 -223 D -111 -111 55 334 -167 0 111 -55 -55 -167 -223 -55 -167 -167 -55 0 -55 -223 -167 -167 223 55 -55 -223 C 0 -167 -167 -167 502 -167 -223 -167 -167 -55 -55 -167 -55 -111 -167 -55 -55 -111 -111 -55 -167 -167 -111 -223 Q -55 55 0 0 -167 278 111 -111 0 -167 -111 55 0 -167 -55 0 -55 -111 -55 -111 0 167 -55 -223 E -55 0 0 111 
-223 111 278 -111 0 -167 -167 55 -111 -167 -55 0 -55 -167 -111 -111 55 223 -55 -223 G 0 -111 0 -55 -167 -111 -111 334 -111 -223 -223 -111 -167 -167 -111 0 -111 -111 -167 -167 -55 -111 -55 -223 H -111 0 55 -55 -167 0 0 -111 446 -167 -167 -55 -111 -55 -111 -55 -111 -111 111 -167 0 0 -55 -223 I -55 -167 -167 -167 -55 -167 -167 -223 -167 223 111 -167 55 0 -167 -111 -55 -167 -55 167 -167 -167 -55 -223 L -55 -111 -167 -223 -55 -111 -167 -223 -167 111 223 -111 111 0 -167 -111 -55 -111 -55 55 -223 -167 -55 -223 K -55 111 0 -55 -167 55 55 -111 -55 -167 -111 278 -55 -167 -55 0 -55 -167 -111 -111 0 55 -55 -223 M -55 -55 -111 -167 -55 0 -111 -167 -111 55 111 -55 278 0 -111 -55 -55 -55 -55 55 -167 -55 -55 -223 F -111 -167 -167 -167 -111 -167 -167 -167 -55 0 0 -167 0 334 -223 -111 -111 55 167 -55 -167 -167 -55 -223 P -55 -111 -111 -55 -167 -55 -55 -111 -111 -167 -167 -55 -111 -223 390 -55 -55 -223 -167 -111 -111 -55 -111 -223 S 55 -55 55 0 -55 0 0 0 -55 -111 -111 0 -55 -111 -55 223 55 -167 -111 -111 0 0 0 -223 T 0 -55 0 -55 -55 -55 -55 -111 -111 -55 -55 -55 -55 -111 -55 55 278 -111 -111 0 -55 -55 0 -223 W -167 -167 -223 -223 -111 -111 -167 -111 -111 -167 -111 -167 -55 55 -223 -167 -111 613 111 -167 -223 -167 -111 -223 Y -111 -111 -111 -167 -111 -55 -111 -167 111 -55 -55 -111 -55 167 -167 -111 -111 111 390 -55 -167 -111 -55 -223 V 0 -167 -167 -167 -55 -111 -111 -167 -167 167 55 -111 55 -55 -111 -111 0 -167 -55 223 -167 -111 -55 -223 B -111 -55 167 223 -167 0 55 -55 0 -167 -223 0 -167 -167 -111 0 -55 -223 -167 -167 223 55 -55 -223 Z -55 0 0 55 -167 167 223 -111 0 -167 -167 55 -55 -167 -55 0 -55 -167 -111 -111 55 223 -55 -223 X 0 -55 -55 -55 -111 -55 -55 -55 -55 -55 -55 -55 -55 -55 -111 0 0 -111 -55 -55 -55 -55 -55 -223 * -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 55 lagan20/lagan.pl0000755000076500007650000001416310502337064014547 0ustar brudnobrudno00000000000000#!/usr/bin/env perl $lagandir = 
$ENV{LAGAN_DIR}; $consrate = 45; $consupperrate = 65; if (@ARGV < 2) { print ("usage:\n lagan seqfile1 seqfile2 [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1,rsc1),(wl2,nd2,co2,rsc2),...\"] [-bin] [-mfa] [-out \"filename\"] [-lazy] [-maskedonly] [-debug] [-usebounds] [-rc] [-translate] [-draft] [-info] [-fastreject]\n"); exit(1); } $firstName = $ARGV[0]; $secondName = $ARGV[1]; $rcFlag = 0; $arglist = ""; $contigflag = 0; $infofile = 0; $okformat = 0; $binfile = 0; $infofilename = "alignment"; $direction = "+"; $gfc = " -gfc "; $rundraft = 0; $draftparams = ""; $dofastreject = 0; $doxmfa = 0; $filename = ""; $format = ""; for ($i = 2; $i < @ARGV; $i++) { if ($ARGV[$i] =~ /-order/) { $orderfl = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-bin/) { $orderfl = $orderfl." -bin"; $binfile = 1; $okformat = 1; } elsif ($ARGV[$i] =~ /-info/) { $infofile++; } elsif ($ARGV[$i] =~ /-mfa/) { $orderfl = $orderfl." -mfa"; $okformat = 1; } elsif ($ARGV[$i] =~ /-xmfa/) { $orderfl = $orderfl." -xmfa"; $doxmfa = 1; $okformat = 1; } elsif ($ARGV[$i] =~ /-out/) { $filename = $ARGV[++$i]; $infofile++; $infofilename = $ARGV[$i]; } elsif (($ARGV[$i] =~ /-gs/) || ($ARGV[$i] =~ /-gc/) || ($ARGV[$i] =~ /-mt/) || ($ARGV[$i] =~ /-ms/) || ($ARGV[$i] =~ /-bw/)){ $orderfl = $orderfl." ".$ARGV[$i]; $orderfl = $orderfl." ".$ARGV[++$i]; } elsif ($ARGV[$i] =~ /-s1/) { $orderfl = $orderfl." -s1 $ARGV[++$i]"; $orderfl = $orderfl." ".$ARGV[++$i]; } elsif ($ARGV[$i] =~ /-maskedonly/) { $arglist = $arglist." -maskedonly"; } elsif ($ARGV[$i] =~ /-translate/) { $arglist = $arglist." -translate"; $draftparams = $draftparams." -translate"; } elsif ($ARGV[$i] =~ /-fastreject/) { $arglist = $arglist." -fastreject"; $dofastreject = 1; $doxmfa = 1; $okformat = 1; } elsif ($ARGV[$i] =~ /-draftreject/) { $draftparams = $draftparams." -fastreject"; } elsif ($ARGV[$i] =~ /-gap/) { $arglist = $arglist." -gap ".$ARGV[++$i]; $arglist = $arglist." 
".$ARGV[++$i]; } elsif ($ARGV[$i] =~ /-recurse/) { $arglist = $arglist." -recurse \"".$ARGV[++$i]."\""; } elsif ($ARGV[$i] =~ /-chaos/) { $arglist = $arglist." -chaos \"".$ARGV[++$i]."\""; } elsif ($ARGV[$i] =~ /-usebounds/) { $contigflag = 1; } elsif ($ARGV[$i] =~ /-rc/) { `$lagandir/utils/rc < $ARGV[1] > $ARGV[1].rc`; if ($?) { exit(1); } $secondName = "$ARGV[1].rc"; if (-e "$ARGV[1].masked") { `$lagandir/utils/rc < $ARGV[1].masked > $ARGV[1].rc.masked`; if ($?) { exit(1);} } $rcFlag = 1; $direction = "-"; } elsif ($ARGV[$i] =~ /-draft/){ $rundraft = 1; } elsif ($ARGV[$i] =~ /-cons/){ $draftparams = $draftparams." -cons $ARGV[$++i]"; } elsif ($ARGV[$i] =~ /-draftskipfr/){ $draftparams = $draftparams." -skipfr $ARGV[$++i]"; } elsif ($ARGV[$i] =~ /-lazy/){ $draftparams = $draftparams." -cons $ARGV[$++i]"; } else { print "Invalid option for lagan: $ARGV[$i]"; exit(1); } } $arglist = $arglist." -ext "; if ($rundraft){ `$lagandir/draft.pl $firstName $secondName $draftparams`; if ($?) { exit(1);} $secondName = "merged_seq.fa"; } # print STDERR "perl $lagandir/rechaos.pl $firstName $secondName $gfc $arglist > $$.anchs.final\n"; `perl $lagandir/rechaos.pl $firstName $secondName $gfc $arglist > $$.anchs.final`; $ex_val = $? >> 8; if ($ex_val == 3) { exit(0); } if ($ex_val) { exit(1); } if ($contigflag){ @bounds = `$lagandir/utils/getbounds $$.anchs.final $firstName $secondName`; if ($?) { exit(1); } chomp $bounds[0]; print STDERR ("Aligning with bounds: $bounds[0]\n"); print `$lagandir/order $firstName $secondName $bounds[0] $orderfl -anc $$.anchs.final`; if ($?) 
{ exit(1); } } else { if ($dofastreject){ if (!$filename) { print STDERR "-fastreject requires -out filename!\n"; exit(1); } open(SFILE, "$$.anchs.final"); @anchors = ; close(SFILE); $anchors[0] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; $end1 = $1 - 1; $end2 = $3 - 1; $anchors[@anchors - 1] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; $start1 = $2 + 1; $start2 = $4 + 1; $bounds = "-s1 $start1 $end1 -s2 $start2 $end2 "; @anchors = 0; $orderfl = $bounds.$orderfl." -xmfa"; } if (!$okformat) { $format = "-bin"; } `$lagandir/order $firstName $secondName $format -out $$.align $orderfl -anc $$.anchs.final`; if ($?) { exit(1); } if (!$okformat) { if ($filename) { `$lagandir/utils/bin2bl $$.align > $filename`; } else { print `$lagandir/utils/bin2bl $$.align`; } } else { if ($filename) { `cat $$.align > $filename`; } else { print `cat $$.align`; } } if ($dofastreject){ `$lagandir/utils/scorealign $filename $consrate -ibounds -cropxmfa > $$.temp`; if ($?) { exit(1); } `mv $$.temp $filename`; } } $infofile += $okformat; if ($infofile == 3){ open (INFOFILE, ">$infofilename.info"); if ($binfile){ `$lagandir/utils/bin2mf $infofilename > $infofilename.mfa`; if ($?) { exit(1); } $infofilename = $infofilename.".mfa"; } @temp = `head $secondName`; if ($?) { exit(1); } chomp $temp[0]; $temp[0] = substr $temp[0], 1; print INFOFILE "$temp[0]\n"; $len = `$lagandir/utils/getlength $secondName`; chomp $len; if ($?) { exit(2); } $first = $last = $first2 = $last2 = -1; $score = `$lagandir/utils/scorealign $infofilename $consupperrate`; chomp $score; if ($?) { exit(3); } if ($score > 0){ $score = `$lagandir/utils/scorealign $infofilename $consrate`; chomp $score; if ($?) { exit(4); } @temp = `$lagandir/utils/scorealign $infofilename $consrate -bounds 0`; if ($?) { exit(5); } $temp[0] =~ /(.*) (.*)/; $first = $1; $last = $2; @temp = `$lagandir/utils/scorealign $infofilename $consrate -bounds 1`; if ($?) 
{ exit(6); } $temp[0] =~ /(.*) (.*)/; $first2 = $1; $last2 = $2; } print INFOFILE "1 $first $last 1 $len 0 0 $direction $score $first2 $last2\n"; close (INFOFILE); # `$lagandir/utils/rm $infofilename` if ($binfile); } `rm $secondName` if ($rcflag); `rm $$.*`; if ($?) { exit(1); } exit(0); lagan20/Makefile0000644000076500007650000000056610502343027014563 0ustar brudnobrudno00000000000000all: (cd src; $(MAKE)) clean: rm -f chaos anchors order glocal utils/bin2bl mlagan utils/cstat utils/bin2mf utils/rc *~ utils/contigorder utils/getbounds utils/cextract utils/seqmerge utils/getlength utils/getoverlap utils/*~ utils/scorealign utils/scorecontigs mlagan.purify utils/getcontigpos utils/fa2xfa utils/Glue utils/dotplot utils/overlay (cd src; $(MAKE) clean) lagan20/nucmatrix.txt0000644000076500007650000000035210502337064015673 0ustar brudnobrudno00000000000000 A C G T . N A 91 -114 -31 -123 0 -43 C -114 100 -125 -31 0 -43 G -31 -125 100 -114 0 -43 T -123 -31 -114 91 0 -43 . 0 0 0 0 0 0 N -43 -43 -43 -43 0 -43 -400 -25 lagan20/Readmes/0000755000076500007650000000000010502361147014477 5ustar brudnobrudno00000000000000lagan20/Readmes/LICENSE0000644000076500007650000004312410502337063015510 0ustar brudnobrudno00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. 
(Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. 
The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. 
You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. 
Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. 
However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. 
If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. 
If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License. 
lagan20/Readmes/README.chaos0000644000076500007650000001731010502337063016455 0ustar brudnobrudno00000000000000README for CHAOS (CHAins Of Score) version 0.933 10/22/2003 Author: Michael Brudno (brudno@cs.stanford.edu) 0. Availability + Legalese The source code of this version of CHAOS is freely available to all users under the GNU General Public License (GPL). See the file LICENSE in this directory for more information. You can download it from http://www.stanford.edu/~brudno/chaos/ If you use CHAOS regularly please consider contacting brudno@cs.stanford.edu to be placed on a mailing list to be contacted about any updates and bug-fixes. If you use CHAOS in a published result please cite: Michael Brudno and Burkhard Morgenstern. "Fast and sensitive alignment of large genomic sequences" Proceedings of the IEEE Computer Society Bioinformatics Conference (CSB) 2002 pp. 138-47 I. Installation To install CHAOS you need to copy the source files to your local computer, untar/ungzip them, and run "make". I am assuming you have a reasonably modern installation of gcc. The sequence of commands should be: % gunzip chaos.tar.gz % tar xvf chaos.tar % make This will create the executable files "chaos" and "anchors". This distribution also includes the program ancs4dialign.pl, a perl script for connecting CHAOS with DIALIGN. Both these tools are described in section V. Because CHAOS uses no system-dependent or implementation-dependent libraries it should compile on all platforms and ANSI C compilers. If you have problems compiling the sources please e-mail the author. You will need to also set the environment variable LAGAN_DIR to the directory where you installed CHAOS. In c-shell this can be done by executing % setenv LAGAN_DIR `pwd` at the prompt. For other shells the command differs. II Description CHAOS is a heuristic local alignment tool optimized for non-coding regions of the genome. The main idea behind the algorithm lies in the chaining together of similar regions, or seeds. 
A seed is a pair of k-long words with at least n identical base pairs (bp). A seed k1 can then be chained to the seed k2 whenever the indices of k1 in both sequences are higher than the indices of k2, and k1 and k2 are "near" each other, with "near" defined by both a distance and a gap criterion. The final score of a chain is the total number of matching bp in it. There is no explicit gap penalty for matching seeds which are separated by an unequal number of bases in the two sequences. III Usage 1. Input Parameters The main inputs are two fasta files. The first should contain a single query sequence, while the second can be a database of several sequences. These are followed by any number of command line options. This list is partial (run chaos without args for the full list): nucmatrix.txt -- This file has the substitution matrix used by lagan and the gap penalties. The gap penalties are on the line immediately after the matrix, the first number is the gap open, the second the gap continue. blosum62s.txt -- This file has a (scaled) version of the blosum62 matrix and appropriate gap parameters. -p = Peptide sequence [default genomic] Whether the input is a peptide or genomic sequence. For peptide sequences we call "similar" letters equal. In the default configuration we have "PCMH[DE][KR][NQ][ST][ILV][FYW][AG]X*", where letters in the same brackets are considered equal. Currently this is not user-settable, but as usual if you really want to be able to change this e-mail me. -v = Verbose mode [default brief] Displays the Smith-Waterman alignments of the resulting conserved regions. -b = Both strands [default forward-only] Add this if you are interested in similarities on both strands of the DNA. Meaningless if used with -p. -t = Translated [default off] Makes the 6 translated frames of the sequences and compares them, forward against forward, backward against backward (all against all if -b specified). 
-wl # = Word Length [default 10 for genomic, 4 for peptide] The length of the seed (k in the description above). -nd # = Number of Degeneracy [default 1 for genomic, 0 for peptide] Amount of degeneracy allowed in the seed (k-n in the description above). -co # = score CutOff [default 25] Scores above this cutoff are shown. -rsc # = reScoring cutoff [default 0] After the alignments are found they are rescored using a fast Smith-Waterman like algorithm. This lets you set the rescoring cutoff, to see only the high confidence hits. Scores around 2500 and greater are indicative of strong homology. One common use of this is to set -co to something small, and control only the S-W quality of alignments. -lb # = LookBack distance [default 20 for genomic, 8 for peptide] How far away two seeds are allowed to be so that they are chained. -gl # = maximum Gap Length [default 5] Maximum sized gap allowed between two seeds if they are to be chained. -version = prints the version number 2. Usage notes/suggestions The part of the algorithm which usually takes longest is chaining. So if it is too slow, try increasing the -wl parameter, decreasing the -nd parameter or both. If you do so, you probably need to adjust the -co or -rsc parameters so that the results you get are meaningful. The -ext parameter seems to be very effective; we strongly suggest it. IV Description of Algorithm 1. Seed Location Seeds are found by first indexing the query sequence in a "threaded trie" of height k. In a trie every node corresponds to some [part of a] word. In a threaded trie, every node has a back pointer to the node which corresponds to the same word without its first letter. We start by inserting into the threaded trie all of the k-mers of the query sequence. 
Then we do a "walk" using the database sequence, where starting at the root, for every letter if the current node has a child corresponding to this letter we go down to it, and if it does not we follow back pointers until it does, or we hit the root. If degeneracy is allowed, we just allow multiple current nodes, which correspond to the possible degenerate words. 2. Search Space and Chaining The seeds seen over the course of the past -lb basepairs are stored in a skip list, indexed by the difference of its indices in the two sequences (diagonal number). For each seed we do a range query in the skip list, finding the possible hits with which it can be chained. The highest scoring chain is picked, and it can then be further extended by future hits. V anchors and ancs4dialign Anchors is a small C program, that given a list of CHAOS local alignments resolves them into a strictly increasing list of anchors using an algorithm based on the Longest increasing subsequence problem. The anchors given out by the program can be used to anchor any global aligner that supports an external anchors file, e.g. LAGAN or dialign. For Dialign we include an extra script, ancs4dialign, written by Burkhard Morgenstern that given a multi-fasta file with several sequences will create a .anc file that dialign will use if given the -anc option. VI Future Work I am interested in further extending CHAOS. However with most such features I will be user driven: if you want a specific feature, ask me. This way I'll spend less time working on things no one will ever use. One issue which is of particular interest is placing statistical confidence estimates on the chains. If you are interested in helping me work on CHAOS please contact me, I am open to collaborations in this area. +-----------------------------------------------------------------+ | Michael Brudno | 260S Clark Center | | PhD Candidate | (650) 725-6094 | | Dept. 
of Computer Science | brudno@cs.stanford.edu | | Stanford University | http://www.stanford.edu/~brudno | +-----------------------------------------------------------------+ lagan20/Readmes/README.FIRST0000644000076500007650000000737010502344074016254 0ustar brudnobrudno00000000000000README.first for LAGAN Toolkit (Limited Area Global Alignment of Nucleotides) v2.0 Author: Michael Brudno (brudno@cs.toronto.edu) 09/14/2006 LAGAN was developed by Michael Brudno, Chuong Do, Sanket Malde, Michael F Kim and Serafim Batzoglou of the Dept of Computer Science at Stanford University, with assistance from many other people. See http://lagan.stanford.edu or contact lagan@cs.stanford.edu for more information. 0. Availability + Legalese The source code of this version of LAGAN is freely available to all users under the GNU Public License (GPL). See the file LICENSE in this directory for more information. You can download the LAGAN sources from http://lagan.stanford.edu If you use LAGAN regularly please consider contacting lagan@cs.stanford.edu to be placed on a mailing list to be contacted about any updates and bug-fixes. If you use LAGAN in a published result please see http://lagan.stanford.edu/cite.html for the latest citation information. I. Installation To install LAGAN you need to copy the source files to your local computer, untar/ungzip them, and run "make". I am assuming you have a reasonably modern installation of gcc and perl. The sequence of commands should be: % gunzip lagan.tar.gz % tar xvf lagan.tar % make This will create the executable files chaos, anchors, order, mlagan, glocal, prolagan as well as many tools in the utils directory. You may also need to go into all the .pl file, and change the first line to call your perl interpreter. You must also specify an environment variable $LAGAN_DIR to point to the directory where you installed LAGAN. 
Because LAGAN uses no system-dependent or implementation dependent libraries it should compile on all platforms and ANSI C compilers. We use it on a Linux box. Please tell us if you have trouble compiling/running LAGAN tools on your favorite platform, we have found that most of these problems are easily resolved. II Description LAGAN toolkit is a set of tools for local, global, and multiple alignment of DNA sequences. Please see our website (http://lagan.stanford.edu) for publications describing LAGAN and its components. The 4 main parts of LAGAN, each documented in its own README file are: 1. CHAOS local alignment tool 2. LAGAN pairwise global alignment tool 3. MLAGAN multiple global alignment tool. 4. Shuffle-LAGAN pairwise glocal alignment (with the SuperMap chaining addition) There are also numerous utilities and scripts, mainly in the utils subdirectory. Some of these are documented in the README.tools file. Of particular interest may be scorealign, that can score a LAGAN or MLAGAN alignment, and the series of "m" tools: mproject, mextract, mpretty, mrunfile for running mlagan and parsing its output. III Repeat Masking LAGAN, MLAGAN, and Shuffle-LAGAN can use masking information to improve the quality of the alignment. If you are trying to align sequence seq1.fa and seq2.fa you should create the files seq1.fa.masked and seq2.fa.masked which should have repeats masked to Ns. LAGAN, M-LAGAN and S-LAGAN will know to look for these files when aligning. CHAOS doesn't recognise repeat information, you should just use it on the masked files if this is appropriate. IV Changes from previous version 0.9 -> 1.0: Several bug fixes, alignment parameters are now in the nucmatrix.txt file 1.0 -> 1.1: Several bug fixes, Fastreject now clips at intersection rather than union. 1.1 -> 1.2: A few bug fixes, Shuffle-LAGAN added. 1.2 -> 1.21: A few bug fixes, sped up shuffle-lagan, code is now GPLed 1.21-> 2.0: A few minor (and couple of major) bug fixes. 
MLAGAN no longer requires a tree, and takes a substitution matrix as an argument, added supermap chaining (and a new implementation of glocal chaining), updated to align up to 63 sequences.lagan20/Readmes/README.lagan0000644000076500007650000000556110502360765016454 0ustar brudnobrudno00000000000000NOTE: Pairwise lagan has not changed in the 2.0 release README.lagan for LAGAN aligner (Limited Area Global Alignment of Nucleotides) v1.1 Author: Michael Brudno (brudno@cs.stanford.edu) 04/02/2003 LAGAN was developed by Michael Brudno, Chuong Do, Michael F Kim and Serafim Batzoglou of the Dept of Computer Science at Stanford University, with assistance from many other people. See http://lagan.stanford.edu or contact lagan@cs.stanford.edu for more information. I Description LAGAN is a global alignment tool. It does a Needleman-Wunsch alignment in a limited area of the matrix, determined during an anchoring phase. The algorithm consists of 3 main parts, each documented in its own README file: 1. Generation of local alignments, using the CHAOS local alignment tool 2. Finding a monotonically increasing set of anchors from these local alignments, using the anchors program. 3. Doing global alignment in a limited area of the NW matrix given the set of anchors (order tool). lagan.pl is the main executable that calls the three steps. II Usage 1. Input Lagan requires two fasta files (first two arguments), reads gap and substitution parameters from the nucmatrix.txt file and takes several optional command line options. nucmatrix.txt -- This file has the substitution matrix used by lagan and the gap penalties. The gap penalties are on the line immediately after the matrix, the first number is the gap open, the second the gap continue. -chaos "string" [default none] The contents of string will be passed as arguments to CHAOS. See the CHAOS readme for details. -order "string" [default none] The contents of string will be passed as arguments to order. 
-recurfl "list of k-tuplets" [default: "(12,0,25,0),(13,1,25,0),(8,1,30,0)(7,1,30,0)"] A list of (wordlength,number of degeneracies,score cutoff, rescoringcutoff) k-tuplets to be used in the recursive anchoring. See README.chaos for the meaning of these numbers. -translate [default off] Use translated anchoring (homology done on the amino acid level). This is useful for distant (human/chicken, human/fish, and the like) comparisons. -bin [default off] print the output in binary format, for use by the bin2bl tool, or VISTA -mfa [default off] print the output in Multi-FASTA format, for use by many standard tools -rc [default off] reverse-complement the second sequence before doing the alignment -fastreject Abandon the alignment if the homology looks weak. Currently tuned for human/mouse distance, or closer. Please contact the authors for more details on this option. 2. Output The output by default is in a blast like format, but you can use the -mfa or -bin options to save the results in multi-fasta, or binary format respectively. The binary format is a compact representation accepted by VISTA. There are some converters between the formats in the utils directory (see README.tools) lagan20/Readmes/README.mlagan0000644000076500007650000000564610502344706016632 0ustar brudnobrudno00000000000000README.mlagan for MLAGAN multiple aligner v2.0 Author: Michael Brudno (brudno@cs.toronto.edu) Updated 09/14/2006 LAGAN was developed by Michael Brudno, Chuong Do, Michael F Kim, Mukund Sundararajan and Serafim Batzoglou of the Dept of Computer Science at Stanford University, with assistance from many other people. See http://lagan.stanford.edu or contact lagan@cs.stanford.edu for more information. I Description MLAGAN is a multiple global alignment tool. It does a Needleman-Wunsch alignment in a limited area of the matrix, determined during an anchoring phase. The algorithm consists of 3 main parts, each documented in its own README file: 1. 
Generation of ordered local alignments (anchors) between all pairs of sequences, using the CHAOS local alignment tool and anchors program 2. Doing progressive global alignment, guided by a phylogenetic tree, in a limited area of the NW matrix given the set of anchors. mlagan is the main executable. II Usage 1. Input Mlagan requires two or more fasta files (first arguments), optionally takes a -tree argument specifying a phylogenetic tree, reads gap and substitution parameters from nucmatrix.txt file (or another optionally provided file) and takes several optional command line options: nucmatrix.txt -- This file has the substitution matrix used by lagan and the gap penalties. The gap penalties are on the line immediately after the matrix, the first number is the gap open, the second the gap continue. -tree "string" You need to specify a phylogenetic tree for the sequences. This must be a pairwise tree, with parentheses specifying nodes. Here are a few examples: "(human (mouse rat))" "((human mouse)(fugu zebrafish))" The name of each sequence must be specified somewhere on the fasta line of the input sequence: >g324325|Homo sapiens human ACTGG.... Either "Homo" or "sapiens" or "human" are valid names to call the sequence. -translate [default off] Use translated anchoring (homology done on the amino acid level). This is useful for distant (human/chicken, human/fish, and the like) comparisons. -fastreject [default off] Abandon the alignment if the homology looks weak. Currently tuned for human/mouse distance, or closer. Please contact the authors for more details on this option. -out filename [default standard out] Output the alignment to filename, rather than standard out. 2. Output The output by default is in Multi-FASTA format. You can use the mpretty tool in the utils directory to view a human-friendly version. 3. Prolagan Prolagan is the pairwise progressive step of mlagan. 
It should be run just like mlagan, but with two additional arguments, -pro1 and -pro2 which are files with profiles (alignments) which should be aligned together. Note that all sequences (and the tree) must still be given to prolagan. This program is useful if you have two alignments already and want to just align them, instead of realigning all sequences. lagan20/Readmes/README.shuffle0000644000076500007650000000517110502361140017007 0ustar brudnobrudno00000000000000Shuffle-LAGAN with SuperMap README Michael Brudno, brudno@cs.toronto.edu 0. Overview This directory contains the code for Shuffle-LAGAN, a glocal alignment tool described in Brudno, Malde, Poliakov, Do, Couronne, Dubchak & Batzoglou "Glocal alignment: Finding rearrangements during alignment", ISMB 2003 Proceedings (see http://lagan.stanford.edu/cite.html for detailed citation information). It is also distributed with the SuperMap chaining algorithm, which is currently unpublished. 1. Installation If you received Shuffle-LAGAN as part of the LAGAN toolkit it is installed automatically with the rest of the package. The code assumes $LAGAN_DIR has been set. 2. Running Just give it two sequences and let it roll: #slagan.pl seq1.fa seq2.fa 3. Input The input sequences should be in FASTA format. You should provide a .masked file for each of the sequences (see README.FIRST) Output will be in XMFA format, described below. 4. Output The overall result is three files, a .chaos file with the local alignments in the chaos format, a .mon file with the 1-monotonic chain (see http://lagan.stanford.edu/manual.html for what this is) and a .xmfa file with the actual alignments in the XMFA format. A. XMFA Format The format is based on Multi-FASTA, but allows for several multiple local alignments to be stored in a file. It is as follows: > seq_num:start1-end1 +/- comments (sequence name, etc.) AC-TG-NAC--TG AC-TG-NACTGTG ... > seq_num:startN-endN +/- comments (sequence name, etc.) AC-TG-NAC--TG AC-TG-NACTGTG ... 
= (line starting with an "=" separates different alignments, and can have any comments) > seq_num:start1-end1 +/- comments (sequence name, etc.) AC-TG-NAC--TG AC-TG-NACTGTG ... > seq_num:startN-endN +/- comments (sequence name, etc.) AC-TG-NAC--TG AC-TG-NACTGTG ... 5. Parameters Will be described for the next release. E-mail the author for details. 6. Utilities The utilities directory ($LAGAN_DIR/utils) has 2 programs which may be of use to Shuffle-LAGAN users: A. Glue Given a Shuffle-LAGAN alignment in XMFA format it glues together a "fake" second sequence and builds a single pairwise alignment in multi-fasta format. This can then be visualized using VISTA, or used in other ways (e.g. you can get several of these "fake" sequences and use MLAGAN to do multiple alignment). B. dotplot Given a list of local alignments in the format of the monotonic file (.mon) it builds a series of gnuplot commands that build a dotplot of the local alignments. Useful for seeing which rearrangements were found. This README will be extended in the future. Please send questions to Michael Brudno, brudno@cs.stanford.edu lagan20/Readmes/README.tools0000644000076500007650000002116010502337063016516 0ustar brudnobrudno00000000000000LAGAN tools README (Authors: Michael Brudno, Michael F. Kim & Chuong Do) lagan@cs.stanford.edu 04/02/2003 This document describes how to use LAGAN associated wrappers and tools. Both mrun.pl and mrunpairs.pl are wrappers to mlagan. The only difference is that mrunpairs.pl generates a set of pairwise alignments, whereas mrun.pl does the standard multiple alignment. Both of these tools use a helper script mextract.pl to parse out the individual sequence files from a Multi-FASTA file. Having run MLAGAN, we can visualize the output on a nucleotide level in a "pretty" format using mpretty.pl. We can also project the multiple sequence alignment into any number of its constituent sequences, using mproject.pl. 
We provide a tool (mviz.pl) which will take a multiple alignment in Multi-FASTA form and create a VISTA plot. Using the parameter file, you can completely specify the parameters to an mlagan job. We provide a sample file (sample.params) with more information on how to use the various parameters. Sequence names are always taken to be the first white-space terminated string after the ">" in a FASTA or Multi-FASTA file, e.g.: >sample1 This is the first sample sequence. ACGT... >sample2 This is the second sample sequence. ACGT... Here the sequence names would be sample1 and sample2. The scorealign tool scores an alignment (multiple or pairwise in MFA format). The rc script reverse-complements a sequence, and the bin2mf, mf2bin.pl and bin2bl scripts convert between the various output formats. mrunfile.pl ----------- Usage: mrunfile.pl filename [-pairwise] [-vista] Required Parameter: filename : name of the parameter file (e.g. sample.params) Optional parameters: -pairwise : generates a set of pairwise alignments -vista : creates a VISTA plot using the output Example: mrunfile.pl sample.params -vista This would run MLAGAN using the parameters in sample.params and generate a VISTA plot at the end. Uses: mrun.pl or mrunpairs.pl mrun.pl ------- Usage: mrun.pl filename -tree "(tree...)" Required parameters: filename : name of the Multi-FASTA file with the sequences to align. -tree "(tree)" : a fully parenthesized phylogenetic tree over the sequence names. Optional parameters: [base sequence name [sequence pairs]] : For projection into pairs for VISTA output, you may wish to specify a base sequence and specific pairs of sequences to have projected. If you do not specify sequence pairs, then all possible pairings to the base sequence will be generated. If you do not specify a base sequence, the default base sequence is the first sequence in the multi-FASTA input. 
other MLAGAN parameters: -nested : runs iterative improvement in a nested fashion -postir : incorporates the final improvement phase -lazy : uses lazy mode for anchor generation -verbose : give verbose output -translate : do translated comparisons -out "filename": outputs to filename -version : prints version info other VISTA parameters: (see VISTA plotfile definition for more info) per sequence pair: --regmin # (default: 75) --regmax # (default: 100) --min # (default: 50) per plotfile: --bases # (default: 10000) --tickdist # (default: 2000) --resolution # (default: 25) --window # (default: 40) --numwindows # (default: 4) Example: mrun.pl sample.fasta -tree "(sample1 (sample2 sample3))" This will run mlagan on the sequences in sample.fasta with the phylogenetic tree specified above. Uses: mextract.pl to parse out the constituent sequences into individual FASTA files for use by mlagan. Also uses mextract.pl with -masked option for parsing out .masked multi-FASTA files. mrunpairs.pl ------------ Usage: mrunpairs.pl filename Required parameter: filename : multi-FASTA file. Optional parameters: (same as mrun.pl optional parameters, see above) Example: mrunpairs.pl sample.fasta sample1 sample1 sample2 sample1 sample3 This will generate the pairs (sample1 sample2), (sample1 sample3), using sample1 as a base sequence (for VISTA plots). Uses: mextract.pl to parse out the constituent sequences into individual FASTA files for use by mlagan. Also uses mextract.pl with -masked option for parsing out .masked multi-FASTA files. mpretty.pl ---------- Usage: mpretty.pl filename Required parameter: filename : Multi-FASTA file to view. 
Optional parameters: -linelen value : number of bases to display per line (min: 10, default: 50) -interval value : frequency of markers (min: 10, default: 10, none: 0) -labellen value : length of the sequence label (min: 5, default: 5, none: 0) -start value : position to start from (>=1) -end value : position to end from (>=start position) -base sequence_name : sequence name on which to base start/end positions. -nocounts : turn off sequence position counts Example: mpretty.pl sample.fasta -nocounts -interval 0 -linelen 72 This will print out the contents of sample.fasta without sequence position counters, without interval markers and at 72 bases per line, with the sequence labels on each line at their default length. Because of the way the labels are printed, this will cause each line to have length 80 characters. mpretty.pl sample.fasta -start 101 -end 150 This will print out the contents of sample.fasta from positions 101 to positions 150 in the alignment, inclusive. mpretty.pl sample.fasta -start 131 -end 140 -base sample1_aligned This will print out the contents of sample.fasta from position 131 to position 140 relative to the sequence sample1_aligned. mextract.pl ----------- Usage: mextract.pl filename [-masked] Required parameter: filename : Multi-FASTA file to extract sequences from. Optional parameter: -masked : For dealing with masked Multi-FASTA files. Example: mextract.pl sample.fasta This will extract the contents of sample.fasta (e.g. sample1, sample2, sample3) and put them into files: sample_sample1.fa sample_sample2.fa sample_sample3.fa Masked Example: mextract.pl sample.fasta.masked -masked This will extract the contents of sample.fasta.masked (e.g. sample1, sample2, sample3) and put them into files: sample_sample1.fa.masked sample_sample2.fa.masked sample_sample3.fa.masked For use with rechaos.pl in anchoring. mproject.pl ----------- Usage: mproject.pl filename seqname1 [seqname2 ... 
] Required parameters: filename : Multi-FASTA file to extract sequences from. and at least one sequence name. Example: mproject.pl sample.out sample1 sample2 In this example, sample.out is the resulting alignment of a number of sequences -- including sample1 and sample2. This script will project the multiple alignment into the pair sample1 and sample2. mviz.pl ------- Usage: mviz.pl data_file param_file [plotfile] Required parameters: data_file : Multi-FASTA file to visualize using VISTA (this must be the first argument) param_file : Parameter file (same format as used in other scripts) (this must be the second argument) Optional parameter: plotfile : VISTA plotfile (if specified, must be specified third) Script will use this plotfile instead of automatically generated one. Example: mviz.pl sample.out sample.params sample.plotfile This will generate a VISTA plot using the data in sample.out, the settings in sample.params, but with sample.plotfile as the given plotfile. Uses: RunVista scorealign ---------- Usage: scorealign mfa_alignment %cutoff [-regions] Optional parameters: regions: Print the high scoring regions in the alignment. Example: scorealign alignment.mfa 80 This will return the score of the alignment in the file "alignment.mfa" that meet an 80% threshold. scorealign ---------- Usage: scorealign mfa_alignment %cutoff [-regions] Optional parameters: regions: Print the high scoring regions in the alignment. Example: scorealign alignment.mfa 80 This will return the score of the alignment in the file "alignment.mfa" that meet an 80% threshold. mf2bin.pl --------- Usage: mf2bin.pl inputfile [-out outputfile] Required parameter: inputfile : Multi-FASTA file with two sequences to convert to bin. Optional parameter: -out outputfile : Put bin output to outputfile. 
Example: mf2bin.pl sample1_sample2.fa -out sample1_sample2.bin This will take the file sample1_sample2.fa (which contains the alignment or projection of a larger alignment of sample1 and sample2) and pack it into VISTA binary format and output the result to sample1_sample2.bin. bin2mf ------ Usage: bin2mf { - | alignment_file} Example bin2mf align.bin > align.mfa cat align.bin | bin2mf - > align.mfa This will convert the binary file in align.bin into multi-fasta format, and save it as align.mfa. bin2bl ------ Usage: bin2mf { - | alignment_file} Example bin2mf align.bin > align.bl cat align.bin | bin2mf - > align.bl This will convert the binary file in align.bin into BLAST-like format, and save it as align.bl. lagan20/rechaos.pl0000755000076500007650000002231510502337064015107 0ustar brudnobrudno00000000000000
#!/usr/bin/env perl
#
# rechaos.pl -- recursive CHAOS anchoring.
#
# For each level of the -recurse schedule this script
#   1. writes the list of still-unaligned regions to a "pairs" file,
#   2. runs chaos on those regions,
#   3. appends the anchors kept from earlier levels (plus two sentinel hits
#      marking the clipped ends) and lets `anchors` pick a consistent chain,
#   4. turns the gaps between neighbouring anchors into the region list for
#      the next level.
# After the schedule is exhausted one more pass is made on the unmasked
# sequences when "<seq>.masked" files were used (unless -maskedonly).
# The final chain, sorted in descending order, goes to stdout or to -out.
#
# Exit status: 0 on success (or when -lazy finds the output already there),
# 1 on usage errors or when an external program fails, 3 when -fastreject
# finds an oversized end gap while the chain holds only two anchors
# (presumably just the two sentinels -- confirm against lagan.pl).
#
# Temporary files are named "<pid>.*" in the current directory and are
# removed only on successful completion.

use strict;
use warnings;

my $lagandir = $ENV{LAGAN_DIR};

# Status
# -- extension problems

if (@ARGV < 2) {
    print "usage:\n rechaos seqfile1 seqfile2 [-chaos \"chaos flags\"] [-recurse \"(wl1,nd1,co1,rsc1),(wl2,nd2,co2,rsc2),...\"] [-out \"filename\"] [-lazy] [-maskedonly] [-debug] [-translate] [-fastreject]\n";
    exit(1);
}

# Returns the larger of two numbers.
sub max {
    my ($x, $y) = @_;
    return $x > $y ? $x : $y;
}

# Returns the smaller of two numbers.
sub min {
    my ($x, $y) = @_;
    return $x < $y ? $x : $y;
}

# Splits one line of `anchors` output, "(b1 e1)=(b2 e2) score", into the list
# (b1, e1, b2, e2, score). A line of any other shape aborts the run: silently
# reusing the captures of an earlier match would produce bogus regions.
sub parse_anchor {
    my ($line) = @_;
    my @fields = $line =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/;
    if (!@fields) {
        print STDERR "Malformed anchor line: $line\n";
        exit(1);
    }
    return @fields;
}

# Anchoring schedule: one "(wordlen,degeneracy,cutoff,rescore)FLAGS" tuple per
# level. Flag "x" passes the rescoring cutoff to chaos (-rsc); flag "t" marks
# a translated level, which is only run under -translate.
#$recurfl = "(12,0,25,0)x,(13,1,30,0)x,(8,1,30,0)x,(7,1,30,0)x";
my $recurfl = "(12,0,25,0)x,(13,1,30,0)x,(4,0,4,3000)xt,(8,1,30,0)x,(7,1,30,0)x";
#$recurfl = "(12,0,10,200)x,(12,0,10,150)x,(3,0,10,150)xt,(8,0,10,150)x,(12,0,25,0),(13,1,30,0),(3,0,30,0)t,(8,1,30,0),(7,1,25,0)";
my $level_re = qr/\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/;

my $minbox  = 10;    # a gap is re-anchored only if its area is at least this
my $minside = 5;     # ... and both of its sides are longer than this

my $seq1 = $ARGV[0];
my $seq2 = $ARGV[1];

my $tofile     = 0;
my $filename   = "";
my $masker     = 1;  # cleared by -nomask; not consulted anywhere in this script
my $lazycheck  = 0;
my $dounmasked = 1;
my $debug      = 0;
my $translate  = 0;
my $chaosfl    = "";
my $anchparams = "";
my $gfc        = " ";

# Fast-reject settings: per-level limits on the size of the end gaps.
my $fastreject = 0;
my $frminlevel = 0;
my $frmaxlevel = 3;
my @frseq1 = (150000, 50000, 30000, 15000);
my @frseq2 = (150000, 50000, 30000, 15000);
#@frseq1 = (70000, 60000, 60000, 20000);
#@frseq2 = (70000, 60000, 60000, 20000);

# Scores that tag the two artificial boundary hits in the anchor list.
my $sentinelleft  = 1.1;
my $sentinelright = 1.2;

# Option matching is deliberately left as loose (unanchored) as it has always
# been, so existing callers keep working.
my $argi = 2;
while ($argi < @ARGV) {
    my $opt = $ARGV[$argi];
    # The original pattern here was /-\chaos/; "\ch" is a control character
    # in a Perl regex, so "-chaos" could never match.
    if    ($opt =~ /-chaos/)      { $chaosfl .= " " . $ARGV[++$argi]; }
    elsif ($opt =~ /-ext/)        { $chaosfl .= " -ext "; }
    elsif ($opt =~ /-recurse/)    { $recurfl = $ARGV[++$argi]; }
    elsif ($opt =~ /-lazy/)       { $lazycheck = 1; }
    elsif ($opt =~ /-nomask/)     { $masker = 0; }
    elsif ($opt =~ /-out/)        { $tofile = 1; $filename = $ARGV[++$argi]; }
    elsif ($opt =~ /-maskedonly/) { $dounmasked = 0; }
    elsif ($opt =~ /-fastreject/) { $fastreject = 1; }
    elsif ($opt =~ /-debug/)      { $debug = 1; }
    elsif ($opt =~ /-translate/)  { $translate = 1; }
    elsif ($opt =~ /-gfc/)        { $gfc = " -gfc "; }
    elsif ($opt =~ /-gap/) {
        $anchparams .= " -gap " . $ARGV[++$argi];
        $anchparams .= " " . $ARGV[++$argi];
    }
    else {
        die "Unrecognized option $opt\n";
    }
    $argi++;
}

if ($lazycheck && -f $filename) {
    print STDERR "Output file already exists, lazy mode exit!\n";
    exit(0);
}

# Prefer "<seq>.masked" when it exists; remember the unmasked name so that a
# final pass can be made on it once the schedule is exhausted.
my ($extra1, $extra2);
my $extracase1 = 0;
my $extracase2 = 0;
if (-e "$seq1.masked") {
    $extra1 = $seq1;
    $seq1 = "$seq1.masked";
    $extracase1 = 1;
}
if (-e "$seq2.masked") {
    $extra2 = $seq2;
    $seq2 = "$seq2.masked";
    $extracase2 = 1;
}
if (!$dounmasked) {
    $extracase1 = 0;
    $extracase2 = 0;
}

if (!defined $lagandir || $lagandir eq "") {
    print STDERR "LAGAN_DIR is not set\n";
    exit(1);
}

my $seq1len = `$lagandir/utils/getlength $seq1`;
chomp $seq1len;
my $seq2len = `$lagandir/utils/getlength $seq2`;
chomp $seq2len;

# Temporary files, all prefixed with this process id.
my $tmp         = $$;
my $pairs_file  = "$tmp.anchs.pairs";
my $hits_file   = "$tmp.anchtemp";
my $sorted_file = "$tmp.anchs.sorted";

# Regions still to be aligned; initially the whole of both sequences.
my @b1 = (1);
my @b2 = (1);
my @e1 = ($seq1len);
my @e2 = ($seq2len);

my $cumanchs   = 0;
my $clipleft1  = 0;
my $clipleft2  = 0;
my $clipright1 = $seq1len + 1;
my $clipright2 = $seq2len + 1;
my $app_str    = "";     # hits carried over into the next level's chaining

# Parameters of the current level. They keep their values across iterations
# on purpose: the final unmasked pass reuses the last level's settings.
my ($wordlen, $degeneracy, $cutoff, $extcutoff) = ("", "", "", "");
my $tail        = "";
my $extraparams = "";

my $level = 0;           # counts only the levels that were actually run
while (1) {
    my $goodanchs  = 0;
    my $totalanchs = 0;

    if (my @tuple = $recurfl =~ $level_re) {
        my $rest;
        ($wordlen, $degeneracy, $cutoff, $extcutoff, $tail, $rest) = @tuple;
        $extraparams = "";
        $extraparams = "-t " if index($tail, "t") != -1 && $translate;
        $extraparams .= " -rsc $extcutoff" if index($tail, "x") != -1;
        $recurfl = $rest;
    }
    elsif ($extracase1 || $extracase2) {
        # Schedule exhausted: switch back to the unmasked sequences.
        if ($extracase1) { $seq1 = $extra1; $extracase1 = 0; }
        if ($extracase2) { $seq2 = $extra2; $extracase2 = 0; }
        $recurfl = "";
    }
    else {
        last;
    }

    # Translated levels are skipped unless -translate was given.
    next if index($tail, "t") != -1 && !$translate;

    print STDERR "Using $seq1 $seq2 ($wordlen, $degeneracy, $cutoff, $extcutoff) $tail\n";

    # PRINT OUT LIST OF REGIONS TO ALIGN
    open(my $pairs_fh, '>', $pairs_file)
        or die "Cannot write $pairs_file: $!\n";
    for my $r (0 .. $#b1) {
        print $pairs_fh "-s1 $b1[$r] $e1[$r] -s2 $b2[$r] $e2[$r]\n";
    }
    close($pairs_fh) or die "Cannot close $pairs_file: $!\n";

    # PERFORM THE ALIGNMENTS USING CHAOS
    my $chaos_cmd = "$lagandir/chaos $seq1 $seq2 $extraparams -wl $wordlen -nd $degeneracy -co $cutoff $gfc $chaosfl -pairs $pairs_file > $hits_file";
    system($chaos_cmd);
    if ($?) {
        print STDERR "$chaos_cmd\n";
        exit(1);
    }

    my $more_levels      = ($recurfl =~ $level_re) ? 1 : 0;
    my $unmasked_pending = ($extracase1 || $extracase2) ? 1 : 0;

    # ADD IN BOUNDARIES
    if ($fastreject || $more_levels || $unmasked_pending) {
        my $past1 = $seq1len + 1;
        my $past2 = $seq2len + 1;
        $app_str .= "seq1 0 $clipleft1; seq2 0 $clipleft2; score=$sentinelleft (+)\n";
        $app_str .= "seq1 $clipright1 $past1; seq2 $clipright2 $past2; score=$sentinelright (+)\n";
    }

    # APPEND HITS FROM $app_str TO LOCAL ALIGNMENT LIST
    open(my $hits_fh, '>>', $hits_file)
        or die "Cannot append to $hits_file: $!\n";
    print $hits_fh $app_str;
    close($hits_fh) or die "Cannot close $hits_file: $!\n";

    # FIND MAXIMAL-SCORING CONSISTENT CHAIN
    # "-k2" is the current spelling of the obsolete "+1" key used originally.
    system("$lagandir/anchors $hits_file $gfc $anchparams | sort -n -k2 > $sorted_file");
    exit(1) if $?;

    # IF WE'RE DONE, THEN QUIT!
    last if !$more_levels && !$unmasked_pending;

    # READ SORTED ANCHORS TO @anchors
    open(my $sorted_fh, '<', $sorted_file)
        or die "Cannot read $sorted_file: $!\n";
    my @anchors = <$sorted_fh>;
    close($sorted_fh);

    # NOTE(review): the original reset these with "@x = 0", i.e. to the
    # one-element list (0), so a level that yields no gap hands a single
    # "0 0 / 0 0" region to the next chaos run rather than none. Kept as-is
    # because chaos' handling of an empty -pairs file is not visible here.
    my @b1new = (0);
    my @b2new = (0);
    my @e1new = (0);
    my @e2new = (0);
    $app_str = "";

    # FOR EACH UNALIGNED REGION (the gap between anchor m-1 and anchor m)
    my $area    = 0;
    my $maxarea = 0;
    my $k       = 0;
    for my $m (1 .. $#anchors) {
        my (undef, $prev_e1, undef, $prev_e2, $prevanchorscore)
            = parse_anchor($anchors[$m - 1]);
        my ($next_b1, $next_e1, $next_b2, $next_e2, $nextanchorscore)
            = parse_anchor($anchors[$m]);

        # SAVE OLD ANCHORS (SKIP FIRST AND LAST FAKE ANCHORS)
        if ($m < $#anchors) {
            $app_str .= "seq1 $next_b1 $next_e1; seq2 $next_b2 $next_e2; score=$nextanchorscore (+)\n";
        }

        # DETERMINE REGION BOUNDARIES
        my $gap1begin = $prev_e1 + 1;
        my $gap2begin = $prev_e2 + 1;
        my $gap1end   = $next_b1 - 1;
        my $gap2end   = $next_b2 - 1;

        # CHECK IF RECURSION NEEDED
        my $side1   = $gap1end - $gap1begin + 1;
        my $side2   = $gap2end - $gap2begin + 1;
        my $boxarea = $side1 * $side2;
        $area += $boxarea;
        $maxarea = $boxarea if $boxarea > $maxarea;

        next unless $boxarea >= $minbox && $side1 > $minside && $side2 > $minside;

        # FAST REJECT
        if ($fastreject && $level >= $frminlevel && $level <= $frmaxlevel) {
            # SKIP MARKED ENDS OF ALIGNMENT
            next if $nextanchorscore == $sentinelleft
                 || $prevanchorscore == $sentinelright;

            my $oversized = ($gap1end - $gap1begin > $frseq1[$level])
                         || ($gap2end - $gap2begin > $frseq2[$level]);

            # TRIM NEW ENDS OF ALIGNMENT
            if ($prevanchorscore == $sentinelleft) {
                if ($oversized) {
                    exit(3) if @anchors == 2;
                    $clipleft1 = max($gap1begin - 1, $gap1end - $frseq1[$level]);
                    $clipleft2 = max($gap2begin - 1, $gap2end - $frseq2[$level]);
                    $gap1begin = $clipleft1 + 1;
                    $gap2begin = $clipleft2 + 1;
                }
            }
            elsif ($nextanchorscore == $sentinelright) {
                if ($oversized) {
                    exit(3) if @anchors == 2;
                    $clipright1 = min($gap1end + 1, $gap1begin + $frseq1[$level]);
                    $clipright2 = min($gap2end + 1, $gap2begin + $frseq2[$level]);
                    $gap1end = $clipright1 - 1;
                    $gap2end = $clipright2 - 1;
                }
            }
        }

        # ADD REGION
        if ($gap1begin < $gap1end && $gap2begin < $gap2end) {
            $b1new[$k] = $gap1begin;
            $b2new[$k] = $gap2begin;
            $e1new[$k] = $gap1end;
            $e2new[$k] = $gap2end;
            $k++;
        }
    }

    @b1 = @b1new;
    @b2 = @b2new;
    @e1 = @e1new;
    @e2 = @e2new;

    if ($debug) {
        print STDERR "Level $level Summary:\n";
        print STDERR " Using $seq1 $seq2 ($wordlen, $degeneracy, $cutoff)\n";
        my $percentage = $totalanchs == 0 ? 0 : $goodanchs / $totalanchs * 100.0;
        print STDERR " $goodanchs good out of $totalanchs total anchors ($percentage%)\n";
        $area    = $area / 1000000;
        $maxarea = $maxarea / 1000000;
        print STDERR " Total area left = $area (max = $maxarea)\n";
    }

    $cumanchs += $goodanchs;
    $level++;
}

my $res = `sort -nr -k2 $sorted_file`;
exit(1) if $?;

unlink glob("$tmp.*");

if ($tofile) {
    open(my $out_fh, '>', $filename) or die "Cannot write $filename: $!\n";
    print $out_fh $res;
    close($out_fh) or die "Cannot close $filename: $!\n";
}
else {
    print $res;
}

print STDERR "$cumanchs cumulative anchors\n";
lagan20/sample.fasta0000644000076500007650000000230210502337064015416 0ustar brudnobrudno00000000000000>sample1 GGCATGTCCAGAAAATCCAAGTGCCTCTTCCTCTTGATCTTCTCCAACGATGTCCAGA AAATCCAAGTGCCTCATTCCTCTTGATCTTCTCCAGGCATGTCCAGAAAATCCAAGTG CCTCTTCCTCTCTGATCTTCTCCTCGGTTGGTCCAGAAAATCCAAGTGCCTCTTCCTC TTGATCTTCTCCAGAAATGTCCAGAAAATCCAAGTAGCCTCTTCCTCTTGATCGGCTC CAGAAATGTCCAGAAAAATCCAAGTGCCTCTTCCTCTTGATCGGCTCCATAAATGTCC AGAAAATCCAACGTGCCTCTTCCTCTTGATCGGCTCCAGAAATGTCCAGAAATATCCA AGTGCCTCTTCCTCTTGATCGGCTCCTTA >sample2 CGATCCCAAATCCAAGTGCCTCAGAGTCTACTTGATCTTCAATTCAGATCCCAAATCC AAGTGCCTCAGAGTCTACTTGAATCTTCTATCGGGTCCCAAATCCAAGTGCCTCAGAG TCTACTTGATCTTCTCTCTCGATCCCATATCCAAGTGCCTCCTAGAGTCTACTTGATC TTCTCGATAACCAAAATCCAAGTGCCTCAGAGTCTACTTCACTCTTCTCGACTAACCC AAATCCAAGTGCCTCAGATGAGTCTACTTCCTCTTCTCATAACTCAAATCCAAGTGCC TCAGAGTCTAACTTCCTCTTCTCGAATAACCCAAATCCAAGTGCCTCAGAGTGTCTAC TTCCTCTTCTCG >sample3 TACCCAAATCCAAGTGCCTCAGCGTCTAATAAAACAAGTCTTGATCTTCAACTCCTCC
CAAATCCAAGTGCAACCTCAGCCGCTAATAAAAAGTCTTGATCTTCTCGCGTCCGGCA AATCCAAGTGCCTCAGCGCTAATAAAAAGTCTTGATCTTCTCGGGAGTCCCAAATCCA AGTGCCTCAGCGCTAATAAAAAGTCTTGATCTTCTCGGAGGAACAACAAATCCAAGTG CCTCAAGCGCTAATAAAAAGTCCCGATCTTCTCGTGACAATACAAATCCAAGTGCCTC AGCGCTAATAAAAAGTCCCGATCTTCTCCCGTGTAAACAAATCCAAGTGCCTCAGCGC TAATAAAAAGTCCCGATCTTCTCTGGTAACACAACAAATCCAAGTCACGCCTCAGATA CGCTAATAAAAAGTCCCGATCTTCTC lagan20/sample.params0000644000076500007650000000170110502337064015605 0ustar brudnobrudno00000000000000# This is a comment # The first thing in the file must be the Multi-FASTA file sample.fasta # The rest of the parameters can be in any order. # Compound parameters must be on one line. # Base sequence MUST appear before sequence pairs # Optional base sequence (default is the first sequence in file). sample1 # Optional pairs sample1 sample2 sample1 sample3 sample2 sample3 # MLAGAN parameters # preceded by "-" # Phylogenetic tree specification (required). #-tree "(...)" -tree "((sample1 sample2) sample3)" # Lazy evaluation... #-lazy # Nested Iterative Refinement #-nested # Turning on post-alignment Iterative Refinement #-postir # VISTA parameters (defaults) # preceded by "--" # ALIGN REGION MIN (75), REGION MAX (100), MIN (50) #--regmin 75 #--regmax 100 #--min 50 # BASES (10000) #--bases 10000 # TICK DISTANCE (2000) #--tickdist 1000 # RESOLUTION (25) #--resolution 25 # WINDOW (70) #--window 70 # NUM WINDOWS (4) #--numwindows 4 lagan20/slagan-mfa.pl0000755000076500007650000000217210502337064015470 0ustar brudnobrudno00000000000000#!/usr/bin/perl use strict; $0 = rindex($0, "/") > -1 ? substr($0, rindex($0, "/")+1) : $0; die("$0: LAGAN_DIR not defined. Stopped") unless defined $ENV{"LAGAN_DIR"}; my $LAGAN_DIR = $ENV{LAGAN_DIR}; my ($outfile, $base); foreach my $arg (@ARGV) { if ($arg =~ /-out\s+([^\s]+)/) { $outfile = $1; $arg =~ s/-out\s+([^\s]+)//; } elsif ($arg =~ /-base[\s\=]+([^\s]+)/) { $base = $1; $arg =~ s/-base[\s\=]+([^\s]+)//; die("$0: Invalid base parameter (expected 1 or 2). 
Stopped") unless $base eq "1" or $base eq "2"; } } if (@ARGV < 2) { print ("Usage:\n$0 seqfile1 seqfile2 [-glocal \"glocal flags\"] [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-mfa] [-out \"filename\"] [-maskedonly] [-debug] [-translate] [-fastreject]\n"); exit(1); } my $args = join(" ", @ARGV); system($LAGAN_DIR."/slagan.pl $args > slagan.pl.out"); die("$0: slagan.pl returned error $?. Stopped") if $?; system($LAGAN_DIR."/xmfa2mfa.pl ".($base eq "2" ? "2" : "1")." < slagan.pl.out ".($outfile ? "> $outfile" : "")); die("$0: xmfa2mfa.pl returned error $?. Stopped") if $?; unlink "slagan.pl.out"; lagan20/slagan.pl0000755000076500007650000001222210502337064014724 0ustar brudnobrudno00000000000000#!/usr/bin/perl -w use strict; my $lagandir = $ENV{LAGAN_DIR}; if (@ARGV < 2) { print ("Usage:\n slagan.pl seqfile1 seqfile2 [-glocal \"glocal flags\"] [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-mfa] [-out \"filename\"] [-maskedonly] [-debug] [-translate] [-fastreject]\n"); exit(1); } my ($seq1, $firstName) = ($ARGV[0], $ARGV[0]); die("$0: File not found: $seq1. Stopped") unless -f $seq1; my ($seq2, $secondName) = ($ARGV[1], $ARGV[1]); die("$0: File not found: $seq2. 
Stopped") unless -f $seq2;

# If a "<name>.masked" file sits next to an input, it replaces that input for
# the CHAOS run; $extra1/$extra2 record this so that masked per-region
# extracts are also written inside the region loop below.
my ($extra1, $extra2) = (0, 0);
if (-e "$seq1.masked") { $seq1 = "$seq1.masked"; $extra1 = 1; }
if (-e "$seq2.masked") { $seq2 = "$seq2.masked"; $extra2 = 1; }

# Output prefixes: the input names without their directory part and without
# everything from the first dot on.
my ($outName1, $outName2) = ($ARGV[0], $ARGV[1]);
$outName1 =~ s/^.*\///; $outName1 =~ s/\..*//;
$outName2 =~ s/^.*\///; $outName2 =~ s/\..*//;

my $max_ext = 25000;
my $ext_mul = 1;
my $arglist = "";
my $glocal_fl = " -gapopen 0,1000,2000,2000 -gapcont 0.2,0.06,0.06,0.06 -dist 0,1.0,2.5,2.5";
my $chaos_fl = " -wl 11 -nd 1 -co 10 -ext -rsc 2250 -b";
my $lagan_fl = "";
my $supermap_fl = "-glocal_out=${outName1}_$outName2.out.glocal";
my $outfile = 0;
my $fastrej = 0;
my $lazy = 0;

# Command-line options. The patterns are unanchored substring matches, so the
# order of the tests matters: -chaos_fl must be tried before -chaos, and
# -max_ext / -ext_mul before -ext.
for (my $i = 2; $i < @ARGV; $i++) {
    if ($ARGV[$i] =~ /-glocal_fl/) {
        $glocal_fl = $ARGV[++$i];
    }
    elsif ($ARGV[$i] =~ /-chaos_fl/) {
        $chaos_fl = $ARGV[++$i];
    }
    elsif ($ARGV[$i] =~ /-lagan_fl/) {
        $lagan_fl = $ARGV[++$i];
    }
    elsif ($ARGV[$i] =~ /-max_ext/) {
        $max_ext = $ARGV[++$i];
    }
    elsif ($ARGV[$i] =~ /-ext_mul/) {
        $ext_mul = $ARGV[++$i];
    }
    elsif ($ARGV[$i] =~ /-out/) {
        $outfile = $ARGV[++$i];
        # Results are appended to $outfile later, so a stale file is removed
        # first. unlink() is used instead of shelling out to "rm" so that
        # spaces or shell metacharacters in the name cannot be misread.
        if (-e "$outfile" && !unlink($outfile)) {
            print STDERR "$0: Cannot remove $outfile: $!\n";
            exit(1);
        }
    }
    elsif ($ARGV[$i] =~ /-order/) {
        $arglist = $arglist." -order $ARGV[++$i]";
    }
    elsif (($ARGV[$i] =~ /-gs/) || ($ARGV[$i] =~ /-gc/) ||
           ($ARGV[$i] =~ /-mt/) || ($ARGV[$i] =~ /-ms/) ||
           ($ARGV[$i] =~ /-bw/)) {
        # Passed through to lagan.pl together with their value.
        $arglist = $arglist." ".$ARGV[$i];
        $arglist = $arglist." ".$ARGV[++$i];
    }
    elsif ($ARGV[$i] =~ /-ext/) {
        $arglist = $arglist." -ext $ARGV[++$i]";
    }
    elsif ($ARGV[$i] =~ /-maskedonly/) {
        $arglist = $arglist." -maskedonly";
    }
    elsif ($ARGV[$i] =~ /-lazy/) {
        $lazy = 1;
    }
    elsif ($ARGV[$i] =~ /-translate/) {
        $arglist = $arglist." -translate";
    }
    elsif ($ARGV[$i] =~ /-fastreject/) {
        $fastrej = 1;
#       $arglist = $arglist." -fastreject";
    }
    elsif ($ARGV[$i] =~ /-recurse/) {
        $arglist = $arglist." -recurse \"".$ARGV[++$i]."\"";
    }
    elsif ($ARGV[$i] =~ /-chaos/) {
        $chaos_fl = $chaos_fl." ".$ARGV[++$i];
    }
    else {
        die("$0: Invalid option for rlagan: $ARGV[$i]");
    }
}

# Sequence lengths are always taken from the files named on the command line,
# not from the .masked substitutes.
my $seq1len = `$lagandir/utils/getlength $firstName`;
my $seq2len = `$lagandir/utils/getlength $secondName`;
chomp $seq1len;
chomp $seq2len;

# Local alignments: with -lazy an existing <out1>_<out2>.chaos file is reused,
# otherwise CHAOS is run and its output is saved under that name.
if ($lazy && -e "${outName1}_$outName2.chaos") {
    `cp ${outName1}_$outName2.chaos chaos.$$`;
}
else {
    `$lagandir/chaos $seq1 $seq2 $chaos_fl > chaos.$$`;
    if ($?) { exit(1); }
    `cat chaos.$$ > ${outName1}_$outName2.chaos`;
}

# supermap.pl reads the sequence sizes from two small "name length" files.
open(my $len1_fh, '>', "seq1len") or die("$0: Cannot write seq1len: $!. Stopped");
print {$len1_fh} $firstName." ".$seq1len."\n";
close $len1_fh;
open(my $len2_fh, '>', "seq2len") or die("$0: Cannot write seq2len: $!. Stopped");
print {$len2_fh} $secondName." ".$seq2len."\n";
close $len2_fh;

my $supermap_outfile = "${outName1}_$outName2.out.smap";
my $supermap_inv = "$lagandir/supermap.pl -sizes1=seq1len -sizes2=seq2len $supermap_fl chaos.$$ -no_clust_run -f -out=$supermap_outfile 1>&2";
#print $supermap_inv."\n";
system($supermap_inv);

# One region per line of the supermap output.
open(my $smap_fh, '<', $supermap_outfile) or die("$0: Cannot read $supermap_outfile: $!. Stopped");
my @regs = <$smap_fh>;
die("$0: Supermap generated no regions. Stopped") unless scalar @regs;
close $smap_fh;
unlink "seq1len";
unlink "seq2len";
# unlink $supermap_outfile;

for (my $k = 0; $k < @regs; $k++) {
    # A line that does not parse would leave $2..$8 holding the captures of
    # the previous region (or nothing at all), so it is skipped explicitly.
    unless ($regs[$k] =~ /^([^\s]+)\s([\d]+)\s([\d]+)\s\s\s([^\s]+)\s([\d]+)\s([\d]+)\s(\+|\-)\s\((DM|M1|M2),\s([\d]+)\saligns\)$/o) {
        warn("$0: Skipping unrecognized supermap line: $regs[$k]");
        next;
    }
    my ($startreg1, $endreg1, $startreg2, $endreg2, $strand, $type) = ($2, $3, $5, $6, $7, $8);

    # Regions on the minus strand are extracted reverse-complemented.
    my $rcf = "";
    if ($strand eq "+") { $rcf = "" } else { $rcf = "-rc"; }

    #print "$lagandir/utils/fa2xfa $firstName $startreg1 $endreg1 1 > seq1$k.$$\n";
    `$lagandir/utils/fa2xfa $firstName $startreg1 $endreg1 1 > seq1$k.$$\n`;
    #print "$lagandir/utils/fa2xfa $secondName $startreg2 $endreg2 2 $rcf > seq2$k.$$\n";
    `$lagandir/utils/fa2xfa $secondName $startreg2 $endreg2 2 $rcf > seq2$k.$$\n`;
    if ($extra1) {
        `$lagandir/utils/fa2xfa $seq1 $startreg1 $endreg1 1 > seq1$k.$$.masked\n`;
    }
    if ($extra2) {
        `$lagandir/utils/fa2xfa $seq2 $startreg2 $endreg2 2 $rcf > seq2$k.$$.masked\n`;
    }

    #print "$lagandir/lagan.pl seq1$k.$$ seq2$k.$$ $arglist $lagan_fl -mfa -out lagan.$k.$$\n";
    `$lagandir/lagan.pl seq1$k.$$ seq2$k.$$ $arglist
$lagan_fl -mfa -out lagan.$k.$$\n`; my $suff = ""; if ($outfile) { $suff = " >> $outfile"; } if (-e "lagan.$k.$$") { if ($fastrej) { #print "$lagandir/utils/scorealign lagan.$k.$$ 45 -cropxmfa -ibounds $suff\n"; print `$lagandir/utils/scorealign lagan.$k.$$ 45 -cropxmfa -ibounds $suff`; } else { #print "$lagandir/utils/scorealign lagan.$k.$$ 45 -ibounds\n"; my $sc = `$lagandir/utils/scorealign lagan.$k.$$ 45 -ibounds`; chomp($sc); if ($sc) { print `cat lagan.$k.$$ $suff`; print `echo \"=$sc $type\n\" $suff`; } } } } ####`cat out.$$ > ${outName1}_$outName2.mon`; unlink(glob("*.$$")); if ($extra1 || $extra2) { `rm *.$$.masked`; } exit(0); # out: .chaos .mon->.smap .xmfa lagan20/src/0000755000076500007650000000000010502546653013715 5ustar brudnobrudno00000000000000lagan20/src/.gdb_history0000600000076500007650000000001110502337063016204 0ustar brudnobrudno00000000000000run quit lagan20/src/anal_gloc.pl0000755000076500007650000000605210502337063016170 0ustar brudnobrudno00000000000000#!/usr/bin/env perl $savname1 = ""; $savname2 = ""; $skip = 0; $endblock = 0; $score = 0; $strand = ""; $initstrnd; $s1s = 999999999; $s2s = 999999999; $first = 1; $plus_sc = 0; $minus_sc = 0; while ($line = ) { if ($line =~ /^>/) { if (!$first) { if ($strand eq "+") { print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n"; } else { print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n"; } if ($strand ne $initstrnd) { print STDOUT "INV\n" } if ($strand eq "+") { $plus_sc += $score; } else { $minus_sc += $score; } if ($plus_sc > $minus_sc) { print STDOUT "Main score (+) $plus_sc; Inverted $minus_sc\n"; } else { print STDOUT "Main score (-) $minus_sc; Inverted $plus_sc\n"; } $plus_sc = 0; $minus_sc = 0; $score = 0; $s1s = 999999999; $s2s = 999999999; $strand = ""; } $first = 1; $name1 = $line; chomp $name1; $line = ; if ($line !~ /^>/) { print STDERR "Expecting a name, but got $line"; exit (1); } $name2 = $line; chomp $name2; $inblock = 1; $skip = 0; if (($name1 eq 
$savname1) && ($name2 eq $savname2)) { $skip = 1; } else { print STDOUT "$name1 $name2\n"; } $savname1 = $name1; $savname2 = $name2; } elsif (!$skip) { $endblock = 0; $line =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) ([0-9\.]*) (.) (.*)/; if ($1 == 0 || $3 == 0) { next; } # print STDOUT "strand $strand $s2s $4\n"; if (($strand eq "+") && ($6 eq "+") && ($s2s + 20 < $4) ) { $endblock += 2; } if (($strand eq "-") && ($6 eq "-") && ($s2s > $4 + 20) ) { $endblock += 2; } if ($strand eq "") { $strand = $6; } if ($6 ne $strand) { $endblock += 1; } if (!$endblock) { $s2s = $3; $s1s = $1; $s1e = $2; $s2e = $4; $score += $5; if ($first) { print STDOUT " "; print STDOUT " "; $initstrnd = $strand; $reg1s = $2; $reg2s = $4; $first = 0; } } else { if ($strand eq "+") { print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n"; } else { print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n"; } if ($strand eq "+") { $plus_sc += $score; } else { $minus_sc += $score; } if ($endblock %2) { print STDOUT "INV "; } else {print STDOUT " "; } if ($endblock > 1) { print STDOUT "TRL "; } else {print STDOUT " "; } $s2s = $3; $s1s = $1; $s1e = $2; $s2e = $4; $reg1s = $s1e; $reg2s = $s2e; $score = $5; $strand = $6; # print STDOUT "strand $strand\n"; } } } if (!$first){ if ($strand eq "+") { print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n"; } else { print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n"; } if ($strand eq "+") { $plus_sc += $score; } else { $minus_sc += $score; } } if ($plus_sc > $minus_sc) { print STDOUT "Main score (+) $plus_sc; Inverted $minus_sc\n"; } else { print STDOUT "Main score (-) $minus_sc; Inverted $plus_sc\n"; } lagan20/src/anchors.c0000644000076500007650000001443210502337063015513 0ustar brudnobrudno00000000000000#include #include #include #include #include "skiplist.h" typedef struct GapFreeChunkList { int x; int y; int length; int score; struct GapFreeChunkList *next; } gfc; typedef struct HitLocationList { int 
seq1start; int seq2start; int seq1end; int seq2end; float score; struct HitLocationList *next; struct HitLocationList *bkptr; gfc* first; gfc* last; float scoreSoFar; } hll; typedef struct hllpointer { int number; char isstart; hll* myhll; } hptr; char seq1name[255]; char seq2name[255]; float gapopen =0, gapcont=0; int gapfreechunks = 0; hll* parseCHAOS(FILE* infile, int* numhits); hll* findBestChain(hptr* myarr, int arrsize); void doOutput(hll* mylist); hll* sortList(hll* mylist); static int hptrcomp (const void *p1, const void *p2) { int i = ((hptr*)p1)->number; int j = ((hptr*)p2)->number; int it = ((hptr*)p1)->isstart; int jt = ((hptr*)p2)->isstart; if (i > j) return (1); if (i < j) return (-1); if (it) return -1; else return 1; } int main(int argc, char** argv){ FILE* inf; hll* mylist, *temp, *best; int numhits, i=0; hptr* myptrs; if (argc < 1 || argc > 6) { printf("usage: anchors [filename] [-gap # #]\n"); printf("For -gap the first # is the gap open penalty, the second the gap continue"); return 1; } i = 2; if (argc == 1 || strchr(argv[1], '-')) { i = 1; inf = stdin; } else if (!(inf = fopen(argv[1],"r"))) { printf("couldn't open input file\n"); return 2; } while (i < argc) { if (!strcmp(argv[i], "-gap")) { sscanf(argv[i+1],"%f",&gapopen); sscanf(argv[i+2],"%f",&gapcont); i += 3; } else if (!strcmp(argv[i], "-gfc")) { gapfreechunks = 1; i += 1; } } initLib(); mylist = parseCHAOS(inf, &numhits); if (!numhits) return 0; myptrs = (hptr*) malloc (sizeof(hptr) * numhits *2); i = 0; for (temp = mylist; temp; temp = temp->next) { myptrs[i].number = temp->seq1start; myptrs[i].isstart = 1; myptrs[i].myhll = temp; myptrs[i+1].number = temp->seq1end; myptrs[i+1].isstart = 0; myptrs[i+1].myhll = temp; i = i+2; } qsort(myptrs, numhits*2, sizeof(hptr), hptrcomp); best = findBestChain(myptrs, numhits*2); doOutput(best); return 0; } int whRulez(hll* one, hll* two) { float gapdiff = ((float)(two->seq2end - one->seq2end)) * gapcont; return 
two->scoreSoFar-one->scoreSoFar-gapdiff > 0; } float gapPen(hll* next, hll* prev) { float j= ((float)(next->seq2start-prev->seq2end))*gapcont + gapopen; // printf("%d (%f)*(%f) %f gap\n", next->seq2start-prev->seq2end, ((float)(next->seq2start-prev->seq2end)),gapcont,j); return j; } hll* findBestChain(hptr* array, int arrsize) { sklst* skipper = makeSkLst(); sle* help, *bestptr; float best = -1; int i; for (i = 0; i < arrsize; i++) { if (array[i].isstart) { help = SLfind(skipper, array[i].myhll->seq2start); if (help->myelem && (gapPen(array[i].myhll, ((hll*)help->myelem)) + ((hll*)help->myelem)->scoreSoFar) > 0) { array[i].myhll->bkptr = help->myelem; array[i].myhll->scoreSoFar = ((hll*)help->myelem)->scoreSoFar + array[i].myhll->score + gapPen(array[i].myhll, ((hll*)help->myelem)); } else { array[i].myhll->bkptr = 0; array[i].myhll->scoreSoFar = array[i].myhll->score; } } else { help = SLfind(skipper, array[i].myhll->seq2end); if (help->myelem && whRulez(array[i].myhll,((hll*)help->myelem))) continue; SLinsertAfter(skipper, help, array[i].myhll->seq2end, array[i].myhll); help = help->next[0]; while (help->next[0] && !whRulez(((hll*)help->myelem), ((hll*)help->next[0]->myelem))) SLremove(skipper, help->next[0]); } } help = skipper->sentinel->next[0]; while (help) { if (((hll*)help->myelem)->scoreSoFar > best) { best = ((hll*)help->myelem)->scoreSoFar; bestptr = help; } help = help->next[0]; } return (hll*)bestptr->myelem; } void doOutput(hll* best) { int len; hll *bestPtr=best, *temp; int chl=0, i, bestscore=-1; gfc* tmpgf; for (temp = bestPtr; temp; temp = temp->bkptr) { chl++; } for (temp = bestPtr; temp; temp = temp->bkptr) { len = temp->seq1end - temp->seq1start + 1 ; if (!gapfreechunks || !temp->first) { printf("(%d %d)=",temp->seq2start, temp->seq2end); printf("(%d %d) %f\n",temp->seq1start, temp->seq1end, temp->score); } else { for (tmpgf = temp->first; tmpgf ; tmpgf = tmpgf->next) { printf("(%d %d)=(%d %d) %d\n", tmpgf->y, tmpgf->y + tmpgf->length-1, 
tmpgf->x, tmpgf->x + tmpgf->length-1, tmpgf->score); } } } } char* rolltonum(char* str) { char *got1=0, *got2=0; int in=0, i=0; while (1) { if (str[i] == 0) { break; } if (str[i] == ';' && got1 && got2){ return got1; } if (isdigit(str[i])) { if (!in && (!i || isspace(str[i-1]))) { if (got1) got2 = &str[i]; else got1 = &str[i]; in = 1; } } else if (in && (isspace(str[i]))) { if (got2) { got1 = got2; got2=0; in = 0; } in = 0; } else { in = 0; got1=got2=0; } i++; } return &str[i]; } int getline(FILE* infile, hll* tt) { char temp[1024]; char* help; int z, h; fgets(temp, 1024, infile); help = rolltonum(temp); z = sscanf(help, "%d %d;%n", &tt->seq2start, &tt->seq2end, &h); if (z<2) return 0; help = rolltonum(help+h); if (sscanf(help,"%d %d; score = %f (%*c)\n", &tt->seq1start, &tt->seq1end,&tt->score)<3) return 0; return 1; } hll* parseCHAOS(FILE* infile, int* totnum) { hll *myres=0, *tt; gfc* temp; *totnum = 0; while(!feof(infile)) { tt = (hll*) malloc(sizeof(hll)); while (!feof(infile) && !getline(infile, tt)) ; if (feof(infile)) break; if (gapfreechunks) { tt->first = tt->last = temp = (gfc*) malloc(sizeof (gfc)); temp->next = 0; while (fscanf(infile, "%d %d %d %d", &temp->y, &temp->x, &temp->length, &temp->score) == 4){ tt->first = temp; temp = (gfc*) malloc(sizeof (gfc)); temp->next = tt->first; } free(temp); if (temp == tt->last) { tt->first = tt->last = 0; } } tt->next = myres; tt->bkptr = 0; tt->scoreSoFar = 0; (*totnum)++; myres = tt; } return myres; } lagan20/src/ancseq.cpp0000644000076500007650000005276110502337063015677 0ustar brudnobrudno00000000000000/** * @file * Compiles ancestor FASTA file using ansestor generation script. * * Arguments: * * -i filename : ansestor generation script
* -g genome genomeindex : genome index, genomeindex refers to 2 files: genomeindex.ind and genomeindex.seq
* -a alignmentindex : alignment index, alignmentindex refers to 2 files: alignmentindex.ind and alignmentindex.seq
* -o filename : output -- ancestor fasta file * * Ansestor generation script example: * * [TODO] * * Comment: [TODO]. * * * @author Mikhail Soloviev * @date 31.03.2006 * @version 1.0 * */ #include #include #include #include #include #include using namespace std; #include "util.cpp" #include "faindex.cpp" #define fastaRowLength 50 void revComp(char* seq,char* rev,long size) { rev+=size-1; for (long i=0;i%s\n",header.c_str()); char buf[fastaRowLength+1]; FILE *in=openFile(path,"r"); while (!feof(in)) { buf[0]='\0'; fgets(buf,fastaRowLength,in); if (strlen(buf)>0) fprintf(out,"%s\n",buf); } fclose(in); } typedef char* pchar; typedef FILE* pfile; typedef pfile* ppfile; struct Range { int start; int end; }; struct AlignLocation { string org; string name; // sequence name/id int start; int end; char strand; }; struct AlignMap { string id; map location; // string: orgId char strand; }; map alignMap; // string: alignId void loadAlignMap(string path) { char line[2000]; char id[1000]; char name1[1000]; char name2[1000]; char org0[1000]; char org1[1000]; char org2[1000]; AlignLocation loc0; AlignLocation loc1; AlignLocation loc2; FILE *in=openFile(path,"r"); while (!feof(in)) { line[0]='\0'; fgets(line,2000,in); if (strlen(line)==0) continue; AlignMap aMap; sscanf(line,"%s %s %d %d %c %s %s %d %d %c %s %s %d %d %c", org0,id,&loc0.start,&loc0.end,&loc0.strand, org1,name1,&loc1.start,&loc1.end,&loc1.strand, org2,name2,&loc2.start,&loc2.end,&loc2.strand); loc0.org="0"; loc1.org=org1; loc2.org=org2; loc0.name=id; loc1.name=name1; loc2.name=name2; aMap.id=id; aMap.strand=loc2.strand; aMap.location[loc0.org]=loc0; aMap.location[loc1.org]=loc1; aMap.location[loc2.org]=loc2; alignMap[aMap.id]=aMap; } fclose(in); } // direct cut calculation: genome -> align, receives relative coord., returns absolute coord. 
int calcCutStartLetter(char* seq,int start,int end,int relCut) { if (relCut==0) return start; int j=0; for (int i=start;i<=end;i++) { if (seq[i]!='-') j++; if (j==relCut) return i; } return start; } int calcCutEndLetter(char* seq,int start,int end,int relCut) { if (relCut==0) return end; int j=0; for (int i=end;i>=start;i--) { if (seq[i]!='-') j++; if (j==relCut) return i; } return end; } // reverse cut calculation: align -> genome, receives absolute coord., returns relative coord. int revCalcCutStartLetter(char* seq,int start,int end,int absCut) { if (absCut==0) return 0; int j=0; for (int i=start;(i<=end && i=start && i>absCut);i--) { if (seq[i]!='-') j++; } return j; } char* readSeqBuf(FILE *seq,long offset,int length) { fseek(seq,offset,0); char* buf=(char*)malloc(length*sizeof(char)); fread(buf,sizeof(char),length,seq); return buf; } void writeSeqBuf(FILE *out,char* buf,int length,int sameStrand) { if (sameStrand) { fwrite(buf,sizeof(char),length,out); } else { char* rev=(char*)malloc(length*sizeof(char)); revComp(buf,rev,length); fwrite(rev,sizeof(char),length,out); free(rev); } free(buf); } void writeSeq(FILE *out,FILE *seq,long offset,int length,int sameStrand) { char* buf=readSeqBuf(seq,offset,length); writeSeqBuf(out,buf,length,sameStrand); } /*OLD void writeSeqCut(FILE *out,FILE *seq,long offset,int length,int sameStrand,int cutStart,int cutEnd) { offset+=cutStart; length-=cutStart+cutEnd; writeSeq(out,seq,offset,length,sameStrand); } */ /*OLD Range writeSeqCutLetter(FILE *out,FILE *seq,long offset,int length,int sameStrand,int cutStart,int cutEnd) { char* buf=readSeqBuf(seq,offset,length); cutStart=cutStartLetter(buf,length,cutStart); cutEnd=cutEndLetter(buf,length,cutEnd); length-=cutStart+cutEnd; memmove(buf,&buf[cutStart],length); writeSeqBuf(out,buf,length,sameStrand); Range r; r.start=cutStart; r.end=cutEnd; return r; } */ map genomeIndex; void openGenomeIndex(string genomeName,string protoNumber,string genomePath) { FaIndex index; 
index.id=genomeName; index.proto=atoi(protoNumber.c_str()); index.file=openFile(genomePath+".seq","r+"); FILE *ind=openFile(genomePath+".ind","r"); while (!feof(ind)) { FaRecord record=readIndexRecord(ind); if (record.id.size()>0) index.record[record.id]=record; } fclose(ind); genomeIndex[index.id]=index; } AlignLocation writeGenomeSeq(pfile out[],string orgName,int orgProto,string seqName,int start,int end,char strand) { FILE *seq=genomeIndex[orgName].file; for (int p=1;p<=orgProto;p++) { string recId=seqName+":"+itoa(p); FaRecord ind=genomeIndex[orgName].record[recId]; writeSeq(out[p-1],seq,ind.offset+start-1,end-start+1,strand=='+'); } AlignLocation loc; loc.org=orgName; loc.name=seqName; loc.start=start; loc.end=end; // TODO check loc.strand='+'; return loc; } AlignLocation writeGenomeGap(pfile out[],string orgName,int orgProto,string seqName,int start,int end) { int size=end-start+1; char* buf=(char*)malloc(size*sizeof(char)); memset(buf,'-',size); for (int p=1;p<=orgProto;p++) { fwrite(buf,sizeof(char),size,out[p-1]); } free(buf); AlignLocation loc; loc.org=orgName; loc.name=seqName; loc.start=start; loc.end=end; // TODO check loc.strand='+'; return loc; } FaIndex alignIndex; void openAlignIndex(string path) { alignIndex.file=openFile(path+".seq","r+"); FILE *ind=openFile(path+".ind","r"); while (!feof(ind)) { FaRecord record=readIndexRecord(ind); if (record.id.size()>0) alignIndex.record[record.id]=record; } fclose(ind); } int writeAlignSeq(pfile out1[],int proto1,pfile out2[],int proto2,string alignId,string orgName,char strand) { FILE *seq=alignIndex.file; AlignLocation loc=alignMap[alignId].location[orgName]; AlignLocation loc0=alignMap[alignId].location["0"]; int start=loc0.start-1; int length=loc0.end-loc0.start+1; FaRecord ind; for (int p=1;p<=proto1;p++) { string recId=alignId+":"+itoa(p); ind=alignIndex.record[recId]; writeSeq(out1[p-1],seq,ind.offset+start,length,strand==loc.strand); } for (int p=1;p<=proto2;p++) { string 
recId=alignId+":"+itoa(proto1+p); ind=alignIndex.record[recId]; writeSeq(out2[p-1],seq,ind.offset+start,length,strand==loc.strand); } return length; } /* not used anymore AlignLocation writeAlignSeqCut(FILE *out,string alignId,string orgIndex,string orgName,char strand,int cutAlignStart,int cutAlignEnd) { FILE *seq=alignIndex.file; FaRecord ind=alignIndex.record[alignId+":"+orgIndex]; AlignLocation loc=alignMap[alignId].location[orgName]; writeSeqCut(out,seq,ind.offset,ind.length,strand==loc.strand,cutAlignStart,cutAlignEnd); // TODO -- find it via cutAlignStart,cutAlignEnd -- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! //loc.start+=cutStart; //loc.end-=cutEnd; return loc; } */ // TODO check implementation when start implementing overlapping, compare with writeAlignSeq /* OLD AlignLocation writeAlignSeqCutLetterAlign(FILE *out,string alignId,string orgIndex,string orgName,char strand,int cutAlignStart,int cutAlignEnd) { FILE *seq=alignIndex.file; FaRecord ind=alignIndex.record[alignId+":"+orgIndex]; AlignLocation loc=alignMap[alignId].location[orgName]; // TODO -- optimize by excluding double reading the same sequence writeSeqCut(out,seq,ind.offset,ind.length,strand==loc.strand,cutAlignStart,cutAlignEnd); char* buf=readSeqBuf(seq,ind.offset,ind.length); loc.start+=reCutStartLetter(buf,ind.length,cutAlignStart); loc.end-=reCutEndLetter(buf,ind.length,cutAlignEnd); free(buf); return loc; } */ // TODO check implementation when start implementing overlapping, compare with writeAlignSeq /* OLD AlignLocation writeAlignSeqCutLetter(FILE *out,string alignId,string orgIndex,string orgName,char strand,int cutStart,int cutEnd,int& cutAlignStart,int& cutAlignEnd) { FILE *seq=alignIndex.file; FaRecord ind=alignIndex.record[alignId+":"+orgIndex]; AlignLocation loc=alignMap[alignId].location[orgName]; Range r=writeSeqCutLetter(out,seq,ind.offset,ind.length,strand==loc.strand,cutStart,cutEnd); cutAlignStart=r.start; cutAlignEnd=r.end; loc.start+=cutStart; loc.end-=cutEnd; return 
loc; } */ Range calcCutRangeLetter(char* seqBuf,int start,int end,int cutStartLength,int cutEndLength) { Range r; r.start=calcCutStartLetter(seqBuf,start,end,cutStartLength); r.end=calcCutEndLetter(seqBuf,start,end,cutEndLength); return r; } char* makeCons(string alignId,int protoStart,int protoEnd) { FILE *seqFile=alignIndex.file; char* cons=NULL; for (int p=protoStart;p<=protoEnd;p++) { string recId=alignId+":"+itoa(p); FaRecord ind=alignIndex.record[recId]; char* buf=readSeqBuf(seqFile,ind.offset,ind.length); if (p==protoStart) { cons=(char*)malloc(ind.length*sizeof(char)); memcpy(cons,buf,ind.length); } else { for (int i=0;i command; void loadCommand(string path) { char line[1000]; char orgName[100]; char seqName[100]; char alignId1[100]; char alignId2[100]; char operation; FILE *in=openFile(path,"r"); while (!feof(in)) { line[0]='\0'; fgets(line,1000,in); if (strlen(line)==0) continue; Command com; operation=' '; orgName[100]='\0'; seqName[100]='\0'; alignId1[100]='\0'; alignId2[100]='\0'; com.over1=0; com.over2=0; sscanf(line,"%c ",&operation); if (operation=='g') { sscanf(line,"%c %s %s %d %d %c",&operation,orgName,seqName,&com.start,&com.end,&com.strand); } else if (operation=='s') { sscanf(line,"%c %s %s %c",&operation,alignId1,orgName,&com.strand); } else if (operation=='o') { sscanf(line,"%c %s %s %s %c %d %d",&operation,alignId1,alignId2,orgName,&com.strand,&com.over1,&com.over2); } else if (operation=='d') { sscanf(line,"%c %s %s %s %c",&operation,alignId1,alignId2,orgName,&com.strand); } else if (operation=='e') { } com.operation=operation; com.orgName=orgName; com.seqName=seqName; com.alignId1=alignId1; com.alignId2=alignId2; command.push_back(com); } fclose(in); } void writeChunkLocation(FILE* blockChunk,AlignLocation loc) { fprintf(blockChunk,"%s %s %d %d %c",loc.org.c_str(),loc.name.c_str(),loc.start,loc.end,loc.strand); } void writeChunk(FILE* blockChunk,AlignMap chunk,string org[]) { writeChunkLocation(blockChunk,chunk.location[org[0]]); 
fprintf(blockChunk," "); writeChunkLocation(blockChunk,chunk.location[org[1]]); fprintf(blockChunk," "); writeChunkLocation(blockChunk,chunk.location[org[2]]); fprintf(blockChunk,"\n"); } void openTmp(pfile tmp[],string outPath,int size,int offset) { for (int i=0;i outtmp; map other; map orgIndex; map proto; map protoStart; AlignMap chunk; string header; int block=1; int multi=0; int start=0; int end=0; int ancProto=0; int ancEnd=0; int cutAlignStart=0; int cutAlignEnd=0; string outPath=getArg("-o",argc,argv); FILE* out=openFile(outPath,"w"); FILE* blockChunk=openFile(getArg("-b",argc,argv),"w"); org[1]=getArg("-g1",argc,argv); org[2]=getArg("-g2",argc,argv); proto[org[1]]=atoi(getArgAt("-g1",2,argc,argv).c_str()); proto[org[2]]=atoi(getArgAt("-g2",2,argc,argv).c_str()); protoStart[org[1]]=1; protoStart[org[2]]=proto[org[1]]+1; ancProto=proto[org[1]]+proto[org[2]]; loadAlignMap(getArg("-c",argc,argv)); openAlignIndex(getArg("-a",argc,argv)); openGenomeIndex(getArgAt("-g1",1,argc,argv),getArgAt("-g1",2,argc,argv),getArgAt("-g1",3,argc,argv)); openGenomeIndex(getArgAt("-g2",1,argc,argv),getArgAt("-g2",2,argc,argv),getArgAt("-g2",3,argc,argv)); ancOrg=org[1]+"_"+org[2]; org[0]=ancOrg; chunk.location[org[0]].org=org[0]; chunk.location[org[1]].org=org[1]; chunk.location[org[2]].org=org[2]; header=ancOrg+"-anc"+itoa(block); chunk.location[org[0]].name=header; chunk.location[org[0]].start=0; chunk.location[org[0]].end=0; other[org[1]]=org[2]; other[org[2]]=org[1]; orgIndex[org[1]]="1"; orgIndex[org[2]]="2"; pfile tmp1[proto[org[1]]]; pfile tmp2[proto[org[2]]]; outtmp[org[1]]=tmp1; outtmp[org[2]]=tmp2; openTmp(outtmp[org[1]],outPath,proto[org[1]],1); openTmp(outtmp[org[2]],outPath,proto[org[2]],proto[org[1]]+1); loadCommand(getArg("-i",argc,argv)); // TODO: check and implement if necessary linking between s,d,o,g // in the same block, currently only d & o is linked for (int i=0;i * -g genomeindex : genome index, it refers to 2 files: genomeindex.ind and genomeindex.seq
* -n {1|2} : which genome is taken (1st or 2nd) from block chunk mapping
* -p proto : number of original species in genome * -o filename : ancestor fasta file, output sequence data to be appended here * * Block chunk mapping example: * * [TODO] * * Comment: [TODO]. * * * @author Mikhail Soloviev * @date 23.05.2006 * @version 1.0 * */ #include #include #include #include #include #include using namespace std; #define fastaRowLength 50 typedef char* pchar; pchar seqData[100]; char seqStrand; string itoa(int i) { char buf[20]; sprintf(buf,"%d",i); return buf; } FILE* openFile(string path,char* mode) { FILE *f=fopen(path.c_str(),mode); if (f==NULL) { fprintf(stderr,"ERROR: Failed open file: %s\n",path.c_str()); exit(1); } return f; } int isArg(char* key,int argc, char* argv[]) { for (int i=0;i chunkMap; void loadChunkMap(string path) { char line[2000]; char genome0[1000]; char genome1[1000]; char genome2[1000]; char name0[1000]; char name1[1000]; char name2[1000]; int tmp; FILE *in=openFile(path,"r"); while (!feof(in)) { line[0]='\0'; fgets(line,2000,in); if (strlen(line)==0) continue; ChunkMap chunk; sscanf(line,"%s %s %d %d %c %s %s %d %d %c %s %s %d %d %c", genome0,name0,&chunk.location[0].start,&chunk.location[0].end,&chunk.location[0].strand, genome1,name1,&chunk.location[1].start,&chunk.location[1].end,&chunk.location[1].strand, genome2,name2,&chunk.location[2].start,&chunk.location[2].end,&chunk.location[2].strand); chunk.location[0].genome=genome0; chunk.location[1].genome=genome1; chunk.location[2].genome=genome2; chunk.location[0].name=name0; chunk.location[1].name=name1; chunk.location[2].name=name2; chunkMap.push_back(chunk); } fclose(in); } void writeChunkSeq(FILE *out,string header,int start,int end,int protoStart,int protoEnd) { start--; end--; for (int p=protoStart;p<=protoEnd;p++) { fprintf(out,">%s\n",header.c_str()); int j=0; for (int i=start;i<=end;i++) { fputc(seqData[p][i],out); j++; if (j==fastaRowLength) { j=0; fputc('\n',out); } } if (j>0) fputc('\n',out); } } void writeChunkGap(FILE *out,string header,int start,int 
end,int proto) { start--; end--; for (int p=1;p<=proto;p++) { fprintf(out,">%s\n",header.c_str()); int j=0; for (int i=start;i<=end;i++) { fputc('-',out); j++; if (j==fastaRowLength) { j=0; fputc('\n',out); } } if (j>0) fputc('\n',out); } } Range noNext={0,0,'+'}; Range nextRange(int seqSize,Range prev) { Range next; prev.start--; prev.end--; next.start=prev.end+1; if (next.start>=seqSize) return noNext; while (seqData[1][next.start]=='*') { next.start++; if (next.start>=seqSize) return noNext; } next.end=next.start; while (next.end record; }; FaRecord readIndexRecord(FILE *ind) { FaRecord record; record.id=""; char line[2000]; char id[200]; line[0]='\0'; id[0]='\0'; fgets(line,2000,ind); if (strlen(line)>0) { sscanf(line,"%s %ld %d",id,&record.offset,&record.length); record.id=id; } return record; } FaIndex genomeIndex; void openGenomeIndex(string genomePath) { genomeIndex.file=openFile(genomePath+".seq","r+"); FILE *ind=openFile(genomePath+".ind","r"); while (!feof(ind)) { FaRecord record=readIndexRecord(ind); if (record.id.size()>0) genomeIndex.record[record.id]=record; } fclose(ind); } char* readSeqBuf(FILE *seq,long offset,int length) { fseek(seq,offset,0); char* buf=(char*)malloc(length*sizeof(char)); fread(buf,sizeof(char),length,seq); return buf; } void readGenomeSeq(string seqName,int& seqSize,int proto) { FILE *seq=genomeIndex.file; for (int i=1;i<=proto;i++) { string id=seqName+":"+itoa(i); FaRecord ind=genomeIndex.record[id]; seqSize=ind.length; seqData[i]=readSeqBuf(seq,ind.offset,ind.length); } } int main (int argc,char* argv[]) { int block=0; string seqName=""; string ancestor=""; string desc1=""; string desc2=""; int seqSize=0; int proto=1; int proto1=1; int proto2=1; int genomeNumber=1; int first=1; FILE* out=openFile(getArg("-o",argc,argv),"w"); FILE* chunk=openFile(getArg("-c",argc,argv),"w"); loadChunkMap(getArg("-b",argc,argv)); openGenomeIndex(getArg("-g",argc,argv)); genomeNumber=atoi(getArg("-n",argc,argv).c_str()); 
proto1=atoi(getArg("-p1",argc,argv).c_str()); proto2=atoi(getArg("-p2",argc,argv).c_str()); ancestor=getArg("-a",argc,argv); desc1=getArg("-d1",argc,argv); desc2=getArg("-d2",argc,argv); proto=genomeNumber==1?proto1:proto2; for (int i=0;i * -o filename : output fasta file
* -c filename : alignments' coordinate ranges (supermap output data)
* -s number : take prototype organism sequences starting with number
* -e number : take prototype organism sequences ending with number
* -u number : which alignment coordinate range to use -- first or second, * correspondingly number can be 1 or 2
* -g {0|1} : allow gaps
* * Alignments' coordinate range example: * * mouse-ENm001 1 12433 rat-ENm001 400 28619 + (DM, 13 aligns)
* mouse-ENm001 7001 14975 rat-ENm001 1 15303 + (M1, 1 aligns)
* mouse-ENm001 12872 51014 rat-ENm001 6891 71164 + (DM, 106 aligns) * * Comment: Only the first 6 fields are read, the rest can be anything. * * Resulted output example: * * >mouse-ENm001
* GGACTCGTCGCAGTGCCTTGT
* TTTACTGTGCACTTCGCCTGG
* ACTGTCTACGCCATGCTTGAT
* * Comment: FASTA header contains sequence name (mouse-ENm001). * * @author Mikhail Soloviev * @date 05.04.2006 * @version 1.0 * */ #include #include #include #include #include #include using namespace std; // TODO refactor in classes and normal make project #include "util.cpp" #include "faindex.cpp" FaIndex faIndex; void writeSeqDirect(FILE *out,char* seq,int start,int end,int gapped,int masked) { start--; end--; int j=0; for (int i=start;i<=end;i++) { if (gapped || seq[i]!='-') { fputc(masked?mask(seq[i]):seq[i],out); j++; if (j==fastaRowLength) { j=0; fputc('\n',out); } } } if (j>0) fputc('\n',out); } void writeSeqRevComp(FILE *out,char* seq,int start,int end,int gapped,int masked) { start--; end--; int j=0; for (int i=end;i>=start;i--) { if (gapped || seq[i]!='-') { fputc(masked?mask(comp(seq[i])):comp(seq[i]),out); j++; if (j==fastaRowLength) { j=0; fputc('\n',out); } } } if (j>0) fputc('\n',out); } void writeSeq(FILE *out,char* seq,int start,int end,int direct,int gapped,int masked) { if (direct) writeSeqDirect(out,seq,start,end,gapped,masked); else writeSeqRevComp(out,seq,start,end,gapped,masked); } int main (int argc,char* argv[]) { char buf[bufSize]; char name[bufSize]; int start; int end; char name2[bufSize]; int start2; int end2; int count=0; char strand; int gapped=1; int useOrg=1; int protoStart=1; int protoEnd=1; int masked=0; string id; char* seq; FILE *out=openFile(getArg("-o",argc,argv),"w"); FILE *in=openFile(getArg("-c",argc,argv),"r"); readFaIndex(faIndex,getArg("-i",argc,argv)); useOrg=atoi(getArg("-u",argc,argv).c_str()); gapped=atoi(getArg("-g",argc,argv).c_str()); protoStart=atoi(getArg("-s",argc,argv).c_str()); protoEnd=atoi(getArg("-e",argc,argv).c_str()); masked=atoi(getArg("-m",argc,argv).c_str()); while (!feof(in)) { buf[0]='\0'; fgets(buf,bufSize,in); if (strlen(buf)==0) continue; sscanf(buf,"%s %d %d %s %d %d %c ",name,&start,&end,name2,&start2,&end2,&strand); if (useOrg==2) { strcpy(name,name2); start=start2; end=end2; } for (int 
n=protoStart;n<=protoEnd;n++) { id=name; id=id+":"+itoa(n); seq=getFaIndexSeq(faIndex,id); fprintf(out,">%s\n",name); writeSeq(out,seq,start,end,(useOrg==2 && strand=='-'),gapped,masked); free(seq); } } fclose(in); fclose(out); return 0; } lagan20/src/diagmatrix.c0000644000076500007650000002413610502337063016211 0ustar brudnobrudno00000000000000#ifndef __DIAGMATRIX_C #define __DIAGMATRIX_C #include #include #include #include "diagmatrix.h" #define MAX2(x,y) ( (x) >= (y) ? (x) : (y) ) #define MIN2(x,y) ( (x) <= (y) ? (x) : (y) ) alel dummy; #ifdef MULTIAL__FLAG extern int *freed, freedsize, freedcap; extern align *freedptr; #endif dmat* makeDM(int d1, int d2) { dmat* trgt = (dmat*)malloc(sizeof(dmat)); int i; trgt->d1 = d1; trgt->d2 = d2; trgt->diagindex = (int*) calloc(d1+d2+1, sizeof(int)); trgt->diagstart = (int*) calloc(d1+d2+1, sizeof(int)); trgt->diagend = (int*) calloc(d1+d2+1, sizeof(int)); trgt->isneck = (int*) calloc(d1+d2+1, sizeof(int)); for (i=0; i < d1+ d2+1; i++) { trgt->diagindex[i] = trgt->diagstart[i] = trgt->diagend[i] = -1; trgt->isneck[i] = 0; } trgt->numelems = 0; trgt->currdiag = 0; trgt->currneck = 0; dummy.M = dummy.N = dummy.O = INT_MIN+(1<<28); return trgt; } void freeDM(dmat* trgt) { int olddiag = trgt->neckdiag[trgt->currneck%2]; int prevsize = (olddiag>0)?trgt->diagend[olddiag]- trgt->diagstart[olddiag]+1 + trgt->diagend[olddiag-1]- trgt->diagstart[olddiag-1]+1 : 0; int i, j; // printf("next neck\n"); for (i=0; i < prevsize; i++) { for (j=0; j<3; j++) { // freeAlign(trgt->myneck[trgt->currneck%2][j][i]); } } for (i=0; i< NACT; i++) { free (trgt->myelems[i]); } free(trgt->myptrs); free(trgt->diagindex); free(trgt->diagstart); free(trgt->diagend); free(trgt->isneck); free(trgt); } void DMinitDiag(dmat* trgt, int* starts, int* ends) { int i, sav = 0; long long int j = 0, ts = 0; int k = ends[1]-starts[1]+1, ko=-1, kf; int ctr=0, cond=0; for (i=1; i < trgt->d1+trgt->d2; i++) { trgt->diagindex[i] = j; trgt->diagstart[i] = starts[i]; 
trgt->diagend[i] = ends[i]; kf = (i == trgt->d1+trgt->d2-1)? -1 : ends[i+1]-starts[i+1]+1; j += k; cond = (k < kf) || (k <= kf && ctr >= 1000 && k <= 200); if ((ko >= k) && cond) { ctr = 0; // printf("neck %d\n",i); if (sav) { trgt->isneck[sav] = j; } else { trgt->myptrs = (char*) calloc (j/2+1, sizeof(char)); } ts += j; j = k + ko; sav = i; } ctr++; ko = k; k = kf; } trgt->diagindex[i] = j; trgt->diagstart[i] = starts[i]; trgt->diagend[i] = ends[i]; if (sav) trgt->isneck[sav] = j; else trgt->myptrs = (char*) calloc (j/2+1, sizeof(char)); trgt->numelems = j; trgt->currdiag = 0; ts += j; for (i=0; i < NACT; i++) trgt->myelems[i] = 0; for (i=0; i < 2; i++) { for (j=0; j<3; j++) trgt->myneck[i][j] = 0; trgt->neckdiag[i] = -1; } fprintf(stderr,"Total size = %lld * 10^6\n", ts/1000000); } alel* DMgetDiagStart(dmat* trgt, int dn, int* size, int* startx, int* starty) { alel* res = trgt->myelems[dn%NACT]; *size = trgt->diagend[dn] - trgt->diagstart[dn]+1; if (dn < trgt->d2) { *startx = trgt->diagstart[dn]+1; *starty = dn - trgt->diagstart[dn]; } else { *startx = dn - trgt->d2 + trgt->diagstart[dn]+1; *starty = trgt->d2 - trgt->diagstart[dn]; } return res; } char DMgetPtr(dmat* trgt, int x, int y) { int dn = x+y-1; int elem = (dn < trgt->d2)? (x-1): trgt->d2-y; int res, loc; if (dn <= 0 || dn >= trgt->d1+trgt->d2 || elem < trgt->diagstart[dn] || elem > trgt->diagend[dn]){ return -1; } loc = trgt->diagindex[dn] + elem-trgt->diagstart[dn]; res= trgt->myptrs[loc >> 1]; if (!(loc & 1)) res = res >> 4; return res & 0xf; } void DMsetPtr(dmat* trgt, char ptr, int x, int y) { int dn = x+y-1, loc; char res; int elem = (dn < trgt->d2)? 
(x-1): trgt->d2-y; if (dn <= 0 || dn >= trgt->d1+trgt->d2 || elem < trgt->diagstart[dn] || elem > trgt->diagend[dn]){ fprintf(stderr,"range error!!!\n"); return; } dn = trgt->diagindex[dn] + elem-trgt->diagstart[dn]; if (dn & 1) trgt->myptrs[dn >> 1] = (char)(trgt->myptrs[dn >> 1] & 0xf0) | (char)(ptr & 0x0f); else trgt->myptrs[dn >> 1] = (char)(trgt->myptrs[dn >> 1] & 0x0f) | (char)(ptr << 4); } alel* DMgetElem(dmat* trgt, int x, int y) { register int dn = x+y-1; register int elem = (dn < trgt->d2)? (x-1): trgt->d2-y; if (dn <= 0 || dn >= trgt->d1+trgt->d2 || elem < trgt->diagstart[dn] || elem > trgt->diagend[dn]){ return &dummy; } return (trgt->myelems[dn % NACT] + elem-trgt->diagstart[dn]); } alel* DMgetElem2(dmat* trgt, int x, int y, alel* prev) { register int dn = x+y-1; register int elem = (dn < trgt->d2)? (x-1): trgt->d2-y; if (dn <= 0 || dn >= trgt->d1+trgt->d2 || elem < trgt->diagstart[dn] || elem > trgt->diagend[dn]){ return &dummy; } if (prev != &dummy) return prev + 1; return (trgt->myelems[dn % NACT] + elem-trgt->diagstart[dn]); } void DMsetElem(dmat* trgt, alel* tbi, int x, int y, char ptr) { int dn = x+y-1; int elem = (dn < trgt->d2)? 
x: trgt->d2-y; if (elem < trgt->diagstart[dn] || elem > trgt->diagend[dn]) { fprintf(stderr,"Dummy\n"); return; } *(trgt->myelems[dn%NACT]+elem-trgt->diagstart[dn]) = *tbi; trgt->myptrs[trgt->diagindex[dn] + elem-trgt->diagstart[dn]]=ptr; } char DMnextDiag(dmat* trgt) { char* newptrs; int i; int size = trgt->diagend[trgt->currdiag+1] - trgt->diagstart[trgt->currdiag+1] + 1; free(trgt->myelems[(trgt->currdiag+1)%NACT]); trgt->myelems[(trgt->currdiag+1)%NACT] = (alel*) calloc(size, sizeof(alel)); if (trgt->isneck[trgt->currdiag]) { // printf("new pointers!\n"); newptrs = (char*) calloc ((trgt->isneck[trgt->currdiag]+1)/2+1, sizeof(char)); for (i=0; i< (trgt->isneck[trgt->currdiag]+1)/2+1; i++) newptrs[i] = -1; free(trgt->myptrs); trgt->myptrs = newptrs; trgt->diagindex[trgt->currdiag-1] = 0; trgt->diagindex[trgt->currdiag] = (trgt->diagend[trgt->currdiag-1] - trgt->diagstart[trgt->currdiag-1] + 1); } return trgt->isneck[++trgt->currdiag] != 0; } int DMnextNecks(dmat* trgt, int diag) { int size = trgt->diagend[diag]-trgt->diagstart[diag]+1 + trgt->diagend[diag-1]-trgt->diagstart[diag-1]+1; int olddiag = trgt->neckdiag[trgt->currneck%2]; int prevsize = (olddiag>0)?trgt->diagend[olddiag]-trgt->diagstart[olddiag]+1 + trgt->diagend[olddiag-1]-trgt->diagstart[olddiag-1]+1 : 0; int i, j, t1; int norm=0; int minn = 0; // printf("next neck\n"); for (i=0; i < prevsize; i++) { for (j=0; j<3; j++) { if ((trgt->myneck[trgt->currneck%2][j])[i] && !(trgt->myneck[trgt->currneck%2][j])[i]->dirty){ freeAlign(trgt->myneck[trgt->currneck%2][j][i]); trgt->myneck[trgt->currneck%2][j][i] = 0; } /* else if ((trgt->myneck[trgt->currneck%2][j])[i] && (trgt->myneck[trgt->currneck%2][j])[i]->dirty && !(trgt->myneck[trgt->currneck%2][j])[i]->nextalign) { fprintf(stderr, "WARN: diag = %d(%d:%d) \n", diag, olddiag, (trgt->myneck[trgt->currneck%2][j])[i]->algnlen); } */ } } for (j=0; j<3; j++) { free (trgt->myneck[trgt->currneck%2][j]); trgt->myneck[trgt->currneck%2][j] = (align**) calloc (size, 
sizeof (align*)); trgt->neckdiag[trgt->currneck%2] = diag; for (i=0; i< size; i++) (trgt->myneck[trgt->currneck%2][j])[i] = 0; } size = trgt->diagend[trgt->currdiag] - trgt->diagstart[trgt->currdiag]+1; // fprintf(stderr, "size = %d\n ", size); minn = norm = trgt->myelems[(trgt->currdiag)%NACT][0].M; for (j=1; jmyelems[(trgt->currdiag)%NACT][j].M , norm); minn = MIN2 (trgt->myelems[(trgt->currdiag)%NACT][j].M , minn); } // fprintf(stderr, "currdiag = %d norm = %d minn = %d\n", trgt->currdiag, norm, minn); for (i=0; i < NACT; i++) { size = trgt->diagend[trgt->currdiag-i] - trgt->diagstart[trgt->currdiag-i]+1; for (j=0; jmyelems[(trgt->currdiag-i)%NACT][j].M - norm; trgt->myelems[(trgt->currdiag-i)%NACT][j].M = (norm > 0)? MIN2(trgt->myelems[(trgt->currdiag-i)%NACT][j].M, t1): MAX2(trgt->myelems[(trgt->currdiag-i)%NACT][j].M, t1); t1 = trgt->myelems[(trgt->currdiag-i)%NACT][j].N - norm; trgt->myelems[(trgt->currdiag-i)%NACT][j].N = (norm > 0)? MIN2(trgt->myelems[(trgt->currdiag-i)%NACT][j].N, t1): MAX2(trgt->myelems[(trgt->currdiag-i)%NACT][j].M, t1); t1 = trgt->myelems[(trgt->currdiag-i)%NACT][j].O - norm; trgt->myelems[(trgt->currdiag-i)%NACT][j].O = (norm > 0)? MIN2(trgt->myelems[(trgt->currdiag-i)%NACT][j].O, t1): MAX2(trgt->myelems[(trgt->currdiag-i)%NACT][j].M, t1); } } trgt->currneck++; return norm; } align* DMgetNeck(dmat* trgt, int x, int y, int which) { int dn = x + y - 1; int elem = (dn < trgt->d2)? 
(x-1): trgt->d2-y; int fd; if (dn <= 0 || dn >= trgt->d1+trgt->d2) { return 0; } if (elem < trgt->diagstart[dn] || elem > trgt->diagend[dn]){ return 0; } if (trgt->neckdiag[trgt->currneck%2] == dn) { return *(trgt->myneck[trgt->currneck%2][which] + elem-trgt->diagstart[dn]); } else if (trgt->neckdiag[trgt->currneck%2] == dn+1) { fd = trgt->diagend[dn+1]-trgt->diagstart[dn+1]+1; return *(trgt->myneck[trgt->currneck%2][which] + elem-trgt->diagstart[dn] + fd); } else { fprintf(stderr, "Some dumb error: %d/%d %d %d\n", dn, trgt->d1+trgt->d2-1, trgt->neckdiag[(trgt->currneck-1)%2], trgt->currneck); return 0; } } void DMsetNeck(dmat* trgt, align* myal, int x, int y, int which) { int dn = x + y - 1; int elem = (dn < trgt->d2)? (x-1): trgt->d2-y; int fd; if (dn <= 0 || dn >= trgt->d1+trgt->d2) { fprintf(stderr, "setNeck failed at %d, %d\n", x,y); return; } if (elem < trgt->diagstart[dn] || elem > trgt->diagend[dn]){ fprintf(stderr, "setNeck failed2 at %d, %d\n", x,y); return; } if (trgt->neckdiag[(trgt->currneck-1)%2] == dn) { *(trgt->myneck[(trgt->currneck-1)%2][which] + elem-trgt->diagstart[dn]) = myal; } else if (trgt->neckdiag[(trgt->currneck-1)%2] == dn+1) { fd = trgt->diagend[dn+1]-trgt->diagstart[dn+1]+1; *(trgt->myneck[(trgt->currneck-1)%2][which] + elem-trgt->diagstart[dn] + fd)=myal; } else { fprintf(stderr, "Some dumb error2: %d %d %d\n", dn, trgt->neckdiag[(trgt->currneck)%2], trgt->currneck); } } #endif lagan20/src/diagmatrix.h0000644000076500007650000000302510502337063016210 0ustar brudnobrudno00000000000000#ifndef __DIAGMATRIX_H #define __DIAGMATRIX_H #ifdef MULTIAL__FLAG #include "multial.h" #else #include "order.h" #endif #define Mmask 0x3 #define Nmask 0x4 #define Omask 0x8 #define NACT 3 typedef struct AlignElement { long int M; long int N; long int O; } alel; typedef struct diagmatrix { int d1; int d2; int* diagindex; /* this points to where in myelems a certain diagonal starts*/ int* diagstart; /* the elem on which the "cross-section" starts*/ int* 
diagend; /* the elem on which the "cross-section" ends */ int* isneck; /* if so, give size of next block, 0 ow */ int numelems; int elemsize; char* myptrs; alel* myelems[NACT]; /* NACT(3) diags active at a time */ int currdiag; /*current diagonal */ int rangelow; int currneck; align** myneck[2][3]; /* The past 2 necks, 3 ptrs for each */ int neckdiag[2]; /* For each the size of its 2 diagonals */ } dmat; dmat* makeDM(int d1, int d2); void freeDM(dmat* trgt); void DMinitDiag(dmat* trgt, int* starts, int* ends); alel* DMgetElem(dmat* trgt, int x, int y); alel* DMgetElem2(dmat* trgt, int x, int y, alel* prev); char DMgetPtr(dmat* trgt, int x, int y); void DMsetPtr(dmat* trgt, char ptr, int x, int y); align* DMgetNeck(dmat* trgt, int x, int y, int which); void DMsetNeck(dmat* trgt, align* myal, int x, int y, int which); alel* DMgetDiagStart(dmat* trgt, int dn, int* size, int* startx, int* starty); void DMsetElem(dmat* trgt, alel* elem, int x, int y, char ptr); char DMnextDiag(dmat* trgt); int DMnextNecks(dmat* trgt, int diag); #endif lagan20/src/faindex.cpp0000644000076500007650000000240310502337063016027 0ustar brudnobrudno00000000000000struct FaRecord { string id; long offset; int length; }; struct FaIndex { string id; int proto; FILE* file; map record; }; FaRecord readIndexRecord(FILE *ind) { FaRecord record; record.id=""; char line[1000]; char id[100]; line[0]='\0'; id[0]='\0'; fgets(line,1000,ind); if (strlen(line)>0) { sscanf(line,"%s %ld %d",id,&record.offset,&record.length); record.id=id; } return record; } void readFaIndex(FaIndex& faIndex,string path) { faIndex.file=openFile(path+".seq","r+"); FILE *ind=openFile(path+".ind","r"); while (!feof(ind)) { FaRecord record=readIndexRecord(ind); if (record.id.size()>0) faIndex.record[record.id]=record; } fclose(ind); } char* getFaIndexSeq(FaIndex& faIndex,string seqId) { FaRecord ind=faIndex.record[seqId]; fseek(faIndex.file,ind.offset,0); char* seq=(char*)malloc(ind.length*sizeof(char)); 
fread(seq,sizeof(char),ind.length,faIndex.file); return seq; } char* getMFaIndexSeq(FaIndex& faIndex,string seqId,int protoIndex) { char protoId[20]; sprintf(protoId,"%d",protoIndex); string id=seqId+":"+protoId; FaRecord ind=faIndex.record[id]; fseek(faIndex.file,ind.offset,0); char* seq=(char*)malloc(ind.length*sizeof(char)); fread(seq,sizeof(char),ind.length,faIndex.file); return seq; } lagan20/src/fchaos.c0000644000076500007650000010301710502337063015317 0ustar brudnobrudno00000000000000#include #include #include #include #include #include #include "fchaos.h" #include "skiplist.h" #include "thrtrie.h" #include "global.h" #include "translate.h" #include "filebuffer.h" #define VER_NUM "0.932" #define BLOSUM_FILE "blosum62s.txt" #define BLOSUM_FILE_SIZE 24 #define NUC_FILE "nucmatrix.txt" #define NUC_FILE_SIZE 6 #define MAX2(x,y) ( (x) >= (y) ? (x) : (y) ) #define MIN2(x,y) ( (x) <= (y) ? (x) : (y) ) #define ABS(x) ( ((x) >= (0)) ? (x) : (-x) ) #define WEQ2(x,y,a) (((x)==(a))? 0: ((y)==(a))? 
1:-1) #define MIN(A,B) (A>B)?B:A #define MAX(A,B) (A>B)?A:B typedef struct SeqMatch { LList* myll; int offset; } match; extern int indeces[256]; void remElem(LList* tbf, int i); int verbose = 0; int wordlen = 10; int ndegen = 1; int cutoff = 25; int lookback = 20; int gapfreechunks = 0; int mgaplen = 5; int gappenc = -1; int gappeno = 0 ; int both = 0; int translated = 0; int s1start = 0; int s1end = 0; int s2start = 0; int s2end = 0; int extend = 0; int reScoreCutoff = 0; //int matchsco = 12; //int mismatchsco = -8; int gappenstart = -1500; int gappenext = -50; int dropcutoff = 1500; int substmatrix[256][256]; hll* allhits = 0; sklst* mylist; int gapstart=20; int gapcont=1; char* alpha = "ATCGN"; char* triealpha = "ATCG"; char* protalpha = "PCMH[DE][KR][NQ][ST][ILV][FYW][AG]X*"; char* prottriealpha = "PCMH[DE][KR][NQ][ST][ILV][FYW][AG]"; char direction; FILE* pairfile = 0; char comp(char c) { switch(c) { case 'a': case 'A': return 'T'; case 't': case 'T': return 'A'; case 'c': case 'C': return 'G'; case 'g': case 'G': return 'C'; case 'n': case 'N': return 'N'; default: printf("ERROR, Bad letter to RC: %c\n",c); return -1; } } void revComplement(char* a) { int length = strlen(a); char lft; int i; for (i=0; i < length/2; i++) { lft = a[i]; a[i] = comp(a[length-i-1]); a[length-i-1] = comp(lft); } if (length % 2) a[length/2] = comp(a[length/2]); } void freeSeq (seq* tbf) { free(tbf->name); free(tbf->rptr); free(tbf); } void freeHLL (hll* tbf) { gfc *t = tbf->first; gfc *n; while (t) { n = t->next; free (t); t = n; } free (tbf); } void printHLL(hll* res, seq* query, seq* dbase, int len) { hll* temp; align* myal; gfc* tmpgf; int currx, curry; char *qptr = query->lets, *dptr = dbase->lets; if (direction == '+') { while (res) { if (s1start > 0) { res->seq1start += (s1start-1); res->seq1end += (s1start-1); query->lets = query->rptr; } if (s2start > 0) { res->seq2start += (s2start-1); res->seq2end += (s2start-1); dbase->lets = dbase->rptr; } printf("%s %d %d; %s %d %d; 
score = %f (%c)\n", query->name, res->seq1start+1, res->seq1end+1, dbase->name, res->seq2start+1, res->seq2end+1, res->score,direction); if (verbose) { myal = global(query->lets, res->seq1start, res->seq1end, dbase->lets, res->seq2start, res->seq2end, gapstart, gapcont); printalign(query->lets, res->seq1start, res->seq1end, dbase->lets, res->seq2start, res->seq2end, myal); } if (gapfreechunks) { currx = res->seq1start+1; curry = res->seq2start+1; tmpgf = res->first; while (tmpgf) { if (tmpgf->length) { printf ("%d %d %d %d\n", currx, curry, tmpgf->length, tmpgf->score); currx += tmpgf->length; curry += tmpgf->length; } tmpgf = tmpgf->next; if (!tmpgf) break; if (tmpgf->offset > 0) { curry += tmpgf->offset; } else { currx -= tmpgf->offset; } } } temp = res; res = res->next; freeHLL(temp); } } else { while (res) { if (s1start > 0) { res->seq1start += (s1start-1); res->seq1end += (s1start-1); query->lets = query->rptr; } if (s2start > 0) { res->seq2start += (len-s2end); res->seq2end += (len-s2end); } printf("%s %d %d; %s %d %d; score = %f (%c)\n", query->name, res->seq1start+1, res->seq1end+1, dbase->name, len-(res->seq2start), len - (res->seq2end), res->score, direction); if (verbose) { myal = global(query->lets, res->seq1start, res->seq1end, dbase->lets, res->seq2start, res->seq2end, gapstart, gapcont); printalign(query->lets, res->seq1start, res->seq1end, dbase->lets, res->seq2start, res->seq2end, myal); } if (gapfreechunks) { currx = res->seq1start+1; curry = len - res->seq2start; tmpgf = res->first; while (tmpgf) { if (tmpgf->length) { printf ("%d %d %d %d \n", currx, curry, tmpgf->length, tmpgf->score); currx += tmpgf->length; curry -= tmpgf->length; } tmpgf = tmpgf->next; if (!tmpgf) break; if (tmpgf->offset < 0) { currx -= tmpgf->offset; } else { curry -= tmpgf->offset; } } } temp = res; res = res->next; freeHLL(temp); } } query->lets=qptr; dbase->lets = dptr; } void printList (hll *ptr){ if (ptr){ fprintf (stderr, "(%d %d)=(%d %d) %f\n", ptr->seq1start, 
ptr->seq1end, ptr->seq2start, ptr->seq2end, ptr->score); printList (ptr->next); } } int compare (hll *list1, hll *list2){ return (list1->seq1start < list2->seq1start) || (list1->seq1start == list2->seq1start && list1->seq1end > list2->seq1end); } hll* merge2(hll* list1, hll* list2) { hll *totallist = 0, *temp = 0; if (!list1) return list2; if (!list2) return list1; while (list1 || list2) { if (list1 && (!list2 || compare (list1, list2))){ if (!totallist) totallist = temp = list1; else { temp->next = list1; temp = temp->next; } list1 = list1->next; } else { if (!totallist) totallist = temp = list2; else { temp->next = list2; temp = temp->next; } list2 = list2->next; } } temp->next = 0; return totallist; } hll* findmiddle(hll* mylist) { hll* other = mylist->next; while (other && other->next) { other = other->next->next; mylist = mylist->next; } return mylist; } hll* sortList(hll* mylist) { hll* premid; hll* mid; if (!mylist || !mylist->next) return mylist; premid = findmiddle(mylist); mid = premid->next; premid->next = 0; mylist = sortList(mylist); mid = sortList(mid); return merge2(mylist,mid); } int duplicates(hll* f, hll* s) { return (s->seq2start >= f->seq2start) && (s->seq2end <= f->seq2end); } hll* removeDups(hll* allhits, seq* seq1, seq* seq2) { hll *i, *j, *jprev, *temp; for (i = allhits; i; i = i->next){ jprev = i; for (j = i->next; j && (j->seq2start >= i->seq2end) ; j = j->next){ if (duplicates (i, j) || mergeOverlap (i, j, seq1, seq2)){ jprev->next = j->next; freeHLL (j); j = jprev; } else { jprev = j; } } } allhits = sortList (allhits); for (i = allhits; i; i = i->next){ jprev = i; for (j = i->next; j && (j->seq1start <= i->seq1end) ; j = j->next){ if (duplicates (i, j) || mergeOverlap (i, j, seq1, seq2)){ jprev->next = j->next; freeHLL (j); j = jprev; } else { jprev = j; } } } return allhits; } seq* readfile(FILE* input, int seqnum) { char* res = (char*) malloc(sizeof(char)); int ressize = 1, numread=0; char temp[256]; seq* myseq = (seq*) 
malloc(sizeof(seq)); char currchar; if (feof(input)) return 0; fgets(temp, 255, input); if (temp[0] != '>') { fprintf(stderr, "File is not in FASTA format!!\n"); exit(1); } myseq->name = (char*) malloc((strlen(temp))*sizeof(char)); strcpy(myseq->name, temp+1); *(strchr(myseq->name, '\n')) = 0; currchar = fgetc(input); while ((currchar != '>') && (currchar != EOF)) { if (!isspace(currchar)) { currchar = toupper(currchar); if (!strchr(alpha, currchar)) { fprintf(stderr, "WARNING %c converted to N\n", currchar, alpha); currchar = 'N'; } res[numread++] = currchar; if (numread >= ressize) { res=(char*)realloc(res, sizeof(char)*(ressize*=2)); } } currchar = fgetc(input); } if (currchar == '>') ungetc(currchar, input); res[numread]=0; myseq->rptr = res; if (seqnum == 1) { if (s1start > 0) { res[s1end] = 0; res = &res[s1start-1]; numread = s1end-s1start+1; } } else { if (s2start > 0) { res[s2end] = 0; res = &res[s2start-1]; numread = s2end-s2start+1; } } myseq->lets = res; myseq->numlets = numread; return myseq; } int isin (char* arr, int size, int elem) { while (--size>=0) { if (arr[size] == elem) return 1; } return 0; } int chain(LList* second, int off2, LList* first, int off1, int diff1, int gap, float baseval) { int i, d1=0, d2=0; int diff2 = second->myloc->locs[off2] - first->myloc->locs[off1]; int mindiff; int score=wordlen-second->degleft; gap = abs(gap)*gappenc + gappeno; if (diff2 <= 0 || diff2 >= lookback) return -1; if (diff1 >= wordlen && diff2 >= wordlen) { return score*baseval+gap; } mindiff = MIN(diff1, diff2); /* TODO for (i=second->degleft-1; i >=0; i--) { printf(" %d %d %d \n", second->degloc[i], diff1, diff2); if (!d1 && second->degloc[i] - diff1 <= 0) d1 = 1; if (&d2 && second->degloc[i] - diff2 <= 0) d2 = 1; if (d1 || d2) { break; } } */ return mindiff*baseval+gap; } int tc =0; int wc = 0; inline void findPrev(LList* curr, int position, int offset, float baseval) { int j,k; LList* temp; sle* iterator; float bestscore = 0; LList* bestelem = 0; int 
bestoffset = -1; int doneset = 0; int tempscore, myscore = wordlen - curr->degleft; tc++; iterator = SLfind(mylist, position-curr->myloc->locs[offset]-mgaplen+1); if (iterator) { curr->mysles[offset] = iterator; } if (iterator && iterator->index <= position-curr->myloc->locs[offset]-mgaplen) { iterator = iterator->next[0]; } if (iterator && (iterator->index < position-curr->myloc->locs[offset])) { curr->mysles[offset] = iterator; } while (iterator && (iterator->index < position-curr->myloc->locs[offset]+mgaplen)) { if (iterator->next[0] && (iterator->index < position-curr->myloc->locs[offset]) && (iterator->next[0]->index >= position-curr->myloc->locs[offset])) { curr->mysles[offset] = iterator; } temp = ((match*)iterator->myelem)->myll; k = ((match*)iterator->myelem)->offset; j = position-temp->location; tempscore = chain(curr, offset, temp, k,j, iterator->index - position+curr->myloc->locs[offset], baseval); if (tempscore > 0) { if (temp->scores[k]+tempscore > bestscore) { bestscore = temp->scores[k]+tempscore; bestelem = temp; bestoffset=k; } else { temp->scores[k] = -1; } } /* printf("it = %x next = %x\n", iterator, iterator->next[0]); */ iterator = iterator->next[0]; if (temp->toberemoved[k]) { remElem(temp, k); temp->mysles[k] = 0; } } if (bestelem) { wc++; curr->scores[offset] = bestscore; /* printf("offs = %d, numlocs = %d\n",offset, curr->myloc->numlocs);*/ curr->seq1startpnt[offset] = bestelem->seq1startpnt[bestoffset]; curr->seq2startpnt[offset] = bestelem->seq2startpnt[bestoffset]; curr->myhits[offset].inds1 = (int*) malloc (sizeof(int)*(bestelem->myhits[bestoffset].numind+1)); curr->myhits[offset].inds2 = (int*) malloc (sizeof(int)*(bestelem->myhits[bestoffset].numind+1)); curr->myhits[offset].numind = bestelem->myhits[bestoffset].numind+1; memcpy (curr->myhits[offset].inds2, bestelem->myhits[bestoffset].inds2, bestelem->myhits[bestoffset].numind*sizeof(int)); memcpy (curr->myhits[offset].inds1, bestelem->myhits[bestoffset].inds1, 
bestelem->myhits[bestoffset].numind*sizeof(int)); curr->myhits[offset].inds2[bestelem->myhits[bestoffset].numind] = position; curr->myhits[offset].inds1[bestelem->myhits[bestoffset].numind] = (int) curr->myloc->locs[offset]; } else { curr->scores[offset] = myscore; curr->seq2startpnt[offset] = position; curr->seq1startpnt[offset] = (int)curr->myloc->locs[offset]; curr->myhits[offset].inds1 = (int*) malloc (sizeof(int)); curr->myhits[offset].inds2 = (int*) malloc (sizeof(int)); curr->myhits[offset].inds2[0] = position; curr->myhits[offset].inds1[0] = (int)curr->myloc->locs[offset]; curr->myhits[offset].numind = 1; } } void connectToPrev(LList* curr, int index, float baseval) { int j; curr->scores = (float*) malloc(sizeof(float) * curr->myloc->numlocs); curr->myhits = (phits*) malloc(sizeof(phits) * curr->myloc->numlocs); curr->toberemoved = (char*) malloc(sizeof(char) * curr->myloc->numlocs); curr->seq1startpnt = (int*) malloc(sizeof(int) * curr->myloc->numlocs); curr->seq2startpnt = (int*) malloc(sizeof(int) * curr->myloc->numlocs); curr->seq1endpnt = (int*) malloc(sizeof(int) * curr->myloc->numlocs); curr->seq2endpnt = (int*) malloc(sizeof(int) * curr->myloc->numlocs); curr->mysles = (sle**) malloc(sizeof(sle*) * curr->myloc->numlocs); for (j = 0; j < curr->myloc->numlocs; j++) { curr->toberemoved[j] = 0; curr->myhits[j].numind = 0; curr->scores[j] = 0; curr->seq1startpnt[j] = 0; curr->seq2startpnt[j] = 0; curr->mysles[j] = 0; findPrev(curr,index,j,baseval); } } int doAlgo(TNode* root, seq* query, seq* dbase) { char* currword = dbase->lets; LList** LListArr = (LList**) malloc(sizeof(LList*) * dbase->numlets); LList* temp; match* mattemp; int i = 0, j; float bestscore=-1, baseval; int bestqueryloc=-1, bestdbaseloc=-1, numhits; while (*currword) { if (!(i%10000)) { // fprintf(stderr,"WORKING %d\n",i); } if (*currword == '.') { /*TODO */ } LListArr[i] = temp = getNextWords(root, currword++, ndegen); /*****/ numhits = 1; while (temp){ numhits += temp->myloc->numlocs; 
/*
 * Return the next non-whitespace character of `file`, or 0 when the end of
 * the stream (or a read error) is reached first.
 *
 * The result of fgetc() is kept in an int so that EOF can be told apart
 * from a real character; testing feof() before the read, as the previous
 * version did, only notices the end of file one read too late and handed
 * (char)EOF back to the caller.
 */
char getLetter (FILE *file){
  int ch;

  while ((ch = fgetc (file)) != EOF){
    if (!isspace (ch))
      return (char) ch;
  }
  return 0;
}
/*
 * Print the command-line synopsis and the list of options to stdout.
 * Only the wording is fixed here ("extension", "given substring"); the
 * set of documented options is unchanged.
 */
void usage() {
  printf("usage: \nchaos queryfile dbasefile [options]\n\n");
  printf("Options:\n");
  printf("-p = Peptide sequence [default genomic]\n");
  printf("-v = Verbose mode [default brief]\n");
  printf("-b = Both strands [default forward-only]\n");
  printf("-t = Translated [default off]\n");
  printf("-ext = do BLAST-like extension with given cutoff [default off]\n");
  printf("-wl # = Word Length [default 10 for genomic, 4 for peptide]\n");
  printf("-nd # = Number of Degeneracy [default 1 for genomic, 0 for peptide]\n");
  printf("-co # = score CutOff [default 25]\n");
  printf("-rsc # = Rescoring cutoff [default 0]\n");
  printf("-lb # = LookBack distance [default 20 for genomic, 8 for peptide]\n");
  printf("-gl # = maximum Gap Length [default 5 for genomic, 3 for peptide]\n");
  printf("-gs # = Gap Start penalty [default 0]\n");
  printf("-gc # = Gap Continue penalty [default -1]\n");
  printf("-s1 # # = use the given substring of the query [default whole]\n");
  printf("-s2 # # = use the given substring of the dbase [default whole]\n");
  printf("-pairs pairfile = read \"-s1 # # -s2 # #\" from pairfile [default off]\n\t[This is not fully functional!!!]\n");
  printf("-version = prints the version of this CHAOS\n");
}
s2end-s2start+1; paircnt++; if (paircnt%20 ==19) fprintf(stderr, "done with %d\n", paircnt); } void transloc(hll* myhits, int frseq1, int frseq2, int seq1len, int seq2len) { int temp; while (myhits) { if (frseq1<=2) { myhits->seq1start = myhits->seq1start*3 + frseq1; myhits->seq1end = myhits->seq1end*3 + frseq1; } else { temp = (seq1len - myhits->seq1start)*3 + frseq1%3; myhits->seq1start = (seq1len - myhits->seq1end)*3 + frseq1%3; myhits->seq1end = temp; } if (frseq2<=2) { myhits->seq2start = myhits->seq2start*3 + frseq2; myhits->seq2end = myhits->seq2end*3 + frseq2; } else { temp = (seq2len - myhits->seq2start)*3 + frseq2%3; myhits->seq2start = (seq2len - myhits->seq2end)*3 + frseq2%3; myhits->seq2end = temp; } myhits = myhits->next; } } void doTranslated(FileBuffer query, FileBuffer dbase) { seq *currquery, *currdbase, *temp; seq *queryframes[6], *dbaseframes[6]; char* currword; TNode *roots[6]; int i, j; currquery = FileRead(query, s1start, s1end, VER_FCHAOS); currdbase = FileRead(dbase, s2start, s2end, VER_FCHAOS); if (pairfile) { procPairs(currquery, currdbase); if (!pairfile) { FileClose (query); FileClose (dbase); return; } } do { for (i = 0; i < 6; i++) { queryframes[i] = transSeq(currquery,i); roots[i] = makeTrie(wordlen, triealpha); currword = queryframes[i]->lets; insertString(roots[i],currword); } mylist = makeSkLst(); while (currdbase) { for (i = 0; i < 6; i++) { dbaseframes[i] = transSeq(currdbase,i); } direction = '+'; for (i=0; i < 6; i++) for (j=(i/3)*3; j < (i/3+1)*3; j++) { // fprintf(stderr, "1DOING FRAME %d AGAINST %d\n",i,j); doAlgo(roots[i], queryframes[i], dbaseframes[j]); /****/ allhits = removeDups(allhits, queryframes[i], dbaseframes[j]); transloc(allhits, i, j, queryframes[i]->numlets, dbaseframes[j]->numlets); printHLL(allhits, queryframes[i], dbaseframes[j], currdbase->numlets); allhits = 0; } if (both) { direction = '-'; for (i=0; i < 6; i++) for (j=(i>2)?0:3; j < ((i>2)?3:6); j++) { // fprintf(stderr, "2DOING FRAME %d AGAINST 
%d\n",i,j); doAlgo(roots[i], queryframes[i], dbaseframes[j]); /****/ allhits = removeDups(allhits, queryframes[i], dbaseframes[j]); transloc(allhits, i, j, queryframes[i]->numlets, dbaseframes[j]->numlets); printHLL(allhits, queryframes[i], dbaseframes[j], currdbase->numlets); allhits = 0; } } temp = currdbase; if (!pairfile) freeSeq(currdbase); currdbase = FileRead(dbase, s2start, s2end, VER_FCHAOS); } currdbase = temp; if (pairfile) { procPairs(currquery, currdbase); for (i=0; i < 6; i++) { freeSeq(queryframes[i]); freeTrie(roots[i]); } } } while (pairfile) ; FileClose (query); FileClose (dbase); } int main(int argc, char** argv) { FileBuffer query; FileBuffer dbase; seq *currquery, *currdbase, *temp; char* currword; TNode* root; int i; if (argc < 3) { if (argc == 2) if (!strcmp(argv[1], "-version") || !strcmp(argv[1], "-Version")) { printf("CHAOS version %s\n", VER_NUM); exit(0); } usage(); return 1; } if (!(query = FileOpen(argv[1]))) { printf("couldnt open query file %s\n",argv[1]); usage(); return 2; } if (!(dbase = FileOpen(argv[2]))) { printf("couldnt open dbase file %s\n",argv[2]); usage(); return 2; } paramParse(argc, argv); initLib(); if (translated) { doTranslated(query, dbase); return 0; } currquery = FileRead(query, s1start, s1end, VER_FCHAOS); currdbase = FileRead(dbase, s2start, s2end, VER_FCHAOS); if (pairfile) { procPairs(currquery, currdbase); if (!pairfile) { FileClose (query); FileClose (dbase); return 0; } } do { root = makeTrie(wordlen, triealpha); mylist = makeSkLst(); currword = currquery->lets; insertString(root,currword); while (currdbase) { direction = '+'; doAlgo(root, currquery, currdbase); /***/ allhits = removeDups(allhits, currquery, currdbase); printHLL(allhits, currquery, currdbase, currdbase->numlets); allhits = 0; if (both) { direction = '-'; rc(currdbase); doAlgo(root, currquery, currdbase); /****/ allhits = removeDups(allhits, currquery, currdbase); printHLL(allhits, currquery, currdbase, currdbase->numlets); allhits = 0; } 
temp = currdbase; if (!pairfile) { freeSeq(currdbase); } currdbase = FileRead(dbase, s2start, s2end, VER_FCHAOS); } currdbase = temp; if (pairfile) { procPairs(currquery, currdbase); freeTrie(root); } } while (pairfile) ; FileClose (query); FileClose (dbase); return 0; } void saveScore(LList* final, int index, gfc* first, gfc* last) { hll* myhit = (hll*) malloc(sizeof(hll)); int temp; myhit->score = final->scores[index]; myhit->seq1end = final->seq1endpnt[index]; myhit->seq2end = final->seq2endpnt[index]; myhit->seq1start = final->seq1startpnt[index]; myhit->seq2start = final->seq2startpnt[index]; myhit->last = last; myhit->first = first; myhit->next = allhits; allhits = myhit; } void remElem(LList* tbf, int i) { free(tbf->mysles[i]->myelem); SLremove(mylist, tbf->mysles[i]); } inline int CHmatchscore(unsigned char a, unsigned char b) { return substmatrix[a][b]; /* if (translated) return substmatrix[a][b]; if (a == 'N' || b == 'N' || a == 'X' || b == 'X') return 0; if ((a == '*' || b == '*') && a != b) return -50; if (indeces[a] == indeces[b]) return matchsco; return mismatchsco; */ } int extendBLAST(int s1i, int s2i, char* s1, char* s2, int s1l, int s2l, int dir) { int peak=0, peakloc = 0, currscore=0, i = 1; while (peak - currscore < dropcutoff) { if (s1i+dir*i < 0 || s2i+dir*i < 0 || !s1[s1i+dir*i] || !s2[s2i+dir*i] || s1i+dir*i >= s1l || s2i+dir*i >= s2l) break; currscore += CHmatchscore (s1[s1i+dir*i], s2[s2i+dir*i]); // fprintf(stderr, "%d(%c %c) ", currscore, s1[s1i+dir*i], s2[s2i+dir*i]); if (currscore > peak) { peak = currscore; peakloc = i; } i++; } // fprintf(stderr, "got to %d, score %d(%d)\n", i, currscore, peak); return peakloc; } int extendMerge(int s1l, int s2l, int s1r, int s2r, char* s1, char* s2, int* dir) { int length, i; int *s1arr, *s2arr, bestscore=-9999999, bestloc=0; // HACK if (s1l < 0){ int err = -s1l; s1l += err; s2l += err; } if (s2l < 0){ int err = -s2l; s1l += err; s2l += err; } length = MIN2(s1r-s1l, s2r-s2l); // 
/*
 * Sum CHmatchscore() over `len` aligned letter pairs, starting at s1[s1l]
 * and s2[s2l].
 *
 * A negative start index is handled by sliding both starts forward by the
 * same amount until it reaches zero and shortening the run accordingly
 * (first for s1l, then for s2l).  A run that ends up empty or negative
 * scores 0.
 */
int reScore(int s1l, int s2l, int len, char* s1, char* s2) {
  int sum = 0;
  int remaining;

  // HACK
  if (s1l < 0) {
    int shift = -s1l;
    len -= shift;
    s2l += shift;
    s1l += shift;
  }
  if (s2l < 0) {
    int shift = -s2l;
    len -= shift;
    s1l += shift;
    s2l += shift;
  }

  for (remaining = len; remaining > 0; remaining--, s1l++, s2l++)
    sum += CHmatchscore(s1[s1l], s2[s2l]);

  return sum;
}
wordlen - 1 + temp; myscore = reScore(ts1, ts2, te1-ts1+1, s1, s2); totscore += myscore; totscore += (gappenstart + gappenext * ABS(offset)); (*mygf)->length = te1-ts1+1; (*mygf)->score = myscore; (*mygf)->next = (gfc*) malloc (sizeof (gfc)); (*mygf) = (*mygf)->next; (*mygf)->offset = offset; if (dir) { ts1 = te1+ABS(offset)+1; ts2 = te2+1; } else { ts2 = te2+ABS(offset)+1; ts1 = te1+1; } } } temp = 0; if (extend) { temp = extendBLAST(tbf->myhits[index].inds1[i]+wordlen-1, tbf->myhits[index].inds2[i]+wordlen-1, s1, s2, s1l, s2l, 1); } myscore = reScore(ts1, ts2, tbf->myhits[index].inds1[i]+wordlen-ts1+temp, s1, s2); (*mygf)->length = tbf->myhits[index].inds1[i]+wordlen-ts1+temp; (*mygf)->score = myscore; (*mygf)->next = 0; totscore += myscore; tbf->scores[index] = totscore; tbf->seq1endpnt[index] = tbf->myhits[index].inds1[i]+wordlen-1 + temp; tbf->seq2endpnt[index] = tbf->myhits[index].inds2[i]+wordlen-1 + temp; } LList* savenfreeLList(LList* tbf, seq* seq1, seq* seq2) { int i,j; LList* next; gfc *first, *last; if (!tbf) return 0; for (i=0; i < tbf->myloc->numlocs; i++) { if (tbf->scores[i] > cutoff) { tbf->seq1endpnt[i] = (int) tbf->myloc->locs[i] + wordlen - 1; tbf->seq2endpnt[i] = tbf->location +wordlen - 1; reScoreHit(tbf, i, seq1->lets, seq2->lets, seq1->numlets, seq2->numlets, &first, &last); j = tbf->scores[i]; if (tbf->scores[i] > reScoreCutoff){ saveScore(tbf,i, first, last); } } } for (i=0; i < tbf->myloc->numlocs; i++) { if (tbf->mysles[i]) { remElem(tbf,i); } free (tbf->myhits[i].inds1); free (tbf->myhits[i].inds2); } next = tbf->next; free (tbf->myhits); free (tbf->scores); free (tbf->mysles); free (tbf->seq1startpnt); free (tbf->seq2startpnt); free (tbf->seq1endpnt); free (tbf->seq2endpnt); free (tbf->toberemoved); free (tbf); return savenfreeLList(next, seq1, seq2); } int mergeOverlap(hll* h1, hll* h2, seq* seq1, seq* seq2) { int offset, myscore, nextscore, newscore, bestloc, dir, gappen; int s1l, s2l, s1r, s2r, s1n, s2n; // return 0; // fprintf 
(stderr, "(%d %d) (%d %d)", h1->seq1end, h1->seq2end, h2->seq1start, h2->seq2start); if ((h1->seq2end < h2->seq2start) && (h1->seq1end < h2->seq1start)) { // fprintf (stderr, " no\n"); return 0; } offset = (h1->seq1end-h1->seq2end) - (h2->seq1start-h2->seq2start); if (ABS(offset) > mgaplen) return 0; gappen = gappenstart + gappenext * ABS(offset); if ((-gappen) > h1-> score || (-gappen) > h2->score) { // fprintf (stderr, " gap\n"); return 0; } s1l = h1->seq1end - h1->last->length; s2l = h1->seq2end - h1->last->length; s1r = h2->seq1start + h2->first->length; s2r = h2->seq2start + h2->first->length; if (s1r <= s1l || s2r <= s2l) { // fprintf (stderr, " swap\n"); return 0; } if (offset) { bestloc = extendMerge(s1l, s2l, s1r, s2r, seq1->lets, seq2->lets, &dir); myscore = reScore(s1l, s2l, bestloc, seq1->lets, seq2->lets); if (dir) { s1n = s1l + bestloc + ABS(offset)+1; s2n = s2l + bestloc + 1; } else { s2n = s2l + bestloc + ABS(offset)+1; s1n = s1l + bestloc + 1; } nextscore = reScore(s1n, s2n, s2r - s2n, seq1->lets, seq2->lets); // fprintf (stderr, " %d %d %d\n", bestloc, myscore, nextscore); // fprintf (stderr, "a %d %d %d\n", s1l, s1n, s1r); newscore = h1->score + h2->score - (h2->first->score - nextscore) - (h1->last->score - myscore) + gappen; if (newscore < h1-> score || newscore < h2->score) { // fprintf (stderr, " score1\n"); return 0; } h1->score = newscore; h1->last->length = bestloc; h2->first->score = nextscore; h2->first->offset = offset; h2->first->length = s2r - s2n; h1->last->score = myscore; h1->last->next = h2->first; if (h1->last->next) h1->last = h2->last; h2->first = 0; } else { myscore = reScore(s1l, s2l, s1r-s1l, seq1->lets, seq2->lets); newscore = h1->score + h2->score - (h1->last->score - myscore) + gappen; if (newscore < h1-> score || newscore < h2->score) { // fprintf (stderr, " score2\n"); return 0; } h1->score = newscore; h1->last->score = myscore; h1->last->next = h2->first->next; h1->last->length = s1r - s1l; if (h1->last->next) h1->last 
= h2->last; h2->first->next = 0; } h1->seq2end = h2->seq2end; h1->seq1end = h2->seq1end; return 1; } lagan20/src/fchaos.h0000644000076500007650000000111310502337063015316 0ustar brudnobrudno00000000000000#ifndef __FCHAOS_H #define __FCHAOS_H typedef struct GapFreeChunkList { int offset; int length; int score; struct GapFreeChunkList *next; } gfc; typedef struct HitLocationList { int seq1start; int seq2start; int seq1end; int seq2end; float score; gfc* first; gfc* last; struct HitLocationList *next; char dirty; } hll; typedef struct Sequence { char* lets; int numlets, numsiglets; int leftbound, rightbound; char* name; char* rptr; } seq; hll* fchaos(int argc, char** argv); int mergeOverlap(hll* h1, hll* h2, seq* seq1, seq* seq2); #endif lagan20/src/filebuffer.c0000644000076500007650000001060710502337063016167 0ustar brudnobrudno00000000000000#include "filebuffer.h" #include #include #include #include #ifdef CHAOS__FLAG char* alphabet = "ATCGNPCMHDEKRQSILVFYWX*"; #else char* alphabet = "ATCGN-."; #endif FileBuffer FileOpen (const char *path){ FileBuffer buf; FILE *data = fopen (path, "r"); if (!data) return NULL; buf = (FileBuffer) malloc (sizeof (struct FileBufferImplementation)); if (!buf) return NULL; buf->filename = (char*) path; buf->head = NULL; buf->tail = NULL; buf->startpos = 0; //100000000; buf->endpos = 100000000; //0; //buf->pos = BUFFER_SIZE; //buf->len = BUFFER_SIZE; buf->data = data; return buf; } void FileUpdate (FileBuffer buf){ if (buf->head >= buf->tail){ buf->tail = buf->buffer + fread (buf->buffer, sizeof(char), BUFFER_SIZE, buf->data); buf->head = buf->buffer; } } int FileEOF (FileBuffer buf){ FileUpdate (buf); return buf->head >= buf->tail && feof (buf->data); } void FileGetS (char *buffer, int length, FileBuffer buf){ int a; for (a = 0; a < length && !FileEOF (buf); a++){ buffer[a] = FilePeekC (buf); buf->head++; if (a + 1 < length && buffer[a] == '\n'){ buffer[a + 1] = '\0'; break; } } } char *FileGetLine (FileBuffer buf){ int a = 0, length = 
1; char *buffer = (char *) malloc (1 * sizeof(char)); assert (buffer); while (!FileEOF (buf)){ buffer[a] = FilePeekC (buf); buf->head++; if (buffer[a] == '\n'){ buffer[a] = '\0'; break; } a++; if (a == length){ buffer = (char *) realloc (buffer, (length *= 2) * sizeof(char)); assert (buffer); } } return buffer; } void FilePopC (FileBuffer buf){ buf->head++; } char FilePeekC (FileBuffer buf){ FileUpdate (buf); return *(buf->head); // return buf->buffer[buf->pos]; } void FileClose (FileBuffer buf){ fclose (buf->data); free (buf); } seq* FileRead (FileBuffer buf, int start, int finish, int version){ char* res = (char*) malloc(sizeof(char)); int ressize = 1, numread = 0, i, numNs = 0; char *tempname, temp[256], currchar, *curr, *resend; seq* myseq = (seq*) malloc(sizeof(seq)); if (FileEOF(buf)) return 0; if (start == 1 && finish == 0) { start = buf->startpos; finish = buf->endpos; if (start == 0) start = 1; } tempname = FileGetLine (buf); if (tempname[0] != '>') { fprintf(stderr, "File is not in FASTA format!!\n"); exit(1); } myseq->name = (char*) malloc((strlen(tempname))*sizeof(char)); strcpy(myseq->name, tempname+1); if (strchr(myseq->name, '\n')) *(char *)(strchr(myseq->name, '\n')) = 0; free (tempname); for (i = 0; i < 256; i++){ temp[i] = (strchr (alphabet, toupper ((char) i)) != 0) ? 
toupper((char) i) : 'N'; } FileUpdate (buf); curr = res; resend = res + ressize; if (version == VER_ORDER || version == VER_MLAGAN){ ressize = 2; numread = 1; if (version == VER_ORDER) res[0] = 0; else res[0] = 'N'; curr++; } while (buf->head < buf->tail || !feof (buf->data)){ while (buf->head < buf->tail){ currchar = *(buf->head); if (currchar == '>') goto outer; if (currchar != ' ' && currchar != '\n' && currchar != '\r' && currchar != '\t' && currchar != '\t' && currchar != '\v') { if (currchar == 'N') numNs++; *curr++ = temp[(int) currchar]; if (curr >= resend) { numread = curr - res; res = (char *) realloc (res, sizeof(char) * (ressize *= 2)); curr = res + numread; resend = res + ressize; } } buf->head++; } buf->tail = buf->buffer + fread (buf->buffer, sizeof(char), BUFFER_SIZE, buf->data); buf->head = buf->buffer; } outer: numread = curr - res; res[numread]=0; myseq->rptr = res; if (version == VER_FCHAOS){ if (start > 0) { res[finish] = 0; res = &res[start-1]; numread = finish-start+1; } myseq->numlets = numread; } else if (version == VER_ORDER){ if (start > 0){ res = &res[start-1]; res[0] = 0; res[finish-start+2] = 0; numread = finish-start+2; } myseq->numlets = numread-1; } else if (version == VER_MLAGAN){ if (start > 0 || finish > 0) { res[finish] = 0; res = &res[start-1]; numread = finish-start+1; } myseq->numlets = numread; myseq->leftbound = start; myseq->rightbound = finish; } myseq->numsiglets = numread - numNs; myseq->lets = res; return myseq; } lagan20/src/filebuffer.h0000644000076500007650000000135710502337063016176 0ustar brudnobrudno00000000000000#ifndef __FILEBUFFER_H #define __FILEBUFFER_H #include #ifndef MULTIAL__FLAG #include "fchaos.h" #else #include "multial.h" #endif #define BUFFER_SIZE 1048576 #define VER_FCHAOS 0 #define VER_ORDER 1 #define VER_MLAGAN 2 struct FileBufferImplementation { FILE *data; char* filename; char buffer[BUFFER_SIZE]; char *head, *tail; int startpos, endpos; // int pos, len; }; typedef struct 
/*
 * Reverse the first `length` characters of `a` in place.
 * Nothing happens when `length` is 1 or less.
 */
void reverse (char* a, int length) {
  int lo = 0;
  int hi = length - 1;

  while (lo < hi) {
    char held = a[hi];
    a[hi] = a[lo];
    a[lo] = held;
    lo++;
    hi--;
  }
}
MAX2(M[mm*i+(j-1)] - gapopen, O[mm*i+(j-1)] - gapext); M[mm*i + j] = MAX3(M[mm*(i-1)+(j-1)],N[mm*(i-1)+(j-1)],O[mm*(i-1)+(j-1)]) + matchscore(seq1[start1+i], seq2[start2+j]); } } for ( k = 1; k <= end2-start2; k++) { for (j = k, i = end1-start1; (i>0) && (j <= end2-start2); j++, i--) { N[mm*i + j] = MAX2(M[mm*(i-1)+j] - gapopen, N[mm*(i-1)+j] - gapext); O[mm*i + j] = MAX2(M[mm*i+(j-1)] - gapopen, O[mm*i+(j-1)] - gapext); M[mm*i + j] = MAX3(M[mm*(i-1)+(j-1)],N[mm*(i-1)+(j-1)],O[mm*(i-1)+(j-1)]) + matchscore(seq1[start1+i], seq2[start2+j]); } } i = end1-start1; j = end2-start2; c = 0; result->score = MAX3 ( M[mm*(i)+(j)], N[mm*(i)+(j)], O[mm*(i)+(j)]); while(i >= 0 && j >= 0) { if (!i) { almt[c++] = ismatch(seq1[start1], seq2[start2+j]); for ( j = j -1; j >=0; j--,c++) { lastdiag = 0; almt[c] = DELETION; } } else if (!j) { almt[c++] = ismatch(seq1[start1+i], seq2[start2]); for ( i = i -1; i >=0; i--,c++) { almt[c] = INSERTION; lastdiag = 0; } } else { if (!lastdiag) { M[mm*i+j] = M[mm*i+j] - gapopen; N[mm*i+j] = N[mm*i+j] - gapext; O[mm*i+j] = O[mm*i+j] - gapext; } temp = MAX3 ( M[mm*(i)+(j)], N[mm*(i)+(j)], O[mm*(i)+(j)]); if (temp == N[mm*(i)+(j)]) { lastdiag = 0; almt[c++] = INSERTION; i--; } else if (temp == O[mm*(i)+(j)]) { lastdiag = 0; almt[c++] = DELETION; j--; } else if (temp == M[mm*(i)+(j)]) { lastdiag = 1; almt[c++] = ismatch(seq1[start1+i], seq2[start2+j]); i--; j--; } } } free(M); free(N); free(O); result->algnlen = c; reverse(almt,c); result->algn = almt; return result; } int printalign(char* seq1, int start1, int end1, char* seq2, int start2, int end2, align* myalign) { int s1=start1, s2=start2, c, k; int nm=0, nga=0, ngb=0, nlets=0; int hasst=0; for (c = 0; c < myalign->algnlen; c = c + 60) { for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] != DELETION) printf("%c", seq1[s1++]); else { printf("-"); if (hasst) nga++; } } printf("\n"); for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] == 
1) { printf(":"); nm++; nlets++; hasst = 1; } else { printf(" "); if (hasst) nlets++; } } printf("\n"); for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] != INSERTION) printf("%c", seq2[s2++]); else { printf("-"); if (hasst) ngb++; } } printf("\n\n"); } printf("score = %d, nmatches = %d, nga=%d, ngb=%d nletters=%d, perc = %f\n", myalign->score,nm,nga,ngb,nlets,(float)nm/(float)nlets); printf("\n"); } lagan20/src/global.h0000644000076500007650000000053010502337063015315 0ustar brudnobrudno00000000000000#define INSERTION 2 #define DELETION 3 typedef struct align_res { int score; int algnlen; char* algn; } align; align* global(char* seq1, int start1, int end1, char* seq2, int start2, int end2, int gapstart, int gapcont); int printalign(char* seq1, int start1, int end1, char* seq2, int start2, int end2, align* myalign); lagan20/src/glocal/0000755000076500007650000000000010502546662015156 5ustar brudnobrudno00000000000000lagan20/src/glocal/default.score0000755000076500007650000000026710502337063017640 0ustar brudnobrudno00000000000000{+R+;-L-}{0 0.02 0 0;40000 0 0 0} {+R-;-L+}{3000 0.02 0.1 0;40000 0 0 0} {-R+;+L-}{7000 0.02 0.5 0;40000 0 0 0} {+L+;-R-}{7000 0.02 0.5 0;40000 0 0 0} {+U+;+U-;-U+;-U-}{30000 0 0 0} lagan20/src/glocal/glocal.cpp0000755000076500007650000001623410502337063017125 0ustar brudnobrudno00000000000000#include #include #include bool seq1StartCompare(const Fragment &f1, const Fragment &f2) { return f1.seq1Start < f2.seq1Start; } //vectors that would be needed globally vector fragments; vectorstartPoints; vectorendPoints; long long int numFragments; InterPoint inter; /*SLAGANCHANGE This has to change*/ RI RI_regions[1<<(UPSTRANDBITS+DOWNSTRANDBITS+RELPOSBITS)]; LI LI_regions[1<<(UPSTRANDBITS+DOWNSTRANDBITS+RELPOSBITS)]; vector scoreFunctions[1<<(UPSTRANDBITS+DOWNSTRANDBITS+RELPOSBITS)]; Name allNames; extern Fragment LI_dummy; Fragment * unrelatedFrag; Fragment *max_score_index; float max_score; int main(int, char 
**argv) { long long int nextEndRow,nextStartRow, nextInterPointRow; long long int i; Point intersectionPoint; numFragments = readInput(argv[1]); findAllNames( numFragments); decideContigBase(); storeIterators(numFragments); initScoreFunctionPointers(argv[2]); unrelatedFrag = &LI_dummy; /*SLAGANCHANGE need a LI, RI pointer array and init */ /*SLAGANCHANGE:: Need score function init */ if (DEBUG) { fprintf(stderr,"Numfrg::%lld",numFragments); } max_score_index=NULL; max_score =-INF; long long int break_flag =0; createPointLists(numFragments); // printFragmentsInPointListOrder(numFragments); // exit(0); //The initial Row upto which startPointHandler goes nextEndRow = endPoints[0].seq1; nextStartRow = startPoints[0].seq1; for (i=0;i<1<first; nextInterPointRow = intersectionPoint.seq1; if (DEBUG) { fprintf(stderr,"\nHERE"); } } if (nextStartRow <= nextEndRow) { //CHANGE HERE if (nextStartRowstrand; relPos = startPoints[current].seq2 > 0 ? RIGHT:LEFT; upStrand = POSITIVE; possibleCase = downStrand << DOWNSTRANDSHIFT | upStrand <back == NULL) { if (DEBUG) { fprintf(stderr, "\n The fragment did not chain!"); } // exit(1); } else if (DEBUG) { fprintf(stderr, "Score for the current fragment is::%f", startPoints[current].frag->totalScore); fprintf(stderr, "Score for the owner fragment is::%f", startPoints[current].frag->back->totalScore); } if (startPoints[current].frag->totalScore > max_score) { max_score = startPoints[current].frag->totalScore; max_score_index = startPoints[current].frag ; } current++; if (DEBUG) { fprintf(stderr,"\ncurrent fragment is %lld",current); } if (current>=2*numFragments) { return INF; } } return startPoints[current].seq1; } //takes as arguements the start row number and the end row number and processes all the rows //This would usually have to find the case long long int endPointHandler() { static long long int current=0; long long int current_seq1= endPoints[current].seq1; if (DEBUG) { fprintf(stderr,"\nEnd PointHandler"); } /*SLAGANCHANGE:: 
There is going to be a commit to 4 strucures depending on the strand, loop with continue*/ /*SLAGANCHANGE:: find the best scoring fragment in the current row and update the best so far at the end*/ while (endPoints[current].seq1 == current_seq1) { long long int upStrand, downStrand, relPos, possibleCase; //MUKFIXME: This sends the highest scoring one into the leftinfluence machinery while (current<2*numFragments-1 &&( endPoints[current].seq1== endPoints[current+1].seq1) && (endPoints[current+1].seq2 == endPoints[current].seq2)) { if ((endPoints[current].frag->totalScore) > (endPoints[current+1].frag->totalScore)) { Fragment * temp; temp=endPoints[current+1].frag; endPoints[current+1].frag=endPoints[current].frag; endPoints[current].frag =temp; } current++; } /* if( current>1 &&(endPoints[current].seq1== endPoints[current-1].seq1) && (endPoints[current-1].seq2 == endPoints[current].seq2)) { current++; continue; } */ upStrand = endPoints[current].frag->strand; // This works because POSITIVE and NEGATIVE are 0 and 1 // This works because LEFT and RIGHT are 0 and 1 for (downStrand=0;downStrand<2;downStrand++) { for (relPos=0;relPos<2;relPos++) { possibleCase = downStrand << DOWNSTRANDSHIFT | upStrand <totalScore > unrelatedFrag->totalScore) unrelatedFrag = endPoints[current].frag; current++; } return endPoints[current].seq1; } void intersectionPointHandler() { long long int current_seq1; Point p,curr; p=inter.begin()->first; current_seq1=p.seq1; if (DEBUG) { fprintf(stderr,"\nIntersection PointHandler"); } do { // printState(&LI_regions[0]); HandleOneIntersectionPoint(); //printState(&LI_regions[0]); p=inter.begin()->first; current_seq1=p.seq1; } while (current_seq1==curr.seq1); } lagan20/src/glocal/glocal.h0000755000076500007650000000071410502337063016566 0ustar brudnobrudno00000000000000#ifndef GLOCAL #define GLOCAL #define DEBUG 1 #ifndef LLONG_MAX // limits.h entries from ISO C99 #define LLONG_MAX 9223372036854775807LL #define LLONG_MIN (-LLONG_MAX - 1LL) #endif 
#include #include #include #include #include long long int startPointHandler(); long long int endPointHandler(); float fragmentSetScore(Fragment * current,Fragment *owner); void intersectionPointHandler(); #endif lagan20/src/glocal/io.cpp0000755000076500007650000001430510502337063016270 0ustar brudnobrudno00000000000000#include #include #include #include extern vector fragments; extern vector startPoints; extern vector endPoints; extern Name allNames; bool PointCompare(const Point &f1, const Point &f2) { if (f1.seq1 < f2.seq1) { return (f1.seq1 < f2.seq1); } else if (f1.seq1 == f2.seq1) { return (f1.seq2 < f2.seq2); } else { return (f1.seq1 < f2.seq1); } } //internal function that i dont need to care about. char* rolltonum(char* str) { char *got1 = 0, *got2 = 0; long long int in = 0, i = 0; while (1) { if (str[i] == 0) { break; } if (str[i] == ';' && got1 && got2) { return got1; } if (isdigit(str[i])) { if (!in && (!i || isspace(str[i-1]))) { if (got1) { got2 = &str[i]; } else { got1 = &str[i]; } in = 1; } } else if (in && isspace(str[i])) { if (got2) { got1 = got2; got2 = 0; in = 0; } in = 0; } else { got1 = got2 = NULL; } i++; } return &str[i]; } //reads one line of input at a time. 
long long int getline(FILE *infile, hll *tt) { char temp[1024]; char* help; long long int z; int h; fgets(temp, 1024, infile); sscanf(temp, "%s", tt->seq1Name); help = rolltonum(temp); z = sscanf(help, "%lld %lld;%n", &tt->seq1start, &tt->seq1end, &h); if (z < 2) { return 0; } sscanf(help+h, "%s", tt->seq2Name); help = rolltonum(help + h); if (sscanf(help, "%lld %lld; score = %f (%c)\n", &tt->seq2start, &tt->seq2end, &tt->score, &tt->strand)<3) { return 0; } else { return 1; } } void printFragment ( Fragment * curfrag ) { if (curfrag == NULL) { printf("done"); return; } else if (curfrag->score == -1) { return; } // TODO: remove space after s2 and check supermap sorts and regexes printf("(%lld %lld)=(%lld %lld) %f %c [%f] s1:%s s2: %s\n", curfrag->seq1Start, curfrag->seq1End, curfrag->seq2Start-curfrag->base, curfrag->seq2End-curfrag->base, curfrag->score, (curfrag->strand==POSITIVE)?'+':'-', curfrag->totalScore, curfrag->seq1Name, curfrag->seq2Name ); } void printAllFragments(long long int numFragments) { long long int i; for (i=0; iback; } return 0; } void swap(long long int *a, long long int *b) { long long int temp; temp = *a; *a = *b; *b = temp; } // initialises the parameters for a fragment. // note the swap at the end of this function. Fragment createFragment(hll *temp) { Fragment frag; frag.seq1Start = temp->seq1start; frag.seq1End = temp->seq1end; frag.seq2Start = temp->seq2start; frag.seq2End = temp->seq2end; strcpy(frag.seq1Name, temp->seq1Name); strcpy(frag.seq2Name, temp->seq2Name); if (temp->strand == '+') { frag.strand = POSITIVE; } else { frag.strand = NEGATIVE; } frag.score = temp->score; frag.back = NULL; frag.totalScore = -1; frag.deleted = FALSE; if (frag.seq1Start > frag.seq1End) { swap(&(frag.seq1Start), &(frag.seq1End)); } return frag; } // reads the input file and returns the number of fragments read. 
long long int readInput(char * fileName) { hll tempInput; FILE * fp; long long int i=0; char line[1024]; unsigned long long int line_count = 0; fp = fopen(fileName, "r"); if (!fp) { printf("SLAGAN: Error: Could not open file '%s'\n", fileName); exit(0); } else if (feof(fp)) { printf("SLAGAN: Error: Empty file %s\n", fileName); exit(0); } // Count the number of lines in the file while (fgets(line, 1023, fp)) { line_count++; } rewind(fp); fragments.reserve(line_count); while (!feof(fp)) { while (!feof(fp) && !getline(fp, &tempInput)); if (feof(fp)) { break; } // ignoring the low scoring fragments ? if (tempInput.score < CUTOFF ) { continue; } //createfragment fragments.push_back(createFragment(&tempInput)); i++; } return i; } void createPointLists(long long int numFragments) { long long int i; Point startPoint, endPoint; //SLAGANCHANGE:: Push -seq2,seq1 on the start list as well. for (i=0; ifragments[i].seq2End ? fragments[i].seq2Start : fragments[i].seq2End; currName = allNames.find(fragments[i].seq2Name); if (currName != allNames.end()) { if (currName->second < size) { currName->second = size; } } else { allNames[fragments[i].seq2Name] = size; numContigs ++; } } if (DEBUG) { fprintf(stderr, "The number of contigs is %lld",numContigs); } } void decideContigBase() { Name::iterator currName; long long int offset =0; long long int temp; for (currName=allNames.begin(); currName!=allNames.end(); currName++) { temp = currName->second; currName->second = offset; offset += (10 + temp); } } void storeIterators(long long int numFragments) { long long int i; for (i=0; isecond; fragments[i].seq2End += (fragments[i].nameIter)->second; fragments[i].base = (fragments[i].nameIter)->second; } } lagan20/src/glocal/io.h0000755000076500007650000000104010502337063015725 0ustar brudnobrudno00000000000000#ifndef IO #define IO #include #include #include #include #include #include long long int printChain(Fragment *current); long long int readInput(char * fileName); void printAllFragments( 
long long int numFragments); void createPointLists(long long int numFragments); void printPointLists(long long int numFragments); void printFragment ( Fragment * curfrag ); void findAllNames(long long int numFragments); void storeIterators(long long int numFragments); void decideContigBase(); #endif lagan20/src/glocal/leftinfluence.cpp0000755000076500007650000005210410502337063020503 0ustar brudnobrudno00000000000000#include Fragment LI_dummy; // Returns the fragment who is the owner of the region in which the current point is Owner::iterator LILookUpOwnerIterator(LI * LeftInfluence, long long int seq1, long long int seq2) { CBound::iterator citer; DBound::iterator diter; citer = (LeftInfluence->c).lower_bound(seq2); if ((LeftInfluence->c).end() == (LeftInfluence->c).begin() || (citer == (LeftInfluence->c).begin())) { return (LeftInfluence->o).end(); } citer--; diter = (LeftInfluence->d).upper_bound(seq2 - seq1); if (diter == (LeftInfluence->d).begin()) { return citer->second; } diter--; if ((citer->first - diter->first) > seq1) { return citer->second; } else { return diter->second; } } Fragment * LILookUpOwnerEnd(LI * LeftInfluence,Fragment * current) { Owner::iterator own = LILookUpOwnerIterator(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag)); if (own == (LeftInfluence->o).end()) { return &LI_dummy; } else { return *own; } } Fragment * LILookUpOwnerStart(LI * LeftInfluence, Fragment * current) { Owner::iterator own = LILookUpOwnerIterator(LeftInfluence, current->seq1Start, current->getSeq2Start(LeftInfluence->reflectFlag)); if (own == (LeftInfluence->o).end()) { return &LI_dummy; } else { return *own; } } // Returns the column boundary before the current point, if there is none it returns end CBound::iterator LICColumn(LI * LeftInfluence, long long int /* seq1 */, long long int seq2) { CBound::iterator citer; citer = (LeftInfluence->c).lower_bound(seq2); //should not decrement, also means that the point is before all the column 
boundaries. //FIX #2 if(citer == (LeftInfluence->c).begin()) if ((LeftInfluence->c).end() == (LeftInfluence->c).begin() || (citer == (LeftInfluence->c).begin())) { return (LeftInfluence->c).end(); } else { citer--; return citer; } } Fragment * LICOwner(LI * LeftInfluence, long long int seq1, long long int seq2) { CBound::iterator citer; citer = LICColumn(LeftInfluence, seq1, seq2); if (citer == (LeftInfluence->c).end()) { return &LI_dummy; } else { return *(citer->second); } } Fragment * LIDOwner(LI * LeftInfluence, long long int seq1, long long int seq2) { DBound::iterator diter; diter = LIDDiagonal(LeftInfluence, seq1, seq2); if (diter == (LeftInfluence->d).end()) { return &LI_dummy; } else { return *(diter->second); } } //returns the diagonal boundary, or end if all the point is before all the diagonal boundaries DBound::iterator LIDDiagonal(LI * LeftInfluence, long long int seq1, long long int seq2) { DBound::iterator diter; diter = (LeftInfluence->d).upper_bound(seq2-seq1); if ((LeftInfluence->d).end() == (LeftInfluence->d).begin() || diter == (LeftInfluence->d).begin()) { return (LeftInfluence->d).end(); } else { diter--; return diter; } } // this function should never get called with the LI dummy // can the scores become negative and how do we handle this? 
float LILookUpScore(LI * LeftInfluence, Fragment * current) { Fragment * owner = LILookUpOwnerStart(LeftInfluence, current); if (owner==NULL) { fprintf(stderr,"Owner NULL in call LILookUpScore"); exit(0); } if (owner->score == -1) { //MUKCHECK return -1; } else { return scoreAll(owner,current,LeftInfluence->scoreIndex); } } void InitLI(LI * LeftInfluence, long long int scoreIndex) { LeftInfluence->scoreIndex = scoreIndex; if (((scoreIndex >> RELPOSSHIFT) & 1) == LEFT) { LeftInfluence->reflectFlag = TRUE; } else { LeftInfluence->reflectFlag = FALSE; } LI_dummy.score = -1; LI_dummy.totalScore = 0; LI_dummy.back = NULL; //there will be a list of structures to insert this into (LeftInfluence->o).insert((LeftInfluence->o).begin(), &LI_dummy); } long long int LI_Winner(LI * LeftInfluence, Fragment * first, Fragment * second) { Fragment dummy; if (first->score == -1) { return FALSE; } if (second->score == -1) { return TRUE; } dummy.seq1Start = max(first->seq1End, second->seq1End) + 2; dummy.seq2Start = max(first->getSeq2End(LeftInfluence->reflectFlag), second->getSeq2End(LeftInfluence->reflectFlag)) + 1; if (first->getSeq2End(LeftInfluence->reflectFlag) > second->getSeq2End(LeftInfluence->reflectFlag)) { dummy.nameIter = first->nameIter; } else { dummy.nameIter = second->nameIter; } if (scoreAll(first, &dummy, LeftInfluence->scoreIndex) >= scoreAll(second, &dummy, LeftInfluence->scoreIndex)) { return TRUE; } else { return FALSE; } } long long int LICommitPoint(LI * LeftInfluence, Fragment * current) { Owner::iterator cowner, ownerIter; Fragment * owner; CBound::iterator citer; DBound::iterator diter; long long int colFlag; ownerIter = LILookUpOwnerIterator(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag)); citer = LICColumn(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag)); diter = LIDDiagonal(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag)); owner = 
LILookUpOwnerEnd(LeftInfluence, current); if (citer == (LeftInfluence->c).end()) { colFlag = TRUE; } else if (diter == (LeftInfluence->d).end()) { colFlag = TRUE; } else { cowner = citer->second; if (cowner == ownerIter) { colFlag = TRUE; } else { colFlag = FALSE; } } if (LI_Winner(LeftInfluence, owner, current)) { return FALSE; } if (colFlag) { return LI_CommitColumnOwner(LeftInfluence, current, owner); } else { return LI_CommitDiagonalOwner(LeftInfluence, current, owner); } } Owner::iterator LI_OwnerInsertAfter(LI * LeftInfluence, Owner::iterator current, Fragment * curfrag) { current++; return (LeftInfluence->o).insert(current, curfrag); } long long int LI_CommitDiagonalOwner(LI * LeftInfluence, Fragment * current, Fragment * owner) { CBound::iterator current_column, next_column; DBound::iterator current_diagonal, prevDiag; DInter::iterator current_diag_inter, my_diag_inter, prevDiagInter; CInter::iterator my_col_inter, next_column_inter, colInter; Owner::iterator own, tempowner; //searching for the next column to switch on current_column = LICColumn(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag)); current_diagonal = LIDDiagonal(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag)); current_diag_inter = (LeftInfluence->di).find(current_diagonal->first); own = LILookUpOwnerIterator(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag)); //this implies that the point is before all the cbounds:: THIS CANT HAPPEN!! 
if (current_column == (LeftInfluence->c).end()) { //FIX#7 fprintf(stderr, "\n diagonal owner, but no column before it"); exit(0); } else { next_column = current_column; next_column++; } //2cases if (next_column == (LeftInfluence->c).end() || next_column->first > current->getSeq2End(LeftInfluence->reflectFlag)) { if (current_diagonal->first < current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End) { if (DEBUG) { fprintf(stderr, "In Diagonal Commit::FIRSTCASE"); } tempowner = LI_OwnerInsertAfter(LeftInfluence, current_diagonal->second, current); (LeftInfluence->c)[current->getSeq2End(LeftInfluence->reflectFlag)] = tempowner; (LeftInfluence->ci)[current->getSeq2End(LeftInfluence->reflectFlag)] = inter.end(); my_col_inter = (LeftInfluence->ci).find(current->getSeq2End(LeftInfluence->reflectFlag)); tempowner = LI_OwnerInsertAfter(LeftInfluence, tempowner, owner); (LeftInfluence->d)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = tempowner; (LeftInfluence->di)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = inter.end(); my_diag_inter = (LeftInfluence->di).find(current->getSeq2End(LeftInfluence->reflectFlag)-current->seq1End); if (next_column!= (LeftInfluence->c).end()) { next_column_inter = (LeftInfluence->ci).find(next_column->first); if (next_column_inter->second == current_diag_inter->second && current_diag_inter->second!=inter.end()) { DeleteIntersectionPoint(next_column_inter->second, next_column_inter, current_diag_inter); CreateIntersectionPoint(LeftInfluence, next_column->first, current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End, next_column_inter, my_diag_inter); } else if (next_column_inter->second == inter.end()) { CreateIntersectionPoint(LeftInfluence, next_column->first, current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End, next_column_inter, my_diag_inter); } } CreateIntersectionPoint(LeftInfluence, current->getSeq2End(LeftInfluence->reflectFlag), current_diagonal->first, 
my_col_inter, current_diag_inter); } else { if (DEBUG) { fprintf(stderr, "\n In Diagonal Commit:SECONDCASE"); } //There will be a previous owner as this is a diagonal case own = LILookUpOwnerIterator(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag)); own--; if (LI_Winner(LeftInfluence, *own, current)) { return FALSE; } own++; tempowner = (LeftInfluence->o).insert(own, current); (LeftInfluence->c)[current->getSeq2End(LeftInfluence->reflectFlag)] = tempowner; (LeftInfluence->ci)[current->getSeq2End(LeftInfluence->reflectFlag)] = inter.end(); colInter = (LeftInfluence->ci).find(current->getSeq2End(LeftInfluence->reflectFlag)); //There is no diagonal here //intersection Point Handling // check is the previous intersection Point exists, if it does check if the flag is off in which //case insert an intersection Point into Intersect and Handle flags appropriately //There is a problem here //FIX #7 #4 major fix if (current_diagonal != (LeftInfluence->d).begin()) { prevDiag = current_diagonal; prevDiag--; prevDiagInter = (LeftInfluence->di).find(prevDiag->first); if (prevDiagInter->second == inter.end()) { CreateIntersectionPoint(LeftInfluence, current->getSeq2End(LeftInfluence->reflectFlag), prevDiag->first, colInter, prevDiagInter); } } } } else { if (DEBUG) { fprintf(stderr, "\n In Diagonal Commit:THIRDCASE"); } if (LI_Winner(LeftInfluence, *(next_column->second), current)) { return false; } tempowner = (LeftInfluence->o).insert(next_column->second, current); //He does the intersection point processing with lower priority!!? //This might mean that the diagonal entry already exists, also this might mean that //The intersection point processing removes the entry?! 
(LeftInfluence->d)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = next_column->second; (LeftInfluence->di)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = inter.end(); my_diag_inter = (LeftInfluence->di).find(current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End); next_column->second = tempowner; //checking if the next column exists next_column++; if (next_column!= (LeftInfluence->c).end()) { next_column_inter =(LeftInfluence->ci).find(next_column->first); if (next_column_inter->second == inter.end()) { CreateIntersectionPoint(LeftInfluence, next_column->first, current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End, next_column_inter, my_diag_inter); } } } return TRUE; } long long int LI_CommitColumnOwner(LI * LeftInfluence, Fragment * current, Fragment * owner) { CBound::iterator current_column, next_column; CInter::iterator nextColInter, colInter; DInter::iterator diagInter; Owner::iterator tempowner; current_column= LICColumn(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag)); if ((LeftInfluence->c).end() == (LeftInfluence->c).begin()) { //Init has already put in one fragment tempowner = LI_OwnerInsertAfter(LeftInfluence, (LeftInfluence->o).begin(), current); (LeftInfluence->c)[current->getSeq2End(LeftInfluence->reflectFlag)] = tempowner; (LeftInfluence->ci)[current->getSeq2End(LeftInfluence->reflectFlag)] = inter.end(); //FIX #5 FIRST MAJOR FIX tempowner = LI_OwnerInsertAfter(LeftInfluence, tempowner, &LI_dummy); (LeftInfluence->d)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = tempowner; (LeftInfluence->di)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = inter.end(); return TRUE; } // If the current_column is the end , that means that we are before all the column boundaries //as the other case has been taken care of above if (current_column == (LeftInfluence->c).end()) { next_column = (LeftInfluence->c).begin(); } 
else { next_column = current_column; next_column++; } // Either the case that the column boundary is that last column boundary or that the next column is after the current point if (next_column == (LeftInfluence->c).end() || next_column->first > current->getSeq2End(LeftInfluence->reflectFlag)) { if (DEBUG) { fprintf(stderr, "\nColCommit::FIRSTCASE"); } // this means that the next column is not the first column if (current_column != (LeftInfluence->c).end()) { tempowner = LI_OwnerInsertAfter(LeftInfluence, current_column->second, current); } else { // this means that the next column is the first column tempowner = LI_OwnerInsertAfter(LeftInfluence, (LeftInfluence->o).begin(), current); } (LeftInfluence->c)[current->getSeq2End(LeftInfluence->reflectFlag)] = tempowner; (LeftInfluence->ci)[current->getSeq2End(LeftInfluence->reflectFlag)] = inter.end(); //This is inefficient colInter = (LeftInfluence->ci).find(current->getSeq2End(LeftInfluence->reflectFlag)); tempowner = LI_OwnerInsertAfter(LeftInfluence, tempowner, owner); (LeftInfluence->d)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = tempowner; (LeftInfluence->di)[current->getSeq2End(LeftInfluence->reflectFlag)-current->seq1End] = inter.end(); //This is inefficient diagInter = (LeftInfluence->di).find(current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End); //if there is a next column then there is an issue of an intersection point if (next_column != (LeftInfluence->c).end()) { nextColInter = (LeftInfluence->ci).find(next_column->first); if (nextColInter->second == inter.end()) { CreateIntersectionPoint(LeftInfluence, next_column->first, current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End, nextColInter, diagInter); } } } else { if (DEBUG) { fprintf(stderr, "\nColCommit::SECONDCASE"); } if (LI_Winner(LeftInfluence, *(next_column->second), current)) { return FALSE; } tempowner = (LeftInfluence->o).insert(next_column->second, current); 
(LeftInfluence->d)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = next_column->second; //FIX #6 SECOND MAJOR FIX (LeftInfluence->di)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = inter.end(); //I dont think that i need this diagInter = (LeftInfluence->di).find(current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End); colInter = (LeftInfluence->ci).find(current->getSeq2End(LeftInfluence->reflectFlag)); next_column->second = tempowner; //intersection Point handling next_column++; if (next_column != (LeftInfluence->c).end()) { nextColInter = (LeftInfluence->ci).find(next_column->first); if (nextColInter->second == inter.end()) { CreateIntersectionPoint(LeftInfluence, next_column->first, current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End, nextColInter, diagInter); } } } return TRUE; } void CreateIntersectionPoint(LI * LeftInfluence, long long int col, long long int diag, CInter::iterator colInter, DInter::iterator diagInter) { Point temp; InterPoint::iterator tempinter; temp.seq1 = col - diag; temp.seq2 = col; pair pairp(temp, LeftInfluence); tempinter = inter.insert(pairp); colInter->second = tempinter; diagInter->second = tempinter; } void DeleteIntersectionPoint(InterPoint::iterator tobeerased, CInter::iterator colInter, DInter::iterator diagInter) { inter.erase(tobeerased); colInter->second = inter.end(); diagInter->second = inter.end(); } // handles one intersection point that is at the head of inter void HandleOneIntersectionPoint() { InterPoint::iterator head; Owner::iterator delOwner, leftOwner, rightOwner; CBound::iterator col, nextCol; CInter::iterator nextColInter, colInter; DInter::iterator prevDiagInter, diagInter; DBound::iterator diag, prevDiag; head = inter.begin(); LI * LeftInfluence; //find the three owners that are invloved. 
LeftInfluence = head->second; col = (LeftInfluence->c).find((head->first).seq2); if (col == (LeftInfluence->c).end()) { fprintf(stderr, "\nIn HandleOneIntersectionPoint::The column does not exist. Point is %lld %lld", (head->first).seq1, (head->first).seq2); exit(0); } colInter = (LeftInfluence->ci).find(col->first); diag = (LeftInfluence->d).find((head->first).seq2 - (head->first).seq1); if (DEBUG) { fprintf(stderr, "\nIn HandleOneIntersectionPoint::The intersection point that is being handled: %lld %lld", (head->first).seq1, (head->first).seq2); } if (diag == (LeftInfluence->d).end()) { fprintf(stderr, "\nIn HandleOneIntersectionPoint::The diagonal does not exist Point is %lld %lld", (head->first).seq1, (head->first).seq2); exit(0); } diagInter = (LeftInfluence->di).find(diag->first); delOwner = diag->second; leftOwner = delOwner; leftOwner--; rightOwner = delOwner; rightOwner++; if (*leftOwner == *rightOwner) { fprintf(stderr, "\nIn HandleOneIter:: The leftOwner is the same as the right owner"); exit(0); } if (LI_Winner(LeftInfluence, *leftOwner, *rightOwner)) { //the diagonal continues if (DEBUG) { fprintf(stderr, "\nIn HandleOneIter:: Diagonal continues"); } diag->second = col->second; nextCol = col; nextCol++; nextColInter = (LeftInfluence->ci).find(nextCol->first); (LeftInfluence->c).erase(col); //FIX #8 MAJOR FIX (LeftInfluence->ci).erase(colInter); if (nextCol != (LeftInfluence->c).end()) { // the column exists if (nextColInter->second == inter.end()) { // the column is not involved in an intersection diagInter->second = inter.end(); CreateIntersectionPoint(LeftInfluence, nextCol->first, diag->first, nextColInter, diagInter); } else { //should unset the diagonal diagInter->second = inter.end(); } } else { diagInter->second = inter.end(); } } else { if (DEBUG) { fprintf(stderr, "\nIn HandleOneIter Column continues %f %f %f", (*delOwner)->score, (*leftOwner)->score, (*rightOwner)->score); } prevDiag = diag; prevDiag--; prevDiagInter = 
(LeftInfluence->di).find(prevDiag->first); (LeftInfluence->d).erase(diag); (LeftInfluence->di).erase(diagInter); if (prevDiag != (LeftInfluence->d).end()) { if (prevDiagInter == (LeftInfluence->di).end()) { fprintf(stderr, "\nIn HandleOneIter:No diag inter corresponding to PrevDiag: %lld", prevDiag->first); exit(0); } if (prevDiagInter->second == inter.end()) { // the diagonal is not involved in an intersection colInter->second = inter.end(); CreateIntersectionPoint(LeftInfluence, col->first,prevDiag->first, colInter, prevDiagInter); } else { //should unset the column flag colInter->second = inter.end(); } } else { colInter->second = inter.end(); } } //delete the owner (LeftInfluence->o).erase(delOwner); inter.erase(inter.begin()); } long long int printDBound(LI * LeftInfluence) { if (DEBUG) { return 0; } DBound::iterator i; long long int diagCount = 0; fprintf(stderr, "\nThe DBound is ::"); for (i = (LeftInfluence->d).begin(); i != (LeftInfluence->d).end(); i++) { fprintf(stderr, "%lld ", i->first); diagCount++; } fprintf(stderr, "Dbound Done/n"); return diagCount; } long long int printCBound(LI * LeftInfluence) { if (DEBUG) { return 0; } CBound::iterator i; long long int colCount = 0; fprintf(stderr, "\nThe CBound is ::"); for (i = (LeftInfluence->c).begin(); i != (LeftInfluence->c).end(); i++) { fprintf(stderr, "%lld ", i->first); colCount++; } fprintf(stderr, "Cbound Done/n"); return colCount; } long long int printOwners(LI * LeftInfluence) { if (DEBUG) { return 0; } Owner::iterator i; long long int ownerCount = 0; fprintf(stderr, "\nThe Owner is ::"); for (i = (LeftInfluence->o).begin(); i != (LeftInfluence->o).end(); i++) { ownerCount++; fprintf(stderr, "%f ", (*i)->score); } fprintf(stderr, "Owners Done/n"); return ownerCount; } void printState(LI * LeftInfluence) { if (DEBUG) { return; } long long int colCount, diagCount, ownerCount; fprintf(stderr, "\nCurrent State:\n"); ownerCount = printOwners(LeftInfluence); colCount = printCBound(LeftInfluence); 
diagCount = printDBound(LeftInfluence); interPointPrint(); } void interPointPrint() { if (DEBUG) { return; } InterPoint::iterator i; fprintf(stderr, "\nThe Inter is ::"); for (i = inter.begin(); i != inter.end(); i++) { fprintf(stderr, "%lld %lld ", (i->first).seq1, (i->first).seq2); } fprintf(stderr, "Inter Done/n"); } lagan20/src/glocal/leftinfluence.h0000755000076500007650000000516310502337063020153 0ustar brudnobrudno00000000000000#ifndef LEFTINFLUENCE #define LEFTINFLUENCE #include #include struct LI; struct longlongCompare2 { bool operator()(long long int p1,long long int p2) const { if(p1< p2) return 1; else return 0; } }; struct paircomp { bool operator()(const Point p1,const Point p2) const { if(p1.seq1< p2.seq1) return 1; else if((p1.seq1 == p2.seq1) && (p1.seq2 < p2.seq2)) return 1; else return 0; } }; typedef list Owner; typedef map CBound; typedef multimap InterPoint; typedef map CInter; typedef map DBound; typedef map DInter; typedef struct LI { Owner o; CBound c; DBound d; CInter ci; DInter di; long long int scoreIndex; long long int reflectFlag; }LI; extern InterPoint inter; Owner::iterator LILookUpOwnerIterator(LI* LeftInfluence,long long int seq1,long long int seq2) ; Fragment * LILookUpOwnerStart(LI* LeftInfluence,Fragment *current); Fragment * LILookUpOwnerEnd(LI* LeftInfluence,Fragment *current); CBound::iterator LICColumn(LI* LeftInfluence,long long int seq1, long long int seq2); Fragment *LICOwner(LI* LeftInfluence,long long int seq1, long long int seq2); Fragment *LIDOwner(LI* LeftInfluence,long long int seq1, long long int seq2); DBound::iterator LIDDiagonal(LI* LeftInfluence,long long int seq1, long long int seq2); float LILookUpScore(LI *LeftInfluence,Fragment *current); void InitLI(LI* LeftInfluence, long long int scoreIndex); long long int LI_Winner(LI* LeftInfluence,Fragment * first,Fragment * second); long long int LICommitPoint(LI *LeftInfluence,Fragment *current); Owner::iterator LI_OwnerInsertAfter(LI* LeftInfluence,Owner::iterator 
current,Fragment * curfrag); long long int LI_CommitDiagonalOwner(LI* LeftInfluence,Fragment *current,Fragment *owner); long long int LI_CommitColumnOwner(LI* LeftInfluence,Fragment *current,Fragment *owner); void CreateIntersectionPoint(LI* LeftInfluence,long long int col,long long int diag,CInter::iterator colInter,DInter::iterator diagInter); void DeleteIntersectionPoint(InterPoint::iterator tobeerased,CInter::iterator colInter,DInter::iterator diagInter); void HandleOneIntersectionPoint(); long long int printDBound(LI * LeftInfluence); long long int printOwners(LI * LeftInfluence); long long int printCBound(LI * LeftInfluence); void printState(LI* LeftInfluence); void interPointPrint(); #endif lagan20/src/glocal/Makefile0000755000076500007650000000061310502342435016611 0ustar brudnobrudno00000000000000CC = g++ OPTFLAGS = CFLAGS = $(OPTFLAGS) -O3 CLINKER = g++ # LIBDIR = -L/usr/local/lib MLIB = -lm INCDIR = -I./ TRGT_DIR = ../.. TRGT = glocal OBJECTS = glocal.o io.o rightinfluence.o leftinfluence.o score.o .cpp.o: $(CC) -Wno-deprecated $(CFLAGS) $(INCDIR) -c $*.cpp $(TRGT): $(OBJECTS) $(CLINKER) $(OPTFLAGS) $(OBJECTS) -o $(TRGT_DIR)/$(TRGT) $(MLIB) clean : rm -f *.o ./*~ *~ core lagan20/src/glocal/rightinfluence.cpp0000755000076500007650000001410710502337063020667 0ustar brudnobrudno00000000000000#include Fragment origin, end; // Sets the first default owner of the whole region void initRI(RI *RightInfluence, long long int scoreIndex) { RightInfluence->scoreIndex = scoreIndex; if (((scoreIndex >> RELPOSSHIFT) & 1) == LEFT) { RightInfluence->reflectFlag = TRUE; } else { RightInfluence->reflectFlag = FALSE; } // will lose to anyone origin.seq1End = 0; origin.seq2End = 0; origin.seq1Start = 0; origin.seq2Start = 0; // hack to aid winner selection origin.score = -1; end.score = -2; origin.totalScore = end.totalScore = 0; // will win against anyone end.seq1End = 0; end.seq2End = 0; end.seq1Start = 0; end.seq2Start = 0; origin.back = NULL; RightInfluence->act[-INF] = 
&origin; RightInfluence->act[+INF] = &end; } // Finds the owner in the current right influence region and returns the score using the appropriate score function float lookUpScore(RI * RightInfluence, Fragment * current) { Fragment* owner; // find the owner of the region that you are in owner = lookUpOwnerStart(RightInfluence, current); // return the score using the appropriate score function return scoreAll(owner, current, RightInfluence->scoreIndex); } // Returns the owner of the region Fragment * lookUpOwnerStart(RI * RightInfluence, Fragment * current) { Active::iterator ownerIterator; // find the owner of the region that you are in. ownerIterator = RightInfluence->act.upper_bound(current->getSeq2Start(RightInfluence->reflectFlag) - current->seq1Start); ownerIterator--; return (*ownerIterator).second; } Fragment * lookUpOwnerEnd(RI * RightInfluence, Fragment * current) { Active::iterator ownerIterator; // find the owner of the region that you are in. ownerIterator=RightInfluence->act.upper_bound(current->getSeq2End(RightInfluence->reflectFlag) - current->seq1End); ownerIterator--; return (*ownerIterator).second; } // Returns true if the first argument is the winner in their common region long long int RIWinner(RI * RightInfluence, Fragment * first, Fragment * second) { Fragment dummy; //if the first frag is the origin or the second frag is the end then the first frag loses if (first->score==-1 || second->score==-2) { return FALSE; } //if the first frag is the end or the second frag is the origin then the first frag wins if (second->score==-1 || first->score==-2) { return TRUE; } dummy.seq1Start = Mymax(first->seq1End, second->seq1End) + 1; dummy.seq2Start = Mymax(first->getSeq2End(RightInfluence->reflectFlag), second->getSeq2End(RightInfluence->reflectFlag)) + 2; if (first->getSeq2End(RightInfluence->reflectFlag) > second->getSeq2End(RightInfluence->reflectFlag)) { dummy.nameIter = first->nameIter; } else { dummy.nameIter = second->nameIter; } if 
(scoreAll(first, &dummy, RightInfluence->scoreIndex) > scoreAll(second, &dummy, RightInfluence->scoreIndex)) { return TRUE; } else { return FALSE; } } long long int RICommitEndPoint(RI * RightInfluence, Fragment * current) { Fragment * owner; Fragment * temp; owner = lookUpOwnerEnd(RightInfluence, current); if (RIWinner(RightInfluence, owner, current)) { return 0; } owner = nextOnActive(RightInfluence, owner); while (1) { if (RIWinner(RightInfluence, current, owner)) { temp = owner; owner = nextOnActive(RightInfluence, owner); RightInfluence->act.erase(temp->getSeq2End(RightInfluence->reflectFlag)-temp->seq1End); } else { break; } } //inserting into the list of active owners RightInfluence->act[current->getSeq2End(RightInfluence->reflectFlag) - current->seq1End] = current; int possibleCase = NEGATIVE << DOWNSTRANDSHIFT | NEGATIVE <scoreIndex == possibleCase) { Active::iterator j,i = RightInfluence->act.begin(); i++; while(i != RightInfluence->act.end()) { // if (i == NULL) { continue;} j = i; j++; if (j != RightInfluence->act.end()) { if ((*j).second->score == -2) { break;} // j is act.end (why does the check above fail?) 
if ((*i).second->totalScore > (*j).second->totalScore) { /* fprintf(stdout,"Assertion failed in RICommitEndPoint: Cur frag:\n"); printFragment(current); fprintf(stdout,"Cur orig owner:\n"); printFragment(tempOwner); fprintf(stdout,"Cur frag diag: %lld\n", (current->getSeq2End(RightInfluence->reflectFlag) - current->seq1End)); fprintf(stdout," Frag 1 in pair (j):\n "); printFragment((*j).second); fprintf(stdout," Frag 2 in pair (i):\n "); printFragment((*i).second); fprintf(stdout,"RI:\n"); printActive(RightInfluence); assert (0); */ break; // assert(i->first->score >= j->first->score); } } i++; } } return 1; } long long int diagonal(Fragment * current, RI * RightInfluence) { return (current->getSeq2End(RightInfluence->reflectFlag) - current->seq1End); } // Returns the successor on the active list Fragment * nextOnActive(RI * RightInfluence, Fragment * current) { Active::iterator holder; long long int diagCurrent; diagCurrent = current->getSeq2End(RightInfluence->reflectFlag) - current->seq1End; //MUKMOD start if(current->score==-1) { diagCurrent = -INF; } if(current->score ==-2) { diagCurrent = INF; } //MUKMOD end holder = RightInfluence->act.upper_bound(diagCurrent); if (holder != RightInfluence->act.end()) { return (*holder).second; } else { return NULL; } } long long int printActive(RI * RightInfluence) { Active::iterator temp; long long int i = 0; fprintf(stdout, "Active RI:\n"); for (temp = RightInfluence->act.begin(); temp != RightInfluence->act.end(); temp++) { fprintf(stdout, " %lld", (*temp).first); fprintf(stdout, ":sc=%f:totsc=%f;",((*temp).second)->score, ((*temp).second)->totalScore); i++; } fprintf(stdout, "\n"); return i; } lagan20/src/glocal/rightinfluence.h0000755000076500007650000000213710502337063020334 0ustar brudnobrudno00000000000000#ifndef RIGHTINFLUENCE #define RIGHTINFLUENCE #include #include #include struct longlongCompare { bool operator()(long long int p1,long long int p2) const { if (p1 < p2) { return 1; } else { return 0; } } }; 
typedef map Active; typedef struct RI { //List of active regions Active act; long long int scoreIndex; long long int reflectFlag; } RI; void initRI(RI *RightInfluence,long long int scoreIndex); float lookUpScore(RI * RightInfluence,Fragment *current); Fragment* lookUpOwnerEnd(RI * RightInfluence,Fragment *current); Fragment* lookUpOwnerStart(RI * RightInfluence,Fragment *current); long long int RIWinner(RI *RightInfluence,Fragment *first,Fragment * second); //long long int processRowofEndPoints(RI *RightInfluence,long long int firstIndex); long long int diagonal(Fragment * current,RI * RightInfluence); Fragment * nextOnActive(RI* RightInfluence,Fragment * current); long long int printActive(RI * RightInfluence); long long int RICommitEndPoint(RI *RightInfluence,Fragment *current); #endif lagan20/src/glocal/score.cpp0000755000076500007650000001324010502337063016771 0ustar brudnobrudno00000000000000#include #include #include #include #include extern vector scoreFunctions[1<<(UPSTRANDBITS+DOWNSTRANDBITS+RELPOSBITS)]; float Score::getScore(Fragment *up, Fragment * down) { long long int absSeq1,absSeq2,absDiagonal,absMin,absMax; absSeq1= Myabs((up->seq1End) - (down->seq1Start)); absSeq2= Myabs((up->seq2End) - (down->seq2Start)); absMin = Mymin(absSeq1,absSeq2); absMax=Mymax(absSeq1,absSeq2); absDiagonal = absMax-absMin; return absMin*(-minConstant) + absMax* (-maxConstant) + absDiagonal *(-diagConstant) -openConstant +up->totalScore; } ScoreInterface::ScoreInterface (float iopenConstant, float iminConstant, float imaxConstant, float idiagConstant) { openConstant = iopenConstant; minConstant = iminConstant; maxConstant = imaxConstant; diagConstant = idiagConstant; } Score::Score (float iopenConstant , float iminConstant ,float imaxConstant,float idiagConstant):ScoreInterface(iopenConstant,iminConstant, imaxConstant, idiagConstant) { } void initScoreFunctionPointers(char * scoreFileName) { ifstream SFP; char line[255]; SFP.open(scoreFileName); if (!SFP.good()) { 
printf("The score file is invalid"); exit(0); } while (1) { SFP.getline(line,255); if (line[0]=='\0') { break; } createScoreFunctionObjects(line); } } void createScoreFunctionObjects(char * line) { long long int i; long long int j; long long int rem[4]; long long int remCases[MAXCASES],remObjects[MAXOBJECTS]; long long int numCases; long long int numObjects; long long int cases [MAXCASES]; float objects[MAXOBJECTS][4]; char updir,downdir,relpos; Score * SFObjects[MAXOBJECTS]; j=0; for (i=0; (unsigned)inameIter != down->nameIter) { if (ret_case >> RELPOSSHIFT != UNRELATED) { //MUKCHECK HOPE THIS WORKS return NEGINF; } } for (i=0; igetScore(up,down); if (temp_score > ret_score) { ret_score = temp_score; } } if (ret_score == NEGINF) { printf("Score function case not handled::%lld\n",ret_case); //exit(0); } return ret_score; } long long int Mymax(long long int a, long long int b) { return (a>=b ? a : b); } long long int Mymin(long long int a,long long int b) { return (a<=b ? a : b); } long long int Myabs(long long int a) { return (a<0 ? 
-a : a); } float fragmentSetScore(Fragment * current, Fragment *owner, LI *LeftInfluence, RI * RightInfluence, long long int rightInfluenceFlag) { /*SLAGANCHANGE change call to the score based on the Leftinfluence, this has to be passed i guess*/ float tempScore; if (rightInfluenceFlag == 3) { tempScore = scoreAll(owner,current, current->strand << DOWNSTRANDSHIFT | owner->strand <totalScore <= 0) { current->totalScore = current->score; current->back = owner; } } else if (tempScore + current->score > current->totalScore) { current->totalScore = tempScore + current->score; current->back = owner; } } else if (rightInfluenceFlag == TRUE) { tempScore = scoreAll(owner,current,RightInfluence->scoreIndex); if (tempScore == NEGINF) { // TODO if (current->totalScore <= 0) { current->totalScore = current->score; current->back = owner; } } else if (tempScore + current->score > current->totalScore) { current->totalScore = tempScore + current->score; current->back = owner; } } else { tempScore = scoreAll(owner,current,LeftInfluence->scoreIndex); if (tempScore == NEGINF) { // TODO if (current->totalScore <= 0) { current->totalScore = current->score; current->back = owner; } } else if (tempScore + current->score > current->totalScore) { current->totalScore = tempScore + current->score; current->back = owner; } } return current->totalScore; } lagan20/src/glocal/score.h0000755000076500007650000000206410502337063016440 0ustar brudnobrudno00000000000000#ifndef SCORE #define SCORE #include #include #define MAXCASES 20 #define MAXOBJECTS 10 struct LI; struct RI; class ScoreInterface { protected: float openConstant,minConstant,maxConstant,diagConstant; ScoreInterface (float iopenConstant , float iminConstant ,float imaxConstant,float idiagConstant); float getScore(Fragment *up, Fragment * down){return -1;}; }; class Score :public ScoreInterface { public: Score(float iopenConstant , float iminConstant ,float imaxConstant,float idiagConstant); float getScore(Fragment *up, Fragment * down); 
}; void initScoreFunctionPointers(char *scoreFileName); void createScoreFunctionObjects(char * line); long long int charToCase(char in); float scoreAll(Fragment *up,Fragment *down, long long int ret_case); long long int Myabs(long long int a); long long int Mymin(long long int a,long long int b); long long int Mymax(long long int a,long long int b); float fragmentSetScore(Fragment * current,Fragment *owner,LI *LeftInfluence, RI * RightInfluence,long long int rightInfluenceFlag); #endif lagan20/src/glocal/structs.h0000755000076500007650000000335010502337063017033 0ustar brudnobrudno00000000000000#ifndef STRUCTS #define STRUCTS //general defines #include #include #include #include #include #include #include #include #include using namespace std; #define RIGHT 0 #define LEFT 1 #define UNRELATED 2 #define NEGINF LLONG_MIN #define UPSTRANDBITS 3 #define DOWNSTRANDBITS 3 #define RELPOSBITS 3 #define UPSTRANDSHIFT 0 #define DOWNSTRANDSHIFT UPSTRANDBITS #define RELPOSSHIFT UPSTRANDBITS + DOWNSTRANDBITS #define TOTALSHIFT UPSTRANDBITS + DOWNSTRANDBITS + RELPOSBITS #define POSITIVE 1 #define NEGATIVE 0 #define CUTOFF 0 #define TRUE 1 #define FALSE 0 #define INF LLONG_MAX #define MIN LLONG_MIN #define NAMESIZE 100 struct ltstr { bool operator() (const char* s1, const char* s2) const { return strcmp(s1,s2) < 0; } }; typedef map Name; typedef struct Fragment { long long int seq1Start,seq2Start,seq1End,seq2End; char strand; float score; float totalScore; struct Fragment *back; char deleted; char seq1Name[NAMESIZE]; Name::iterator nameIter; char seq2Name[NAMESIZE]; long long int base; long long int getSeq2End(long long int reflectFlag){ return this->seq2End*((reflectFlag == TRUE)?(-1): 1);}; long long int getSeq2Start(long long int reflectFlag){return this->seq2Start*((reflectFlag == TRUE)?(-1): 1);}; } Fragment; typedef struct HitLocationList { long long int seq1start; long long int seq2start; long long int seq1end; long long int seq2end; float score; char strand; struct 
HitLocationList *next; struct HitLocationList *bkptr; float scoreSoFar; char seq1Name[NAMESIZE]; char seq2Name[NAMESIZE]; } hll; typedef struct Point { long long int seq1,seq2; Fragment *frag; } Point; #endif lagan20/src/glocal/test.score0000755000076500007650000000026710502337063017173 0ustar brudnobrudno00000000000000{+R+;-L-}{0 0.02 0 0;40000 0 0 0} {+R-;-L+}{3000 0.02 0.1 0;40000 0 0 0} {-R+;+L-}{7000 0.02 0.5 0;40000 0 0 0} {+L+;-R-}{7000 0.02 0.5 0;40000 0 0 0} {+U+;+U-;-U+;-U-}{30000 0 0 0} lagan20/src/lagan.pl0000755000076500007650000001416310502337063015335 0ustar brudnobrudno00000000000000#!/usr/bin/env perl $lagandir = $ENV{LAGAN_DIR}; $consrate = 45; $consupperrate = 65; if (@ARGV < 2) { print ("usage:\n lagan seqfile1 seqfile2 [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1,rsc1),(wl2,nd2,co2,rsc2),...\"] [-bin] [-mfa] [-out \"filename\"] [-lazy] [-maskedonly] [-debug] [-usebounds] [-rc] [-translate] [-draft] [-info] [-fastreject]\n"); exit(1); } $firstName = $ARGV[0]; $secondName = $ARGV[1]; $rcFlag = 0; $arglist = ""; $contigflag = 0; $infofile = 0; $okformat = 0; $binfile = 0; $infofilename = "alignment"; $direction = "+"; $gfc = " -gfc "; $rundraft = 0; $draftparams = ""; $dofastreject = 0; $doxmfa = 0; $filename = ""; $format = ""; for ($i = 2; $i < @ARGV; $i++) { if ($ARGV[$i] =~ /-order/) { $orderfl = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-bin/) { $orderfl = $orderfl." -bin"; $binfile = 1; $okformat = 1; } elsif ($ARGV[$i] =~ /-info/) { $infofile++; } elsif ($ARGV[$i] =~ /-mfa/) { $orderfl = $orderfl." -mfa"; $okformat = 1; } elsif ($ARGV[$i] =~ /-xmfa/) { $orderfl = $orderfl." -xmfa"; $doxmfa = 1; $okformat = 1; } elsif ($ARGV[$i] =~ /-out/) { $filename = $ARGV[++$i]; $infofile++; $infofilename = $ARGV[$i]; } elsif (($ARGV[$i] =~ /-gs/) || ($ARGV[$i] =~ /-gc/) || ($ARGV[$i] =~ /-mt/) || ($ARGV[$i] =~ /-ms/) || ($ARGV[$i] =~ /-bw/)){ $orderfl = $orderfl." ".$ARGV[$i]; $orderfl = $orderfl." 
".$ARGV[++$i]; } elsif ($ARGV[$i] =~ /-s1/) { $orderfl = $orderfl." -s1 $ARGV[++$i]"; $orderfl = $orderfl." ".$ARGV[++$i]; } elsif ($ARGV[$i] =~ /-maskedonly/) { $arglist = $arglist." -maskedonly"; } elsif ($ARGV[$i] =~ /-translate/) { $arglist = $arglist." -translate"; $draftparams = $draftparams." -translate"; } elsif ($ARGV[$i] =~ /-fastreject/) { $arglist = $arglist." -fastreject"; $dofastreject = 1; $doxmfa = 1; $okformat = 1; } elsif ($ARGV[$i] =~ /-draftreject/) { $draftparams = $draftparams." -fastreject"; } elsif ($ARGV[$i] =~ /-gap/) { $arglist = $arglist." -gap ".$ARGV[++$i]; $arglist = $arglist." ".$ARGV[++$i]; } elsif ($ARGV[$i] =~ /-recurse/) { $arglist = $arglist." -recurse \"".$ARGV[++$i]."\""; } elsif ($ARGV[$i] =~ /-chaos/) { $arglist = $arglist." -chaos \"".$ARGV[++$i]."\""; } elsif ($ARGV[$i] =~ /-usebounds/) { $contigflag = 1; } elsif ($ARGV[$i] =~ /-rc/) { `$lagandir/utils/rc < $ARGV[1] > $ARGV[1].rc`; if ($?) { exit(1); } $secondName = "$ARGV[1].rc"; if (-e "$ARGV[1].masked") { `$lagandir/utils/rc < $ARGV[1].masked > $ARGV[1].rc.masked`; if ($?) { exit(1);} } $rcFlag = 1; $direction = "-"; } elsif ($ARGV[$i] =~ /-draft/){ $rundraft = 1; } elsif ($ARGV[$i] =~ /-cons/){ $draftparams = $draftparams." -cons $ARGV[$++i]"; } elsif ($ARGV[$i] =~ /-draftskipfr/){ $draftparams = $draftparams." -skipfr $ARGV[$++i]"; } elsif ($ARGV[$i] =~ /-lazy/){ $draftparams = $draftparams." -cons $ARGV[$++i]"; } else { print "Invalid option for lagan: $ARGV[$i]"; exit(1); } } $arglist = $arglist." -ext "; if ($rundraft){ `$lagandir/draft.pl $firstName $secondName $draftparams`; if ($?) { exit(1);} $secondName = "merged_seq.fa"; } # print STDERR "perl $lagandir/rechaos.pl $firstName $secondName $gfc $arglist > $$.anchs.final\n"; `perl $lagandir/rechaos.pl $firstName $secondName $gfc $arglist > $$.anchs.final`; $ex_val = $? 
>> 8; if ($ex_val == 3) { exit(0); } if ($ex_val) { exit(1); } if ($contigflag){ @bounds = `$lagandir/utils/getbounds $$.anchs.final $firstName $secondName`; if ($?) { exit(1); } chomp $bounds[0]; print STDERR ("Aligning with bounds: $bounds[0]\n"); print `$lagandir/order $firstName $secondName $bounds[0] $orderfl -anc $$.anchs.final`; if ($?) { exit(1); } } else { if ($dofastreject){ if (!$filename) { print STDERR "-fastreject requires -out filename!\n"; exit(1); } open(SFILE, "$$.anchs.final"); @anchors = ; close(SFILE); $anchors[0] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; $end1 = $1 - 1; $end2 = $3 - 1; $anchors[@anchors - 1] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; $start1 = $2 + 1; $start2 = $4 + 1; $bounds = "-s1 $start1 $end1 -s2 $start2 $end2 "; @anchors = 0; $orderfl = $bounds.$orderfl." -xmfa"; } if (!$okformat) { $format = "-bin"; } `$lagandir/order $firstName $secondName $format -out $$.align $orderfl -anc $$.anchs.final`; if ($?) { exit(1); } if (!$okformat) { if ($filename) { `$lagandir/utils/bin2bl $$.align > $filename`; } else { print `$lagandir/utils/bin2bl $$.align`; } } else { if ($filename) { `cat $$.align > $filename`; } else { print `cat $$.align`; } } if ($dofastreject){ `$lagandir/utils/scorealign $filename $consrate -ibounds -cropxmfa > $$.temp`; if ($?) { exit(1); } `mv $$.temp $filename`; } } $infofile += $okformat; if ($infofile == 3){ open (INFOFILE, ">$infofilename.info"); if ($binfile){ `$lagandir/utils/bin2mf $infofilename > $infofilename.mfa`; if ($?) { exit(1); } $infofilename = $infofilename.".mfa"; } @temp = `head $secondName`; if ($?) { exit(1); } chomp $temp[0]; $temp[0] = substr $temp[0], 1; print INFOFILE "$temp[0]\n"; $len = `$lagandir/utils/getlength $secondName`; chomp $len; if ($?) { exit(2); } $first = $last = $first2 = $last2 = -1; $score = `$lagandir/utils/scorealign $infofilename $consupperrate`; chomp $score; if ($?) 
{ exit(3); } if ($score > 0){ $score = `$lagandir/utils/scorealign $infofilename $consrate`; chomp $score; if ($?) { exit(4); } @temp = `$lagandir/utils/scorealign $infofilename $consrate -bounds 0`; if ($?) { exit(5); } $temp[0] =~ /(.*) (.*)/; $first = $1; $last = $2; @temp = `$lagandir/utils/scorealign $infofilename $consrate -bounds 1`; if ($?) { exit(6); } $temp[0] =~ /(.*) (.*)/; $first2 = $1; $last2 = $2; } print INFOFILE "1 $first $last 1 $len 0 0 $direction $score $first2 $last2\n"; close (INFOFILE); # `$lagandir/utils/rm $infofilename` if ($binfile); } `rm $secondName` if ($rcflag); `rm $$.*`; if ($?) { exit(1); } exit(0); lagan20/src/lagan2mfa.cpp0000644000076500007650000000343410502337063016246 0ustar brudnobrudno00000000000000#include #include #include #include #include #include using namespace std; // TODO refactor in classes and normal make project #include "util.cpp" #include "faindex.cpp" FaIndex faIndex; void writeSeq(FILE *f,char* seq,int start,int end) { start--; end--; int j=0; for (int i=start;i<=end;i++) { fputc(seq[i],f); j++; if (j==fastaRowLength) { j=0; fputc('\n',f); } } if (j>0) fputc('\n',f); } int main (int argc,char* argv[]) { char buf[bufSize]; char org0[1000]; char name0[1000]; int start0; int end0; char strand0; char org1[1000]; char name1[1000]; int start1; int end1; char strand1; char org2[1000]; char name2[1000]; int start2; int end2; char strand2; int proto=1; string id; string name; char* seq; FILE *out=openFile(getArg("-o",argc,argv),"w"); FILE *chunk=openFile(getArg("-c",argc,argv),"w"); FILE *in=openFile(getArg("-m",argc,argv),"r"); proto=atoi(getArg("-p",argc,argv).c_str()); readFaIndex(faIndex,getArg("-i",argc,argv)); while (!feof(in)) { buf[0]='\0'; fgets(buf,bufSize,in); if (strlen(buf)==0) continue; sscanf(buf,"%s %s %d %d %c %s %s %d %d %c %s %s %d %d %c", org0,name0,&start0,&end0,&strand0,org1,name1,&start1,&end1,&strand1,org2,name2,&start2,&end2,&strand2); name=org0; name=name+"-anc"+name0; for (int 
n=1;n<=proto;n++) { id=name0; id=id+":"+itoa(n); seq=getFaIndexSeq(faIndex,id); fprintf(out,">%s\n",name.c_str()); writeSeq(out,seq,start0,end0); free(seq); } end0=end0-start0+1; start0=1; fprintf(chunk,"%s %s %d %d %c %s %s %d %d %c %s %s %d %d %c\n",org0,name.c_str(),start0,end0,strand0,org1,name1,start1,end1,strand1,org2,name2,start2,end2,strand2); } fclose(in); fclose(out); fclose(chunk); return 0; } lagan20/src/makecons.cpp0000644000076500007650000001043010502337063016210 0ustar brudnobrudno00000000000000/** * @file * * [TODO] * * @author Mikhail Soloviev * @date 31.03.2006 * @version 1.0 * */ //#include //#include #include #include #include #include #include using namespace std; #define fastaRowLength 50 #define bufSize 2000 typedef char* pchar; int isArg(char* key,int argc, char* argv[]) { for (int i=0;i0 && (s[i-1]=='\n' || s[i-1]=='\r')) s[--i]='\0'; return i; } FILE* openFile(char* path,char* mode) { FILE *f=fopen(path,mode); if (f==NULL) { printf("ERROR: Failed open file: %s\n",path); exit(1); } return f; } char* loadSeq(FILE *f,char* annot,int& seqLen) { char* seq=NULL; char buf[bufSize]; int bufLen=0; seqLen=0; while (!feof(f)) { buf[0]='\0'; fgets(buf,bufSize,f); bufLen=trim(buf); if (bufLen>0) { if (buf[0]=='>') { strcpy(annot,buf); break; } else { if (seqLen==0) seq=(char*)malloc(sizeof(char)*bufLen); else seq=(char*)realloc(seq,sizeof(char)*(seqLen+bufLen)); memcpy(&seq[seqLen],buf,bufLen); seqLen+=bufLen; } } } return seq; } void writeSeq(FILE *f,char* seq,int len) { int j=0; for (int i=0;i0) fputc('\n',f); } /* char* makeCons(char* seq1,char* seq2,int len) { char* cons=seq1; char ch=' '; for (int i=0;imax) max=count[i]; for (int i=1;i<5;i++) if (count[i]==max) letter[index++]=dna[i]; return index; } char makeConsLetter(char letter[],int proto) { int count[5]; char maxLetter[5]; int maxNumber; for (int j=0;j<5;j++) count[j]=0; for (int i=0;i #include #define MINPAGESIZE 1000000 typedef struct MemoryPage { char* memory; int size; int used; struct 
MemoryPage* next; } mpage; mpage* globalpage = 0; void initMP(int pagesize) { mpage* newpage; if (pagesize < MINPAGESIZE) pagesize = MINPAGESIZE; newpage = (mpage*) malloc(sizeof(mpage)); newpage->next = globalpage; globalpage = newpage; globalpage->memory = (char*) malloc (pagesize); globalpage->used = 0; globalpage->size = pagesize; } void* MPmalloc(int size) { void* tbr; if (globalpage->size - globalpage->used < size) { initMP(size); } tbr = globalpage->memory+ globalpage->used; globalpage->used += size; return tbr; } void* MPallfree() { mpage *n; while (globalpage) { free (globalpage->memory); n = globalpage; globalpage = globalpage->next; free(n); } initMP(0); } void* MPrealloc(void* prevptr, int prevsize, int newsize) { void* tbr = MPmalloc(newsize); memcpy(tbr, prevptr, prevsize); // fprintf(stderr, "realloc returns %x instead of %x, (%d %d)\n", tbr, prevptr, prevsize, newsize); return tbr; } lagan20/src/mempage.h0000644000076500007650000000126310502337063015474 0ustar brudnobrudno00000000000000#define MINPAGESIZE 256 typdef struct MemoryPage { void* memory; int size; int used; struct MemoryPage* next; } mpage; mpage globalpage; void* initMP() { globalpage.memory = realloc (globalpage.memory, MINPAGESIZE); globalpage.used = 0; globalpage.size = MINPAGESIZE; } void* MPmalloc(int size) { void* tbr; while (globalpage.size - globalpage.used > size) globalpage.memory = realloc (globalpage.memory, (globalpage.size *=2)); tbr = &(globalpage.memory[globalpage.used]); globalpage.used += size; return tbr; } void* MPallfree() { globalpage.memory = realloc (globalpage.memory, MINPAGESIZE); globalpage.used = 0; globalpage.size = MINPAGESIZE; } lagan20/src/mlagan.c0000644000076500007650000006513610502546574015335 0ustar brudnobrudno00000000000000#include #include #include #include #include #include #include "skiplist.h" #include "multial.h" #include "filebuffer.h" #define VER_NUM "2.0" #define MIN2(x,y) ( (x) >= (y) ? (y) : (x) ) #define MAX2(x,y) ( (x) >= (y) ? 
(x) : (y) ) // Global variables static int nested = 0; static int postir = 0; static int lazy = 0; static int notree = 1; static int verbose = 0; static int numseqs = 0; static int itertimes = 1; static int cutoffmatch = 12; static int translate = 0; static int extend = 1; static int fastreject = 0; static int gapfreechunks = 0; static align *simaligns[MAX_SEQ]; static char* lagan_dir; static int hptrcomp (const void *p1, const void *p2) { int i = ((hptr*)p1)->number; int j = ((hptr*)p2)->number; int it = ((hptr*)p1)->isstart; int jt = ((hptr*)p2)->isstart; if (i > j) return (1); if (i < j) return (-1); if (it) return -1; else return 1; } void usage(void) { printf("mlagan seqfile_1 seqfile_2 [... seqfile_%d] [-parameters]\n\n", MAX_SEQ); printf("-nested : runs improvement in a nested fashion\n"); printf("-postir : incorporates the final improvement phase\n"); printf("-lazy : uses lazy mode\n"); printf("-translate : use translated anchors\n"); // printf("-ext : extend the anchors\n"); This is now default printf("-fastreject : use fast rejection (tuned for human/mouse or closer)\n"); // printf("-gfc : find gap free chunks as anchors\n"); This is currently broken printf("-verbose : give debug output\n"); printf("-tree \"(...)\" : runs with given phylogenetic tree\n"); printf("-out \"filename\": outputs to filename\n"); printf("-nucmatrixfile \"filename\": uses given substitution matrix instead of $LAGAN_DIR/nucmatrix.txt\n"); printf("-version : prints version info\n"); } seq* readfile(FILE* input) { int seqstart=0; int seqend=0; char* res = (char*) malloc(sizeof(char)*2); int ressize = 2, numread=1; //N at 1st letter char temp[256]; seq* myseq = (seq*) malloc(sizeof(seq)); char currchar; res[0] = 'N'; if (feof(input)) return 0; fgets(temp, 255, input); if (temp[0] != '>') { fprintf(stderr, "File is not in FASTA format!!\n"); exit(1); } myseq->name = (char*) malloc((strlen(temp))*sizeof(char)); strcpy(myseq->name, temp+1); *(strchr(myseq->name, '\n')) = 0; currchar = 
fgetc(input); while ((currchar != '>') && (currchar != EOF)) { if (!isspace(currchar)) { currchar = toupper(currchar); if (!strchr(alpha, currchar)) { fprintf(stderr, "Warning: %c converted to 'N'\n", currchar, alpha); currchar = 'N'; } res[numread++] = currchar; if (numread >= ressize) { res=(char*)realloc(res, sizeof(char)*(ressize*=2)); } } currchar = fgetc(input); } if (currchar == '>') ungetc(currchar, input); res[numread]=0; myseq->rptr = res; if (seqstart > 0) { res = &res[seqstart-1]; res[seqend-seqstart+1] = 0; numread = seqend-seqstart+1; } myseq->lets = res; myseq->numlets = numread; // printf("read: %d lets\n",numread); return myseq; } int starts_with(char *str, char *word) { int len; char *first_word; len = strlen(str); first_word = (char *)malloc((len + 1) * sizeof(char)); sscanf(str, "%s", first_word); return strcmp(word, first_word); } align* findAlignByName(align *aligns[], char *name) { int i=0; // printf("findAlignByName: %s\n", name); while(iseqs[0]->name, name)) { return(aligns[i]); } i++; } fprintf(stderr, "alignment not found for: %s", name); exit(2); return NULL; } int kk = 0; void printHLL(hll *myres) { fprintf(stderr, "into %d\n", ++kk); fflush(stderr); while(myres) { fprintf(stderr, "(%d %d)=(%d %d) %f\n", myres->seq1start, myres->seq1end, myres->seq2start, myres->seq2end, myres->score); fflush(stderr); myres=myres->next; } } hll* getAnchsFromFile(char *fname, FileBuffer f1, FileBuffer f2) { FILE *ancfile; hll *myres = 0, *tt = 0, *first = 0; char buff[256]; int i=0, j=0; // printf("getHLLFromNames: %s, %s\n", name1, name2); sprintf(buff, "%s.anchors", fname); ancfile=fopen(buff, "r"); if(ancfile==NULL) { fprintf(stderr, "anchor file not found:: %s.anchors\n", fname); exit(2); } while (!feof(ancfile)) { if (!fgets(buff, 256, ancfile)) { break; } tt = (hll*) malloc(sizeof(hll)); sscanf(buff, "(%d %d)=(%d %d) %f", &tt->seq1start, &tt->seq1end, &tt->seq2start, &tt->seq2end, &tt->score); tt->next = myres; i++; myres = tt; } if (fastreject) { 
f1->startpos = MAX2(f1->startpos, myres->seq1end); f2->startpos = MAX2(f2->startpos, myres->seq2end); for (tt = myres; tt->next->next; tt = tt->next) { j++; } f1->endpos = MIN2(f1->endpos, tt->next->seq1start); f2->endpos = MIN2(f2->endpos, tt->next->seq2start); // fprintf (stderr, "%d %d %d %d %d\n", j, f1->startpos, f1->endpos, f2->startpos, f2->endpos); myres = myres->next; tt->next = 0; } fprintf(stderr,"read %d anchs\n", i); fclose(ancfile); return myres; } hll* generateAnchors( FileBuffer a1, FileBuffer a2) { char buff[256]; char fname[80]; char *name1, *name2; char *endpnt; int diff1, diff2; align* temp; hll* res; char flip = 0; int retstat; name1 = strrchr (a1->filename, '/'); if (!name1) name1 = a1->filename; else name1++; name2 = strrchr (a2->filename, '/'); if (!name2) name2 = a2->filename; else name2++; endpnt = strchr ( name1, '.'); diff1 = (endpnt)? endpnt - name1: strlen(name1); endpnt = strchr ( name2, '.'); diff2 = (endpnt)? endpnt - name2: strlen(name2); strncpy (fname, name1, diff1); strncpy (fname+diff1, name2, diff2); fname[diff1+diff2] = 0; sprintf(buff, "%s/rechaos.pl %s %s -out %s.anchors %s %s %s %s %s\n", lagan_dir, a1->filename, a2->filename, fname, (extend ? "-ext" : ""), (translate ? "-translate" : ""), (fastreject ? "-fastreject" : ""), (gapfreechunks ? "-gfc" : ""), (lazy ? 
"-lazy" : "")); retstat = system(buff) >> 8; if (fastreject && (retstat == 3)) { return 0; } else if (retstat) { fprintf (stderr, "Error from rechaos\n"); exit (1); } res = getAnchsFromFile(fname, a1, a2); return res; } void printFASTASeq(FILE *outfile, seq *myseq) { int i; // printf("kva\n"); if (!outfile) outfile = stdout; fprintf(outfile, ">%s\n", myseq->name); // printf("kva2\n"); for(i=0; inumlets; i++) fprintf(outfile, "%c", myseq->rptr[i]); // printf("kva %d\n",i); fprintf(outfile, "\n"); if (outfile!=stdout) fclose(outfile); } hll* findBestChain(hptr* array, int arrsize) { sklst* skipper = makeSkLst(); sle* help; int i; hll* t; for (i = 0; i < arrsize; i++) { if (array[i].isstart) { help = SLfind(skipper, array[i].myhll->seq2start); if (help->myelem) { array[i].myhll->bkptr = help->myelem; array[i].myhll->scoreSoFar = ((hll*)help->myelem)->scoreSoFar + array[i].myhll->score; } else { array[i].myhll->bkptr = 0; array[i].myhll->scoreSoFar = array[i].myhll->score; } } else { help = SLfind(skipper, array[i].myhll->seq2end); if (help->myelem && (array[i].myhll->scoreSoFar <= ((hll*)help->myelem)->scoreSoFar)) continue; SLinsertAfter(skipper, help, array[i].myhll->seq2end, array[i].myhll); help = help->next[0]; while (help->next[0] && ((hll*)help->myelem)->scoreSoFar >= ((hll*)help->next[0]->myelem)->scoreSoFar) SLremove(skipper, help->next[0]); } } t= (hll*)SLgetLast(skipper)->myelem; delSkLst(skipper); return t; } hll* remakeHLL(hll* bestPtr) { int len; hll *res=0; hll *temp, *t2, *t3; int i, bestscore=-1; for (temp = bestPtr; temp; temp = temp->bkptr) { temp->next=res; temp->dirty = 1; res=temp; } return res; } hll* reanchorHLL(hll* mylist) { hll *temp, *best, *t2; int numhits=0, i=0; hptr* myptrs; temp=mylist; while (temp) { numhits++; temp->dirty = 1; temp=temp->next; } myptrs = (hptr*) malloc (sizeof(hptr) * numhits *2); for (temp = mylist; temp; temp = temp->next) { myptrs[i].number = temp->seq1start; myptrs[i].isstart = 1; myptrs[i].myhll = temp; 
myptrs[i+1].number = temp->seq1end; myptrs[i+1].isstart = 0; myptrs[i+1].myhll = temp; i = i+2; } qsort(myptrs, numhits*2, sizeof(hptr), hptrcomp); best = findBestChain(myptrs, numhits*2); temp=best; while (temp) { temp->dirty = 0; temp=temp->bkptr; } temp=mylist; while (temp) { t2 = temp; temp=temp->next; if (t2->dirty) free(t2); } best = remakeHLL(best); // printf("newbest\n"); // printHLL(best); free (myptrs); return best; } void orderAligns(align *a1, align *a2, align **first, align **second, int *index, int *hllindex) { int a1index, a2index; a1index = a1->index; a2index = a2->index; if (a1index > a2index) { *first = a2; *second = a1; *index = a2index; *hllindex = a1index; } else { *first = a1; *second = a2; *index = a1index; *hllindex = a2index; } } void doRemapHLLs(align *aligns[], align *uni, int *index, int hllindex) { int i, mapi, done=0; // take all hlls into first, and into the second and remap them for(mapi=*index; !done; mapi=hllindex) { for (i=0; ihlls[mapi] != NULL && i != *index) { // remap them into i // fprintf(stderr, "\n called1 %d %d(%d)\n", i, mapi, *index); aligns[i]->hlls[mapi] = remapHLLs(aligns[i]->hlls[mapi], 1, uni, (mapi!=*index)); } } for (i=mapi+1; ihlls[i] != NULL && i != hllindex) { // remap them into first or second // fprintf(stderr, "\n called2 %d %d(%d)\n", mapi, i,*index); aligns[mapi]->hlls[i] = remapHLLs(aligns[mapi]->hlls[i], 0, uni, (mapi!=*index)); } } if (mapi==hllindex) done=1; } // free memory? what's that? 
// aligns[*index] = result; // aligns[hllindex] = result; } void doReanchorHLLs(align *aligns[], int *index, int hllindex) { int i; // for each pair of hlls from (i to first) and (i to second) for(i=0; i<*index; i++) { aligns[i]->hlls[*index] = reanchorHLL(mergeHLLs(aligns[i]->hlls[*index], 0, aligns[i]->hlls[hllindex], 0)); // if (verbose) { // printf("aligns[%d]->hlls[%d]\n",i ,*index); // printHLL(aligns[i]->hlls[*index]); // } aligns[i]->hlls[hllindex] = 0; } for(i=*index+1; ihlls[i] = reanchorHLL(mergeHLLs(aligns[*index]->hlls[i], 0, aligns[i]->hlls[hllindex], 1)); // if (verbose) { // printf("aligns[%d]->hlls[%d]\n",*index ,i); // printHLL(aligns[*index]->hlls[i]); // } aligns[i]->hlls[hllindex] = 0; } for(i=hllindex+1; ihlls[i] = reanchorHLL(mergeHLLs(aligns[*index]->hlls[i], 0, aligns[hllindex]->hlls[i], 0)); // if (verbose) { // printf("aligns[%d]->hlls[%d]\n", *index, i); // printHLL(aligns[*index]->hlls[i]); // } aligns[hllindex]->hlls[i] = 0; } } align* processAlign(align *aligns[], align *a1, align *a2, int *index) { int hllindex; align *first, *second, *result, *uni; orderAligns(a1, a2, &first, &second, index, &hllindex); // if (verbose // printHLL(aligns[first->index]->hlls[hllindex]); result = makeAlign(first, second, aligns[first->index]->hlls[hllindex], &uni); result->index = *index; freeHLLs(aligns[first->index]->hlls[hllindex]); aligns[first->index]->hlls[hllindex] = 0; doRemapHLLs(aligns, uni, index, hllindex); doReanchorHLLs(aligns, index, hllindex); // if the constituent alignments were not simple alignments, free them freeAlign(uni); uni = 0; if (first->numseq > 1){ freeAlign(first); first = 0; } if (second->numseq > 1){ freeAlign(second); second = 0; } return(result); } align* iterativeImprovement (align *current, align *rpntree[], int length) { int converged = 0; int i=0, oldscore, cutoff; seq *removed; align *readd, *old, *new; hll* anchs, *tt; if (current->numseq <= 2) return current; // printf("iterative improvement!\n"); cutoff = 
cutoffmatch * 100; fprintf(stderr, "cutoff = %d\n", cutoff); while (!converged) { // Throw out a sequence. Calling code in multial. removed = current->seqs[0]; new = findAlignByName(simaligns, removed->name); old = current; anchs = getAnchsFromAlign(current, 0, cutoff); current = removeSeq(current, 0); free (old); // Re-align this thrown-out sequence to the remaining alignment. current = makeAlign (current, new, anchs, &old); if (verbose) { printf("improved:\n"); printHLL(anchs); printTextAlign(stdout, current); } while (anchs) { tt = anchs; anchs = anchs->next; free (tt); } free (old); i++; if (i==numseqs*itertimes) converged = 1; } return current; } int treeToRPN(char *treestr, align *stack[MAX_SEQ*2], int *depth) { int i=0; int j, k; char buffer[256]; while (treestr[i]!='(') { i++; } i++; while ((treestr[i] != ')') && (treestr[i] != '\0')) { // printf("%d: %s\n", *depth, treestr+i); if (treestr[i]=='(') { i += treeToRPN(treestr+i, stack, depth); } else if (isalnum(treestr[i])) { k = 0; // push alignment while((!isspace(treestr[i])) && (treestr[i]!='(') && (treestr[i]!=')')) { buffer[k++] = treestr[i++]; } buffer[k] = 0; stack[(*depth)++]=findAlignByName(simaligns, buffer); // printf("pushed: %s\n", stack[*depth-1]->seqs[0]->name); } else if (treestr[i]==')') // (*depth)++; break; else { i++; } } if (treestr[i]==')') { (*depth)++; //null is '+' return i+1; } if (treestr[i] == '\0') { fprintf(stderr, "ERROR parsing tree, depth %d, %d chars read", *depth, i); exit(1); } } align* procStack(align* rpntree[MAX_SEQ*2], int length, align *myaligns[]) { align* stack[MAX_SEQ]; int i = 0, sp = 0; int index=0; while (i < length) { if (rpntree[i]) { stack[sp++] = rpntree[i]; } else { stack[sp-2] = processAlign(myaligns, stack[sp-2], stack[sp-1], &index); stack[--sp] = 0; if(verbose) printTextAlign(stdout, stack[sp-1]); } if (nested) { iterativeImprovement(stack[sp-1], rpntree, i); } i++; } return stack[sp-1]; } char* buildTree (align *simalign[], float 
distances[MAX_SEQ][MAX_SEQ]) { char *names[MAX_SEQ]; int namelens[MAX_SEQ]; float max; int mli, mlj; int i, j; char *result, *temp; // fprintf (stderr, "into build\n"); for (i=0; i< numseqs; i++) { namelens[i] = strlen(simalign[i]->seqs[0]->name); names[i] = (char*) malloc ((namelens[i]+1) * sizeof (char)); sscanf (simalign[i]->seqs[0]->name,"%s",names[i]); } do { max = -1; for (i=0; i<(numseqs-1); i++) { for (j=i+1; j max) { max = distances[i][j]; mli = i; mlj = j; } } } if (max < 0) break; // fprintf (stderr, "join! %d %d (score %f)\n", mli, mlj, distances[mli][mlj]); temp = (char*) malloc ((namelens[mli] + namelens[mlj] +4)* sizeof(char)); sprintf(temp, "(%s %s)", names[mli], names[mlj]); // fprintf (stderr, "%d(%d)+%d(%d)+3=%d(really %d)\n", namelens[mli],strlen(names[mli]), // namelens[mlj], strlen(names[mlj]), strlen(temp), namelens[mli]+namelens[mlj]+3); // fprintf (stderr, "malloc gave %x\n", temp); // fprintf (stderr, "new = %s\n", temp); // fprintf (stderr, "done free1 %x\n", names[mli]); free (names[mli]); // fprintf (stderr, "done free2 %x\n", names[mlj]); free (names[mlj]); names[mlj] = 0; names[mli] = result = temp; namelens[mli] = namelens[mli] + namelens[mlj] + 3; distances[mli][mlj] = -1; // fprintf (stderr, "done concat\n"); for (i=0; i < mli; i++) { // fprintf (stderr, "h1\n"); if (distances[i][mli] >= 0) distances[i][mli] = (distances[i][mli] + distances[i][mlj]) / 2; distances[i][mlj] = -1; } for (i=mli+1; i < mlj; i++) { // fprintf (stderr, "h2\n"); if (distances[mli][i] >= 0) distances[mli][i] = (distances[mli][i] + distances[i][mlj]) / 2; distances[i][mlj] = -1; } for (i=mlj+1; i < numseqs; i++) { // fprintf (stderr, "h3\n"); if (distances[mli][i] >= 0) distances[mli][i] = (distances[mli][i] + distances[mlj][i]) / 2; distances[mlj][i] = -1; } // fprintf (stderr, "end of loop\n"); } while (max >= 0); for (i=0; i< numseqs; i++) { if (names[i] != result) free (names[i]); } fprintf (stderr, "We built the tree: \"%s\"\n", result); return result; 
} char* graphCollapsal (align *simaligns[]) { float distances[MAX_SEQ][MAX_SEQ]; int i, j; float sum = 0, length = 0; float score = 0, count = 0; hll* temp; for (i=0; i< MAX_SEQ; i++) for (j=0; j< MAX_SEQ; j++) distances[i][j] = -1; for (i=0; i<(numseqs-1); i++) { for (j=i+1; jhlls[j]; while (temp) { sum += temp->score; length += (temp->seq1end - temp->seq1start); score += temp->score/(temp->seq1end - temp->seq1start); count += 1; temp = temp->next; } if (count != 0 && sum > 0) { //distances[i][j] = score/count; distances[i][j] = sum/length; //MIN2(simaligns[i]->seqs[0]->numsiglets, simaligns[j]->seqs[0]->numsiglets); fprintf (stderr, "Similarity %s and %s = %f\n", simaligns[i]->seqs[0]->name, simaligns[j]->seqs[0]->name, distances[i][j]); } else distances[i][j] = 0; } } return buildTree (simaligns, distances); } int parseParameters(int argc, char** argv, FileBuffer *files, char **treestr) { int i=1; if (argc < 3) { if (argc == 2) if (!strcmp(argv[1], "-version") || !strcmp(argv[1], "-Version")) { fprintf(stderr, "MLAGAN version %s\n", VER_NUM); exit(0); } usage(); return 1; } while((argv[i][0]!='-')) { // Read in sequence files. 
// printf("sequence %d: %s\n", i, argv[i]); if (!(files[numseqs++] = FileOpen(argv[i]))) { fprintf(stderr, "couldnt open dbase file %s\n",argv[i]); usage(); return 2; } // seqs[numseqs] = FileRead(seqfile, 0, 0, VER_MLAGAN); // seqs[numseqs]->filename = argv[i]; // numseqs++; if(++i>=argc) break; } // printf("\n"); while (i=argc) || (argv[i][0]=='-')) { fprintf(stderr, "missing parameter specification for [-out].\n"); return 1; } fprintf(stderr, "outputting to: %s\n", argv[i]); outfile = fopen(argv[i], "w"); if (outfile==NULL) { fprintf(stderr, "error with output file...\n"); exit(2); } } if (!strcmp(argv[i], "-tree")) { i++; if ((i>=argc) || (argv[i][0]=='-')) { fprintf(stderr, "missing parameter specification for [-tree].\n"); return 1; } notree = 0; *treestr = argv[i]; fprintf(stderr, "using given phylogenetic tree:\n%s\n", *treestr); } if (!strcmp(argv[i], "-gapperseq")) { i++; if (i>=argc) { fprintf(stderr, "missing parameter specification for [-gapperseq].\n"); return 1; } gapperseq = atoi(argv[i]); fprintf(stderr, "using gapperseq score: %d\n", gapperseq); } if (!strcmp(argv[i], "-overlap")) { i++; if (i>=argc) { fprintf(stderr, "missing parameter specification for [-overlap].\n"); return 1; } overlap = atoi(argv[i]); fprintf(stderr, "using overlap value: %d\n", overlap); } if (!strcmp(argv[i], "-glwidth")) { i++; if (i>=argc) { fprintf(stderr, "missing parameter specification for [-glwidth].\n"); return 1; } glwidth = atoi(argv[i]); fprintf(stderr, "using glwidth value: %d\n", glwidth); } if (!strcmp(argv[i], "-nucmatrixfile")) { i++; if (i>=argc) { fprintf(stderr, "missing parameter specification for [-scorematrix.\n"); return 1; } nucmatrixfile = argv[i]; fprintf(stderr, "using nucmatrixfile value: %s\n", nucmatrixfile); } i++; } // setScores(gapstart, gapcont, gapend, gapperseq, overlap, glwidth); return 0; } hll* updateAnchorPos(hll* myhll, FileBuffer f1, FileBuffer f2) { hll *res, *temp, *prev=0; res = myhll; fprintf (stderr, "Updating anchs...\n"); 
for ( ; myhll; myhll = myhll->next) { myhll->seq1start -= (f1->startpos-1); myhll->seq1end -= (f1->startpos-1); myhll->seq2start -= (f2->startpos-1); myhll->seq2end -= (f2->startpos-1); } while (res && (res->seq1start < 0 || res->seq2start < 0)) { // fprintf (stderr, "first..\n"); temp = res; // fprintf(stderr, "Tossed %d %d(%d %d)\n", temp->seq1end, temp->seq2end, // f1->endpos, f2->endpos); res = res->next; free(temp); } temp = res; while (temp && temp->seq1end < (f1->endpos-f1->startpos) && temp->seq2end < (f2->endpos-f2->startpos)) { // fprintf (stderr, "second...\n"); // fprintf(stderr, "Kept %d %d(%d %d)\n", temp->seq1end, temp->seq2end, // f1->endpos-f1->startpos, f2->endpos-f2->startpos); prev = temp; temp = temp->next; } if (prev) { temp = prev; prev = prev->next; temp->next = 0; } else if (temp == res) { res = 0; } else { // fprintf (stderr, "returning %d\n", res); return res; } while ( prev ) { // fprintf (stderr, "third...\n"); // fprintf(stderr, "Tossed %d %d(%d %d)\n", temp->seq1end, temp->seq2end, // f1->endpos, f2->endpos); temp = prev; prev = prev->next; free(temp); } return res; } int connectedGraph(hll* graph[MAX_SEQ][MAX_SEQ], int numseqs) { int M[MAX_SEQ][MAX_SEQ]; int i, j, k; for (i = 0; i < numseqs - 1; i++){ for (j = i + 1; j < numseqs; j++){ M[i][j] = M[j][i] = (graph[i][j] != NULL); } } for (k = 0; k < numseqs; k++) for (i = 0; i < numseqs; i++) for (j = 0; j < numseqs; j++) if (M[i][k] && M[k][j]) M[i][j] = 1; k = 1; for (i = 0; k && i < numseqs; i++) k = M[0][i]; return k; } int main(int argc, char** argv) { FileBuffer seqfile; seq **seqs; int i = 1, j = 1, x, y; char command[256]; char *treestr = NULL; align *stack[MAX_SEQ*2]; align *final; align *myaligns[MAX_SEQ]; hll* table[MAX_SEQ][MAX_SEQ]; FileBuffer files[MAX_SEQ]; outfile = stdout; lagan_dir = getenv ("LAGAN_DIR"); if (!lagan_dir) { fprintf(stderr, "Environment variable LAGAN_DIR not set\n"); exit(1); } buildcache(); initLib(); seqs = (seq**) malloc((argc-1)*sizeof(seq*)); if 
(parseParameters(argc, argv, files, &treestr)) return 1; for (i=0; i<(numseqs-1); i++) { for (j=i+1; jstartpos > files[i]->endpos) { if (outfile != stdout) fclose (outfile); exit (0); } seqs[i] = FileRead(files[i], 1, 0, VER_MLAGAN); } else seqs[i] = FileRead(files[i], 0, 0, VER_MLAGAN); seqs[i]->index = i+1; myaligns[i]=simaligns[i]=mkSimAlign(seqs[i]); simaligns[i]->index = i; } // Find all pairwise anchors. for (i=0; i<(numseqs-1); i++) { for (j=i+1; jhlls[j]=table[i][j]; } } // printf("\n"); for (i=0; ihlls[%d].score=%g\n", i,j, simaligns[i]->hlls[j]==NULL ? 0 : simaligns[i]->hlls[j]->score); } } */ fprintf(stderr, "\n****************************\n"); fprintf(stderr, "gs: %d; ge: %d;\n", gapstart, gapend); fprintf(stderr, "gc: %d; gp: %d\n", gapcont, gapperseq); //fprintf(stderr, "match: %d; mismatch: %d\n", match, mismatch); fprintf(stderr, "overlap: %d; glwidth: %d\n", overlap, glwidth); fprintf(stderr, "\n****************************\n"); if (notree) { treestr = graphCollapsal(myaligns); } //REMOVE the next line once debugged!!! // exit(2); //End of remove i = 0; treeToRPN(treestr, stack, &i); final = procStack(stack, i, myaligns); if (postir) { final = iterativeImprovement(final, stack, i); } // Ouput end result. fprintf(stderr, "final alignment... \n"); if (fastreject) { printXMFAAlign(outfile, final); } else { printFASTAAlign(outfile, final); } if (outfile != stdout) fclose (outfile); fprintf(stderr, "mlagan -- end.\n"); return 0; } lagan20/src/multial.c0000644000076500007650000012262410502337063015530 0ustar brudnobrudno00000000000000#include #include #include #include #include #include #include "diagmatrix.h" #include "multial.h" #define INSERTION 1 #define DELETION 2 #define BOTH 3 #define MISMATCH_CUTOFF 8 #define ANCHOR_LENGTH_CUTOFF 10 #define ANCHOR_SCORE_CUTOFF 1500 #define MAX_SQ_SIZE (100 * (1 << 20)) #define BIG_SQ_WIDTH 20 #define CONS_FRAC 0.6 #define MIN2(x,y) ( (x) >= (y) ? (y) : (x) ) #define MAX2(x,y) ( (x) >= (y) ? 
(x) : (y) ) #define MAX3(x,y,z) MAX2(MAX2(x,y),z) #define MIN3(x,y,z) MIN2(MIN2(x,y),z) #define PROD(x,y) ( (x) * (y) ) #define WEQ2(x,y,a) (((x)==(a))? 0: ((y)==(a))? 1:-1) #define WEQ3(x,y,z,a) (((x)==(a))? 0: ((y)==(a))? 1: ((z)==(a))? 2:-1) char* alpha = "ATCG.N"; char* nucmatrixfile = 0; int s1start = 0; int s1end = 0; int s2start = 0; int s2end = 0; //int match = 18; //int mismatch = -8; int gapstart = -50; int gapend = -50; int gapcont = -5; int gapperseq = -1; int overlap = 0; int glwidth= 15; char dobin = 0; float factor, offset; int logs[MAX_SEQ*MAX_SEQ]; FILE* outfile; static int substmatrix[256][256]; static int matchcache[1 << 24], gapcache[1 << 24]; int *freed = 0, freedsize, freedcap; align **freedptr; int normf; int normprev; inline int ismatch(char a, char b) { return (a == b); } inline int isGap(align* ali, int seqn, int loc) { int i = !((ali->algn[loc] >> seqn) & 1); return i; } inline int scoreLocal(int which, align* ali, int loc) { int i, lets = 0; for (i=0; i < 4; i++) lets += ali->cnts[i][loc]; // printf ("which is %d lets is %d, cnts[w] is %d \n",which, lets, ali->cnts[which][loc]); if (which <4) return (ali->cnts[which][loc]-1) * 100 + (lets - ali->cnts[which][loc]) * -70 + ali->cnts[CNTS_GS][loc] * gapstart + ali->cnts[CNTS_GC][loc] * gapcont; if (which == CNTS_GS) return lets * gapstart; if (which == CNTS_GC) return lets+ali->cnts[CNTS_GS][loc] * gapcont; } inline hll* reverseHLL(hll* tbr) { hll *nn, *prev=0; while (tbr) { nn = tbr->next; tbr->next = prev; prev = tbr; tbr = nn; } return prev; } hll* getAnchsFromAlign(align* current, int seqnum, int cutoff) { int i=0, j, newj=0; int currscore=0, oldscore, peakscore; hll *res = 0, *temp = (hll*) malloc (sizeof(hll)); int which; long long int mask = ~(1<algnlen; j++) { if (!isGap(current, seqnum, j)) { ingap = 0; which = strchr(alpha, current->seqs[seqnum]->lets[i]) - alpha; which = (which>3)?CNTS_LEN:which; i++; } else { if (ingap) which = CNTS_GC; else { ingap = 1; which = CNTS_GS; } } 
currscore += scoreLocal(which, current, j); if (currscore > cutoff) { temp->score = currscore; temp->seq1end = newj; temp->seq2start = i; temp->seq2end = i; temp->seq1start = newj; currscore = 0; temp->next = res; res = temp;temp = (hll*) malloc (sizeof(hll)); assert (temp); } if (currscore < 0) currscore = 0; if (current->algn[j]&mask) newj++; } if (currscore > cutoff) { temp->score = currscore; temp->seq1end = newj; temp->seq2start = i; temp->seq2end = i; temp->seq1start = newj; temp->next = res; res = temp; } else free(temp); return reverseHLL(res); } int cons_cnt = 0; seq* mkConsensus(align* ali) { int i, j; seq* res = (seq*) malloc (sizeof(seq)); assert (res); res->name = (char*) malloc(sizeof(char)*64); assert (res->name); sprintf(res->name, "Consensus_%d", ++cons_cnt); res->numlets = ali->algnlen; res->rptr = res->lets = (char*) malloc (sizeof(char) * res->numlets); assert (res->lets); for (i=0; i< res->numlets; i++) { res->lets[i] = 'N'; for (j=0; j< 4; j++) { if (ali->cnts[j][i] >= ((float)ali->numseq) * CONS_FRAC) res->lets[i] = alpha[j]; } } return res; } inline void reverse (long long int* a, int length) { long long int lft; int i; for (i=0; i < length/2; i++) { lft = a[i]; a[i] = a[length-i-1]; a[length-i-1] = lft; } } align* unifyAlign(align* ali1, align* ali2, align* uni){ char *mat[MAX_SEQ]; int i,j,k, cbc, brcount; int s1 = 0, s2 = 0, tgs, tgc; align *res = (align*) malloc(sizeof(align)); assert (res); res->score = uni->score; res->numseq = ali1->numseq + ali2->numseq; res->algnlen = uni->algnlen; res->nextalign = 0; res->dirty = 0; // memory allocation and alignment creation res->algn = (long long int*) malloc ((res->algnlen+1) * sizeof (long long int)); assert (res->algn); res->algn[0] = 0; for (j = 0; j < CNTS_LEN; j++){ res->cnts[j] = (char*) malloc((res->algnlen+1) * sizeof(char)); assert (res->cnts[j]); } for (i=0; i<= res->algnlen; i++){ res->algn[i] = 0; for (j=0; jcnts[j][i] = 0; if (!isGap(uni, 0, i)) res->algn[i] |= ali1->algn[s1++]; if 
(!isGap(uni, 1, i)) res->algn[i] |= (ali2->algn[s2++] << ali1->numseq); } for (i = 0; i < res->numseq; i++){ res->seqs[i] = (i < ali1->numseq) ? ali1->seqs[i] : ali2->seqs[i - ali1->numseq]; mat[i] = (char *) malloc (sizeof (char) * (res->algnlen + 1)); assert (mat[i]); mat[i][0] = 0; for (j = 0, k = 0; j <= res->algnlen; j++) mat[i][j] = isGap (res, i, j) ? '-' : res->seqs[i]->lets[k++]; } s1 = s2 = 1; for (i=0; i<=res->algnlen; i++){ for (j = 0; j < res->numseq; j++){ switch (mat[j][i]){ case 'A': res->cnts[CNTS_A][i]++; if (i > 1 && mat[j][i-1] == '-') res->cnts[CNTS_GE][i]++; break; case 'T': res->cnts[CNTS_T][i]++; if (i > 1 && mat[j][i-1] == '-') res->cnts[CNTS_GE][i]++; break; case 'C': res->cnts[CNTS_C][i]++; if (i > 1 && mat[j][i-1] == '-') res->cnts[CNTS_GE][i]++; break; case 'G': res->cnts[CNTS_G][i]++; if (i > 1 && mat[j][i-1] == '-') res->cnts[CNTS_GE][i]++; break; case '-': if (i > 0 && mat[j][i-1] == '-') res->cnts[CNTS_GC][i]++; else res->cnts[CNTS_GS][i]++; break; } } } for (i = 0; i < res->numseq; i++) free (mat[i]); return res; } align* getChain(dmat* mydm, int x, int y, int j) { int temp; align *res = (align*) malloc (sizeof(align)), *help; long long int* almt = (long long int*) malloc ( sizeof(long long int)); int i=0, almtsize = 1, which, inrun = j; char zz = DMgetPtr(mydm, x, y); assert (res); assert (almt); for (i=0; icnts[i] = 0; i = 0; /////////////// res->dirty = 0; res->nextalign = 0; res->algn = 0; res->algnlen = 0; res->num = freedsize; freed[freedsize] = 0; freedptr[freedsize] = res; if (++freedsize == freedcap){ freedcap *= 2; freed = (int *) realloc (freed, sizeof (int) * freedcap); freedptr = (align **) realloc (freedptr, sizeof (align *) * freedcap); } do { // printf("I am at %d,%d %x\n", x,y, zz); which = zz & Mmask; if (which == 0x3) { help = DMgetNeck(mydm, x, y, inrun); if (!help) { if (i > 2) fprintf (stderr, "PROBLEM %d %d after %d (norm %d, %d)\n", x, y,i, normf, normprev); free(almt); res->algn = 0; res->algnlen = i; 
return res; } /* if (! help->nextalign) fprintf (stderr, "check %d %d after %d\n", x, y,i); */ help->dirty++; res->nextalign = help; break; } if (inrun == 1 && (zz & Nmask)) which = 1; else if (inrun == 2 && (zz & Omask)) which = 2; else which = 0; /* if (inrun == 1) { if (zz & Nmask) { which = 1; } } else if (inrun == 2) { if (zz & Omask) { which = 2; } } */ if (which == 0) { inrun = zz & Mmask; almt[i++] = BOTH; zz = DMgetPtr(mydm,--x,--y); } else if (which == 1) { /*N*/ inrun = 1; almt[i++] = INSERTION; zz = DMgetPtr(mydm, --x, y); } else if (which == 2) { inrun = 2; almt[i++] = DELETION; zz = DMgetPtr(mydm, x, --y); } else printf("a really dumb error %d\n", i); if (i >= almtsize) { almt = realloc (almt, sizeof(long long int)* (almtsize *= 2)); } // printf ("retrace %d %d after %d\n", x, y,i); } while (x > 0 && y > 0); reverse(almt, i); // fprintf(stderr, "getChain done at %d %d after %d\n", x , y , i); // printf("gotChain\n"); res->algn = almt; res->algnlen = i; // printf("done w it\n"); return res; } void saveNeck(dmat* mydm, int neckdiag) { int size1, size2, x1, x2, y1, y2; alel *first = DMgetDiagStart(mydm, neckdiag-1, &size1, &x1, &y1), *second = DMgetDiagStart(mydm, neckdiag, &size2, &x2, &y2); int i, j; align* a; // printf("saving neck %d\n", neckdiag); normprev = normf; normf = DMnextNecks(mydm, neckdiag); for (i=0; inextalign, *t; long long int* temp, *temp2; int totsize=0; int i =0; for (t = a; t; t = t->nextalign) { totsize += t->algnlen; i++; } temp = malloc ((totsize+1)*sizeof(long long int)); assert (temp); temp[totsize] = 0; temp2 = temp + totsize; totsize = 0; for (t=a; t; t = t->nextalign) { totsize += t->algnlen; memcpy(temp2-totsize, t->algn, t->algnlen*sizeof(long long int)); } free (a->algn); a->algn = temp; a->algnlen = totsize; a->nextalign = 0; /* for (a = a->nextalign; a;) { t = a; a = a->nextalign; freeAlign(t); } */ } inline int scoreGap(int numgs, int numgc, int numge, int numseq) { return (MIN2(numgc, numseq-numgc) * gapcont) + 
(MIN2(numgs, numseq-numgs) * gapstart) + (MIN2(numge, numseq-numge) * gapend); } void printcache(){ int a, b, c, d; for (a = 0; a < 3; a++){ for (b = 0; b < 3; b++){ for (c = 0; c < 3; c++){ for (d = 0; d < 3; d++){ fprintf (stderr, "%d %d %d %d -- %d\n", a, b, c, d, matchcache[a | (b << 6) | (c << 12) | (d << 18)]); } } } } } char getLetter (FILE *file){ char ch; while (!feof (file)){ ch = fgetc (file); if (!isspace (ch)){ // fprintf (stderr, "LETTER READ: \"%c\"\n", ch); return ch; } } assert (0); return 0; } int readit = 0; void readSubstMatrix (char *filename, int size, int substmatrix[256][256]){ FILE *file; char line[1024]; unsigned char *symbs, ch; int i, j, k; if (readit) return; readit = 1; if (!nucmatrixfile) { sprintf (line, "%s/%s", getenv ("LAGAN_DIR"), filename); file = fopen (line, "r"); assert (file); } else { file = fopen (nucmatrixfile, "r"); assert (file); } for (i = 0; i < 256; i++){ for (j = 0; j < 256; j++){ substmatrix[i][j] = 0; } } symbs = (unsigned char *) malloc (sizeof (unsigned char) * size); assert (symbs); for (i = 0; i < size; i++) symbs[i] = (unsigned char) getLetter (file); for (i = 0; i < size; i++){ ch = getLetter (file); assert (ch == symbs[i]); for (j = 0; j < size; j++){ fscanf (file, "%d", &k); // fprintf (stderr, "NUMBER READ: %d\n", k); substmatrix[(int) symbs[i]][(int) symbs[j]] = k; assert ((int) symbs[i] > 0); assert ((int) symbs[j] > 0); } } fscanf (file, "%d", &gapstart); fscanf (file, "%d", &gapcont); // fprintf (stderr, "GAP SCORES: %d %d\n", gapstart, gapcont); gapend = gapstart / 2; gapstart -= gapend; free (symbs); fclose (file); } inline int chmatchscore (unsigned char a, unsigned char b, int substmatrix[256][256]) { return substmatrix[a][b]; } void buildcache (){ int score, i, j; int gs, gc, ge, ns; char *lets = "ATCG"; int num[4]; int numseqs = MAX_SEQ; readSubstMatrix (NUC_FILE, NUC_FILE_SIZE, substmatrix); for (num[0] = 0; num[0] <= numseqs; num[0]++){ // A for (num[1] = 0; num[1] <= numseqs; num[1]++){ // T 
for (num[2] = 0; num[2] <= numseqs; num[2]++){ // C for (num[3] = 0; num[3] <= numseqs; num[3]++){ // G score = 0; for (i = 0; i < 4; i++){ score += num[i] * (num[i] - 1) / 2 * chmatchscore ((unsigned char)lets[i], (unsigned char)lets[i], substmatrix); for (j = i + 1; j < 4; j++){ score += num[i] * num[j] * chmatchscore ((unsigned char) lets[i], (unsigned char) lets[j], substmatrix); } } matchcache[num[0] | (num[1] << 6) | (num[2] << 12) | (num[3] << 18)] = score; } } } } for (gs = 0; gs <= numseqs; gs++){ for (gc = 0; gc <= numseqs; gc++){ for (ge = 0; ge <= numseqs; ge++){ for (ns = 0; ns <= numseqs; ns++){ gapcache[gs | (gc << 6) | (ge << 12) | (ns << 18)] = scoreGap (gs, gc, ge, ns); } } } } // builtcache = 1; // printcache(); } inline int v (int y){ if (y >= 0 && y <= MAX_SEQ) return y; fprintf(stderr, "Got %d in v\n", y); assert (0); return 0; } inline int matchscore (align*a, int ai, align *b, int bi){ return matchcache[v(a->cnts[0][ai] + b->cnts[0][bi]) | (v(a->cnts[1][ai] + b->cnts[1][bi]) << 6) | (v(a->cnts[2][ai] + b->cnts[2][bi]) << 12) | (v(a->cnts[3][ai] + b->cnts[3][bi]) << 18)] + gapcache[v(a->cnts[CNTS_GS][ai] + b->cnts[CNTS_GS][bi]) | (v(a->cnts[CNTS_GC][ai] + b->cnts[CNTS_GC][bi]) << 6) | (v(a->cnts[CNTS_GE][ai] + b->cnts[CNTS_GE][bi]) << 12) | (v(a->numseq + b->numseq - (a->cnts[CNTS_CB][ai] + b->cnts[CNTS_CB][bi])) << 18)]; } inline int scoreOpp (align *other, int ow, int oppnum){ return matchcache[v(other->cnts[0][ow]) | (v(other->cnts[1][ow]) << 6) | (v(other->cnts[2][ow]) << 12) | (v(other->cnts[3][ow]) << 18)]; } inline int endGap0 (align* a, int ai, align* b, int bi){ return gapcache[(v(a->cnts[CNTS_GE][ai]+b->cnts[CNTS_GE][bi])<<12) | (v(a->numseq + b->numseq-(b->cnts[CNTS_CB][bi]+a->cnts[CNTS_CB][ai])) << 18)]; } inline int endGap1 (align* a, int ai, align* b, int bi){ return gapcache[(v((b->numseq - b->cnts[CNTS_GS][bi] - b->cnts[CNTS_GC][bi]) + a->cnts[CNTS_GE][ai]) << 12) | (v(a->numseq + b->numseq - 
(b->cnts[CNTS_CB][bi]+a->cnts[CNTS_CB][ai])) << 18)]; } inline int endGap2 (align* a, int ai, align* b, int bi){ return gapcache[(v((a->numseq - a->cnts[CNTS_GS][ai] - a->cnts[CNTS_GC][ai]) + b->cnts[CNTS_GE][bi])<<12) | (v(a->numseq + b->numseq - (b->cnts[CNTS_CB][bi]+a->cnts[CNTS_CB][ai])) << 18)]; } inline int contGap(align* ali, int myw, align* other, int ow, int *sopp) { return gapcache[(v(other->cnts[CNTS_GS][ow])) | (v(ali->numseq + other->cnts[CNTS_GC][ow]) << 6) | (v(other->cnts[CNTS_GE][ow]) << 12) | (v(ali->numseq + other->numseq - (ali->cnts[CNTS_CB][myw] + other->cnts[CNTS_CB][ow])) << 18)] + sopp[ow]; } inline int openGap(align* ali, int w, align* other, int ow, int *sopp, char *desc) { int alopen, pen, sav, i; alopen = ali->cnts[CNTS_GC][w] + ali->cnts[CNTS_GE][w]; /** * Watch out for running off end of array. */ // if (w < ali->algnlen) alopen += ali->cnts[CNTS_GS][w+1]; sav = gapcache[(v(ali->numseq - (alopen + ali->cnts[CNTS_CB][w]) + other->cnts[CNTS_GS][ow])) | (v(alopen + other->cnts[CNTS_GC][ow]) << 6) | (v(other->cnts[CNTS_GE][ow]) << 12) | (v(ali->numseq+other->numseq - (ali->cnts[CNTS_CB][w]+other->cnts[CNTS_CB][ow])) << 18)]; return sav; } void mkBarrel(int s1, int s2, int e1, int e2, int width, int *dn, int dt, int* starts, int *ends, dmat* mydm) { int sd = s1+s2-1, dlen; int elem = (sd < mydm->d2)? 
s1: mydm->d2-s2; int incr; double fl = 0; double slope = (double)(e2-s2)/(double)(e1-s1); double cloc = elem; if ((e2-s2 == 0) && (e1-s1 == 0)) slope = 1; else if (e1-s1 == 0) slope = 100000; // // printf("dt = %d\n", dt); // printf("BA: %d, %d to %d, %d %f\n", s1,s2,e1,e2,slope); for ( ; sd <(*dn); sd++) { if (fl>=slope || (int)(cloc) == (int)(cloc+slope)) { cloc+=slope; fl -= slope; } else { elem--; fl++; } if (sd <= mydm->d2) elem++; } fl = 0; for ( ; *dn < dt; (*dn)++) { // // printf("dn =%d ", *dn); if (fl>=slope || (int)(cloc) == (int)(cloc+slope)) { cloc+=slope; fl -= slope; } else { elem -=1; fl++; } if (*dn <= mydm->d2) elem++; if (*dn < MIN2(mydm->d2, mydm->d1)) dlen = *dn; else if (*dn < MAX2(mydm->d2, mydm->d1)) dlen = MIN2(mydm->d2, mydm->d1); else dlen = mydm->d2 + mydm->d1 - *dn; starts[*dn] = MAX2(elem - width, 0); ends[*dn] = MIN2(elem+width, dlen-1); } } void mkSquare(int s1, int s2, int e1, int e2, int *dn, int dt, int* starts, int *ends, dmat* mydm) { int dists[2], dlen; long long int size = ((long long int)e1-(long long int)s1) * ((long long int)e2-(long long int)s2); int dn2; int eval, sval; if (size > MAX_SQ_SIZE) { fprintf (stderr, "SQUARE TOO BIG: %d,%d to %d,%d\n", s1, e1,s2,e2); mkSquare(s1, s2, (s1+e1)/2+glwidth, (s2+e2)/2+glwidth, dn, (*dn+dt)/2, starts, ends, mydm); mkSquare((s1+e1)/2-glwidth, (s2+e2)/2-glwidth, e1, e2, dn, dt, starts, ends, mydm); return; } // // printf("dt = %d\n", dt); // // printf("SQ: %d, %d to %d, %d\n", s1,s2,e1,e2); // fill in part before square dn2 = *dn - 1; while (1){ if (dn2 < mydm->d2) { dists[0] = s1-1; dists[1] = dn2 - e2; } else { dists[0] = mydm->d2 - e2; dists[1] = s1 - (dn2 - mydm->d2)-1; } starts[dn2] = MIN2(starts[dn2], sval = MAX3(dists[0], dists[1],0)); if (dn2 < mydm->d2) { dists[0] = e1-1; dists[1] = dn2 - s2; } else { dists[0] = mydm->d2 - s2; dists[1] = e1 - (dn2-mydm->d2)-1; } if (dn2 < MIN2(mydm->d2, mydm->d1)) dlen = dn2; else if (dn2 < MAX2(mydm->d2, mydm->d1)) dlen = MIN2(mydm->d2, 
mydm->d1); else dlen = mydm->d2 + mydm->d1 - dn2; ends[dn2] = MAX2(ends[dn2], eval = MIN3(dists[0], dists[1],dlen-1)); if (eval - sval <= 5) break; // break after fill in dn2--; } for ( ; *dn < dt; (*dn)++) { // // printf("square dn = %d\n", *dn); if (*dn < mydm->d2) { dists[0] = s1-1; dists[1] = *dn - e2; } else { dists[0] = mydm->d2 - e2; dists[1] = s1 - (*dn - mydm->d2)-1; } starts[*dn] = MAX3(dists[0], dists[1],0); if (*dn < mydm->d2) { dists[0] = e1-1; dists[1] = *dn - s2; } else { dists[0] = mydm->d2 - s2; dists[1] = e1 - (*dn-mydm->d2)-1; } if (*dn < MIN2(mydm->d2, mydm->d1)) dlen = *dn; else if (*dn < MAX2(mydm->d2, mydm->d1)) dlen = MIN2(mydm->d2, mydm->d1); else dlen = mydm->d2 + mydm->d1 - *dn; ends[*dn] = MIN3(dists[0], dists[1],dlen-1); } } void doShapes(hll* myres, dmat* mydm, int* starts, int *ends) { int p1=MAX2(overlap,glwidth)+1, p2=MAX2(overlap,glwidth)+1; int t1, t2; int dn = 1, dt; int width = glwidth; while (myres) { while (1){ if (!myres || (myres->seq1start >= 1 && myres->seq2start >= 1 && myres->seq1end >= 1 && myres->seq2end >= 1 && myres->seq1start < mydm->d1 && myres->seq2start < mydm->d2 && myres->seq1start < myres->seq1end && myres->seq2start < myres->seq2end && myres->seq1end < mydm->d1 && myres->seq2end < mydm->d2 && abs((myres->seq1end-myres->seq1start) - (myres->seq2end-myres->seq2start)) <= MISMATCH_CUTOFF)) break; myres = myres->next; } if (!myres) break; /* printf("--> (%d %d)=(%d %d)\n", myres->seq1start, myres->seq1end, myres->seq2start, myres->seq2end); */ t1 = myres->seq1start; /* between hits */ t2 = myres->seq2start; dt = t1 + t2 - 1 + overlap; mkSquare(p1-MAX2(overlap, width), p2-MAX2(overlap, width), t1+MAX2(overlap, width), t2+MAX2(overlap, width), &dn, dt, starts, ends, mydm); p1 = myres->seq1end; /* within a hit */ p2 = myres->seq2end; dt = p1 + p2 - 1 - overlap; mkBarrel(t1, t2, p1, p2, width, &dn, dt, starts, ends, mydm); myres = myres->next; } t1 = mydm->d1; t2 = mydm->d2; dt = t1 + t2; 
mkSquare(p1-MAX2(overlap,width), p2-MAX2(overlap,width), t1, t2, &dn, dt, starts, ends, mydm); } void doAncs(dmat* mydm, align* ali1, align* ali2, hll* ancs) { int *starts, *ends; starts = (int*) malloc(sizeof(int)*(ali1->algnlen + ali2->algnlen+2)); assert (starts); ends = (int*) malloc(sizeof(int)*(ali1->algnlen + ali2->algnlen+2)); assert (ends); doShapes(ancs, mydm, starts, ends); DMinitDiag(mydm, starts,ends); free(starts); free(ends); } align* doNW(dmat* mydm, align* ali1, align* ali2) { int i, j; int x, y, size; int gapstartN = 0, gapstartO = 0; int gapcontN, gapcontO; int gapend[3]; int tt, prevgap; alel *curr, *pasts0, *pasts1, *pasts2; align* a, *b; char rh, ptr=0, isneck; int ndiags = mydm->d1 + mydm->d2 -1; int *sopp1, *sopp2; int numNecks =0, oldneck =0; register int s1, s2, s3, z1, z2,z3; // int M[20][20][6]; isneck = DMnextDiag(mydm); curr = DMgetDiagStart(mydm, 1, &size, &x, &y); curr->N = curr->O = 0; curr->M = 0; DMsetPtr(mydm, 0, 1, 1); buildcache(); sopp1 = (int*) malloc (sizeof (int) * (ali1->algnlen+1)); sopp2 = (int*) malloc (sizeof (int) * (ali2->algnlen+1)); assert (sopp1); assert (sopp2); for (i = 0; i < ali1->algnlen; i++) sopp1[i] = scoreOpp (ali1, i, 0); for (i = 0; i < ali2->algnlen; i++) sopp2[i] = scoreOpp (ali2, i, 0); /*fprintf (stderr, "Checking diagonals...\n"); for (i = ndiags - 50; i <= ndiags; i++){ DMgetDiagStart (mydm, i, &size, &x, &y); */ // fprintf (stderr, "ndiag = %d (%d %d)\n", ndiags, ali1->algnlen, ali2->algnlen); for (i = 2; i <= ndiags; i++) { isneck = DMnextDiag(mydm); if (!(i%10000)) fprintf(stderr, "WORKING %d/%d\n", i/10000,ndiags/10000 ); curr = DMgetDiagStart(mydm, i, &size, &x, &y); pasts2 = DMgetElem(mydm, x-1, y); pasts1 = DMgetElem(mydm, x-1, y-1); for (j = 0; j < size; j++) { gapstartN = openGap(ali2, y, ali1, x, sopp1, "gapstartN"); gapstartO = openGap(ali1, x, ali2, y, sopp2, "gapstartO"); gapcontN = contGap(ali2, y, ali1, x-1, sopp1); gapcontO = contGap(ali1, x, ali2, y-1, sopp2); pasts0 = pasts2; 
pasts2 = DMgetElem2(mydm, x, y-1, pasts2); curr->M = matchscore (ali1, x - 1, ali2, y - 1); z1 = pasts1->M + endGap0 (ali1, x - 1, ali2, y - 1); z2 = pasts1->N + endGap1 (ali1, x - 1, ali2, y - 1); z3 = pasts1->O + endGap2 (ali1, x - 1, ali2, y - 1); if (z1 >= z2){ if (z1 >= z3){ curr->M += z1; ptr = 0; }// + endGap0 (ali1, x - 0, ali2, y - 0); } else { curr->M += z3; ptr = 2; }// + endGap2 (ali1, x - 0, ali2, y - 0); } } else { if (z2 >= z3){ curr->M += z2; ptr = 1; } // + endGap1 (ali1, x - 0, ali2, y - 0); } else { curr->M += z3; ptr = 2; } // + endGap2 (ali1, x - 0, ali2, y - 0); } } s2 = pasts0->N + gapcontN; s3 = pasts2->O + gapcontO; s1 = curr->M + gapstartN; if (s1 >= s2){ curr->N = s1; } else { curr->N = s2; ptr |= 4; } s1 = curr->M + gapstartO; if (s1 >= s3){ curr->O = s1; } else { curr->O = s3; ptr |= 8; } DMsetPtr(mydm, ptr, x, y); curr++; x++; y--; pasts1 = DMgetElem2(mydm, x-1, y-1, pasts1); } if (isneck) { numNecks++; saveNeck(mydm, i); oldneck = i; } } free (sopp1); free (sopp2); mydm->currneck++; a = getChain(mydm, mydm->d1, mydm->d2, 0); curr--; a->score = MAX3(curr->M, curr->N, curr->O); freed[a->num] = 1; joinAligns(a); // fprintf(stderr, "done NW\n"); return a; } align* makeAlign(align* ali1, align* ali2, hll* anchors, align **uni) { align *res; dmat* mydm; int numseq = ali1->numseq + ali2->numseq, i; int oldgapstart = gapstart, oldgapcont = gapcont, oldgapend = gapend; mydm = makeDM(ali1->algnlen, ali2->algnlen); gapstart *= (numseq-1); gapend *= (numseq-1); gapcont *= (numseq-1); fprintf (stderr, "gs ge gc %d %d %d\n", gapstart, gapend, gapcont); // initEntropy(ali1, ali2); doAncs(mydm, ali1, ali2, anchors); freedsize = 0; freedcap = 1; freed = (int *) malloc (sizeof (int) * freedcap); freedptr = (align **) malloc (sizeof (align *) * freedcap); assert (freed); assert (freedptr); *uni = doNW(mydm, ali1, ali2); res = unifyAlign(ali1, ali2, *uni); // printf("firstlen = %d, seclen = %d, relen = %d\n", ali1->algnlen, ali2->algnlen, res->algnlen); 
freeDM(mydm); // fprintf(stderr, "Final freeing\n"); for (i = freedsize-1; i >= 0; i--){ if (!freed[i]){ freeAlign (freedptr[i]); freedptr[i] = 0; } } // fprintf(stderr, "Final freeing done\n"); free (freed); free (freedptr); freed = 0; gapstart = oldgapstart; gapend = oldgapend; gapcont = oldgapcont; return res; } align* mkSimAlign(seq* seq1) { int i,j,k,oldk=-1; align* res = (align*) malloc( sizeof(align)); assert (res); res->score = 0; res->nextalign = 0; res->dirty = 0; res->numseq = 1; res->algnlen = seq1->numlets; res->seqs[0] = seq1; /** * Evidence that you need one more character. */ res->algn = (long long int*) malloc((res->algnlen+1) * sizeof(long long int)); assert (res->algn); for (j=0; jcnts[j] = (char*) malloc((res->algnlen+1) * sizeof(char)); assert (res->cnts[j]); } for (i=0; i< res->algnlen;i++) { for (j=0; jcnts[j][i] = 0; res->algn[i] = 1; k=strchr(alpha,seq1->lets[i])-alpha; if (k<5) res->cnts[k][i]++; if (oldk == 4) res->cnts[4][i]++; oldk = k; } for (j=0; jcnts[j][i] = 0; res->algn[i] = 0; return res; } align* removeSeq(align* ali, int seqnum) { int i,j, k, n, p, bit = (1 << seqnum); int mask = bit - 1, resint, flag = 0; align* res = (align*) malloc(sizeof(align)); res->score = 0; res->numseq = ali->numseq-1; for (i=0; i< seqnum; i++) res->seqs[i] = ali->seqs[i]; for (i++; i< ali->numseq; i++) res->seqs[i-1] = ali->seqs[i]; res->algn = (long long int*) malloc(ali->algnlen * sizeof(long long int)); for (j=0; jcnts[j] = (char*) malloc(ali->algnlen * sizeof(char)); for (i=0, j=0, n=0; i < ali->algnlen; i++) { resint = (ali->algn[i] & mask) | ((ali->algn[i] & ~(mask|bit)) >> 1); if (resint) { for (k=0; kcnts[k][j] = ali->cnts[k][i]; res->algn[j] = resint; if (!isGap(ali, seqnum, i)) { k=strchr(alpha,ali->seqs[seqnum]->lets[n])-alpha; if (k<5) res->cnts[k][j]--; if (i && isGap(ali, seqnum, i-1)) res->cnts[CNTS_GE][j]--; n++; } else { if (i && isGap(ali, seqnum, i-1)) res->cnts[CNTS_GC][j]--; else res->cnts[CNTS_GS][j]--; } if (flag) { 
res->cnts[CNTS_GS][j] = 0; res->cnts[CNTS_GC][j] = 0; res->cnts[CNTS_GE][j] = 0; for (p = 0; p < res->numseq; p++) { if (j<=1 || isGap(res, p, j-1)) { if (!isGap(res, p, j)) res->cnts[CNTS_GE][j]++; else res->cnts[CNTS_GC][j]++; } else { if (j && isGap(res, p, j)) res->cnts[CNTS_GS][j]++; } } } j++; } else { n++; flag = 1;} } res->algnlen = j; for (i=0; icnts[i][j] = 0; // printf("%d squished to %d\n", ali->algnlen, res->algnlen); return res; } align* removeSeqByName(align* ali, char *name) { int i=0; seq *removed; while (strcmp(ali->seqs[i]->name, name)) { i++; } removed = ali->seqs[i]; removeSeq(ali, i); } int getSeqNum(align* ali, seq* trgt) { int i=0; seq *removed; while (ali->seqs[i] != trgt) { i++; } return i; } void swapHLL(hll* h1) { int i, j; while(h1) { i=h1->seq1start; j=h1->seq1end; h1->seq1start=h1->seq2start; h1->seq1end=h1->seq2end; h1->seq2start=i; h1->seq2end=j; h1=h1->next; } } int countpos (align* aln, int seqnum){ int i, j = 0; for (i = 0; i < aln->algnlen; i++){ if (!isGap (aln, seqnum, i)) j++; } return j; } hll* remapHLLs(hll* anchs, int which, align* aln, int seqnum) { int mybp, i, *searchint, stmybp, mylen, olen, osize; hll *wlist = anchs, *temp, *prev; float scale; char isfrst=1; // fprintf (stderr, "which=%d\n", which); // // fprintf (stderr, "This is a list of the entries before going into remapHLLs:\n"); // printHLL (anchs); if (!anchs) return anchs; mylen = countpos (aln, seqnum); // olen = countpos (aln, !seqnum); // fprintf (stderr, "Here is some information about the alignment:\n"); // fprintf (stderr, " alignment length = %d\n", aln->algnlen); // fprintf (stderr, " number of positions in sequence to remap = %d\n", mylen); // fprintf (stderr, " number of positions in other sequence = %d\n", olen); prev = NULL; for (temp = wlist; temp; temp = temp->next){ if (temp->seq1start < 1) temp->seq1start = 1; if (temp->seq2start < 1) temp->seq2start = 1; if (!which && temp->seq1end > mylen) temp->seq1end = mylen; else if (which && 
temp->seq2end > mylen) temp->seq2end = mylen; if (temp->seq1start > temp->seq1end) { fprintf(stderr, "1 (%d %d)(%d %d)", temp->seq1start, temp->seq1end, temp->seq2start, temp->seq2end); assert(0); } if (temp->seq2start > temp->seq2end) { fprintf(stderr, "2 (%d %d)(%d %d)", temp->seq1start, temp->seq1end, temp->seq2start, temp->seq2end); assert(0); } } wlist = (hll*)malloc(sizeof(hll)); assert (wlist); wlist->next = anchs; prev = wlist; mybp = stmybp = 0; searchint = (!which)?&(anchs->seq1start):&(anchs->seq2start); for (i=1; i<=aln->algnlen; i++) { if (isGap(aln,seqnum,i)){ if (isfrst) continue; scale = (!which) ? ((anchs->seq1end == stmybp) ? 0 : (float)(mybp - stmybp) / (float)(anchs->seq1end - stmybp)) : ((anchs->seq2end == stmybp) ? 0 : (float)(mybp - stmybp) / (float)(anchs->seq2end - stmybp)); osize = (!which) ? (int)((anchs->seq2end - anchs->seq2start) * scale) : (int)((anchs->seq1end - anchs->seq1start) * scale); assert (osize >= 0); if (//mybp - stmybp < ANCHOR_LENGTH_CUTOFF || osize < ANCHOR_LENGTH_CUTOFF || anchs->score * scale < ANCHOR_SCORE_CUTOFF){ // fprintf (stderr, "1. The region from %d to %d was cropped.\n", stmybp, mybp); if (!which){ anchs->score -= anchs->score * scale; anchs->seq1start = mybp+1; anchs->seq2start = anchs->seq2start + osize + 1; isfrst = 1; searchint = &(anchs->seq1start); } else { anchs->score -= anchs->score * scale; anchs->seq1start = anchs->seq1start + osize + 1; anchs->seq2start = mybp+1; isfrst = 1; searchint = &(anchs->seq2start); } if (anchs->seq1start >= anchs->seq1end || anchs->seq2start >= anchs->seq2end){ // fprintf (stderr, "6. 
The region from %d to %d was thrown away.\n", stmybp, mybp); temp = anchs; prev->next = anchs->next; anchs = anchs->next; free (temp); if (!anchs) break; searchint = (!which)?&(anchs->seq1start):&(anchs->seq2start); } continue; } temp = (hll*) malloc(sizeof(hll)); assert (temp); temp->next = anchs->next; anchs->next = temp; temp->seq1end = anchs->seq1end; temp->seq2end = anchs->seq2end; // fprintf (stderr, "2. A new region from %d to %d was created.\n", stmybp, mybp); //fprintf (stderr, "Currently looking at (%d %d)=(%d %d)\n", anchs->seq1start, anchs->seq1end, anchs->seq2start, anchs->seq2end); if (!which){ temp->score = anchs->score * scale; anchs->score -= temp->score; anchs->seq1end = i; anchs->seq2end = anchs->seq2start + osize; temp->seq1start = mybp+1; temp->seq2start = anchs->seq2end + 1; isfrst = 1; searchint=&(temp->seq1start); } else { temp->score = anchs->score * scale; anchs->score -= temp->score; anchs->seq1end = anchs->seq1start + osize; anchs->seq2end = i; temp->seq1start = anchs->seq1end + 1; temp->seq2start = mybp+1; isfrst = 1; searchint=&(temp->seq2start); } assert (anchs->seq1start <= anchs->seq1end); assert (anchs->seq2start <= anchs->seq2end); prev = anchs; anchs = temp; if (anchs->seq1start >= anchs->seq1end || anchs->seq2start >= anchs->seq2end){ // fprintf (stderr, "5. 
The region from %d to %d was thrown away.\n", stmybp, mybp); temp = anchs; prev->next = anchs->next; anchs = anchs->next; free (temp); if (!anchs) break; searchint = (!which)?&(anchs->seq1start):&(anchs->seq2start); } // fprintf (stderr, "Now, I am looking for %d, isfrst=%d (%d %d).\n", *searchint, isfrst, temp->seq1start, temp->seq1end); // fprintf (stderr, "Currently, we are position %d in the sequence.\n", mybp); continue; } mybp++; if (mybp==*searchint){ if (isfrst) { *searchint = i; searchint = (!which)?&(anchs->seq1end):&(anchs->seq2end); stmybp = mybp; isfrst = !isfrst; // fprintf (stderr, "2) Now, I am looking for %d, isfrst=%d.\n", *searchint, isfrst); // fprintf (stderr, "Currently, we are position %d in the sequence.\n", mybp); } } if (mybp==*searchint){ if (!isfrst){ *searchint = i; assert (anchs->seq1start <= anchs->seq1end); assert (anchs->seq2start <= anchs->seq2end); if (which == 0 && anchs->seq1end - anchs->seq1start < ANCHOR_LENGTH_CUTOFF || which == 1 && anchs->seq2end - anchs->seq2start < ANCHOR_LENGTH_CUTOFF){ // fprintf (stderr, "4. The region from %d to %d was thrown away.\n", stmybp, mybp); temp = anchs; prev->next = anchs->next; anchs = anchs->next; free (temp); } else { // fprintf (stderr, "3. 
The region from %d to %d was saved.\n", stmybp, mybp); prev = anchs; anchs = anchs->next; } if (!anchs) break; searchint = (!which)?&(anchs->seq1start):&(anchs->seq2start); isfrst = !isfrst; // fprintf (stderr, "Now, I am looking for %d, isfrst=%d.\n", *searchint, isfrst); // fprintf (stderr, "Currently, we are position %d in the sequence.\n", mybp); } } } // fprintf (stderr, "By the end, I have reached mybp=%d, stmybp=%d.\n", mybp, stmybp); // fprintf (stderr, " number of positions in sequence to remap = %d\n", mylen); // fprintf (stderr, " number of positions in other sequence = %d\n", olen); temp = wlist; wlist = wlist->next; free (temp); for (temp = wlist; temp; temp = temp->next){ // fprintf (stderr, "(%d %d)=(%d %d) %f\n", temp->seq1start, temp->seq1end, temp->seq2start, temp->seq2end, temp->score); assert (temp->seq1start <= temp->seq1end); assert (temp->seq2start <= temp->seq2end); assert (temp->seq1start >= 0); assert (temp->seq2start >= 0); assert (temp->seq1end >= 0); assert (temp->seq2end >= 0); } return wlist; } int hllIntersection(hll *h1, hll *h2) { int i, j; int r1, r2; if (!h1 || !h2) return 0; i=MAX2(h1->seq1start, h2->seq1start); j=MIN2(h1->seq1end, h2->seq1end); r1 = ((iseq2start, h2->seq2start); j=MIN2(h1->seq2end, h2->seq2end); r2 = ((iseq1end - h2->seq1start, h2->seq2end - h2->seq2start); if (!h2) return MAX2(h1->seq1end - h1->seq1start, h1->seq2end - h1->seq2start); i=MIN2(h1->seq1start, h2->seq1start); j=MAX2(h1->seq1end, h2->seq1end); r1 = ((iseq2start, h2->seq2start); j=MAX2(h1->seq2end, h2->seq2end); r2 = ((iseq1start=MIN2(h1->seq1start, h2->seq1start); res->seq1end=MAX2(h1->seq1end, h2->seq1end); res->seq2start=MIN2(h1->seq2start, h2->seq2start); res->seq2end=MAX2(h1->seq2end, h2->seq2end); res->score = score; return res; } int minHLL(hll *h1, hll *h2){ int i, j; i=MIN2(h1->seq1end, h2->seq1end); return (i==h2->seq1end); } float scoreMerge(hll* h1, hll *h2) { float i, u; i = hllIntersection(h1, h2); u = hllUnion(h1, h2); return 
(h1->score + h2->score)*(i/u); } void printSeqsNames(align *a) { int i; printf("( "); for (i=0; inumseq; i++) { printf("%s ", a->seqs[i]->name); } printf(")\n"); } void printMyHLL(hll *myres) { /* while(myres) { printf("***: (%d %d)=(%d %d)\n", myres->seq1start, myres->seq1end, myres->seq2start, myres->seq2end); myres=myres->next; } */ } hll* mergeHLLs(hll* anchs1, int wh1, hll* anchs2, int wh2) { int i, j, mscore; hll* res=0, *temp; if(wh1) swapHLL(anchs1); if(wh2) swapHLL(anchs2); /* printf("anchs1: \n"); printMyHLL(anchs1); printf("anchs2: \n"); printMyHLL(anchs2); */ if (anchs1==anchs2) { // fprintf(stderr, "mergeHLLs called on same hll!\n"); return anchs1; } while((anchs1 && anchs2)) { // printf("calling printMyHLL!\n"); // printMyHLL(res); if (hllIntersection(anchs1, anchs2)) { mscore = scoreMerge(anchs1, anchs2); if (MAX3(anchs1->score, anchs2->score, mscore) == mscore) { temp = hllJoin(anchs1, anchs2, mscore); temp->next = res; res = temp; } } if (minHLL(anchs1, anchs2)) { temp = anchs2->next; anchs2->next = res; res = anchs2; anchs2 = temp; } else { temp = anchs1->next; anchs1->next = res; res = anchs1; anchs1 = temp; } } if (anchs1 && !anchs2) while (anchs1) { temp = anchs1->next; anchs1->next = res; res = anchs1; anchs1 = temp; } if (!anchs1 && anchs2) while (anchs2) { temp = anchs2->next; anchs2->next = res; res = anchs2; anchs2 = temp; } return res; } int printTextAlign(FILE* outfile, align* myalign) { int s1=0, s2=0, c, k, i; int nlets=0; int* inds = (int*) malloc (sizeof(int)* myalign->numseq); if (!outfile) outfile = stdout; for (i=0; i< myalign->numseq; i++) { inds[i] = 1; } // fprintf(outfile, "ALIGNMENT LENGTH=%d\n\n", myalign->algnlen); for (c = 1; c < myalign->algnlen; c = c + 60) { for (i=0; i< myalign->numseq; i++) { for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] & (1<seqs[i]->lets[inds[i]++]); else fprintf(outfile,"-"); } fprintf(outfile,"\n"); } for (i=4; i < CNTS_LEN; i++) { for (k = c; (k < (c + 60)) && 
(k < myalign->algnlen); k++) { fprintf(outfile, "%d", myalign->cnts[i][k] % 10 ); } fprintf(outfile,"\n"); } /* fprintf(outfile,"\n"); for (k=c;(k < (c + 60)) && (k < myalign->algnlen); k++) { fprintf(outfile, "%d", k/100); } fprintf(outfile,"\n"); for (k=c;(k < (c + 60)) && (k < myalign->algnlen); k++) { fprintf(outfile, "%d", (k/10)%10); } fprintf(outfile,"\n"); for (k=c;(k < (c + 60)) && (k < myalign->algnlen); k++) { fprintf(outfile, "%d", k%10); } fprintf(outfile,"\n"); */ fprintf(outfile,"\n\n"); } fprintf(outfile,"\n"); free(inds); } int printFASTAAlign(FILE* outfile, align* myalign) { int s1=0, s2=0, c, k, i; int nlets=0; int* inds = (int*) malloc (sizeof(int)* myalign->numseq); if (!outfile) outfile = stdout; for (i=0; i< myalign->numseq; i++) { inds[i] = 1; } for (i=0; i< myalign->numseq; i++) { fprintf(outfile, ">%s\n", myalign->seqs[i]->name); for (c = 1; c < myalign->algnlen; c = c + 60) { for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] & (1<seqs[i]->lets[inds[i]++]); else fprintf(outfile,"-"); } fprintf(outfile,"\n"); } } fprintf(outfile,"\n"); free (inds); } int printXMFAAlign(FILE* outfile, align* myalign) { int s1=0, s2=0, c, k, i; int nlets=0; int* inds = (int*) malloc (sizeof(int)* myalign->numseq); if (!outfile) outfile = stdout; for (i=0; i< myalign->numseq; i++) { inds[i] = 1; } for (i=0; i< myalign->numseq; i++) { fprintf(outfile, ">%d:%d-%d + %s\n", myalign->seqs[i]->index, myalign->seqs[i]->leftbound, myalign->seqs[i]->rightbound-1, myalign->seqs[i]->name); for (c = 1; c < myalign->algnlen; c = c + 60) { for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] & (1<seqs[i]->lets[inds[i]++]); else fprintf(outfile,"-"); } fprintf(outfile,"\n"); } fprintf(outfile,"\n"); } free (inds); } void freeHLLs(hll *myHLL) { hll* a = myHLL; while (a) { myHLL = myHLL->next; free (a); a = myHLL; } } void freeSequence(seq *mySeq) { free(mySeq->rptr); free(mySeq->name); // rptr is a utility pointer, 
do not free // filename is not allocated, do not free free(mySeq); } void freeAlign(align *myAlign) { int i; // if (freed[myAlign->num]) { // fprintf (stderr, "Something very wrong... %d/%d", myAlign->num, freedsize); // } assert (myAlign->dirty != 23); if (myAlign->nextalign) { myAlign->nextalign->dirty--; if (!myAlign->nextalign->dirty){ freeAlign(myAlign->nextalign); } } myAlign->nextalign = 0; myAlign->dirty = 23; if (myAlign->algn){ free(myAlign->algn); myAlign->algn = (long long int *) 0; } for (i=0; icnts[i]){ free(myAlign->cnts[i]); myAlign->cnts[i] = (char *) 0; } } // sequences not freed // HLLs not freed if (freed) freed[myAlign->num] = 1; free(myAlign); } /* void setScores(int gapstartV, int gapcontV, int gapendV, int gapperseqV, int overlapV, int glwidthV) { gapstart = gapstartV; gapcont = gapcontV; gapend = gapendV; gapperseq = gapperseqV; overlap = overlapV; glwidth = glwidthV; }*/ lagan20/src/multial.h0000644000076500007650000000412110502337063015524 0ustar brudnobrudno00000000000000#ifndef __MULTIAL_H #define __MULTIAL_H #include #define NUC_FILE "nucmatrix.txt" #define NUC_FILE_SIZE 6 #define MAX_SEQ 63 #define CNTS_LEN 8 #define CNTS_A 0 #define CNTS_T 1 #define CNTS_C 2 #define CNTS_G 3 #define CNTS_CB 4 #define CNTS_GS 5 #define CNTS_GC 6 #define CNTS_GE 7 typedef struct HitLocationList { int seq1start; int seq2start; int seq1end; int seq2end; float score; struct HitLocationList *next; struct HitLocationList *bkptr; float scoreSoFar; char dirty; } hll; typedef struct hllpointer { int number; char isstart; hll* myhll; } hptr; typedef struct Sequence { char* lets; int numlets, numsiglets; char* name; char* rptr; char* filename; int leftbound, rightbound; int index; } seq; typedef struct align_res { int num; int index; int score; int algnlen; int numseq; seq* seqs[MAX_SEQ]; long long int* algn; char* cnts[CNTS_LEN]; hll* hlls[MAX_SEQ]; int dirty; struct align_res* nextalign; } align; seq* mkConsensus(align* ali); align* mkSimAlign(seq* seq1); 
align* makeAlign(align* ali1, align* ali2, hll* anchors, align **uni); align* removeSeq(align* ali, int seqnum); void swapHLL(hll* arg); hll* remapHLLs(hll* anchs, int which, align* aln, int seqnum); hll* mergeHLLs(hll* anchs1, int wh1, hll* anchs2, int wh2); hll* getAnchsFromAlign(align* current, int seqnum, int cutoff); int getSeqNum(align* ali, seq* trgt); int printTextAlign(FILE *, align* myalign); int printFASTAAlign(FILE *, align* myalign); void printSeqsNames(align *a); void buildcache(); void freeHLLs(hll *myHLL); void freeSequence(seq *mySequence); void freeAlign(align *myAlign); void setScores(int gapperseqV, int overlapV, int glwidthV); extern char* alpha; extern int s1start; extern int s1end; extern int s2start; extern int s2end; //int match; //int mismatch; extern int gapstart; extern int gapend; extern int gapcont; extern int gapperseq; extern int overlap; extern int glwidth; extern char dobin; extern char* nucmatrixfile; extern float factor, offset; extern int logs[MAX_SEQ*MAX_SEQ]; extern FILE* outfile; #endif lagan20/src/order.c0000644000076500007650000005336010502337063015174 0ustar brudnobrudno00000000000000#include #include #include #include #include #include "diagmatrix.h" #include "filebuffer.h" #define NUC_FILE "nucmatrix.txt" #define NUC_FILE_SIZE 6 #define MAX_SQ_SIZE (500 * (1 << 20)) #define BIG_SQ_WIDTH 20 #define VER_NUM "1.1" #define INSERTION 2 #define DELETION 3 #define ISCB(c) ((c)=='.') #define MIN2(x,y) ( (x) >= (y) ? (y) : (x) ) #define MAX2(x,y) ( (x) >= (y) ? (x) : (y) ) #define MAX3(x,y,z) MAX2(MAX2(x,y),z) #define WEQ2(x,y,a) ((x==a)? 0: (y==a)? 1:-1) #define WEQ3(x,y,z,a) ((x==a)? 0: (y==a)? 1: (z==a)? 
2:-1) align* makeAlign(dmat* mydm, char* seq1, char* seq2); char* alpha = "ATCGN."; int s1start = 0; int s1end = 0; int s2start = 0; int s2end = 0; int gapstart = -1500; int gapcont = -50; //int match =12; //int mismatch = -8; int overlap = 0; int glwidth= 15; char dobin = 0; char domfa = 0; char doxmfa = 0; FILE* ancfile = 0; FILE* outfile; int substmatrix[256][256]; seq* readfile(FILE* input, int seqnum) { char* res = (char*) malloc(sizeof(char)*2); int ressize = 2, numread=1; char temp[256]; seq* myseq = (seq*) malloc(sizeof(seq)); char currchar; if (feof(input)) return 0; fgets(temp, 255, input); if (temp[0] != '>') { fprintf(stderr, "File is not in FASTA format!!\n"); exit(1); } myseq->name = (char*) malloc((strlen(temp))*sizeof(char)); strcpy(myseq->name, temp+1); *(strchr(myseq->name, '\n')) = 0; res[0] = 0; currchar = fgetc(input); while ((currchar != '>') && (currchar != EOF)) { if (!isspace(currchar)) { currchar = toupper(currchar); if (!strchr(alpha, currchar)) { fprintf(stderr, "WARNING %c converted to 'N'\n", currchar); } res[numread++] = currchar; if (numread >= ressize) { res=(char*)realloc(res, sizeof(char)*(ressize*=2)); } } currchar = fgetc(input); } if (currchar == '>') ungetc(currchar, input); res[numread]=0; myseq->rptr = res; if (seqnum == 1) { if (s1start > 0) { res = &res[s1start-1]; res[s1end-s1start+1] = 0; numread = s1end-s1start+1; } else { s1start = 1; s1end = numread; } } else { if (s2start > 0) { res = &res[s2start-1]; res[s2end-s2start+1] = 0; numread = s2end-s2start+1; } else { s2start = 1; s2end = numread; } } myseq->lets = res; myseq->numlets = numread-1; // printf("red %d lets\n",numread); return myseq; } char getLetter (FILE *file){ char ch; while (!feof (file)){ ch = fgetc (file); if (!isspace (ch)) return ch; } return 0; } void readSubstMatrix (char *filename, int size){ FILE *file; char line[1024], *symbs; int i, j; sprintf (line, "%s/%s", getenv ("LAGAN_DIR"), filename); file = fopen (line, "r"); assert (file); for (i = 0; i 
< 256; i++){ for (j = 0; j < 256; j++){ substmatrix[i][j] = 0; } } symbs = (char *) malloc (sizeof (char) * size); assert (symbs); for (i = 0; i < size; i++) symbs[i] = getLetter (file); for (i = 0; i < size; i++){ getLetter (file); for (j = 0; j < size; j++){ fscanf (file, "%d", &(substmatrix[(unsigned char) symbs[i]][(unsigned char) symbs[j]])); } } fscanf (file, "%d", &gapstart); fscanf (file, "%d", &gapcont); fclose (file); } void paramParse(int argc, char** argv) { int i = 3; for ( ; i < argc; i++) { if (!strcmp(argv[i], "-gs") || !strcmp(argv[i], "-GS")) { gapstart = atoi(argv[++i]); } else if (!strcmp(argv[i], "-gc") || !strcmp(argv[i], "-GC")) { gapcont = atoi(argv[++i]); } else if (!strcmp(argv[i], "-bin") || !strcmp(argv[i], "-BIN")) { dobin =1; } else if (!strcmp(argv[i], "-mfa") || !strcmp(argv[i], "-MFA")) { domfa =1; } else if (!strcmp(argv[i], "-xmfa") || !strcmp(argv[i], "-XMFA")) { doxmfa =1; } /* else if (!strcmp(argv[i], "-mt") || !strcmp(argv[i], "-MT")) { match = atoi(argv[++i]); } else if (!strcmp(argv[i], "-ms") || !strcmp(argv[i], "-MS")) { mismatch = atoi(argv[++i]); }*/ else if (!strcmp(argv[i], "-bw") || !strcmp(argv[i], "-BW")) { glwidth = atoi(argv[++i]); } else if (!strcmp(argv[i], "-s1") || !strcmp(argv[i], "-S1")) { s1start = atoi(argv[++i]); s1end = atoi(argv[++i]); } else if (!strcmp(argv[i], "-s2") || !strcmp(argv[i], "-S2")) { s2start = atoi(argv[++i]); s2end = atoi(argv[++i]); } else if (!strcmp(argv[i], "-anc") || !strcmp(argv[i], "-ANC")) { if (!(ancfile = fopen(argv[++i],"r"))) { printf("couldnt open anchors file %s\n",argv[i]); exit(2); } } else if (!strcmp(argv[i], "-out") || !strcmp(argv[i], "-OUT")) { if (!(outfile = fopen(argv[++i],"w"))) { printf("couldnt open output file %s\n",argv[i]); exit(2); } } } readSubstMatrix (NUC_FILE, NUC_FILE_SIZE); } void usage() { printf("usage: \norder seq1file seq2file [options]\n\n"); printf("Options:\n"); printf("-gs # = Gap Start [default -100]\n"); printf("-gc # = Gap Continue 
[default -2]\n"); /* printf("-mt # = MaTch [default 12]\n"); printf("-ms # = MiSmatch [default -8]\n");*/ printf("-bw # = Barrel Width around conserved regions [default 15]\n"); printf("-anc anchorfile = specify an anchorfile to use [default no file]\n"); printf("-out outfile = write output to outfile [default screen]\n"); printf("-bin = write output in BINary format [default text]\n"); printf("-mfa = write output in MultiFAsta format [default text]\n"); printf("-s1 # # = use the given substring of the query [default whole]\n"); printf("-s2 # # = use the givensubstring of the dbase [default whole]\n"); printf("-version = prints the version of this ORDER\n"); } hll* readAncFile(seq* seq1, seq* seq2) { hll *myres = 0, *tt; char buff[256]; int i=0; while (!feof(ancfile)) { if (!fgets(buff, 256, ancfile)) { break; } tt = (hll*) malloc(sizeof(hll)); sscanf(buff, "(%d %d)=(%d %d) %*f", &tt->seq1start, &tt->seq1end, &tt->seq2start, &tt->seq2end); if ((tt->seq1start >= s1start && tt->seq1end <= s1end || s1start == 0 && s1end == 0) && (tt->seq2start >= s2start && tt->seq2end <= s2end || s2start == 0 && s2end == 0)){ if (tt->seq1start <= 0 && tt->seq1end <= 0) continue; if (tt->seq2start <= 0 && tt->seq2end <= 0) continue; if (tt->seq1start > s1start + seq1->numlets && tt->seq1end > s1start + seq1->numlets) continue; if (tt->seq2start > s2start + seq2->numlets && tt->seq2end > s2start + seq2->numlets) continue; if (s1start > 0){ tt->seq1start = MAX2 (tt->seq1start - s1start + 1, 1); tt->seq1end = MIN2 (tt->seq1end - s1start + 1, s1end); } if (s2start > 0){ tt->seq2start = MAX2 (tt->seq2start - s2start + 1, 1); tt->seq2end = MIN2 (tt->seq2end - s2start + 1, s2end); } tt->seq1start = MAX2 (tt->seq1start, 1); tt->seq2start = MAX2 (tt->seq2start, 1); tt->seq1end = MIN2 (tt->seq1end, seq1->numlets); tt->seq2end = MIN2 (tt->seq2end, seq2->numlets); tt->next = myres; i++; myres = tt; } } fprintf(stderr,"read %d anchs\n", i); return myres; } void mkBarrel(int s1, int s2, int e1, int 
e2, int width, int *dn, int dt, int* starts, int *ends, dmat* mydm) { int sd = s1+s2-1, dlen; int elem = (sd < mydm->d2)? s1: mydm->d2-s2; int incr; double fl = 0; double slope = (double)(e2-s2)/(double)(e1-s1); double cloc = elem; if ((e2-s2 == 0) && (e1-s1 == 0)) slope = 1; // printf("dt = %d\n", dt); // printf("BA: %d, %d to %d, %d %f\n", s1,s2,e1,e2,slope); for ( ; sd <(*dn); sd++) { if (fl>=slope || (int)(cloc) == (int)(cloc+slope)) { cloc+=slope; fl -= slope; } else { elem--; fl++; } if (sd <= mydm->d2) elem++; } fl = 0; for ( ; *dn < dt; (*dn)++) { // printf("dn =%d ", *dn); if (fl>=slope || (int)(cloc) == (int)(cloc+slope)) { cloc+=slope; fl -= slope; } else { elem -=1; fl++; } if (*dn <= mydm->d2) elem++; if (*dn < MIN2(mydm->d2, mydm->d1)) dlen = *dn; else if (*dn < MAX2(mydm->d2, mydm->d1)) dlen = MIN2(mydm->d2, mydm->d1); else dlen = mydm->d2 + mydm->d1 - *dn; // if (*dn < 0 || *dn >= 34939) fprintf (stderr, "%d %d\n", *dn, dt); starts[*dn] = MAX2(elem - width, 0); ends[*dn] = MIN2(elem+width, dlen-1); // printf("BARREL %d %d %d\n",*dn,starts[*dn],ends[*dn]); } } void mkSquare(int s1, int s2, int e1, int e2, int *dn, int dt, int* starts, int *ends, dmat* mydm) { int dists[2]; long long int size = ((long long int)e1-(long long int)s1) * ((long long int)e2-(long long int)s2); // printf("dt = %d\n", dt); // printf("SQ: %d, %d to %d, %d\n", s1,s2,e1,e2); if (size > MAX_SQ_SIZE) { fprintf (stderr, "SQUARE TOO BIG: %d,%d to %d,%d\n", s1, e1,s2,e2); mkSquare(s1, s2, (s1+e1)/2+glwidth, (s2+e2)/2+glwidth, dn, (*dn+dt)/2, starts, ends, mydm); mkSquare((s1+e1)/2-glwidth, (s2+e2)/2-glwidth, e1, e2, dn, dt, starts, ends, mydm); return; } for ( ; *dn < dt; (*dn)++) { // printf("square dn = %d\n", *dn); if (*dn < mydm->d2) { dists[0] = s1-1; dists[1] = *dn - e2; } else { dists[0] = mydm->d2 - e2; dists[1] = s1 - (*dn - mydm->d2)-1; } // if (*dn < 0 || *dn >= 34939) fprintf (stderr, "%d\n", *dn); starts[*dn] = MAX2(dists[0], dists[1]); if (*dn < mydm->d2) { dists[0] = 
e1-1; dists[1] = *dn - s2; } else { dists[0] = mydm->d2 - s2; dists[1] = e1 - (*dn-mydm->d2)-1; } ends[*dn] = MIN2(dists[0], dists[1]); // printf("SQUARE %d %d %d\n",*dn, starts[*dn],ends[*dn]); } } void doShapes(hll* myres, dmat* mydm, int* starts, int *ends) { int p1=MAX2(overlap,glwidth)+1, p2=MAX2(overlap,glwidth)+1; int t1, t2; int dn = 1, dt; int width = glwidth; while (myres) { t1 = myres->seq1start; /* between hits */ t2 = myres->seq2start; dt = t1 + t2 - 1 + overlap; mkSquare(p1-MAX2(overlap, width), p2-MAX2(overlap, width), t1+MAX2(overlap, width), t2+MAX2(overlap, width), &dn, dt, starts, ends, mydm); p1 = myres->seq1end; /* within a hit */ p2 = myres->seq2end; dt = p1 + p2 - 1 - overlap; mkBarrel(t1, t2, p1, p2, width, &dn, dt, starts, ends, mydm); myres = myres->next; } t1 = mydm->d1; t2 = mydm->d2; dt = t1 + t2; mkSquare(p1-MAX2(overlap,width), p2-MAX2(overlap,width), t1, t2, &dn, dt, starts, ends, mydm); } void parseAncs(dmat* mydm, seq* seq1, seq* seq2) { int *starts = (int*) malloc(sizeof(int)*(seq1->numlets + seq2->numlets+2)); int *ends = (int*) malloc(sizeof(int)*(seq1->numlets + seq2->numlets+2)); hll* myres = 0; if (ancfile) { myres = readAncFile(seq1, seq2); } // printf("khe0\n"); doShapes(myres, mydm, starts, ends); // printf("khe1\n"); DMinitDiag(mydm, starts,ends); // printf("khe2\n"); free(starts); free(ends); } void doAlign(dmat* mydm, seq* seq1, seq* seq2) { align *a = (align*) makeAlign(mydm, seq1->lets, seq2->lets); // printf("into printing\n"); if (!dobin && !domfa && !doxmfa) printTextAlign(seq1->lets, seq2->lets, a); else if (!domfa && !doxmfa) printBinAlign(seq1->lets, seq2->lets, a); else if (!doxmfa) printMFAAlign(seq1->lets, seq2->lets, a, seq1->name, seq2->name); else printXMFAAlign(seq1->lets, seq2->lets, a, seq1->name, seq2->name); // printf("doneprinting\n"); } int main(int argc, char** argv) { FileBuffer fseq1, fseq2; seq *seq1, *seq2; dmat* mydm; if (argc < 3) { if (argc == 2) if (!strcmp(argv[1], "-version") || 
!strcmp(argv[1], "-Version")) { printf("ORDER version %s\n", VER_NUM); exit(0); } usage(); return 1; } if (!(fseq1 = FileOpen(argv[1]))) { printf("couldnt open query file %s\n",argv[1]); usage(); return 2; } if (!(fseq2 = FileOpen(argv[2]))) { printf("couldnt open dbase file %s\n",argv[2]); usage(); return 2; } outfile = stdout; paramParse(argc, argv); seq1 = FileRead(fseq1, s1start, s1end, VER_ORDER); seq2 = FileRead(fseq2, s2start, s2end, VER_ORDER); if (s1start == s1end && s1end == 0) { s1start = 1; s1end = seq1->numlets; } if (s2start == s2end && s2end == 0) { s2start = 1; s2end = seq2->numlets; } mydm = makeDM(seq1->numlets+1, seq2->numlets+1); parseAncs(mydm, seq1, seq2); doAlign(mydm, seq1, seq2); return 0; } inline int ismatch(char a, char b) { return a == b; } inline int matchscore (unsigned char a, unsigned char b) { return substmatrix[a][b]; /* if (!a || !b) return 0; if (a == 'N' || b == 'N') return 0; if (a == b) return match; return mismatch; */ } void reverse (char* a, int length) { char lft; int i; for (i=0; i < length/2; i++) { lft = a[i]; a[i] = a[length-i-1]; a[length-i-1] = lft; } } align* getChain(dmat* mydm, char* seq1, char* seq2, int x, int y, int inrun) { int temp; align *res = (align*) malloc (sizeof(align)), *help; char* almt = (char*) malloc ( sizeof(char)); int i=0, almtsize = 1, which; char zz; zz = DMgetPtr(mydm, x, y); res->dirty = 0; res->nextalign = 0; res->algn = 0; res->algnlen = 0; do { // printf("I am at %d,%d %x\n", x,y, zz); which = zz & Mmask; if (which == 0x3) { help = DMgetNeck(mydm, x, y,inrun); if (!help) { return res; } help->dirty = 1; res->nextalign = help; break; } if (inrun == 1) { if (zz & Nmask) { which = 1; } } else if (inrun == 2) { if (zz & Omask) { which = 2; } } if (which == 0) { inrun = 0; almt[i++] = ismatch(seq1[x-1], seq2[y-1]); zz = DMgetPtr(mydm,--x,--y); } else if (which == 1) { /*N*/ inrun = 1; almt[i++] = INSERTION; zz = DMgetPtr(mydm, --x, y); } else if (which == 2) { inrun = 2; almt[i++] = 
DELETION; zz = DMgetPtr(mydm, x, --y); } else printf("a really dumb error %d\n", i); if (i >= almtsize) { almt = (char *) realloc (almt, sizeof(char)* (almtsize *= 2)); } } while (x > 0 && y > 0); // printf("gotChain\n"); reverse(almt, i); res->algn = almt; res->algnlen = i; // printf("done w it\n"); return res; } void saveNeck(dmat* mydm, char* seq1, char* seq2, int neckdiag) { int size1, size2, x1, x2, y1, y2; alel *first = DMgetDiagStart(mydm, neckdiag-1, &size1, &x1, &y1), *second = DMgetDiagStart(mydm, neckdiag, &size2, &x2, &y2); int i, j; align* a; DMnextNecks(mydm, neckdiag); for (i=0; ialgn); free(t); } void joinAligns (align* a) { align *n = a->nextalign, *t; char* temp, *temp2; int totsize=0; for (t = a; t; t = t->nextalign) { totsize += t->algnlen; } temp = (char *) malloc (totsize*sizeof(*temp)); temp2 = temp + totsize; totsize = 0; for (t=a; t; t = t->nextalign) { totsize += t->algnlen; memcpy(temp2-totsize, t->algn, t->algnlen*sizeof(*temp)); } free (a->algn); a->algn = temp; a->algnlen = totsize; for (a = a->nextalign; a;) { t = a; a = a->nextalign; freeAlign(t); } } align* makeAlign(dmat* mydm, char* seq1, char* seq2) { int i, j; int x, y, size; alel *curr, *pasts0, *pasts1, *pasts2; align* a; char isneck; int ndiags = mydm->d1 + mydm->d2 -1; register int s1, s2, s3; register char ptr; isneck = DMnextDiag(mydm); curr = DMgetDiagStart(mydm, 1, &size, &x, &y); curr->N = curr->O = gapstart; curr->M = 0; DMsetPtr(mydm, 0, 1, 1); // printf("[%d %d]=%d %d %d\n",x,y,curr->M, curr->N, curr->O); for (i = 2; i <= ndiags; i++) { isneck = DMnextDiag(mydm); if (!(i%10000)) fprintf(stderr, "WORKING %d/%d\n", i/10000, ndiags/10000); curr = DMgetDiagStart(mydm, i, &size, &x, &y); pasts2 = DMgetElem(mydm, x-1, y); pasts1 = DMgetElem(mydm, x-1, y-1); for (j = 0; j < size; j++) { /***************************************************/ pasts0 = pasts2; pasts2 = DMgetElem2(mydm, x, y-1, pasts2); s1 = pasts1->M; s2 = pasts1->N + ((ISCB(seq2[y-1]))?0:gapcont); s3 = 
pasts1->O + ((ISCB(seq1[x-1]))?0:gapcont); curr->M = matchscore (seq1[x-1], seq2[y-1]); if (s1 >= s2){ if (s1 >= s3){ curr->M += s1; /*ptr = 0;*/ } else { curr->M += s3; /*ptr = 2;*/ } } else { if (s2 >= s3){ curr->M += s2; /*ptr = 1;*/ } else { curr->M += s3; /*ptr = 2;*/ } } s1 = curr->M + ((ISCB(seq2[y-1]))?0:gapstart); s2 = pasts0->N + ((ISCB(seq2[y-1]))?0:gapcont); if (s1 >= s2){ curr->N = s1; ptr = 0; } else { curr->N = s2; ptr = 4; } s1 = curr->M + ((ISCB(seq1[x-1]))?0:gapstart); s2 = pasts2->O + ((ISCB(seq1[x-1]))?0:gapcont); if (s1 >= s2){ curr->O = s1; } else { curr->O = s2; ptr |= 8; } s1 = curr->M; s2 = curr->N; s3 = curr->O; if (curr->M >= curr->N){ if (curr->M < curr->O) ptr |= 2; } else { if (curr->N >= curr->O) ptr |= 1; else ptr |= 2; } //ptr |= WEQ3(curr->M, curr->N, curr->O, MAX3(curr->M, curr->N, curr->O)); //ptr = ptr | (WEQ2(curr->M+gapstart, pasts0->N+gapcont, curr->N) << 2); //ptr = ptr | (WEQ2(curr->M+gapstart, pasts0->O+gapcont, curr->O) << 3); /***************************************************/ /* curr->M = MAX3(pasts[1]->M, pasts[1]->N+gapcont, pasts[1]->O+gapcont); curr->M += matchscore(seq1[x-1], seq2[y-1]); curr->N = MAX2(curr->M+gapstart, pasts[0]->N+gapcont); curr->O = MAX2(curr->M+gapstart, pasts[2]->O+gapcont); ptr = WEQ3(curr->M, curr->N, curr->O, MAX3(curr->M, curr->N, curr->O)); ptr = ptr | (WEQ2(curr->M+gapstart, pasts[0]->N+gapcont, curr->N) << 2); ptr = ptr | (WEQ2(curr->M+gapstart, pasts[0]->O+gapcont, curr->O) << 3); */ DMsetPtr(mydm, ptr, x, y); curr++; x++; y--; pasts1 = DMgetElem2(mydm, x-1, y-1, pasts1); } if ((i < ndiags - 2) && isneck) { saveNeck(mydm, seq1, seq2, i); } } mydm->currneck++; a = getChain(mydm, seq1, seq2, mydm->d1, mydm->d2, 0); curr--; a->score = MAX3(curr->M, curr->N, curr->O); // printf("here! 
%d\n", a); joinAligns(a); return a; } int printBinAlign(char* seq1, char* seq2, align* myalign) { int s1=1, s2=1, c; char lets[256]; char left, right; // fprintf(stderr,"kuku\n"); for (c = 0; c < 256; c++) lets[c] = -1; lets['A'] = 1; lets['C'] = 2; lets['T'] = 3; lets['G'] = 4; lets['N'] = 5; lets['.'] = 0; for (c = 1; c < myalign->algnlen; c++) { left=right=0; if (myalign->algn[c] != DELETION) left = lets[seq1[s1++]]; if (myalign->algn[c] != INSERTION) right = lets[seq2[s2++]]; right = right | (left << 4); putc(right, outfile); } fclose(outfile); } int printTextAlign(char* seq1, char* seq2, align* myalign) { int s1=1, s2=1, c, k; int nm=0, nga=0, ngb=0, nlets=0; int hasst=0; for (c = 1; c < myalign->algnlen; c = c + 60) { for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] != DELETION) fprintf(outfile, "%c", seq1[s1++]); else { fprintf(outfile,"-"); if (hasst) nga++; } } fprintf(outfile,"\n"); for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] == 1) { fprintf(outfile, ":"); nm++; nlets++; hasst = 1; } else { fprintf(outfile, " "); if (hasst) nlets++; } } fprintf(outfile, "\n"); for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] != INSERTION) fprintf(outfile, "%c", seq2[s2++]); else { fprintf(outfile, "-"); if (hasst) ngb++; } } fprintf(outfile, "\n\n"); } fprintf(outfile,"score = %d, nmatches = %d, nga=%d, ngb=%d nletters=%d, perc = %f\n", myalign->score,nm,nga,ngb,nlets,(float)nm/(float)nlets); fprintf(outfile,"\n"); } int printMFAAlign(char* seq1, char* seq2, align* myalign, char* n1, char* n2) { int s1=1, s2=1, c, k; int nm=0, nga=0, ngb=0, nlets=0; int hasst=0; fprintf(outfile,">%s\n", n1); for (c = 1; c < myalign->algnlen; c = c + 60) { for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] != DELETION) fprintf(outfile, "%c", seq1[s1++]); else { fprintf(outfile,"-"); if (hasst) nga++; } } fprintf(outfile,"\n"); } fprintf(outfile,">%s\n", 
n2); for (c = 1; c < myalign->algnlen; c = c + 60) { for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] != INSERTION) fprintf(outfile, "%c", seq2[s2++]); else { fprintf(outfile, "-"); if (hasst) ngb++; } } fprintf(outfile, "\n"); } } int printXMFAAlign(char* seq1, char* seq2, align* myalign, char* n1, char* n2) { int s1=1, s2=1, c, k; int nm=0, nga=0, ngb=0, nlets=0; int hasst=0; fprintf(outfile,">1:%d-%d + %s\n", s1start, s1end, n1); for (c = 1; c < myalign->algnlen; c = c + 60) { for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] != DELETION) fprintf(outfile, "%c", seq1[s1++]); else { fprintf(outfile,"-"); if (hasst) nga++; } } fprintf(outfile,"\n"); } fprintf(outfile,">2:%d-%d + %s\n", s2start, s2end, n2); for (c = 1; c < myalign->algnlen; c = c + 60) { for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) { if (myalign->algn[k] != INSERTION) fprintf(outfile, "%c", seq2[s2++]); else { fprintf(outfile, "-"); if (hasst) ngb++; } } fprintf(outfile, "\n"); } } lagan20/src/order.h0000644000076500007650000000072410502337063015175 0ustar brudnobrudno00000000000000#ifndef ORDER__H #define ORDER__H #include "fchaos.h" typedef struct align_res { int score; int algnlen; char* algn; struct align_res *nextalign; int nextloc; char dirty; } align; //align* makeAlign(dmat* mydm, char* seq1, char* seq2); int printAlign(char* seq1, char* seq2, align* myalign); void freeAlign(align* t); int printBinAlign(char* seq1, char* seq2, align* myalign); int printTextAlign(char* seq1, char* seq2, align* myalign); #endif lagan20/src/prolagan.c0000644000076500007650000006607410502343354015672 0ustar brudnobrudno00000000000000#include #include #include #include #include #include #include "skiplist.h" #include "multial.h" #include "filebuffer.h" #define VER_NUM "1.1" #define MIN2(x,y) ( (x) >= (y) ? (y) : (x) ) #define MAX2(x,y) ( (x) >= (y) ? 
(x) : (y) ) // Global variables static int nested = 0; static int postir = 0; static int lazy = 0; static int notree = 1; static int verbose = 0; static int numseqs = 0; static int itertimes = 1; static int cutoffmatch = 12; static int translate = 0; static int extend = 1; static int fastreject = 0; static int gapfreechunks = 0; static align *simaligns[MAX_SEQ]; static char* lagan_dir; static align *profile1 = 0; static align *profile2 = 0; static int hptrcomp (const void *p1, const void *p2) { int i = ((hptr*)p1)->number; int j = ((hptr*)p2)->number; int it = ((hptr*)p1)->isstart; int jt = ((hptr*)p2)->isstart; if (i > j) return (1); if (i < j) return (-1); if (it) return -1; else return 1; } void usage(void) { printf("mlagan seqfile_1 seqfile_2 [... seqfile_%d] [-parameters]\n\n", MAX_SEQ); printf("-lazy : uses lazy mode\n"); printf("-translate : use translated anchors\n"); // printf("-ext : extend the anchors\n"); This is now default printf("-fastreject : use fast rejection (tuned for human/mouse or closer)\n"); // printf("-gfc : find gap free chunks as anchors\n"); This is currently broken printf("-verbose : give debug output\n"); printf("-tree \"(...)\" : runs with given phylogenetic tree\n"); printf("-out \"filename\": outputs to filename\n"); printf("-version : prints version info\n"); } seq* readfile(FILE* input) { int seqstart=0; int seqend=0; char* res = (char*) malloc(sizeof(char)*2); int ressize = 2, numread=1; //N at 1st letter char temp[256]; seq* myseq = (seq*) malloc(sizeof(seq)); char currchar; res[0] = 'N'; if (feof(input)) return 0; fgets(temp, 255, input); if (temp[0] != '>') { fprintf(stderr, "File is not in FASTA format!!\n"); exit(1); } myseq->name = (char*) malloc((strlen(temp))*sizeof(char)); strcpy(myseq->name, temp+1); *(strchr(myseq->name, '\n')) = 0; currchar = fgetc(input); while ((currchar != '>') && (currchar != EOF)) { if (!isspace(currchar)) { currchar = toupper(currchar); if (!strchr(alpha, currchar)) { fprintf(stderr, "Warning: 
%c converted to 'N'\n", currchar, alpha); currchar = 'N'; } res[numread++] = currchar; if (numread >= ressize) { res=(char*)realloc(res, sizeof(char)*(ressize*=2)); } } currchar = fgetc(input); } if (currchar == '>') ungetc(currchar, input); res[numread]=0; myseq->rptr = res; if (seqstart > 0) { res = &res[seqstart-1]; res[seqend-seqstart+1] = 0; numread = seqend-seqstart+1; } myseq->lets = res; myseq->numlets = numread; // printf("read: %d lets\n",numread); return myseq; } int starts_with(char *str, char *word) { int len; char *first_word; len = strlen(str); first_word = (char *)malloc((len + 1) * sizeof(char)); sscanf(str, "%s", first_word); return !strcmp(word, first_word); } align* findAlignByName(align *aligns[], char *name) { int i=0; // printf("findAlignByName: %s\n", name); while(iseqs[0]->name, name)) { return(aligns[i]); } i++; } fprintf(stderr, "alignment not found for: %s", name); exit(2); return NULL; } int kk = 0; // Profile stuff start // replaces the sequence of same name with replacer, returning which was // replaced or -1 if none. 
int getSeqNumber(align* res, seq* replacer) { int i; for (i=0; i < res->numseq; i++) { if (!strcmp(res->seqs[i]->name, replacer->name)) { res->seqs[i] = replacer; return i; } } return -1; } void appendAlignProfile(align *res, seq* seqwgaps) { int i,j,k; res->seqs[res->numseq] = seqwgaps; for (i=1; i < res->algnlen; i++) { if (seqwgaps->lets[i] != '-') { k=strchr(alpha,seqwgaps->lets[i])-alpha; if (k < 4) { res->cnts[k][i]++; } res->algn[i] |= (1 << res->numseq); if (i > 0 && seqwgaps->lets[i-1] == '-') res->cnts[CNTS_GE][i]++; } else if (i > 0) { if (i > 0 && seqwgaps->lets[i-1] != '-') { res->cnts[CNTS_GS][i]++; } else res->cnts[CNTS_GC][i]++; res->algn[i] |= (0 << res->numseq); } } res->numseq++; } align* readProfile(FileBuffer with_gaps) { int i,j; seq* myseq; align* res = (align*) malloc (sizeof(align)); res->score = 0; res->nextalign = 0; res->dirty = 0; res->numseq = 0; res->algnlen = -1; res->index = 32; while ( myseq = FileRead( with_gaps,0,0,VER_MLAGAN )) { // fprintf(stdout, "seq: %s\n", myseq->lets); if (res->algnlen < 0) { res->algnlen = myseq->numlets; res->algn = (long long int*) malloc((res->algnlen+1) * sizeof(long long int)); assert (res->algn); for (j=0; jcnts[j] = (char*) malloc((res->algnlen+1) * sizeof(char)); assert (res->cnts[j]); } for (i=0; i<= res->algnlen;i++) { for (j=0; jcnts[j][i] = 0; res->algn[i] = 0; } } if ( res->algnlen != myseq->numlets) { fprintf (stderr, "Lengths screwed up!!!\n"); exit(1); } appendAlignProfile(res, myseq); } if (verbose) { fprintf(stdout, "LOADED RES\n"); printTextAlign(stdout,res); } return res; } // Profile stuff end void printHLL(hll *myres) { fprintf(stderr, "into %d\n", ++kk); fflush(stderr); while(myres) { fprintf(stderr, "(%d %d)=(%d %d) %f\n", myres->seq1start, myres->seq1end, myres->seq2start, myres->seq2end, myres->score); fflush(stderr); myres=myres->next; } } hll* getAnchsFromFile(char *fname, FileBuffer f1, FileBuffer f2) { FILE *ancfile; hll *myres = 0, *tt = 0, *first = 0; char buff[256]; int 
i=0, j=0; // printf("getHLLFromNames: %s, %s\n", name1, name2); sprintf(buff, "%s.anchors", fname); ancfile=fopen(buff, "r"); if(ancfile==NULL) { fprintf(stderr, "anchor file not found:: %s.anchors\n", fname); exit(2); } while (!feof(ancfile)) { if (!fgets(buff, 256, ancfile)) { break; } tt = (hll*) malloc(sizeof(hll)); sscanf(buff, "(%d %d)=(%d %d) %f", &tt->seq1start, &tt->seq1end, &tt->seq2start, &tt->seq2end, &tt->score); tt->next = myres; i++; myres = tt; } if (fastreject) { f1->startpos = MAX2(f1->startpos, myres->seq1end); f2->startpos = MAX2(f2->startpos, myres->seq2end); for (tt = myres; tt->next->next; tt = tt->next) { j++; } f1->endpos = MIN2(f1->endpos, tt->next->seq1start); f2->endpos = MIN2(f2->endpos, tt->next->seq2start); // fprintf (stderr, "%d %d %d %d %d\n", j, f1->startpos, f1->endpos, f2->startpos, f2->endpos); myres = myres->next; tt->next = 0; } fprintf(stderr,"read %d anchs\n", i); fclose(ancfile); return myres; } hll* generateAnchors( FileBuffer a1, FileBuffer a2) { char buff[256]; char fname[80]; char *name1, *name2; char *endpnt; int diff1, diff2; align* temp; hll* res; char flip = 0; int retstat; name1 = strrchr (a1->filename, '/'); if (!name1) name1 = a1->filename; else name1++; name2 = strrchr (a2->filename, '/'); if (!name2) name2 = a2->filename; else name2++; endpnt = strchr ( name1, '.'); diff1 = (endpnt)? endpnt - name1: strlen(name1); endpnt = strchr ( name2, '.'); diff2 = (endpnt)? endpnt - name2: strlen(name2); strncpy (fname, name1, diff1); strncpy (fname+diff1, name2, diff2); fname[diff1+diff2] = 0; sprintf(buff, "%s/rechaos.pl %s %s -out %s.anchors %s %s %s %s %s\n", lagan_dir, a1->filename, a2->filename, fname, (extend ? "-ext" : ""), (translate ? "-translate" : ""), (fastreject ? "-fastreject" : ""), (gapfreechunks ? "-gfc" : ""), (lazy ? 
"-lazy" : "")); retstat = system(buff) >> 8; if (fastreject && (retstat == 3)) { return 0; } else if (retstat) { fprintf (stderr, "Error from rechaos\n"); exit (1); } res = getAnchsFromFile(fname, a1, a2); return res; } void printFASTASeq(FILE *outfile, seq *myseq) { int i; // printf("kva\n"); if (!outfile) outfile = stdout; fprintf(outfile, ">%s\n", myseq->name); // printf("kva2\n"); for(i=0; inumlets; i++) fprintf(outfile, "%c", myseq->rptr[i]); // printf("kva %d\n",i); fprintf(outfile, "\n"); if (outfile!=stdout) fclose(outfile); } hll* findBestChain(hptr* array, int arrsize) { sklst* skipper = makeSkLst(); sle* help; int i; hll* t; for (i = 0; i < arrsize; i++) { if (array[i].isstart) { help = SLfind(skipper, array[i].myhll->seq2start); if (help->myelem) { array[i].myhll->bkptr = help->myelem; array[i].myhll->scoreSoFar = ((hll*)help->myelem)->scoreSoFar + array[i].myhll->score; } else { array[i].myhll->bkptr = 0; array[i].myhll->scoreSoFar = array[i].myhll->score; } } else { help = SLfind(skipper, array[i].myhll->seq2end); if (help->myelem && (array[i].myhll->scoreSoFar <= ((hll*)help->myelem)->scoreSoFar)) continue; SLinsertAfter(skipper, help, array[i].myhll->seq2end, array[i].myhll); help = help->next[0]; while (help->next[0] && ((hll*)help->myelem)->scoreSoFar >= ((hll*)help->next[0]->myelem)->scoreSoFar) SLremove(skipper, help->next[0]); } } t= (hll*)SLgetLast(skipper)->myelem; delSkLst(skipper); return t; } hll* remakeHLL(hll* bestPtr) { int len; hll *res=0; hll *temp, *t2, *t3; int i, bestscore=-1; for (temp = bestPtr; temp; temp = temp->bkptr) { temp->next=res; temp->dirty = 1; res=temp; } return res; } hll* reanchorHLL(hll* mylist) { hll *temp, *best, *t2; int numhits=0, i=0; hptr* myptrs; temp=mylist; while (temp) { numhits++; temp->dirty = 1; temp=temp->next; } myptrs = (hptr*) malloc (sizeof(hptr) * numhits *2); for (temp = mylist; temp; temp = temp->next) { myptrs[i].number = temp->seq1start; myptrs[i].isstart = 1; myptrs[i].myhll = temp; 
myptrs[i+1].number = temp->seq1end; myptrs[i+1].isstart = 0; myptrs[i+1].myhll = temp; i = i+2; } qsort(myptrs, numhits*2, sizeof(hptr), hptrcomp); best = findBestChain(myptrs, numhits*2); temp=best; while (temp) { temp->dirty = 0; temp=temp->bkptr; } temp=mylist; while (temp) { t2 = temp; temp=temp->next; if (t2->dirty) free(t2); } best = remakeHLL(best); // printf("newbest\n"); // printHLL(best); free (myptrs); return best; } void orderAligns(align *a1, align *a2, align **first, align **second, int *index, int *hllindex) { int a1index, a2index; a1index = a1->index; a2index = a2->index; if (a1index > a2index) { *first = a2; *second = a1; *index = a2index; *hllindex = a1index; } else { *first = a1; *second = a2; *index = a1index; *hllindex = a2index; } } void doRemapHLLs(align *aligns[], align *uni, int *index, int hllindex) { int i, mapi, done=0; // take all hlls into first, and into the second and remap them for(mapi=*index; !done; mapi=hllindex) { for (i=0; ihlls[mapi] != NULL && i != *index) { // remap them into i // fprintf(stderr, "\n called1 %d %d(%d)\n", i, mapi, *index); aligns[i]->hlls[mapi] = remapHLLs(aligns[i]->hlls[mapi], 1, uni, (mapi!=*index)); } } for (i=mapi+1; ihlls[i] != NULL && i != hllindex) { // remap them into first or second // fprintf(stderr, "\n called2 %d %d(%d)\n", mapi, i,*index); aligns[mapi]->hlls[i] = remapHLLs(aligns[mapi]->hlls[i], 0, uni, (mapi!=*index)); } } if (mapi==hllindex) done=1; } // free memory? what's that? 
// aligns[*index] = result; // aligns[hllindex] = result; } void doReanchorHLLs(align *aligns[], int *index, int hllindex) { int i; // for each pair of hlls from (i to first) and (i to second) for(i=0; i<*index; i++) { aligns[i]->hlls[*index] = reanchorHLL(mergeHLLs(aligns[i]->hlls[*index], 0, aligns[i]->hlls[hllindex], 0)); // if (verbose) { // printf("aligns[%d]->hlls[%d]\n",i ,*index); // printHLL(aligns[i]->hlls[*index]); // } aligns[i]->hlls[hllindex] = 0; } for(i=*index+1; ihlls[i] = reanchorHLL(mergeHLLs(aligns[*index]->hlls[i], 0, aligns[i]->hlls[hllindex], 1)); // if (verbose) { // printf("aligns[%d]->hlls[%d]\n",*index ,i); // printHLL(aligns[*index]->hlls[i]); // } aligns[i]->hlls[hllindex] = 0; } for(i=hllindex+1; ihlls[i] = reanchorHLL(mergeHLLs(aligns[*index]->hlls[i], 0, aligns[hllindex]->hlls[i], 0)); // if (verbose) { // printf("aligns[%d]->hlls[%d]\n", *index, i); // printHLL(aligns[*index]->hlls[i]); // } aligns[hllindex]->hlls[i] = 0; } } align* processAnchors(align *aligns[], align *a1, align *a2, int *index) { int hllindex; align *first, *second, *result, *uni; result = (align*) malloc(sizeof(align)); assert (result); result->score = -1; result->numseq = a1->numseq + a2->numseq; result->algnlen = -1; result->nextalign = 0; result->dirty = 0; orderAligns(a1, a2, &first, &second, index, &hllindex); if (verbose) printHLL(aligns[first->index]->hlls[hllindex]); // result = makeAlign(first, second, aligns[first->index]->hlls[hllindex], &uni); result->index = *index; doReanchorHLLs(aligns, index, hllindex); fprintf(stderr,"done reanchor, leaving processAnchors\n"); return(result); } align* processAlign(align *aligns[], align *a1, align *a2, int *index) { int hllindex; align *first, *second, *result, *uni; fprintf(stderr, "into processalign\n"); orderAligns(a1, a2, &first, &second, index, &hllindex); if (verbose) printHLL(aligns[first->index]->hlls[hllindex]); fprintf(stderr, "about to make\n"); result = makeAlign(first, second, 
aligns[first->index]->hlls[hllindex], &uni); fprintf(stderr, "done make\n"); result->index = *index; return(result); } align* iterativeImprovement (align *current, align *rpntree[], int length) { int converged = 0; int i=0, oldscore, cutoff; seq *removed; align *readd, *old, *new; hll* anchs, *tt; if (current->numseq <= 2) return current; // printf("iterative improvement!\n"); cutoff = cutoffmatch * 100; fprintf(stderr, "cutoff = %d\n", cutoff); while (!converged) { // Throw out a sequence. Calling code in multial. removed = current->seqs[0]; new = findAlignByName(simaligns, removed->name); old = current; anchs = getAnchsFromAlign(current, 0, cutoff); current = removeSeq(current, 0); free (old); // Re-align this thrown-out sequence to the remaining alignment. current = makeAlign (current, new, anchs, &old); if (verbose) { printf("improved:\n"); printHLL(anchs); printTextAlign(stdout, current); } while (anchs) { tt = anchs; anchs = anchs->next; free (tt); } free (old); i++; if (i==numseqs*itertimes) converged = 1; } return current; } int treeToRPN(char *treestr, align *stack[MAX_SEQ*2], int *depth) { int i=0; int j, k; char buffer[256]; while (treestr[i]!='(') { i++; } i++; while ((treestr[i] != ')') && (treestr[i] != '\0')) { // printf("%d: %s\n", *depth, treestr+i); if (treestr[i]=='(') { i += treeToRPN(treestr+i, stack, depth); } else if (isalnum(treestr[i])) { k = 0; // push alignment while((!isspace(treestr[i])) && (treestr[i]!='(') && (treestr[i]!=')')) { buffer[k++] = treestr[i++]; } buffer[k] = 0; stack[(*depth)++]=findAlignByName(simaligns, buffer); // printf("pushed: %s\n", stack[*depth-1]->seqs[0]->name); } else if (treestr[i]==')') // (*depth)++; break; else { i++; } } if (treestr[i]==')') { (*depth)++; //null is '+' return i+1; } if (treestr[i] == '\0') { fprintf(stderr, "ERROR parsing tree, depth %d, %d chars read", *depth, i); exit(1); } } align* procStack(align* rpntree[MAX_SEQ*2], int length, align *myaligns[]) { align* stack[MAX_SEQ]; int i = 0, sp 
= 0; int index=0; while (i < (length-1)) { if (rpntree[i]) { stack[sp++] = rpntree[i]; } else { stack[sp-2] = processAnchors(myaligns, stack[sp-2], stack[sp-1], &index); stack[--sp] = 0; // if(verbose) printTextAlign(stdout, stack[sp-1]); } i++; } if (rpntree[i]) { fprintf(stderr,"Unexpeceted error\n"); } else { stack[sp-2] = processAlign(myaligns, profile1, profile2, &index); stack[--sp] = 0; if(verbose) printTextAlign(stdout, stack[sp-1]); } return stack[sp-1]; } void graphCollapsal (align *simaligns[]) { // for now... fprintf(stderr, "Please specify a phylogenetic tree, using [-tree]\n"); exit(1); } int parseParameters(int argc, char** argv, FileBuffer *files, char **treestr) { int i=1; FileBuffer fb; if (argc < 3) { if (argc == 2) if (!strcmp(argv[1], "-version") || !strcmp(argv[1], "-Version")) { fprintf(stderr, "PROLAGAN version %s\n", VER_NUM); exit(0); } usage(); return 1; } while((argv[i][0]!='-')) { // Read in sequence files // printf("sequence %d: %s\n", i, argv[i]); if (!(files[numseqs++] = FileOpen(argv[i]))) { fprintf(stderr, "couldnt open dbase file %s\n",argv[i]); usage(); return 2; } // seqs[numseqs] = FileRead(seqfile, 0, 0, VER_MLAGAN); // seqs[numseqs]->filename = argv[i]; // numseqs++; if(++i>=argc) break; } // printf("\n"); while (i=argc) || (argv[i][0]=='-')) { fprintf(stderr, "missing parameter specification for [-out].\n"); return 1; } fprintf(stderr, "outputting to: %s\n", argv[i]); outfile = fopen(argv[i], "w"); if (outfile==NULL) { fprintf(stderr, "error with output file...\n"); exit(2); } } if (!strcmp(argv[i], "-tree")) { i++; if ((i>=argc) || (argv[i][0]=='-')) { fprintf(stderr, "missing parameter specification for [-tree].\n"); return 1; } notree = 0; *treestr = argv[i]; fprintf(stderr, "using given phylogenetic tree:\n%s\n", *treestr); } if (!strcmp(argv[i], "-gapperseq")) { i++; if (i>=argc) { fprintf(stderr, "missing parameter specification for [-gapperseq].\n"); return 1; } gapperseq = atoi(argv[i]); fprintf(stderr, "using 
gapperseq score: %d\n", gapperseq); } if (!strcmp(argv[i], "-overlap")) { i++; if (i>=argc) { fprintf(stderr, "missing parameter specification for [-overlap].\n"); return 1; } overlap = atoi(argv[i]); fprintf(stderr, "using overlap value: %d\n", overlap); } if (!strcmp(argv[i], "-glwidth")) { i++; if (i>=argc) { fprintf(stderr, "missing parameter specification for [-glwidth].\n"); return 1; } glwidth = atoi(argv[i]); fprintf(stderr, "using glwidth value: %d\n", glwidth); } if (!strcmp(argv[i], "-pro1")) { i++; if (i>=argc) { fprintf(stderr, "missing filename for [-pro1].\n"); return 1; } fb = FileOpen (argv[i]); profile1 = readProfile(fb); fprintf(stderr, "Profile1 is: %s\n", argv[i]); } if (!strcmp(argv[i], "-pro2")) { i++; if (i>=argc) { fprintf(stderr, "missing filename for [-pro2].\n"); return 1; } fb = FileOpen (argv[i]); profile2 = readProfile(fb); fprintf(stderr, "Profile2 is: %s\n", argv[i]); } i++; } // setScores(gapstart, gapcont, gapend, gapperseq, overlap, glwidth); return 0; } hll* updateAnchorPos(hll* myhll, FileBuffer f1, FileBuffer f2) { hll *res, *temp, *prev=0; res = myhll; fprintf (stderr, "Updating anchs...\n"); for ( ; myhll; myhll = myhll->next) { myhll->seq1start -= (f1->startpos-1); myhll->seq1end -= (f1->startpos-1); myhll->seq2start -= (f2->startpos-1); myhll->seq2end -= (f2->startpos-1); } while (res && (res->seq1start < 0 || res->seq2start < 0)) { // fprintf (stderr, "first..\n"); temp = res; // fprintf(stderr, "Tossed %d %d(%d %d)\n", temp->seq1end, temp->seq2end, // f1->endpos, f2->endpos); res = res->next; free(temp); } temp = res; while (temp && temp->seq1end < (f1->endpos-f1->startpos) && temp->seq2end < (f2->endpos-f2->startpos)) { // fprintf (stderr, "second...\n"); // fprintf(stderr, "Kept %d %d(%d %d)\n", temp->seq1end, temp->seq2end, // f1->endpos-f1->startpos, f2->endpos-f2->startpos); prev = temp; temp = temp->next; } if (prev) { temp = prev; prev = prev->next; temp->next = 0; } else if (temp == res) { res = 0; } else { // 
fprintf (stderr, "returning %d\n", res); return res; } while ( prev ) { // fprintf (stderr, "third...\n"); // fprintf(stderr, "Tossed %d %d(%d %d)\n", temp->seq1end, temp->seq2end, // f1->endpos, f2->endpos); temp = prev; prev = prev->next; free(temp); } return res; } int connectedGraph(hll* graph[MAX_SEQ][MAX_SEQ], int numseqs) { int M[MAX_SEQ][MAX_SEQ]; int i, j, k; for (i = 0; i < numseqs - 1; i++){ for (j = i + 1; j < numseqs; j++){ M[i][j] = M[j][i] = (graph[i][j] != NULL); } } for (k = 0; k < numseqs; k++) for (i = 0; i < numseqs; i++) for (j = 0; j < numseqs; j++) if (M[i][k] && M[k][j]) M[i][j] = 1; k = 1; for (i = 0; k && i < numseqs; i++) k = M[0][i]; return k; } int main(int argc, char** argv) { FileBuffer seqfile; seq **seqs; int i = 1, j = 1, x, y; int pro1cnt=0, pro2cnt=0; int pro1lst[MAX_SEQ], pro2lst[MAX_SEQ]; int pro1ptr[MAX_SEQ], pro2ptr[MAX_SEQ]; char command[256]; char *treestr = NULL; align *stack[MAX_SEQ*2]; align *final; align *myaligns[MAX_SEQ]; hll* table[MAX_SEQ][MAX_SEQ]; FileBuffer files[MAX_SEQ]; outfile = stdout; lagan_dir = getenv ("LAGAN_DIR"); if (!lagan_dir) { fprintf(stderr, "Environment variable LAGAN_DIR not set\n"); exit(1); } buildcache(); initLib(); seqs = (seq**) malloc((argc-1)*sizeof(seq*)); if (parseParameters(argc, argv, files, &treestr)) return 1; gapstart += gapcont; // Take all sequences and make simple alignments for (i=0; iindex = i+1; myaligns[i]=simaligns[i]=mkSimAlign(seqs[i]); simaligns[i]->index = i; x = getSeqNumber(profile1, seqs[i]); y = getSeqNumber(profile2, seqs[i]); if (x < 0 && y < 0) { fprintf(stderr, "Sequence %s not found in either profile!!!\n", seqs[i]->name); exit(1); } if (x >= 0 && y >= 0) { fprintf(stderr, "Sequence %s found in both profiles!!!\n", seqs[i]->name); exit(1); } if (x >= 0) { fprintf(stderr, "Sequence %s[%d/%d] in 1st profile\n", seqs[i]->name, i, numseqs); if (profile1->index > i) { profile1->index = i; } pro1lst[pro1cnt++] = i; pro1ptr[i] = x; pro2ptr[i] = -1; } if (y >= 0) { 
fprintf(stderr, "Sequence %s[%d/%d] in 2nd profile\n", seqs[i]->name, i, numseqs); if (profile2->index > i) { profile2->index = i; } pro2lst[pro2cnt++] = i; pro1ptr[i] = -1; pro2ptr[i] = y; } } // Find all pairwise anchors. fprintf(stderr,"pro1cnt = %d, pro2cnt = %d\n", pro1cnt, pro2cnt); for (i=0; i<(numseqs-1); i++) { for (j=i+1; jhlls[j]=0; } } for (i=0; i< pro1cnt; i++) { for (j=0; j< pro2cnt; j++) { if (pro1lst[i] < pro2lst[j]) { simaligns[pro1lst[i]]->hlls[pro2lst[j]] = generateAnchors(files[pro1lst[i]], files[pro2lst[j]]); simaligns[pro1lst[i]]->hlls[pro2lst[j]] = remapHLLs(simaligns[pro1lst[i]]->hlls[pro2lst[j]], 0, profile1, pro1ptr[pro1lst[i]]); simaligns[pro1lst[i]]->hlls[pro2lst[j]] = remapHLLs(simaligns[pro1lst[i]]->hlls[pro2lst[j]], 1, profile2, pro2ptr[pro2lst[j]]); } else { simaligns[pro2lst[j]]->hlls[pro1lst[i]] = generateAnchors(files[pro2lst[j]], files[pro1lst[i]]); simaligns[pro2lst[j]]->hlls[pro1lst[i]] = remapHLLs(simaligns[pro2lst[j]]->hlls[pro1lst[i]], 0, profile2, pro2ptr[pro2lst[j]]); simaligns[pro2lst[j]]->hlls[pro1lst[i]] = remapHLLs(simaligns[pro2lst[j]]->hlls[pro1lst[i]], 1, profile1, pro1ptr[pro1lst[j]]); } } } // printf("\n"); for (i=0; ihlls[%d].score=%g\n", i,j, simaligns[i]->hlls[j]==NULL ? 0 : simaligns[i]->hlls[j]->score); } } */ // Processall closest pairs if (notree) { // Not yet implemented graphCollapsal(myaligns); } else { fprintf(stderr, "\n****************************\n"); fprintf(stderr, "gs: %d; ge: %d;\n", gapstart, gapend); fprintf(stderr, "gc: %d; gp: %d\n", gapcont, gapperseq); //fprintf(stderr, "match: %d; mismatch: %d\n", match, mismatch); fprintf(stderr, "overlap: %d; glwidth: %d\n", overlap, glwidth); fprintf(stderr, "\n****************************\n"); i = 0; treeToRPN(treestr, stack, &i); final = procStack(stack, i, myaligns); } // Ouput end result. fprintf(stderr, "final alignment... 
\n"); if (fastreject) { printXMFAAlign(outfile, final); } else { printFASTAAlign(outfile, final); } if (outfile != stdout) fclose (outfile); fprintf(stderr, "mlagan -- end.\n"); return 0; } lagan20/src/rechaos.pl0000755000076500007650000002231510502337063015675 0ustar brudnobrudno00000000000000#!/usr/bin/env perl $lagandir = $ENV{LAGAN_DIR}; # Status # -- extension problems if (@ARGV < 2) { print ("usage:\n rechaos seqfile1 seqfile2 [-chaos \"chaos flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-out \"filename\"] [-lazy] [-maskedonly] [-debug] [-translate] [-fastreject]\n"); exit(1); } #$recurfl = "(12,0,25,0)x,(13,1,30,0)x,(8,1,30,0)x,(7,1,30,0)x"; $recurfl = "(12,0,25,0)x,(13,1,30,0)x,(4,0,4,3000)xt,(8,1,30,0)x,(7,1,30,0)x"; #$recurfl = "(12,0,10,200)x,(12,0,10,150)x,(3,0,10,150)xt,(8,0,10,150)x,(12,0,25,0),(13,1,30,0),(3,0,30,0)t,(8,1,30,0),(7,1,25,0)"; $minbox = 10; $minside = 5; $seq1 = $ARGV[0]; $seq2 = $ARGV[1]; $tofile = 0; $masker = 1; $lazycheck = 0; $fastreject = 0; $frminlevel = 0; $frmaxlevel = 3; @frseq1 = (150000, 50000, 30000, 15000); @frseq2 = (150000, 50000, 30000, 15000); #@frseq1 = (70000, 60000, 60000, 20000); #@frseq2 = (70000, 60000, 60000, 20000); $sentinelleft = 1.1; $sentinelright = 1.2; $gfc = " "; $dounmasked = 1; $filename = ""; $debug = 0; $anchparams = ""; $translate = 0; sub max { my ($a, $b) = @_; return $a if ($a > $b); return $b; } sub min { my ($a, $b) = @_; return $a if ($a < $b); return $b; } $i = 2; while ($i < @ARGV) { if ($ARGV[$i] =~ /-\chaos/) { $chaosfl = $chaosfl." ".$ARGV[++$i]; } elsif ($ARGV[$i] =~ /-ext/) { $chaosfl = $chaosfl." 
-ext "; } elsif ($ARGV[$i] =~ /-recurse/) { $recurfl = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-lazy/) { $lazycheck = 1; } elsif ($ARGV[$i] =~ /-nomask/) { $masker = 0; } elsif ($ARGV[$i] =~ /-out/) { $tofile = 1; $filename = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-maskedonly/) { $dounmasked = 0; } elsif ($ARGV[$i] =~ /-fastreject/) { $fastreject = 1; } elsif ($ARGV[$i] =~ /-debug/) { $debug = 1; } elsif ($ARGV[$i] =~ /-translate/) { $translate = 1; } elsif ($ARGV[$i] =~ /-gfc/) { $gfc = " -gfc "; } elsif ($ARGV[$i] =~ /-gap/){ $anchparams = $anchparams." -gap ".$ARGV[++$i]; $anchparams = $anchparams." ".$ARGV[++$i]; } else { die ("Unrecognized option $ARGV[$i]\n"); } $i++; } if ($lazycheck) { if (-f $filename) { print STDERR "Output file already exists, lazy mode exit!\n"; exit (0); } } $extracase1 = 0; $extracase2 = 0; if (-e "$seq1.masked") { $extra1 = $seq1; $seq1 = "$seq1.masked"; $extracase1 = 1; } if (-e "$seq2.masked") { $extra2 = $seq2; $seq2 = "$seq2.masked"; $extracase2 = 1; } if (! $dounmasked){ $extracase1 = 0; $extracase2 = 0; } #open(SEQ1, "$seq1"); #open(SEQ2, "$seq2"); #$line1 = ; #while ($line1 = ) { # chomp $line1; # $seq1len += length($line1); #} # #$line2 = ; #while ($line2 = ) { # chomp $line2; # $seq2len += length($line2); #} $seq1len = `$lagandir/utils/getlength $seq1`; chomp $seq1len; $seq2len = `$lagandir/utils/getlength $seq2`; chomp $seq2len; $b1[0] = $b2[0] = 1; $e1[0] = $seq1len; $e2[0] = $seq2len; $cumanchs = 0; $clipleft1 = 0; $clipleft2 = 0; $clipright1 = $seq1len + 1; $clipright2 = $seq2len + 1; $app_str = ""; $i = 0; while (1) { $goodanchs = 0; $totalanchs = 0; $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/); if (! 
$stillmore) { if ($extracase1 || $extracase2) { if ($extracase1) { $seq1 = $extra1; $extracase1 = 0; } if ($extracase2) { $seq2 = $extra2; $extracase2 = 0; } } else { last; } } else { $wordlen = $1; $degeneracy = $2; $cutoff = $3; $extcutoff = $4; $tail = $5; $extraparams = ""; $extraparams = "-t ".$extraparams if ((index ($tail, "t") != -1) && ($translate)); $extraparams = $extraparams." -rsc $extcutoff" if (index ($tail, "x") != -1); } $recurfl = $6; next if ((index ($tail, "t") != -1) && (!$translate)); print STDERR "Using $seq1 $seq2 ($wordlen, $degeneracy, $cutoff, $extcutoff) $tail\n"; # PRINT OUT LIST OF REGIONS TO ALIGN open (PFILE, ">$$.anchs.pairs"); for ($j = 0; $j < @b1; $j++) { print PFILE "-s1 $b1[$j] $e1[$j] -s2 $b2[$j] $e2[$j]\n"; } close (PFILE); # print STDERR "PAIRS hits\n"; # print STDERR `cat $$.anchs.pairs`; # print STDERR "-----------------\n"; # print STDERR `cat $$.anchs.pairs`; # print STDERR "-----------------\n"; # print STDERR "$lagandir/chaos $seq1 $seq2 -wl $wordlen -nd $degeneracy -co $cutoff $extraparams $gfc $chaosfl -pairs $$.anchs.pairs > $$.anchtemp"; # PERFORM THE ALIGNMENTS USING CHAOS $saver = "$lagandir/chaos $seq1 $seq2 $extraparams -wl $wordlen -nd $degeneracy -co $cutoff $gfc $chaosfl -pairs $$.anchs.pairs > $$.anchtemp"; `$lagandir/chaos $seq1 $seq2 $extraparams -wl $wordlen -nd $degeneracy -co $cutoff $gfc $chaosfl -pairs $$.anchs.pairs > $$.anchtemp`; if ($?) 
{ print STDERR "$saver\n"; exit(1); } # ADD IN BOUNDARIES $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/); if ($fastreject || $stillmore || $extracase1 || $extracase2){ $temp1 = $seq1len + 1; $temp2 = $seq2len + 1; $app_str = $app_str."seq1 0 $clipleft1; seq2 0 $clipleft2; score=$sentinelleft (+)\n"; $app_str = $app_str."seq1 $clipright1 $temp1; seq2 $clipright2 $temp2; score=$sentinelright (+)\n"; } # APPEND HITS FROM $app_str TO LOCAL ALIGNMENT LIST open (OFILE, ">>$$.anchtemp"); print OFILE $app_str; close (OFILE); # `wc $$.anchtemp` =~ /(\d+)/x; # $totalanchs = $totalanchs + $1; # print STDERR "CHAOS hits\n"; # print STDERR `cat $$.anchtemp`; # FIND MAXIMAL-SCORING CONSISTENT CHAIN `$lagandir/anchors $$.anchtemp $gfc $anchparams | sort -n +1 > $$.anchs.sorted`; if ($?) { exit(1); } # IF WE'RE DONE, THEN QUIT! $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/); if (!$stillmore && !$extracase1 && !$extracase2) { last; } # `wc $$.anchs` =~ /(\d+)/x; # print STDERR "ANCHS hits\n"; # print STDERR `cat $$.anchs.sorted`; # $goodanchs = $goodanchs + $1; # if ($?) 
{ exit(1); } # READ SORTED ANCHORS TO @anchors open(SFILE, "$$.anchs.sorted"); @anchors = ; close(SFILE); @b1new = 0; @b2new = 0; @e1new = 0; @e2new = 0; @scores = 0; $app_str = ""; # FOR EACH UNALIGNED REGION $area = 0; $maxarea = 0; $k = 0; for ($m = 0; $m < @anchors; $m++){ # SAVE OLD ANCHORS (SKIP FIRST AND LAST FAKE ANCHORS) if ($m >= 1 && $m < @anchors - 1){ $anchors[$m] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; $score = $5; chomp $score; $app_str = $app_str."seq1 $1 $2; seq2 $3 $4; score=$score (+)\n"; } if ($m == 0){ next; } # DETERMINE REGION BOUNDARIES $anchors[$m-1] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; $gap1begin = $2 + 1; $gap2begin = $4 + 1; $prevanchorscore = $5; chomp $prevanchorscore; $anchors[$m] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; $gap1end = $1 - 1; $gap2end = $3 - 1; $nextanchorscore = $5; chomp $nextanchorscore; # CHECK IF RECURSION NEEDED $boxarea = ($gap1end - $gap1begin + 1) * ($gap2end - $gap2begin + 1); $area = $area + $boxarea; $maxarea = $boxarea if ($boxarea > $maxarea); if ($boxarea >= $minbox && ($gap1end - $gap1begin + 1) > $minside && ($gap2end - $gap2begin + 1) > $minside ){ # FAST REJECT if ($fastreject && ($i >= $frminlevel) && ($i <= $frmaxlevel)){ # SKIP MARKED ENDS OF ALIGNMENT if ($nextanchorscore == $sentinelleft || $prevanchorscore == $sentinelright){ next; } # TRIM NEW ENDS OF ALIGNMENT if ($prevanchorscore == $sentinelleft){ # if ($boxarea > $frseq1[$i] * $frseq2[$i]){ if (($gap1end - $gap1begin > $frseq1[$i]) || ($gap2end - $gap2begin > $frseq2[$i])){ if (@anchors == 2){ exit(3); } $clipleft1 = max ($gap1begin-1, $gap1end - $frseq1[$i]); $clipleft2 = max ($gap2begin-1, $gap2end - $frseq2[$i]); $gap1begin = $clipleft1 + 1; $gap2begin = $clipleft2 + 1; } } elsif ($nextanchorscore == $sentinelright){ # if ($boxarea > $frseq1[$i] * $frseq2[$i]){ if (($gap1end - $gap1begin > $frseq1[$i]) || ($gap2end - $gap2begin > $frseq2[$i])){ if (@anchors == 2){ exit(3); } $clipright1 = min ($gap1end+1, $gap1begin + 
$frseq1[$i]); $clipright2 = min ($gap2end+1, $gap2begin + $frseq2[$i]); $gap1end = $clipright1 - 1; $gap2end = $clipright2 - 1; } } } # ADD REGION if ($gap1begin < $gap1end && $gap2begin < $gap2end){ $b1new[$k] = $gap1begin; $b2new[$k] = $gap2begin; $e1new[$k] = $gap1end; $e2new[$k] = $gap2end; $k++; } } } @b1 = @b1new; @b2 = @b2new; @e1 = @e1new; @e2 = @e2new; if ($debug) { print STDERR "Level $i Summary:\n"; print STDERR " Using $seq1 $seq2 ($wordlen, $degeneracy, $cutoff)\n"; if ($totalanchs == 0) { $percentage = 0; } else { $percentage = $goodanchs / $totalanchs * 100.0; } print STDERR " $goodanchs good out of $totalanchs total anchors ($percentage%)\n"; $area = $area / 1000000; $maxarea = $maxarea / 1000000; print STDERR " Total area left = $area (max = $maxarea)\n"; } $cumanchs = $cumanchs + $goodanchs; $i++; } $res = `sort -nr +1 $$.anchs.sorted`; if ($?) { exit(1); } `rm $$.*`; if($tofile) { open(OUTFILE, ">$filename"); print OUTFILE "$res"; close OUTFILE; } else { print "$res"; } print STDERR "$cumanchs cumulative anchors\n" lagan20/src/skiplist.c0000644000076500007650000001000410502337063015707 0ustar brudnobrudno00000000000000#include #include #include #include "skiplist.h" #include #include char init = 0; void printSLE(sle* tbp) { printf(" %d %x\n", tbp->index, tbp->myelem); } int makeLevel() { unsigned int r = lrand48(); int i = 1; while ((r&1) && (i> 1; } /* printf("lev = %d\n", i);*/ return i; } void initLib() { init = 1; srand48(time(0)); } /* makes a new skip list*/ sklst* makeSkLst() { int i; sklst* res = (sklst*) malloc (sizeof(sklst)); if (!init) { fprintf(stderr, "Skip Lists not initialized\n"); exit(2); } res->sentinel = mksle(MAX_LISTS, INT_MIN, 0); res->maxlevel = 1; return res; } /*deletes an old skip list */ void delSkLst(sklst* trgt) { sle *next, *tbd = trgt->sentinel; while(tbd) { next = tbd->next[0]; delSLE(tbd); tbd = next; } } void chklst2(sklst* trgt) { sle* tt = trgt->sentinel; sle* tt2 = tt->next[0]; while (tt2) { assert(tt->index 
<= tt2->index); assert(tt == tt2->prev[0]); tt = tt->next[0]; tt2 = tt2->next[0]; } } void chklst(sklst* trgt) { sle* tt = trgt->sentinel; sle* tt2 = tt->next[0]; while (tt2) { assert(tt->index <= tt2->index); assert(tt == tt2->prev[0]); tt = tt->next[0]; tt2 = tt2->next[0]; } } sle* SLinsertAfter(sklst* trgt, sle* prev, int index, void* elem) { int i; sle *tbe; int lc = makeLevel(); if (lc > trgt->maxlevel) { trgt->maxlevel = lc; } tbe = mksle(lc, index, elem); for (i = 0; i < tbe->linkcnt; i++) { tbe->prev[i] = prev; if (prev->next[i]) { prev->next[i]->prev[i] = tbe; } tbe->next[i] = prev->next[i]; prev->next[i] = tbe; while (prev && i >= prev->linkcnt-1) prev = prev->prev[i]; } return tbe; } /*inserts the elem with the index */ sle* SLinsert(sklst* trgt, int index, void* elem) { sle* prev = SLfind(trgt, index), *tbe; return SLinsertAfter(trgt, prev, index, elem); } /*removes & destroys this element */ void SLremove(sklst* trgt, sle* tbr) { int i; if (trgt) for (i = 0; i < tbr->linkcnt; i++) { if (tbr->prev[i]) tbr->prev[i]->next[i] = tbr->next[i]; if (tbr->next[i]) tbr->next[i]->prev[i] = tbr->prev[i]; } delSLE(tbr); } /* I could just keep a pointer to last, but since I'll rarely use it I'll find it this way instead.. */ sle* SLgetLast(sklst* trgt) { int i; sle* currpivot = trgt->sentinel; i = trgt->maxlevel-1; for ( ; i >= 0; i--) { while (currpivot->next[i]) { currpivot = currpivot->next[i]; } } return currpivot; } /* Same as the method below, but good for searching for things near the beginning. 
it uses an up-down method */ sle* SLlowFind(sklst* trgt, int index) { int i; sle* currpivot = trgt->sentinel; i = 0; for ( ; i < trgt->maxlevel-1; i++) { if (!currpivot->next[i] || currpivot->next[i]->index > index) break; currpivot = currpivot->next[i]; } for ( ; i >= 0; i--) { while (currpivot->index < index) { if (!currpivot->next[i]) { goto cont; } currpivot = currpivot->next[i]; } currpivot = currpivot->prev[i]; cont: {} } return currpivot; } /*gets the elem with the next lowest index. 0 if none */ sle* SLfind(sklst* trgt, int index) { int i; sle* currpivot = trgt->sentinel; i = trgt->maxlevel-1; for ( ; i >= 0; i--) { while (currpivot->index < index) { if (!currpivot->next[i]) { goto cont; } currpivot = currpivot->next[i]; } currpivot = currpivot->prev[i]; cont: {} } return currpivot; } sle* mksle(int linkcnt, int index, void* myelem) { int i; sle* res = (sle*)malloc (sizeof(sle)); res->next = (sle**) malloc(linkcnt*sizeof(sle*)); res->prev = (sle**) malloc(linkcnt*sizeof(sle*)); res->linkcnt = linkcnt; res->index = index; res->myelem = myelem; for (i = 0; i < linkcnt; i++) { res->next[i] = 0; res->prev[i] = 0; } return res; } void delSLE(sle* tbd) { free(tbd->next); free(tbd->prev); free(tbd); } lagan20/src/skiplist.h0000644000076500007650000000122410502337063015720 0ustar brudnobrudno00000000000000#define MAX_LISTS 32 typedef struct skiplistelem { struct skiplistelem** next; struct skiplistelem** prev; int linkcnt; int index; void* myelem; } sle; typedef struct skiplist { sle* sentinel; int maxlevel; } sklst; void initLib(); sklst* makeSkLst(); void chklst(sklst* trgt); void delSkLst(sklst* trgt); sle* SLinsertAfter(sklst* trgt, sle* prev, int index, void* elem); sle* SLinsert(sklst* trgt, int index, void* elem); sle* SLgetLast(sklst* trgt); void SLremove(sklst* trgt, sle* tbr); sle* SLfind(sklst* trgt, int index); sle* SLlowFind(sklst* trgt, int index); sle* mksle(int linkcnt, int index, void* myelem); void delSLE(sle* tbd); 
lagan20/src/slagan-mfa.pl0000755000076500007650000000217210502337063016256 0ustar brudnobrudno00000000000000#!/usr/bin/perl use strict; $0 = rindex($0, "/") > -1 ? substr($0, rindex($0, "/")+1) : $0; die("$0: LAGAN_DIR not defined. Stopped") unless defined $ENV{"LAGAN_DIR"}; my $LAGAN_DIR = $ENV{LAGAN_DIR}; my ($outfile, $base); foreach my $arg (@ARGV) { if ($arg =~ /-out\s+([^\s]+)/) { $outfile = $1; $arg =~ s/-out\s+([^\s]+)//; } elsif ($arg =~ /-base[\s\=]+([^\s]+)/) { $base = $1; $arg =~ s/-base[\s\=]+([^\s]+)//; die("$0: Invalid base parameter (expected 1 or 2). Stopped") unless $base eq "1" or $base eq "2"; } } if (@ARGV < 2) { print ("Usage:\n$0 seqfile1 seqfile2 [-glocal \"glocal flags\"] [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-mfa] [-out \"filename\"] [-maskedonly] [-debug] [-translate] [-fastreject]\n"); exit(1); } my $args = join(" ", @ARGV); system($LAGAN_DIR."/slagan.pl $args > slagan.pl.out"); die("$0: slagan.pl returned error $?. Stopped") if $?; system($LAGAN_DIR."/xmfa2mfa.pl ".($base eq "2" ? "2" : "1")." < slagan.pl.out ".($outfile ? "> $outfile" : "")); die("$0: xmfa2mfa.pl returned error $?. Stopped") if $?; unlink "slagan.pl.out"; lagan20/src/slagan.pl0000755000076500007650000001415510502337063015521 0ustar brudnobrudno00000000000000#!/usr/bin/perl -w use strict; my $lagandir = $ENV{LAGAN_DIR}; if (@ARGV < 2) { print ("Usage:\n slagan.pl seqfile1 seqfile2 [-glocal \"glocal flags\"] [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-mfa] [-out \"filename\"] [-maskedonly] [-debug] [-translate] [-fastreject]\n"); exit(1); } my ($seq1, $firstName) = ($ARGV[0], $ARGV[0]); die("$0: File not found: $seq1. Stopped") unless -f $seq1; my ($seq2, $secondName) = ($ARGV[1], $ARGV[1]); die("$0: File not found: $seq2. 
Stopped") unless -f $seq2; my ($extra1, $extra2) =(0, 0); if (-e "$seq1.masked") { $seq1 = "$seq1.masked"; $extra1 = 1;} if (-e "$seq2.masked") { $seq2 = "$seq2.masked"; $extra2 = 1;} my $max_ext = 25000; my $ext_mul = 1; my $arglist = ""; my $glocal_fl = " -gapopen 0,1000,2000,2000 -gapcont 0.2,0.06,0.06,0.06 -dist 0,1.0,2.5,2.5"; my $chaos_fl = " -wl 11 -nd 1 -co 10 -ext -rsc 2250 -b"; my $lagan_fl = ""; my $supermap_fl = "-glocal_out=slagan.out.glocal"; my $outfile = 0; my $fastrej = 0; for (my $i = 2; $i < @ARGV; $i++) { if ($ARGV[$i] =~ /-glocal_fl/) { $glocal_fl = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-chaos_fl/) { $chaos_fl = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-lagan_fl/) { $lagan_fl = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-max_ext/) { $max_ext = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-ext_mul/) { $ext_mul = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-out/) { $outfile = $ARGV[++$i]; if (-e "$outfile") { system("rm $outfile") and exit(1); } } elsif ($ARGV[$i] =~ /-order/) { $arglist = $arglist." -order $ARGV[++$i]"; } elsif (($ARGV[$i] =~ /-gs/) || ($ARGV[$i] =~ /-gc/) || ($ARGV[$i] =~ /-mt/) || ($ARGV[$i] =~ /-ms/) || ($ARGV[$i] =~ /-bw/)) { $arglist = $arglist." ".$ARGV[$i]; $arglist = $arglist." ".$ARGV[++$i]; } elsif ($ARGV[$i] =~ /-ext/) { $arglist = $arglist." -ext $ARGV[++$i]"; } elsif ($ARGV[$i] =~ /-maskedonly/) { $arglist = $arglist." -maskedonly"; } elsif ($ARGV[$i] =~ /-translate/) { $arglist = $arglist." -translate"; } elsif ($ARGV[$i] =~ /-fastreject/) { $fastrej = 1; # $arglist = $arglist." -fastreject"; } elsif ($ARGV[$i] =~ /-recurse/) { $arglist = $arglist." -recurse \"".$ARGV[++$i]."\""; } elsif ($ARGV[$i] =~ /-chaos/) { $chaos_fl = $ARGV[++$i]; } else { die("$0: Invalid option for rlagan: $ARGV[$i]"); } } my $seq1len = `$lagandir/utils/getlength $firstName`; my $seq2len = `$lagandir/utils/getlength $secondName`; chomp $seq1len; chomp $seq2len; `$lagandir/chaos $seq1 $seq2 $chaos_fl > chaos.$$`; if ($?) 
{ exit(1); } #`$lagandir/glocal chaos.$$ $glocal_fl > out.$$`; #@regs = `$lagandir/anal_gloc.pl < out.$$`; #print @regs; open(FH, "> seq1len"); print FH $firstName." ".$seq1len."\n"; close FH; open(FH, "> seq2len"); print FH $secondName." ".$seq2len."\n"; close FH; my $supermap_outfile = "slagan.out.smap"; my $supermap_inv = "$lagandir/supermap.pl -sizes1=seq1len -sizes2=seq2len $supermap_fl chaos.$$ -no_clust_run -f -out=$supermap_outfile 1>&2"; #print $supermap_inv."\n"; system($supermap_inv); open(FH, "< $supermap_outfile"); my @regs = ; die("$0: Supermap generated no regions. Stopped") unless scalar @regs; close FH; unlink "seq1len"; unlink "seq2len"; # unlink $supermap_outfile; #$prevend1 = $seq1len; #$prevend2 = $seq2len; #$nextstart1 = 1; #$nextstart2 = 1; for (my $k = 0; $k < @regs; $k++) { $regs[$k] =~ /^([^\s]+)\s([\d]+)\s([\d]+)\s\s\s([^\s]+)\s([\d]+)\s([\d]+)\s(\+|\-)\s\((DM|M1|M2),\s([\d]+)\saligns\)$/o; my ($startreg1, $endreg1, $startreg2, $endreg2, $strand, $type) = ($2, $3, $5, $6, $7, $8); =head1 $regs[$k] =~ /.* Region \[(\d+) (\d+)\]\[(\d+) (\d+)\] (.*) (.)/; $startreg1 = $1; $endreg1 = $2; $startreg2 = $3; $endreg2 = $4; $strand = $6; if ($k+2 < @regs) { $regs[$k+1] =~ /.* Region \[(\d+) (\d+)\]\[(\d+) (\d+)\] (.*) (.)/; $nextstart1 = $2; } else { $nextstart1 = 1; } $y1 = $prevend1-$endreg1; $y2 = $startreg1-$nextstart1; $expandback = ($max_ext < $y1)? $max_ext:$prevend1-$endreg1; $expandforw = ($max_ext < $y2)? $max_ext:$startreg1-$nextstart1; $prevend1 = $startreg1; $startreg1 = $startreg1 - $expandforw; $endreg1 = $endreg1 + $expandback; =cut my $rcf = ""; if ($strand eq "+") { # $endreg2 = ($endreg2 + $expandback * $ext_mul > $prevend2)? $prevend2:($endreg2 + $expandback * $ext_mul); # $startreg2 = ($startreg2 - $expandforw * $ext_mul < $nextstart2)? $nextstart2:($startreg2 - $expandforw * $ext_mul); } else { $rcf = "-rc"; # $endreg2 = ($endreg2 + $expandforw * $ext_mul > $prevend2)? 
$prevend2:($endreg2 + $expandforw * $ext_mul); # $startreg2 = ($startreg2 - $expandback * $ext_mul < $nextstart2)? $nextstart2:($startreg2 - $expandback * $ext_mul); } #print "$lagandir/utils/fa2xfa $firstName $startreg1 $endreg1 1 > seq1$k.$$\n"; `$lagandir/utils/fa2xfa $firstName $startreg1 $endreg1 1 > seq1$k.$$\n`; #print "$lagandir/utils/fa2xfa $secondName $startreg2 $endreg2 2 $rcf > seq2$k.$$\n"; `$lagandir/utils/fa2xfa $secondName $startreg2 $endreg2 2 $rcf > seq2$k.$$\n`; # if ($extra1) { `$lagandir/utils/fa2xfa $seq1 $startreg1 $endreg1 1 > seq1$k.$$.masked\n`; } # if ($extra2) { `$lagandir/utils/fa2xfa $seq2 $startreg2 $endreg2 2 $rcf > seq2$k.$$.masked\n`; } #print "$lagandir/lagan.pl seq1$k.$$ seq2$k.$$ $arglist $lagan_fl -mfa -out lagan.$k.$$\n"; `$lagandir/lagan.pl seq1$k.$$ seq2$k.$$ $arglist $lagan_fl -mfa -out lagan.$k.$$\n`; my $suff = ""; if ($outfile) { $suff = " >> $outfile"; } if (-e "lagan.$k.$$") { if ($fastrej) { #print "$lagandir/utils/scorealign lagan.$k.$$ 45 -cropxmfa -ibounds $suff\n"; print `$lagandir/utils/scorealign lagan.$k.$$ 45 -cropxmfa -ibounds $suff`; } else { #print "$lagandir/utils/scorealign lagan.$k.$$ 45 -ibounds\n"; my $sc = `$lagandir/utils/scorealign lagan.$k.$$ 45 -ibounds`; chomp($sc); if ($sc) { print `cat lagan.$k.$$ $suff`; print `echo \"=$sc $type\n\" $suff`; } } } } my ($outName1, $outName2) = ($ARGV[0], $ARGV[1]); $outName1 =~ s/^.*\///; $outName1 =~ s/\..*//; $outName2 =~ s/^.*\///; $outName2 =~ s/\..*//; `cat chaos.$$ > ${outName1}_$outName2.chaos`; ####`cat out.$$ > ${outName1}_$outName2.mon`; unlink(glob("*.$$")); if ($extra1 || $extra2) { `rm *.$$.masked`; } exit(0); # out: .chaos .mon->.smap .xmfa lagan20/src/sortlist.c0000644000076500007650000000154010502337063015735 0ustar brudnobrudno00000000000000hll* merge2(hll* list1, hll* list2) { hll* totallist = 0; hll* temp; while (list1 || list2) { if ((list1 && !list2) || (list1->seq1start > list2->seq1start)) { temp = list1->next; list1->next = totallist; 
totallist = list1; list1 = temp; } else { temp = list2->next; list2->next = totallist; totallist = list2; list2 = temp; } } return totallist; } hll* findmiddle(hll* mylist) { hll* other = mylist; while (other && other->next) { other = other->next->next; mylist = mylist->next; } return mylist; } hll* sortList(hll* mylist) { hll* premid; hll* mid; if (!mylist || !mylist->next) return mylist; premid = findmiddle(mylist); mid = premid->next; premid->next = 0; mylist = sortList(mylist); mid = sortList(mylist); mylist = merge2(mylist,mid); } lagan20/src/supermap.pl0000755000076500007650000021531510502337063016111 0ustar brudnobrudno00000000000000#!/usr/bin/perl # Supermap: Piecewise monotonic alignment map generator for Shuffle-LAGAN # Author: Andrey Kislyuk (kislyuk@ocf.berkeley.edu) package Supermap; require 5.005; my ($VERSION) = ('$Id: supermap.pl,v 1.50 2005/06/15 22:40:04 kislyuk Exp $' =~ /,v\s+(\d+\S+)/o); # Default constant values my $overlap_factor = 0.8; # Aligns will be discarded if another align overlaps them by this factor or more in both seqs and has the same orientation my $max_asym = 10; # Chains will be formed only if the resulting region's lengths differ by at most this factor my $min_seq_score; # All aligns for sequences with this total score will be discarded. See getMinSeqScore my $max_expand_len = 30000; # Aligns will be expanded or contracted on both sides on both strands by this amount up to the total length below my $expand_factor = 4; # When one of an align's sequences is constrained in its expansion by a neighbor/start/end, the other one will be expanded by this times more than the first one my $max_chainlen = 1500000; # Aligns will not be joined if the total length on either strand exceeds this. 
Set 0 to disable (no chain length limit) my $max_job_size = 50000; # Maximum job size, in blat hits, for chunking when running glocal in parallel my $erode_align = 15; # Amount by which to erode the coords of each align loaded (to avoid overlap problems when chaining) my ($c1, $c2, $c3, $c4) = (100, 50, 400, 25); # BLAT->CHAOS score conversion parameters #my $max_dist_y = 10000; # Join x-monotonic into same single-chain only if at most that apart in y-species. my $default_lagan_dir = "/home/genome/glocal"; my $glocal_name = (0 ? "SLAGAN" : "glocal"); use Getopt::Long; use File::Path; use File::Copy; use Cwd; use IPC::Open2; use IO::Handle; #use Carp; use strict; use warnings; no warnings "uninitialized"; sub main(); sub init(); sub getSeqSizes($$$); sub prepareHits(); sub runSLAGAN(); sub reprintInputHits($$$); sub processResults(); sub removeSLAGANOutput(); sub seqBelowMinScore($); sub alignHashID($); sub printChainToTemp($$$$); sub chainBase1Hits($$); sub chainBase2Hits($); sub load2MHashes($); sub loadBase2Hashes($); sub postProcessRegions(); sub workerRun($$$$); sub dequeueClustJobs($); sub get_all_seqs($$); sub isBLAT($); sub useIf($$); sub writeSizes($$); sub getMinSeqScore($); sub checkAlignCoords($); sub expandSeq1($$); sub expandSeq2($$); sub finalExpand($$); sub expSeq1Reg($$$$$); sub expSeq2Reg($$$$$); sub finalExpReg($$$$$); # array index constants use constant START1 => 0; use constant END1 => 1; use constant START2 => 2; use constant END2 => 3; use constant SEQ1 => 4; use constant SEQ2 => 5; use constant ORIENT => 6; use constant ORIGIN => 7; use constant SCORE => 8; use constant TOTSC => 9; use constant HASHID => 10; use constant FLIPPED=> 11; use constant CHALO1 => 12; use constant CHAHI1 => 13; use constant CHALO2 => 14; use constant CHAHI2 => 15; use constant CHALO1E=> 16; use constant CHAHI1E=> 17; use constant CHALO2E=> 18; use constant CHAHI2E=> 19; #use constant PREV1 => 8; use constant NEXT1 => 9; #use constant PREV2 => 10; use constant NEXT2 
=> 11; #use constant OSTART1=> 12; use constant OEND1 => 13; #use constant OSTART2=> 14; use constant OEND2 => 15; $SIG{'INT'} = $SIG{'QUIT'} = $SIG{'HUP'} = $SIG{'TRAP'} = $SIG{'ABRT'} = $SIG{'STOP'} = $SIG{'TERM'} = \&dequeueClustJobs; my ($debug, $quiet, $outfile, $proflip, $skip, $no_pid, $input_glob, $input_dir, $server, $db, $gen1, $gen2, $gen1sizefile, $gen2sizefile, $write_sizes1, $write_sizes2, $score_file, $cfg, $cfg_file, $sizes1, $sizes2, $dbh, $tmp_dir, $tmp_prefix, $nodelete, $clust_run_pid, $print_chains, $no_aligntotals, $no_clust_run, $num_jobs, $input_is_blat, $force_overwrite, $print_csv, $using_GP, $slagan_params, $tmp_existed, $print_stats, $lagan_dir, $glocal_out_logfile); my (@input_files); my (%offsets1, %offsets2, %aligns1, %aligns2, %flipped_aligns); my $supermapexec = $0; my $mycwd = getcwd(); $supermapexec =~ s/^\./$mycwd/ unless $supermapexec =~ /^\.\./; $supermapexec = $mycwd."/".$supermapexec if $supermapexec =~ /^\.\./; die("$0: Problem resolving my name, \'$supermapexec\' is not a file") unless -f $supermapexec or $ARGV[0] eq "worker"; $0 = rindex($0, "/") > -1 ? 
substr($0, rindex($0, "/")+1) : $0; $lagan_dir = $ENV{"LAGAN_DIR"} if defined $ENV{"LAGAN_DIR"}; $lagan_dir = $ENV{"LAGAN_DIR"} = $default_lagan_dir unless defined $ENV{"LAGAN_DIR"}; $lagan_dir =~ s/^\.\./$mycwd\/\.\./; $lagan_dir =~ s/^\./$mycwd\//; $ENV{"LAGAN_DIR"} = $lagan_dir; print STDERR "$0: Warning: LAGAN_DIR=$lagan_dir is not a valid directory\n" unless -d $lagan_dir; push @INC, $lagan_dir; my $SLAGAN = $lagan_dir."/".$glocal_name; my $error_file = "./$0.$$.error.log"; my $default_score_file = $lagan_dir."/test.score"; my $default_outfile = "$0.out"; my $worker_tmp_dir = "/tmp/$0.$$.worker/"; # The directory where workers store their intermediate files (two workers should not use the same directory) my $usage = " -infile=file \t Name of input file containing all hits for the two genomes -outfile=file \t Output filename (default: $default_outfile) -gen1=id \t First genome ID (must exist in the GPDB) -gen2=id \t Second genome ID (must exist in the GPDB) -sizes1=file \t File with sequence sizes for first genome -sizes2=file \t File with sequence sizes for second genome -bacteria \t Rearrange circular DNA to find a better alignment map -server=hostname GPDB server (default: lemur) -db=dbname \t GPDB name (default: GP) -config=file \t GPDB config file (default: ~/.gprc) -score=file \t Score file for SLAGAN (default: $default_score_file) -glocal_out=file \t Save intermediate GLOCAL alignment hits to this file -no_clust_run \t Run CPU/memory intensive jobs locally, not on the GP cluster -tmp_dir=dir \t Working directory (default: /tmp/$0.pid) -f \t\t Overwrite output file without prompting if it exists -v \t\t Verbose mode -q \t\t Quiet mode -k \t\t Keep all temporary files -expand_length=N Maximum length by which to expand alignments (default: $max_expand_len) -max_length=N \t Maximum length for any alignment chain in either strand \t\t (default: $max_chainlen) -min_seq_score=N Sequences with total align score below this threshold will be \t\t discarded 
(default: U penalty in SLAGAN score file) -max_job_size=N Threshold, in hits, for splitting workload into separate jobs \t\t for clust_run (default: $max_job_size) -c1, c2, c3, c4=N: Score factors for BLAT->CHAOS conversion \t\t (default: $c1, $c2, $c3, $c4) Options may be abbreviated. Input file format is BLAT or CHAOS. Sequence names should not contain spaces. Alignments with negative scores are discarded. Sequence size file format, one sequence per line: seq_name seq_size "; exit(main()); # ___ Subroutines _______________ sub main() { if ($ARGV[0] eq "worker") { workerRun($ARGV[1], $ARGV[2], $ARGV[3], $ARGV[4]); exit(0); } # Running SLAGAN in distributed mode init(); print("$0: Retrieving sequence info...\n") unless $quiet; $sizes1 = getSeqSizes($dbh, $gen1, $gen1sizefile); (writeSizes($sizes1, $write_sizes1), exit(0)) if defined $write_sizes1; $sizes2 = getSeqSizes($dbh, $gen2, $gen2sizefile); (writeSizes($sizes2, $write_sizes2), exit(0)) if defined $write_sizes2; die("$0: No sequence size data found. Stopped") if (keys(%$sizes1) < 1 or keys(%$sizes2) < 1); die("$0: Flip mode is only applicable for two single-sequence organisms. 
Stopped") if ($proflip and not (keys(%$sizes1) == 1 and keys(%$sizes2) == 1)); # Sort and separate the alignments, run SLAGAN on them prepareHits(); runSLAGAN(); # Chain SLAGAN alignments into supermonotonic chain and save the intermediate results my ($dc, $sc1, $sc2) = processResults(); # Load the results back and expand regions, then print them postProcessRegions(); print "$0: Output written to $outfile\n" unless $quiet; print "$0: Intermediate files kept in $tmp_dir\n" if $nodelete and not $quiet; rmdir $tmp_dir unless $tmp_existed or $nodelete; return 0; } # Startup tasks sub init() { system('export LC_ALL="C"'); # Things may misbehave if locale is set to UTF-8 # Berkeley Genome Pipeline functionality is used if corresponding Perl modules are found in @INC foreach my $dir (@INC) { $using_GP = 1 if -f $dir."/GPDBI.pm" and -f $dir."/GPutils.pm"; } useIf $using_GP, "GPDBI"; useIf $using_GP, "GPutils"; useIf 1, "Utils"; # useIf 1, "Desoverlap"; die("$0: GetOptions failed to retrieve options. Check the input options. 
Usage:".$usage) unless GetOptions( "server=s" => \$server, "gen1=s" => \$gen1, "gen2=s" => \$gen2, "sizes1=s" => \$gen1sizefile, "sizes2=s" => \$gen2sizefile, "blatfile=s" => \$input_glob, "infile=s" => \$input_glob, "outfile=s" => \$outfile, "glocal_out=s" => \$glocal_out_logfile, "bacteria" => \$proflip, "server=s" => \$server, "db=s" => \$db, "config=s" => \$cfg_file, "tmp_dir=s" => \$tmp_dir, "skip" => \$skip, "no_pid" => \$no_pid, "no_clust_run" => \$no_clust_run, "print_chains" => \$print_chains, "print_stats" => \$print_stats, "no_aligntotals"=> \$no_aligntotals, "print_csv" => \$print_csv, "max_job_size" => \$max_job_size, "max_length=i" => \$max_chainlen, "expand_length=i"=>\$max_expand_len, "min_seq_score=i"=>\$min_seq_score, "max_asym=i" => \$max_asym, "overlap_factor"=> \$overlap_factor, "score=s" => \$score_file, "c1=i" => \$c1, "c2=i" => \$c2, "c3=i" => \$c3, "c4=i" => \$c4, "slagan_params" => \$slagan_params, "write_sizes1=s"=> \$write_sizes1, "write_sizes2=s"=> \$write_sizes2, "keep" => \$nodelete, "f" => \$force_overwrite, "v" => \$debug, "q" => \$quiet ); undef $quiet if $debug; my @uinfo = getpwuid($>); print("$0: Version ".$VERSION." started ".localtime()." by ".$uinfo[0]."\n") unless $quiet; $tmp_prefix = $0.($no_pid ? "" : ".".$$); unless ($no_clust_run) { $no_clust_run = `which clust_run 2> /dev/null`; $no_clust_run = not $no_clust_run; print("$0: clust_run not found - cluster operation disabled\n") if $no_clust_run and not $quiet; } if ($tmp_dir) { $tmp_existed = 1 if -d $tmp_dir; mkdir $tmp_dir unless -d $tmp_dir; $tmp_dir .= "/" unless /\/^Z/; } else { $tmp_dir = "/tmp/".$tmp_prefix; mkdir $tmp_dir; $tmp_dir .= "/"; } die("$0: No write permissions in working directory $tmp_dir. Stopped") unless -w $tmp_dir; die("$0: Genome IDs or size files not specified. Usage:".$usage) unless ($gen1 or $gen1sizefile) and ($gen2 or $gen2sizefile); die("$0: '-gen' options are invalid because GPDB is not available. Use '-sizes'. 
Stopped") if (($gen1 or $gen2) and not $using_GP); die("$0: Sequence size file $gen1sizefile not found. Stopped") unless -f $gen1sizefile or $gen1; die("$0: Sequence size file $gen2sizefile not found. Stopped") unless -f $gen2sizefile or $gen2; die("$0: Maximum job size too small, must exceed 10000 hits. Stopped") if $max_job_size < 10000; die("$0: Overlap factor must be between 0 and 1. Stopped") if $overlap_factor < 0 or $overlap_factor > 1; print("$0: SLAGAN score file not specified, using default $default_score_file\n") unless $score_file or $quiet; print("$0: Output file not specified, using default $default_outfile\n") unless $outfile or $quiet; # Check input file or glob if (defined $input_glob) { if ($input_glob =~ /\//) { ($input_dir, $input_glob) = ($input_glob =~ /\A(.*\/)([^\/]+)\Z/); } $input_glob .= "\$" unless $input_glob =~ /\$$/; $input_glob = "^".$input_glob unless $input_glob =~ /^\^/; @input_files = Utils::safe_glob($input_glob, $input_dir); } elsif (@ARGV > 0) { foreach my $file (@ARGV) { if ($file =~ /\//) { ($input_dir, $file) = ($file =~ /\A(.*\/)([^\/]+)\Z/); } push @input_files, $file; } } else { # TODO: split stdin for >2GB input open(FH, "> $tmp_dir$tmp_prefix.in"); print FH while ; close FH; push @input_files, "$tmp_prefix.in"; $input_dir = $tmp_dir; } unless ($input_dir =~ /\A\//) { $input_dir = $mycwd."/".$input_dir; } die("$0: No input files matching \"$input_dir$input_glob\" found. Stopped") unless @input_files > 0; print "$0: ".@input_files." input file(s)\n" if $debug; # Check output file $outfile = $default_outfile unless $outfile; if (-f $outfile and not $force_overwrite and -t STDERR) { print STDERR "$0: $outfile exists. Overwrite? 
(y/N, '-f' to force) "; my $overwrite = ; chomp $overwrite; (print("Move \"$outfile\" or use option '-f'.\n"), exit(1)) unless ($overwrite eq "Y" or $overwrite eq "y" or $overwrite eq "yes"); } open(FH, "> ".$outfile) or die("$0: Cannot open $outfile for writing: $!"); close FH; # Check SLAGAN score file $score_file = $default_score_file unless $score_file; unless ($score_file =~ /\A\//) { $score_file = $mycwd."/".$score_file; } $max_expand_len += $erode_align; die("$0: max_length cannot be less than 0. Stopped") if $max_chainlen < 0; $max_chainlen = 1000000000 if $max_chainlen == 0; $max_chainlen -= 2*$max_expand_len; # SLAGAN output for a given sequence will be discarded if the total score for the sequence is below this threshold. Default value is the SLAGAN unrelated gap penalty. $min_seq_score = getMinSeqScore($score_file) unless defined $min_seq_score; # Connect to GPDB if ($using_GP) { $GPutils::Error = ""; $cfg = read_gp_config(Get_Abs_Path($cfg_file)) or die($GPutils::Error); $server ||= $cfg->Get_Val("DB", "server"); $db ||= $cfg->Get_Val("DB", "main_db"); $dbh = GPDBI->connect($server, 0, $db, undef, undef, "gp_cgi", undef, {PrintError => 0, RaiseError => 1}); } } # Load sequence names and sizes either from GPDB or from file sub getSeqSizes($$$) { my ($dbh, $dataset, $gen_size_file) = @_; if ($dataset) { return get_all_seqs($dbh, $dataset); } else { my %sizes; open(FH, "< ".$gen_size_file) or die("$0: Could not open file $gen_size_file for reading: ".$!); while () { chomp; my ($seq, $size) = split; die("$0: Invalid format in file $gen_size_file") unless $seq and $size; $sizes{$seq} = $size; } close FH; return \%sizes; } } # Convert BLAT to CHAOS if necessary # Flip hits on circular sequence if necessary sub prepareHits() { my ($cur_align); local (*FH, *OUT1); print "$0: Preparing files...\n" unless $quiet; $input_is_blat = 1 if isBLAT($input_dir.$input_files[0]); if ($input_is_blat) { foreach my $file (@input_files) { system('awk \'{$13=($13+$15)?$13:1; 
print $1,$2,$3";",$5,$6,$7"; '. 'score = "' . $c1 . '*$8-' . $c2 . '*$9-' . $c3 . '*($12+$14)-' . $c4 . '*log($13+$15),"("$4")"}\''. "< $input_dir$file > $tmp_dir$file.chaos"); } } else { foreach my $file (@input_files) { system('ln -s "'.$input_dir.$file.'" "'.$tmp_dir.$file.'.chaos"'); } } if ($proflip) { open(FH, "< ".$tmp_dir.$input_files[0].".chaos") or die("$0: Could not open file ".$tmp_dir.$input_files[0].".chaos for reading: ".$!); open(OUT1, "> ".$tmp_dir.$input_files[0].".flipped.chaos") or die("$0: Could not open file ".$tmp_dir.$input_files[0].".flipped.chaos for writing: ".$!); my (@seq1s, @seq1e, @seq2s, @seq2e, @scores, @orientations, @seqn1, @seqn2); my ($seq1center, $seq2center, $seq1median, $seq2median); my $i = 0; while () { /\A[\s]*.*\s([\d]+)\s([\d]+)\;\s.*\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/; # ($seqn1[$i], $seq1s[$i], $seq1e[$i], $seqn2[$i], $seq2s[$i], $seq2e[$i], $scores[$i], $orientations[$i]) = ($1, $2, $3, $4, $5, $6, $7, $8); ($seq1s[$i], $seq1e[$i], $seq2s[$i], $seq2e[$i], $scores[$i], $orientations[$i]) = ($1, $2, $3, $4, $5, $6); if ($seq1s[$i] > $seq1e[$i]) { my $j = $seq1s[$i]; $seq1s[$i] = $seq1e[$i]; $seq1e[$i] = $j; } if ($seq2s[$i] > $seq2e[$i]) { my $j = $seq2s[$i]; $seq2s[$i] = $seq2e[$i]; $seq2e[$i] = $j; } $i++; } # For each interval pair, # if the seq1 interval median is greater than seq1 median, and the corresponding interval median in seq2 is less than seq2 median, # OR if the seq1 interval median is less than seq1 median, and the corresponding interval median in seq2 is greater than seq2 median, # set start of interval in seq1 to 2CoM1 - previous end of interval # set end of interval in seq1 to 2CoM1 - previous start of interval # flip the orientation (+/-) $seq1center = $$sizes1{(keys(%$sizes1))[0]} / 2; $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2; my $flip_counter = 0; foreach $i (0..@seq1s-1) { $seq1median = ($seq1s[$i] + $seq1e[$i]) / 2; $seq2median = ($seq2s[$i] + $seq2e[$i]) / 2; if 
(($seq1median > $seq1center and $seq2median < $seq2center) or ($seq1median < $seq1center and $seq2median > $seq2center)) { my $j = $seq2s[$i]; $seq2s[$i] = (2 * $seq2center) - $seq2e[$i]; $seq2e[$i] = (2 * $seq2center) - $j; if ($orientations[$i] eq "+") { $orientations[$i] = "-"; } else { $orientations[$i] = "+"; } $cur_align = []; $$cur_align[START1] = $seq1s[$i]; $$cur_align[START2] = $seq2s[$i]; $$cur_align[END1] = $seq1e[$i]; $$cur_align[END2] = $seq2e[$i]; $$cur_align[SCORE] = $scores[$i]; $$cur_align[ORIENT] = $orientations[$i]; $$cur_align[SEQ1] = (keys(%$sizes1))[0]; $$cur_align[SEQ2] = (keys(%$sizes2))[0]; $$cur_align[START1] += $erode_align; $$cur_align[END1] -= $erode_align; $$cur_align[START2] += $erode_align; $$cur_align[END2] -= $erode_align; $flipped_aligns{alignHashID($cur_align)} = $cur_align; $flip_counter++; } print OUT1 "seq1 ".$seq1s[$i]." ".$seq1e[$i]."; seq2 ".$seq2s[$i]." ".$seq2e[$i]."; score = ".$scores[$i]." (".$orientations[$i].")\n"; } close FH; close OUT1; print "$0: Single-sequence flip mode: ".($flip_counter+0)." 
hits flipped\n" if $debug;
    }
}

# Load all hits into a hash table, then write the hits for each sequence into a file
# Run SLAGAN on each of these files, via worker instances either on the cluster or sequentially
#
# Steps:
#   1. Concatenate all (optionally flipped) .chaos inputs through a sort on seq1/start1.
#   2. Drop hits with non-positive score, normalise coordinates, erode both ends by
#      $erode_align (only when the hit is longer than 2*$erode_align on both sequences).
#   3. Feed every surviving hit to two further sorts (gen1base and gen2base order) and let
#      reprintInputHits() split them into per-sequence files bundled into JOB<n>.tar.
#   4. Run one worker per job (locally, or through clust_run) and unpack the results.
# NOTE: a near-duplicate overlap filter used to run in step 2; it was already disabled
# (POD-commented) and has been removed here. supermap.duplicates is still created in
# debug mode, as before, but nothing is written to it.
sub runSLAGAN() {
    my ($clust_run_invoke, $num_jobs, $sort_pid1, $sort_pid2, $sort_pid3, $one_seq_mode, $next_align);
    local (*RH1, *WH1, *RH2, *WH2, *RH3, *WH3, *IN, *DUPES);

    print "$0: Sorting input hits...\n" if $debug;
    open(DUPES, "> supermap.duplicates") if $debug;

    $one_seq_mode = 1 if (keys(%$sizes1) == 1 and keys(%$sizes2) == 1);

    $sort_pid1 = open2(\*RH1, \*WH1, "sort --key=1,1 --key=2,2n"); # pre-scan
    $sort_pid2 = open2(\*RH2, \*WH2, "sort --key=1,1 --key=2,2n"); # gen1base
    $sort_pid3 = open2(\*RH3, \*WH3, "sort --key=4,4 --key=5,5n"); # gen2base

    # Sort input on seq1
    foreach my $file (@input_files) {
        my $chaos_file = "$tmp_dir$file".($proflip?".flipped":"").".chaos";
        # A missing input used to be skipped silently; keep going but say so.
        unless (open(IN, "< $chaos_file")) {
            warn("$0: Could not open file $chaos_file for reading: ".$!);
            next;
        }
        print WH1 while <IN>;
        close IN;
    }
    close WH1;

    # Scan input, check if start2, end2 are ascending for sorting, erode alignments
    while (<RH1>) {
        # Skip lines that are not hits instead of reusing the captures of an earlier match.
        /\A[\s]*(.*)\s([\d]+)\s([\d]+)\;\s(.*)\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/o
            or next;
        $next_align = [];
        ($$next_align[SEQ1], $$next_align[START1], $$next_align[END1],
         $$next_align[SEQ2], $$next_align[START2], $$next_align[END2],
         $$next_align[SCORE], $$next_align[ORIENT]) = ($1, $2, $3, $4, $5, $6, $7, $8);
        next if $$next_align[SCORE] <= 0;

        if ($one_seq_mode) {
            $$next_align[SEQ1] = (keys(%$sizes1))[0];
            $$next_align[SEQ2] = (keys(%$sizes2))[0];
        }

        checkAlignCoords($next_align);

        unless ($$next_align[END1]-$$next_align[START1] <= $erode_align*2
                or $$next_align[END2]-$$next_align[START2] <= $erode_align*2) {
            $$next_align[START1] += $erode_align;
            $$next_align[END1] -= $erode_align;
            $$next_align[START2] += $erode_align;
            $$next_align[END2] -= $erode_align;
        }

        # The same record goes to both the gen1base and the gen2base sorter.
        my $hit_line = $$next_align[SEQ1]." ".$$next_align[START1]." ".$$next_align[END1]."; "
                      .$$next_align[SEQ2]." ".$$next_align[START2]." ".$$next_align[END2]."; "
                      ."score = ".$$next_align[SCORE]." (".$$next_align[ORIENT].")\n";
        print WH2 $hit_line;
        print WH3 $hit_line;
    }

    close RH1;
    waitpid $sort_pid1, 0;

    close WH2;
    $num_jobs = reprintInputHits(1, 1, \*RH2);
    close RH2;
    waitpid $sort_pid2, 0;

    close WH3;
    $num_jobs = reprintInputHits(2, $num_jobs, \*RH3);
    close RH3;
    waitpid $sort_pid3, 0;

    close DUPES if defined fileno DUPES;

    # One parameter line per worker job
    open(FH, "> ".$tmp_dir."CLUSTER_JOB_PARAMS") or die;
    foreach my $i (1..$num_jobs-1) {
        print FH "worker JOB".$i.".tar ".$score_file." ".$SLAGAN.($debug ? " -v" : "");
        print FH " << JOB$i.tar > CLUSTER_JOB_MESSAGES.$i >> CLUSTER_JOB_ERRMSG.$i" unless $no_clust_run;
        print FH "\n";
    }
    close FH;

    if ($no_clust_run) {
        open(FH, "< ".$tmp_dir."CLUSTER_JOB_PARAMS") or die;
        print "$0: Running ".($num_jobs-1)." SLAGAN jobs locally...\n" unless $quiet;
        while (<FH>) {
            chomp;
            print("Job $.: \"$0 $_\"\n") if $debug;
            system("cd $tmp_dir; $supermapexec ".$_);
        }
        close FH;
    } else {
        $clust_run_invoke = "clust_run -program=".$supermapexec." -parameters=".$tmp_dir."CLUSTER_JOB_PARAMS -init_dir=$tmp_dir -wait";
        print "$0: Running ".($num_jobs-1)." distributed SLAGAN jobs with clust_run...\n" unless $quiet;
        print "$0: \"$clust_run_invoke\"\n" if $debug;
        # $clust_run_pid is deliberately not lexical: it is visible outside this sub.
        if ($clust_run_pid = fork()) { # I am the parent
            waitpid($clust_run_pid, 0);
        } elsif (not defined $clust_run_pid) {
            die("$0: Could not fork");
        } else { # I am the child
            die("$0: Could not exec \"$clust_run_invoke\"") unless exec($clust_run_invoke);
        }
        undef $clust_run_pid;
    }

    # Unpack results and clean up per-job files
    foreach my $i (1..$num_jobs-1) {
        system("cd $tmp_dir; tar -xf ".$tmp_dir."JOB".$i.".results.tar");
        unless ($nodelete) {
            unlink $tmp_dir."JOB".$i.".tar";
            unlink $tmp_dir."JOB".$i.".results.tar";
            unlink $tmp_dir."CLUSTER_JOB_MESSAGES.$i";
            unlink $tmp_dir."CLUSTER_JOB_ERRMSG.$i";
        }
    }
    unlink "$tmp_dir$input_glob.chaos" unless $nodelete;
    unlink $tmp_dir."CLUSTER_JOB_PARAMS" unless $nodelete;
    foreach my $file (@input_files) {
        unlink $tmp_dir.$file.".chaos" unless $nodelete;
    }
}

# Print one hit to $FH in CHAOS format with the base genome's sequence first.
# For "-" hits the coordinates of the non-base sequence are printed end-first.
# Dies on any other base genome / orientation combination.
sub reprintInputHit($$$) {
    my ($base_gen, $align, $FH) = @_;
    my $plus = ($$align[ORIENT] eq "+");
    my $minus = ($$align[ORIENT] eq "-");
    my ($base_fields, $other_fields);

    if ($base_gen == 1 and $plus) {
        ($base_fields, $other_fields) = ([SEQ1, START1, END1], [SEQ2, START2, END2]);
    } elsif ($base_gen == 1 and $minus) {
        ($base_fields, $other_fields) = ([SEQ1, START1, END1], [SEQ2, END2, START2]);
    } elsif ($base_gen == 2 and $plus) {
        ($base_fields, $other_fields) = ([SEQ2, START2, END2], [SEQ1, START1, END1]);
    } elsif ($base_gen == 2 and $minus) {
        ($base_fields, $other_fields) = ([SEQ2, START2, END2], [SEQ1, END1, START1]);
    } else {
        die("$0: Bad internal state from hit ".$$align[SEQ1]." ".$$align[START1]." ".$$align[END1]."; ".$$align[SEQ2]." ".$$align[START2]." ".$$align[END2]."; "."score = ".$$align[SCORE]." (".$$align[ORIENT].")");
    }

    print $FH join(" ", @$align[@$base_fields])."; ".join(" ", @$align[@$other_fields])."; "
             ."score = ".$$align[SCORE]." (".$$align[ORIENT].")\n";
}

# Append the files named by the keys of %$seq_list (full paths) to JOB<job_id>.tar
# in $tmp_dir, then delete them unless $nodelete is set.
sub writeJobFile($$) {
    my ($job_id, $seq_list) = @_;
    local *LIST;
    my @files = sort alnum keys(%$seq_list);

    open(LIST, "| cd $tmp_dir; xargs tar --append --file=".$tmp_dir."JOB".$job_id.".tar")
        or die("$0: Could not start tar for job $job_id: ".$!);
    foreach my $file (@files) {
        $file =~ /\/([^\/]+)$/; # tar runs inside $tmp_dir, so pass the basename only
        print LIST $1." ";
    }
    close LIST;

    foreach my $file (@files) {
        unlink $file unless $nodelete;
    }
}

# Separate input into files based on sequence name and reverse order in gen2base hits
# Reads sorted hits from $RH, writes one .gen<base_gen>base.<seq>.chaos file per base
# sequence and bundles them into job tar files of more than $max_job_size lines each
# (a job is only closed at a sequence boundary). Replaces $sizes1/$sizes2 with a hash
# holding only the sequences that actually have hits. Returns the next unused job id.
sub reprintInputHits($$$) {
    my ($base_gen, $job_id, $RH) = @_;
    my ($line_count, $prev_seq, $cur_seq, $cur_align);
    my (%cur_seq_list, %pruned_sizes);
    local (*OUT, *LIST);

    print "$0: Reprinting hits (base genome $base_gen)..." if $debug;

    $line_count = 0;
    while (<$RH>) {
        /\A[\s]*(.*)\s([\d]+)\s([\d]+)\;\s(.*)\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/o;
        $cur_align = [];
        ($$cur_align[SEQ1], $$cur_align[START1], $$cur_align[END1],
         $$cur_align[SEQ2], $$cur_align[START2], $$cur_align[END2],
         $$cur_align[SCORE], $$cur_align[ORIENT]) = ($1, $2, $3, $4, $5, $6, $7, $8);

        $cur_seq = ($base_gen == 1 ? $$cur_align[SEQ1] : $$cur_align[SEQ2]);

        if ($cur_seq ne $prev_seq) {
            my $seq_file = $tmp_dir.$input_files[0].".gen".$base_gen."base.".$cur_seq.".chaos";
            $pruned_sizes{$cur_seq} = ($base_gen == 1 ? $$sizes1{$cur_seq} : $$sizes2{$cur_seq});
            print " ".$cur_seq if $debug;
            close OUT if defined fileno OUT;
            open(OUT, "> ".$seq_file) or die("$0: Could not open file ".$seq_file." for writing: ".$!);
            if ($line_count > $max_job_size) {
                writeJobFile($job_id, \%cur_seq_list);
                undef %cur_seq_list;
                $line_count = 0;
                $job_id++;
            }
            $cur_seq_list{$seq_file} = 1;
        }

        reprintInputHit($base_gen, $cur_align, \*OUT) if @$cur_align;
        $prev_seq = $cur_seq;
        $line_count++;
    }

    writeJobFile($job_id, \%cur_seq_list);
    $job_id++;

    close OUT;
    print "\n" if $debug;

    $sizes1 = \%pruned_sizes if $base_gen == 1;
    $sizes2 = \%pruned_sizes if $base_gen == 2;

    return $job_id;
}

# True if the bracketed total score on a SLAGAN output line is below $min_seq_score.
# Dies if the line does not look like SLAGAN output.
sub seqBelowMinScore($) {
    my ($line) = @_;
    $line =~ /\A[\s]*\([\d]+\s[\d]+\)\=\([\d]+\s[\d]+\)\s([\d\.\-]+)\s[\+\-]+\s\[([\d\.\-]+)\][\s]*s1\:.*[\s]*s2\:.*\n\Z/;
    die("$0: Unable to extract score values from SLAGAN output:\n$line") if not defined $2;
    return ($2 < $min_seq_score);
}

# Load the per-sequence SLAGAN (*.glocal-out) results, sort them, and drive the chainers:
#   gen2base output is re-split per gen1 sequence for loadBase2Hashes(),
#   gen1base output is fed to chainBase1Hits(), gen2base output to chainBase2Hits().
sub processResults() {
    my ($cur_seq, $input_prefix, $dropped_seqs, $sort_pid, $sort_pid2);
    local (*RH, *WH, *IN, *OUT, *hashesDM_RH, *hashesDM_WH);

    print "$0: Loading SLAGAN output...\n" unless $quiet;
    open(GLOCAL_OUT_LOG, "> ".$glocal_out_logfile) if $glocal_out_logfile;

    # Sort gen2base aligns on seq1, then seq2, then start2, then print them to separate files, one file per gen1 seq
    # These files will be loaded on demand when scanning gen1base aligns (chainBase1Hits())
    $sort_pid = open2(\*RH, \*WH, "sort --key=9,9 --key=7,7 --key=1.2,1n"); # input is base 2, key is 9 because a space is expected between s2: and seq2name
    $input_prefix = $tmp_dir.$input_files[0].".gen2base";
    foreach my $seq (sort alnum keys(%$sizes2)) {
        open(IN, "< $input_prefix.$seq.chaos.glocal-out") or (delete($$sizes2{$seq}), next);
        my $line = <IN>;
        die("$0: Empty SLAGAN output file $input_prefix.$seq.chaos.glocal-out, check corresponding job logs. Stopped") unless $line;
        if (seqBelowMinScore($line)) {
            print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low ($1<$min_seq_score)\n" if $debug;
            next;
        }
        seek IN, 0, 0; # back to start
        print WH while <IN>;
        close IN;
    }
    close WH or die("$0: Error executing sort");

    while (<RH>) {
        /\ss2\:[\s]*([^\s]+)[\s]*\n\Z/;
        if ($1 ne $cur_seq or not defined $cur_seq) {
            next unless $1;
            close OUT if defined fileno OUT;
            $cur_seq = $1;
            open(OUT, "> $input_prefix.sorted-gen1.$cur_seq.chaos.glocal-out") or die("$0: Could not open file $input_prefix.sorted-gen1.$cur_seq.chaos.glocal-out for writing: ".$!);
        }
        print OUT $_;
    }
    close RH;
    close OUT if defined fileno OUT;
    waitpid $sort_pid, 0;

    # Sort gen1base aligns on seq1, then start1
    $sort_pid = open2(\*RH, \*WH, "sort --key=7,7 --key=1.2,1n"); # input is base 1
    $input_prefix = $tmp_dir.$input_files[0].".gen1base";
    foreach my $seq (sort alnum keys(%$sizes1)) {
        open(IN, "< $input_prefix.$seq.chaos.glocal-out") or (delete($$sizes1{$seq}), next);
        my $line = <IN>;
        if (seqBelowMinScore($line)) {
            $dropped_seqs++;
            print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low ($1<$min_seq_score)\n" if $debug;
            next;
        }
        seek IN, 0, 0; # back to start
        print WH while <IN>;
        if ($glocal_out_logfile) {
            seek IN, 0, 0;
            print GLOCAL_OUT_LOG while <IN>;
        }
        close IN;
        unlink "$input_prefix.$seq.chaos.glocal-out" unless $nodelete;
    }
    unlink $input_prefix.".chaos" unless $nodelete;
    close WH or die("$0: Error executing sort");

    # Feed the gen1base aligns to the 2M/1M1 chain scanner (chainBase1Hits())
    # The hashesDM handle is used to write 2M aligns' hashes to be sorted in seq2 order
    print "$0: Generating supermonotonic map...\n" unless $quiet;
    $sort_pid2 = open2(\*hashesDM_RH, \*hashesDM_WH, "sort --key=2,2");
    chainBase1Hits(*RH, *hashesDM_WH);
    close RH;
    waitpid $sort_pid, 0;
    close hashesDM_WH or die("$0: Error executing sort");

    # Print sorted 2M aligns' hashes, one file per gen2 seq
    undef $cur_seq;
    while (<hashesDM_RH>) {
        my $line = $_;
        $line =~ /\A[^\s]+\s([^\s]+)\s[^\s]+\n\Z/;
        if ($1 ne $cur_seq or not defined $cur_seq) {
            close OUT if defined fileno OUT;
            $cur_seq = $1;
            open(OUT, "> $tmp_dir".$input_files[0].".hashesDM.gen2.$cur_seq") or die("$0: Could not open file $tmp_dir".$input_files[0].".hashesDM.gen2.$cur_seq for writing: ".$!);
        }
        print OUT $line;
    }
    close hashesDM_RH;
    # The last hash file must be flushed to disk before chainBase2Hits() reads it back
    # through load2MHashes(); previously it was left open with buffered data.
    close OUT if defined fileno OUT;
    waitpid $sort_pid2, 0;

    # Sort gen2base aligns on seq2, then start2
    $sort_pid = open2(\*RH, \*WH, "sort --key=7,7 --key=1.2,1n"); # input is base 2
    $input_prefix = $tmp_dir.$input_files[0].".gen2base";
    foreach my $seq (sort alnum keys(%$sizes2)) {
        open(IN, "< $input_prefix.$seq.chaos.glocal-out") or next;
        my $line = <IN>;
        if (seqBelowMinScore($line)) {
            $dropped_seqs++;
            print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low ($1<$min_seq_score)\n" if $debug;
            next;
        }
        seek IN, 0, 0; # back to start
        print WH while <IN>;
        close IN;
        unlink "$input_prefix.$seq.chaos.glocal-out" unless $nodelete;
    }
    unlink $input_prefix.".chaos" unless $nodelete;
    close WH or die("$0: Error executing sort");

    # Feed the gen2base aligns to the 1M2 chain scanner (chainBase2Hits())
    chainBase2Hits(*RH);
    close RH;
    waitpid $sort_pid, 0;

    close GLOCAL_OUT_LOG if defined fileno GLOCAL_OUT_LOG;

    removeSLAGANOutput();
    print STDERR "$0: Warning: Alignments for $dropped_seqs sequences discarded due to total score below cutoff ($min_seq_score)\n" if $dropped_seqs and not $quiet;
}

# Delete leftover per-sequence SLAGAN output and input files for both base genomes
# (unless $nodelete), then try to remove the temporary directory.
sub removeSLAGANOutput() {
    foreach my $base (["gen1base", $sizes1], ["gen2base", $sizes2]) {
        my ($suffix, $sizes) = @$base;
        my $input_prefix = $tmp_dir.$input_files[0].".".$suffix;
        foreach my $seq (sort alnum keys(%$sizes)) {
            unlink "$input_prefix.$seq.chaos.glocal-out" unless $nodelete;
        }
        unlink $input_prefix.".chaos" unless $nodelete;
    }
    rmdir $tmp_dir; # only succeeds when the directory is empty
}

sub
alignHashID($) {
    # Returns the string key that identifies an alignment:
    # "<seq1>:<start1>-<end1>=<seq2>:<start2>-<end2>".
    my ($align) = @_;
    # Previously a numeric hash:
    # return 23*$$align[START1] + 41*$$align[START2] + 61*$$align[END1] + 83*$$align[END2];
    return $$align[SEQ1].":".$$align[START1]."-".$$align[END1]."=".$$align[SEQ2].":".$$align[START2]."-".$$align[END2];
}

# The chain writer lags the chainer by two chains because the full contents of neighboring chains must be known.
# Writes one line for $cur_chain to $FH: origin, number of aligns, then the eight fields
# (start1 end1 start2 end2 seq1 seq2 orient score) of the first and of the last align,
# followed by the coordinates of every align when $print_chains is set.
# Does nothing when $cur_chain is undefined. $prev_chain and $next_chain are not used here.
sub printChainToTemp($$$$) {
    my ($FH, $prev_chain, $cur_chain, $next_chain) = @_;
    return unless defined $cur_chain;

    my ($first_align, $last_align) = ($$cur_chain[0], $$cur_chain[@$cur_chain-1]);
    my @summary_fields = (START1, END1, START2, END2, SEQ1, SEQ2, ORIENT, SCORE);

    print $FH join(" ", $$first_align[ORIGIN], scalar(@$cur_chain),
                        @$first_align[@summary_fields], @$last_align[@summary_fields]);
    if ($print_chains) {
        foreach my $align (@$cur_chain) {
            print $FH " ".join(" ", @$align[START1, END1, START2, END2]);
        }
    }
    print $FH "\n";
}

# Scan gen1base SLAGAN alignments (sorted by seq1, start1) read from $FH and group them
# into chains, written to one <input>.2MM1.<seq1> file per seq1:
#   - 2M chains (ORIGIN 2): aligns that also occur in the gen2base output
#     (looked up through loadBase2Hashes()); their hash ids are also printed to $hashesDM.
#   - 1M1 chains (ORIGIN 1): aligns found only in the gen1base output.
# Null alignments (start == end) are skipped; malformed or zero-valued ones are counted
# and reported in a single warning at the end.
sub chainBase1Hits($$) {
    my ($FH, $hashesDM) = @_;
    local *OUT;
    my ($cur_align, $prev_align, $cur_chain, $prev_chain, $pre_prev_chain,
        $chain_start_2M, $chain_start_1M1, $cur_seq, $align_peers, $flip_counter);
    my @bad_aligns;
    my %base2peers;

    while (<$FH>) {
        # Check the match explicitly: after a failed match $1..$9 would still hold the
        # values of the previous line and the alignment would be processed twice.
        my @fields = /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s([\d\.\-]+)\s([\+\-]+)\s\[([\d\.\-]+)\][\s]*s1\:(.*)[\s]*s2\:(.*)\n\Z/;
        unless (@fields) {
            push(@bad_aligns, $_);
            next;
        }
        next if ($fields[0] == $fields[1]); # skip null alignments
        (push(@bad_aligns, $_), next)
            unless $fields[0] and $fields[1] and $fields[2] and $fields[3] and $fields[4] and $fields[5];

        $cur_align = [];
        ($$cur_align[START1], $$cur_align[END1], $$cur_align[START2], $$cur_align[END2],
         $$cur_align[SCORE], $$cur_align[ORIENT], $$cur_align[TOTSC],
         $$cur_align[SEQ1], $$cur_align[SEQ2]) = @fields;
        $$cur_align[SEQ1] =~ s/^\s+//;
        $$cur_align[SEQ1] =~ s/\s+$//;
        $$cur_align[SEQ2] =~ s/^\s+//;
        $$cur_align[SEQ2] =~ s/\s+$//;

        checkAlignCoords($cur_align);

        # Undo the single-sequence flip applied to the input hits
        if ($proflip and defined $flipped_aligns{alignHashID($cur_align)}) {
            my $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2;
            my $j = $$cur_align[START2];
            $$cur_align[START2] = (2 * $seq2center) - $$cur_align[END2];
            $$cur_align[END2] = (2 * $seq2center) - $j;
            if ($$cur_align[ORIENT] eq "+") {
                $$cur_align[ORIENT] = "-";
            } else {
                $$cur_align[ORIENT] = "+";
            }
            $$cur_align[FLIPPED] = 1;
            $flip_counter++;
        }

        $$cur_align[HASHID] = alignHashID($cur_align);

        if ($$cur_align[SEQ1] ne $cur_seq) {
            # New seq1: flush the two pending chains to the old file, reset all state
            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);
            printChainToTemp(*OUT, $prev_chain, $cur_chain, undef);
            undef $chain_start_2M;
            undef $chain_start_1M1;
            undef $prev_align;
            undef $pre_prev_chain;
            undef $prev_chain;
            undef $cur_chain;
            $cur_seq = $$cur_align[SEQ1];
            %base2peers = %{loadBase2Hashes($tmp_dir.$input_files[0].".gen2base.sorted-gen1.$cur_seq.chaos.glocal-out")};
            close OUT if defined fileno OUT;
            open(OUT, "> ".$tmp_dir.$input_files[0].".2MM1.$cur_seq")
                or die("$0: Could not open file ".$tmp_dir.$input_files[0].".2MM1.$cur_seq for writing: ".$!);
        }

        # [prev hash id, next hash id] in gen2base order, or undef if not a 2M align
        $align_peers = $base2peers{$$cur_align[HASHID]};
        $$cur_align[ORIGIN] = defined($align_peers) ? 2 : 1;

        # NOTE(review): in the first test below `and` binds tighter than `or`, so the last
        # "-" alternative applies whether or not the align is FLIPPED. The 1M1 test groups
        # both flipped alternatives under FLIPPED. Behaviour is kept as is; confirm intent.
        if ($chain_start_2M and defined $align_peers and defined $prev_align # continue open 2M chain
            and (($$cur_align[ORIENT] eq "+" and $$cur_align[START2] > $$prev_align[END2] and $$prev_align[HASHID] eq $$align_peers[0])
                 or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] < $$prev_align[START2] and $$prev_align[HASHID] eq $$align_peers[1])
                 or ($$cur_align[FLIPPED]
                     and ($$cur_align[ORIENT] eq "+" and $$cur_align[START2] < $$prev_align[END2] and $$prev_align[HASHID] eq $$align_peers[0])
                     or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] > $$prev_align[START2] and $$prev_align[HASHID] eq $$align_peers[1])))
            and $$cur_align[ORIENT] eq $$prev_align[ORIENT]
            and $$cur_align[FLIPPED] eq $$prev_align[FLIPPED]
            and $$cur_align[SEQ2] eq $$prev_align[SEQ2]
            and ($$cur_align[START1] > $$prev_align[END1] or ($$cur_align[FLIPPED] and $$cur_align[START1] > $$prev_align[END1]))
            and abs($$cur_align[END1] - $$chain_start_2M[START1]) < $max_chainlen
            and abs($$cur_align[END2] - $$chain_start_2M[START2]) < $max_chainlen
            #and abs($$cur_align[END1] - $$chain_start_2M[START1])/abs($$cur_align[END2] - $$chain_start_2M[START2]) < $max_asym
            #and abs($$cur_align[END2] - $$chain_start_2M[START2])/abs($$cur_align[END1] - $$chain_start_2M[START1]) < $max_asym
            ) {
            push(@$cur_chain, $cur_align);
            print $hashesDM $$cur_align[SEQ1]."\t".$$cur_align[SEQ2]."\t".$$cur_align[HASHID]."\n";
        } elsif (defined $align_peers) { # start new 2M chain
            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);
            $chain_start_2M = $cur_align;
            undef $chain_start_1M1;
            $pre_prev_chain = $prev_chain;
            $prev_chain = $cur_chain;
            $cur_chain = [$cur_align];
            print $hashesDM $$cur_align[SEQ1]."\t".$$cur_align[SEQ2]."\t".$$cur_align[HASHID]."\n";
        } elsif ($chain_start_1M1 and defined $prev_align # continue open 1M1 chain
            and ((($$cur_align[ORIENT] eq "+" and $$cur_align[START2] > $$prev_align[END2])
                  or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] < $$prev_align[START2]))
                 or ($$cur_align[FLIPPED]
                     and (($$cur_align[ORIENT] eq "+" and $$cur_align[START2] < $$prev_align[END2])
                          or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] > $$prev_align[START2]))))
            and $$cur_align[ORIENT] eq $$prev_align[ORIENT]
            and $$cur_align[FLIPPED] eq $$prev_align[FLIPPED]
            and $$cur_align[SEQ2] eq $$prev_align[SEQ2]
            and ($$cur_align[START1] > $$prev_align[END1] or ($$cur_align[FLIPPED] and $$cur_align[START1] > $$prev_align[END1]))
            and abs($$cur_align[END1] - $$chain_start_1M1[START1]) < $max_chainlen
            and abs($$cur_align[END2] - $$chain_start_1M1[START2]) < $max_chainlen
            #and abs($$cur_align[END1] - $$chain_start_1M1[START1])/abs($$cur_align[END2] - $$chain_start_1M1[START2]) < $max_asym
            #and abs($$cur_align[END2] - $$chain_start_1M1[START2])/abs($$cur_align[END1] - $$chain_start_1M1[START1]) < $max_asym
            ) {
            push(@$cur_chain, $cur_align);
        } else { # start new 1M1 chain
            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);
            $chain_start_1M1 = $cur_align;
            undef $chain_start_2M;
            $pre_prev_chain = $prev_chain;
            $prev_chain = $cur_chain;
            $cur_chain = [$cur_align];
        }
        $prev_align = $cur_align;
    }

    # Flush the two chains still pending after the last input line
    printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);
    printChainToTemp(*OUT, $prev_chain, $cur_chain, undef);

    print "$0: Single-sequence flip mode: ".($flip_counter+0)." gen1base hits backflipped\n" if $debug and $proflip;
    warn "$0: Warning: ".@bad_aligns." bad SLAGAN alignments discarded" if @bad_aligns > 0;
}

# Input is base 2, i.e. (start2 end2)=(start1 end1)...
sub chainBase2Hits($) {
    # Scan gen2base SLAGAN alignments (sorted by seq2, start2) read from $FH and write the
    # 1M2 chains (ORIGIN 3: aligns not recorded as 2M by chainBase1Hits()) to one
    # <input>.M2.<seq2> file per seq2. 2M aligns only break chains here; they are not
    # written. Null alignments are skipped; malformed or zero-valued ones are counted and
    # reported in a single warning at the end.
    my ($FH) = @_;
    local *OUT;
    my ($cur_align, $prev_align, $cur_chain, $prev_chain, $pre_prev_chain,
        $chain_start_2M, $chain_start_1M2, $cur_seq, $flip_counter);
    my @bad_aligns;
    my %aligns2M;

    while (<$FH>) {
        # Check the match explicitly: after a failed match $1..$9 would still hold the
        # values of the previous line and the alignment would be processed twice.
        my @fields = /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s([\d\.\-]+)\s([\+\-]+)\s\[([\d\.\-]+)\][\s]*s1\:(.*)[\s]*s2\:(.*)\n\Z/;
        unless (@fields) {
            push(@bad_aligns, $_);
            next;
        }
        next if ($fields[0] == $fields[1]); # skip null alignments
        (push(@bad_aligns, $_), next)
            unless $fields[0] and $fields[1] and $fields[2] and $fields[3] and $fields[4] and $fields[5];

        # Base 2 input: the first coordinate pair and the s1: name belong to seq2
        $cur_align = [];
        ($$cur_align[START2], $$cur_align[END2], $$cur_align[START1], $$cur_align[END1],
         $$cur_align[SCORE], $$cur_align[ORIENT], $$cur_align[TOTSC],
         $$cur_align[SEQ2], $$cur_align[SEQ1]) = @fields;
        $$cur_align[SEQ1] =~ s/^\s+//;
        $$cur_align[SEQ1] =~ s/\s+$//;
        $$cur_align[SEQ2] =~ s/^\s+//;
        $$cur_align[SEQ2] =~ s/\s+$//;

        checkAlignCoords($cur_align);

        # Undo the single-sequence flip applied to the input hits
        if ($proflip and defined $flipped_aligns{alignHashID($cur_align)}) {
            my $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2;
            my $j = $$cur_align[START2];
            $$cur_align[START2] = (2 * $seq2center) - $$cur_align[END2];
            $$cur_align[END2] = (2 * $seq2center) - $j;
            if ($$cur_align[ORIENT] eq "+") {
                $$cur_align[ORIENT] = "-";
            } else {
                $$cur_align[ORIENT] = "+";
            }
            $$cur_align[FLIPPED] = 1;
            $flip_counter++;
        }

        $$cur_align[HASHID] = alignHashID($cur_align);

        if ($$cur_align[SEQ2] ne $cur_seq) {
            # New seq2: flush pending 1M2 chains to the old file, reset state.
            # The chain refs are tested first so that undef chains are not autovivified.
            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain)
                if $prev_chain and $$prev_chain[0][ORIGIN] == 3;
            printChainToTemp(*OUT, $prev_chain, $cur_chain, undef)
                if $cur_chain and $$cur_chain[0][ORIGIN] == 3;
            undef $chain_start_1M2;
            undef $prev_align;
            undef $pre_prev_chain;
            undef $prev_chain;
            undef $cur_chain;
            $cur_seq = $$cur_align[SEQ2];
            %aligns2M = %{load2MHashes($tmp_dir.$input_files[0].".hashesDM.gen2.$cur_seq")};
            close OUT if defined fileno OUT;
            open(OUT, "> ".$tmp_dir.$input_files[0].".M2.$cur_seq")
                or die("$0: Could not open file ".$tmp_dir.$input_files[0].".M2.$cur_seq for writing: ".$!);
        }

        $$cur_align[ORIGIN] = defined($aligns2M{$$cur_align[HASHID]}) ? 2 : 3;

        if (defined $aligns2M{$$cur_align[HASHID]}) { # align is 2M
            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain)
                if $prev_chain and $$prev_chain[0][ORIGIN] == 3;
            undef $chain_start_1M2; # close 1M2 chain
            $chain_start_2M = $cur_align;
            $pre_prev_chain = $prev_chain;
            $prev_chain = $cur_chain;
            $cur_chain = [$cur_align];
        } elsif ($chain_start_1M2 # continue open 1M2 chain
            and ((($$cur_align[ORIENT] eq "+" and $$cur_align[START1] > $$prev_align[END1])
                  or ($$cur_align[ORIENT] eq "-" and $$cur_align[END1] < $$prev_align[START1]))
                 or ($$cur_align[FLIPPED]
                     and (($$cur_align[ORIENT] eq "+" and $$cur_align[START1] < $$prev_align[END1])
                          or ($$cur_align[ORIENT] eq "-" and $$cur_align[END1] > $$prev_align[START1]))))
            and $$cur_align[ORIENT] eq $$prev_align[ORIENT]
            and $$cur_align[SEQ1] eq $$prev_align[SEQ1]
            and $$cur_align[FLIPPED] == $$prev_align[FLIPPED]
            and ($$cur_align[START2] > $$prev_align[END2] or ($$cur_align[FLIPPED] and $$cur_align[START2] < $$prev_align[END2]))
            and abs($$cur_align[END1] - $$chain_start_1M2[START1]) < $max_chainlen
            and abs($$cur_align[END2] - $$chain_start_1M2[START2]) < $max_chainlen
            #and abs($$cur_align[END1] - $$chain_start_1M2[START1])/abs($$cur_align[END2] - $$chain_start_1M2[START2]) < $max_asym
            #and abs($$cur_align[END2] - $$chain_start_1M2[START2])/abs($$cur_align[END1] - $$chain_start_1M2[START1]) < $max_asym
            ) {
            push(@$cur_chain, $cur_align);
        } else { # start new 1M2 chain
            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain)
                if $prev_chain and $$prev_chain[0][ORIGIN] == 3;
            $chain_start_1M2 = $cur_align;
            $pre_prev_chain = $prev_chain;
            $prev_chain = $cur_chain;
            $cur_chain = [$cur_align];
        }
        $prev_align = $cur_align;
    }

    # Flush the chains still pending after the last input line
    printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain)
        if $prev_chain and $$prev_chain[0][ORIGIN] == 3;
    printChainToTemp(*OUT, $prev_chain, $cur_chain, undef)
        if $cur_chain and $$cur_chain[0][ORIGIN] == 3;

    print "$0: Single-sequence flip mode: ".($flip_counter+0)." gen2base hits backflipped\n" if $debug and $proflip;
    warn "$0: Warning: ".@bad_aligns." bad SLAGAN alignments discarded" if @bad_aligns > 0;
}

# Input: file with lines of the form "seq1 seq2 hash" (seq2 should be the same per file)
# Output: hash(key->align hash ID, value->1). Input file is deleted.
# A missing file yields an empty hash.
sub load2MHashes($) {
    my ($file) = @_;
    my %hashes;
    local *FH;

    open(FH, "< $file") or return {};
    while (<FH>) {
        # Ignore lines that are not "seq1<TAB>seq2<TAB>hash" rather than reusing an old $1
        /\A[^\s]+\t[^\s]+\t([^\s]+)\n\Z/ or next;
        warn("Hash collision in \"$_\" vs. \"".$hashes{$1}."\"") if defined $hashes{$1};
        $hashes{$1} = 1;
    }
    close FH;
    unlink $file unless $nodelete;
    return \%hashes;
}

# Input: file with gen2base alignments which should have the same seq1 ordered by start2 or not exist
# Output: hash(key->align hash ID, value->[prev align hash ID, next align hash ID]). Input file is deleted.
# Input is base 2, i.e. (start2 end2)=(start1 end1)...
# The first align gets 1 as its "prev" id and the last align gets undef as its "next" id.
sub loadBase2Hashes($) {
    my ($file) = @_;
    my ($prev_align, $cur_align, $next_align);
    my %hashes;
    local *FH;

    open(FH, "< $file") or return {};
    while (<FH>) { # Scan 1 line ahead because the next align must also be seen
        /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s.*s1\:(.*)[\s]*s2\:(.*)/;
        $next_align = [];
        # Hits are gen2base
        ($$next_align[START2], $$next_align[END2], $$next_align[START1], $$next_align[END1],
         $$next_align[SEQ2], $$next_align[SEQ1]) = ($1, $2, $3, $4, $5, $6);
        checkAlignCoords($next_align);
        $$next_align[SEQ1] =~ s/^\s+//;
        $$next_align[SEQ1] =~ s/\s+$//;
        $$next_align[SEQ2] =~ s/^\s+//;
        $$next_align[SEQ2] =~ s/\s+$//;
        $$next_align[HASHID] = alignHashID($next_align);

        warn("LB2H: Hash collision in \"$_\"") if defined $cur_align and defined $hashes{$$cur_align[HASHID]};
        $hashes{$$cur_align[HASHID]} = [$prev_align ? $$prev_align[HASHID] : 1, $next_align ? $$next_align[HASHID] : 1] if $cur_align;
        $prev_align = $cur_align;
        $cur_align = $next_align;
    }
    $hashes{$$cur_align[HASHID]} = [$prev_align ? $$prev_align[HASHID] : 1, undef] if $cur_align;
    close FH;
    unlink $file unless $nodelete;
    return \%hashes;
}

# Load chained regions and expand them according to the expansion rules, then print them out and display some chain statistics
# Pipeline: all .2MM1.* and .M2.* chain files -> sort(seq1,start1) -> expandSeq1()
#           -> sort(seq2,start2) -> expandSeq2() -> sort(seq1,start1) -> finalExpand() -> $outfile
sub postProcessRegions() {
    local (*IN, *OUT, *RH1, *WH1, *RH2, *WH2, *RH3, *WH3);
    my ($sort_pid1, $sort_pid2, $sort_pid3);

    $sort_pid1 = open2(\*RH1, \*WH1, "sort --key=7,7 --key=3,3n"); # sort on seq1, start1
    $sort_pid2 = open2(\*RH2, \*WH2, "sort --key=8,8 --key=5,5n"); # sort on seq2, start2
    $sort_pid3 = open2(\*RH3, \*WH3, "sort --key=7,7 --key=3,3n"); # sort on seq1, start1
    open(OUT, "> ".$outfile) or die("$0: Could not open output file $outfile for writing: ".$!);

    foreach my $seq (sort alnum keys %$sizes1) {
        open(IN, "< ".$tmp_dir.$input_files[0].".2MM1.$seq") or next;
        print WH1 while <IN>;
        close IN;
        unlink $tmp_dir.$input_files[0].".2MM1.$seq" unless $nodelete;
    }
    foreach my $seq (sort alnum keys %$sizes2) {
        open(IN, "< ".$tmp_dir.$input_files[0].".M2.$seq") or next;
        print WH1 while <IN>;
        close IN;
        unlink $tmp_dir.$input_files[0].".M2.$seq" unless $nodelete;
    }

    close WH1;
    expandSeq1(\*RH1, \*WH2);
    close RH1;
    waitpid $sort_pid1, 0;

    close WH2;
    expandSeq2(\*RH2, \*WH3);
    close RH2;
    waitpid $sort_pid2, 0;

    close WH3;
    finalExpand(\*RH3, \*OUT);
    close RH3;
    waitpid $sort_pid3, 0;

    close OUT;
}

# Input: chains ordered by seq1, start1
# Output: chains expanded on seq1
# M2 chains (type 3) are passed through with four zero placeholder fields inserted after
# field 18. Every other chain is handed to expSeq1Reg() together with its previous and
# next non-M2 neighbour, which is why output lags the input by one chain.
sub expandSeq1($$) {
    my ($RH, $WH) = @_;
    my ($first_align, $last_align, $type, $num_aligns, $cur_seq, $prev_chain, $cur_chain, $next_chain);
    my (@line);

    while (<$RH>) {
        chomp;
        @line = split;

        # skip M2 regions
        if ($line[0] == 3) {
            # join() instead of assigning the global $, output separator
            print $WH join(" ", @line[0..17]), " 0 0 0 0 ", join(" ", @line[18..$#line]), "\n";
            next;
        }

        $prev_chain = $cur_chain;
        $cur_chain = $next_chain;

        $first_align = [];
        $last_align = [];
        ($type, $num_aligns,
         $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2],
         $$first_align[SEQ1], $$first_align[SEQ2], $$first_align[ORIENT], $$first_align[SCORE],
         $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2],
         $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE]) = @line;

        # Extent of the whole chain on seq1
        $$first_align[CHALO1] = ($$first_align[START1] < $$last_align[START1] ? $$first_align[START1] : $$last_align[START1]);
        $$first_align[CHAHI1] = ($$first_align[END1] > $$last_align[END1] ? $$first_align[END1] : $$last_align[END1]);

        my @saved_line = @line;
        $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line];

        next unless defined $cur_chain;

        expSeq1Reg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq);
        # TODO
        # if ($cur_seq ne $$first_align[SEQ1]) {
        #     undef $cur_chain;
        #     $cur_seq = $$first_align[SEQ1];
        # }
    }
    # Flush the last chain; with no non-M2 chains at all there is nothing to print
    # (an unguarded call would emit a record made of empty fields).
    expSeq1Reg($WH, $cur_chain, $next_chain, undef, $cur_seq) if defined $next_chain;
}

sub expSeq1Reg($$$$$) {
    my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_;
    my ($preexpand1, $postexpand1);

    $preexpand1 = $$cur_chain[0][CHALO1] - (defined $prev_chain ? $$prev_chain[0][CHAHI1] : 0);
    $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len;
    #$preexpand1 = 0 if $preexpand1 < 0;
    $preexpand1 = $max_expand_len if $preexpand1 < 0; # !!!
$postexpand1 = $$next_chain[0][CHALO1] - $$cur_chain[0][CHAHI1]; $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len; #$postexpand1 = 0 if $postexpand1 < 0; $postexpand1 = $max_expand_len if $postexpand1 < 0; #$postexpand1 = 0 if defined $prev_chain and $$prev_chain[0][CHAHI1] > $$cur_chain[0][CHAHI1]; # don't expand if covered by another align $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1; $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1; $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1; $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]}; $cur_seq = $$cur_chain[0][SEQ1] if not defined $cur_seq; if ($cur_seq ne $$cur_chain[0][SEQ1]) { # Correct upper expansion $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $max_expand_len; $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]}; } print $WH $$cur_chain[2]." ".$$cur_chain[3]." ". $$cur_chain[0][START1]." ".$$cur_chain[0][END1]." ".$$cur_chain[0][START2]." ".$$cur_chain[0][END2]." ". $$cur_chain[0][SEQ1]." ".$$cur_chain[0][SEQ2]." ".$$cur_chain[0][ORIENT]." ".$$cur_chain[0][SCORE]." ". $$cur_chain[1][START1]." ".$$cur_chain[1][END1]." ".$$cur_chain[1][START2]." ".$$cur_chain[1][END2]." ". $$cur_chain[1][SEQ1]." ".$$cur_chain[1][SEQ2]." ".$$cur_chain[1][ORIENT]." ".$$cur_chain[1][SCORE]." ". $$cur_chain[0][CHALO1]." ".$$cur_chain[0][CHAHI1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E]; if ($print_chains) { my $i = 18; while (1) { print $WH " ".${$$cur_chain[4]}[$i]." ".${$$cur_chain[4]}[$i+1]." ".${$$cur_chain[4]}[$i+2]." 
".${$$cur_chain[4]}[$i+3]; last if @{$$cur_chain[4]} <= $i+4; $i+=4; } } print $WH "\n"; } # Input: chains ordered by seq2, start2 # Output: chains expanded on seq1 and seq2 (final output) sub expandSeq2($$) { my ($RH, $WH) = @_; my ($first_align, $last_align, $type, $num_aligns, $cur_seq, $preexpand1, $postexpand1, $preexpand2, $postexpand2, $prev_chain, $cur_chain, $next_chain); my (@line); while (<$RH>) { chomp; @line = split; # skip M1 regions if ($line[0] == 1) { $,= " "; print $WH @line[0..21]; print $WH " 0 0 0 0 "; print $WH @line[22..$#line]; print $WH "\n"; undef $,; next; } $prev_chain = $cur_chain; $cur_chain = $next_chain; $first_align = []; $last_align = []; ($type, $num_aligns, $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2], $$first_align[SEQ1], $$first_align[SEQ2],$$first_align[ORIENT], $$first_align[SCORE], $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2], $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE], $$first_align[CHALO1], $$first_align[CHAHI1], $$first_align[CHALO1E], $$first_align[CHAHI1E]) = @line; $$first_align[CHALO2] = ($$first_align[START2] < $$last_align[START2] ? $$first_align[START2] : $$last_align[START2]); $$first_align[CHAHI2] = ($$first_align[END2] > $$last_align[END2] ? 
$$first_align[END2] : $$last_align[END2]); my @saved_line = @line; $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line]; next unless defined $cur_chain; expSeq2Reg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq); # if ($cur_seq ne $$first_align[SEQ2]) { # undef $cur_chain; # $cur_seq = $$first_align[SEQ2]; # } } expSeq2Reg($WH, $cur_chain, $next_chain, undef, $cur_seq); } sub expSeq2Reg($$$$$) { my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_; my ($preexpand1, $postexpand1, $preexpand2, $postexpand2); $preexpand1 = $$cur_chain[0][CHALO1] - $$cur_chain[0][CHALO1E]; $postexpand1 = $$cur_chain[0][CHAHI1E] - $$cur_chain[0][CHAHI1]; $preexpand2 = $$cur_chain[0][CHALO2] - (defined $prev_chain ? $$prev_chain[0][CHAHI2] : 0); $preexpand2 = $preexpand1 * $expand_factor if $preexpand2 > $preexpand1 * $expand_factor and $$cur_chain[2] != 3; $preexpand2 = $max_expand_len if $preexpand2 > $max_expand_len; #$preexpand2 = 0 if $preexpand2 < 0; $preexpand2 = $max_expand_len if $preexpand2 < 0; $preexpand1 = $preexpand2 * $expand_factor if $preexpand1 > $preexpand2 * $expand_factor and $$cur_chain[2] != 3; $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len; $postexpand2 = $$next_chain[0][CHALO2] - $$cur_chain[0][CHAHI2]; $postexpand2 = $postexpand1 * $expand_factor if $postexpand2 > $postexpand1 * $expand_factor and $$cur_chain[2] != 3; $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len; #$postexpand2 = 0 if $postexpand2 < 0; $postexpand2 = $max_expand_len if $postexpand2 < 0; $postexpand1 = $postexpand2 * $expand_factor if $postexpand1 > $postexpand2 * $expand_factor and $$cur_chain[2] != 3; $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len; $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1; $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1; $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1; $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if 
$$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]}; $$cur_chain[0][CHALO2E] = $$cur_chain[0][CHALO2] - $preexpand2; $$cur_chain[0][CHALO2E] = 1 if $$cur_chain[0][CHALO2E] < 1; $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2; $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]}; if ($cur_seq ne $$cur_chain[0][SEQ2]) { # Correct upper expansion $postexpand2 = $postexpand1 * $expand_factor; $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len; $postexpand2 = 0 if $postexpand2 < 0; $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2; $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]}; } print $WH $$cur_chain[2]." ".$$cur_chain[3]." ". $$cur_chain[0][START1]." ".$$cur_chain[0][END1]." ".$$cur_chain[0][START2]." ".$$cur_chain[0][END2]." ". $$cur_chain[0][SEQ1]." ".$$cur_chain[0][SEQ2]." ".$$cur_chain[0][ORIENT]." ".$$cur_chain[0][SCORE]." ". $$cur_chain[1][START1]." ".$$cur_chain[1][END1]." ".$$cur_chain[1][START2]." ".$$cur_chain[1][END2]." ". $$cur_chain[1][SEQ1]." ".$$cur_chain[1][SEQ2]." ".$$cur_chain[1][ORIENT]." ".$$cur_chain[1][SCORE]." ". $$cur_chain[0][CHALO1]." ".$$cur_chain[0][CHAHI1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E]." ". $$cur_chain[0][CHALO2]." ".$$cur_chain[0][CHAHI2]." ".$$cur_chain[0][CHALO2E]." ".$$cur_chain[0][CHAHI2E]; if ($print_chains) { my $i = 22; while (1) { print $WH " ".${$$cur_chain[4]}[$i]." ".${$$cur_chain[4]}[$i+1]." ".${$$cur_chain[4]}[$i+2]." 
".${$$cur_chain[4]}[$i+3]; last if @{$$cur_chain[4]} <= $i+4; $i+=4; } } print $WH "\n"; } sub finalExpReg($$$$$) { my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_; my ($preexpand1, $postexpand1, $preexpand2, $postexpand2); if ($$cur_chain[2] == 1) { # M1: expand in seq1 on seq2 expands * factor only $preexpand1 = $$cur_chain[0][CHALO1] - $$cur_chain[0][CHALO1E]; $preexpand2 = $preexpand1 * $expand_factor; $preexpand2 = $max_expand_len if $preexpand2 > $max_expand_len; $postexpand1 = $$cur_chain[0][CHAHI1E] - $$cur_chain[0][CHAHI1]; $postexpand2 = $postexpand1 * $expand_factor; $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len; $$cur_chain[0][CHALO2E] = $$cur_chain[0][CHALO2] - $preexpand2; $$cur_chain[0][CHALO2E] = 1 if $$cur_chain[0][CHALO2E] < 1; $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2; $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]}; } elsif ($$cur_chain[2] == 3) { # M2: expand in seq2 on seq1 expands * factor only $preexpand2 = $$cur_chain[0][CHALO2] - $$cur_chain[0][CHALO2E]; $preexpand1 = $preexpand2 * $expand_factor; $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len; $postexpand2 = $$cur_chain[0][CHAHI2E] - $$cur_chain[0][CHAHI2]; $postexpand1 = $postexpand2 * $expand_factor; $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len; $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1; $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1; $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1; $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]}; } print $WH $$cur_chain[0][SEQ1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E]." ". $$cur_chain[0][SEQ2]." ".$$cur_chain[0][CHALO2E]." ".$$cur_chain[0][CHAHI2E]." 
".$$cur_chain[0][ORIENT]; print $WH " (".($$cur_chain[2]==1?"M1, ":$$cur_chain[2]==2?"DM, ":"M2, ").$$cur_chain[3]." aligns)" unless $no_aligntotals; if ($print_chains) { my $i = 26; while (1) { print $WH " [".${$$cur_chain[4]}[$i]."-".${$$cur_chain[4]}[$i+1]."=".${$$cur_chain[4]}[$i+2]."-".${$$cur_chain[4]}[$i+3]."]"; last if @{$$cur_chain[4]} <= $i+4; $i+=4; } } print $WH "\n"; } sub finalExpand($$) { my ($RH, $WH) = @_; my ($first_align, $last_align, $type, $num_aligns, $cur_seq, $preexpand1, $postexpand1, $preexpand2, $postexpand2, $prev_chain, $cur_chain, $next_chain); my %stats; my (@line); while (<$RH>) { chomp; @line = split; $prev_chain = $cur_chain; $cur_chain = $next_chain; $first_align = []; $last_align = []; ($type, $num_aligns, $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2], $$first_align[SEQ1], $$first_align[SEQ2],$$first_align[ORIENT], $$first_align[SCORE], $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2], $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE], $$first_align[CHALO1], $$first_align[CHAHI1], $$first_align[CHALO1E], $$first_align[CHAHI1E], $$first_align[CHALO2], $$first_align[CHAHI2], $$first_align[CHALO2E], $$first_align[CHAHI2E]) = @line; if ($type == 1) { $$first_align[CHALO2] = ($$first_align[START2] < $$last_align[START2] ? $$first_align[START2] : $$last_align[START2]); $$first_align[CHAHI2] = ($$first_align[END2] > $$last_align[END2] ? $$first_align[END2] : $$last_align[END2]); } elsif ($type == 3) { $$first_align[CHALO1] = ($$first_align[START1] < $$last_align[START1] ? $$first_align[START1] : $$last_align[START1]); $$first_align[CHAHI1] = ($$first_align[END1] > $$last_align[END1] ? 
$$first_align[END1] : $$last_align[END1]); } my @saved_line = @line; $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line]; next unless defined $cur_chain; finalExpReg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq); if ($debug or $print_stats) { if ($type == 1) { $$cur_chain[0][ORIENT] eq "+" ? $stats{"M1+"}++ : $stats{"M1-"}++; $stats{"M1min"} = $num_aligns if $stats{"M1min"} > $num_aligns or not defined $stats{"M1min"}; $stats{"M1max"} = $num_aligns if $stats{"M1max"} < $num_aligns or not defined $stats{"M1max"}; $stats{"M1mean"} += $num_aligns; } elsif ($type == 2) { $$cur_chain[0][ORIENT] eq "+" ? $stats{"DM+"}++ : $stats{"DM-"}++; $stats{"DMmin"} = $num_aligns if $stats{"DMmin"} > $num_aligns or not defined $stats{"DMmin"}; $stats{"DMmax"} = $num_aligns if $stats{"DMmax"} < $num_aligns or not defined $stats{"DMmax"}; $stats{"DMmean"} += $num_aligns; } else { $$cur_chain[0][ORIENT] eq "+" ? $stats{"M2+"}++ : $stats{"M2-"}++; $stats{"M2min"} = $num_aligns if $stats{"M2min"} > $num_aligns or not defined $stats{"M2min"}; $stats{"M2max"} = $num_aligns if $stats{"M2max"} < $num_aligns or not defined $stats{"M2max"}; $stats{"M2mean"} += $num_aligns; } } if ($cur_seq ne $$first_align[SEQ2]) { undef $cur_chain; $cur_seq = $$first_align[SEQ2]; } } finalExpReg($WH, $cur_chain, $next_chain, undef, $cur_seq); if ($debug or $print_stats) { foreach my $i ("DM", "M1", "M2") { $stats{$i."mean"} /= ($stats{$i."+"} + $stats{$i."-"}) unless ($stats{$i."+"} + $stats{$i."-"} == 0); print $i.": ".($stats{$i."+"} + $stats{$i."-"})." chains (".$stats{$i."+"}."+, ".$stats{$i."-"}."-); ". 
"length min ".$stats{$i."min"}.", avg ".$stats{$i."mean"}.", max ".$stats{$i."max"}."\n"; } } } # Called only in a "$0 worker" invocation sub workerRun($$$$) { my ($tar_file, $score_file, $SLAGAN, $debug) = @_; my ($tmp_dir, $io_dir) = ($worker_tmp_dir, getcwd); local *FH; mkdir($tmp_dir) or die("$0 (worker): Could not create directory $tmp_dir: ".$!); copy($score_file, $tmp_dir); $score_file =~ /.*\/([^\/]+)$/; $score_file = $tmp_dir.$1; print("$0 (worker): Version ".$VERSION." started ".localtime()."\n") if $debug; print("$0 (worker): Jobfile=$tar_file, scorefile=$score_file, tmpdir=$tmp_dir, iodir=$io_dir, SLAGAN=$SLAGAN\n") if $debug; move($io_dir."/".$tar_file, $tmp_dir); my @files = `cd $tmp_dir; tar -xvf $tar_file` or warn("$0 (worker): Error extracting $tar_file"); foreach my $file (@files) { chomp $file; #print "$SLAGAN $tmp_dir$file $score_file > $tmp_dir$file.glocal-out 2> $tmp_dir$file.glocal-err\n"; system("$SLAGAN $tmp_dir$file $score_file ". "> $tmp_dir$file.glocal-out ". "2> $tmp_dir$file.glocal-err"); } $tar_file =~ /(.*)\.tar$/; $tar_file = $1; open(FH, "| cd $tmp_dir; xargs tar --append --file=$io_dir/$tar_file.results.tar"); foreach my $file (glob("$tmp_dir/*glocal-out")) { $file =~ /\/([^\/]+)$/; print FH $1." "; } close FH; rmtree $tmp_dir; opendir(DIR, "."); if (my @x = grep(/core\./,readdir(DIR))) { warn("$0 (worker): WARNING: $SLAGAN crashed ".@x." times"); } closedir DIR; unlink(glob("core.*")) unless $nodelete; } # Interrupt handler sub dequeueClustJobs($) { print "\n$0: Received SIG".$_[0].". Cleaning up... 
"; if ($clust_run_pid) { # send SIGQUIT to clust_run so it can dequeue cluster jobs kill "QUIT", $clust_run_pid; } unless ($debug or $nodelete) { print "Removing job files..."; foreach my $i (1..$num_jobs-1) { unlink $tmp_dir."JOB".$i.".tar"; unlink $tmp_dir."JOB".$i.".results.tar"; unlink $tmp_dir."CLUSTER_JOB_MESSAGES.$i"; unlink $tmp_dir."CLUSTER_JOB_ERRMSG.$i"; } unlink "$tmp_dir$input_glob.chaos"; unlink $tmp_dir."CLUSTER_JOB_PARAMS"; rmtree($tmp_dir) if $ARGV[0] eq "worker"; } print "\n"; exit(1); } # Retrieve sequence length data from GPDB sub get_all_seqs($$) { my ($dbh, $genome) = @_; my ($dset, $annot_db, $family, $check_chroms, %sizes, $chroms, @real_chroms, $ctgs); ($dset, $annot_db, $family) = ($genome =~ /^\d+$/o) ? ($genome + 0, ($dbh->get_data_set($genome))[4,14]) : ($dbh->get_family_dset($genome))[0,4,14]; print "$0: Genome $genome, dataset $dset, annotation db \"$annot_db\", family \"$family\"\n" if $debug; $annot_db and $check_chroms = 1; if ($check_chroms) { $chroms = $dbh->get_chroms(($dbh->get_data_set($dset))[2]); foreach my $chrom (@$chroms) { $$chrom[1] == 1 or next; my $name = "chr$$chrom[2]"; my ($chr_id, $chr_type, $ctg_id, $size) = $dbh->find_seq($name, $dset, $annot_db); $chr_id and $sizes{$name} = $size; } } $ctgs = $dbh->selectcol("SELECT name FROM dset$dset\_contigs " . "WHERE name is not null and name != ? group by name", undef, ""); foreach my $ctg (@$ctgs) { $sizes{$ctg} = $dbh->get_contig_size($dset, $ctg); } return \%sizes; } sub alnum { my ($i); my ($len1, $len2) = (length($a), length($b)); for ($i = 0; ($i < $len1) && ($i < $len2); ++$i) { my $c1 = substr($a, $i, 1); my $c2 = substr($b, $i, 1); ($c1 =~ /^\d/o) || ($c2 =~ /^\d/o) || ($c1 ne $c2) and last; } my $a_r = ($i < $len1) ? substr($a, $i) : ""; my $b_r = ($i < $len2) ? substr($b, $i) : ""; my ($a_n, $a_s) = ($a_r =~ /^(\d+)(.*)$/o); my ($b_n, $b_s) = ($b_r =~ /^(\d+)(.*)$/o); return (defined($a_n) && defined($b_n)) ? 
(($a_n <=> $b_n) || ($a_s cmp $b_s)) : ($a cmp $b); } sub isBLAT($) { my ($file) = @_; local *FH; open(FH, "< ".$file) or die("$0: Cannot open input file $file: ".$!); my $line = ; close FH; if ($line =~ /\A.+\s[\d]+\s[\d]+\;\s.+\s[\d]+\s[\d]+\;\sscore/) { return 0; } elsif ($line =~ /\A[^\s]+\s[\d]+\s[\d]+\s[^\s]+\s/) { return 1; } else { die("$0: Unknown input format in $file. Stopped"); } } sub getMinSeqScore($) { my ($file) = @_; my $score; local *FH; open(FH, "< ".$file) or die("$0: Could not open SLAGAN scorefile $file: $!"); while () { # sample line: {+U+;+U-;-U+;-U-}{70000 0 0 0} /\{\+U\+\;.+\}.*\{(\d+)\s.+\}/; $score = $1 if $1; } close FH; die("$0: Could not determine min_seq_score from SLAGAN scorefile $file. Stopped") unless $score; print "$0: min_seq_score: $score\n" if $debug; return $score; } sub writeSizes($$) { my ($sizes, $outfile) = @_; local *FH; open(FH, "> ".$outfile) or die("$0: Could not open file $outfile for writing: ".$!); foreach my $key (sort alnum keys %$sizes1) { print FH $key."\t".$$sizes1{$key}."\n"; } close FH; } # Borrowed from if.pm to enable standalone conditional module loading on earlier versions of Perl sub useIf($$) { my $method = 'import'; return unless shift; # CONDITION my $package = $_[0]; (my $file = $package.".pm") =~ s!::!/!g; require $file; my $method_entry_point = $package->can($method); goto &$method_entry_point if $method_entry_point; } sub checkAlignCoords($) { my $cur_align = $_[0]; if ($$cur_align[START1] > $$cur_align[END1]) { my $i = $$cur_align[START1]; $$cur_align[START1] = $$cur_align[END1]; $$cur_align[END1] = $i; } if ($$cur_align[START2] > $$cur_align[END2]) { my $i = $$cur_align[START2]; $$cur_align[START2] = $$cur_align[END2]; $$cur_align[END2] = $i; } # if ($$cur_align[OSTART1] > $$cur_align[OEND1]) { my $i = $$cur_align[OSTART1]; $$cur_align[OSTART1] = $$cur_align[OEND1]; $$cur_align[OEND1] = $i; } # if ($$cur_align[OSTART2] > $$cur_align[OEND2]) { my $i = $$cur_align[OSTART2]; $$cur_align[OSTART2] 
= $$cur_align[OEND2]; $$cur_align[OEND2] = $i; } } =head1 NAME Supermap: Piecewise monotonic alignment map generator for shuffle-lagan =head1 SYNOPSIS supermap.pl (gen2=id | sizes2=filename) (gen1=id | sizes1=filename) [-infile=] [-outfile=] [-bacteria] [-score=filename] [-f] [file1 file2 ...] =head1 EXAMPLES supermap.pl -sizes1=human.sizes -sizes2=mouse.sizes hm.chr*.chaos =head1 DESCRIPTION Supermap is a whole-genome alignment map generator. It is an extension to the Shuffle-LAGAN suite (Brudno et al., 2003). Supermap removes the asymmetry between the query genomes by running multiple SLAGAN passes and combining them into a full two-genome alignment. To run Supermap without the Berkeley Genome Pipeline functionality, you will need sequence length files for each of the genomes. Each file should contain one sequence length entry per line, of the form "sequence_name sequence_length". In the CHAOS output format (this program's input), negative orientation always means second pair of coords is inverted. In this program's output, negative orientation does not invert coordinates (coordinate pairs are always ascending). Run supermap.pl with no arguments to see a further description. The terms "hit" and "anchor" usually refer to local alignments produced by CHAOS or another program. The term "chain" refers to an extended union of a number of these local alignments. =head1 DEPENDENCIES Supermap depends on Utils.pm, SLAGAN, and a number of Unix utilities. To use the Berkeley Genome Pipeline and cluster functionality, Supermap needs GPutils.pm, GPDBI.pm, and clust_run. =head1 LIMITATIONS Supermap is designed to allow the manipulation of large datasets in a reasonable memory footprint. To do this, it allows multiple files on input and keeps most of its intermediate data in small temporary files. 
However, one current limitation is that the alignments for any sequence in either genome must fit into the largest addressable file size (typically 2GB), and the output alignments must also fit in that size (the remainder will be truncated). =head1 BUGS =head1 TODO TODO: bacteria description, examples, other input formats TODO: installer routine TODO: discuss input glob parameters TODO: local multithreading TODO: ignore escaped slashes when splitting dir/file (copy Alex) TODO: check for ++ etc in SLAGAN out TODO: .supermaprc file for score files, etc TODO: hazelton.lbl.gov/bugzilla for supermap =head1 AUTHOR Andrey Kislyuk L. =cut lagan20/src/thrtrie.c0000644000076500007650000001715010502337063015537 0ustar brudnobrudno00000000000000#include #include #include #include "skiplist.h" #include "thrtrie.h" #include int triealphasize=0; int nnodes=0; #define DEBUG 1 #define JQ_SIZE 1024 #include "mempage.c" TJob* jobqueue=0; int jqsize = 1; int numjobs = 0; void makeAlpha(char* alpha) { int i; int isin = 0; for (i=0; i < 256; i++) indeces[i] = -1; i = 0; while (*alpha) { if (!isin && *alpha == '[') isin = 1; else if (isin && *alpha == ']') { isin = 0; i++; } else if (isin) indeces[*alpha] = i; else indeces [*alpha] = i++; alpha++; } triealphasize = i; } int lookup(char c) { return indeces[c]; } TNode* makeTrie(int height, char* alphabet) { TNode* root; initMP(0); makeAlpha(alphabet); if (!jobqueue) jobqueue = (TJob*) malloc(sizeof(TJob)); root = makeNode(height); return root; } void junker (TNode** m){ } int tccc = 0; void freeTrie (TNode* trgt) { /* int i; if (trgt->height) { for (i = 0; i < triealphasize; i++) if (trgt->kids.ptrs[i]) freeTrie(trgt->kids.ptrs[i]); junker (trgt->kids.ptrs); } else free(trgt->kids.locator.locs); free (trgt); */ MPallfree(); } TNode* makeNode(int height) { TNode* tn = (TNode*) MPmalloc(sizeof(TNode)); int i; tn->height=height; if (height) { tn->kids.ptrs = (TNode**) MPmalloc(sizeof(TNode*)*triealphasize); for (i=0; i < triealphasize; i++) 
tn->kids.ptrs[i]=0; } else { tn->kids.locator.numlocs = 0; tn->kids.locator.locs = (int*)MPmalloc(sizeof(int)*2); tn->kids.locator.locssize = 2; } return tn; } int insertLoc (int word, locs* locator) { locator->locs[locator->numlocs++] = word; if (locator->numlocs >= locator->locssize) { locator->locs = (int*) MPrealloc (locator->locs, sizeof(int)*locator->locssize, sizeof(int)*locator->locssize*2); locator->locssize *= 2; } return 0; } int insertWordHelp(TNode* currnode, char* word, char* strbeg, int height,int wordlen) { int letter; if (height == 0) return insertLoc((int)(word-strbeg), &(currnode->kids.locator)); else { letter = lookup(word[wordlen-height]); if (letter < 0) return 1; if (!currnode->kids.ptrs[letter]) { currnode->kids.ptrs[letter] = makeNode(height-1); } return insertWordHelp(currnode->kids.ptrs[letter], word, strbeg, height-1, wordlen); } return 42; } int insertWord(TNode* currnode, char* word, char* strbeg) { return insertWordHelp(currnode, word, strbeg, currnode->height, currnode->height); } LList* appendLList(LList* a , LList* b) { if (!a) return b; if (!b) return a; b->next = appendLList(a, b->next); return b; } /*no longer works */ /* make iterative??? 
*/ /* LList* lookupZZZWord(TNode* currnode, char* word, int ndegen) { int letter,i; LList *temp, *help, *res=0; int height = currnode->height; if (!currnode || ndegen < 0) return 0; if (!currnode->height) { res = (LList*) malloc (sizeof(LList)); res->myloc = &currnode->kids.locator; res->degleft = 0; res->next = 0; return res; } letter = lookup(word[currnode->height-1]); if (letter >=0 && currnode->kids.ptrs[letter]) { temp = lookupZZZWord(currnode->kids.ptrs[letter], word, ndegen); res = appendLList(res, temp); } for (i=0; i < triealphasize; i++) { if (ndegen > 0 && i != letter) { if (currnode->kids.ptrs[i]) { temp = lookupZZZWord(currnode->kids.ptrs[i], word, ndegen-1); help = temp; while (help != 0) { help->degloc[help->degleft++] = currnode->height; help = help->next; } res = appendLList(res, temp); } } } return res; }*/ void insertString(TNode* root, char* word) { char* begin = word; int i, j, wordlen = root->height, letprev, letcurr; TNode* prev, *curr; insertWord(root, word, begin); word++; root->backptr = root; while (*word) { curr = prev = root; insertWord(root, word, begin); for (i=0; i < wordlen; i++) { letprev = lookup(word[i-1]); letcurr = lookup(word[i]); if (letprev >= 0) prev = prev->kids.ptrs[letprev]; else break; prev->backptr = curr; if (letcurr >= 0) curr = curr->kids.ptrs[letcurr]; else break; } word++; } letcurr = lookup(*(word-1)); if (letcurr >=0) root->kids.ptrs[letcurr]->backptr = root; } void addjob(TNode* tn, char *thisdeg, char dirty, int oldindex) { int i; jobqueue[numjobs].mynode = tn; jobqueue[numjobs].dirty = dirty; if (oldindex >= 0) { jobqueue[numjobs].numdeg = jobqueue[oldindex].numdeg; for (i = 0; i < jobqueue[oldindex].numdeg; i++) jobqueue[numjobs].degloc[i] = jobqueue[oldindex].degloc[i]; } else { jobqueue[numjobs].numdeg = 0; } if (thisdeg>0) { jobqueue[numjobs].degloc[jobqueue[numjobs].numdeg++] = thisdeg; } numjobs++; if (jqsize == numjobs) jobqueue = (TJob*)realloc(jobqueue, sizeof(TJob)*(jqsize *=2)); } void 
cleanJobQueue() { numjobs = 0; } void remjob(int i) { jobqueue[i]= jobqueue[--numjobs]; } LList* makeLList(TJob* tj, char* word, int offset) { LList* res; int i; TNode* currnode = tj->mynode; res = (LList*) malloc (sizeof(LList)); res->myloc = &(currnode->kids.locator); res->degleft = tj->numdeg; for (i = 0; i < tj->numdeg; i++) res->degloc[i] = (char *)(word - tj->degloc[i]); res->next = 0; return res; } LList* getNextWords (TNode* currnode, char* word, int ndegen) { int i, j; int height = currnode->height; int letter = lookup(*word); int mynjobs; char mydirty; char myflags; char first = 0; LList* res=0, *temp; // -1 --> 0 (second param) if (letter >= 0 && numjobs == 0) /*new string*/ addjob(currnode, 0, 0, -1); mydirty = jobqueue[0].dirty; mynjobs = numjobs; /* need my own copy so that I don't go over inserted things */ for (i = 0; i < mynjobs; i++) { myflags = - 1 - (1 << triealphasize)+1; first = 0; // printf("jqdl = %d, w = %d, mnh = %d\n", jobqueue[i].degloc[0],(int)word, jobqueue[i].mynode->height); if (jobqueue[i].numdeg > 0 && ((char *) jobqueue[i].degloc[0] < word - (height -jobqueue[i].mynode->height))) { remjob(i); if (jobqueue[i].dirty == mydirty) { mynjobs--; i--; } continue; } do { if (!jobqueue[i].mynode) { remjob(i); if (jobqueue[i].dirty == mydirty) { mynjobs--; i--; /* need this if the guy I moved in the old place is in my pass */ } break; } if (jobqueue[i].mynode->height == 0 || first) { jobqueue[i].mynode = jobqueue[i].mynode->backptr; } first = 1; if (ndegen - jobqueue[i].numdeg > 0) { for (j = 0; j < triealphasize; j++) { if (!(myflags & (1<< j)) && jobqueue[i].mynode->kids.ptrs[j]) { // changed -1 --> 0 addjob(jobqueue[i].mynode->kids.ptrs[j], (j==letter)?0:word, !mydirty,i); if (jobqueue[i].mynode->height == 1) { temp = makeLList(&jobqueue[numjobs-1], word, j); temp->next = res; res = temp; } myflags = myflags | (1 << j); } } } else { if (letter >= 0 && jobqueue[i].mynode->kids.ptrs[letter]) { jobqueue[i].mynode = 
jobqueue[i].mynode->kids.ptrs[letter]; jobqueue[i].dirty = !mydirty; if (jobqueue[i].mynode->height == 0) { temp = makeLList(&jobqueue[i], word, letter); temp->next = res; res = temp; } myflags = -1; } } if (myflags == -1) { break; } } while(jobqueue[i].mynode != jobqueue[i].mynode->backptr); if (jobqueue[i].dirty == mydirty) { remjob(i); if (jobqueue[i].dirty == mydirty) { mynjobs--; i--; /* need this if the guy I moved in the old place is in my pass */ } } } return res; } lagan20/src/thrtrie.h0000644000076500007650000000251310502337063015541 0ustar brudnobrudno00000000000000#include "fchaos.h" #define MAX_DEGEN 2 int indeces[256]; typedef struct PrevHits { int* inds1; int* inds2; int numind; } phits; typedef struct Locator { int* locs; int numlocs; int locssize; } locs; typedef struct LocatorList { locs* myloc; int degleft; char* degloc[MAX_DEGEN]; struct LocatorList* next; /* Stuff below is for chaining */ int location; char* toberemoved; float* scores; int* seq1startpnt; int* seq2startpnt; int* seq1endpnt; int* seq2endpnt; phits* myhits; sle** mysles; } LList; typedef struct TrieNode { union children { struct TrieNode** ptrs; locs locator; } kids; struct TrieNode* backptr; /* added for threading */ int height; } TNode; typedef struct TrieJob { TNode* mynode; int numdeg; char *degloc[MAX_DEGEN]; char dirty; } TJob; LList* appendLList(LList* a , LList* b); LList* savenfreeLList (LList* tbf, seq* seq1, seq* seq2); TNode* makeTrie(int height, char* alphabet); void freeTrie (TNode* root); TNode* makeNode(int height); int insertWord(TNode* root, char* word, char* strbeg); LList* lookupWord(TNode* currnode, char* word, int ndegen); /* above this line are things for all tries */ /*this is for threaded stuff */ void cleanJobQueue(); LList* getNextWords(TNode* root, char* word, int ndegen); void insertString(TNode* root, char* tbi); lagan20/src/translate.c0000644000076500007650000000416210502337063016052 0ustar brudnobrudno00000000000000#include #include #include 
#include "fchaos.h" #include "translate.h" #include "assert.h" char toPeptide (char* dnaword, char revcomp) { int i, j, sum=0, mask = 0; char *table = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"; if (revcomp) mask = 3; /* Hacking... */ for (i = 0; i < 3; i++) { sum*=4; switch (dnaword[(i^mask)-!!revcomp]) { case 'a': case 'A': sum+=(0^mask); break; case 'c': case 'C': sum+=(1^mask); break; case 'g': case 'G': sum+=(2^mask); break; case 't': case 'T': sum+=(3^mask); break; case 'n': case 'N': return 'X'; default: fprintf(stderr, "%d = %c: bad letter in sequence\n",i,dnaword[i^mask]); exit(1); } } return table[sum]; } seq* transSeq(seq* theseq, int frame) { char* res; seq* resseq = (seq*) malloc(sizeof(seq)); char revcomp = 0; int i, numXs = 0; assert (resseq); if (frame < 0 || frame > 5) { fprintf(stderr, "Valid frame numbers are 1-6\n"); exit(1); } if (frame > 2) revcomp = 1; frame = frame % 3; resseq->numlets = (theseq->numlets-frame)/3; res = (char*) malloc((resseq->numlets+1)* sizeof(char)); assert (res); /** * This was the error. 
*/ res[(theseq->numlets-frame)/3] = 0; for (i = 0;i < (theseq->numlets-frame)/3; i++) { res[i] = (!revcomp)?toPeptide(&theseq->lets[i*3+frame],0) :toPeptide(&theseq->lets[theseq->numlets-3*(i+1)-frame],1); if (res[i] == 'X') numXs++; } resseq->numsiglets = resseq->numlets - numXs; resseq->rptr = resseq->lets = res; resseq->name = (char*) malloc(strlen(theseq->name)+5); resseq->name[0] = 0; sprintf(resseq->name, "%s_f%c%d", theseq->name, (revcomp)?'-':'+', frame); return resseq; } /* int main(int argc, char** argv) { printf("%s\n", transSeq(argv[1], strlen(argv[1]), 0)); printf("%s\n", transSeq(argv[1], strlen(argv[1]), 1)); printf("%s\n", transSeq(argv[1], strlen(argv[1]), 2)); printf("%s\n", transSeq(argv[1], strlen(argv[1]), 3)); printf("%s\n", transSeq(argv[1], strlen(argv[1]), 4)); printf("%s\n", transSeq(argv[1], strlen(argv[1]), 5)); } */ lagan20/src/translate.h0000644000076500007650000000011110502337063016045 0ustar brudnobrudno00000000000000 seq* transSeq(seq*, int); char toPeptide (char* dnaword, char revcomp); lagan20/src/util.cpp0000644000076500007650000000261110502337063015367 0ustar brudnobrudno00000000000000#define fastaRowLength 50 #define bufSize 2000 int trim(char* s) { int i=strlen(s); while (i>0 && (s[i-1]=='\n' || s[i-1]=='\r')) s[--i]='\0'; return i; } string itoa(int i) { char buf[20]; sprintf(buf,"%d",i); return buf; } FILE* openFile(string path,char* mode) { FILE *f=fopen(path.c_str(),mode); if (f==NULL) { fprintf(stderr,"ERROR: Failed open file: %s\n",path.c_str()); exit(1); } return f; } int isArg(char* key,int argc, char* argv[]) { for (int i=0;i #include void Add_Tick(char *line, int count, int length); void Print_Lines(char *line1, char *line2, char *ticks1, char *ticks2, char *match); int Usage(void); char MyName[1024]; int main(int argc, char **argv) { FILE *infile = NULL; FILE *snp_file = NULL; char *slash; int fields, start = -1, end = -1, bp, base1, base2; int base1_count = 0; int base2_count = 0; int start2 = 0; int end2 = 0; int 
tick1_done = 0; int tick2_done = 0; int width = 60; int length = 0; int html_length = 0; int snp_pos = -1; int param1 = 1; char bases[] = {'-', 'A', 'C', 'T', 'G', 'N'}; char line1[1024]; char line2[80]; char match[80]; char ticks1[80] = ""; char ticks2[80] = ""; char snp_fname[1024] = ""; char font_start[80] = ""; char dash[] = " - "; char snp_bases[2]; // remove the directory name from the program pathname if (((slash = strrchr(argv[0], '/')) != NULL) || ((slash = strrchr(argv[0], '\\')) != NULL)) strcpy(MyName, slash + 1); else strcpy(MyName, argv[0]); // parse my command line and open input file(s) if (argc < 2) return Usage(); if (argv[1][0] == '-') if (strcasecmp(argv[1], "-pga") == 0) ++param1; else if (strcmp(argv[1], "-") != 0) return Usage(); if ((argc <= param1) || ((strcmp(argv[param1], "-") != 0) && ((infile = fopen(argv[param1], "r")) == NULL)) || ((argc > (param1 + 1)) && (((fields = sscanf(argv[param1 + 1], "%d", &start)) != 1) || (start <= 0))) || ((argc > (param1 + 2)) && (((fields = sscanf(argv[param1 + 2], "%d", &end)) != 1) || (start > end)))) return Usage(); if (infile == NULL) infile = stdin; else if (param1 > 1) { if (((slash = strrchr(argv[param1], '/')) != NULL) || ((slash = strrchr(argv[param1], '\\')) != NULL)) { strncpy(snp_fname, argv[param1], slash - argv[param1] + 1); snp_fname[slash - argv[param1] + 1] = '\0'; } strcat(snp_fname, "SNP.txt"); snp_file = fopen(snp_fname, "r"); } while (!feof(infile)) { if ((bp = getc(infile)) == EOF) { // get next char if (!ferror(infile)) { end2 = base2_count; continue; } perror("Error reading file"); // stop if an error is found return 1; } // decode bp char base1 = bp >> 4; base2 = bp & 0xf; if (base1 != 0) { ++base1_count; tick1_done = 0; } if (base2 != 0) { ++base2_count; tick2_done = 0; } if (base1_count < start) continue; if (snp_file != NULL) { while (base1_count > snp_pos) { if ((fields = fscanf(snp_file, "%d %2c", &snp_pos, snp_bases)) == 2) continue; fclose(snp_file); snp_file = NULL; 
break; } } if (start2 == 0) { start2 = base2_count; if (base2 == 0) ++start2; } if (base1_count != snp_pos) { line1[html_length] = bases[base1]; line1[html_length + 1] = 0; ++html_length; } else { strcpy(line1 + html_length, font_start); strcat(line1, status_start); html_length = strlen(line1); line1[html_length] = snp_bases[0]; strcpy(line1 + html_length + 1, dash); line1[html_length + strlen(dash) + 1] = snp_bases[1]; strcpy(line1 + html_length + strlen(dash) + 2, status_end); html_length = strlen(line1); line1[html_length] = bases[base1]; strcpy(line1 + html_length + 1, font_end); html_length = strlen(line1); } line2[length] = bases[base2]; line2[length + 1] = 0; match[length] = ((base1 == base2) && (base1 != 5)) ? '|' : ' '; match[length + 1] = 0; ++length; if ((tick1_done == 0) && ((base1_count % 10) == 0) && (base1_count > 0)) { Add_Tick(ticks1, base1_count, length); tick1_done = 1; } if ((tick2_done == 0) && ((base2_count % 10) == 0) && (base2_count > 0)) { Add_Tick(ticks2, base2_count, length); tick2_done = 1; } if (length == 60) { Print_Lines(line1, line2, ticks1, ticks2, match); length = 0; html_length = 0; } if (base1_count == end) { end2 = base2_count; break; } } if (length != 0) Print_Lines(line1, line2, ticks1, ticks2, match); fclose(infile); if (param1 > 1) printf("start2=%d\nend2=%d\n", start2, end2); return 0; } void Add_Tick(char *line, int count, int length) { int space; char tick[20]; sprintf(tick, "%d", count); space = length + 9 - strlen(line) - strlen(tick); if (space > 0) { while (space > 0) { strcat(line, " "); --space; } strcat(line, tick); } } void Print_Lines(char *line1, char *line2, char *ticks1, char *ticks2, char *match) { printf("\n%s\nseq1 %s\n %s\nseq2 %s\n%s\n", ticks1, line1, match, line2, ticks2); line1[0] = line2[0] = ticks1[0] = ticks2[0] = match[0] = 0; } int Usage() { fprintf(stderr, " \ Usage: %s [-pga] { - | alignment_file } [start [end]]\n", MyName); return 1; } 
lagan20/src/utils/bin2mf.c0000644000076500007650000000276510502337062016400 0ustar brudnobrudno00000000000000#include #include #include void Add_Tick(char *line, int count, int length); void Print_Lines(char *line1, char *line2, char *ticks1, char *ticks2, char *match); int Usage(void); char MyName[1024]; int main(int argc, char **argv) { FILE *infile = NULL; char bases[] = {'-', 'A', 'C', 'T', 'G', 'N'}; char *seq1, *seq2; int seqsize=1, numread=0; int bp, base1, base2, i; seq1 = (char*) malloc(sizeof(char)); seq2 = (char*) malloc(sizeof(char)); // parse my command line and open input file(s) if (argc < 2) return Usage(); if ((strcmp(argv[1], "-") != 0) && ((infile = fopen(argv[1], "r")) == NULL)) return Usage(); if (infile == NULL) infile = stdin; while (!feof(infile)) { if ((bp = getc(infile)) == EOF) { // get next char break; } // decode bp char base1 = bp >> 4; base2 = bp & 0xf; seq1[numread] = bases[base1]; seq2[numread] = bases[base2]; numread++; if (numread >= seqsize) { seq1 = (char*) realloc(seq1, sizeof(char)* (seqsize *2)); seq2 = (char*) realloc(seq2, sizeof(char)* (seqsize *2)); seqsize *= 2; } } printf(">seq1"); for (i = 0; i < numread; i++) { if (!(i%60)) printf("\n"); printf("%c", seq1[i]); } printf("\n>seq2"); for (i = 0; i < numread; i++) { if (!(i%60)) printf("\n"); printf("%c", seq2[i]); } return 0; } int Usage() { fprintf(stderr, " \ Usage: %s { - | alignment_file }]\n", MyName); return 1; } lagan20/src/utils/cextract.c0000644000076500007650000000534610502337062017036 0ustar brudnobrudno00000000000000#include #include #include #include #include int begin, finish, seqIdx, seqExt, seqlen, numseqs, seqlen2, numseqs2; char name[1024], name2[1024], **seqs, **seqs2; int getLength (char *filename){ FILE *file; char buffer[1024], ch; int length = 0; file = fopen (filename, "r"); assert (file); fgets (buffer, 1024, file); while (!feof (file)){ ch = fgetc (file); if (ch == '>') break; if (isalpha (ch) || ch == '.' 
|| ch == '-') length++; } fclose (file); return length; } void readfile (char *filename, int *seqlen, int *numseqs, char *name, char ***seqs){ FILE *file; char buffer[1024], ch; int i; *numseqs = 0; *seqlen = getLength (filename); strcpy (name, ""); *seqs = (char **) malloc (sizeof (char *) * 1); assert (*seqs); (*seqs)[0] = (char *) malloc (sizeof (char) * (*seqlen)); file = fopen (filename, "r"); assert (file); while (!feof (file)){ i = 0; fgets (buffer, 1024, file); if (strlen (name) == 0) strcpy (name, buffer); if (feof (file)) break; (*numseqs)++; if (*numseqs > 1){ *seqs = (char **) realloc (*seqs, sizeof (char *) * (*numseqs)); assert (*seqs); (*seqs)[*numseqs - 1] = (char *) malloc (sizeof (char) * (*seqlen)); assert ((*seqs)[*numseqs - 1]); } while (!feof (file)){ ch = fgetc (file); if (ch == '>') break; if (isalpha (ch) || ch == '.' || ch == '-'){ assert (i < (*seqlen)); (*seqs)[*numseqs - 1][i] = ch; i++; } } if (ch == '>') ungetc (ch, file); assert (i == *seqlen); } fclose (file); } void print (void){ int i = 0, pos = 0, written = 0, j = 0; assert (seqIdx >= 0 && seqIdx < numseqs); assert (seqExt >= 0 && seqExt < numseqs); printf ("%s", name); while (pos <= finish && i < seqlen){ if (isalpha (seqs[seqIdx][i])) pos++; if (isalpha (seqs[seqExt][i]) || seqs[seqExt][i] == '.'){ assert (seqlen2 == 0 || j < seqlen2); if (pos >= begin && pos <= finish){ printf ("%c", seqlen2 == 0 ? 
seqs[seqExt][i] : seqs2[0][j]); written++; if (written % 60 == 0) printf ("\n"); } j++; } i++; } if (written % 60 != 0) printf ("\n"); } int main (int argc, char** argv){ int i; if (argc != 6 && !(argc == 8 && strcmp (argv[6], "-subst") == 0)){ fprintf (stderr, "Usage:\n\ncextract multi_fasta_file begin end seqidx seqextract\n"); exit (1); } begin = atoi (argv[2]); finish = atoi (strdup(argv[3])); seqIdx = atoi (argv[4]); seqExt = atoi (argv[5]); seqlen2 = 0; readfile (argv[1], &seqlen, &numseqs, name, &seqs); if (argc == 8) readfile (argv[7], &seqlen2, &numseqs2, name2, &seqs2); print (); for (i = 0; i < numseqs; i++) free (seqs[i]); free (seqs); } lagan20/src/utils/cmerge2.pl0000755000076500007650000001562410502337062016741 0ustar brudnobrudno00000000000000#!/usr/bin/env perl use File::Basename; $lagandir = $ENV{LAGAN_DIR}; $pid = $$; # process arguments if (@ARGV < 4 && @ARGV > 6) { print STDERR ("usage:\n cmerge seqfile mfafile draftfile outfile [-nocrop] [-skipfr pid]\n"); exit(1); } $arglist = ""; $nocrop = 0; for ($i = 4; $i < @ARGV; $i++) { if ($ARGV[$i] =~ /-nocrop/){ $nocrop = 1; } elsif ($ARGV[$i] =~ /-skipfr/){ $skipfr = 1; $pid = $ARGV[++$i]; chomp $pid; } else { print STDERR "Bad arg to cmerge: $ARGV[$i]"; exit(1); } } $arglist = "$arglist $recurfl"; if (!$skipfr) { exit(1); } $newdir = `pwd`; chomp $newdir; $newdir = "$newdir/$pid"; open (LOGFILE, ">>$newdir/log"); open (INFOFILE, ">$newdir/minfo"); print STDERR ("\n"); print STDERR ("Computing Contig Overlaps\n"); print STDERR ("-------------------------\n"); print LOGFILE ("\n"); print LOGFILE ("Computing Contig Overlaps\n"); print LOGFILE ("-------------------------\n"); # initialize merged file open (OFILE, ">$ARGV[3]"); print OFILE (">merged\n"); close (OFILE); `cp $ARGV[3] $ARGV[3].masked`; # initialize padding file open (OFILE, ">$newdir/padding"); print OFILE (">padding\n"); print OFILE ("NNNNNNNNNNNNNNNNNNNN.NNNNNNNNNNNNNNNNNNNN\n"); close (OFILE); $padlength = `$lagandir/utils/getlength 
$newdir/padding`; chomp $padlength; # other initialization $totlength = `$lagandir/utils/getlength $ARGV[0]`; chomp $totlength; $mergedEnd = 0; # read contig list $numContigs = 0; @list = `cat $ARGV[2]`; for ($i = 3; $i < @list; $i++){ $list[$i] =~ /(.*)\.mfa --\> \((\d+) (\d+)\) score=(\d+), offset=\((\d+) (\d+)\), index=(\d+)/; $filenames[$i-3] = $1; $seq1Begin[$i-3] = $2; $seq1End[$i-3] = $3; $score[$i-3] = $4; $s1shifts[$i-3] = $5; $s2shifts[$i-3] = $6; $num[$i-3] = $7; $temp = $seq1Begin[$i-3] - $s1shifts[$i-3]; $seq2Begin[$i-3] = `$lagandir/utils/getcontigpos $filenames[$i-3].mfa $temp`; chomp $seq2Begin[$i-3]; $seq2Begin[$i-3] += $s2shifts[$i-3]; $temp = $seq1End[$i-3] - $s1shifts[$i-3]; $seq2End[$i-3] = `$lagandir/utils/getcontigpos $filenames[$i-3].mfa $temp`; chomp $seq2End[$i-3]; $seq2End[$i-3] += $s2shifts[$i-3]; print STDERR "$filenames[$i-3].mfa --> $seq1Begin[$i-3] $seq1End[$i-3] $score[$i-3] $s1shifts[$i-3] $s2shifts[$i-3] $num[$i-3] $seq2Begin[$i-3] $seq2End[$i-3]\n"; $numContigs++; } # extract contigs $contigfile = basename ($ARGV[1]); $contigdir = dirname ($ARGV[1]); $newdir = `pwd`; chomp $newdir; $newdir = "$newdir/$pid"; # start out merged file with only padding `mv $ARGV[3] $ARGV[3].new`; `$lagandir/utils/seqmerge $ARGV[3].new $newdir/padding > $ARGV[3]`; `mv $ARGV[3].masked $ARGV[3].masked.new`; `$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/padding > $ARGV[3].masked`; $contigStart[0] = 1; $startChop[0] = 0; `cp $filenames[0] $newdir/current`; `cp $filenames[0].masked $newdir/current.masked`; # merge contigs for ($i = 1; $i < $numContigs; $i++){ `$lagandir/rechaos.pl $newdir/current $filenames[$i] -recurse \"(12,0,40,0)x\" -maskedonly > $newdir/currentanchs`; # find the overlap `$lagandir/utils/getoverlap $newdir/currentanchs` =~ /(-?\d+) (-?\d+) (-?\d+) (-?\d+)/; $rangebegin1 = $1; $rangeend1 = $2; $rangebegin2 = $3; $rangeend2 = $4; chomp $rangebegin1; chomp $rangeend1; chomp $rangebegin2; chomp $rangeend2; $thislength = 
`$lagandir/utils/getlength $filenames[$i-1]`; chomp $thislength; $nextlength = `$lagandir/utils/getlength $filenames[$i]`; chomp $nextlength; # if no overlap, flush the buffer if ($rangebegin1 == -1 && $rangeend1 == -1){ print STDERR "No overlap found...\n"; `mv $ARGV[3] $ARGV[3].new`; `$lagandir/utils/seqmerge $ARGV[3].new $newdir/current $newdir/padding > $ARGV[3]`; `cp $filenames[$i] $newdir/current`; `mv $ARGV[3].masked $ARGV[3].masked.new`; `$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/current.masked $newdir/padding > $ARGV[3].masked`; `cp $filenames[$i].masked $newdir/current.masked`; $contigEnd[$i-1] = $contigStart[$i-1] + $thislength - 1; $contigStart[$i] = $contigEnd[$i-1] + $padlength + 1; $endChop[$i-1] = 0; $startChop[$i] = 0; } else { print STDERR "Overlap detected!\n"; # extract the overlapped region > overlap $j = $rangebegin1 - 1; if ($j > 0){ `$lagandir/utils/cextract $newdir/current 1 $j 0 0 > $newdir/overlap`; `$lagandir/utils/cextract $newdir/current.masked 1 $j 0 0 > $newdir/overlap.masked`; $overlaplength = `$lagandir/utils/getlength $newdir/overlap`; chomp $overlaplength; `mv $ARGV[3] $ARGV[3].new`; `$lagandir/utils/seqmerge $ARGV[3].new $newdir/overlap > $ARGV[3]`; `mv $ARGV[3].masked $ARGV[3].masked.new`; `$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/overlap.masked > $ARGV[3].masked`; } # extract the nonoverlapped region > current `$lagandir/utils/cextract $filenames[$i] $rangebegin2 $nextlength 0 0 > $newdir/current`; `$lagandir/utils/cextract $filenames[$i].masked $rangebegin2 $nextlength 0 0 > $newdir/current.masked`; $contigEnd[$i-1] = $contigStart[$i-1] + $overlaplength - 1; $contigStart[$i] = $contigEnd[$i-1] + 1; $endChop[$i-1] = $thislength - $rangeend1; $startChop[$i] = $rangebegin2 - 1; } if (index ($filenames[$i-1], ".rc") == -1) { $direction = "+"; } else { $direction = "-"; } @temp = `head $filenames[$i-1]`; chomp $temp[0]; $temp[0] = substr $temp[0], 1; print INFOFILE "$temp[0]\n"; print INFOFILE "$num[$i-1] 
$seq1Begin[$i-1] $seq1End[$i-1] $contigStart[$i-1] $contigEnd[$i-1] $startChop[$i-1] $endChop[$i-1] $direction $score[$i-1] $seq2Begin[$i-1] $seq2End[$i-1]\n"; } $thislength = `$lagandir/utils/getlength $filenames[$numContigs - 1]`; chomp $thislength; $contigEnd[$numContigs - 1] = $contigStart[$numContigs - 1] + $thislength - 1; $endChop[$numContigs - 1] = 0; `mv $ARGV[3] $ARGV[3].new`; `$lagandir/utils/seqmerge $ARGV[3].new $newdir/current $newdir/padding > $ARGV[3]`; `mv $ARGV[3].masked $ARGV[3].masked.new`; `$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/current.masked $newdir/padding > $ARGV[3].masked`; if (index ($filenames[$numContigs - 1], ".rc") == -1) { $direction = "+"; } else { $direction = "-"; } @temp = `head $filenames[$numContigs - 1]`; chomp $temp[0]; $temp[0] = substr $temp[0], 1; print INFOFILE "$temp[0]\n"; print INFOFILE "$num[$numContigs - 1] $seq1Begin[$numContigs - 1] $seq1End[$numContigs - 1] $contigStart[$numContigs - 1] $contigEnd[$numContigs - 1] $startChop[$numContigs - 1] $endChop[$numContigs - 1] $direction $score[$numContigs - 1] $seq2Begin[$numContigs - 1] $seq2End[$numContigs - 1]\n"; print STDERR "Merging complete!\n\n"; print LOGFILE "Merging complete!\n\n"; # 1. write getoverlap() -- given a set of chaos hits, find the beginning and end in both seqs # 2. implement contigStart, contigStop -- positions of the contig begins/ends in the merged draft sequence # 3. startChop, endChop -- number chopped from each end # 4. 
secFrom, secTo -- pos in the chopped contig sequence lagan20/src/utils/contigorder.c0000644000076500007650000002225010502337062017531 0ustar brudnobrudno00000000000000#include #include #include #define MAX_CELLS ((long long int) 100000000) #define MAX_TIME ((long long int) 100000 * (long long int) 100000) int failed = 0; void getFileInfo (char *filename, int *numContigs, int *seqLen, int *numHits){ FILE *file; int dummy, i; if (!(file = fopen (filename, "r"))){ fprintf (stderr, "contigorder: Error opening file: %s\n"); exit (1); } fscanf (file, "numContigs = %d\n", numContigs); fscanf (file, "seqLen = %d\n", seqLen); *numHits = 0; while (!feof (file)){ if (fscanf (file, "(%d %d)", &dummy, &dummy) == 2){ for (i = 0; i < *numContigs; i++){ fscanf (file, "%&d", &dummy); } while (fgetc (file) != '\n'); (*numHits)++; } } fclose (file); } void getScores (char *filename, int numContigs, int seqLen, int numHits, int ***score, int ***ranges){ FILE *file; int i, j; *score = (int **) malloc (sizeof (int *) * numHits); assert (*score); *ranges = (int **) malloc (sizeof (int *) * numHits); assert (*ranges); for (i = 0; i < numHits; i++){ (*score)[i] = (int *) calloc (numContigs, sizeof (int)); assert ((*score)[i]); (*ranges)[i] = (int *) calloc (2, sizeof (int)); assert ((*ranges)[i]); } if (!(file = fopen (filename, "r"))){ fprintf (stderr, "contigorder: Error opening file: %s\n"); exit (1); } fscanf (file, "numContigs = %*d\n"); fscanf (file, "seqLen = %*d\n"); i = 0; while (!feof (file) && i < numHits){ if (fscanf (file, "(%d %d)", &((*ranges)[i][0]), &((*ranges)[i][1])) == 2){ for (j = 0; j < numContigs; j++){ fscanf (file, "%d", &((*score)[i][j])); } while (fgetc (file) != '\n'); i++; } } fclose (file); } void floodfill (int *labels, int *first, int *last, int numContigs, int here, int groupNum){ int i; labels[here] = groupNum; for (i = 0; i < numContigs; i++){ if (i != here && labels[i] == -1 && first[i] != -1){ if (!(first[here] > last[i] || last[here] < first[i])){ 
floodfill (labels, first, last, numContigs, i, groupNum); } } } } int *getLabels (int **score, int numContigs, int numHits){ int *labels, *first, *last, i, j; labels = (int *) calloc (numContigs, sizeof (int)); assert (labels); first = (int *) calloc (numContigs, sizeof (int)); assert (first); last = (int *) calloc (numContigs, sizeof (int)); assert (last); for (j = 0; j < numContigs; j++){ first[j] = -1; for (i = 0; i < numHits; i++){ if (score[i][j] > 0){ if (first[j] == -1) first[j] = i; last[j] = i; } } } j = 0; for (i = 0; i < numContigs; i++) labels[i] = -1; for (i = 0; i < numContigs; i++){ if (labels[i] == -1 && first[i] != -1){ floodfill (labels, first, last, numContigs, i, j++); } } free (first); free (last); return labels; } int makeRanges (int **score, int numHits, int *cols, int numCols, int **first, int **last){ int i, j, k, found, numRanges = 1; for (i = 0; i < numHits; i++){ for (j = 0; j <= i; j++){ for (k = found = 0; !found && k < numCols; k++){ found = (score[i][cols[k]] > 0) && (score[j][cols[k]] > 0); } if (found) numRanges++; } } *first = (int *) calloc (numRanges, sizeof (int)); assert (*first); *last = (int *) calloc (numRanges, sizeof (int)); assert (*last); (*first)[0] = -1; // initial range (*last)[0] = -1; // initial range numRanges = 1; for (i = 0; i < numHits; i++){ for (j = 0; j <= i; j++){ for (k = found = 0; !found && k < numCols; k++){ found = (score[i][cols[k]] > 0) && (score[j][cols[k]] > 0); } if (found){ (*first)[numRanges] = j; (*last)[numRanges] = i; numRanges++; } } } return numRanges; } int **calcRangeScores (int **score, int *cols, int numCols, int *first, int *last, int numRanges){ int i, j, k, **scoreOf; scoreOf = (int **) malloc (sizeof (int *) * numCols); assert (scoreOf); for (i = 0; i < numCols; i++){ scoreOf[i] = (int *) malloc (sizeof (int) * numRanges); assert (scoreOf[i]); for (j = 0; j < numRanges; j++){ scoreOf[i][j] = 0; if (j > 0){ for (k = first[j]; k <= last[j]; k++){ scoreOf[i][j] += score[k][cols[i]]; } 
} } } return scoreOf; } void solveOrder (int **score, int numContigs, int numHits, int *cols, int numCols, int **ranges, int **results, int *resultCtr){ int i, j, k, l, m; int numStates = (1 << numCols), numRanges; int **best, *first, *last, ptr, newScore, **scoreOf; int bestScore = 0, bestState, bestRange, newBest, addedScore; int *stateList, *rangeList, *scoreList; int work, totwork; numRanges = makeRanges (score, numHits, cols, numCols, &first, &last); if ((long long int) numRanges * (long long int) numStates > MAX_CELLS || (long long int) numRanges * (long long int) numStates * (long long int) numCols * (long long int) numRanges > MAX_TIME){ fprintf (stderr, "ordering failed, retrying... (numRanges = %d, numStates = %d)\n", numRanges, numStates); printf ("ordering failed\n"); failed = 1; return; } best = (int **) malloc (sizeof (int *) * numStates); assert (best); for (i = 0; i < numStates; i++){ best[i] = (int *) calloc (numRanges, sizeof (int)); assert (best[i]); } for (i = 0; i < numStates; i++) best[i][0] = 0; for (j = 1; j < numRanges; j++) best[0][j] = 0; scoreOf = calcRangeScores (score, cols, numCols, first, last, numRanges); // -- DP solution --------------- work = 0; totwork = (numRanges - 1) * (numStates - 1); // search over all state transitions for (i = 1; i < numRanges; i++){ for (j = 1; j < numStates; j++){ newBest = -1; // compute best previous state for (k = 0; k < numCols; k++) if (j & (1 << k)){ m = j - (1 << k); addedScore = scoreOf[k][i]; for (l = 0; l < numRanges; l++) if (last[l] < first[i]){ newScore = best[m][l] + addedScore; if (newScore > newBest){ newBest = newScore; } } } best[j][i] = newBest; if (best[j][i] > bestScore){ bestScore = best[j][i]; bestState = j; bestRange = i; } work++; if ((work % 100000) == 0){ fprintf (stderr, "WORKING %d/%d\n", work, totwork); } } } // -- Compute traceback --------- l = 0; stateList = (int *) calloc (numCols, sizeof (int)); assert (stateList); rangeList = (int *) calloc (numCols, sizeof (int)); 
assert (rangeList); scoreList = (int *) calloc (numCols, sizeof (int)); assert (scoreList); while (bestState != 0){ k = 1; for (i = 0; k && i < numCols; i++) if (bestState & (1 << i)){ m = bestState - (1 << i); for (j = 0; k && j < numRanges; j++) if (last[j] < first[bestRange]){ newScore = best[m][j] + scoreOf[i][bestRange]; if (newScore == best[bestState][bestRange]){ stateList[l] = cols[i]; rangeList[l] = bestRange; scoreList[l] = scoreOf[i][bestRange]; l++; bestState = m; bestRange = j; k = 0; } } } } // -- Report traceback ---------- for (i = l - 1; i >= 0; i--){ results[*resultCtr][0] = stateList[i]; results[*resultCtr][1] = ranges[first[rangeList[i]]][0]; results[*resultCtr][2] = ranges[last[rangeList[i]]][1]; results[*resultCtr][3] = scoreList[i]; (*resultCtr)++; } for (i = 0; i < numCols; i++) free (scoreOf[i]); free (scoreOf); for (i = 0; i < numStates; i++) free (best[i]); free (best); free (first); free (last); free (stateList); free (rangeList); free (scoreList); } int compFn (const void *a, const void *b){ return (*(int **) a)[1] - (*(int **) b)[1]; } void findGroups (int numContigs, int seqLen, int numHits, int **score, int **ranges){ int *labels, group, pos, i; int *columns, **results, resultCtr = 0; labels = getLabels (score, numContigs, numHits); columns = (int *) malloc (sizeof (int) * numContigs); assert (columns); results = (int **) malloc (sizeof (int *) * numContigs); assert (results); for (i = 0; i < numContigs; i++){ results[i] = (int *) calloc (4, sizeof (int)); assert (results[i]); } group = pos = 0; while (!failed){ for (i = 0; i < numContigs; i++){ if (labels[i] == group) columns[pos++] = i; } if (pos == 0) break; solveOrder (score, numContigs, numHits, columns, pos, ranges, results, &resultCtr); pos = 0; group++; } if (!failed){ qsort (results, resultCtr, sizeof (int *), compFn); for (i = 0; i < resultCtr; i++){ printf ("%d --> (%d %d) %d\n", results[i][0], results[i][1], results[i][2], results[i][3]); } } for (i = 0; i < numContigs; 
i++) free (results[i]); free (results); free (labels); free (columns); } int main (int argc, char **argv){ int numContigs, seqLen, numHits, i; int **score, **ranges; if (argc != 2){ fprintf (stderr, "Usage:\ncontigorder rangefile\n"); exit (1); } getFileInfo (argv[1], &numContigs, &seqLen, &numHits); //fprintf (stderr, "numContigs = %d, seqLen = %d, numHits = %d\n", numContigs, seqLen, numHits); getScores (argv[1], numContigs, seqLen, numHits, &score, &ranges); findGroups (numContigs, seqLen, numHits, score, ranges); for (i = 0; i < numHits; i++){ free (score[i]); free (ranges[i]); } free (score); free (ranges); return 0; } lagan20/src/utils/cstat.c0000644000076500007650000001377010502337062016337 0ustar brudnobrudno00000000000000#include #include #include #include #include #define MAX_SEQ 31 #define MAX(a,b) ((a)>(b)?(a):(b)) #define MIN(a,b) ((a)<(b)?(a):(b)) #define CNTS_LEN 6 #define CNTS_A 0 #define CNTS_T 1 #define CNTS_C 2 #define CNTS_G 3 #define CNTS_N 4 #define CNTS_GAP 5 double logs[MAX_SEQ+1]; double maxentr; char* alpha = "ATCGN-"; int s1shift = 0, s2shift = 0; typedef struct pair_ints { int s; int e; } pair; typedef struct align_res { char* names[MAX_SEQ]; int algnlen; int numseq; int* algn; char* cnts[CNTS_LEN]; } align; int cntlets(FILE* input) { int numread=0; char temp[256]; char currchar = '~'; if (feof(input)) return 0; fgets(temp, 255, input); if (temp[0] != '>') { fprintf(stderr, "File is not in FASTA format!!\n"); exit(1); } while ((currchar != '>') && (currchar != EOF)) { currchar = fgetc(input); if (!isspace(currchar)) { currchar = toupper(currchar); numread++; } } rewind(input); return numread-1; } int readseq(FILE* input, align* myal, int seqnum, int checksum) { int numread=0, help; char temp[256]; char currchar; if (feof(input)) return 0; fgets(temp, 255, input); if (temp[0] != '>') { fprintf(stderr, "File is not in FASTA format!!\n"); exit(1); } myal->names[seqnum] = (char*) malloc((strlen(temp))*sizeof(char)); 
strcpy(myal->names[seqnum], temp+1); *(strchr(myal->names[seqnum], '\n')) = 0; currchar = fgetc(input); while (numread <= checksum &&(currchar != '>') && (currchar != EOF)) { if (!isspace(currchar)) { currchar = toupper(currchar); if (!strchr(alpha, currchar)) { // fprintf(stderr, "WARNING %c converted to N\n", currchar, alpha); currchar = 'N'; } help = strchr(alpha, currchar)-alpha; myal->cnts[help][numread]++; if (help != CNTS_GAP) { myal->algn[numread] |= (1 << seqnum); } numread++; } currchar = fgetc(input); } if (currchar == '>') ungetc(currchar, input); if (numread != checksum) { fprintf(stderr, "Sequence (%s) of different lengths (%d v. %d)!!\n", myal->names[seqnum], numread, checksum); exit(1); } return 1; } align* readMultial(FILE* alfile) { int letcnt = cntlets(alfile), i, j; align* res = (align*)malloc (sizeof(align)); res->algn = (int*) malloc (sizeof(int)* letcnt); for (j=0; jcnts[j] = (char*) malloc (sizeof(char)* letcnt); for (i=0; ialgn[i] = 0; for (j=0; jcnts[j][i] = 0; } i = 0; while (readseq(alfile, res, i++, letcnt)) ; res->numseq = i-1; res->algnlen = letcnt; return res; } inline int getScore (align* a, int i){ return ((a->cnts[0][i] * (a->cnts[0][i] - 1)) + (a->cnts[1][i] * (a->cnts[1][i] - 1)) + (a->cnts[2][i] * (a->cnts[2][i] - 1)) + (a->cnts[3][i] * (a->cnts[3][i] - 1))) / 2; } void skipto (align *myal, int trgt, int *i, int* pos){ int j; while (*i < trgt){ for (j = 0; j < myal->numseq; j++) pos[j] += (myal->algn[*i] & (1 << j)) > 0; (*i)++; } } void print (align *myal, int *first, int *last, int len){ int *start, *end, i, j, s = 0, e = 0; start = (int *) malloc (sizeof (int) * myal->numseq); assert (start); end = (int *) malloc (sizeof (int) * myal->numseq); assert (end); for (i = 0; i < myal->numseq; i++) start[i] = end[i] = 0; for (i = 0; i < len; i++){ skipto (myal, first[i], &s, start); skipto (myal, last[i], &e, end); printf ("(%d %d) --> ", first[i] + s1shift, last[i] + s1shift); if (myal->numseq == 2){ printf ("(%d %d)%s", start[0] 
+ s1shift, end[0] + s1shift, (0 == myal->numseq - 1) ? "\n" : ", "); printf ("(%d %d)%s", start[1] + s2shift, end[1] + s2shift, (1 == myal->numseq - 1) ? "\n" : ", "); } else { for (j = 0; j < myal->numseq; j++){ printf ("(%d %d)%s", start[0], end[0], (j == myal->numseq - 1) ? "\n" : ", "); } } // this is a hack -- can't handle multiple seq's /* for (j = 0; j < myal->numseq; j++){ printf ("(%d %d)%s", start[j], end[j], (j == myal->numseq - 1) ? "\n" : ", "); } */ } free (start); free (end); } void analyze (align *myal, int cutoff, int window){ int *first, *last, size = 1, len = 0, i, score, count = 0; int runstart = -1, numpairs = myal->numseq * (myal->numseq - 1) / 2; window = MIN (window, myal->algnlen); first = (int *) malloc (size * sizeof (int)); assert (first); last = (int *) malloc (size * sizeof (int)); assert (last); score = 0; for (i = 0; i < window; i++) score += getScore (myal, i); if (score * 100 >= window * numpairs * cutoff) runstart = 0; for (i = 1; i <= myal->algnlen - window; i++){ score += getScore (myal, i + window - 1) - getScore (myal, i - 1); if (score * 100 >= window * numpairs * cutoff){ if (runstart == -1){ if (len > 0 && last[len - 1] >= i) runstart = first[--len]; else runstart = i; } } else if (runstart >= 0){ first[len] = runstart; last[len++] = i + window - 1; runstart = -1; if (len == size){ size *= 2; first = (int *) realloc (first, sizeof (int) * size); assert (first); last = (int *) realloc (last, sizeof (int) * size); assert (last); } } } if (runstart >= 0){ first[len] = runstart; last[len++] = myal->algnlen - 1; } for (i = 0; i < len; i++){ count += last[i] - first[i]; } printf ("%d\n", count); print (myal, first, last, len); free (first); free (last); } int main(int argc, char** argv) { FILE *alignfile; align* myal; int i; if (argc != 4 && argc != 7) { fprintf(stderr, "usage:\ncstat multi_fasta_file cutoff window_size [-shift s1shift s2shift]\n"); exit(1); } if (!(alignfile = fopen(argv[1],"r"))) { fprintf(stderr, "couldnt open 
alignment file %s\n",argv[1]); return 2; } if (argc == 7){ s1shift = atoi (argv[5]); s2shift = atoi (argv[6]); } myal = readMultial(alignfile); analyze (myal, atoi (argv[2]), atoi (argv[3])); } lagan20/src/utils/dotplot.cpp0000644000076500007650000000523310502337062017241 0ustar brudnobrudno00000000000000#include #include int main (int argc, char **argv){ FILE *file; int s1b, s1e, s2b, s2e, pa, pb, maxa = 0, maxb = 0; float score; char buffer[105]; char* name1 = NULL; char* name2 = NULL; char dummy[] = "unknown"; int PAD, PAD2; if (argc < 2){ fprintf (stderr, "Usage: dotplot anchfile [name1 [name2]] \n"); exit(1); } if (argc > 2) name1 = argv[2]; if (argc > 3) name2 = argv[3]; if (name1 == NULL) name1 = dummy; if (name2 == NULL) name2 = dummy; pa = -1; pb = -1; file = fopen (argv[1], "r"); while (!feof (file)){ if (fscanf (file, "(%d %d)=(%d %d) %f", &s1b, &s1e, &s2b, &s2e, &score) == 5 && s2b > 0){ if (s1b > maxa) maxa = s1b; if (s1e > maxa) maxa = s1e; if (s2b > maxb) maxb = s2b; if (s2e > maxb) maxb = s2e; } fgets (buffer, 105, file); } fclose (file); // PAD = maxa / 1000; // PAD2 = maxb / 1000; file = fopen (argv[1], "r"); printf ("set nokey\n"); printf ("set xlabel \"%s\"\n", name1); printf ("set ylabel \"%s\"\n", name2); printf ("set title \"Dotplot: %s vs. 
%s\"\n", name1, name2); printf ("set style line 1 linetype 3 linewidth 3\n"); printf ("set style line 2 linetype 1 linewidth 4\n"); while (!feof (file)){ if (fscanf (file, "(%d %d)=(%d %d) %f", &s1b, &s1e, &s2b, &s2e, &score) == 5 && s2b > 0){ if (s1b > maxa) maxa = s1b; if (s1e > maxa) maxa = s1e; if (s2b > maxb) maxb = s2b; if (s2e > maxb) maxb = s2e; if (s2b < s2e){ // draw forward aligns PAD = (s1e-s1b)* 2/10; PAD2 = (s2e-s2b)* 2/10; printf ("set arrow from %d,%d to %d,%d nohead ls 1\n", s1b-PAD, s2b-PAD2, s1e+PAD, s2e+PAD2); // draw connections // if (pa != -1 && pb != -1) // printf ("set arrow from %d,%d to %d,%d nohead lt -1 lw 0.01\n", pa, pb, s1b, s2b); pa = s1e; pb = s2e; } } fgets (buffer, 105, file); } fclose (file); file = fopen (argv[1], "r"); while (!feof (file)){ if (fscanf (file, "(%d %d)=(%d %d) %f", &s1b, &s1e, &s2b, &s2e, &score) == 5 && s2b > 0){ if (s2b > s2e){ // draw rev aligns PAD = (s1e-s1b)* 2/10; PAD2 = (s2b-s2e)* 2/10; printf ("set arrow from %d,%d to %d,%d nohead ls 2\n", s1b-PAD2, s2b+PAD2, s1e+PAD2, s2e-PAD2); // draw connections // if (pa != -1 && pb != -1) // printf ("set arrow from %d,%d to %d,%d nohead lt -1 lw 0.01\n", pa, pb, s1b, s2b); pa = s1e; pb = s2b; } } fgets (buffer, 105, file); } printf ("plot [1:%d][1:%d] -1\n", maxa * 11/10, maxb*11/10); printf ("set terminal postscript enhanced color\n"); printf ("set output \"sin.ps\"\n"); printf ("replot\n"); fclose (file); } lagan20/src/utils/draft.pl0000755000076500007650000001711110502337062016506 0ustar brudnobrudno00000000000000#!/usr/bin/env perl use File::Basename; $lazyflag = 0; $lagandir = $ENV{LAGAN_DIR}; $recurfl = "-recurse \"(12,0,30,0)x,(13,1,30,0)x,(3,0,30,0)xt,(8,1,30,0)x,(7,1,30,0)x,(7,1,15,0)x\""; $laganparams = "-maskedonly "; $anchgapstart = -5; $anchgapcont = -0.2; $usebounds = 1; $startingrate = 65; $rateinc = 1; $frlevel = ""; $pid = "mergedir"; if (@ARGV < 2) { if ((@ARGV == 1) && ($ARGV[0] =~ /-version/)){ print STDERR "DRAFT version 0.1\n"; exit (0); } 
else { print STDERR ("Usage:\n\ndraft.pl SEQFILE MFAFILE [-cons RATE] [-translate] [-version]\n"); exit (1); } } $arglist = ""; $skipfr = 0; for ($i = 2; $i < @ARGV; $i++) { if ($ARGV[$i] =~ /-recurse/){ $recurfl = " -recurse \"".$ARGV[++$i]."\""; } elsif ($ARGV[$i] =~ /-skipfr/){ $skipfr = 1; $pid = $ARGV[++$i]; chomp $pid; } elsif ($ARGV[$i] =~ /-translate/){ $recurfl = $recurfl." -translate"; } elsif ($ARGV[$i] =~ /-cons/){ $startingrate = $ARGV[++$i]; chomp $startingrate; } elsif ($ARGV[$i] =~ /-lazy/){ $lazyflag = 1; } elsif ($ARGV[$i] =~ /-fastreject/){ $frarg = " -fastreject $frlevel"; } else { print STDERR "Bad arg to draft: $ARGV[$i]"; } } $arglist = "$arglist $recurfl -usebounds $laganparams $frarg"; # create new directory $newdir = `pwd`; chomp $newdir; $newdir = "$newdir/$pid"; `mkdir $newdir` if (!(-e $newdir)); open (LOGFILE, ">$newdir/log"); print STDERR ("\n"); print STDERR ("Finding Contig Alignments\n"); print STDERR ("-------------------------\n"); print LOGFILE ("\n"); print LOGFILE ("Finding Contig Alignments\n"); print LOGFILE ("-------------------------\n"); # extract contigs; $contigfile = basename ($ARGV[1]); $contigdir = dirname ($ARGV[1]); `cp $ARGV[1] $newdir`; @contigs = `perl $lagandir/mextract.pl $newdir/$contigfile`; if ($?) { exit(1);} for ($i = 0; $i < @contigs; $i++){ chomp $contigs[$i]; `$lagandir/utils/rc < $contigs[$i] > $contigs[$i].rc`; if ($?) { exit(1); } } # extract masked contigs $maskedname = $ARGV[1].".masked"; if (-e $maskedname){ $maskedcontigfile = basename ($maskedname); `cp $maskedname $newdir`; @maskedcontigs = `perl $lagandir/mextract.pl $newdir/$maskedcontigfile -masked`; if ($?) { exit(1);} for ($i = 0; $i < @maskedcontigs; $i++){ chomp $maskedcontigs[$i]; `$lagandir/utils/rc < $maskedcontigs[$i] > $contigs[$i].rc.masked`; if ($?) 
{ exit(1); } } } # create file storing name of contig stats open (LFILE, ">$newdir/filenames") if (!$lazyflag); $num = 0; for ($i = 0; $i < @contigs; $i++){ chomp $contigs[$i]; $skip1 = $skip2 = 0; # make alignments if (!$lazyflag || !(-e "$contigs[$i].mfa")){ $execute = "perl $lagandir/lagan.pl $ARGV[0] $contigs[$i] -mfa $arglist -out $contigs[$i].mfa"; $execute = $execute." -gap $anchgapstart $anchgapcont" if ($usebounds); `$execute`; $ex_val = $? >> 8; if (!(-e "$contigs[$i].mfa")) { $skip1 = 1; } elsif ($?) { exit(1);} if (!$skip1 && $usebounds){ # compute bounds @bounds = `$lagandir/utils/getbounds anchs.final $ARGV[0] $contigs[$i]`; if ($?) { exit(1);} $bounds[0] =~ /-s1 (\d+) (\d+) -s2 (\d+) (\d+)/; $s1shift = $1 - 1; $s2shift = $3 - 1; } `rm anchs.final`; } if (!$lazyflag || !(-e "$contigs[$i].rc.mfa")){ $execute = "perl $lagandir/lagan.pl $ARGV[0] $contigs[$i].rc -mfa $arglist -out $contigs[$i].rc.mfa"; $execute = $execute." -gap $anchgapstart $anchgapcont" if ($usebounds); `$execute`; $ex_val = $? >> 8; if (!(-e "$contigs[$i].rc.mfa")) { $skip2 = 1; } elsif ($?) { exit(1);} if (!$skip2 && $usebounds){ # compute bounds @bounds = `$lagandir/utils/getbounds anchs.final $ARGV[0] $contigs[$i].rc`; if ($?) { exit(1);} $bounds[0] =~ /-s1 (\d+) (\d+) -s2 (\d+) (\d+)/; $s1rcshift = $1 - 1; $s2rcshift = $3 - 1; } `rm anchs.final`; } if ($skip1) { $fscore = 0; } else { $fscore = `$lagandir/utils/scorealign $contigs[$i].mfa $startingrate`; chomp $fscore; if ($?) { exit(1);} } if ($skip2) { $bscore = 0; } else { $bscore = `$lagandir/utils/scorealign $contigs[$i].rc.mfa $startingrate`; chomp $bscore; if ($?) 
{ exit(1);} } # pick strand # print LFILE "$s1shift $contigs[$i].mfa\n" if (!$lazyflag); # print LFILE "$s1rcshift $contigs[$i].rc.mfa\n" if (!$lazyflag); # if (0){ if ($fscore > 0 || $bscore > 0){ $j = $i + 1; if ($fscore > $bscore){ print STDERR ("(+) direction preferred for Contig \"$contigs[$i]\": $fscore > $bscore\n"); print LOGFILE ("(+) direction preferred for Contig \"$contigs[$i]\": $fscore > $bscore\n"); print LFILE "$j $s1shift $s2shift $contigs[$i].mfa\n" if (!$lazyflag); print STDERR "$j $s1shift $s2shift $contigs[$i].mfa\n" if (!$lazyflag); } elsif ($bscore > $fscore){ print STDERR ("(-) direction preferred for Contig \"$contigs[$i]\": $fscore < $bscore\n"); print LOGFILE ("(-) direction preferred for Contig \"$contigs[$i]\": $fscore < $bscore\n"); print LFILE "$j $s1rcshift $s2rcshift $contigs[$i].rc.mfa\n" if (!$lazyflag); print STDERR "$j $s1rcshift $s2rcshift $contigs[$i].rc.mfa\n" if (!$lazyflag); } } # } else { print STDERR ("Contig \"$contigs[$i]\" could not be matched: $fscore, $bscore\n"); print LOGFILE ("Contig \"$contigs[$i]\" could not be matched: $fscore, $bscore\n"); } } close (LFILE); print STDERR ("\n"); print STDERR ("Computing Contig Ordering\n"); print STDERR ("-------------------------\n\n"); print LOGFILE ("\n"); print LOGFILE ("Computing Contig Ordering\n"); print LOGFILE ("-------------------------\n\n"); $foundorder = 0; for ($cutoff = $startingrate; !$foundorder && ($cutoff < 100); $cutoff += $rateinc){ `$lagandir/utils/scorecontigs /$newdir/filenames $ARGV[0] $newdir/contignames $cutoff > $newdir/ranges`; if ($?) { exit(1);} @list = `cat $newdir/ranges`; $list[0] =~ /numContigs = (\d+)/; next if ($1 == 0); `$lagandir/utils/contigorder $newdir/ranges > $newdir/corder`; if ($?) 
{ exit(1);} @list = `cat $newdir/corder`; chomp $list[0]; $foundorder = 1 if ($list[0] ne "ordering failed"); } if ($foundorder){ open (OFILE, ">$newdir/draft"); print OFILE ("Draft Ordering\n"); print OFILE ("--------------\n\n"); @contignames = `cat $newdir/contignames`; for ($i = 0; $i < @contignames; $i++){ $contignames[$i] =~ /(\d+) (\d+) (\d+) (.*)/; $num[$i] = $1; chomp $num[$i]; $s1shifts[$i] = $2; chomp $s1shifts[$i]; $s2shifts[$i] = $3; chomp $s2shifts[$i]; $filenames[$i] = $4; chomp $filenames[$i]; } @list = `cat $newdir/corder`; for ($i = 0; $i < @list; $i++){ $list[$i] =~ /(\d+) --\> \((\d+) (\d+)\) (.*)/; $score = $4; chomp $score; print OFILE ("$filenames[$1] --> ($2 $3) score=$score, offset=($s1shifts[$1] $s2shifts[$1]), index=$num[$1]\n"); } close (OFILE); print STDERR `cat $newdir/draft`; print LOGFILE `cat $newdir/draft`; close (LOGFILE); } else { print STDERR "Could not compute ordering."; print LOGFILE "Could not compute ordering."; close (LOGFILE); exit (0); } $filename1 = $ARGV[0]; $filename2 = "$newdir/$contigfile"; `$lagandir/cmerge2.pl $filename1 $filename2 $newdir/draft $filename2.merged -skipfr $pid`; if ($?) 
{ exit(1); } print STDERR "EXECUTE $lagandir/cmerge2.pl $filename1 $filename2 $newdir/draft $filename2.merged -skipfr $pid\n"; `cp $filename2.merged merged_seq.fa`; `cp $filename2.merged.masked merged_seq.fa.masked`; `cp $newdir/minfo minfo`; `cp $newdir/ranges ranges`; `cp $newdir/log log`; print STDERR ("\n"); print STDERR ("Computing Final Alignment\n"); print STDERR ("-------------------------\n\n"); # `rm -rf $newdir`; lagan20/src/utils/fa2xfa.c0000644000076500007650000000562710502337062016372 0ustar brudnobrudno00000000000000#include #include #include #include #include int begin, finish, seqIdx, seqExt, seqlen, numseqs, seqlen2, numseqs2; int rcflag = 0; char name[1024], name2[1024], **seqs, **seqs2; char comp(char a) { if (!rcflag) return a; switch (a) { case 'A': case 'a': return 'T'; case 'T': case 't': return 'A'; case 'C': case 'c': return 'G'; case 'G': case 'g': return 'C'; case 'N': case 'n': return 'N'; } fprintf (stderr, "bad letter to RC %c\n",a); exit(2); } int getLength (char *filename){ FILE *file; char buffer[1024], ch; int length = 0; file = fopen (filename, "r"); assert (file); fgets (buffer, 1024, file); while (!feof (file)){ ch = fgetc (file); if (ch == '>') break; if (((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) || ch == '.' 
|| ch == '-') length++; } fclose (file); return length; } void readfile (char *filename, int *seqlen, int *numseqs, char *name, char ***seqs){ FILE *file; char buffer[1024], ch; int i; *numseqs = 0; *seqlen = getLength (filename); strcpy (name, ""); *seqs = (char **) malloc (sizeof (char *) * 1); assert (*seqs); (*seqs)[0] = (char *) malloc (sizeof (char) * (*seqlen)); file = fopen (filename, "r"); assert (file); while (!feof (file)){ i = 0; fgets (buffer, 1024, file); if (strlen (name) == 0) strcpy (name, buffer); if (feof (file)) break; (*numseqs)++; if (*numseqs > 1){ *seqs = (char **) realloc (*seqs, sizeof (char *) * (*numseqs)); assert (*seqs); (*seqs)[*numseqs - 1] = (char *) malloc (sizeof (char) * (*seqlen)); assert ((*seqs)[*numseqs - 1]); } while (!feof (file)){ ch = fgetc (file); if (ch == '>') break; ch = toupper(ch); if (((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) || (ch == '.') || (ch == '-')){ // assert (i < (*seqlen)); (*seqs)[*numseqs - 1][i] = ch; i++; } } if (ch == '>') ungetc (ch, file); assert (i == *seqlen); } fclose (file); } void print (void){ int i = 0, pos = 0, written = 0, j = 0; assert (seqExt >= 0 && seqExt < numseqs); name[0] = ' '; printf (">%d:%d-%d %c %s", seqIdx, begin+1, finish, (rcflag)?'-':'+', name); for (i = begin; i < finish; i++) { printf ("%c", comp(seqs[seqExt][(rcflag)?(finish+begin-i-1):i])); written++; if (written % 60 == 0) printf ("\n"); } if (written % 60 != 0) printf ("\n"); } int main (int argc, char** argv){ int i; if (argc != 5 && !(argc == 6 && strcmp (argv[5], "-rc") == 0)){ fprintf (stderr, "Usage:\n\nfa2xfa fasta_file begin end seqid [-rc]\n"); exit (1); } seqExt = 0; begin = atoi (argv[2])-1; finish = atoi (strdup(argv[3])); seqIdx = atoi (argv[4]); if (argc == 6) rcflag = 1; seqlen2 = 0; readfile (argv[1], &seqlen, &numseqs, name, &seqs); print (); for (i = 0; i < numseqs; i++) free (seqs[i]); free (seqs); } lagan20/src/utils/getbounds.c0000644000076500007650000000375110502337062017211 0ustar 
brudnobrudno00000000000000#include #include #include #include #include #define EXPAND 2 inline int max (int a, int b){ if (a > b) return a; return b; } inline int min (int a, int b){ if (a < b) return a; return b; } int getLength (char *filename){ FILE *file; char buffer[1024], ch; int length = 0; file = fopen (filename, "r"); assert (file); fgets (buffer, 1024, file); while (!feof (file)){ ch = fgetc (file); if (ch == '>') break; if (isalpha (ch) || ch == '.') length++; } fclose (file); return length; } int main (int argc, char **argv){ FILE *file; int s1b, s1e, s2b, s2e, i; int S1B, S1E, S2B, S2E, ext, len1, len2; int m1b, m1e, m2b, m2e; float f; if (argc != 4){ fprintf (stderr, "Usage:\n\ngetbounds anchfile seqfile1 seqfile2\n"); exit (1); } file = fopen (argv[1], "r"); assert (file); len1 = getLength (argv[2]); len2 = getLength (argv[3]); m1b = m2b = 1000000000; m1e = m2e = -1000000000; while (!feof (file)){ if (fscanf (file, "(%d %d)=(%d %d) %f\n", &s1b, &s1e, &s2b, &s2e, &f) == 5){ m1b = min (m1b, s1b); m1e = max (m1e, s1e); m2b = min (m2b, s2b); m2e = max (m2e, s2e); } } m1e = len2 - m1e; m2e = len2 - m2e; fclose (file); file = fopen (argv[1], "r"); assert (file); i = 0; while (!feof (file)){ if (fscanf (file, "(%d %d)=(%d %d) %f\n", &s1b, &s1e, &s2b, &s2e, &f) == 5){ if (i == 0){ S1B = max (s1b - m2b * EXPAND, 1); S1E = min (s1e + m2e * EXPAND, len1); S2B = max (s2b - m2b * EXPAND, 1); S2E = min (s2e + m2e * EXPAND, len2); i = 1; } else { S1B = min (S1B, max (s1b - m2b * EXPAND, 1)); S1E = max (S1E, min (s1e + m2e * EXPAND, len1)); S2B = min (S2B, max (s2b - m2b * EXPAND, 1)); S2E = max (S2E, min (s2e + m2e * EXPAND, len2)); } } } if (i == 0){ S1B = 1; S1E = len1; S2B = 1; S2E = len2; } printf ("-s1 %d %d -s2 %d %d\n", S1B, S1E, 1, len2); fclose (file); return 0; } lagan20/src/utils/getcontigpos.c0000644000076500007650000000427710502337062017730 0ustar brudnobrudno00000000000000#include #include #include #include #include int begin, finish, seqIdx, seqExt, 
seqlen, numseqs, seqlen2, numseqs2; char name[1024], name2[1024], **seqs, **seqs2; int getLength (char *filename){ FILE *file; char buffer[1024], ch; int length = 0; file = fopen (filename, "r"); assert (file); fgets (buffer, 1024, file); while (!feof (file)){ ch = fgetc (file); if (ch == '>') break; if (isalpha (ch) || ch == '.' || ch == '-') length++; } fclose (file); return length; } void readfile (char *filename, int *seqlen, int *numseqs, char *name, char ***seqs){ FILE *file; char buffer[1024], ch; int i; *numseqs = 0; *seqlen = getLength (filename); strcpy (name, ""); *seqs = (char **) malloc (sizeof (char *) * 1); assert (*seqs); (*seqs)[0] = (char *) malloc (sizeof (char) * (*seqlen)); file = fopen (filename, "r"); assert (file); while (!feof (file)){ i = 0; fgets (buffer, 1024, file); if (strlen (name) == 0) strcpy (name, buffer); if (feof (file)) break; (*numseqs)++; if (*numseqs > 1){ *seqs = (char **) realloc (*seqs, sizeof (char *) * (*numseqs)); assert (*seqs); (*seqs)[*numseqs - 1] = (char *) malloc (sizeof (char) * (*seqlen)); assert ((*seqs)[*numseqs - 1]); } while (!feof (file)){ ch = fgetc (file); if (ch == '>') break; if (isalpha (ch) || ch == '.' 
|| ch == '-'){ assert (i < (*seqlen)); (*seqs)[*numseqs - 1][i] = ch; i++; } } if (ch == '>') ungetc (ch, file); assert (i == *seqlen); } fclose (file); } void print (void){ int i = 0, pos = 0, pos2 = 0, written = 0, j = 0; while (pos <= finish && i < seqlen){ if (isalpha (seqs[0][i])) pos++; if (isalpha (seqs[1][i])) pos2++; if (pos == finish){ printf ("%d\n", pos2); break; } i++; } } int main (int argc, char** argv){ int i; if (argc == 0){ fprintf (stderr, "Usage:\n\ngetcontigpos multi_fasta_file finished_index\n"); exit (1); } finish = atoi (strdup(argv[2])); readfile (argv[1], &seqlen, &numseqs, name, &seqs); print (); for (i = 0; i < numseqs; i++) free (seqs[i]); free (seqs); } lagan20/src/utils/getlength.c0000644000076500007650000000141410502337062017172 0ustar brudnobrudno00000000000000#include #include #include #include #include #define BUF_SIZE 1024 int main (int argc, char **argv){ FILE *file; char buffer[BUF_SIZE], ch; int length = 0, i, done = 0, nread; if (argc != 2){ fprintf (stderr, "Usage:\n\ngetlength seqfile\n"); exit (1); } file = fopen (argv[1], "r"); assert (file); fgets (buffer, BUF_SIZE, file); while (!feof (file) && !done){ nread = fread (buffer, 1, BUF_SIZE, file); for (i = 0; i < nread; i++){ ch = buffer[i]; if (ch == '>'){ done = 1; break; } if (((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) || ch == '.' 
|| ch == '-') length++; } } fclose (file); printf ("%d\n", length); return 0; } lagan20/src/utils/getoverlap.c0000644000076500007650000000145010502337062017361 0ustar brudnobrudno00000000000000#include #include #define INTMAX (100000000) #define INTMIN (-INTMAX) int max (int a, int b){ if (a > b) return a; return b; } int min (int a, int b){ if (a < b) return a; return b; } int main (int argc, char **argv){ FILE *file; int seq1begin = INTMAX, seq1end = INTMIN, seq2begin = INTMAX, seq2end = INTMIN; int a, b, c, d, e = 0; file = fopen (argv[1], "r"); assert (file); while (!feof (file)){ if (fscanf (file, "(%d %d)=(%d %d) %*f\n", &a, &b, &c, &d) == 4){ seq1begin = min (seq1begin, a); seq1end = max (seq1end, b); seq2begin = min (seq2begin, c); seq2end = max (seq2end, d); e++; } } fclose (file); if (!e) printf ("-1 -1 -1 -1\n"); else printf ("%d %d %d %d\n", seq1begin, seq1end, seq2begin, seq2end); } lagan20/src/utils/Glue.cpp0000644000076500007650000003103310502337062016445 0ustar brudnobrudno00000000000000#include "MultiSequence.h" #include "SafeVector.h" #include "Output.h" #include #include #include #include #include #define NUCLEOTIDE_MATRIX_FILE "nucmatrix.txt" #define MAX_LINE_LENGTH 1024 #define CONS_RATE 70 #define INF 2000000000 #define CNTG_BRK_N 50 typedef SafeVector vi; typedef SafeVector vvi; typedef SafeVector vvvi; MultiSequence seqs; vvi matchScore (256, vi (256, 0)); vvi dad, score; int gapopen, gapcont; int NCtoNC = 0, NCtoCN = -1000, CNtoNC = -1000, CNtoCN = 0; void readScoreMatrix (char *filename){ FILE *file; int i, j, k, numlets = 0; char lets[256], line[1024]; char *lagan_dir; lagan_dir = getenv ("LAGAN_DIR"); if (!lagan_dir){ fprintf (stderr, "Error: $LAGAN_DIR not set.\n"); exit (1); } sprintf (line, "%s/%s", lagan_dir, filename); fprintf (stderr, "%s\n", line); file = fopen (line, "r"); assert (file); fgets (line, 1024, file); for (i = 0; i < (int) strlen (line); i++){ if (!isspace (line[i])){ lets[numlets++] = line[i]; } } for (i = 0; i < 
numlets; i++){ fscanf (file, "%1s", &(line[0])); for (j = 0; j < numlets; j++){ fscanf (file, "%d", &k); matchScore[(unsigned char) line[0]][(unsigned char) lets[j]] = k; } } fscanf (file, "%d%d", &gapopen, &gapcont); fclose (file); } void calculateScoreMatrix (int cons_rate){ char *alpha = "ATCG"; int i, j; for (int i = 0; i < 256; i++) for (int j = 0; j < 256; j++) matchScore[i][j] = 0; if (cons_rate == 0){ readScoreMatrix (NUCLEOTIDE_MATRIX_FILE); return; } double p_ij = (double) cons_rate / 100.0; double match = log (p_ij / 0.25); double mismatch = log ((1 - p_ij) / 0.75); for (i = 0; i < (int) strlen (alpha); i++){ for (j = 0; j < (int) strlen (alpha); j++){ matchScore[(unsigned char) alpha[i]][(unsigned char) alpha[j]] = (i == j) ? (int)(match * 100) : (int)(mismatch * 100); } } gapopen = (int)(-match * 750); gapcont = (int)(-match * 25); // fprintf (stderr, "Using match=%d mismatch=%d gapopen=%d gapcont=%d...\n", // (int)(match*100), (int)(mismatch*100), gapopen, gapcont); } #define NUM_STATES 2 #define NC 0 #define CN 1 void chooseBestOfTwo (int score1, int score2, int ptr1, int ptr2, int &score, int &ptr){ if (score1 >= score2){ score = score1; ptr = ptr1; } else { score = score2; ptr = ptr2; } } void chooseBestOfTwo (int score1, int score2, int &score){ if (score1 >= score2){ score = score1; } else { score = score2; } } int scorePosition (char c, char d, int &isGap){ if (c == '-' && d == '-') return 0; if (c == '-' || d == '-'){ if (isGap) return gapcont; isGap = 1; return gapopen; } isGap = 0; return matchScore[(unsigned char) c][(unsigned char) d]; } int rescoreRegion (Sequence &seq1, Sequence &seq2, int begin, int end){ SafeVector::iterator lets1 = seq1.getIterator(); SafeVector::iterator lets2 = seq2.getIterator(); lets1 += begin - 1; lets2 += begin - 1; int isGap = 0; for (int i = 0; i < NUM_STATES; i++) score[i][begin-1] = dad[i][begin-1] = 0; for (int i = begin; i <= end; i++){ chooseBestOfTwo (score[NC][i-1] + NCtoNC, score[CN][i-1] + CNtoNC, 
score[NC][i]); chooseBestOfTwo (score[NC][i-1] + NCtoCN, score[CN][i-1] + CNtoCN, score[CN][i]); score[CN][i] += scorePosition (*(++lets1), *(++lets2), isGap); } chooseBestOfTwo (score[NC][end], score[CN][end], isGap); return isGap; } void getNucLabels (Sequence &seq1, Sequence &seq2, vi &nucLabels){ SafeVector::iterator lets1 = seq1.getIterator(); SafeVector::iterator lets2 = seq2.getIterator(); int seqLen = seq1.getLength(); int isGap = 0; nucLabels = vi (seqLen+1, 0); for (int i = 0; i < NUM_STATES; i++) score[i][0] = dad[i][0] = 0; for (int i = 1; i <= seqLen; i++){ chooseBestOfTwo (score[NC][i-1] + NCtoNC, score[CN][i-1] + CNtoNC, NC, CN, score[NC][i], dad[NC][i]); chooseBestOfTwo (score[NC][i-1] + NCtoCN, score[CN][i-1] + CNtoCN, NC, CN, score[CN][i], dad[CN][i]); score[CN][i] += scorePosition (*(++lets1), *(++lets2), isGap); } chooseBestOfTwo (score[NC][seqLen], score[CN][seqLen], NC, CN, isGap, nucLabels[seqLen]); for (int i = seqLen - 1; i >= 1; i--){ nucLabels[i] = dad[nucLabels[i+1]][i]; } } int getSeqCoord (int seq, int pos){ SafeVector::iterator lets = seqs[seq].getIterator(); int j = 0; for (int i = 1; i <= pos; i++) if (*(++lets) != '-') j++; return j; } void printCoordinates (int seq, int begin, int end){ cout << seqs[seq].getID() << ":" << getSeqCoord(seq, begin) << "-" << getSeqCoord(seq, end) << " "; } int printRegion (int begin, int end){ int score = 0; int numSeqs = seqs.getNumSeqs(); for (int i = 0; i < numSeqs; i++){ printCoordinates (i, begin, end); for (int j = i+1; j < numSeqs; j++){ score += rescoreRegion (seqs[i], seqs[j], begin, end); } } cout << score << endl; return score; } void scoreAlign (){ int numSeqs = seqs.getNumSeqs(); int seqLen = seqs[0].getLength(); vvvi nucLabels (numSeqs, vvi (numSeqs, vi())); for (int i = 0; i < numSeqs; i++){ for (int j = i+1; j < numSeqs; j++){ getNucLabels (seqs[i], seqs[j], nucLabels[i][j]); } } int begin = -1, end = -1, score = 0; for (int i = 1; i <= seqLen+1; i++){ int conserved = 1; if (i == 
seqLen+1) conserved = 0; else { for (int j = 0; conserved && j < numSeqs; j++) for (int k = j+1; conserved && k < numSeqs; k++) conserved = nucLabels[j][k][i]; } if (conserved){ if (begin == -1) begin = i; } else { if (begin != -1){ end = i-1; score += printRegion (begin, end); begin = end = -1; } } } cout << "= score=" << score << endl; } int countLets (SafeVector &data){ int ct = 0; for (int i = 0; i < (int) data.size(); i++){ if (data[i] >= 'A' && data[i] <= 'Z' || data[i] >= 'a' && data[i] <= 'z') ct++; } return ct; } int findSplit (SafeVector &data1, SafeVector &data2, int overlap, SafeVector &data1a, SafeVector &data2a){ int offs1 = data1.size(), num1 = 0; for (int i = (int) data1.size() - 1; i >= 0; i--){ if (overlap == 0) break; if (isalpha(data1[i])) num1++; if (num1 == overlap){ offs1 = i; break; } } int offs2 = 0; num1 = 0; for (int i = 0; i < (int) data2.size(); i++){ if (overlap == 0) break; if (isalpha(data2[i])) num1++; if (num1 == overlap){ offs2 = i; break; } } SafeVector score1 (overlap+1, 0); SafeVector score2 (overlap+1, 0); int score = 0; for (int ct = 0,i=0; ct < overlap;i++){ if (isalpha(data1[i+offs1])) ct++; score += (data1[i+offs1] == data1a[i+offs1]) ? 18 : -8; score1[ct] = score; } score = 0; for (int ct = 0,i=0; ct < overlap;i++){ if (isalpha(data2[offs2-i])) ct++; score += (data2[offs2-i] == data2a[offs2-i]) ? 
18 : -8; score2[ct] = score; } int j = 0, best = -1000000; for (int i = 0; i <= overlap; i++){ if (score1[i] + score2[overlap-i] > best){ best = score1[i] + score2[overlap-i]; j = i; } } // fprintf (stderr, "0 <= %d <= %d\n", j, overlap); return j; } template int chopLeft (SafeVector &data1, SafeVector &data2, int num, bool inAlign){ int num1 = 0, here = -1; if (inAlign) here = num - 1; else { for (int i = 0; i < (int) data1.size(); i++){ if (num == 0) break; if (isalpha(data1[i])) num1++; if (num1 == num){ here = i; break; } } } int chopped = here + 1; for (int i = here + 1; i < (int) data1.size(); i++){ data1[i - chopped] = data1[i]; data2[i - chopped] = data2[i]; } data1.resize ((int) data1.size() - chopped); data2.resize ((int) data2.size() - chopped); return chopped; } template int chopRight (SafeVector &data1, SafeVector &data2, int num, bool inAlign){ int num1 = 0, here = data1.size(); if (inAlign) here = data1.size() - num; else { for (int i = (int) data1.size() - 1; i >= 0; i--){ if (num == 0) break; if (isalpha(data1[i])) num1++; if (num1 == num){ here = i; break; } } } int ret = (int) data1.size() - here; data1.resize (here); data2.resize (here); return ret; } template SafeVector merge (SafeVector &data1, SafeVector &data2){ SafeVector temp; for (int i = 0; i < (int) data1.size(); i++) temp.push_back (data1[i]); for (int i = 0; i < (int) data2.size(); i++) temp.push_back (data2[i]); return temp; } int main (int argc, char **argv){ FILE* outfile; if (argc < 2 || argc > 3){ cerr << "Usage: Glue align.mfa \n" << endl; exit (1); } if (argc == 3) { if (!(outfile = fopen (argv[2], "w"))) { fprintf (stderr, "couldn't open %s for writing\n", argv[2]); exit(1); } } else outfile = stderr; // calculateScoreMatrix (CONS_RATE); SafeVector merged1, merged2; SafeVector strand; SafeVector merged1label, merged2label; int begin1 = 1, end1 = 1; ifstream data (argv[1]); int alignNum = 0; strand.push_back ('?'); // nothing for alignNum 0 while (true){ seqs = MultiSequence(); 
seqs.addRawFromMFA (data); if (seqs.getNumSeqs() != 2) break; alignNum++; strand.push_back (seqs[1].getStrand()); if (alignNum == 1){ begin1 = seqs[0].getStartCoord(); end1 = seqs[0].getEndCoord(); merged1 = seqs[0].getData(); merged1label = SafeVector((int) merged1.size(), 1); merged2 = seqs[1].getData(); merged2label = SafeVector((int) merged2.size(), 1); continue; } int b1 = seqs[0].getStartCoord(); int e1 = seqs[0].getEndCoord(); SafeVector seqs0; SafeVector seqs1; seqs0 = seqs[0].getData(); seqs1 = seqs[1].getData(); SafeVector seqs0label((int) seqs0.size(), alignNum); SafeVector seqs1label((int) seqs1.size(), alignNum); int overlap = e1 - begin1 + 1; if (overlap > 0){ int numLeft = findSplit (seqs0, merged1, overlap, seqs1, merged2); int numRight = overlap - numLeft; int choppedLeft = chopLeft (merged1, merged2, numLeft, false); int choppedRight = chopRight (seqs0, seqs1, numRight, false); chopLeft (merged1label, merged2label, choppedLeft, true); chopRight (seqs0label, seqs1label, choppedRight, true); } else if (overlap < 0){ SafeVector temp1 (-overlap, 'N'); SafeVector temp2 (-overlap, 'N'); merged1 = merge (temp1, merged1); merged2 = merge (temp2, merged2); SafeVector temp1label (-overlap, 0); SafeVector temp2label (-overlap, 0); merged1label = merge (temp1label, merged1label); merged2label = merge (temp2label, merged2label); } merged1 = merge (seqs0, merged1); merged2 = merge (seqs1, merged2); merged1label = merge (seqs0label, merged1label); merged2label = merge (seqs1label, merged2label); //seqs[0].writeXMFAHeader(cerr); begin1 = b1; if (data.eof()) break; if (data.peek() == '=') data.ignore (MAX_LINE_LENGTH, '\n'); if (data.eof()) break; } SafeVector temp1 (begin1 - 1, 'N'); SafeVector temp2 (begin1 - 1, '-'); for (int i = 0; i < min ((int) temp2.size(), CNTG_BRK_N); i++) temp2[i] = 'N'; merged1 = merge (temp1, merged1); merged2 = merge (temp2, merged2); SafeVector temp1label (begin1 - 1, 0); SafeVector temp2label (begin1 - 1, 0); merged1label = merge 
(temp1label, merged1label); merged2label = merge (temp2label, merged2label); for (int i = 1; i <= alignNum; i++){ int min1 = INF, max1 = 0, min2 = INF, max2 = 0; int pos1 = 0, pos2 = 0; for (int j = 0; j < (int) merged1label.size(); j++){ if (isalpha(merged1[j])) pos1++; if (isalpha(merged2[j])) pos2++; if (merged1label[j] == i){ min1 = min (min1, pos1); max1 = max (max1, pos1); } if (merged2label[j] == i){ min2 = min (min2, pos2); max2 = max (max2, pos2); } } //[FASTA line for this contig in the original sequence file] //n baseFrom baseTo mergedFrom mergedTo startChop endChop {+,-} score secFrom secTo fprintf (outfile, "Align %d\n", i); if (min1 == INF) fprintf (outfile, "%d was cropped completely.\n", i); else fprintf (outfile, "%d %d %d 0 0 0 0 %c 0 %d %d\n", i, min1, max1, strand[i], min2, max2); } printMFA (cout, merged1, string ("first"), 60); printMFA (cout, merged2, string ("second"), 60); } lagan20/src/utils/mextract.pl0000755000076500007650000000346310502337062017242 0ustar brudnobrudno00000000000000#!/usr/bin/env perl if (@ARGV < 1) { print ("usage:\n mextract.pl filename [-masked]\n"); exit(1); } $masked=0; $filename = $ARGV[0]; if(@ARGV==2) { if ($ARGV[1] eq "-masked") { $masked = 1; } } open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n"; $prefix = substr $filename, 0, (rindex $filename, "."); if ($masked || index ($filename, ".masked") != -1) { $prefix = substr $filename, 0, (rindex $prefix, "."); } $line = ; chomp $line; while (substr($line, 0, 1) ne ">") { $line = ; chomp $line; } $suffix = "fa"; if ($masked) { $suffix = "$suffix.masked"; } if (substr($line, 0, 1) eq ">") { $name = substr($line, 1); if (index ($name, " ") != -1){ $name = substr($name, 0, index ($name, " ")); } if (substr ($name, length ($name) - 1) eq ","){ $name = substr($name, 0, length ($name) - 1); } # $name = substr($line, 1); # $_ = substr($line, 1); # /\w+/g; # $name = $&; # substr($line, 1)." 
" =~ /(.+)[,]\s+/g; # $name = $1; $fname = "$prefix\_$name.$suffix"; print("$fname\n"); open(OUTFILE, ">$fname"); print OUTFILE ">$name\n"; } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { close OUTFILE; # substr($line, 1)." " =~ /(.+)[,]\s/g; # $name = $1; $name = substr($line, 1); if (index ($name, " ") != -1){ $name = substr($name, 0, index ($name, " ")); } if (substr ($name, length ($name) - 1) eq ","){ $name = substr($name, 0, length ($name) - 1); } # $_ = substr($line, 1); # /\w+/g; # $name = $&; $fname = "$prefix\_$name.$suffix"; print("$fname\n"); open(OUTFILE, ">$fname"); print OUTFILE ">$name\n"; } else { print OUTFILE "$line"; } } close OUTFILE; lagan20/src/utils/mf2bin.pl0000755000076500007650000000312510502337062016563 0ustar brudnobrudno00000000000000#!/usr/bin/env perl # defaults # constants # usage notes if (@ARGV < 1) { print ("usage:\n mf2bin.pl inputfile [-out outputfile] \n"); exit(1); } # parse parameters $tofile = 0; for ($i=1; $i<@ARGV; $i++) { if ($ARGV[$i] eq "-out") { $tofile = 1; $outfilename = $ARGV[++$i]; } } if ($tofile) { open(OUTFILE, ">$outfilename"); } # read in Multi-FASTA file $infilename = $ARGV[0]; open(FASTAFILE, "$infilename") || die "Could not open $infilename.\n\n"; $line = ; chomp $line; $i=0; %list=(); @seqs=(()); if (substr($line, 0, 1) eq ">") { $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { $i++; $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; push @seqs, (); } else { push @{$seqs[$i]}, "$line"; } } $i=0; for $row (@seqs) { @strs[$i++] = join "", @$row; } if (@keys != 2) { print ("mpack needs two FASTA sequences\n"); exit(1); } # pack bin # format from Alex Poliakov's glass2bin.pl script %base_code = ('-' => 0, 'A' => 1, 'C' => 2, 'T' => 3, 'G' => 4, 'N' 
=> 5, 'a' => 1, 'c' => 2, 't' => 3, 'g' => 4, 'n' => 5); $l = length @strs[0]; # $l--; $s1 = reverse(@strs[0]); $s2 = reverse(@strs[1]); for ($i=0; $i<$l; $i++) { if ($tofile) { print OUTFILE pack("H2", $base_code{chop($s1)} . $base_code{chop($s2)}); } else { print pack("H2", $base_code{chop($s1)} . $base_code{chop($s2)}); } } lagan20/src/utils/mpretty.pl0000755000076500007650000001143310502337062017113 0ustar brudnobrudno00000000000000#!/usr/bin/env perl # defaults $linelen = 50; $interval = 10; $labellen = 5; $uselabels = 1; $useintervals = 1; $usecounts = 1; $usebase = 0; $liststart = 1; $listend = 0; $usestart = 0; $useend = 0; # constants $minlinelen = 10; $mininterval = 10; $minlabellen = 3; # usage notes if (@ARGV < 1) { print ("usage:\n mpretty.pl filename\n"); print ("options:\n"); print (" -linelen value\n"); print (" (min: $minlinelen, default: $linelen)\n"); print (" -interval value\n"); print (" (min: $mininterval, default: $interval, none: 0)\n"); print (" -labellen value\n"); print (" (min: $labellen, default: $labellen, none: 0)\n"); print (" -base sequence_name\n"); print (" (if used, must specify a sequence on which to base counting\n"); print (" -start value\n"); print (" (if used, must specify a start coordinate (>=1)\n"); print (" -end value\n"); print (" (if used, must specify an end coordinate (>=start)\n"); print (" -nocounts\n"); exit(1); } # parse parameters for ($i=1; $i<@ARGV; $i++) { if ($ARGV[$i] eq "-nocounts") { $usecounts = 0; } if ($ARGV[$i] eq "-linelen") { $linelen = $ARGV[++$i]; if ($linelen < $minlinelen) { $linelen = $minlinelen; } } if ($ARGV[$i] eq "-interval") { $interval = $ARGV[++$i]; if ($interval <= 0) { $useintervals = 0; } if ($interval < $mininterval) { $interval = $mininterval; } } if ($ARGV[$i] eq "-labellen") { $labellen = $ARGV[++$i]; if ($labellen <= 0) { $uselabels = 0; } if ($labellen < $minlabellen) { $labellen = $minlabellen; } } if ($ARGV[$i] eq "-base") { $baseseq = $ARGV[++$i]; $usebase = 1; } if 
($ARGV[$i] eq "-start") { $usestart = 1; $liststart = $ARGV[++$i]; } if ($ARGV[$i] eq "-end") { $useend = 1; $listend = $ARGV[++$i]; } } # preprocessing for labels if ($uselabels) { $labtail = ""; for ($i=0; $i<$labellen; $i++) { $labtail="$labtail "; } } if (($usestart && ($liststart<1)) || ($useend && ($listend<$liststart))) { die "Invalid range specified: [$liststart, $listend].\n\n"; } # read in Multi-FASTA file $filename = $ARGV[0]; open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n"; $line = ; chomp $line; while (substr($line, 0, 1) ne ">") { $line = ; chomp $line; } $i=0; %list=(); @seqs=(()); if (substr($line, 0, 1) eq ">") { $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; @count[$i]=0; @label[$i] = substr("@keys[$i]$labtail", 0, $labellen); $list{@keys[$i]}=$i; } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { $i++; $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; @count[$i]=0; @label[$i] = substr("@keys[$i]$labtail", 0, $labellen); $list{@keys[$i]}=$i; push @seqs, (); } else { push @{$seqs[$i]}, "$line"; } } $i=0; $maxlen = 0; for $row (@seqs) { @strs[$i++] = join "", @$row; $templen = length @strs[$i-1]; if ($templen > $maxlen) { $maxlen = $templen; } } $foundseq=0; if ($usebase) { foreach $s (@keys) { $foundseq = ($s eq $baseseq) || $foundseq; } if (!$foundseq) { die "Could not find Base Sequence: <$baseseq>\n\n"; } } # preprocessing for counts if ($usecounts) { foreach $s (@keys) { $_ = @strs[$list{$s}]; $ls = tr/ATCGNatcgn/ATGCNatcgn/; @tot[$list{$s}] = $ls; } } # length of sequence display $l=$maxlen; if ((!$listend) || ($listend>$maxlen)) { $listend = $maxlen; } if ($maxlen < $liststart) { die "Starting out of bounds...\b\b"; } if ($usebase) { # find base sequence position $i=0; $j=0; while ($j<$liststart) { if (substr(@strs[$list{$baseseq}], $i, 1) ne "-") { $j++; } $i++; } $liststart = $i; while ($j<$listend) { if (substr(@strs[$list{$baseseq}], 
$i, 1) ne "-") { $j++; } $i++; } $listend = $i; } # pretty print if ($usecounts) { foreach $s (@keys) { $_ = substr(@strs[$list{$s}], 0, $liststart-1); $lc = tr/ATCGN/ATGCN/; @count[$list{$s}]+=$lc; } } for ($i=$liststart-1; $i<$listend; $i+=$linelen) { if ($listend-$i<$linelen) { $linelen = $listend-$i;} foreach $s (@keys) { if ($uselabels) { print "@label[$list{$s}] : "; } $p = substr(@strs[$list{$s}], $i, $linelen); print "$p"; if ($usecounts) { $_ = $p; $lc = tr/ATCGN/ATGCN/; @count[$list{$s}]+=$lc; print " @ @count[$list{$s}]/@tot[$list{$s}]"; } print "\n"; } if ($useintervals) { if ($uselabels) { print "$labtail = "; } for ($j=$i+1; $j<=$i+$linelen && $j<=$l; $j+=$interval) { $ct = "$j"; print $ct; for ($k=0; $k<($interval-(length $ct)); $k++) { print " "; } } print "\n"; } print "\n"; } lagan20/src/utils/mproject.pl0000755000076500007650000000245310502337062017234 0ustar brudnobrudno00000000000000#!/usr/bin/env perl if (@ARGV < 2) { print ("usage:\n mproject.pl filename seqname1 [seqname2 ... 
]\n"); exit(1); } $filename = $ARGV[0]; $i = 1; while ($i < @ARGV) { @targets[$i-1] = $ARGV[$i]; $i++; } open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n"; $line = ; chomp $line; $i=0; %list=(); @seqs=(()); if (substr($line, 0, 1) eq ">") { $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { $i++; $_ = substr($line,1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; push @seqs, (); } else { push @{$seqs[$i]}, "$line"; } } $i=0; for $row (@seqs) { @strs[$i++] = join "", @$row; } $seqlen = length $strs[0]; # $seqlen--; for ($i=0; $i<$seqlen; $i++) { @isgap[$i] = 1; foreach $s (@targets) { if (substr(@strs[$list{$s}], $i, 1) ne "-") { @isgap[$i] = 0; break; } } } foreach $s (@targets) { print ">@keys[$list{$s}]\n"; $j=0; for ($i=0; $i<$seqlen; $i++) { if(!@isgap[$i]) { print substr(@strs[$list{$s}], $i, 1); $j++; if (($j % 60) == 0) { print "\n"; } } } print "\n"; } lagan20/src/utils/mrun.pl0000755000076500007650000001412410502337062016370 0ustar brudnobrudno00000000000000#!/usr/bin/env perl # This script requires the environment variables: # LAGAN_DIR and VISTA_DIR # VISTA .plotfile defaults ($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set"; $paregmin = 75; $paregmax = 100; $pamin = 50; $pbases = 10000; $ptickdist = 2000; $presolution = 25; $pwindow = 40; $pnumwindows = 4; if (@ARGV < 1) { print ("usage:\n mrun.pl filename -tree \"(tree...)\"\n"); print ("options: [base sequence name [sequence pairs]]\n"); print ("default: [base sequence name = first sequence]\n"); print ("other MLAGAN parameters...\n"); print ("other VISTA parameters...\n"); exit(1); } $filename = $ARGV[0]; $i = 1; $j = 0; $k = 0; $l = 0; $treespec = 0; while ($i < @ARGV) { if ($ARGV[$i] eq "-tree") { @params[$j] = "-tree"; @params[++$j] = "\"$ARGV[++$i]\""; $_ = @params[$j]; $topen = tr/"\("/"\("/; $tclose = tr/"\)"/"\)"/; 
$treespec = ($topen == $tclose); } else { if (substr($ARGV[$i],0,1) eq "-") { if (substr($ARGV[$i],0,2) eq "--") { @vparams[$l++] = $ARGV[$i++]; @vparams[$l++] = $ARGV[$i]; } else { $j++; @params[$j] = $ARGV[$i]; if ((@params[$j] eq "-gapstart") || (@params[$j] eq "-gapend") || (@params[$j] eq "-gapcont") || (@params[$j] eq "-gapperseq") || (@params[$j] eq "-match") || (@params[$j] eq "-mismatch") || (@params[$j] eq "-overlap") || (@params[$j] eq "-translate") || (@params[$j] eq "-gfc") || (@params[$j] eq "-ext") || (@params[$j] eq "-glwidth")) { @params[++$j] = $ARGV[++$i]; } } } else { @targets[$k++] = $ARGV[$i]; } } $i++; } for ($i=0; $i<@vparams; $i+=2) { if (@vparams[$i] eq "--regmin") { $paregmin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--regmax") { $paregmax = @vparams[$i+1]; } elsif (@vparams[$i] eq "--min") { $pamin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--bases") { $pbases = @vparams[$i+1]; } elsif (@vparams[$i] eq "--tickdist") { $ptickdist = @vparams[$i+1]; } elsif (@vparams[$i] eq "--resolution") { $presolution = @vparams[$i+1]; } elsif (@vparams[$i] eq "--window") { $pwindow = @vparams[$i+1]; } elsif (@vparams[$i] eq "--numwindows") { $pnumwindows = @vparams[$i+1]; } } if (!$treespec) { print ("Must specify valid phylogenetic tree...\n"); exit(1); } if ($lagandir eq "") { print ("Must specify environment variable LAGAN_DIR\n"); exit(1); } $mextstr = "$lagandir/utils/mextract.pl $filename"; print "$mextstr\n"; if(!`$mextstr`) { print "\nMulti-FASTA extraction failure...\n"; exit(1); } if (-e "$filename.masked") { $mextstr = "$lagandir/utils/mextract.pl $filename.masked -masked"; print "$mextstr\n"; if(!`$mextstr`) { print "\nMasked Multi-FASTA extraction failure...\n"; exit(1); } } open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n"; $line = ; chomp $line; while (substr($line, 0, 1) ne ">") { $line = ; chomp $line; } $i=0; %list=(); if (substr($line, 0, 1) eq ">") { $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; 
$list{@keys[$i]}=$i; if (@targets == 0) { @targets[0] = @keys[$i]; print "Setting Base Sequence: @targets[0]\n"; } } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { $i++; $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; } } $prefix = substr $filename, 0, (rindex $filename, "."); $prefix = "$prefix\_"; foreach $s (@keys) { @fnames[$list{$s}] = "$prefix$keys[$list{$s}].fa"; } if ((@targets > 1)) { if (@targets %2 != 1) { $c = @targets; print ("$c sequences: "); print ("Must specify single base sequence\n"); print (" OR base sequence and pairs of sequences.\n"); exit(1); } } $mfiles = ""; foreach $s (@fnames) { $mfiles = "$mfiles $s"; } $mparams = ""; foreach $s (@params) { $mparams = "$mparams $s"; } $mlagan = "$lagandir/mlagan$mfiles$mparams > $prefix.out"; print STDERR "\n$mlagan\n\n"; if(`$mlagan`) { print "\n\n"; exit(1); } $i=0; if (@targets == 1) { foreach $s (@keys) { if ($s ne @targets[0]) { @targets[++$i] = @targets[0]; @targets[++$i] = $s; } } } $prjhead = "$lagandir/utils/mproject.pl $prefix.out"; $binhead = "$lagandir/utils/mf2bin.pl"; $j=0; for($i=1; $i<@targets; $i+=2) { $outprefix = "$prefix@targets[$i]\_@targets[$i+1]"; $pargs = "$targets[$i]_aligned $targets[$i+1]_aligned"; $pstr = "$prjhead $pargs > $outprefix.prj"; print "$pstr\n"; if(`$pstr`) { print "\nprojection failure...\n"; exit(1); } $bstr = "$binhead $outprefix.prj -out $outprefix.bin"; print "$bstr\n"; if(`$bstr`) { print "\npacking failure...\n"; exit(1); } @bins[$j++] = "$outprefix.bin"; print "\n"; } %distinct=(); foreach $s (@targets) { $distinct{$s} = 0; } @dseqs = keys %distinct; $plotfile = "$prefix.plotfile"; open (PLOTFILE, ">$plotfile"); print PLOTFILE "TITLE $prefix.fa - mlagan\n\n"; print PLOTFILE "OUTPUT $prefix.pdf\n\n"; print PLOTFILE "SEQUENCES "; foreach $s (@dseqs) { print PLOTFILE "$s "; } print PLOTFILE "\n\n"; $i=1; foreach $s (@bins) { print PLOTFILE "ALIGN $s 
BINARY\n"; print PLOTFILE " SEQUENCES @targets[$i] @targets[$i+1]\n"; print PLOTFILE " REGIONS $paregmin $paregmax\n"; print PLOTFILE " MIN $pamin\n"; print PLOTFILE "END\n\n"; $i+=2; } print "touch $prefix.ann\n\n"; `touch $prefix.ann`; print PLOTFILE "GENES $prefix.ann\n\n"; print PLOTFILE "LEGEND on\n\n"; print PLOTFILE "COORDINATE @targets[0]\n\n"; print PLOTFILE "PAPER letter\n\n"; print PLOTFILE "BASES $pbases\n\n"; print PLOTFILE "TICK_DIST $ptickdist\n\n"; print PLOTFILE "RESOLUTION $presolution\n\n"; print PLOTFILE "WINDOW $pwindow\n\n"; print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n"; #$vistadir = `echo \$VISTA_DIR`; #chomp $vistadir; #if ($vistadir eq "") { # print ("Must specify environment variable VISTA_DIR\n"); # exit(1); #} #$vistastr = "$vistadir/RunVista $plotfile"; #print "$vistastr\n"; #if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); } print "\n\nmrun.pl -- end.\n\n"; lagan20/src/utils/mrunfile.pl0000755000076500007650000000337310502337062017234 0ustar brudnobrudno00000000000000#!/usr/bin/env perl # This script requires the environment variables: # LAGAN_DIR and VISTA_DIR if (@ARGV < 1) { print ("usage:\n mrunfile.pl filename [-pairwise] [-vista]\n\n"); exit(1); } ($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set"; $filename = $ARGV[0]; open(PARAMFILE, "$filename") || die "Could not open $filename.\n\n"; $pairwise = 0; $dovista = 0; for ($l=1; $l<@ARGV; $l++) { if ($ARGV[$l] eq "-pairwise") { $pairwise = 1; } elsif ($ARGV[$l] eq "-vista") { $dovista = 1; } } $i=0; $j=0; $k=0; $filespec = 0; while ($line = ) { chomp $line; if ((substr($line, 0, 1) ne "#") && ($line ne "")) { if (!$filespec) { $seqfile = $line; $filespec = 1; } elsif (substr($line,0,1) eq "-") { if (substr($line,0,2) eq "--") { @vparams[$j++] = $line; } else { @params[$i++] = $line; } } else { @seqs[$k++] = $line; } } } if ($lagandir eq "") { print ("Must specify environment variable LAGAN_DIR\n"); exit(1); } if ($pairwise) { $mexecs = "mrunpairs.pl"; } else { 
$mexecs = "mrun.pl"; } $mstr = "$lagandir/utils/$mexecs $seqfile"; foreach $s (@params) { $mstr = "$mstr $s" } foreach $s (@seqs) { $mstr = "$mstr $s" } foreach $s (@vparams) { $mstr = "$mstr $s" } print "$mstr\n"; `$mstr`; if($dovista) { $prefix = substr $seqfile, 0, (rindex $filename, "."); $prefix = "$prefix\_"; if ($pairwise) { $prefix="$prefix\pairwise\_"; } $plotfile = "$prefix.plotfile"; ($vistadir = $ENV{VISTA_DIR}) or die "VISTA_DIR not set"; $vistastr = "$vistadir/RunVista $plotfile"; print "$vistastr\n"; if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); } } print "\nmrunfile.pl -- end.\n\n"; lagan20/src/utils/mrunpairs.pl0000755000076500007650000001363210502337062017432 0ustar brudnobrudno00000000000000#!/usr/bin/env perl # This script requires the environment variables: # LAGAN_DIR and VISTA_DIR # VISTA .plotfile defaults ($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set"; $paregmin = 75; $paregmax = 100; $pamin = 50; $pbases = 10000; $ptickdist = 2000; $presolution = 25; $pwindow = 40; $pnumwindows = 4; if (@ARGV < 1) { print ("usage:\n mrunpairs.pl filename\n"); print ("options: [base sequence name [sequence pairs]]\n"); print ("default: [base sequence name = first sequence]\n"); print ("other MLAGAN parameters...\n"); print ("other VISTA parameters...\n"); exit(1); } $filename = $ARGV[0]; $i = 1; $j = 0; $k = 0; $l = 0; $treespec = 0; while ($i < @ARGV) { if ($ARGV[$i] eq "-tree") { $treepos = $j+1; @params[$j] = "-tree"; @params[++$j] = "\"$ARGV[++$i]\""; $_ = @params[$j]; $topen = tr/"\("/"\("/; $tclose = tr/"\)"/"\)"/; $treespec = ($topen == $tclose); } else { if (substr($ARGV[$i],0,1) eq "-") { if (substr($ARGV[$i],0,2) eq "--") { @vparams[$l++] = $ARGV[$i++]; @vparams[$l++] = $ARGV[$i]; } else { $j++; @params[$j] = $ARGV[$i]; if ((@params[$j] eq "-gapstart") || (@params[$j] eq "-gapend") || (@params[$j] eq "-gapcont") || (@params[$j] eq "-gapperseq") || (@params[$j] eq "-match") || (@params[$j] eq "-mismatch") || (@params[$j] eq 
"-overlap") || (@params[$j] eq "-glwidth")) { @params[++$j] = $ARGV[++$i]; } } } else { @targets[$k++] = $ARGV[$i]; } } $i++; } for ($i=0; $i<@vparams; $i+=2) { if (@vparams[$i] eq "--regmin") { $paregmin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--regmax") { $paregmax = @vparams[$i+1]; } elsif (@vparams[$i] eq "--min") { $pamin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--bases") { $pbases = @vparams[$i+1]; } elsif (@vparams[$i] eq "--tickdist") { $ptickdist = @vparams[$i+1]; } elsif (@vparams[$i] eq "--resolution") { $presolution = @vparams[$i+1]; } elsif (@vparams[$i] eq "--window") { $pwindow = @vparams[$i+1]; } elsif (@vparams[$i] eq "--numwindows") { $pnumwindows = @vparams[$i+1]; } } if (!$treespec) { $j++; $treepos = $j+1; @params[$j] = "-tree"; @params[++$j] = "\"()\""; } if ($lagandir eq "") { print ("Must specify environment variable LAGAN_DIR\n"); exit(1); } $mextstr = "$lagandir/mextract.pl $filename"; print "$mextstr\n"; if(!`$mextstr`) { print "\nMulti-FASTA extraction failure...\n"; exit(1); } if (-e "$filename.masked") { $mextstr = "$lagandir/mextract.pl $filename.masked -masked"; print "$mextstr\n"; if(!`$mextstr`) { print "\nMasked Multi-FASTA extraction failure...\n"; exit(1); } } open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n"; $line = ; chomp $line; while (substr($line, 0, 1) ne ">") { $line = ; chomp $line; } $i=0; %list=(); $i=0; %list=(); if (substr($line, 0, 1) eq ">") { $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; if (@targets == 0) { @targets[0] = @keys[$i]; print "Setting Base Sequence: @targets[0]\n"; } } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { $i++; $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; } } $fprefix = substr $filename, 0, (rindex $filename, "."); $prefix = "$fprefix\_"; $pprefix = "$fprefix\_pairwise\_"; foreach $s (@keys) { @fnames[$list{$s}] = 
"$prefix$keys[$list{$s}].fa"; } if ((@targets > 1)) { if (@targets %2 != 1) { $c = @targets; print ("$c sequences: "); print ("Must specify single base sequence\n"); print (" OR base sequence and pairs of sequences.\n"); exit(1); } } $i=0; if (@targets == 1) { foreach $s (@keys) { if ($s ne @targets[0]) { @targets[++$i] = @targets[0]; @targets[++$i] = $s; } } } $j=0; for($i=1; $i<@targets; $i+=2) { $outprefix = "$pprefix@targets[$i]\_@targets[$i+1]"; $mfiles = " @fnames[$list{@targets[$i]}] @fnames[$list{@targets[$i+1]}]"; @params[$treepos]="\"(@targets[$i] @targets[$i+1])\""; $mparams = ""; foreach $s (@params) { $mparams = "$mparams $s"; } $mlagan = "$lagandir/mlagan$mfiles$mparams > $outprefix.out"; print "\n$mlagan\n\n"; if(`$mlagan`) { print "\n\n"; exit(1); } $binhead = "$lagandir/mpack.pl"; $bstr = "$binhead $outprefix.out -out $outprefix.bin"; print "$bstr\n"; if(`$bstr`) { print "\npacking failure...\n"; exit(1); } @bins[$j++] = "$outprefix.bin"; print "\n"; } %distinct=(); foreach $s (@targets) { $distinct{$s} = 0; } @dseqs = keys %distinct; $plotfile = "$pprefix.plotfile"; open (PLOTFILE, ">$plotfile"); print PLOTFILE "TITLE $prefix.fa - mlagan\n\n"; print PLOTFILE "OUTPUT $pprefix.pdf\n\n"; print PLOTFILE "SEQUENCES "; foreach $s (@dseqs) { print PLOTFILE "$s "; } print PLOTFILE "\n\n"; $i=1; foreach $s (@bins) { print PLOTFILE "ALIGN $s BINARY\n"; print PLOTFILE " SEQUENCES @targets[$i] @targets[$i+1]\n"; print PLOTFILE " REGIONS $paregmin $paregmax\n"; print PLOTFILE " MIN $pamin\n"; print PLOTFILE "END\n\n"; $i+=2; } print "touch $prefix.ann\n\n"; `touch $prefix.ann`; print PLOTFILE "GENES $prefix.ann\n\n"; print PLOTFILE "LEGEND on\n\n"; print PLOTFILE "COORDINATE @targets[0]\n\n"; print PLOTFILE "PAPER letter\n\n"; print PLOTFILE "BASES $pbases\n\n"; print PLOTFILE "TICK_DIST $ptickdist\n\n"; print PLOTFILE "RESOLUTION $presolution\n\n"; print PLOTFILE "WINDOW $pwindow\n\n"; print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n"; #$vistadir = `echo 
\$VISTA_DIR`; #chomp $vistadir; #if ($vistadir eq "") { # print ("Must specify environment variable VISTA_DIR\n"); # exit(1); #} #$vistastr = "$vistadir/RunVista $plotfile"; #print "$vistastr\n"; #if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); } print "\n\nmrunpairs.pl -- end.\n\n"; lagan20/src/utils/MultiSequence.h0000644000076500007650000000537710502337062020015 0ustar brudnobrudno00000000000000// MultiSequence.h // --------------- // Multiple sequence class #ifndef MULTISEQUENCE_H #define MULTISEQUENCE_H #include #include #include #include #include "Sequence.h" #include "SafeVector.h" using namespace std; class MultiSequence { private: SafeVector sequences; // sequences SafeVector cache; bool cacheEnabled; public: MultiSequence (): cacheEnabled (false) {} void buildCache (){ assert (!cacheEnabled); cacheEnabled = true; int length = sequences[0].getLength(); int numSeqs = getNumSeqs(); cache.resize ((length + 1) * numSeqs, (char) 0); for (int i = 0; i < numSeqs; i++){ Sequence &seq = (*this)[i]; cache[i] = '@'; for (int j = 1; j <= length; j++){ cache[j * numSeqs + i] = seq[j]; } } } // return letter cache for fast processing SafeVector::iterator getCache (){ assert (cacheEnabled); return cache.begin(); } // add a sequence to the alignment void addSequence (Sequence &sequence){ sequences.push_back (sequence); } // Read in all of the Sequences in an MFA file and append them to the // existing MultiSequence object. void addRawFromMFA (const string& filename){ // open up file for reading ifstream infile (filename.c_str()); // check for error assert (!infile.fail()); // add only sequences that check out ok while (true){ Sequence seq (infile); if (seq.fail()) break; sequences.push_back (seq); } // close up the input file infile.close(); } // Read in all of the Sequences in an MFA file and append them to the // existing MultiSequence object. 
void addRawFromMFA (ifstream &infile){ // check for error assert (!infile.fail()); // add only sequences that check out ok while (true){ Sequence seq (infile); if (seq.fail()) break; sequences.push_back (seq); } } // Writes sequences to outfile in XMFA format. void writeToXMFA (ostream &outfile, int numColumns) const { for (int i = 0; i < (int) sequences.size(); ++i){ sequences[i].writeToXMFA (outfile, numColumns); } } // Returns a sequence. Sequence& operator[] (int index){ // error checking on bounds assert (index >= 0 && index < (int) sequences.size()); // return the correct sequence return sequences[index]; } // Returns a sequence. const Sequence& operator[] (int index) const { // error checking on bounds assert (index >= 0 && index < (int) sequences.size()); // return the correct sequence return sequences[index]; } // Returns number of sequences. const int getNumSeqs() const { return sequences.size(); } }; #endif lagan20/src/utils/mviz.pl0000755000076500007650000001173210502337062016376 0ustar brudnobrudno00000000000000#!/usr/bin/env perl # This script requires the environment variables: # LAGAN_DIR and VISTA_DIR ($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set"; $paregmin = 75; $paregmax = 100; $pamin = 50; $pbases = 10000; $ptickdist = 2000; $presolution = 25; $pwindow = 40; $pnumwindows = 4; if (@ARGV < 2) { print ("usage:\n mviz.pl data_file param_file [plotfile]\n\n"); exit(1); } $pfspec = 0; if (@ARGV==3) { $pfspec = 1; $plotfile=@ARGV[2]; print "Using VISTA plotfile: $plotfile\n"; } $filename = $ARGV[1]; open(PARAMFILE, "$filename") || die "Could not open $filename.\n\n"; $i=0; $j=0; $k=0; $filespec = 0; while ($line = ) { chomp $line; if ((substr($line, 0, 1) ne "#") && ($line ne "")) { if (!$filespec) { $seqfile = $line; $filespec = 1; } elsif (substr($line,0,1) eq "-") { if (substr($line,0,2) eq "--") { @vparams[$j++] = $line; } else { @params[$i++] = $line; } } else { @targets[$k++] = $line; } } } $seqfile = @ARGV[0]; if ($lagandir eq "") { 
print ("Must specify environment variable LAGAN_DIR\n"); exit(1); } for ($i=0; $i<@vparams; $i+=2) { if (@vparams[$i] eq "--regmin") { $paregmin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--regmax") { $paregmax = @vparams[$i+1]; } elsif (@vparams[$i] eq "--min") { $pamin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--bases") { $pbases = @vparams[$i+1]; } elsif (@vparams[$i] eq "--tickdist") { $ptickdist = @vparams[$i+1]; } elsif (@vparams[$i] eq "--resolution") { $presolution = @vparams[$i+1]; } elsif (@vparams[$i] eq "--window") { $pwindow = @vparams[$i+1]; } elsif (@vparams[$i] eq "--numwindows") { $pnumwindows = @vparams[$i+1]; } } open(FASTAFILE, "$seqfile") || die "Could not open $seqfile.\n\n"; $prefix = substr $seqfile, 0, (rindex $seqfile, "."); if (substr($prefix, -1, 1) ne "_") {$prefix = "$prefix\_";} $line = ; chomp $line; while (substr($line, 0, 1) ne ">") { $line = ; chomp $line; } $i=0; %list=(); if (substr($line, 0, 1) eq ">") { @keys[$i] = substr($line, 1); $list{@keys[$i]}=$i; if (@targets == 0) { @targets[0] = @keys[$i]; print "Setting Base Sequence: @targets[0]\n"; } } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { $i++; @keys[$i] = substr($line, 1); $list{@keys[$i]}=$i; } } if ((@targets > 1)) { $j=0; for ($i=1; $i<@targets; $i++) { $_ = @targets[$i]; @bp[$j++]=/\w+/g; $_=$&; @bp[$j++]=/\w+/g; } $j=1; foreach $s (@bp) { @targets[$j++]=$s; } if (@targets %2 != 1) { $c = @targets; print ("$c sequences: "); print ("Must specify single base sequence\n"); print (" OR base sequence and pairs of sequences.\n"); exit(1); } } $i=0; if (@targets == 1) { foreach $s (@keys) { $s = substr $s, 0, (rindex $s, "_aligned"); if ($s ne @targets[0]) { @targets[++$i] = @targets[0]; @targets[++$i] = $s; } } } print "TARGETS:\n";foreach $s (@targets) { print "\"$s\"\n"; } $prjhead = "$lagandir/utils/mproject.pl $seqfile"; $binhead = "$lagandir/utils/mf2bin.pl"; $j=0; for($i=1; 
$i<@targets; $i+=2) { $outprefix = "$prefix@targets[$i]\_@targets[$i+1]"; $pargs = "$targets[$i]_aligned $targets[$i+1]_aligned"; $pstr = "$prjhead $pargs > $outprefix.prj"; print "$pstr\n"; if(`$pstr`) { print "\nprojection failure...\n"; exit(1); } $bstr = "$binhead $outprefix.prj -out $outprefix.bin"; print "$bstr\n"; if(`$bstr`) { print "\npacking failure...\n"; exit(1); } @bins[$j++] = "$outprefix.bin"; print "\n"; } %distinct=(); foreach $s (@targets) { $distinct{$s} = 0; } @dseqs = keys %distinct; if (!$pfspec) { $plotfile = "$prefix.plotfile"; open (PLOTFILE, ">$plotfile"); print PLOTFILE "TITLE $prefix.fa - mlagan\n\n"; print PLOTFILE "OUTPUT $prefix.pdf\n\n"; print PLOTFILE "SEQUENCES "; foreach $s (@dseqs) { print PLOTFILE "$s "; } print PLOTFILE "\n\n"; $i=1; foreach $s (@bins) { print PLOTFILE "ALIGN $s BINARY\n"; print PLOTFILE " SEQUENCES @targets[$i] @targets[$i+1]\n"; print PLOTFILE " REGIONS $paregmin $paregmax\n"; print PLOTFILE " MIN $pamin\n"; print PLOTFILE "END\n\n"; $i+=2; } print "touch $prefix.ann\n\n"; `touch $prefix.ann`; print PLOTFILE "GENES $prefix.ann\n\n"; print PLOTFILE "LEGEND on\n\n"; print PLOTFILE "COORDINATE @targets[0]\n\n"; print PLOTFILE "PAPER letter\n\n"; print PLOTFILE "BASES $pbases\n\n"; print PLOTFILE "TICK_DIST $ptickdist\n\n"; print PLOTFILE "RESOLUTION $presolution\n\n"; print PLOTFILE "WINDOW $pwindow\n\n"; print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n"; } ($vistadir = $ENV{VISTA_DIR}) or die "VISTA_DIR not set"; $vistastr = "$vistadir/RunVista $plotfile"; print "$vistastr\n"; if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); } print "\n\nmviz.pl -- end.\n\n"; lagan20/src/utils/Output.h0000644000076500007650000000070010502337062016513 0ustar brudnobrudno00000000000000#ifndef OUTPUT_H #define OUTPUT_H // print reversed string in MFA format void printMFA (ostream &outfile, SafeVector &data, string comment, int numColumns){ int charsWritten = 0; outfile << ">" << comment << endl; for (int i = 0; i < (int) 
data.size(); i++){ outfile << data[i]; charsWritten++; if (charsWritten % numColumns == 0) outfile << endl; } if (charsWritten % numColumns != 0) outfile << endl; } #endif lagan20/src/utils/overlay.c0000644000076500007650000001310510502337062016672 0ustar brudnobrudno00000000000000#include #include #include #include #define MAX_SEQS 63 #define MIN2(y,z) ((y)<(z))?(y):(z) #define MIN3(x,y,z) MIN2((x),MIN2((y),(z))) #define MIN4(w,x,y,z) MIN2((w),MIN3((x),(y),(z))) // Newick: (((One:0.2,Two:0.3):0.3,(Three:0.5,Four:0.3):0.2):0.3,Five:0.7):0.0; // Takes a tree in newick format, builds an internal "tree" structure // generates calls to other programs with correct weights typedef struct sequence { char* seqname; char* aligned; char* overlay; int alignlen; int overlaylen; int mynum; } seq; seq* allseqs[MAX_SEQS]; int numseqs; char* dna_alpha = "ACGT"; char* valid_alpha = "ACGTN-"; char* DNA_PRINT; char* DNA_LET; char* NUM_ONES; void init_consts() { int i; DNA_LET = (char*) malloc (sizeof(char) * 0x10); DNA_PRINT = (char*) malloc (sizeof(char) * 0x10); NUM_ONES = (char*) malloc (sizeof(char) * 0x10); for (i=0; i < 0x10; i++) { NUM_ONES[i] = DNA_LET[i] = DNA_PRINT[i] = -1; } DNA_LET[1] = 0; DNA_LET[2] = 1; DNA_LET[4] = 2; DNA_LET[8] = 3; DNA_PRINT[0] = 'N'; DNA_PRINT[1] = 'A'; DNA_PRINT[2] = 'C'; DNA_PRINT[4] = 'G'; DNA_PRINT[8] = 'T'; DNA_PRINT[1|2] = 'M'; DNA_PRINT[1|4] = 'R'; DNA_PRINT[1|8] = 'W'; DNA_PRINT[2|4] = 'S'; DNA_PRINT[2|8] = 'Y'; DNA_PRINT[4|8] = 'K'; DNA_PRINT[1|2|4] = 'V'; DNA_PRINT[1|2|8] = 'H'; DNA_PRINT[1|4|8] = 'D'; DNA_PRINT[2|4|8] = 'B'; DNA_PRINT[1|2|4|8] = 'X'; NUM_ONES[0] = 0; NUM_ONES[1] = 1; NUM_ONES[2] = 1; NUM_ONES[4] = 1; NUM_ONES[8] = 1; NUM_ONES[1|2] = 2; NUM_ONES[1|4] = 2; NUM_ONES[1|8] = 2; NUM_ONES[2|4] = 2; NUM_ONES[2|8] = 2; NUM_ONES[4|8] = 2; NUM_ONES[1|2|4] = 3; NUM_ONES[1|2|8] = 3; NUM_ONES[1|4|8] = 3; NUM_ONES[2|4|8] = 3; NUM_ONES[1|2|4|8] = 4; } seq* mk_seq() { seq* res = (seq*)malloc(sizeof(seq)); res->seqname = 0; res->aligned = 
0; res->overlay = 0; res->mynum = -1; return res; } int read_align(FILE* input, int target) { char* res = (char*) malloc(sizeof(char)*1); int i, ressize = 1, numread=0; char temp[1024]; char currchar, checkchar, *tt; if (feof(input)) { fprintf(stderr, "2COULDN'T READ ALIGNMENT\n"); exit (2); } fgets(temp, 255, input); if (temp[0] != '>') { fprintf(stderr, "File is not in FASTA format!!\n"); exit(1); } *(strchr(temp, '\n')) = 0; currchar = fgetc(input); while ((currchar != '>') && (currchar != EOF)) { if (!isspace(currchar)) { checkchar = toupper(currchar); if (!strchr(valid_alpha, checkchar)) { // fprintf(stderr, "Warning: %d:%c skipped'\n", numread,currchar); currchar = 'N'; } res[numread++] = currchar; if (numread >= ressize) { res=(char*)realloc(res, sizeof(char)*(ressize*=2)); } } currchar = fgetc(input); } if (target >= 0) { allseqs[target]->seqname = malloc (strlen(temp)+1); strncpy(allseqs[target]->seqname, temp, strlen(temp)+1); allseqs[target]->aligned = res; allseqs[target]->alignlen = numread; } else { for (i = 0; i < numseqs; i++) { if (!strncmp(allseqs[i]->seqname, temp, strlen(temp))) { // fprintf(stderr, "found %d\n",i); allseqs[i]->overlay = res; allseqs[i]->overlaylen = numread; break; } } if (i == numseqs) { fprintf(stderr, "seq %s not found!\n", temp); exit(2); } } if (currchar == '>') { ungetc(currchar, input); return 1; } return 0; } void read_align_file (char* filename) { FILE* input; if (!(input = fopen (filename, "r"))) { fprintf(stderr, "COULDN'T OPEN ALIGNMENT\n"); exit (2); } while (read_align(input,numseqs++)) ; } void read_sequences(int argc, char**argv) { char* filename; FILE* input; seq* myn; int i, j, kmer, breaker; int zz; for (i=2; i < argc; i++) { filename = argv[i]; myn = 0; if (!(input = fopen (filename, "r"))) { fprintf(stderr, "COULDN'T OPEN SEQ %d %s\n",i,argv[i]); exit (2); } do { myn= allseqs[i-1]; myn->mynum = i-1; zz = read_align(input,-1); } while (zz) ; } } void overlayseq(int w) { int pos=0, i; for (i = 0; i < 
allseqs[w]->alignlen; i++) { if (allseqs[w]->aligned[i] != '-') allseqs[w]->aligned[i] = allseqs[w]->overlay[pos++]; } fprintf(stderr, "check %d == %d\n",pos,allseqs[w]->overlaylen); } void overlay() { int i; for (i=0; i < numseqs; i++) { overlayseq(i); } } void printAlign() { int i,j; seq* a; for (j=0; j < numseqs; j++) { a = allseqs[j]; fprintf(stdout, "%s", a->seqname); for (i=0; i < a->alignlen; i++) { if (!(i%60)) fprintf(stdout, "\n"); // fprintf(stdout, "%d:[%x]%c", i+1,a->aligned[i],DNA_PRINT[a->aligned[i]]); fprintf(stdout, "%c", a->aligned[i]); } fprintf(stdout, "\n"); } } int main(int argc, char** argv) { char string_tree[16537]; //noone will ever need more :))) int moved, i; float ttree, test; // fprintf(stderr, "Parsed tree\n"); if (argc < 3) { fprintf(stderr, "Usage: overlay align.mfa seq1 [seq2].... > newalign.mfa\n"); exit(2); } numseqs = 0; init_consts(); for (i=0; i < MAX_SEQS; i++) { allseqs[i] = mk_seq(); } // ttree = get_outgroups(align_node, 0); // fprintf(stdout, "ALIGN %s %s RES %s OUTS", align_node->lc->seqname, // align_node->rc->seqname, align_node->seqname); // for (i=0; i< numouts; i++) { // fprintf(stdout, " %s %f", outgroups[i]->seqname, outdists[i]); // test += outdists[i]; // } // fprintf(stdout, "\n"); read_align_file(argv[1]); read_sequences(argc, argv); overlay(); printAlign(); return 0; } lagan20/src/utils/rc.c0000644000076500007650000000255110502337062015620 0ustar brudnobrudno00000000000000#include #include #include #include char* alpha = "ATCGN"; typedef struct Sequence { char* lets; int numlets; char* name; char* rptr; } seq; char comp(char c) { switch(c) { case 'A': return 'T'; case 'T': return 'A'; case 'C': return 'G'; case 'G': return 'C'; case 'N': return 'N'; case 'a': return 't'; case 't': return 'a'; case 'c': return 'g'; case 'g': return 'c'; case 'n': return 'n'; default: return c; } } int main (int argc, char **argv){ char* res = (char*) malloc(sizeof(char)); int ressize = 1, numread = 0, i; char temp[256]; char 
currchar; if (feof(stdin)) return 0; fgets(temp, 255, stdin); if (temp[0] != '>') { fprintf(stderr, "File is not in FASTA format!!\n"); exit(1); } *(strchr(temp,'\n')) = 0; // strcat (temp, "(-)"); printf ("%s\n", temp); currchar = fgetc(stdin); while ((currchar != '>') && (currchar != EOF)) { if (!isspace(currchar)) { res[numread++] = comp (currchar); if (numread >= ressize) { res=(char*)realloc(res, sizeof(char)*(ressize*=2)); } } currchar = fgetc(stdin); } res[numread]=0; i = 0; while (--numread >= 0){ putchar (res[numread]); i++; if (i % 60 == 0){ putchar ('\n'); i = 0; } } if (i != 0) putchar ('\n'); free (res); return 0; } lagan20/src/utils/SafeVector.h0000644000076500007650000000174010502337062017261 0ustar brudnobrudno00000000000000// SafeVector.h // ------------ // Class for array bounds checking. // define ENABLE_CHECKS in order to enable array bounds checking. #ifndef SAFEVECTOR_H #define SAFEVECTOR_H #include #include using namespace std; // class derived from the STL std::vector template class SafeVector : public std::vector{ public: // miscellaneous constructors SafeVector () {} SafeVector (size_t size) : vector(size) {} SafeVector (size_t size, const TYPE &value) : vector(size, value) {} SafeVector (const SafeVector &source) : vector(source) {} #ifdef ENABLE_CHECKS // [] array bounds checking TYPE &operator[](size_t index){ assert (index >= 0 && index < size()); return std::vector::operator[] (index); } // [] const array bounds checking const TYPE &operator[] (size_t index) const { assert (index >= 0 && index < size()); return std::vector::operator[] (index) ; } #endif }; #endif lagan20/src/utils/scorealign.c0000644000076500007650000002516710502337062017352 0ustar brudnobrudno00000000000000#include #include #include #include #include #include #define NUCLEOTIDE_MATRIX_FILE "nucmatrix.txt" #define COLUMNS 60 int cons_rate = 0; int doibounds = 0, doubounds = 0, leftbound, rightbound, pairseqlen; int doregions = 0, docropxmfa = 0; char **seqs; int 
*seqid, *seqstart, *seqend; char *seqdir, **seqcomment; int numseqs, seqlen = -1; int matchscore[256][256]; int gapopen = -1500, gapcont = -50; inline int min (int a, int b){ if (a < b) return a; return b; } inline int max (int a, int b){ if (a > b) return a; return b; } inline int scoreMatch (char c, char d){ if (c == '-' && d == '-') return 0; if (c == '-' || d == '-') return gapcont; return matchscore[(unsigned char) c][(unsigned char) d]; } int conv2seqcoords (int pos, int i, int j){ int alignpos = -1, pairpos = -1; while (pairpos < pos && alignpos < seqlen){ alignpos++; if (seqs[i][alignpos] != '-' || seqs[j][alignpos] != '-') pairpos++; if (alignpos >= seqlen){ printf ("%d %d %d %d", pairpos, pos, alignpos, seqlen); } assert (alignpos < seqlen); } return alignpos+1; } #define CN 0 #define NC 1 int scorePair (char *seq1, char *seq2, int seqindex1, int seqindex2){ int score[2][2]; char *dad[2], *state; int i, j, CNscore, NCscore, left = pairseqlen, right = 1; for (i = 0; i < 2; i++){ dad[i] = (char *) malloc (sizeof (char) * pairseqlen); assert (dad[i]); dad[i][0] = -1; score[i][0] = 0; } state = (char *) malloc (sizeof (char) * pairseqlen); assert (state); j = 0; for (i = 0; i < pairseqlen; i++){ CNscore = score[CN][j]; NCscore = score[NC][j] + gapopen; if (CNscore > NCscore){ score[CN][!j] = CNscore; dad[CN][i] = CN; } else { score[CN][!j] = NCscore; dad[CN][i] = NC; } score[CN][!j] += scoreMatch (seq1[i], seq2[i]); CNscore = score[CN][j] + gapopen; NCscore = score[NC][j]; if (CNscore > NCscore){ score[NC][!j] = CNscore; dad[NC][i] = CN; } else { score[NC][!j] = NCscore; dad[NC][i] = NC; } j = !j; } i = pairseqlen - 1; j = (score[CN][j] > score[NC][j]) ? 
CN : NC; while (i >= 0){ state[i] = j; assert (j == CN || j == NC); j = dad[j][i]; i--; } j = 0; CNscore = 0; for (i = 0; i < pairseqlen; i++){ if (state[i] == CN){ if (!CNscore){ CNscore = 1; if (doregions) printf ("Conserved region: %d ", i+1); left = min (left, i+1); } else if (i == pairseqlen - 1){ if (doregions) printf ("%d\n", i+1); right = max (right, i+1); } j++; } else if (CNscore){ CNscore = 0; if (doregions) printf ("%d\n", i); right = max (right, i); } } if (j > 0){ left = conv2seqcoords(left-1, seqindex1, seqindex2); right = conv2seqcoords(right-1, seqindex1, seqindex2); if (doibounds){ leftbound = max (leftbound, left); rightbound = min (rightbound, right); } else if (doubounds){ leftbound = min (leftbound, left); rightbound = max (rightbound, right); } } else { leftbound = 1; rightbound = seqlen; } for (i = 0; i < 2; i++) free (dad[i]); free (state); return j; } void project (char *orig1, char *orig2, char *dest1, char *dest2, int *length){ int i, j; j = 0; for (i = 0; i < *length; i++){ if (orig1[i] != '-' || orig2[i] != '-'){ dest1[j] = orig1[i]; dest2[j] = orig2[i]; j++; } } *length = j; } int countleft (int pos, int i){ int j, k; k = 0; for (j = 0; j < pos; j++) if (seqs[i][j] != '-') k++; return k; } int countright (int pos, int i){ int j, k; k = 0; for (j = seqlen - 1; j > pos; j--) if (seqs[i][j] != '-') k++; return k; } void printXMFA (int score){ int i, j, k; if (leftbound > rightbound) { return; } if (seqid[0] == -1){ for (i = 0; i < numseqs; i++){ seqid[i] = i+1; seqstart[i] = 1; seqend[i] = countleft (seqlen, i); seqdir[i] = '+'; strcpy (seqcomment[i], ""); } } for (i = 0; i < numseqs; i++){ if (seqcomment[i][strlen(seqcomment[i]) - 1] == '\n') seqcomment[i][strlen(seqcomment[i]) - 1] = '\0'; printf (">%d:%d-%d %c %s\n", seqid[i], seqstart[i] + countleft (leftbound-1, i), seqend[i] - countright(rightbound-1, i), seqdir[i], seqcomment[i]); k = 0; for (j = leftbound - 1; j <= rightbound - 1; j++){ printf ("%c", seqs[i][j]); k++; if (k % 
COLUMNS == 0) printf("\n"); } if (k % COLUMNS != 0) printf("\n"); } printf ("= score=%d\n", score); } void scoreAlign (){ int i, j; int score = 0; char *u, *v; for (i = 0; i < numseqs - 1; i++){ for (j = i + 1; j < numseqs; j++){ pairseqlen = seqlen; u = (char *) malloc (sizeof (char) * seqlen); assert (u); v = (char *) malloc (sizeof (char) * seqlen); assert (v); project (seqs[i], seqs[j], u, v, &pairseqlen); score += scorePair (u, v, i, j); free (u); free (v); } } if (!doregions){ if (doibounds || doubounds) if (docropxmfa){ printXMFA(score); } else printf ("score=%d start=%d end=%d\n", score, leftbound, rightbound); else printf ("%d\n", score); } } inline int issymbol (char ch){ return ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '.' || ch == '-'; } void extractXMFAinfo (char *line, int *si, int *ss, int *se, char *sd, char **sc){ int numread; *sc = malloc (sizeof (char) * 1024); numread = sscanf (line, ">%d:%d-%d %c %s", si, ss, se, sd, *sc); if (numread < 4){ *si = *ss = *se = -1; *sd = '~'; strcpy (*sc, ""); } else if (numread < 5){ strcpy (*sc, ""); } } char *getSequence (FILE *file, int *si, int *ss, int *se, char *sd, char **sc){ int charsread = 0; int bufsize = 1; char *buffer; char prevch = '~'; char line[1024]; if (feof (file)) return NULL; fgets (line, 1024, file); if (line[0] == '='){ return NULL; } extractXMFAinfo (line, si, ss, se, sd, sc); buffer = (char *) malloc (sizeof (char) * bufsize); assert (buffer); while (!feof (file)){ buffer[charsread] = toupper (fgetc (file)); if (buffer[charsread] == '>' || buffer[charsread] == '='){ ungetc (buffer[charsread], file); break; } if (issymbol (buffer[charsread])) charsread++; if (charsread == bufsize){ bufsize *= 2; buffer = (char *) realloc (buffer, sizeof (char) * bufsize); } prevch = buffer[charsread]; } if (charsread == 0){ free (buffer); return NULL; } if (seqlen == -1) seqlen = charsread; else { assert (seqlen == charsread); } return buffer; } int getSequences (FILE *file){ 
char *newseq, sd, *sc; int i, si, ss, se; seqlen = -1; numseqs = 0; seqs = (char **) malloc (sizeof (char *) * 0); seqid = (int *) malloc (sizeof (int) * 0); seqstart = (int *) malloc (sizeof (int) * 0); seqend = (int *) malloc (sizeof (int) * 0); seqdir = (char *) malloc (sizeof (char) * 0); seqcomment = (char **) malloc (sizeof (char *) * 0); while (newseq = getSequence (file, &si, &ss, &se, &sd, &sc)){ numseqs++; seqs = (char **) realloc (seqs, sizeof (char *) * numseqs); seqid = (int *) realloc (seqid, sizeof (int) * numseqs); seqstart = (int *) realloc (seqstart, sizeof (int) * numseqs); seqend = (int *) realloc (seqend, sizeof (int) * numseqs); seqdir = (char *) realloc (seqdir, sizeof (char) * numseqs); seqcomment = (char **) realloc (seqcomment, sizeof (char *) * numseqs); seqs[numseqs - 1] = newseq; seqid[numseqs - 1] = si; seqstart[numseqs - 1] = ss; seqend[numseqs - 1] = se; seqdir[numseqs - 1] = sd; seqcomment[numseqs - 1] = sc; } if (numseqs > 0) return 1; free (seqs); free (seqid); free (seqstart); free (seqend); free (seqdir); free (seqcomment); return 0; } int processSequences (FILE *file){ int i, j; if (getSequences (file)){ if (doibounds){ leftbound = 0; rightbound = 1000000000; } else if (doubounds){ leftbound = 1000000000; rightbound = 0; } scoreAlign(); for (i = 0; i < numseqs; i++) free (seqs[i]); free (seqs); free (seqid); free (seqstart); free (seqend); free (seqdir); for (i = 0; i < numseqs; i++) free (seqcomment[i]); free (seqcomment); return 1; } return 0; } void calculateScoreMatrix(){ char *alpha = "ATCG"; int i, j; double p_ij = (double) cons_rate / 100.0; double match = log (p_ij / 0.25); double mismatch = log ((1 - p_ij) / 0.75); for (i = 0; i < strlen (alpha); i++){ for (j = 0; j < strlen (alpha); j++){ matchscore[(unsigned char) alpha[i]][(unsigned char) alpha[j]] = (i == j) ? 
(int)(match * 100) : (int)(mismatch * 100); } } gapopen = (int)(-40 * match * 100); } void readScoreMatrix (char *filename){ FILE *file; int i, j, k, numlets = 0; char lets[256], line[1024]; char *lagan_dir; lagan_dir = getenv ("LAGAN_DIR"); if (!lagan_dir){ fprintf (stderr, "Error: $LAGAN_DIR not set.\n"); exit (1); } sprintf (line, "%s/%s", lagan_dir, filename); fprintf (stderr, "%s\n", line); file = fopen (line, "r"); assert (file); fgets (line, 1024, file); for (i = 0; i < strlen (line); i++){ if (!isspace (line[i])){ lets[numlets++] = line[i]; } } for (i = 0; i < numlets; i++){ fscanf (file, "%1s", &(line[0])); for (j = 0; j < numlets; j++){ fscanf (file, "%d", &k); matchscore[(unsigned char) line[0]][(unsigned char) lets[j]] = k; } } fscanf (file, "%d%d", &gapopen, &gapcont); fclose (file); } void processFile (char *filename){ FILE *file; int i, j; for (i = 0; i < 256; i++) for (j = 0; j < 256; j++) matchscore[i][j] = 0; if (cons_rate >= 0) calculateScoreMatrix(); else readScoreMatrix (NUCLEOTIDE_MATRIX_FILE); file = fopen (filename, "r"); assert (file); while (!feof (file)){ processSequences (file); } fclose (file); } int main (int argc, char **argv){ int i; if (argc < 3 || argc > 6){ // [-bounds seqidx] fprintf (stderr, "Usage: scorealign mfa_file cons_rate [-regions] [-ibounds | -ubounds [-cropxmfa]]\n"); exit (1); } cons_rate = atoi (argv[2]); for (i = 3; i < argc; i++){ if (strcmp (argv[i], "-cropxmfa") == 0) docropxmfa = 1; else if (strcmp (argv[i], "-ibounds") == 0) doibounds = 1; else if (strcmp (argv[i], "-ubounds") == 0) doubounds = 1; else if (strcmp (argv[i], "-regions") == 0) doregions = 1; } if (docropxmfa) assert (doibounds || doubounds); processFile (argv[1]); return 0; } lagan20/src/utils/scorecontigs.c0000644000076500007650000002411410502337062017715 0ustar brudnobrudno00000000000000#include #include #include #include #include #define MAX_SEQ 1024 #define MAX(a,b) ((a)>(b)?(a):(b)) #define MIN(a,b) ((a)<(b)?(a):(b)) #define CNTS_LEN 6 
#define CNTS_A 0 #define CNTS_T 1 #define CNTS_C 2 #define CNTS_G 3 #define CNTS_N 4 #define CNTS_GAP 5 #define STATE_NULL 0 #define STATE_MATCH 1 #define STATE_MISMATCH 2 #define STATE_GAP 3 #define CACHE_SIZE 1000 int PEN_0_MIS, PEN_0_MTC, PEN_0_GAP; int PEN_1_MIS, PEN_1_MTC, PEN_1_GAP; int PEN_TO_0, PEN_TO_1; char* alpha = "ATCGN-."; double scoreMatch = 12; double scoreMismatch = -4; double scoreGapOpen = -80; double cache[CACHE_SIZE]; typedef struct align_res { char *names[MAX_SEQ]; int algnlen; int numseqs; char *data[MAX_SEQ]; } align; typedef struct rangelist_res { int seqlen; int *score; } rangelist; int cntlets(FILE* input, int lettersonly) { int numread=0; char temp[1024]; char currchar = '~'; rewind (input); if (feof(input)) return 0; fgets(temp, 1024, input); if (temp[0] != '>') { fprintf(stderr, "File is not in FASTA format!!\n"); exit(1); } currchar = fgetc(input); while ((currchar != '>') && !feof (input)) { if (!isspace(currchar)) { currchar = toupper(currchar); if (!lettersonly || isalpha (currchar)){ numread++; } } currchar = fgetc(input); } rewind(input); return numread; } int readseq (FILE *input, align *res){ int numread = 0; char temp[1024], currchar, *write; if (feof (input)) return 0; fgets (temp, 1024, input); if (temp[0] != '>'){ fprintf (stderr, "scorealign: File is not in FASTA format!!\n"); exit (1); } res->names[res->numseqs] = (char*) malloc((strlen(temp))*sizeof(char)); strcpy(res->names[res->numseqs], temp+1); *(strchr(res->names[res->numseqs], '\n')) = 0; write = res->data[res->numseqs] = (char *) malloc (sizeof (char) * res->algnlen); assert (write); currchar = fgetc (input); while (numread <= res->algnlen && (currchar != '>') && !feof (input)){ if (!isspace (currchar)){ currchar = toupper (currchar); if (!strchr(alpha, currchar)) currchar = 'N'; write[numread++] = currchar; } currchar = fgetc (input); } if (currchar == '>'){ ungetc (currchar, input); } if (numread != res->algnlen) { fprintf (stderr, "Sequence (%s) of different 
lengths (%d v. %d)!!\n", res->names[res->numseqs], numread, res->algnlen); exit(1); } return 1; } align *readMultial (char *filename){ FILE *alfile; align *res; if (!(alfile = fopen (filename, "r"))){ fprintf (stderr, "scorecontigs: couldn't open alignment file: %s\n", filename); exit (1); } res = (align *) malloc (sizeof (align)); assert (res); res->algnlen = cntlets (alfile, 0); res->numseqs = 0; while (readseq (alfile, res)) res->numseqs++; assert (res->numseqs == 2); fclose (alfile); return res; } inline int getstate (char c, char d){ if (c == '-' || d == '-') return 2; if (c == 'N' || d == 'N') return 3; return c == d; } rangelist *getranges (char *filename, int offs){ FILE *file; align *myal = readMultial (filename); rangelist *r = (rangelist *) malloc (sizeof (rangelist)); int *scores[2], i, j, k, l, m, state, from0, from1, herescore; int *states, len, used, tot; char *traceback[2]; assert (r); file = fopen (filename, "r"); assert (file); r->seqlen = cntlets (file, 1); len = cntlets (file, 0); for (i = 0; i < 2; i++){ scores[i] = (int *) malloc (sizeof (int) * len); assert (scores[i]); traceback[i] = (char *) malloc (sizeof (char) * len); assert (traceback[i]); } for (i = 0; i < len; i++){ state = getstate (myal->data[0][i], myal->data[1][i]); assert (i >= 0 && i < myal->algnlen); if (i <= 5){ scores[0][i] = scores[1][i] = 0; traceback[0][i] = traceback[1][i] = 0; } else { // go to state 0 herescore = (state == 0 ? PEN_0_MIS : (state == 1 ? PEN_0_MTC : (state == 2 ? PEN_0_GAP : 0))); from0 = scores[0][i-1] + herescore; from1 = scores[1][i-1] + herescore + PEN_TO_0; if (from0 > from1){ scores[0][i] = from0; traceback[0][i] = 0; } else { scores[0][i] = from1; traceback[0][i] = 1; } // go to state 1 herescore = (state == 0 ? PEN_1_MIS : (state == 1 ? PEN_1_MTC : (state == 2 ? 
PEN_1_GAP : 0))); from0 = scores[0][i-1] + herescore + PEN_TO_1; from1 = scores[1][i-1] + herescore; if (from0 > from1){ scores[1][i] = from0; traceback[1][i] = 0; } else { scores[1][i] = from1; traceback[1][i] = 1; } } } states = (int *) malloc (sizeof (int) * len); assert (states); states[len - 1] = (scores[0][len - 1] > scores[1][len - 1]) ? 0 : 1; for (i = len - 2; i >= 0; i--) states[i] = traceback[states[i+1]][i+1]; r->score = (int *) malloc (sizeof (int) * r->seqlen); assert (r->score); k = tot = used = 0; for (i = 0; i < len; i++){ if (!states[i]){ if (isalpha (myal->data[0][i])){ r->score[k] = 0; k++; } continue; } used = 1; herescore = l = 0; for (j = i; j < len && states[j]; j++){ if (isalpha (myal->data[0][j])) l++; state = getstate (myal->data[0][j], myal->data[1][j]); herescore += (state == 0 ? PEN_1_MIS : (state == 1 ? PEN_1_MTC : (state == 2 ? PEN_1_GAP : 0))); } tot += herescore; herescore /= l; // fprintf (stderr, "%s: (%d %d) %d %d\n", filename, k + offs, k + l + offs, herescore, r->seqlen); for (m = k; m < k + l; m++) r->score[m] = herescore; k += l; i = j - 1; } // printf ("%d\n", tot); free (states); for (i = 0; i < 2; i++){ free (scores[i]); free (traceback[i]); } if (!used){ free (r->score); free (r); return NULL; } return r; } inline int getdata (rangelist **ranges, int *offs, int j, int i){ i -= offs[j]; if (i >= 0 && i < ranges[j]->seqlen) return ranges[j]->score[i]; return 0; } inline int match (rangelist **ranges, int numContigs, int i, int j, int *offs){ int k; for (k = 0; k < numContigs; k++) if ((getdata (ranges, offs, k, i) != 0) != (getdata (ranges, offs, k, j) != 0)) return 0; return 1; } inline int allzeroes (rangelist **ranges, int numContigs, int pos, int *offs){ int i; for (i = 0; i < numContigs; i++) if (getdata (ranges, offs, i, pos) != 0) return 0; return 1; } inline void print (int start, int end, int *score, int numContigs){ int j; printf ("(%7d %7d)", start, end); for (j = 0; j < numContigs; j++) printf (" %7d", 
score[j]); printf ("\n"); } void printRanges (rangelist **ranges, int numContigs, int seqLen, int *offs){ int i, j, start = 0, end; int *score = (int *) malloc (sizeof (int) * numContigs); int *pattern = (int *) malloc (sizeof (int) * numContigs); assert (score); assert (pattern); printf ("numContigs = %d\n", numContigs); printf ("seqLen = %d\n", seqLen); for (i = 0; i < numContigs; i++) score[i] = 0; for (i = 0; i <= seqLen; i++) if (!allzeroes (ranges, numContigs, i, offs)) break; if (i > 0) print (0, i - 1, score, numContigs); start = end = i; while (i <= seqLen){ if (i != seqLen && match (ranges, numContigs, start, i, offs)){ end = i; for (j = 0; j < numContigs; j++){ score[j] += getdata (ranges, offs, j, i); } } else if (i == seqLen || !allzeroes (ranges, numContigs, i, offs)){ print (start, end, score, numContigs); for (j = 0; j < numContigs; j++) score[j] = 0; if (end < i - 1) print (end + 1, i - 1, score, numContigs); start = end = i; } i++; } free (score); free (pattern); } inline double scoregap (int gaplen){ if (gaplen == 0) return 0; //return (gaplen - 1) * -1 - 50; return (log (gaplen) / log (10) + 1) * scoreGapOpen; } double scorealign (align *myal, int a, int b){ int i, gaplen = 0; double score = 0; double best = 0; char c, d; // compensate for lagan bug for (i = 10; i < myal->algnlen; i++){ c = myal->data[a][i]; d = myal->data[b][i]; if (c == '-' && d == '-') continue; if (c == '-' || d == '-') gaplen++; else { if (gaplen != i){ if (gaplen < CACHE_SIZE) score += cache[gaplen]; else score += scoregap (gaplen); } gaplen = 0; if (c == d) score += scoreMatch; else score += scoreMismatch; if (score > best) best = score; if (score < 0) score = 0; } } return best; } void analyze (align *myal){ int i, j, k; double score = 0; for (i = 0; i < CACHE_SIZE; i++) cache[i] = scoregap (i); for (i = 0; i < myal->numseqs; i++) for (j = i + 1; j < myal->numseqs; j++) score += scorealign (myal, i, j); printf ("%d\n", (int) score); } int main(int argc, char** argv) { 
FILE *filelist, *cfile; char contignames[MAX_SEQ][1024]; rangelist *ranges[MAX_SEQ]; int numseqs, i, j; int offs1[MAX_SEQ], offs2[MAX_SEQ], off[MAX_SEQ], num[MAX_SEQ]; if (argc != 5) { fprintf(stderr, "Usage:\n\nscorecontigs file_list fasta_file contig_list cons_rate\n"); exit (1); } PEN_1_MIS = -(25 * atoi(argv[4])) / (101 - atoi (argv[4])); PEN_1_MTC = 25; PEN_1_GAP = PEN_1_MIS / 2; PEN_0_MIS = 0; PEN_0_MTC = 0; PEN_0_GAP = 0; PEN_TO_0 = -250; //-300; PEN_TO_1 = -350; //-400; if (!(filelist = fopen (argv[1], "r"))) { fprintf(stderr, "scorecontigs: Couldn't open alignment file: %s\n", argv[1]); exit (1); } numseqs = 0; while (!feof (filelist)){ if (fscanf (filelist, "%d %d %d %s\n", &(num[numseqs]), &(offs1[numseqs]), &(offs2[numseqs]), &(contignames[numseqs])) == 4){ numseqs++; } } fclose (filelist); if (numseqs == 0){ fprintf (stderr, "scorecontigs: No contigs found.\n"); exit (1); } cfile = fopen (argv[3], "w"); assert (cfile); j = 0; for (i = 0; i < numseqs; i++){ ranges[j] = getranges (contignames[i], offs1[i]); if (ranges[j]){ fprintf (cfile, "%d %d %d %s\n", num[i], offs1[i], offs2[i], contignames[i]); off[j] = offs1[i]; j++; } } fclose (cfile); filelist = fopen (argv[2], "r"); assert (filelist); printRanges (ranges, j, cntlets (filelist, 1), off); fclose (filelist); } lagan20/src/utils/seqmerge.c0000644000076500007650000000136010502337062017021 0ustar brudnobrudno00000000000000#include #include #include #include #include int main (int argc, char** argv){ FILE *file; int i, written = 0; char buffer[1024], ch; if (argc == 1){ fprintf (stderr, "Usage:\n\nseqmerge fasta_file1 fasta_file2 ...\n"); exit (1); } for (i = 1; i < argc; i++){ file = fopen (argv[i], "r"); assert (file); fgets (buffer, 1024, file); if (i == 1) printf ("%s", buffer); while (!feof (file)){ ch = fgetc (file); if (ch == '>') break; if (isalpha (ch) || ch == '.' 
|| ch == '-'){ printf ("%c", ch); written++; if (written % 60 == 0) printf ("\n"); } } fclose (file); } if (written ^ 60 != 0) printf ("\n"); } lagan20/src/utils/Sequence.h0000644000076500007650000001327110502337062016772 0ustar brudnobrudno00000000000000// Sequence.h // ---------- // Class file to hold a sequence object. #ifndef SEQUENCE_H #define SEQUENCE_H #include #include "SafeVector.h" using namespace std; class Sequence { private: // Read header of MFA/XMFA file. bool readHeader (ifstream &infile, bool &isXMFA){ string header; while (true){ // check to make sure that the there is more data in the file if (infile.fail() || infile.eof()) return false; // get new header line getline (infile, header); // check that header line is not empty if (header.length() != 0) break; } // check for appropriate header if (header[0] != '>') return false; // attempt to read XMFA format isXMFA = true; char buffer[1024]; int numread = sscanf (header.c_str(), ">%d:%d-%d %c %s", &id, &startCoord, &endCoord, &direction, buffer); // if basic requirements for XMFA not met, then MFA file if (numread < 4){ comment = header.substr(1); isXMFA = false; } // basic requirements for XMFA met, no comments else if (numread < 5) comment = ""; // otherwise full XMFA format else comment = buffer; return true; } protected: SafeVector data; // character data for the sequence bool isValid; // is the sequence valid? int length; // length of the sequence int id; // sequence ID (for XMFA) int startCoord; // sequence position of first character int endCoord; // sequence position of last character char direction; // + or - string comment; // comments public: Sequence (){ isValid = true; length = 1; data.resize (1, ' '); startCoord = 1; endCoord = 1; direction = '+'; } // Constructor. Reads in a sequence from the input file. 
Sequence (ifstream &infile){
  // Reads one sequence record (header line followed by residue characters)
  // from `infile`.  The stream is left positioned at the '>' or '=' that
  // starts the next record, or at end of file.  Callers must test fail()
  // afterwards: the object is only usable when a header was read AND at
  // least one residue character followed it.

  bool isXMFA = true;

  // The sequence starts out invalid.  Every scalar member gets a defined
  // value up front so that a failed parse never leaves indeterminate data
  // behind (e.g. when the object is copied or inspected in a debugger).
  isValid = false;
  length = 0;
  id = 0;
  startCoord = 0;
  endCoord = 0;
  direction = '+';

  // The header must be read before any character data.
  if (readHeader (infile, isXMFA)){

    // Dummy character for position zero; real data is 1-based.
    data.push_back ('@');

    // Consume characters until the stream ends or the next record begins.
    char ch;
    while (infile.get (ch)){

      // '>' starts the next sequence, '=' ends an XMFA block: push it back
      // so the next reader sees it.
      if (ch == '>' || ch == '='){
        infile.unget();
        break;
      }

      // Skip white space.
      if (ch == ' ' || ch == '\f' || ch == '\n' ||
          ch == '\r' || ch == '\t' || ch == '\v') continue;

      // Fold lowercase letters to uppercase.
      if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A';

      // Only letters, contig breaks ('.') and gaps ('-') are legal.
      assert ((ch >= 'A' && ch <= 'Z') || ch == '.' || ch == '-');

      data.push_back (ch);
    }

    // Anything beyond the dummy character means real data was read.
    if (data.size() > 1){
      isValid = true;
      length = static_cast<int> (data.size()) - 1;

      // Plain MFA input carries no coordinates; synthesise XMFA-style ones.
      if (!isXMFA){
        id = 0;
        startCoord = 1;
        endCoord = length;
        direction = '+';
      }
    }
  }

  // Sanity checks on whatever was parsed from the header.
  if (isValid){
    assert (id >= 0);
    assert (startCoord >= 0);
    assert (endCoord >= 0);
    assert (startCoord <= endCoord);
    assert (direction == '+' || direction == '-');
    assert (length > 0);
  }
}

// Constructor.  Gets sequence from array data.
Sequence (SafeVector data, string comment) : data(data), comment(comment) { length = data.size() - 1; id = 0; startCoord = 1; endCoord = length; direction = '+'; isValid = true; comment = ""; assert (length > 0); } SafeVector getData (){ SafeVector temp; for (int i = 1; i <= length; i++) temp.push_back (data[i]); return temp; } const string getComment () const { return comment; } void setLength (int num){ if (num > length){ length = num; endCoord = length; data.resize(length+1, ' '); } } SafeVector::iterator getIterator (){ return data.begin(); } const char operator[] (int index) const { assert (index >= 1 && index <= length); return data[index]; } // Used to check for sequence validity after construction. const bool fail () const { return !isValid; } // Return the length of the sequence. const int getLength () const { assert (isValid); return length; } const char getStrand () const { assert (isValid); return direction; } const int getStartCoord () const { assert (isValid); return startCoord; } const int getEndCoord () const { assert (isValid); return endCoord; } // Print XMFA header only. void writeXMFAHeader (ostream &outfile) const { assert (isValid); outfile << '>' << id << ':' << startCoord << '-' << endCoord << ' ' << direction << ' ' << comment << endl; } // Return sequence ID. const int getID () const { assert (isValid); return id; } // Set sequence ID. void setID (int id) { assert (isValid); this->id = id; } // Writes sequence to XMFA format. 
void writeToXMFA (ostream &outfile, int numColumns) const { assert (isValid); // print XMFA header outfile << ">" << comment << endl; // outfile << '>' << id << ':' << startCoord << '-' << endCoord << ' ' << direction << ' ' << comment << endl; // print character data for (int i = 1; i <= length; ++i){ outfile << data[i]; if (i % numColumns == 0) outfile << endl; } if (length % numColumns != 0) outfile << endl; } }; #endif lagan20/src/Utils.pm0000644000076500007650000003072510502337063015353 0ustar brudnobrudno00000000000000#!/usr/bin/env perl package Utils; require 5.000; use strict; use Exporter; use Cwd; use IO::File; use POSIX qw(setsid); use Sys::Syslog qw(:DEFAULT setlogsock); sub Trim( @ ); sub Lock_File( $ ; $ $ $ ); sub Unlock_File( $ ); sub Write_Log( $ $ ; $ $ ); sub Parse_Filename( $ ); sub Get_Abs_Path( $ ); sub Expand_Path( $ ); sub Get_Random_Key( ; $ ); sub Hex2Ascii( $ ); sub Ascii2Hex( $ ); sub Get_Config_Record( $ $ ); sub Round( $ ); sub Set_Log( $ $ ); sub Log( $ $ ); sub Min( $ $ ); sub Max( $ $ ); sub Reg_Diff( $ $ ; $ $ $ $ $ ); sub Reg_Rem_Overlap( $ ; $ $ $ ); sub Reg_Sort( $ ; $ $ $ ); sub Reg_Intersect( $ $ ; $ $ $ $ $ ); sub Reg_Merge( $ ; $ $ $ ); use vars qw(@ISA @EXPORT $VERSION $JOB $Error $Syslog $Facility $Msg_Prefix); @ISA = qw(Exporter); @EXPORT = qw(Trim Lock_File Unlock_File Write_Log Parse_Filename Get_Abs_Path Expand_Path Hex2Ascii Ascii2Hex Get_Config_Record Get_Random_Key Round Set_Log Log Min Max Reg_Diff Reg_Rem_Overlap Reg_Sort Reg_Intersect Reg_Merge redirect_err2log openlogs safe_glob daemon wr_log wr_err start_watcher confirm $JOB); my $Id = '$Id: Utils.pm,v 1.21 2005/01/07 23:08:59 poliakov Exp $'; ($VERSION) = ($Id =~ /,v\s+(\d+\S+)/o); $JOB = '^(\S+)\@(\S+?)_(\d{4})(?:_(.+)|)$'; $Error = 0; $Syslog = 0; $Facility = "user"; $Msg_Prefix = undef; my $E_FORK = "cannot fork"; my @LOG_FILE = (); my %Locks = (); sub Trim( @ ) { for (my $i = 0; $i <= $#_; ++$i) { $_[$i] =~ s/^\s+//; $_[$i] =~ s/\s+$// } } sub Lock_File( $ 
; $ $ $ ) { my ($file, $retry, $timeout, $max_mtime) = @_; my ($lock_fh, $start_time, $mtime); if (!$file || ($file =~ /\/$/o)) { $Error = "Invalid filename"; return 0; } $file = Get_Abs_Path("$file.lock"); if (exists($Locks{$file})) { $Error = "Already locked"; return 1; } if (!-w (Parse_Filename($file))[0]) { $Error = "Permission denied"; return 0; } if (!defined($retry)) { $retry = 1; } if (!defined($timeout)) { $timeout = 1200; } if (!defined($max_mtime)) { $max_mtime = ($timeout > 0) ? int($timeout / 2) : 0; } $start_time = time(); LOCK: { if (!($lock_fh = IO::File->new($file, O_RDWR|O_CREAT|O_EXCL))) { if (!$retry || (($timeout > 0) && ((time() - $start_time) > $timeout))) { $Error = "Locked by someone else"; return 0; } if ($max_mtime > 0) { $mtime = (stat($file))[9]; if ($mtime && ((time() - $mtime) > $max_mtime)) { unlink($file); } } redo LOCK; } } $lock_fh->close(); $Locks{$file} = 1; return 1; } sub Unlock_File( $ ) { my ($file) = @_; if (!$file) { $Error = "Invalid filename"; return 0; } $file = Get_Abs_Path("$file.lock"); if (!exists($Locks{$file})) { $Error = "Not locked"; return 0; } if (!unlink($file)) { $Error = "Cannot unlock"; return 0; } delete($Locks{$file}); return 1; } { my $Uname; foreach my $dir ('/bin', '/sbin', '/usr/bin', '/usr/sbin') { -x "$dir/uname" and $Uname = "$dir/uname", last; } my $Host = $Uname ? 
`$Uname -n` : 'localhost'; chomp($Host); ($Host) = ($Host =~ /^([^\.]+)(\..*)?$/); sub Write_Log( $ $ ; $ $ ) { no strict "refs"; my ($log_file, $msg, $name, $pid) = @_; my $error = 0; my $date; local *LOG; if (!defined($log_file) || !defined($msg)) { return 0; } if (*{$log_file}{IO}) { *LOG = *{$log_file}{IO}; } elsif ($log_file eq '/dev/null') { return 1; } else { if (!Lock_File($log_file)) { return 0; } if (!open(LOG, ">> $log_file")) { $error = 1; } } if (!$error) { chomp($msg); $date = localtime(time()); if (!$name) { $name = $0; } if (!$pid) { $pid = $$; } if (!print LOG "$date $Host $name\[$pid\]: $msg\n") { $error = 1; } if (!*{$log_file}{IO}) { close(LOG); } } if ($error && $!) { $Error = "$!"; } if (!*{$log_file}{IO}) { Unlock_File($log_file); } return !$error; }} sub Parse_Filename( $ ) { my ($name) = @_; my ($last_slash_pos, $dir, $file); if (!defined($name)) { return (); } $last_slash_pos = rindex($name, "/"); if ($last_slash_pos >= 0) { $dir = substr($name, 0, $last_slash_pos + 1); $file = substr($name, $last_slash_pos + 1); } else { $dir = ""; $file = $name; } return ($dir, $file); } sub Expand_Path( $ ) { my ($path) = @_; my $home_dir; $path && ($path =~ /^~/o) or return $path; $path =~ /^~([^\/]*)(.*)$/o; $home_dir = $1 ? (getpwnam($1))[7] : ($ENV{"HOME"} || $ENV{"LOGDIR"} || (getpwuid($>))[7]); defined($home_dir) and $path = "$home_dir$2"; return $path; } sub Get_Abs_Path( $ ) { my ($path) = @_; defined($path) or return $path; $path = Expand_Path($path); $path =~ /^\//o or $path = getcwd() . "/$path"; $path =~ s(/{2,})(/)g; # get rid of "/./" while ($path =~ /^(.*?)\/\.(?:|\/(.*))$/o) { $path = "$1/" . ($2 ? $2 : ""); } # get rid of "/../" while ($path =~ /^(((?:.*?\/)*?)[^\/]+){0,1}?\/\.\.(?:|\/(.*))$/o) { $path = ($1 ? $2 : "/") . ($3 ? $3 : ""); } return $path; } { my @Chars = ("A" .. "Z", "a" .. "z", 0 .. 
9); srand(); sub Get_Random_Key( ; $ ) { my ($len) = @_; if (!defined($len) || ($len !~ /^\d+$/o) || ($len < 2) || ($len > 1024)) { $len = 8; } return join("", @Chars[map {rand @Chars } (1 .. 8)]); }} sub Hex2Ascii( $ ) { my ($str) = @_; if ($str) { $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; } return $str; } { my $a2h = { "\t" => "%29", "+" => "%2B", "," => "%2C", "." => "%2E", ";" => "%3B", "/" => "%2F", "?" => "%3F", ":" => "%3A", "@" => "%40", "=" => "%3D", "&" => "%26", " " => "%20", "<" => "%3C", ">" => "%3E", "\"" => "%22", "%" => "%25", "#" => "%23", "[" => "%5B", "]" => "%5D", "{" => "%7B", "}" => "%7D", "|" => "%7C", "\\" => "%5C", "^" => "%5E", "~" => "%7E", "`" => "%60"}; sub Ascii2Hex( $ ) { my ($str) = @_; my $new_str = ""; if (!$str) { return $str; } foreach my $char (split(//, $str)) { if (exists($a2h->{$char})) { $char = $a2h->{$char}; } $new_str .= $char; } return $new_str; }} sub Get_Config_Record( $ $ ) { my ($conf_file, $rec) = @_; my ($db, $field, $value); my @result = (); if (!($db = Registry->New($conf_file, "r", 1))) { $Error = "$Registry::Error", return (); } if (!$db->Record_Exists($rec)) { $Error = qq("$rec" record not found); return (); } foreach my $field (qw(dir users log)) { if (!($value = Expand_Path($db->Get_Val($rec, $field)))) { if ($field eq "log") { $value = ""; } else { $Error = qq("$field" field of "$rec" record is missing), return (); } } elsif ($value !~ /^\//o) { $Error = qq("$field" field of "$rec" record should be absolute path); return (); } push(@result, $value); } foreach my $field (qw(max_down grace_period)) { if (!($value = $db->Get_Val($rec, $field)) || ($value !~ /^\d+$/o)) { $value = 0; } push(@result, $value); } return @result; } sub Round( $ ) { my ($num) = @_; return int($num + 0.5); } sub Log( $ $ ) { my ($log_num, $msg) = @_; (defined($log_num) && ($log_num >= 0) && $LOG_FILE[$log_num]) and Write_Log($LOG_FILE[$log_num], $msg); } sub Set_Log( $ $ ) { my ($log_num, $file) = @_; 
(defined($log_num) && ($log_num >= 0) && $file) and $LOG_FILE[$log_num] = $file; }

# ---------------------------------------------------------------------------
# Region helpers.
#
# A region list is a reference to an array of records (array refs).  Every
# record stores its start coordinate at index $s and its end coordinate at
# index $e; both indices are optional arguments and default to 0 and 1.
# Records are copied before being changed, so the input lists are never
# modified in place.
# ---------------------------------------------------------------------------

# Return the smaller of two numbers (the second argument on a tie).
sub Min( $ $ ) {
    my ($i, $j) = @_;
    return ($i < $j) ? $i : $j;
}

# Return the larger of two numbers (the second argument on a tie).
sub Max( $ $ ) {
    my ($i, $j) = @_;
    return ($i > $j) ? $i : $j;
}

# Reg_Diff(\@regs1, \@regs2 [, $strict, $s1, $e1, $s2, $e2])
#
# Subtract the regions in @regs2 from the regions in @regs1 and return a
# reference to the list of remaining pieces.  Each piece is a copy of the
# @regs1 record it came from with only its start/end fields rewritten.
# When either list is false, $regs1 itself is returned.
# With $strict set, a piece is kept only if its start is below its end.
#
# NOTE(review): the inner loop stops at the first @regs2 record that starts
# beyond the current end, so @regs2 is presumably expected to be sorted by
# start -- confirm against the callers.
sub Reg_Diff( $ $ ; $ $ $ $ $ ) {
    my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_;
    my (@new_regs, $start, $end, $new_reg);

    $regs1 && $regs2 or return $regs1;
    $s1 ||= 0;
    defined($e1) or $e1 = 1;
    $s2 ||= 0;
    defined($e2) or $e2 = 1;

    for (my $i = 0; $i < @$regs1; ++$i) {
        $start = $$regs1[$i][$s1];
        $end   = $$regs1[$i][$e1];
        for (my $j = 0; $j < @$regs2; ++$j) {
            my ($cut_start, $cut_end) = ($$regs2[$j][$s2], $$regs2[$j][$e2]);

            $cut_start > $end and last;     # cut lies entirely to the right
            $cut_end < $start and next;     # cut lies entirely to the left

            # Cut covers everything that is left of this region.
            if (($cut_start <= $start) && ($cut_end >= $end)) {
                undef($start);
                last;
            }
            # Cut removes the right-hand end.
            if (($cut_start > $start) && ($cut_end >= $end)) {
                $end = $cut_start - 1;
                last;
            }
            # Cut removes the left-hand end.
            if (($cut_start <= $start) && ($cut_end < $end)) {
                $start = $cut_end + 1;
                next;
            }
            # Cut falls inside: emit the part before it, continue after it.
            if (($start < ($cut_start - 1)) || !$strict) {
                $new_reg = [@{$$regs1[$i]}];
                $$new_reg[$s1] = $start;
                $$new_reg[$e1] = $cut_start - 1;
                push(@new_regs, $new_reg);
            }
            $start = $cut_end + 1;
        }
        !defined($start) || ($start > $end) and next;
        if (($start < $end) || !$strict) {
            $new_reg = [@{$$regs1[$i]}];
            $$new_reg[$s1] = $start;
            $$new_reg[$e1] = $end;
            push(@new_regs, $new_reg);
        }
    }
    return \@new_regs;
}

# Reg_Rem_Overlap(\@regs [, $strict, $s, $e])
#
# Return a copy of the list in which no record overlaps its predecessor: a
# record that ends at or before the previous record's end is dropped, any
# other overlapping record has its start moved to one past the previous
# end.  With $strict set, records whose start is not below their end are
# removed as well.  Only neighbouring records are compared.
sub Reg_Rem_Overlap( $ ; $ $ $ ) {
    my ($regs, $strict, $s, $e) = @_;
    my (@new_regs);

    $regs or return $regs;
    $s ||= 0;
    defined($e) or $e = 1;

    @new_regs = map { [@$_] } @$regs;
    for (my $i = 0; $i < @new_regs; ++$i) {
        if (($i < $#new_regs) && ($new_regs[$i + 1][$s] <= $new_regs[$i][$e])) {
            if ($new_regs[$i + 1][$e] <= $new_regs[$i][$e]) {
                # Successor is swallowed completely: drop it and look at
                # the same record again.
                splice(@new_regs, $i + 1, 1);
                --$i;
                next;
            }
            $new_regs[$i + 1][$s] = $new_regs[$i][$e] + 1;
        }
        ($new_regs[$i][$s] < $new_regs[$i][$e]) || !$strict and next;
        splice(@new_regs, $i, 1);
        --$i;
    }
    return \@new_regs;
}

# Reg_Sort(\@regs [, $rev, $s, $e])
#
# Return a reference to a new array holding the same records ordered by
# start and then by end, ascending, or descending when $rev is true.
sub Reg_Sort( $ ; $ $ $ ) {
    my ($regs, $rev, $s, $e) = @_;
    my (@new_regs);

    $regs or return $regs;
    $s ||= 0;
    defined($e) or $e = 1;

    if ($rev) {
        @new_regs = sort { ($$b[$s] <=> $$a[$s]) || ($$b[$e] <=> $$a[$e]) } @$regs;
    }
    else {
        @new_regs = sort { ($$a[$s] <=> $$b[$s]) || ($$a[$e] <=> $$b[$e]) } @$regs;
    }
    return \@new_regs;
}

# Reg_Intersect(\@regs1, \@regs2 [, $strict, $s1, $e1, $s2, $e2])
#
# Return the parts of @regs1 that are also covered by @regs2, computed as
# regs1 - (regs1 - regs2).  Returns undef when either list is false.
sub Reg_Intersect( $ $ ; $ $ $ $ $ ) {
    my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_;

    $regs1 && $regs2 or return undef;
    $s1 ||= 0;
    defined($e1) or $e1 = 1;
    $s2 ||= 0;
    defined($e2) or $e2 = 1;

    return Reg_Diff($regs1,
                    Reg_Diff($regs1, $regs2, $strict, $s1, $e1, $s2, $e2),
                    $strict, $s1, $e1, $s1, $e1);
}

# Reg_Merge(\@regs [, $strict, $s, $e])
#
# Return a copy of the list in which a record is fused with its successor
# whenever the successor starts exactly one past the record's end.  With
# $strict set, records whose start is not below their end are then removed.
sub Reg_Merge( $ ; $ $ $ ) {
    my ($regs, $strict, $s, $e) = @_;
    my (@new_regs);

    $regs or return $regs;
    $s ||= 0;
    defined($e) or $e = 1;

    @new_regs = map { [@$_] } @$regs;
    for (my $i = 0; $i < @new_regs; ++$i) {
        if (($i < $#new_regs) && ($new_regs[$i + 1][$s] == ($new_regs[$i][$e] + 1))) {
            $new_regs[$i][$e] = $new_regs[$i + 1][$e];
            splice(@new_regs, $i + 1, 1);
            --$i;       # re-examine the grown record against its new successor
            next;
        }
    }
    @new_regs = grep { ($$_[$s] < $$_[$e]) || !$strict } @new_regs;
    return \@new_regs;
}

# safe_glob($regexp, $dir)
#
# List the entries of directory $dir (default ".") whose names match the
# regular expression $regexp (default ".*").  Returns the names in list
# context and their count in scalar context; returns nothing when the
# directory cannot be opened.
sub safe_glob {
    my ($regexp, $dir) = @_;
    my (@files);
    local (*DIR);

    $dir ||= ".";
    $regexp ||= ".*";
    opendir(DIR, $dir) or return;
    @files = grep { /$regexp/ } readdir(DIR);
    closedir(DIR);
    return wantarray() ? @files : scalar(@files);
}

# ---------------------------------------------------------------------------
# Logging and process helpers.
# ---------------------------------------------------------------------------

# Remember $facility as the syslog facility and send STDERR to the logger.
sub redirect_err2log {
    my ($facility) = @_;

    $Facility = $facility;
    stderr2log();
}

# Re-open STDERR first on /dev/null and then on a pipe to logger(1) using
# priority "$Facility.err" and a "program[pid]" tag, and make it unbuffered.
sub stderr2log {
    my ($oldfh);

    open(STDERR, "> /dev/null");
    open(STDERR, "| logger -p $Facility.err -t '$0\[$$\]'");
    $oldfh = select(STDERR);
    $| = 1;
    select($oldfh);
}

# Route STDERR to the logger and open syslog for wr_log(); an optional
# $facility replaces the current one.  Sets $Syslog so that wr_log() uses
# syslog from now on.
sub openlogs {
    my ($facility) = @_;

    $facility and $Facility = $facility;
    stderr2log();
    setlogsock("unix");
    openlog($0, "pid", $Facility);
    $Syslog = 1;
}

# Detach from the caller: the parent exits with status 0, the child starts
# a new session, drops STDIN, points STDOUT at /dev/null and opens the logs.
# Dies (after reporting through wr_err) when fork fails.
sub daemon {
    my ($facility) = @_;
    my ($pid);

    if ($pid = fork()) {
        exit(0);
    }
    elsif (!defined($pid)) {
        wr_err("$E_FORK: $!");
        die;
    }
    else {
        setsid();
        close(STDIN);
        close(STDOUT);
        open(STDOUT, "> /dev/null");
        openlogs($facility);
    }
}

# start_watcher(\&watcher, $facility, @params)
#
# Fork a companion process.  The parent returns immediately; the child
# detaches like daemon(), appends "_watcher" to its process name and calls
# the watcher with the parent's pid followed by @params.
# NOTE(review): nothing stops the child after the watcher returns, so it
# falls back into the caller's code -- confirm watchers never return.
sub start_watcher {
    my ($watcher, $facility, @params) = @_;
    my ($pid, $parent);

    $parent = $$;
    if ($pid = fork()) {
        return;
    }
    elsif (!defined($pid)) {
        wr_err("$E_FORK: $!");
        die;
    }
    else {
        setsid();
        close(STDIN);
        close(STDOUT);
        open(STDOUT, "> /dev/null");
        $0 .= "_watcher";
        openlogs($facility);
        $watcher->($parent, @params);
    }
}

# Write one message, prefixed by the result of $Msg_Prefix when that hook is
# set, to syslog (level "info") once openlogs() has run, else to STDOUT.
sub wr_log {
    my $msg = shift;

    chomp($msg);
    # The "&$code" form without parentheses hands the remaining @_ to the
    # hook; keep it that way.
    $msg = ( $Msg_Prefix ? &$Msg_Prefix : "") . $msg;
    if ($Syslog) {
        syslog("info", "%s", $msg);
    }
    else {
        print "$msg\n";
    }
}

# Write one message, with the optional $Msg_Prefix prefix, to STDERR.
# Always returns 1.
sub wr_err {
    my $msg = shift;

    chomp($msg);
    print STDERR (( $Msg_Prefix ? &$Msg_Prefix : ""), "$msg\n");
    return 1;
}

# Print $msg and read one line from standard input.  Returns 1 when the
# answer is "y" or "yes" (any case), otherwise 0.  End of input counts as
# a refusal.
sub confirm {
    my ($msg) = @_;
    my ($ans);

    print $msg;
    $ans = <STDIN>;             # the read operator had been lost here
    defined($ans) or return 0;
    chomp($ans);
    return ($ans =~ /^(y|yes)$/io) ? 1 : 0;
}

# Remove every file still registered in %Locks when the program ends.
END {
    foreach my $lock (keys(%Locks)) {
        unlink($lock);
    }
}

1;
# NOTE(review): the text up to "#!" on the next line is tar header residue
# for the next archive member, not Perl.
lagan20/src/xmfa2mfa.pl0000755000076500007650000000315210502337063015750 0ustar brudnobrudno00000000000000#!/usr/bin/perl
use strict;

$0 = rindex($0, "/") > -1 ? substr($0, rindex($0, "/")+1) : $0;

my (@lines, @filt_lines);
my ($line, $line_in, $type);
my $mode = ($ARGV[0] eq "1" ? "M1" : ($ARGV[0] eq "2" ? "M2" : die("$0: Invalid base genome argument (expected 1 or 2)")));
die("$0: LAGAN_DIR not defined. 
Stopped") unless defined $ENV{"LAGAN_DIR"};

# Collect alignment records from standard input.  A record is every line up
# to and including a line that starts with "=" and ends in DM, M1 or M2;
# it is kept when its type is DM or equals the requested mode.
# ($ARGV[0] is the mode and is never shifted, so the input has to come from
# STDIN rather than from the files named on the command line.)
while (<STDIN>) {
    $line_in = $_;
    if ($line_in =~ /^\=.*(DM|M1|M2)$/) {
        $type = $1;
        $line .= $line_in;
        $lines[$#lines+1] = $line if $type eq "DM" or $type eq $mode;
        undef $line;
        undef $type;
    }
    else {
        $line .= $line_in;
    }
}

# In M2 mode the two sequences of every record change places; records on
# the "-" strand additionally have both sequences reversed and re-wrapped
# at 80 columns.
foreach my $line (@lines) {
    if ($mode eq "M2") {
        # A record that does not match used to go on silently with whatever
        # the capture variables still held; stop instead.
        my ($head1, $strand1, $seq1, $head2, $strand2, $seq2, $foot) =
            ($line =~ /(\>[^\s\n]+\s([\+\-])[^\n]+)\n(.+)\n(\>[^\s\n]+\s([\+\-])[^\n]+)\n(.+)\n(\=.+?)\n/s)
            or die("$0: Malformed alignment record. Stopped");
#       $line =~ /(\>[^\s\n]+\s([\+\-])[^\n]+)\n([^\n]+)\n(\>[^\s\n]+\s([\+\-])[^\n]+)\n([^\n]+)\n(\=.+?\n)/s;
        die if $strand1 ne $strand2;
        if ($strand1 eq "-") {
            $seq1 =~ s/\n//g;
            $seq2 =~ s/\n//g;
            $seq1 = reverse($seq1);
            $seq2 = reverse($seq2);
            $seq1 =~ s/(.{80})/$1\n/g;
            $seq2 =~ s/(.{80})/$1\n/g;
        }
        $line = $head2."\n".$seq2."\n".$head1."\n".$seq1."\n".$foot."\n";
    }
    push @filt_lines, $line;
}

# Hand the filtered records to Glue through files in the current directory
# and relay what it wrote to stdout and stderr.
open(OUT, "> tmp.xmfa") or die("$0: Cannot open tmp.xmfa for writing: $!");
foreach my $line (@filt_lines) {
    print OUT $line;
}
close OUT or die("$0: Cannot write tmp.xmfa: $!");

system($ENV{"LAGAN_DIR"}."/utils/Glue tmp.xmfa > glue.out 2> glue.err");

open(IN, "< glue.out") or die("$0: Cannot open glue.out for reading: $!");
my @glue_out = <IN>;
close IN;

open(IN, "< glue.err") or die("$0: Cannot open glue.err for reading: $!");
my @glue_err = <IN>;
close IN;

unlink("tmp.xmfa");
unlink("glue.out");
unlink("glue.err");

print STDOUT @glue_out;
print STDERR @glue_err;
# NOTE(review): the text up to "#!" on the next line is tar header residue
# for the next archive member, not Perl.
lagan20/supermap.pl0000755000076500007650000021521310502337064015320 0ustar brudnobrudno00000000000000#!/usr/bin/perl

# Supermap: Piecewise monotonic alignment map generator for Shuffle-LAGAN
# Author: Andrey Kislyuk (kislyuk@ocf.berkeley.edu)

package Supermap;
require 5.005;
my ($VERSION) = ('$Id: supermap.pl,v 1.50 2005/06/15 22:40:04 kislyuk Exp $' =~ /,v\s+(\d+\S+)/o);

# Default constant values
my $overlap_factor = 0.8;	# Aligns will be discarded if another align overlaps them by this factor or more in both seqs and has the same orientation
my $max_asym = 10;		# Chains will be formed only if the resulting region's lengths differ by at most this factor
my $min_seq_score;		# All aligns for sequences with this total score will be
discarded. See getMinSeqScore my $max_expand_len = 30000; # Aligns will be expanded or contracted on both sides on both strands by this amount up to the total length below my $expand_factor = 4; # When one of an align's sequences is constrained in its expansion by a neighbor/start/end, the other one will be expanded by this times more than the first one my $max_chainlen = 1500000; # Aligns will not be joined if the total length on either strand exceeds this. Set 0 to disable (no chain length limit) my $max_job_size = 50000; # Maximum job size, in blat hits, for chunking when running glocal in parallel my $erode_align = 15; # Amount by which to erode the coords of each align loaded (to avoid overlap problems when chaining) my ($c1, $c2, $c3, $c4) = (100, 50, 400, 25); # BLAT->CHAOS score conversion parameters #my $max_dist_y = 10000; # Join x-monotonic into same single-chain only if at most that apart in y-species. my $default_lagan_dir = "/home/genome/glocal"; my $glocal_name = (0 ? "SLAGAN" : "glocal"); use Getopt::Long; use File::Path; use File::Copy; use Cwd; use IPC::Open2; use IO::Handle; #use Carp; use strict; use warnings; no warnings "uninitialized"; sub main(); sub init(); sub getSeqSizes($$$); sub prepareHits(); sub runSLAGAN(); sub reprintInputHits($$$); sub processResults(); sub removeSLAGANOutput(); sub seqBelowMinScore($); sub alignHashID($); sub printChainToTemp($$$$); sub chainBase1Hits($$); sub chainBase2Hits($); sub load2MHashes($); sub loadBase2Hashes($); sub postProcessRegions(); sub workerRun($$$$); sub dequeueClustJobs($); sub get_all_seqs($$); sub isBLAT($); sub useIf($$); sub writeSizes($$); sub getMinSeqScore($); sub checkAlignCoords($); sub expandSeq1($$); sub expandSeq2($$); sub finalExpand($$); sub expSeq1Reg($$$$$); sub expSeq2Reg($$$$$); sub finalExpReg($$$$$); # array index constants use constant START1 => 0; use constant END1 => 1; use constant START2 => 2; use constant END2 => 3; use constant SEQ1 => 4; use constant SEQ2 => 5; use 
constant ORIENT => 6; use constant ORIGIN => 7; use constant SCORE => 8; use constant TOTSC => 9; use constant HASHID => 10; use constant FLIPPED=> 11; use constant CHALO1 => 12; use constant CHAHI1 => 13; use constant CHALO2 => 14; use constant CHAHI2 => 15; use constant CHALO1E=> 16; use constant CHAHI1E=> 17; use constant CHALO2E=> 18; use constant CHAHI2E=> 19; #use constant PREV1 => 8; use constant NEXT1 => 9; #use constant PREV2 => 10; use constant NEXT2 => 11; #use constant OSTART1=> 12; use constant OEND1 => 13; #use constant OSTART2=> 14; use constant OEND2 => 15; $SIG{'INT'} = $SIG{'QUIT'} = $SIG{'HUP'} = $SIG{'TRAP'} = $SIG{'ABRT'} = $SIG{'STOP'} = $SIG{'TERM'} = \&dequeueClustJobs; my ($debug, $quiet, $outfile, $proflip, $skip, $no_pid, $input_glob, $input_dir, $server, $db, $gen1, $gen2, $gen1sizefile, $gen2sizefile, $write_sizes1, $write_sizes2, $score_file, $cfg, $cfg_file, $sizes1, $sizes2, $dbh, $tmp_dir, $tmp_prefix, $nodelete, $clust_run_pid, $print_chains, $no_aligntotals, $no_clust_run, $num_jobs, $input_is_blat, $force_overwrite, $print_csv, $using_GP, $slagan_params, $tmp_existed, $print_stats, $lagan_dir, $glocal_out_logfile); my (@input_files); my (%offsets1, %offsets2, %aligns1, %aligns2, %flipped_aligns); my $supermapexec = $0; my $mycwd = getcwd(); $supermapexec =~ s/^\./$mycwd/ unless $supermapexec =~ /^\.\./; $supermapexec = $mycwd."/".$supermapexec if $supermapexec =~ /^\.\./; die("$0: Problem resolving my name, \'$supermapexec\' is not a file") unless -f $supermapexec or $ARGV[0] eq "worker"; $0 = rindex($0, "/") > -1 ? 
substr($0, rindex($0, "/")+1) : $0; $lagan_dir = $ENV{"LAGAN_DIR"} if defined $ENV{"LAGAN_DIR"}; $lagan_dir = $ENV{"LAGAN_DIR"} = $default_lagan_dir unless defined $ENV{"LAGAN_DIR"}; $lagan_dir =~ s/^\.\./$mycwd\/\.\./; $lagan_dir =~ s/^\./$mycwd\//; $ENV{"LAGAN_DIR"} = $lagan_dir; print STDERR "$0: Warning: LAGAN_DIR=$lagan_dir is not a valid directory\n" unless -d $lagan_dir; push @INC, $lagan_dir; my $SLAGAN = $lagan_dir."/".$glocal_name; my $error_file = "./$0.$$.error.log"; my $default_score_file = $lagan_dir."/test.score"; my $default_outfile = "$0.out"; my $worker_tmp_dir = "/tmp/$0.$$.worker/"; # The directory where workers store their intermediate files (two workers should not use the same directory) my $usage = " -infile=file \t Name of input file containing all hits for the two genomes -outfile=file \t Output filename (default: $default_outfile) -gen1=id \t First genome ID (must exist in the GPDB) -gen2=id \t Second genome ID (must exist in the GPDB) -sizes1=file \t File with sequence sizes for first genome -sizes2=file \t File with sequence sizes for second genome -bacteria \t Rearrange circular DNA to find a better alignment map -server=hostname GPDB server (default: lemur) -db=dbname \t GPDB name (default: GP) -config=file \t GPDB config file (default: ~/.gprc) -score=file \t Score file for SLAGAN (default: $default_score_file) -glocal_out=file \t Save intermediate GLOCAL alignment hits to this file -no_clust_run \t Run CPU/memory intensive jobs locally, not on the GP cluster -tmp_dir=dir \t Working directory (default: /tmp/$0.pid) -f \t\t Overwrite output file without prompting if it exists -v \t\t Verbose mode -q \t\t Quiet mode -k \t\t Keep all temporary files -expand_length=N Maximum length by which to expand alignments (default: $max_expand_len) -max_length=N \t Maximum length for any alignment chain in either strand \t\t (default: $max_chainlen) -min_seq_score=N Sequences with total align score below this threshold will be \t\t discarded 
(default: U penalty in SLAGAN score file) -max_job_size=N Threshold, in hits, for splitting workload into separate jobs \t\t for clust_run (default: $max_job_size) -c1, c2, c3, c4=N: Score factors for BLAT->CHAOS conversion \t\t (default: $c1, $c2, $c3, $c4) Options may be abbreviated. Input file format is BLAT or CHAOS. Sequence names should not contain spaces. Alignments with negative scores are discarded. Sequence size file format, one sequence per line: seq_name seq_size "; exit(main()); # ___ Subroutines _______________ sub main() { if ($ARGV[0] eq "worker") { workerRun($ARGV[1], $ARGV[2], $ARGV[3], $ARGV[4]); exit(0); } # Running SLAGAN in distributed mode init(); print("$0: Retrieving sequence info...\n") unless $quiet; $sizes1 = getSeqSizes($dbh, $gen1, $gen1sizefile); (writeSizes($sizes1, $write_sizes1), exit(0)) if defined $write_sizes1; $sizes2 = getSeqSizes($dbh, $gen2, $gen2sizefile); (writeSizes($sizes2, $write_sizes2), exit(0)) if defined $write_sizes2; die("$0: No sequence size data found. Stopped") if (keys(%$sizes1) < 1 or keys(%$sizes2) < 1); die("$0: Flip mode is only applicable for two single-sequence organisms. 
Stopped") if ($proflip and not (keys(%$sizes1) == 1 and keys(%$sizes2) == 1));

    # Sort and separate the alignments, run SLAGAN on them
    prepareHits();
    runSLAGAN();

    # Chain SLAGAN alignments into supermonotonic chain and save the intermediate results
    my ($dc, $sc1, $sc2) = processResults();

    # Load the results back and expand regions, then print them
    postProcessRegions();

    print "$0: Output written to $outfile\n" unless $quiet;
    print "$0: Intermediate files kept in $tmp_dir\n" if $nodelete and not $quiet;
    rmdir $tmp_dir unless $tmp_existed or $nodelete;
    return 0;
}

# Startup tasks: fix the locale, load the optional pipeline modules, parse
# and validate the command line, locate the input files, prepare the working
# directory and the output file, derive the run-time limits and, when the
# Genome Pipeline modules are present, connect to its database.
# Dies with a message on any invalid setting.
sub init() {
    # Things may misbehave if locale is set to UTF-8.  The variable has to
    # be set in our own environment to reach the sort/awk/tar children;
    # running "export" through system() only changed a shell that exited
    # right away.
    $ENV{"LC_ALL"} = "C";

    # Berkeley Genome Pipeline functionality is used if corresponding Perl modules are found in @INC
    foreach my $dir (@INC) {
        $using_GP = 1 if -f $dir."/GPDBI.pm" and -f $dir."/GPutils.pm";
    }
    useIf $using_GP, "GPDBI";
    useIf $using_GP, "GPutils";
    useIf 1, "Utils";
#   useIf 1, "Desoverlap";

    # -max_job_size and -overlap_factor carry numbers (they are range-checked
    # below and documented as "=N"), so they must be declared as taking a
    # value; as plain flags they could only ever be set to 1.  The second
    # "server=s" entry was a duplicate and is gone.
    # NOTE(review): "slagan_params" is still a plain flag -- confirm whether
    # it was meant to take a string.
    die("$0: GetOptions failed to retrieve options. Check the input options. Usage:".$usage) unless GetOptions(
        "server=s"        => \$server,
        "gen1=s"          => \$gen1,
        "gen2=s"          => \$gen2,
        "sizes1=s"        => \$gen1sizefile,
        "sizes2=s"        => \$gen2sizefile,
        "blatfile=s"      => \$input_glob,
        "infile=s"        => \$input_glob,
        "outfile=s"       => \$outfile,
        "glocal_out=s"    => \$glocal_out_logfile,
        "bacteria"        => \$proflip,
        "db=s"            => \$db,
        "config=s"        => \$cfg_file,
        "tmp_dir=s"       => \$tmp_dir,
        "skip"            => \$skip,
        "no_pid"          => \$no_pid,
        "no_clust_run"    => \$no_clust_run,
        "print_chains"    => \$print_chains,
        "print_stats"     => \$print_stats,
        "no_aligntotals"  => \$no_aligntotals,
        "print_csv"       => \$print_csv,
        "max_job_size=i"  => \$max_job_size,
        "max_length=i"    => \$max_chainlen,
        "expand_length=i" => \$max_expand_len,
        "min_seq_score=i" => \$min_seq_score,
        "max_asym=i"      => \$max_asym,
        "overlap_factor=f"=> \$overlap_factor,
        "score=s"         => \$score_file,
        "c1=i"            => \$c1,
        "c2=i"            => \$c2,
        "c3=i"            => \$c3,
        "c4=i"            => \$c4,
        "slagan_params"   => \$slagan_params,
        "write_sizes1=s"  => \$write_sizes1,
        "write_sizes2=s"  => \$write_sizes2,
        "keep"            => \$nodelete,
        "f"               => \$force_overwrite,
        "v"               => \$debug,
        "q"               => \$quiet
    );

    undef $quiet if $debug;

    my @uinfo = getpwuid($>);
    print("$0: Version ".$VERSION." started ".localtime()." by ".$uinfo[0]."\n") unless $quiet;

    $tmp_prefix = $0.($no_pid ? "" : ".".$$);

    # Fall back to local execution when clust_run is not on the PATH.
    unless ($no_clust_run) {
        $no_clust_run = `which clust_run 2> /dev/null`;
        $no_clust_run = not $no_clust_run;
        print("$0: clust_run not found - cluster operation disabled\n") if $no_clust_run and not $quiet;
    }

    if ($tmp_dir) {
        $tmp_existed = 1 if -d $tmp_dir;
        mkdir $tmp_dir unless -d $tmp_dir;
        # Was "unless /\/^Z/": that tested $_ against a pattern that cannot
        # match, so a slash was appended even when one was already there.
        $tmp_dir .= "/" unless $tmp_dir =~ /\/\z/;
    }
    else {
        $tmp_dir = "/tmp/".$tmp_prefix;
        mkdir $tmp_dir;
        $tmp_dir .= "/";
    }

    die("$0: No write permissions in working directory $tmp_dir. Stopped") unless -w $tmp_dir;
    die("$0: Genome IDs or size files not specified. Usage:".$usage) unless ($gen1 or $gen1sizefile) and ($gen2 or $gen2sizefile);
    die("$0: '-gen' options are invalid because GPDB is not available. Use '-sizes'. Stopped") if (($gen1 or $gen2) and not $using_GP);
    die("$0: Sequence size file $gen1sizefile not found. Stopped") unless -f $gen1sizefile or $gen1;
    die("$0: Sequence size file $gen2sizefile not found. Stopped") unless -f $gen2sizefile or $gen2;
    die("$0: Maximum job size too small, must exceed 10000 hits. Stopped") if $max_job_size < 10000;
    die("$0: Overlap factor must be between 0 and 1. Stopped") if $overlap_factor < 0 or $overlap_factor > 1;
    print("$0: SLAGAN score file not specified, using default $default_score_file\n") unless $score_file or $quiet;
    print("$0: Output file not specified, using default $default_outfile\n") unless $outfile or $quiet;

    # Check input file or glob
    if (defined $input_glob) {
        # -infile is treated as an anchored regular expression over the
        # entries of its directory.
        if ($input_glob =~ /\//) {
            ($input_dir, $input_glob) = ($input_glob =~ /\A(.*\/)([^\/]+)\Z/);
        }
        $input_glob .= "\$" unless $input_glob =~ /\$$/;
        $input_glob = "^".$input_glob unless $input_glob =~ /^\^/;
        @input_files = Utils::safe_glob($input_glob, $input_dir);
    }
    elsif (@ARGV > 0) {
        foreach my $file (@ARGV) {
            if ($file =~ /\//) {
                ($input_dir, $file) = ($file =~ /\A(.*\/)([^\/]+)\Z/);
            }
            push @input_files, $file;
        }
    }
    else {
        # No input named: spool standard input into the working directory.
        # TODO: split stdin for >2GB input
        open(FH, "> $tmp_dir$tmp_prefix.in") or die("$0: Cannot open $tmp_dir$tmp_prefix.in for writing: $!");
        print FH while <STDIN>;
        close FH;
        push @input_files, "$tmp_prefix.in";
        $input_dir = $tmp_dir;
    }

    unless ($input_dir =~ /\A\//) {
        $input_dir = $mycwd."/".$input_dir;
    }
    die("$0: No input files matching \"$input_dir$input_glob\" found. Stopped") unless @input_files > 0;
    print "$0: ".@input_files." input file(s)\n" if $debug;

    # Check output file
    $outfile = $default_outfile unless $outfile;
    if (-f $outfile and not $force_overwrite and -t STDERR) {
        print STDERR "$0: $outfile exists. Overwrite? (y/N, '-f' to force) ";
        my $overwrite = <STDIN>;
        chomp $overwrite;
        (print("Move \"$outfile\" or use option '-f'.\n"), exit(1)) unless ($overwrite eq "Y" or $overwrite eq "y" or $overwrite eq "yes");
    }
    open(FH, "> ".$outfile) or die("$0: Cannot open $outfile for writing: $!");
    close FH;

    # Check SLAGAN score file
    $score_file = $default_score_file unless $score_file;
    unless ($score_file =~ /\A\//) {
        $score_file = $mycwd."/".$score_file;
    }

    # Expansion has to win back the erosion applied to every loaded align,
    # and chains must leave room for expansion on both sides.
    $max_expand_len += $erode_align;
    die("$0: max_length cannot be less than 0. Stopped") if $max_chainlen < 0;
    $max_chainlen = 1000000000 if $max_chainlen == 0;
    $max_chainlen -= 2*$max_expand_len;

    # SLAGAN output for a given sequence will be discarded if the total score for the sequence is below this threshold. Default value is the SLAGAN unrelated gap penalty.
    $min_seq_score = getMinSeqScore($score_file) unless defined $min_seq_score;

    # Connect to GPDB
    if ($using_GP) {
        $GPutils::Error = "";
        $cfg = read_gp_config(Get_Abs_Path($cfg_file)) or die($GPutils::Error);
        $server ||= $cfg->Get_Val("DB", "server");
        $db ||= $cfg->Get_Val("DB", "main_db");
        $dbh = GPDBI->connect($server, 0, $db, undef, undef, "gp_cgi", undef, {PrintError => 0, RaiseError => 1});
    }
}

# Load sequence names and sizes either from GPDB or from file.
# With a dataset id the sizes come from get_all_seqs(); otherwise
# $gen_size_file is read, one "name size" pair per line.  Returns a
# reference to a hash mapping sequence name to size; dies when the file
# cannot be opened or a line lacks a name or a non-zero size.
sub getSeqSizes($$$) {
    my ($dbh, $dataset, $gen_size_file) = @_;

    if ($dataset) {
        return get_all_seqs($dbh, $dataset);
    }
    else {
        my %sizes;
        open(FH, "< ".$gen_size_file) or die("$0: Could not open file $gen_size_file for reading: ".$!);
        while (<FH>) {
            chomp;
            my ($seq, $size) = split;
            die("$0: Invalid format in file $gen_size_file") unless $seq and $size;
            $sizes{$seq} = $size;
        }
        close FH;
        return \%sizes;
    }
}

# Convert BLAT to CHAOS if necessary
# Flip hits on circular sequence if necessary
sub prepareHits() {
    my ($cur_align);
    local (*FH, *OUT1);

    print "$0: Preparing files...\n" unless $quiet;

    $input_is_blat = 1 if isBLAT($input_dir.$input_files[0]);
    if ($input_is_blat) {
        foreach my $file (@input_files) {
            system('awk \'{$13=($13+$15)?$13:1; 
print $1,$2,$3";",$5,$6,$7"; '. 'score = "' . $c1 . '*$8-' . $c2 . '*$9-' . $c3 . '*($12+$14)-' . $c4 . '*log($13+$15),"("$4")"}\''. "< $input_dir$file > $tmp_dir$file.chaos"); } } else { foreach my $file (@input_files) { system('ln -s "'.$input_dir.$file.'" "'.$tmp_dir.$file.'.chaos"'); } } if ($proflip) { open(FH, "< ".$tmp_dir.$input_files[0].".chaos") or die("$0: Could not open file ".$tmp_dir.$input_files[0].".chaos for reading: ".$!); open(OUT1, "> ".$tmp_dir.$input_files[0].".flipped.chaos") or die("$0: Could not open file ".$tmp_dir.$input_files[0].".flipped.chaos for writing: ".$!); my (@seq1s, @seq1e, @seq2s, @seq2e, @scores, @orientations, @seqn1, @seqn2); my ($seq1center, $seq2center, $seq1median, $seq2median); my $i = 0; while () { /\A[\s]*.*\s([\d]+)\s([\d]+)\;\s.*\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/; # ($seqn1[$i], $seq1s[$i], $seq1e[$i], $seqn2[$i], $seq2s[$i], $seq2e[$i], $scores[$i], $orientations[$i]) = ($1, $2, $3, $4, $5, $6, $7, $8); ($seq1s[$i], $seq1e[$i], $seq2s[$i], $seq2e[$i], $scores[$i], $orientations[$i]) = ($1, $2, $3, $4, $5, $6); if ($seq1s[$i] > $seq1e[$i]) { my $j = $seq1s[$i]; $seq1s[$i] = $seq1e[$i]; $seq1e[$i] = $j; } if ($seq2s[$i] > $seq2e[$i]) { my $j = $seq2s[$i]; $seq2s[$i] = $seq2e[$i]; $seq2e[$i] = $j; } $i++; } # For each interval pair, # if the seq1 interval median is greater than seq1 median, and the corresponding interval median in seq2 is less than seq2 median, # OR if the seq1 interval median is less than seq1 median, and the corresponding interval median in seq2 is greater than seq2 median, # set start of interval in seq1 to 2CoM1 - previous end of interval # set end of interval in seq1 to 2CoM1 - previous start of interval # flip the orientation (+/-) $seq1center = $$sizes1{(keys(%$sizes1))[0]} / 2; $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2; my $flip_counter = 0; foreach $i (0..@seq1s-1) { $seq1median = ($seq1s[$i] + $seq1e[$i]) / 2; $seq2median = ($seq2s[$i] + $seq2e[$i]) / 2; if 
(($seq1median > $seq1center and $seq2median < $seq2center) or ($seq1median < $seq1center and $seq2median > $seq2center)) { my $j = $seq2s[$i]; $seq2s[$i] = (2 * $seq2center) - $seq2e[$i]; $seq2e[$i] = (2 * $seq2center) - $j; if ($orientations[$i] eq "+") { $orientations[$i] = "-"; } else { $orientations[$i] = "+"; } $cur_align = []; $$cur_align[START1] = $seq1s[$i]; $$cur_align[START2] = $seq2s[$i]; $$cur_align[END1] = $seq1e[$i]; $$cur_align[END2] = $seq2e[$i]; $$cur_align[SCORE] = $scores[$i]; $$cur_align[ORIENT] = $orientations[$i]; $$cur_align[SEQ1] = (keys(%$sizes1))[0]; $$cur_align[SEQ2] = (keys(%$sizes2))[0]; $$cur_align[START1] += $erode_align; $$cur_align[END1] -= $erode_align; $$cur_align[START2] += $erode_align; $$cur_align[END2] -= $erode_align; $flipped_aligns{alignHashID($cur_align)} = $cur_align; $flip_counter++; } print OUT1 "seq1 ".$seq1s[$i]." ".$seq1e[$i]."; seq2 ".$seq2s[$i]." ".$seq2e[$i]."; score = ".$scores[$i]." (".$orientations[$i].")\n"; } close FH; close OUT1; print "$0: Single-sequence flip mode: ".($flip_counter+0)." 
hits flipped\n" if $debug; } } # Load all hits into a hash table, then write the hits for each sequence into a file # Run SLAGAN on each of these files, via worker instances either on the cluster or sequentially sub runSLAGAN() { my ($clust_run_invoke, $num_jobs, $sort_pid1, $sort_pid2, $sort_pid3, $one_seq_mode, $cur_align, $next_align, $curlen1, $curlen2, $nextlen1, $nextlen2, $overlap1, $overlap2, $dump_count); local (*RH1, *WH1, *RH2, *WH2, *RH3, *WH3, *IN, *DUPES); # my $filter = Desoverlap->new($overlap_factor, $debug); print "$0: Sorting input hits...\n" if $debug; open(DUPES, "> supermap.duplicates") if $debug; $one_seq_mode = 1 if (keys(%$sizes1) == 1 and keys(%$sizes2) == 1); $sort_pid1 = open2(\*RH1, \*WH1, "sort -k 1,1 -k 2,2n"); # pre-scan $sort_pid2 = open2(\*RH2, \*WH2, "sort -k 1,1 -k 2,2n"); # gen1base $sort_pid3 = open2(\*RH3, \*WH3, "sort -k 4,4 -k 5,5n"); # gen2base # Sort input on seq1 foreach my $file (@input_files) { open(IN, "< $tmp_dir$file".($proflip?".flipped":"").".chaos"); print WH1 while ; close IN; } close WH1; # Scan input, check if start2, end2 are ascending for sorting, erode alignments while () { /\A[\s]*(.*)\s([\d]+)\s([\d]+)\;\s(.*)\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/o; $next_align=[]; ($$next_align[SEQ1], $$next_align[START1], $$next_align[END1], $$next_align[SEQ2], $$next_align[START2], $$next_align[END2], $$next_align[SCORE], $$next_align[ORIENT]) = ($1, $2, $3, $4, $5, $6, $7, $8); next if $$next_align[SCORE] <= 0; if ($one_seq_mode) { $$next_align[SEQ1] = (keys(%$sizes1))[0]; $$next_align[SEQ2] = (keys(%$sizes2))[0]; } checkAlignCoords($next_align); unless ($$next_align[END1]-$$next_align[START1] <= $erode_align*2 or $$next_align[END2]-$$next_align[START2] <= $erode_align*2) { $$next_align[START1] += $erode_align; $$next_align[END1] -= $erode_align; $$next_align[START2] += $erode_align; $$next_align[END2] -= $erode_align; } =head1 # Overlap scan if ($$next_align[START1] <= $$cur_align[END1] and 
$$next_align[END1] >= $$cur_align[START1] # overlap in seq1 and $$next_align[START2] <= $$cur_align[END2] and $$next_align[END2] >= $$cur_align[START2] # overlap in seq2 and $$cur_align[SEQ1] eq $$next_align[SEQ1] and $$cur_align[SEQ2] eq $$next_align[SEQ2] and $$cur_align[ORIENT] eq $$next_align[ORIENT]) { ($curlen1, $curlen2, $nextlen1, $nextlen2) = ($$cur_align[END1] - $$cur_align[START1] + 1, $$cur_align[END2] - $$cur_align[START2] + 1, $$next_align[END1] - $$next_align[START1] + 1, $$next_align[END2] - $$next_align[START2] + 1); if ($$next_align[START1] <= $$cur_align[START1] and $$next_align[END1] >= $$cur_align[END1]) { $overlap1 = $$cur_align[END1] - $$cur_align[START1] + 1; # next covers cur } elsif ($$next_align[START1] <= $$cur_align[START1]) { $overlap1 = $$next_align[END1] - $$cur_align[START1] + 1; # next is to the left } elsif ($$next_align[END1] >= $$cur_align[END1]) { $overlap1 = $$cur_align[END1] - $$next_align[START1] + 1; # next is to the right } else { $overlap1 = $$next_align[END1] - $$next_align[START1] + 1; # cur covers next } if ($$next_align[START2] <= $$cur_align[START2] and $$next_align[END2] >= $$cur_align[END2]) { $overlap2 = $$cur_align[END2] - $$cur_align[START2] + 1; } elsif ($$next_align[START2] <= $$cur_align[START2]) { $overlap2 = $$next_align[END2] - $$cur_align[START2] + 1; } elsif ($$next_align[END2] >= $$cur_align[END2]) { $overlap2 = $$cur_align[END2] - $$next_align[START2] + 1; } else { $overlap2 = $$next_align[END2] - $$next_align[START2] + 1; } die("$0: Bad internal state") if $overlap1 < 0 or $overlap2 < 0; if (($overlap1 / $curlen1 > $overlap_factor) and ($overlap2 / $curlen2 > $overlap_factor) and $$cur_align[SCORE] <= $$next_align[SCORE]) { $dump_count++; print DUPES "Cur: (".$$cur_align[START1]."-".$$cur_align[END1].")(".$$cur_align[START2]."-".$$cur_align[END2].") ".$$cur_align[SCORE]." 
over with (".$$next_align[START1]."-".$$next_align[END1].")(".$$next_align[START2]."-".$$next_align[END2].") ".$$next_align[SCORE]."\n" if $debug; $cur_align = $next_align; next; # discard current align } elsif (($overlap1 / $nextlen1 > $overlap_factor) and ($overlap2 / $nextlen2 > $overlap_factor) and $$cur_align[SCORE] >= $$next_align[SCORE]) { $dump_count++; print DUPES "Nxt: (".$$next_align[START1]."-".$$next_align[END1].")(".$$next_align[START2]."-".$$next_align[END2].") ".$$next_align[SCORE]." over with (".$$cur_align[START1]."-".$$cur_align[END1].")(".$$cur_align[START2]."-".$$cur_align[END2].") ".$$cur_align[SCORE]."\n" if $debug; next; # discard next align } } =cut foreach my $cur_align ($next_align){ # (@{$filter->put($next_align)}) { print WH2 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n"; print WH3 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n"; } # print WH2 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n" if @$cur_align; # print WH3 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n" if @$cur_align; # $cur_align = $next_align; } # $filter->printAll(); # Flush alignments remaining in filter buffer # foreach my $cur_align (@{$filter->getBuffer()}) { # print WH2 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." 
(".$$cur_align[ORIENT].")\n" if $cur_align != 0; # print WH3 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n" if $cur_align != 0; # } close RH1; waitpid $sort_pid1, 0; close WH2; $num_jobs = reprintInputHits(1, 1, \*RH2); close RH2; waitpid $sort_pid2, 0; close WH3; $num_jobs = reprintInputHits(2, $num_jobs, \*RH3); close RH3; waitpid $sort_pid3, 0; close DUPES if defined fileno DUPES; # print STDERR "$0: Warning: ".$filter->{dump_count}." near duplicate alignments discarded (overlap factor $overlap_factor)\n" if $filter->{dump_count} and not $quiet; open(FH, "> ".$tmp_dir."CLUSTER_JOB_PARAMS") or die; foreach my $i (1..$num_jobs-1) { print FH "worker JOB".$i.".tar ".$score_file." ".$SLAGAN.($debug ? " -v" : ""); print FH " << JOB$i.tar > CLUSTER_JOB_MESSAGES.$i >> CLUSTER_JOB_ERRMSG.$i" unless $no_clust_run; print FH "\n"; } close FH; if ($no_clust_run) { open(FH, "< ".$tmp_dir."CLUSTER_JOB_PARAMS") or die; print "$0: Running ".($num_jobs-1)." SLAGAN jobs locally...\n" unless $quiet; while () { chomp; print("Job $.: \"$0 $_\"\n") if $debug; system("cd $tmp_dir; $supermapexec ".$_); } close FH; } else { $clust_run_invoke = "clust_run -program=".$supermapexec." -parameters=".$tmp_dir."CLUSTER_JOB_PARAMS -init_dir=$tmp_dir -wait"; print "$0: Running ".($num_jobs-1)." 
distributed SLAGAN jobs with clust_run...\n" unless $quiet; print "$0: \"$clust_run_invoke\"\n" if $debug; if ($clust_run_pid = fork()) { # I am the parent waitpid($clust_run_pid, 0); } elsif (not defined $clust_run_pid) { die("$0: Could not fork"); } else { # I am the child die("$0: Could not exec \"$clust_run_invoke\"") unless exec($clust_run_invoke); } undef $clust_run_pid; } foreach my $i (1..$num_jobs-1) { system("cd $tmp_dir; tar -xf ".$tmp_dir."JOB".$i.".results.tar"); unlink $tmp_dir."JOB".$i.".tar" unless $nodelete; unlink $tmp_dir."JOB".$i.".results.tar" unless $nodelete; unlink $tmp_dir."CLUSTER_JOB_MESSAGES.$i" unless $nodelete; unlink $tmp_dir."CLUSTER_JOB_ERRMSG.$i" unless $nodelete; } unlink "$tmp_dir$input_glob.chaos" unless $nodelete; unlink $tmp_dir."CLUSTER_JOB_PARAMS" unless $nodelete; foreach my $file (@input_files) { unlink $tmp_dir.$file.".chaos" unless $nodelete; } } sub reprintInputHit($$$) { my ($base_gen, $align, $FH) = @_; if ($base_gen == 1 and $$align[ORIENT] eq "+") { print $FH $$align[SEQ1]." ".$$align[START1]." ".$$align[END1]."; ".$$align[SEQ2]." ".$$align[START2]." ".$$align[END2]."; "."score = ".$$align[SCORE]." (".$$align[ORIENT].")\n"; } elsif ($base_gen == 1 and $$align[ORIENT] eq "-") { print $FH $$align[SEQ1]." ".$$align[START1]." ".$$align[END1]."; ".$$align[SEQ2]." ".$$align[END2]." ".$$align[START2]."; "."score = ".$$align[SCORE]." (".$$align[ORIENT].")\n"; } elsif ($base_gen == 2 and $$align[ORIENT] eq "+") { print $FH $$align[SEQ2]." ".$$align[START2]." ".$$align[END2]."; ".$$align[SEQ1]." ".$$align[START1]." ".$$align[END1]."; "."score = ".$$align[SCORE]." (".$$align[ORIENT].")\n"; } elsif ($base_gen == 2 and $$align[ORIENT] eq "-") { print $FH $$align[SEQ2]." ".$$align[START2]." ".$$align[END2]."; ".$$align[SEQ1]." ".$$align[END1]." ".$$align[START1]."; "."score = ".$$align[SCORE]." (".$$align[ORIENT].")\n"; } else { die("$0: Bad internal state from hit ".$$align[SEQ1]." ".$$align[START1]." 
".$$align[END1]."; ".$$align[SEQ2]." ".$$align[START2]." ".$$align[END2]."; "."score = ".$$align[SCORE]." (".$$align[ORIENT].")"); } } sub writeJobFile($$) { my ($job_id, $seq_list) = @_; local *LIST; open(LIST, "| cd $tmp_dir; xargs tar --append --file=".$tmp_dir."JOB".$job_id.".tar"); foreach my $file (sort alnum keys(%$seq_list)) { $file =~ /\/([^\/]+)$/; print LIST $1." "; } close LIST; foreach my $file (sort alnum keys(%$seq_list)) { unlink $file unless $nodelete; } } # Separate input into files based on sequence name and reverse order in gen2base hits sub reprintInputHits($$$) { my ($base_gen, $job_id, $RH) = @_; my ($one_seq_mode, $line_count, $prev_seq, $cur_seq, $cur_align); my (%cur_seq_list, %pruned_sizes); local (*OUT, *LIST); $one_seq_mode = 1 if (keys(%$sizes1) == 1 and keys(%$sizes2) == 1); print "$0: Reprinting hits (base genome $base_gen)..." if $debug; $line_count = 0; while (<$RH>) { /\A[\s]*(.*)\s([\d]+)\s([\d]+)\;\s(.*)\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/o; $cur_align=[]; ($$cur_align[SEQ1], $$cur_align[START1], $$cur_align[END1], $$cur_align[SEQ2], $$cur_align[START2], $$cur_align[END2], $$cur_align[SCORE], $$cur_align[ORIENT]) = ($1, $2, $3, $4, $5, $6, $7, $8); $cur_seq = ($base_gen == 1 ? $$cur_align[SEQ1] : $$cur_align[SEQ2]); if ($cur_seq ne $prev_seq) { $pruned_sizes{$cur_seq} = ($base_gen == 1 ? 
$$sizes1{$cur_seq} : $$sizes2{$cur_seq});
            print " ".$cur_seq if $debug;
            close OUT if defined fileno OUT;
            open(OUT, "> ".$tmp_dir.$input_files[0].".gen".$base_gen."base.".$cur_seq.".chaos") or
                die("$0: Could not open file ".$tmp_dir.$input_files[0].".gen".$base_gen."base.".$cur_seq.".chaos for writing: ".$!);
            # Close the current job and start the next one once it holds more than $max_job_size hits
            if ($line_count > $max_job_size) {
                writeJobFile($job_id, \%cur_seq_list);
                undef %cur_seq_list;
                $line_count = 0;
                $job_id++;
            }
            $cur_seq_list{$tmp_dir.$input_files[0].".gen".$base_gen."base.".$cur_seq.".chaos"} = 1;
        }
        reprintInputHit($base_gen, $cur_align, \*OUT) if @$cur_align;
        $prev_seq = $cur_seq;
#       $cur_align = $next_align;
        $line_count++;
    }
#   reprintInputHit($base_gen, $next_align, \*OUT) if @$next_align;
    writeJobFile($job_id, \%cur_seq_list);
    $job_id++;
    close OUT;
    print "\n" if $debug;
    $sizes1 = \%pruned_sizes if $base_gen == 1;
    $sizes2 = \%pruned_sizes if $base_gen == 2;
    return $job_id;
}

# Input: one line of SLAGAN output, "(s e)=(s e) score orient [total] s1:name s2:name"
# Output: true if the bracketed total score is below $min_seq_score.
# Dies if the line does not have the expected format.
sub seqBelowMinScore($) {
    my ($line) = @_;
    # Capture into lexicals: $1/$2 set here are not visible to the caller after return
    my (undef, $total_score) =
        $line =~ /\A[\s]*\([\d]+\s[\d]+\)\=\([\d]+\s[\d]+\)\s([\d\.\-]+)\s[\+\-]+\s\[([\d\.\-]+)\][\s]*s1\:.*[\s]*s2\:.*\n\Z/
        or die("$0: Unable to extract score values from SLAGAN output:\n$line");
    return ($total_score < $min_seq_score);
}

# Collect the per-sequence SLAGAN output files, sort them through external sort(1)
# processes and feed them to the chain scanners:
#   1. gen2base aligns are regrouped into one file per gen1 sequence (read by chainBase1Hits())
#   2. gen1base aligns go to chainBase1Hits(), which also emits the hashes of 2M aligns
#   3. those hashes are split into one file per gen2 sequence
#   4. gen2base aligns go to chainBase2Hits()
# Sequences whose output file is missing are removed from %$sizes1 / %$sizes2; files whose
# first line scores below $min_seq_score are skipped.
sub processResults() {
    my ($cur_seq, $input_prefix, $dropped_seqs, $sort_pid, $sort_pid2);
    local (*RH, *WH, *IN, *OUT, *hashesDM_RH, *hashesDM_WH);
    # Extracts the bracketed total score of a SLAGAN line for the debug messages below.
    # (The captures made inside seqBelowMinScore() are out of scope here.)
    my $bracket_score = sub { $_[0] =~ /\s\[([\d\.\-]+)\]/ ? $1 : "?" };

    print "$0: Loading SLAGAN output...\n" unless $quiet;
    if ($glocal_out_logfile) {
        open(GLOCAL_OUT_LOG, "> ".$glocal_out_logfile)
            or warn("$0: Could not open file $glocal_out_logfile for writing: ".$!);
    }

    # Sort gen2base aligns on seq1, then seq2, then start2, then print them to separate files, one file per gen1 seq
    # These files will be loaded on demand when scanning gen1base aligns (chainBase1Hits())
    $sort_pid = open2(\*RH, \*WH, "sort -k 9,9 -k 7,7 -k 1.2,1n"); # input is base 2, key is 9 because a space is expected between s2: and seq2name
    $input_prefix = $tmp_dir.$input_files[0].".gen2base";
    foreach my $seq (sort alnum keys(%$sizes2)) {
        open(IN, "< $input_prefix.$seq.chaos.glocal-out") or (delete($$sizes2{$seq}), next);
        my $line = <IN>;
        die("$0: Empty SLAGAN output file $input_prefix.$seq.chaos.glocal-out, check corresponding job logs. Stopped") unless $line;
        if (seqBelowMinScore($line)) {
            print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low (".$bracket_score->($line)."<$min_seq_score)\n" if $debug;
            next;
        }
        seek IN, 0, 0; # back to start
        print WH while <IN>;
        close IN;
    }
    close WH or die("$0: Error executing sort");
    while (<RH>) {
        # Input is base 2, so the s2: field carries the gen1 sequence name
        my ($gen1_seq) = /\ss2\:[\s]*([^\s]+)[\s]*\n\Z/;
        next unless defined $gen1_seq;
        if (not defined $cur_seq or $gen1_seq ne $cur_seq) {
            close OUT if defined fileno OUT;
            $cur_seq = $gen1_seq;
            open(OUT, "> $input_prefix.sorted-gen1.$cur_seq.chaos.glocal-out") or
                die("$0: Could not open file $input_prefix.sorted-gen1.$cur_seq.chaos.glocal-out for writing: ".$!);
        }
        print OUT $_;
    }
    close RH;
    close OUT if defined fileno OUT;
    waitpid $sort_pid, 0;

    # Sort gen1base aligns on seq1, then start1
    $sort_pid = open2(\*RH, \*WH, "sort -k 7,7 -k 1.2,1n"); # input is base 1
    $input_prefix = $tmp_dir.$input_files[0].".gen1base";
    foreach my $seq (sort alnum keys(%$sizes1)) {
        open(IN, "< $input_prefix.$seq.chaos.glocal-out") or (delete($$sizes1{$seq}), next);
        my $line = <IN>;
        if (seqBelowMinScore($line)) {
            $dropped_seqs++;
            print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low (".$bracket_score->($line)."<$min_seq_score)\n" if $debug;
            next;
        }
        seek IN, 0, 0; # back to start
        print WH while <IN>;
        # Mirror the accepted gen1base output into the log, if one was requested and could be opened
        if ($glocal_out_logfile and defined fileno GLOCAL_OUT_LOG) {
            seek IN, 0, 0;
            print GLOCAL_OUT_LOG while <IN>;
        }
        close IN;
        unlink "$input_prefix.$seq.chaos.glocal-out" unless $nodelete;
    }
    unlink $input_prefix.".chaos" unless $nodelete;
    close WH or die("$0: Error executing sort");

    # Feed the gen1base aligns to the 2M/1M1 chain scanner (chainBase1Hits())
    # The hashesDM handle is used to write 2M aligns' hashes to be sorted in seq2 order
    print "$0: Generating supermonotonic map...\n" unless $quiet;
    $sort_pid2 = open2(\*hashesDM_RH, \*hashesDM_WH, "sort -k 2,2");
    chainBase1Hits(*RH, *hashesDM_WH);
    close RH;
    waitpid $sort_pid, 0;
    close hashesDM_WH or die("$0: Error executing sort");

    # Print sorted 2M aligns' hashes, one file per gen2 seq
    undef $cur_seq;
    while (<hashesDM_RH>) {
        my $line = $_;
        my ($gen2_seq) = $line =~ /\A[^\s]+\s([^\s]+)\s[^\s]+\n\Z/;
        next unless defined $gen2_seq;
        if (not defined $cur_seq or $gen2_seq ne $cur_seq) {
            close OUT if defined fileno OUT;
            $cur_seq = $gen2_seq;
            open(OUT, "> $tmp_dir".$input_files[0].".hashesDM.gen2.$cur_seq") or
                die("$0: Could not open file $tmp_dir".$input_files[0].".hashesDM.gen2.$cur_seq for writing: ".$!);
        }
        print OUT $line;
    }
    close hashesDM_RH;
    waitpid $sort_pid2, 0;

    # Sort gen2base aligns on seq2, then start2
    $sort_pid = open2(\*RH, \*WH, "sort -k 7,7 -k 1.2,1n"); # input is base 2
    $input_prefix = $tmp_dir.$input_files[0].".gen2base";
    foreach my $seq (sort alnum keys(%$sizes2)) {
        open(IN, "< $input_prefix.$seq.chaos.glocal-out") or next;
        my $line = <IN>;
        if (seqBelowMinScore($line)) {
            $dropped_seqs++;
            print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low (".$bracket_score->($line)."<$min_seq_score)\n" if $debug;
            next;
        }
        seek IN, 0, 0; # back to start
        print WH while <IN>;
        close IN;
        unlink "$input_prefix.$seq.chaos.glocal-out" unless $nodelete;
    }
    unlink $input_prefix.".chaos" unless $nodelete;
    close WH or die("$0: Error executing sort");

    # Feed the gen2base aligns to the 1M2 chain scanner (chainBase2Hits())
    chainBase2Hits(*RH);
    close RH;
    waitpid $sort_pid, 0;

    close GLOCAL_OUT_LOG if defined fileno GLOCAL_OUT_LOG;
    removeSLAGANOutput();
    print STDERR "$0: Warning: Alignments for $dropped_seqs sequences discarded due to total score below cutoff ($min_seq_score)\n" if $dropped_seqs and not $quiet;
}

# Delete the per-sequence SLAGAN output files and the .chaos inputs for both base genomes
# (unless $nodelete is set), then try to remove the temporary directory.
sub removeSLAGANOutput() {
    my $input_prefix = $tmp_dir.$input_files[0].".gen1base";
    foreach my $seq (sort alnum keys(%$sizes1)) {
        unlink "$input_prefix.$seq.chaos.glocal-out" unless $nodelete;
    }
    unlink $input_prefix.".chaos" unless $nodelete;

    $input_prefix = $tmp_dir.$input_files[0].".gen2base";
    foreach my $seq (sort alnum keys(%$sizes2)) {
        unlink "$input_prefix.$seq.chaos.glocal-out" unless $nodelete;
    }
    unlink $input_prefix.".chaos" unless $nodelete;

    # rmdir only succeeds on an empty directory, so leftovers kept by $nodelete survive
    rmdir $tmp_dir;
}

sub alignHashID($) {
    my ($align) = @_;
# return 23*$$align[START1] + 41*$$align[START2] + 61*$$align[END1] + 83*$$align[END2]; return $$align[SEQ1].":".$$align[START1]."-".$$align[END1]."=".$$align[SEQ2].":".$$align[START2]."-".$$align[END2]; } # The chain writer lags the chainer by two chains because the full contents of neighboring chains must be known. sub printChainToTemp($$$$) { my ($FH, $prev_chain, $cur_chain, $next_chain) = @_; return unless defined $cur_chain; my $type = ${$$cur_chain[0]}[ORIGIN]; my ($first_align, $last_align) = ($$cur_chain[0], $$cur_chain[@$cur_chain-1]); print $FH ${$$cur_chain[0]}[ORIGIN]." ".@$cur_chain." ". $$first_align[START1]." ".$$first_align[END1]." ".$$first_align[START2]." ".$$first_align[END2]." ". $$first_align[SEQ1]." ".$$first_align[SEQ2]." ".$$first_align[ORIENT]." ".$$first_align[SCORE]." ". $$last_align[START1]." ".$$last_align[END1]." ".$$last_align[START2]." ".$$last_align[END2]." ". $$last_align[SEQ1]." ".$$last_align[SEQ2]." ".$$last_align[ORIENT]." ".$$last_align[SCORE]; if ($print_chains) { foreach my $align (@$cur_chain) { print $FH " ".$$align[START1]." ".$$align[END1]." ".$$align[START2]." 
".$$align[END2]; } } print $FH "\n"; } sub chainBase1Hits($$) { my ($FH, $hashesDM) = @_; local *OUT; my ($cur_align, $prev_align, $cur_chain, $prev_chain, $pre_prev_chain, $chain_start_2M, $chain_start_1M1, $cur_seq, $align_peers, $flip_counter); my @bad_aligns; my %base2peers; while (<$FH>) { /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s([\d\.\-]+)\s([\+\-]+)\s\[([\d\.\-]+)\][\s]*s1\:(.*)[\s]*s2\:(.*)\n\Z/; next if ($1==$2); # skip null alignments (push(@bad_aligns, $_), next) unless $1 and $2 and $3 and $4 and $5 and $6; $cur_align = []; ($$cur_align[START1], $$cur_align[END1], $$cur_align[START2], $$cur_align[END2], $$cur_align[SCORE], $$cur_align[ORIENT], $$cur_align[TOTSC], $$cur_align[SEQ1], $$cur_align[SEQ2]) = ($1, $2, $3, $4, $5, $6, $7, $8, $9); $$cur_align[SEQ1] =~ s/^\s+//; $$cur_align[SEQ1] =~ s/\s+$//; $$cur_align[SEQ2] =~ s/^\s+//; $$cur_align[SEQ2] =~ s/\s+$//; #warn("Seen: ".$_) if $$cur_align[SEQ1] eq "AC002301.1"; checkAlignCoords($cur_align); if ($proflip and defined $flipped_aligns{alignHashID($cur_align)}) { my $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2; my $j = $$cur_align[START2]; $$cur_align[START2] = (2 * $seq2center) - $$cur_align[END2]; $$cur_align[END2] = (2 * $seq2center) - $j; if ($$cur_align[ORIENT] eq "+") { $$cur_align[ORIENT] = "-"; } else { $$cur_align[ORIENT] = "+"; } $$cur_align[FLIPPED]=1; $flip_counter++; } $$cur_align[HASHID] = alignHashID($cur_align); if ($$cur_align[SEQ1] ne $cur_seq) { #warn("Handling seq trans") if $prev_align and $$prev_align[SEQ1] eq "AC002301.1"; printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);# unless defined $cur_seq; printChainToTemp(*OUT, $prev_chain, $cur_chain, undef);# unless defined $cur_seq; undef $chain_start_2M; undef $chain_start_1M1; undef $prev_align; undef $pre_prev_chain; undef $prev_chain; undef $cur_chain; $cur_seq = $$cur_align[SEQ1]; %base2peers = %{loadBase2Hashes($tmp_dir.$input_files[0].".gen2base.sorted-gen1.$cur_seq.chaos.glocal-out")}; close OUT 
if defined fileno OUT; open(OUT, "> ".$tmp_dir.$input_files[0].".2MM1.$cur_seq"); } $align_peers = $base2peers{$$cur_align[HASHID]}; $$cur_align[ORIGIN] = defined($align_peers) ? 2 : 1; if ($chain_start_2M and defined $align_peers and defined $prev_align # continue open 2M chain and (($$cur_align[ORIENT] eq "+" and $$cur_align[START2] > $$prev_align[END2] and $$prev_align[HASHID] eq $$align_peers[0]) or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] < $$prev_align[START2] and $$prev_align[HASHID] eq $$align_peers[1]) or ($$cur_align[FLIPPED] and ($$cur_align[ORIENT] eq "+" and $$cur_align[START2] < $$prev_align[END2] and $$prev_align[HASHID] eq $$align_peers[0]) or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] > $$prev_align[START2] and $$prev_align[HASHID] eq $$align_peers[1]))) and $$cur_align[ORIENT] eq $$prev_align[ORIENT] and $$cur_align[FLIPPED] eq $$prev_align[FLIPPED] and $$cur_align[SEQ2] eq $$prev_align[SEQ2] and ($$cur_align[START1] > $$prev_align[END1] or ($$cur_align[FLIPPED] and $$cur_align[START1] > $$prev_align[END1])) and abs($$cur_align[END1] - $$chain_start_2M[START1]) < $max_chainlen and abs($$cur_align[END2] - $$chain_start_2M[START2]) < $max_chainlen #and abs($$cur_align[END1] - $$chain_start_2M[START1])/abs($$cur_align[END2] - $$chain_start_2M[START2]) < $max_asym #and abs($$cur_align[END2] - $$chain_start_2M[START2])/abs($$cur_align[END1] - $$chain_start_2M[START1]) < $max_asym ) { push(@$cur_chain, $cur_align); print $hashesDM $$cur_align[SEQ1]."\t".$$cur_align[SEQ2]."\t".$$cur_align[HASHID]."\n"; } elsif (defined $align_peers) { # start new 2M chain printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain); $chain_start_2M = $cur_align; undef $chain_start_1M1; $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain; $cur_chain = [$cur_align]; print $hashesDM $$cur_align[SEQ1]."\t".$$cur_align[SEQ2]."\t".$$cur_align[HASHID]."\n"; } elsif ($chain_start_1M1 and defined $prev_align # continue open 1M1 chain and 
((($$cur_align[ORIENT] eq "+" and $$cur_align[START2] > $$prev_align[END2]) or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] < $$prev_align[START2])) or ($$cur_align[FLIPPED] and (($$cur_align[ORIENT] eq "+" and $$cur_align[START2] < $$prev_align[END2]) or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] > $$prev_align[START2])))) and $$cur_align[ORIENT] eq $$prev_align[ORIENT] and $$cur_align[FLIPPED] eq $$prev_align[FLIPPED] and $$cur_align[SEQ2] eq $$prev_align[SEQ2] and ($$cur_align[START1] > $$prev_align[END1] or ($$cur_align[FLIPPED] and $$cur_align[START1] > $$prev_align[END1])) and abs($$cur_align[END1] - $$chain_start_1M1[START1]) < $max_chainlen and abs($$cur_align[END2] - $$chain_start_1M1[START2]) < $max_chainlen #and abs($$cur_align[END1] - $$chain_start_1M1[START1])/abs($$cur_align[END2] - $$chain_start_1M1[START2]) < $max_asym #and abs($$cur_align[END2] - $$chain_start_1M1[START2])/abs($$cur_align[END1] - $$chain_start_1M1[START1]) < $max_asym ) { push(@$cur_chain, $cur_align); } else { # start new 1M1 chain printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain); $chain_start_1M1 = $cur_align; undef $chain_start_2M; $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain; $cur_chain = [$cur_align]; } $prev_align = $cur_align; } printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain); printChainToTemp(*OUT, $prev_chain, $cur_chain, undef); print "$0: Single-sequence flip mode: ".($flip_counter+0)." gen1base hits backflipped\n" if $debug and $proflip; warn "$0: Warning: ".@bad_aligns." bad SLAGAN alignments discarded" if @bad_aligns > 0; } # Input is base 2, i.e. (start2 end2)=(start1 end1)... 
sub chainBase2Hits($) { my ($FH) = @_; local *OUT; my ($cur_align, $prev_align, $cur_chain, $prev_chain, $pre_prev_chain, $chain_start_2M, $chain_start_1M2, $cur_seq, $align_is_2M, $flip_counter); my @bad_aligns; my %aligns2M; while(<$FH>) { /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s([\d\.\-]+)\s([\+\-]+)\s\[([\d\.\-]+)\][\s]*s1\:(.*)[\s]*s2\:(.*)\n\Z/; next if ($1==$2); # skip null alignments (push(@bad_aligns, $_), next) unless $1 and $2 and $3 and $4 and $5 and $6; $cur_align = []; ($$cur_align[START2], $$cur_align[END2], $$cur_align[START1], $$cur_align[END1], $$cur_align[SCORE], $$cur_align[ORIENT], $$cur_align[TOTSC], $$cur_align[SEQ2], $$cur_align[SEQ1]) = ($1, $2, $3, $4, $5, $6, $7, $8, $9); $$cur_align[SEQ1] =~ s/^\s+//; $$cur_align[SEQ1] =~ s/\s+$//; $$cur_align[SEQ2] =~ s/^\s+//; $$cur_align[SEQ2] =~ s/\s+$//; checkAlignCoords($cur_align); if ($proflip and defined $flipped_aligns{alignHashID($cur_align)}) { my $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2; my $j = $$cur_align[START2]; $$cur_align[START2] = (2 * $seq2center) - $$cur_align[END2]; $$cur_align[END2] = (2 * $seq2center) - $j; if ($$cur_align[ORIENT] eq "+") { $$cur_align[ORIENT] = "-"; } else { $$cur_align[ORIENT] = "+"; } $$cur_align[FLIPPED] = 1; $flip_counter++; } $$cur_align[HASHID] = alignHashID($cur_align); if ($$cur_align[SEQ2] ne $cur_seq) { printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3;# and not defined $cur_seq; printChainToTemp(*OUT, $prev_chain, $cur_chain, undef) if $$cur_chain[0][ORIGIN] == 3;# and not defined $cur_seq; undef $chain_start_1M2; undef $prev_align; undef $pre_prev_chain; undef $prev_chain; undef $cur_chain; $cur_seq = $$cur_align[SEQ2]; %aligns2M = %{load2MHashes($tmp_dir.$input_files[0].".hashesDM.gen2.$cur_seq")}; close OUT if defined fileno OUT; open(OUT, "> ".$tmp_dir.$input_files[0].".M2.$cur_seq"); } $$cur_align[ORIGIN] = defined($aligns2M{$$cur_align[HASHID]}) ? 
2 : 3; if (defined $aligns2M{$$cur_align[HASHID]}) { # align is 2M my $prev_ch_last_al = $prev_chain ? $$prev_chain[scalar(@$prev_chain)-1] : []; printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3; undef $chain_start_1M2; # close 1M2 chain $chain_start_2M = $cur_align; $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain; $cur_chain = [$cur_align]; } elsif ($chain_start_1M2 # continue open 1M2 chain and ((($$cur_align[ORIENT] eq "+" and $$cur_align[START1] > $$prev_align[END1]) or ($$cur_align[ORIENT] eq "-" and $$cur_align[END1] < $$prev_align[START1])) or ($$cur_align[FLIPPED] and (($$cur_align[ORIENT] eq "+" and $$cur_align[START1] < $$prev_align[END1]) or ($$cur_align[ORIENT] eq "-" and $$cur_align[END1] > $$prev_align[START1])))) and $$cur_align[ORIENT] eq $$prev_align[ORIENT] and $$cur_align[SEQ1] eq $$prev_align[SEQ1] and $$cur_align[FLIPPED] == $$prev_align[FLIPPED] and ($$cur_align[START2] > $$prev_align[END2] or ($$cur_align[FLIPPED] and $$cur_align[START2] < $$prev_align[END2])) and abs($$cur_align[END1] - $$chain_start_1M2[START1]) < $max_chainlen and abs($$cur_align[END2] - $$chain_start_1M2[START2]) < $max_chainlen #and abs($$cur_align[END1] - $$chain_start_1M2[START1])/abs($$cur_align[END2] - $$chain_start_1M2[START2]) < $max_asym #and abs($$cur_align[END2] - $$chain_start_1M2[START2])/abs($$cur_align[END1] - $$chain_start_1M2[START1]) < $max_asym ) { push(@$cur_chain, $cur_align); } else { # start new 1M2 chain my $prev_ch_last_al = $prev_chain ? $$prev_chain[scalar(@$prev_chain)-1] : []; printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3; $chain_start_1M2 = $cur_align; $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain; $cur_chain = [$cur_align]; } $prev_align = $cur_align; } my $prev_ch_last_al = $prev_chain ? 
$$prev_chain[scalar(@$prev_chain)-1] : [];
    printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3;
    printChainToTemp(*OUT, $prev_chain, $cur_chain, undef) if $$cur_chain[0][ORIGIN] == 3;
    print "$0: Single-sequence flip mode: ".($flip_counter+0)." gen2base hits backflipped\n" if $debug and $proflip;
    warn "$0: Warning: ".@bad_aligns." bad SLAGAN alignments discarded" if @bad_aligns > 0;
}

# Input: file with lines of the form "seq1 seq2 hash" (seq2 should be the same per file)
# Output: hash(key->align hash ID, value->1). Input file is deleted.
# A missing/unreadable file yields an empty hash; lines not in the expected
# tab-separated format are skipped.
sub load2MHashes($) {
    my ($file) = @_;
    my %hashes;
    local *FH;

    open(FH, "< $file") or return {};
    while (<FH>) {
        # Only trust the capture when the line actually matched
        my ($hash_id) = /\A[^\s]+\t[^\s]+\t([^\s]+)\n\Z/ or next;
        warn("Hash collision in \"$_\" vs. \"".$hashes{$hash_id}."\"") if defined $hashes{$hash_id};
        $hashes{$hash_id} = 1;
    }
    close FH;
    unlink $file unless $nodelete;

    return \%hashes;
}

# Input: file with gen2base alignments which should have the same seq1 ordered by start2 or not exist
# Output: hash(key->align hash ID, value->[prev align hash ID, next align hash ID]). Input file is deleted.
# The first align's "prev" slot holds 1 and the last align's "next" slot holds undef.
# Input is base 2, i.e. (start2 end2)=(start1 end1)...
sub loadBase2Hashes($) {
    my ($file) = @_;
    my ($prev_align, $cur_align, $next_align);
    my %hashes;
    local *FH;

    open(FH, "< $file") or return {};
    while (<FH>) { # Scan 1 line ahead because the next align must also be seen
        # Skip lines that do not parse rather than re-using the previous line's captures
        my @fields = /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s.*s1\:(.*)[\s]*s2\:(.*)/ or next;
        $next_align = [];
        # Hits are gen2base
        ($$next_align[START2], $$next_align[END2], $$next_align[START1], $$next_align[END1],
         $$next_align[SEQ2], $$next_align[SEQ1]) = @fields;
        checkAlignCoords($next_align);
        $$next_align[SEQ1] =~ s/^\s+//;
        $$next_align[SEQ1] =~ s/\s+$//;
        $$next_align[SEQ2] =~ s/^\s+//;
        $$next_align[SEQ2] =~ s/\s+$//;
        $$next_align[HASHID] = alignHashID($next_align);

        if ($cur_align) {
            warn("LB2H: Hash collision in \"$_\"") if defined $hashes{$$cur_align[HASHID]};
            # $next_align was just built above, so its hash ID is always available here
            $hashes{$$cur_align[HASHID]} = [$prev_align ? $$prev_align[HASHID] : 1, $$next_align[HASHID]];
        }
        $prev_align = $cur_align;
        $cur_align = $next_align;
    }
    # The last align seen has no successor
    $hashes{$$cur_align[HASHID]} = [$prev_align ? $$prev_align[HASHID] : 1, undef] if $cur_align;
    close FH;
    unlink $file unless $nodelete;

    return \%hashes;
}

# Load chained regions and expand them according to the expansion rules, then print them out and display some chain statistics
# The chain files written by the chain scanners (.2MM1.<seq> and .M2.<seq>) are piped through
# three sort(1) stages: sorted on seq1 -> expandSeq1() -> sorted on seq2 -> expandSeq2() ->
# sorted on seq1 -> finalExpand() -> $outfile. Chain files are deleted unless $nodelete is set.
sub postProcessRegions() {
    local (*IN, *OUT, *RH1, *WH1, *RH2, *WH2, *RH3, *WH3);
    my ($sort_pid1, $sort_pid2, $sort_pid3);

    $sort_pid1 = open2(\*RH1, \*WH1, "sort -k 7,7 -k 3,3n"); # sort on seq1, start1
    $sort_pid2 = open2(\*RH2, \*WH2, "sort -k 8,8 -k 5,5n"); # sort on seq2, start2
    $sort_pid3 = open2(\*RH3, \*WH3, "sort -k 7,7 -k 3,3n"); # sort on seq1, start1
#   open(WH1, "> ".$outfile) or die("$0: Could not open output file $outfile for writing: ".$!);
    open(OUT, "> ".$outfile) or die("$0: Could not open output file $outfile for writing: ".$!);
#   open(OUT, "| sort -k 1,1 -k 2,2n > ".$outfile) or die("$0: Could not open output file $outfile for writing: ".$!);

    foreach my $seq (sort alnum keys %$sizes1) {
        open(IN, "< ".$tmp_dir.$input_files[0].".2MM1.$seq") or next;
        print WH1 while <IN>;
        close IN;
        unlink $tmp_dir.$input_files[0].".2MM1.$seq" unless $nodelete;
    }
    foreach my $seq (sort alnum keys %$sizes2) {
        open(IN, "< ".$tmp_dir.$input_files[0].".M2.$seq") or next;
        print WH1 while <IN>;
        close IN;
        unlink $tmp_dir.$input_files[0].".M2.$seq" unless $nodelete;
    }
    # Each sort only produces output once its input handle is closed
    close WH1;

    expandSeq1(\*RH1, \*WH2);
    close RH1;
    waitpid $sort_pid1, 0;
    close WH2;

    expandSeq2(\*RH2, \*WH3);
    close RH2;
    waitpid $sort_pid2, 0;
    close WH3;

    finalExpand(\*RH3, \*OUT);
    close RH3;
    waitpid $sort_pid3, 0;
    # Buffered write errors on the final output only surface at close
    close OUT or die("$0: Error writing output file $outfile: ".$!);
}

# Input: chains ordered by seq1, start1
# Output: chains expanded on seq1
sub expandSeq1($$) {
    my ($RH, $WH) = @_;
    my ($first_align, $last_align, $type, $num_aligns,
$cur_seq, $preexpand1, $postexpand1, $prev_chain, $cur_chain, $next_chain); my (@line); while (<$RH>) { chomp; @line = split; # skip M2 regions if ($line[0] == 3) { $,= " "; print $WH @line[0..17]; print $WH " 0 0 0 0 "; print $WH @line[18..$#line]; print $WH "\n"; undef $,; next; } $prev_chain = $cur_chain; $cur_chain = $next_chain; $first_align = []; $last_align = []; ($type, $num_aligns, $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2], $$first_align[SEQ1], $$first_align[SEQ2],$$first_align[ORIENT], $$first_align[SCORE], $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2], $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE]) = @line; $$first_align[CHALO1] = ($$first_align[START1] < $$last_align[START1] ? $$first_align[START1] : $$last_align[START1]); $$first_align[CHAHI1] = ($$first_align[END1] > $$last_align[END1] ? $$first_align[END1] : $$last_align[END1]); my @saved_line = @line; $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line]; next unless defined $cur_chain; expSeq1Reg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq); # TODO # if ($cur_seq ne $$first_align[SEQ1]) { # undef $cur_chain; # $cur_seq = $$first_align[SEQ1]; # } } expSeq1Reg($WH, $cur_chain, $next_chain, undef, $cur_seq); } sub expSeq1Reg($$$$$) { my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_; my ($preexpand1, $postexpand1); $preexpand1 = $$cur_chain[0][CHALO1] - (defined $prev_chain ? $$prev_chain[0][CHAHI1] : 0); $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len; #$preexpand1 = 0 if $preexpand1 < 0; $preexpand1 = $max_expand_len if $preexpand1 < 0; # !!! 
$postexpand1 = $$next_chain[0][CHALO1] - $$cur_chain[0][CHAHI1]; $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len; #$postexpand1 = 0 if $postexpand1 < 0; $postexpand1 = $max_expand_len if $postexpand1 < 0; #$postexpand1 = 0 if defined $prev_chain and $$prev_chain[0][CHAHI1] > $$cur_chain[0][CHAHI1]; # don't expand if covered by another align $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1; $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1; $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1; $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]}; $cur_seq = $$cur_chain[0][SEQ1] if not defined $cur_seq; if ($cur_seq ne $$cur_chain[0][SEQ1]) { # Correct upper expansion $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $max_expand_len; $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]}; } print $WH $$cur_chain[2]." ".$$cur_chain[3]." ". $$cur_chain[0][START1]." ".$$cur_chain[0][END1]." ".$$cur_chain[0][START2]." ".$$cur_chain[0][END2]." ". $$cur_chain[0][SEQ1]." ".$$cur_chain[0][SEQ2]." ".$$cur_chain[0][ORIENT]." ".$$cur_chain[0][SCORE]." ". $$cur_chain[1][START1]." ".$$cur_chain[1][END1]." ".$$cur_chain[1][START2]." ".$$cur_chain[1][END2]." ". $$cur_chain[1][SEQ1]." ".$$cur_chain[1][SEQ2]." ".$$cur_chain[1][ORIENT]." ".$$cur_chain[1][SCORE]." ". $$cur_chain[0][CHALO1]." ".$$cur_chain[0][CHAHI1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E]; if ($print_chains) { my $i = 18; while (1) { print $WH " ".${$$cur_chain[4]}[$i]." ".${$$cur_chain[4]}[$i+1]." ".${$$cur_chain[4]}[$i+2]." 
".${$$cur_chain[4]}[$i+3]; last if @{$$cur_chain[4]} <= $i+4; $i+=4; } } print $WH "\n"; } # Input: chains ordered by seq2, start2 # Output: chains expanded on seq1 and seq2 (final output) sub expandSeq2($$) { my ($RH, $WH) = @_; my ($first_align, $last_align, $type, $num_aligns, $cur_seq, $preexpand1, $postexpand1, $preexpand2, $postexpand2, $prev_chain, $cur_chain, $next_chain); my (@line); while (<$RH>) { chomp; @line = split; # skip M1 regions if ($line[0] == 1) { $,= " "; print $WH @line[0..21]; print $WH " 0 0 0 0 "; print $WH @line[22..$#line]; print $WH "\n"; undef $,; next; } $prev_chain = $cur_chain; $cur_chain = $next_chain; $first_align = []; $last_align = []; ($type, $num_aligns, $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2], $$first_align[SEQ1], $$first_align[SEQ2],$$first_align[ORIENT], $$first_align[SCORE], $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2], $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE], $$first_align[CHALO1], $$first_align[CHAHI1], $$first_align[CHALO1E], $$first_align[CHAHI1E]) = @line; $$first_align[CHALO2] = ($$first_align[START2] < $$last_align[START2] ? $$first_align[START2] : $$last_align[START2]); $$first_align[CHAHI2] = ($$first_align[END2] > $$last_align[END2] ? 
$$first_align[END2] : $$last_align[END2]); my @saved_line = @line; $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line]; next unless defined $cur_chain; expSeq2Reg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq); # if ($cur_seq ne $$first_align[SEQ2]) { # undef $cur_chain; # $cur_seq = $$first_align[SEQ2]; # } } expSeq2Reg($WH, $cur_chain, $next_chain, undef, $cur_seq); } sub expSeq2Reg($$$$$) { my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_; my ($preexpand1, $postexpand1, $preexpand2, $postexpand2); $preexpand1 = $$cur_chain[0][CHALO1] - $$cur_chain[0][CHALO1E]; $postexpand1 = $$cur_chain[0][CHAHI1E] - $$cur_chain[0][CHAHI1]; $preexpand2 = $$cur_chain[0][CHALO2] - (defined $prev_chain ? $$prev_chain[0][CHAHI2] : 0); $preexpand2 = $preexpand1 * $expand_factor if $preexpand2 > $preexpand1 * $expand_factor and $$cur_chain[2] != 3; $preexpand2 = $max_expand_len if $preexpand2 > $max_expand_len; #$preexpand2 = 0 if $preexpand2 < 0; $preexpand2 = $max_expand_len if $preexpand2 < 0; $preexpand1 = $preexpand2 * $expand_factor if $preexpand1 > $preexpand2 * $expand_factor and $$cur_chain[2] != 3; $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len; $postexpand2 = $$next_chain[0][CHALO2] - $$cur_chain[0][CHAHI2]; $postexpand2 = $postexpand1 * $expand_factor if $postexpand2 > $postexpand1 * $expand_factor and $$cur_chain[2] != 3; $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len; #$postexpand2 = 0 if $postexpand2 < 0; $postexpand2 = $max_expand_len if $postexpand2 < 0; $postexpand1 = $postexpand2 * $expand_factor if $postexpand1 > $postexpand2 * $expand_factor and $$cur_chain[2] != 3; $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len; $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1; $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1; $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1; $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if 
$$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]}; $$cur_chain[0][CHALO2E] = $$cur_chain[0][CHALO2] - $preexpand2; $$cur_chain[0][CHALO2E] = 1 if $$cur_chain[0][CHALO2E] < 1; $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2; $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]}; if ($cur_seq ne $$cur_chain[0][SEQ2]) { # Correct upper expansion $postexpand2 = $postexpand1 * $expand_factor; $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len; $postexpand2 = 0 if $postexpand2 < 0; $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2; $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]}; } print $WH $$cur_chain[2]." ".$$cur_chain[3]." ". $$cur_chain[0][START1]." ".$$cur_chain[0][END1]." ".$$cur_chain[0][START2]." ".$$cur_chain[0][END2]." ". $$cur_chain[0][SEQ1]." ".$$cur_chain[0][SEQ2]." ".$$cur_chain[0][ORIENT]." ".$$cur_chain[0][SCORE]." ". $$cur_chain[1][START1]." ".$$cur_chain[1][END1]." ".$$cur_chain[1][START2]." ".$$cur_chain[1][END2]." ". $$cur_chain[1][SEQ1]." ".$$cur_chain[1][SEQ2]." ".$$cur_chain[1][ORIENT]." ".$$cur_chain[1][SCORE]." ". $$cur_chain[0][CHALO1]." ".$$cur_chain[0][CHAHI1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E]." ". $$cur_chain[0][CHALO2]." ".$$cur_chain[0][CHAHI2]." ".$$cur_chain[0][CHALO2E]." ".$$cur_chain[0][CHAHI2E]; if ($print_chains) { my $i = 22; while (1) { print $WH " ".${$$cur_chain[4]}[$i]." ".${$$cur_chain[4]}[$i+1]." ".${$$cur_chain[4]}[$i+2]." 
".${$$cur_chain[4]}[$i+3]; last if @{$$cur_chain[4]} <= $i+4; $i+=4; } } print $WH "\n"; } sub finalExpReg($$$$$) { my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_; my ($preexpand1, $postexpand1, $preexpand2, $postexpand2); if ($$cur_chain[2] == 1) { # M1: expand in seq1 on seq2 expands * factor only $preexpand1 = $$cur_chain[0][CHALO1] - $$cur_chain[0][CHALO1E]; $preexpand2 = $preexpand1 * $expand_factor; $preexpand2 = $max_expand_len if $preexpand2 > $max_expand_len; $postexpand1 = $$cur_chain[0][CHAHI1E] - $$cur_chain[0][CHAHI1]; $postexpand2 = $postexpand1 * $expand_factor; $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len; $$cur_chain[0][CHALO2E] = $$cur_chain[0][CHALO2] - $preexpand2; $$cur_chain[0][CHALO2E] = 1 if $$cur_chain[0][CHALO2E] < 1; $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2; $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]}; } elsif ($$cur_chain[2] == 3) { # M2: expand in seq2 on seq1 expands * factor only $preexpand2 = $$cur_chain[0][CHALO2] - $$cur_chain[0][CHALO2E]; $preexpand1 = $preexpand2 * $expand_factor; $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len; $postexpand2 = $$cur_chain[0][CHAHI2E] - $$cur_chain[0][CHAHI2]; $postexpand1 = $postexpand2 * $expand_factor; $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len; $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1; $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1; $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1; $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]}; } print $WH $$cur_chain[0][SEQ1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E]." ". $$cur_chain[0][SEQ2]." ".$$cur_chain[0][CHALO2E]." ".$$cur_chain[0][CHAHI2E]." 
".$$cur_chain[0][ORIENT]; print $WH " (".($$cur_chain[2]==1?"M1, ":$$cur_chain[2]==2?"DM, ":"M2, ").$$cur_chain[3]." aligns)" unless $no_aligntotals; if ($print_chains) { my $i = 26; while (1) { print $WH " [".${$$cur_chain[4]}[$i]."-".${$$cur_chain[4]}[$i+1]."=".${$$cur_chain[4]}[$i+2]."-".${$$cur_chain[4]}[$i+3]."]"; last if @{$$cur_chain[4]} <= $i+4; $i+=4; } } print $WH "\n"; } sub finalExpand($$) { my ($RH, $WH) = @_; my ($first_align, $last_align, $type, $num_aligns, $cur_seq, $preexpand1, $postexpand1, $preexpand2, $postexpand2, $prev_chain, $cur_chain, $next_chain); my %stats; my (@line); while (<$RH>) { chomp; @line = split; $prev_chain = $cur_chain; $cur_chain = $next_chain; $first_align = []; $last_align = []; ($type, $num_aligns, $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2], $$first_align[SEQ1], $$first_align[SEQ2],$$first_align[ORIENT], $$first_align[SCORE], $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2], $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE], $$first_align[CHALO1], $$first_align[CHAHI1], $$first_align[CHALO1E], $$first_align[CHAHI1E], $$first_align[CHALO2], $$first_align[CHAHI2], $$first_align[CHALO2E], $$first_align[CHAHI2E]) = @line; if ($type == 1) { $$first_align[CHALO2] = ($$first_align[START2] < $$last_align[START2] ? $$first_align[START2] : $$last_align[START2]); $$first_align[CHAHI2] = ($$first_align[END2] > $$last_align[END2] ? $$first_align[END2] : $$last_align[END2]); } elsif ($type == 3) { $$first_align[CHALO1] = ($$first_align[START1] < $$last_align[START1] ? $$first_align[START1] : $$last_align[START1]); $$first_align[CHAHI1] = ($$first_align[END1] > $$last_align[END1] ? 
$$first_align[END1] : $$last_align[END1]);
        }

        # Keep a private copy of this row's raw fields; it is stored as slot 4 of the chain record.
        my @saved_line = @line;
        $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line];
        next unless defined $cur_chain;
        finalExpReg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq);

        if ($debug or $print_stats) {
            # NOTE(review): $type and $num_aligns belong to the row just read (now $next_chain),
            # while ORIENT is taken from $cur_chain -- confirm this pairing is intended.
            if ($type == 1) {
                $$cur_chain[0][ORIENT] eq "+" ? $stats{"M1+"}++ : $stats{"M1-"}++;
                $stats{"M1min"} = $num_aligns if $stats{"M1min"} > $num_aligns or not defined $stats{"M1min"};
                $stats{"M1max"} = $num_aligns if $stats{"M1max"} < $num_aligns or not defined $stats{"M1max"};
                $stats{"M1mean"} += $num_aligns;
            } elsif ($type == 2) {
                $$cur_chain[0][ORIENT] eq "+" ? $stats{"DM+"}++ : $stats{"DM-"}++;
                $stats{"DMmin"} = $num_aligns if $stats{"DMmin"} > $num_aligns or not defined $stats{"DMmin"};
                $stats{"DMmax"} = $num_aligns if $stats{"DMmax"} < $num_aligns or not defined $stats{"DMmax"};
                $stats{"DMmean"} += $num_aligns;
            } else {
                $$cur_chain[0][ORIENT] eq "+" ? $stats{"M2+"}++ : $stats{"M2-"}++;
                $stats{"M2min"} = $num_aligns if $stats{"M2min"} > $num_aligns or not defined $stats{"M2min"};
                $stats{"M2max"} = $num_aligns if $stats{"M2max"} < $num_aligns or not defined $stats{"M2max"};
                $stats{"M2mean"} += $num_aligns;
            }
        }

        # A change of SEQ2 drops the current chain, so the next iteration starts with no predecessor.
        if ($cur_seq ne $$first_align[SEQ2]) {
            undef $cur_chain;
            $cur_seq = $$first_align[SEQ2];
        }
    }

    # Final call: $next_chain moves into the "current" argument position and has no successor (undef).
    finalExpReg($WH, $cur_chain, $next_chain, undef, $cur_seq);

    if ($debug or $print_stats) {
        # The "...mean" slots hold running sums up to here; divide by the chain count to get averages.
        foreach my $i ("DM", "M1", "M2") {
            $stats{$i."mean"} /= ($stats{$i."+"} + $stats{$i."-"}) unless ($stats{$i."+"} + $stats{$i."-"} == 0);
            print $i.": ".($stats{$i."+"} + $stats{$i."-"})." chains (".$stats{$i."+"}."+, ".$stats{$i."-"}."-); ".
                "length min ".$stats{$i."min"}.", avg ".$stats{$i."mean"}.", max ".$stats{$i."max"}."\n";
        }
    }
}

# Called only in a "$0 worker" invocation
#
# Runs one job inside the scratch directory $worker_tmp_dir:
#   1. creates the scratch directory and copies the score file into it,
#   2. moves the job tarball from the current directory into it and unpacks it,
#   3. runs $SLAGAN on every unpacked file, capturing stdout in FILE.glocal-out
#      and stderr in FILE.glocal-err,
#   4. appends all *glocal-out files to JOB.results.tar in the original directory,
#   5. removes the scratch directory, reports core files and deletes them
#      unless $nodelete is set.
#
# Arguments: $tar_file   job tarball name, relative to the current directory
#            $score_file path of the SLAGAN score file
#            $SLAGAN     command used to run SLAGAN
#            $debug      true for progress output
#
# NOTE(review): $worker_tmp_dir is concatenated directly with file names, so it
# presumably ends in "/" -- confirm where it is set.
sub workerRun($$$$) {
    my ($tar_file, $score_file, $SLAGAN, $debug) = @_;
    my ($tmp_dir, $io_dir) = ($worker_tmp_dir, getcwd);

    mkdir($tmp_dir) or die("$0 (worker): Could not create directory $tmp_dir: ".$!);
    copy($score_file, $tmp_dir)
        or warn("$0 (worker): Could not copy $score_file to $tmp_dir: ".$!);

    # Point $score_file at the copy.  The last path component is captured in list
    # context, so a bare file name (no "/") works and a failed match can never
    # pick up a stale $1 from an earlier regex.
    my ($score_base) = $score_file =~ m{([^/]+)$};
    defined $score_base
        or die("$0 (worker): Cannot determine file name of scorefile $score_file. Stopped");
    $score_file = $tmp_dir.$score_base;

    print("$0 (worker): Version ".$VERSION." started ".localtime()."\n") if $debug;
    print("$0 (worker): Jobfile=$tar_file, scorefile=$score_file, tmpdir=$tmp_dir, iodir=$io_dir, SLAGAN=$SLAGAN\n") if $debug;

    move($io_dir."/".$tar_file, $tmp_dir)
        or warn("$0 (worker): Could not move $tar_file to $tmp_dir: ".$!);
    # tar -v lists one extracted member per line; an empty list means extraction failed.
    my @files = `cd $tmp_dir; tar -xvf $tar_file` or warn("$0 (worker): Error extracting $tar_file");

    foreach my $file (@files) {
        chomp $file;
        #print "$SLAGAN $tmp_dir$file $score_file > $tmp_dir$file.glocal-out 2> $tmp_dir$file.glocal-err\n";
        system("$SLAGAN $tmp_dir$file $score_file ".
               "> $tmp_dir$file.glocal-out ".
               "2> $tmp_dir$file.glocal-err");
    }

    # Job name = tarball name without a trailing ".tar"; a name lacking the
    # suffix is used unchanged instead of inheriting a stale capture.
    (my $job_name = $tar_file) =~ s/\.tar$//;

    # Feed the result file names to xargs, which appends them to the results tarball.
    open(my $xargs, "| cd $tmp_dir; xargs tar --append --file=$io_dir/$job_name.results.tar")
        or die("$0 (worker): Could not start tar for $job_name.results.tar: ".$!);
    foreach my $file (glob("$tmp_dir/*glocal-out")) {
        my ($name) = $file =~ m{/([^/]+)$} or next;
        print {$xargs} $name." ";
    }
    close $xargs;

    rmtree $tmp_dir;

    # Each core.* file in the current directory is counted as one SLAGAN crash.
    if (opendir(my $dh, ".")) {
        if (my @x = grep(/core\./, readdir($dh))) {
            warn("$0 (worker): WARNING: $SLAGAN crashed ".@x." times");
        }
        closedir $dh;
    }
    unlink(glob("core.*")) unless $nodelete;
}

# Interrupt handler
#
# $_[0] is the signal name.  Forwards SIGQUIT to $clust_run_pid when one is
# recorded, removes the per-job files under $tmp_dir unless $debug or $nodelete
# is set, and exits with status 1.
sub dequeueClustJobs($) {
    print "\n$0: Received SIG".$_[0].". Cleaning up... ";
    if ($clust_run_pid) {
        # send SIGQUIT to clust_run so it can dequeue cluster jobs
        kill "QUIT", $clust_run_pid;
    }
    unless ($debug or $nodelete) {
        print "Removing job files...";
        foreach my $i (1..$num_jobs-1) {
            unlink $tmp_dir."JOB".$i.".tar";
            unlink $tmp_dir."JOB".$i.".results.tar";
            unlink $tmp_dir."CLUSTER_JOB_MESSAGES.$i";
            unlink $tmp_dir."CLUSTER_JOB_ERRMSG.$i";
        }
        unlink "$tmp_dir$input_glob.chaos";
        unlink $tmp_dir."CLUSTER_JOB_PARAMS";
        # Only a worker invocation removes the whole scratch directory.
        rmtree($tmp_dir) if $ARGV[0] eq "worker";
    }
    print "\n";
    exit(1);
}

# Retrieve sequence length data from GPDB
#
# $dbh is the database handle object, $genome either an all-digit data set id
# or a name that is resolved through get_family_dset.  Returns a reference to a
# hash mapping sequence name to size.
# NOTE(review): the meaning of result slots 4 and 14 (used as annotation db and
# family) and slot 2 comes from the GPDB API, which is not visible here.
sub get_all_seqs($$) {
    my ($dbh, $genome) = @_;
    my (%sizes, $chroms, $ctgs);

    my ($dset, $annot_db, $family) = ($genome =~ /^\d+$/o)
        ? ($genome + 0, ($dbh->get_data_set($genome))[4,14])
        : ($dbh->get_family_dset($genome))[0,4,14];
    print "$0: Genome $genome, dataset $dset, annotation db \"$annot_db\", family \"$family\"\n" if $debug;

    # Chromosome sizes are looked up only when an annotation database is known.
    if ($annot_db) {
        $chroms = $dbh->get_chroms(($dbh->get_data_set($dset))[2]);
        foreach my $chrom (@$chroms) {
            next unless $$chrom[1] == 1;
            my $name = "chr$$chrom[2]";
            my ($chr_id, $chr_type, $ctg_id, $size) = $dbh->find_seq($name, $dset, $annot_db);
            $sizes{$name} = $size if $chr_id;
        }
    }

    # Every distinct, non-empty contig name of the data set.
    $ctgs = $dbh->selectcol("SELECT name FROM dset$dset\_contigs " .
                            "WHERE name is not null and name != ? group by name", undef, "");
    foreach my $ctg (@$ctgs) {
        $sizes{$ctg} = $dbh->get_contig_size($dset, $ctg);
    }

    return \%sizes;
}

# Sort comparator (reads the sort globals $a and $b).
sub alnum {
    my ($i);
    my ($len1, $len2) = (length($a), length($b));
    # Advance $i past the common prefix, stopping at the first digit or difference.
    for ($i = 0; ($i < $len1) && ($i < $len2); ++$i) {
        my $c1 = substr($a, $i, 1);
        my $c2 = substr($b, $i, 1);
        ($c1 =~ /^\d/o) || ($c2 =~ /^\d/o) || ($c1 ne $c2) and last;
    }
    my $a_r = ($i < $len1) ? substr($a, $i) : "";
    my $b_r = ($i < $len2) ? substr($b, $i) : "";
    my ($a_n, $a_s) = ($a_r =~ /^(\d+)(.*)$/o);
    my ($b_n, $b_s) = ($b_r =~ /^(\d+)(.*)$/o);
    return (defined($a_n) && defined($b_n)) ? 
(($a_n <=> $b_n) || ($a_s cmp $b_s)) : ($a cmp $b);
}

# Guess the format of an alignment file from its first line.
# Returns 0 when the line looks like "name N N; name N N; score ..." and
# 1 when it looks like whitespace-separated "name N N name ..." columns;
# dies on anything else (including an empty file).
sub isBLAT($) {
    my ($file) = @_;

    open(my $fh, "<", $file) or die("$0: Cannot open input file $file: ".$!);
    my $line = readline($fh);
    close $fh;
    # An empty file yields undef; treat it as an unrecognised (empty) line.
    $line = "" unless defined $line;

    if ($line =~ /\A.+\s[\d]+\s[\d]+\;\s.+\s[\d]+\s[\d]+\;\sscore/) {
        return 0;
    } elsif ($line =~ /\A[^\s]+\s[\d]+\s[\d]+\s[^\s]+\s/) {
        return 1;
    } else {
        die("$0: Unknown input format in $file. Stopped");
    }
}

# Read the minimum sequence score from a SLAGAN score file: the first number in
# the second brace group of the "{+U+;...}" line.  If several lines match, the
# last non-zero value wins.  Dies when no usable value is found.
sub getMinSeqScore($) {
    my ($file) = @_;
    my $score;

    open(my $fh, "<", $file) or die("$0: Could not open SLAGAN scorefile $file: $!");
    while (defined(my $row = readline($fh))) {
        # sample line: {+U+;+U-;-U+;-U-}{70000 0 0 0}
        # Only look at $1 after a successful match; otherwise it may still hold
        # a capture from an unrelated earlier regex.
        if ($row =~ /\{\+U\+\;.+\}.*\{(\d+)\s.+\}/) {
            $score = $1 if $1;
        }
    }
    close $fh;

    die("$0: Could not determine min_seq_score from SLAGAN scorefile $file. Stopped") unless $score;
    print "$0: min_seq_score: $score\n" if $debug;
    return $score;
}

# Write "name<TAB>size" lines for every entry of the hash referenced by $sizes
# to $outfile, ordered by the alnum comparator.
sub writeSizes($$) {
    my ($sizes, $outfile) = @_;

    open(my $fh, ">", $outfile) or die("$0: Could not open file $outfile for writing: ".$!);
    # Use the hash that was passed in (the old code read the file-level $sizes1
    # regardless of the argument).
    foreach my $key (sort alnum keys %$sizes) {
        print {$fh} $key."\t".$sizes->{$key}."\n";
    }
    close $fh or die("$0: Could not write file $outfile: ".$!);
}

# Borrowed from if.pm to enable standalone conditional module loading on earlier versions of Perl
#
# useIf(CONDITION, PACKAGE, ARGS...): when CONDITION is true, requires PACKAGE
# and hands (PACKAGE, ARGS...) to its import method, if it has one.
sub useIf($$) {
    my $method = 'import';
    return unless shift; # CONDITION
    # After the shift, @_ is (PACKAGE, ARGS...); it is left untouched so that
    # the goto below passes it on unchanged.
    my $package = $_[0];
    (my $file = $package.".pm") =~ s!::!/!g;
    require $file;
    my $method_entry_point = $package->can($method);
    goto &$method_entry_point if $method_entry_point;
}

sub checkAlignCoords($) {
    my $cur_align = $_[0];
    if ($$cur_align[START1] > $$cur_align[END1]) { my $i = $$cur_align[START1]; $$cur_align[START1] = $$cur_align[END1]; $$cur_align[END1] = $i; }
    if ($$cur_align[START2] > $$cur_align[END2]) { my $i = $$cur_align[START2]; $$cur_align[START2] = $$cur_align[END2]; $$cur_align[END2] = $i; }
# if ($$cur_align[OSTART1] > $$cur_align[OEND1]) { my $i = $$cur_align[OSTART1]; $$cur_align[OSTART1] = $$cur_align[OEND1]; $$cur_align[OEND1] = $i; }
# if ($$cur_align[OSTART2] > $$cur_align[OEND2]) { my $i = $$cur_align[OSTART2]; $$cur_align[OSTART2] 
= $$cur_align[OEND2]; $$cur_align[OEND2] = $i; } } =head1 NAME Supermap: Piecewise monotonic alignment map generator for shuffle-lagan =head1 SYNOPSIS supermap.pl (gen2=id | sizes2=filename) (gen1=id | sizes1=filename) [-infile=] [-outfile=] [-bacteria] [-score=filename] [-f] [file1 file2 ...] =head1 EXAMPLES supermap.pl -sizes1=human.sizes -sizes2=mouse.sizes hm.chr*.chaos =head1 DESCRIPTION Supermap is a whole-genome alignment map generator. It is an extension to the Shuffle-LAGAN suite (Brudno et al., 2003). Supermap removes the asymmetry between the query genomes by running multiple SLAGAN passes and combining them into a full two-genome alignment. To run Supermap without the Berkeley Genome Pipeline functionality, you will need sequence length files for each of the genomes. Each file should contain one sequence length entry per line, of the form "sequence_name sequence_length". In the CHAOS output format (this program's input), negative orientation always means second pair of coords is inverted. In this program's output, negative orientation does not invert coordinates (coordinate pairs are always ascending). Run supermap.pl with no arguments to see a further description. The terms "hit" and "anchor" usually refer to local alignments produced by CHAOS or another program. The term "chain" refers to an extended union of a number of these local alignments. =head1 DEPENDENCIES Supermap depends on Utils.pm, SLAGAN, and a number of Unix utilities. To use the Berkeley Genome Pipeline and cluster functionality, Supermap needs GPutils.pm, GPDBI.pm, and clust_run. =head1 LIMITATIONS Supermap is designed to allow the manipulation of large datasets in a reasonable memory footprint. To do this, it allows multiple files on input and keeps most of its intermediate data in small temporary files. 
However, one current limitation is that the alignments for any sequence in either genome must fit into the largest addressable file size (typically 2GB), and the output alignments must also fit in that size (the remainder will be truncated). =head1 BUGS =head1 TODO TODO: bacteria description, examples, other input formats TODO: installer routine TODO: discuss input glob parameters TODO: local multithreading TODO: ignore escaped slashes when splitting dir/file (copy Alex) TODO: check for ++ etc in SLAGAN out TODO: .supermaprc file for score files, etc TODO: hazelton.lbl.gov/bugzilla for supermap =head1 AUTHOR Andrey Kislyuk L. =cut lagan20/test.score0000644000076500007650000000026110502337064015133 0ustar brudnobrudno00000000000000{+R+;-L-}{0 0.02 0 0;5000 0 0 0} {+R-;-L+}{200 0 0.1 0.02;5000 0 0 0} {-R+;+L-}{3000 0 0.5 0.02;5000 0 0 0} {+L+;-R-}{3000 0 0.5 0.02;5000 0 0 0} {+U+;+U-;-U+;-U-}{5000 0 0 0} lagan20/utils/0000755000076500007650000000000010502546662014266 5ustar brudnobrudno00000000000000lagan20/utils/cmerge2.pl0000755000076500007650000001562410502337061016151 0ustar brudnobrudno00000000000000
#!/usr/bin/env perl

# cmerge2.pl -- merge the contigs listed in a draft ordering file into one
# sequence (plus a ".masked" twin), trimming overlaps found between consecutive
# contigs, and record per-contig placement in PID/minfo.
#
# usage: cmerge seqfile mfafile draftfile outfile [-nocrop] [-skipfr pid]
#
# The script only proceeds when -skipfr is given; it then works in the
# directory ./PID and appends to PID/log.

use File::Basename;

$lagandir = $ENV{LAGAN_DIR};
$pid = $$;

# process arguments
# Four positional arguments plus at most "-nocrop" and "-skipfr pid".
# (The old test used "&&" and could never be true.)
if (@ARGV < 4 || @ARGV > 7) {
    print STDERR ("usage:\n cmerge seqfile mfafile draftfile outfile [-nocrop] [-skipfr pid]\n");
    exit(1);
}

$arglist = "";
$nocrop = 0;
for ($i = 4; $i < @ARGV; $i++) {
    if ($ARGV[$i] =~ /-nocrop/){
        $nocrop = 1;
    }
    elsif ($ARGV[$i] =~ /-skipfr/){
        $skipfr = 1;
        $pid = $ARGV[++$i];
        chomp $pid;
    }
    else {
        print STDERR "Bad arg to cmerge: $ARGV[$i]";
        exit(1);
    }
}
$arglist = "$arglist $recurfl";

if (!$skipfr) { exit(1); }

$newdir = `pwd`;
chomp $newdir;
$newdir = "$newdir/$pid";

open (LOGFILE, ">>$newdir/log") or die "Could not open $newdir/log: $!\n";
open (INFOFILE, ">$newdir/minfo") or die "Could not open $newdir/minfo: $!\n";

print STDERR ("\n");
print STDERR ("Computing Contig Overlaps\n");
print STDERR ("-------------------------\n");
print LOGFILE ("\n");
print LOGFILE ("Computing Contig Overlaps\n");
print LOGFILE ("-------------------------\n");

# initialize merged file
open (OFILE, ">$ARGV[3]") or die "Could not open $ARGV[3]: $!\n";
print OFILE (">merged\n");
close (OFILE);
`cp $ARGV[3] $ARGV[3].masked`;

# initialize padding file
open (OFILE, ">$newdir/padding") or die "Could not open $newdir/padding: $!\n";
print OFILE (">padding\n");
print OFILE ("NNNNNNNNNNNNNNNNNNNN.NNNNNNNNNNNNNNNNNNNN\n");
close (OFILE);
$padlength = `$lagandir/utils/getlength $newdir/padding`;
chomp $padlength;

# other initialization
$totlength = `$lagandir/utils/getlength $ARGV[0]`;
chomp $totlength;
$mergedEnd = 0;

# read contig list
# The first three lines of the draft file are skipped; each remaining line is
# "NAME.mfa --> (B E) score=S, offset=(O1 O2), index=N".
$numContigs = 0;
@list = `cat $ARGV[2]`;
for ($i = 3; $i < @list; $i++){
    $list[$i] =~ /(.*)\.mfa --\> \((\d+) (\d+)\) score=(\d+), offset=\((\d+) (\d+)\), index=(\d+)/;
    $filenames[$i-3] = $1;
    $seq1Begin[$i-3] = $2;
    $seq1End[$i-3] = $3;
    $score[$i-3] = $4;
    $s1shifts[$i-3] = $5;
    $s2shifts[$i-3] = $6;
    $num[$i-3] = $7;

    $temp = $seq1Begin[$i-3] - $s1shifts[$i-3];
    $seq2Begin[$i-3] = `$lagandir/utils/getcontigpos $filenames[$i-3].mfa $temp`;
    chomp $seq2Begin[$i-3];
    $seq2Begin[$i-3] += $s2shifts[$i-3];

    $temp = $seq1End[$i-3] - $s1shifts[$i-3];
    $seq2End[$i-3] = `$lagandir/utils/getcontigpos $filenames[$i-3].mfa $temp`;
    chomp $seq2End[$i-3];
    $seq2End[$i-3] += $s2shifts[$i-3];

    print STDERR "$filenames[$i-3].mfa --> $seq1Begin[$i-3] $seq1End[$i-3] $score[$i-3] $s1shifts[$i-3] $s2shifts[$i-3] $num[$i-3] $seq2Begin[$i-3] $seq2End[$i-3]\n";
    $numContigs++;
}

# extract contigs
$contigfile = basename ($ARGV[1]);
$contigdir = dirname ($ARGV[1]);
$newdir = `pwd`;
chomp $newdir;
$newdir = "$newdir/$pid";

# start out merged file with only padding
`mv $ARGV[3] $ARGV[3].new`;
`$lagandir/utils/seqmerge $ARGV[3].new $newdir/padding > $ARGV[3]`;
`mv $ARGV[3].masked $ARGV[3].masked.new`;
`$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/padding > $ARGV[3].masked`;

$contigStart[0] = 1;
$startChop[0] = 0;
`cp $filenames[0] $newdir/current`;
`cp $filenames[0].masked $newdir/current.masked`;

# merge contigs
# "current" always holds the not-yet-flushed part of the previous contig.
for ($i = 1; $i < $numContigs; $i++){
    `$lagandir/rechaos.pl $newdir/current $filenames[$i] -recurse \"(12,0,40,0)x\" -maskedonly > $newdir/currentanchs`;

    # find the overlap
    `$lagandir/utils/getoverlap $newdir/currentanchs` =~ /(-?\d+) (-?\d+) (-?\d+) (-?\d+)/;
    $rangebegin1 = $1;
    $rangeend1 = $2;
    $rangebegin2 = $3;
    $rangeend2 = $4;
    chomp $rangebegin1;
    chomp $rangeend1;
    chomp $rangebegin2;
    chomp $rangeend2;

    $thislength = `$lagandir/utils/getlength $filenames[$i-1]`;
    chomp $thislength;
    $nextlength = `$lagandir/utils/getlength $filenames[$i]`;
    chomp $nextlength;

    # if no overlap, flush the buffer
    if ($rangebegin1 == -1 && $rangeend1 == -1){
        print STDERR "No overlap found...\n";
        `mv $ARGV[3] $ARGV[3].new`;
        `$lagandir/utils/seqmerge $ARGV[3].new $newdir/current $newdir/padding > $ARGV[3]`;
        `cp $filenames[$i] $newdir/current`;
        `mv $ARGV[3].masked $ARGV[3].masked.new`;
        `$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/current.masked $newdir/padding > $ARGV[3].masked`;
        `cp $filenames[$i].masked $newdir/current.masked`;

        $contigEnd[$i-1] = $contigStart[$i-1] + $thislength - 1;
        $contigStart[$i] = $contigEnd[$i-1] + $padlength + 1;
        $endChop[$i-1] = 0;
        $startChop[$i] = 0;
    }
    else {
        print STDERR "Overlap detected!\n";

        # extract the overlapped region > overlap
        $j = $rangebegin1 - 1;
        # Nothing of the previous contig is emitted when the overlap starts at
        # its first base, so its emitted length must be 0 rather than whatever
        # an earlier iteration left behind.
        $overlaplength = 0;
        if ($j > 0){
            `$lagandir/utils/cextract $newdir/current 1 $j 0 0 > $newdir/overlap`;
            `$lagandir/utils/cextract $newdir/current.masked 1 $j 0 0 > $newdir/overlap.masked`;
            $overlaplength = `$lagandir/utils/getlength $newdir/overlap`;
            chomp $overlaplength;
            `mv $ARGV[3] $ARGV[3].new`;
            `$lagandir/utils/seqmerge $ARGV[3].new $newdir/overlap > $ARGV[3]`;
            `mv $ARGV[3].masked $ARGV[3].masked.new`;
            `$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/overlap.masked > $ARGV[3].masked`;
        }

        # extract the nonoverlapped region > current
        `$lagandir/utils/cextract $filenames[$i] $rangebegin2 $nextlength 0 0 > $newdir/current`;
        `$lagandir/utils/cextract $filenames[$i].masked $rangebegin2 $nextlength 0 0 > $newdir/current.masked`;

        $contigEnd[$i-1] = $contigStart[$i-1] + $overlaplength - 1;
        $contigStart[$i] = $contigEnd[$i-1] + 1;
        $endChop[$i-1] = $thislength - $rangeend1;
        $startChop[$i] = $rangebegin2 - 1;
    }

    # A ".rc" in the file name marks a reverse-complemented contig.
    if (index ($filenames[$i-1], ".rc") == -1) {
        $direction = "+";
    }
    else {
        $direction = "-";
    }

    @temp = `head $filenames[$i-1]`;
    chomp $temp[0];
    $temp[0] = substr $temp[0], 1;
    print INFOFILE "$temp[0]\n";
    print INFOFILE "$num[$i-1] $seq1Begin[$i-1] $seq1End[$i-1] $contigStart[$i-1] $contigEnd[$i-1] $startChop[$i-1] $endChop[$i-1] $direction $score[$i-1] $seq2Begin[$i-1] $seq2End[$i-1]\n";
}

# flush the last contig
$thislength = `$lagandir/utils/getlength $filenames[$numContigs - 1]`;
chomp $thislength;
$contigEnd[$numContigs - 1] = $contigStart[$numContigs - 1] + $thislength - 1;
$endChop[$numContigs - 1] = 0;

`mv $ARGV[3] $ARGV[3].new`;
`$lagandir/utils/seqmerge $ARGV[3].new $newdir/current $newdir/padding > $ARGV[3]`;
`mv $ARGV[3].masked $ARGV[3].masked.new`;
`$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/current.masked $newdir/padding > $ARGV[3].masked`;

if (index ($filenames[$numContigs - 1], ".rc") == -1) {
    $direction = "+";
}
else {
    $direction = "-";
}

@temp = `head $filenames[$numContigs - 1]`;
chomp $temp[0];
$temp[0] = substr $temp[0], 1;
print INFOFILE "$temp[0]\n";
print INFOFILE "$num[$numContigs - 1] $seq1Begin[$numContigs - 1] $seq1End[$numContigs - 1] $contigStart[$numContigs - 1] $contigEnd[$numContigs - 1] $startChop[$numContigs - 1] $endChop[$numContigs - 1] $direction $score[$numContigs - 1] $seq2Begin[$numContigs - 1] $seq2End[$numContigs - 1]\n";

print STDERR "Merging complete!\n\n";
print LOGFILE "Merging complete!\n\n";

# 1. write getoverlap() -- given a set of chaos hits, find the beginning and end in both seqs
# 2. implement contigStart, contigStop -- positions of the contig begins/ends in the merged draft sequence
# 3. startChop, endChop -- number chopped from each end
# 4. 
secFrom, secTo -- pos in the chopped contig sequence lagan20/utils/draft.pl0000755000076500007650000001711110502337061015716 0ustar brudnobrudno00000000000000#!/usr/bin/env perl use File::Basename; $lazyflag = 0; $lagandir = $ENV{LAGAN_DIR}; $recurfl = "-recurse \"(12,0,30,0)x,(13,1,30,0)x,(3,0,30,0)xt,(8,1,30,0)x,(7,1,30,0)x,(7,1,15,0)x\""; $laganparams = "-maskedonly "; $anchgapstart = -5; $anchgapcont = -0.2; $usebounds = 1; $startingrate = 65; $rateinc = 1; $frlevel = ""; $pid = "mergedir"; if (@ARGV < 2) { if ((@ARGV == 1) && ($ARGV[0] =~ /-version/)){ print STDERR "DRAFT version 0.1\n"; exit (0); } else { print STDERR ("Usage:\n\ndraft.pl SEQFILE MFAFILE [-cons RATE] [-translate] [-version]\n"); exit (1); } } $arglist = ""; $skipfr = 0; for ($i = 2; $i < @ARGV; $i++) { if ($ARGV[$i] =~ /-recurse/){ $recurfl = " -recurse \"".$ARGV[++$i]."\""; } elsif ($ARGV[$i] =~ /-skipfr/){ $skipfr = 1; $pid = $ARGV[++$i]; chomp $pid; } elsif ($ARGV[$i] =~ /-translate/){ $recurfl = $recurfl." -translate"; } elsif ($ARGV[$i] =~ /-cons/){ $startingrate = $ARGV[++$i]; chomp $startingrate; } elsif ($ARGV[$i] =~ /-lazy/){ $lazyflag = 1; } elsif ($ARGV[$i] =~ /-fastreject/){ $frarg = " -fastreject $frlevel"; } else { print STDERR "Bad arg to draft: $ARGV[$i]"; } } $arglist = "$arglist $recurfl -usebounds $laganparams $frarg"; # create new directory $newdir = `pwd`; chomp $newdir; $newdir = "$newdir/$pid"; `mkdir $newdir` if (!(-e $newdir)); open (LOGFILE, ">$newdir/log"); print STDERR ("\n"); print STDERR ("Finding Contig Alignments\n"); print STDERR ("-------------------------\n"); print LOGFILE ("\n"); print LOGFILE ("Finding Contig Alignments\n"); print LOGFILE ("-------------------------\n"); # extract contigs; $contigfile = basename ($ARGV[1]); $contigdir = dirname ($ARGV[1]); `cp $ARGV[1] $newdir`; @contigs = `perl $lagandir/mextract.pl $newdir/$contigfile`; if ($?) 
{ exit(1);} for ($i = 0; $i < @contigs; $i++){ chomp $contigs[$i]; `$lagandir/utils/rc < $contigs[$i] > $contigs[$i].rc`; if ($?) { exit(1); } } # extract masked contigs $maskedname = $ARGV[1].".masked"; if (-e $maskedname){ $maskedcontigfile = basename ($maskedname); `cp $maskedname $newdir`; @maskedcontigs = `perl $lagandir/mextract.pl $newdir/$maskedcontigfile -masked`; if ($?) { exit(1);} for ($i = 0; $i < @maskedcontigs; $i++){ chomp $maskedcontigs[$i]; `$lagandir/utils/rc < $maskedcontigs[$i] > $contigs[$i].rc.masked`; if ($?) { exit(1); } } } # create file storing name of contig stats open (LFILE, ">$newdir/filenames") if (!$lazyflag); $num = 0; for ($i = 0; $i < @contigs; $i++){ chomp $contigs[$i]; $skip1 = $skip2 = 0; # make alignments if (!$lazyflag || !(-e "$contigs[$i].mfa")){ $execute = "perl $lagandir/lagan.pl $ARGV[0] $contigs[$i] -mfa $arglist -out $contigs[$i].mfa"; $execute = $execute." -gap $anchgapstart $anchgapcont" if ($usebounds); `$execute`; $ex_val = $? >> 8; if (!(-e "$contigs[$i].mfa")) { $skip1 = 1; } elsif ($?) { exit(1);} if (!$skip1 && $usebounds){ # compute bounds @bounds = `$lagandir/utils/getbounds anchs.final $ARGV[0] $contigs[$i]`; if ($?) { exit(1);} $bounds[0] =~ /-s1 (\d+) (\d+) -s2 (\d+) (\d+)/; $s1shift = $1 - 1; $s2shift = $3 - 1; } `rm anchs.final`; } if (!$lazyflag || !(-e "$contigs[$i].rc.mfa")){ $execute = "perl $lagandir/lagan.pl $ARGV[0] $contigs[$i].rc -mfa $arglist -out $contigs[$i].rc.mfa"; $execute = $execute." -gap $anchgapstart $anchgapcont" if ($usebounds); `$execute`; $ex_val = $? >> 8; if (!(-e "$contigs[$i].rc.mfa")) { $skip2 = 1; } elsif ($?) { exit(1);} if (!$skip2 && $usebounds){ # compute bounds @bounds = `$lagandir/utils/getbounds anchs.final $ARGV[0] $contigs[$i].rc`; if ($?) 
{ exit(1);} $bounds[0] =~ /-s1 (\d+) (\d+) -s2 (\d+) (\d+)/; $s1rcshift = $1 - 1; $s2rcshift = $3 - 1; } `rm anchs.final`; } if ($skip1) { $fscore = 0; } else { $fscore = `$lagandir/utils/scorealign $contigs[$i].mfa $startingrate`; chomp $fscore; if ($?) { exit(1);} } if ($skip2) { $bscore = 0; } else { $bscore = `$lagandir/utils/scorealign $contigs[$i].rc.mfa $startingrate`; chomp $bscore; if ($?) { exit(1);} } # pick strand # print LFILE "$s1shift $contigs[$i].mfa\n" if (!$lazyflag); # print LFILE "$s1rcshift $contigs[$i].rc.mfa\n" if (!$lazyflag); # if (0){ if ($fscore > 0 || $bscore > 0){ $j = $i + 1; if ($fscore > $bscore){ print STDERR ("(+) direction preferred for Contig \"$contigs[$i]\": $fscore > $bscore\n"); print LOGFILE ("(+) direction preferred for Contig \"$contigs[$i]\": $fscore > $bscore\n"); print LFILE "$j $s1shift $s2shift $contigs[$i].mfa\n" if (!$lazyflag); print STDERR "$j $s1shift $s2shift $contigs[$i].mfa\n" if (!$lazyflag); } elsif ($bscore > $fscore){ print STDERR ("(-) direction preferred for Contig \"$contigs[$i]\": $fscore < $bscore\n"); print LOGFILE ("(-) direction preferred for Contig \"$contigs[$i]\": $fscore < $bscore\n"); print LFILE "$j $s1rcshift $s2rcshift $contigs[$i].rc.mfa\n" if (!$lazyflag); print STDERR "$j $s1rcshift $s2rcshift $contigs[$i].rc.mfa\n" if (!$lazyflag); } } # } else { print STDERR ("Contig \"$contigs[$i]\" could not be matched: $fscore, $bscore\n"); print LOGFILE ("Contig \"$contigs[$i]\" could not be matched: $fscore, $bscore\n"); } } close (LFILE); print STDERR ("\n"); print STDERR ("Computing Contig Ordering\n"); print STDERR ("-------------------------\n\n"); print LOGFILE ("\n"); print LOGFILE ("Computing Contig Ordering\n"); print LOGFILE ("-------------------------\n\n"); $foundorder = 0; for ($cutoff = $startingrate; !$foundorder && ($cutoff < 100); $cutoff += $rateinc){ `$lagandir/utils/scorecontigs /$newdir/filenames $ARGV[0] $newdir/contignames $cutoff > $newdir/ranges`; if ($?) 
{ exit(1);} @list = `cat $newdir/ranges`; $list[0] =~ /numContigs = (\d+)/; next if ($1 == 0); `$lagandir/utils/contigorder $newdir/ranges > $newdir/corder`; if ($?) { exit(1);} @list = `cat $newdir/corder`; chomp $list[0]; $foundorder = 1 if ($list[0] ne "ordering failed"); } if ($foundorder){ open (OFILE, ">$newdir/draft"); print OFILE ("Draft Ordering\n"); print OFILE ("--------------\n\n"); @contignames = `cat $newdir/contignames`; for ($i = 0; $i < @contignames; $i++){ $contignames[$i] =~ /(\d+) (\d+) (\d+) (.*)/; $num[$i] = $1; chomp $num[$i]; $s1shifts[$i] = $2; chomp $s1shifts[$i]; $s2shifts[$i] = $3; chomp $s2shifts[$i]; $filenames[$i] = $4; chomp $filenames[$i]; } @list = `cat $newdir/corder`; for ($i = 0; $i < @list; $i++){ $list[$i] =~ /(\d+) --\> \((\d+) (\d+)\) (.*)/; $score = $4; chomp $score; print OFILE ("$filenames[$1] --> ($2 $3) score=$score, offset=($s1shifts[$1] $s2shifts[$1]), index=$num[$1]\n"); } close (OFILE); print STDERR `cat $newdir/draft`; print LOGFILE `cat $newdir/draft`; close (LOGFILE); } else { print STDERR "Could not compute ordering."; print LOGFILE "Could not compute ordering."; close (LOGFILE); exit (0); } $filename1 = $ARGV[0]; $filename2 = "$newdir/$contigfile"; `$lagandir/cmerge2.pl $filename1 $filename2 $newdir/draft $filename2.merged -skipfr $pid`; if ($?) 
{ exit(1); } print STDERR "EXECUTE $lagandir/cmerge2.pl $filename1 $filename2 $newdir/draft $filename2.merged -skipfr $pid\n"; `cp $filename2.merged merged_seq.fa`; `cp $filename2.merged.masked merged_seq.fa.masked`; `cp $newdir/minfo minfo`; `cp $newdir/ranges ranges`; `cp $newdir/log log`; print STDERR ("\n"); print STDERR ("Computing Final Alignment\n"); print STDERR ("-------------------------\n\n"); # `rm -rf $newdir`; lagan20/utils/flipchaos.pl0000644000076500007650000000050610502337061016563 0ustar brudnobrudno00000000000000#!/usr/bin/perl while ($line = ) { $line =~ /(.*)\s+([0-9]+)\s+([0-9]+);\s*(.*)\s+([0-9]+)\s+([0-9]+);\s*score\s* =\s*([0-9]*)\.?([0-9]*)\s*\(([+-])\)/; if ($9 eq "+" || $6 > $5) { print "$4 $5 $6; $1 $2 $3; score = $7.$8 ($9)\n"; } else { print "$4 $6 $5; $1 $3 $2; score = $7.$8 ($9)\n"; } } lagan20/utils/mextract.pl0000755000076500007650000000346310502337061016452 0ustar brudnobrudno00000000000000#!/usr/bin/env perl if (@ARGV < 1) { print ("usage:\n mextract.pl filename [-masked]\n"); exit(1); } $masked=0; $filename = $ARGV[0]; if(@ARGV==2) { if ($ARGV[1] eq "-masked") { $masked = 1; } } open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n"; $prefix = substr $filename, 0, (rindex $filename, "."); if ($masked || index ($filename, ".masked") != -1) { $prefix = substr $filename, 0, (rindex $prefix, "."); } $line = ; chomp $line; while (substr($line, 0, 1) ne ">") { $line = ; chomp $line; } $suffix = "fa"; if ($masked) { $suffix = "$suffix.masked"; } if (substr($line, 0, 1) eq ">") { $name = substr($line, 1); if (index ($name, " ") != -1){ $name = substr($name, 0, index ($name, " ")); } if (substr ($name, length ($name) - 1) eq ","){ $name = substr($name, 0, length ($name) - 1); } # $name = substr($line, 1); # $_ = substr($line, 1); # /\w+/g; # $name = $&; # substr($line, 1)." 
" =~ /(.+)[,]\s+/g; # $name = $1; $fname = "$prefix\_$name.$suffix"; print("$fname\n"); open(OUTFILE, ">$fname"); print OUTFILE ">$name\n"; } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { close OUTFILE; # substr($line, 1)." " =~ /(.+)[,]\s/g; # $name = $1; $name = substr($line, 1); if (index ($name, " ") != -1){ $name = substr($name, 0, index ($name, " ")); } if (substr ($name, length ($name) - 1) eq ","){ $name = substr($name, 0, length ($name) - 1); } # $_ = substr($line, 1); # /\w+/g; # $name = $&; $fname = "$prefix\_$name.$suffix"; print("$fname\n"); open(OUTFILE, ">$fname"); print OUTFILE ">$name\n"; } else { print OUTFILE "$line"; } } close OUTFILE; lagan20/utils/mf2bin.pl0000755000076500007650000000312510502337061015773 0ustar brudnobrudno00000000000000#!/usr/bin/env perl # defaults # constants # usage notes if (@ARGV < 1) { print ("usage:\n mf2bin.pl inputfile [-out outputfile] \n"); exit(1); } # parse parameters $tofile = 0; for ($i=1; $i<@ARGV; $i++) { if ($ARGV[$i] eq "-out") { $tofile = 1; $outfilename = $ARGV[++$i]; } } if ($tofile) { open(OUTFILE, ">$outfilename"); } # read in Multi-FASTA file $infilename = $ARGV[0]; open(FASTAFILE, "$infilename") || die "Could not open $infilename.\n\n"; $line = ; chomp $line; $i=0; %list=(); @seqs=(()); if (substr($line, 0, 1) eq ">") { $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { $i++; $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; push @seqs, (); } else { push @{$seqs[$i]}, "$line"; } } $i=0; for $row (@seqs) { @strs[$i++] = join "", @$row; } if (@keys != 2) { print ("mpack needs two FASTA sequences\n"); exit(1); } # pack bin # format from Alex Poliakov's glass2bin.pl script %base_code = ('-' => 0, 'A' => 1, 'C' => 2, 'T' => 3, 'G' => 4, 'N' => 5, 
'a' => 1, 'c' => 2, 't' => 3, 'g' => 4, 'n' => 5); $l = length @strs[0]; # $l--; $s1 = reverse(@strs[0]); $s2 = reverse(@strs[1]); for ($i=0; $i<$l; $i++) { if ($tofile) { print OUTFILE pack("H2", $base_code{chop($s1)} . $base_code{chop($s2)}); } else { print pack("H2", $base_code{chop($s1)} . $base_code{chop($s2)}); } } lagan20/utils/mpretty.pl0000755000076500007650000001143310502337061016323 0ustar brudnobrudno00000000000000#!/usr/bin/env perl # defaults $linelen = 50; $interval = 10; $labellen = 5; $uselabels = 1; $useintervals = 1; $usecounts = 1; $usebase = 0; $liststart = 1; $listend = 0; $usestart = 0; $useend = 0; # constants $minlinelen = 10; $mininterval = 10; $minlabellen = 3; # usage notes if (@ARGV < 1) { print ("usage:\n mpretty.pl filename\n"); print ("options:\n"); print (" -linelen value\n"); print (" (min: $minlinelen, default: $linelen)\n"); print (" -interval value\n"); print (" (min: $mininterval, default: $interval, none: 0)\n"); print (" -labellen value\n"); print (" (min: $labellen, default: $labellen, none: 0)\n"); print (" -base sequence_name\n"); print (" (if used, must specify a sequence on which to base counting\n"); print (" -start value\n"); print (" (if used, must specify a start coordinate (>=1)\n"); print (" -end value\n"); print (" (if used, must specify an end coordinate (>=start)\n"); print (" -nocounts\n"); exit(1); } # parse parameters for ($i=1; $i<@ARGV; $i++) { if ($ARGV[$i] eq "-nocounts") { $usecounts = 0; } if ($ARGV[$i] eq "-linelen") { $linelen = $ARGV[++$i]; if ($linelen < $minlinelen) { $linelen = $minlinelen; } } if ($ARGV[$i] eq "-interval") { $interval = $ARGV[++$i]; if ($interval <= 0) { $useintervals = 0; } if ($interval < $mininterval) { $interval = $mininterval; } } if ($ARGV[$i] eq "-labellen") { $labellen = $ARGV[++$i]; if ($labellen <= 0) { $uselabels = 0; } if ($labellen < $minlabellen) { $labellen = $minlabellen; } } if ($ARGV[$i] eq "-base") { $baseseq = $ARGV[++$i]; $usebase = 1; } if ($ARGV[$i] eq 
"-start") { $usestart = 1; $liststart = $ARGV[++$i]; } if ($ARGV[$i] eq "-end") { $useend = 1; $listend = $ARGV[++$i]; } } # preprocessing for labels if ($uselabels) { $labtail = ""; for ($i=0; $i<$labellen; $i++) { $labtail="$labtail "; } } if (($usestart && ($liststart<1)) || ($useend && ($listend<$liststart))) { die "Invalid range specified: [$liststart, $listend].\n\n"; } # read in Multi-FASTA file $filename = $ARGV[0]; open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n"; $line = ; chomp $line; while (substr($line, 0, 1) ne ">") { $line = ; chomp $line; } $i=0; %list=(); @seqs=(()); if (substr($line, 0, 1) eq ">") { $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; @count[$i]=0; @label[$i] = substr("@keys[$i]$labtail", 0, $labellen); $list{@keys[$i]}=$i; } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { $i++; $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; @count[$i]=0; @label[$i] = substr("@keys[$i]$labtail", 0, $labellen); $list{@keys[$i]}=$i; push @seqs, (); } else { push @{$seqs[$i]}, "$line"; } } $i=0; $maxlen = 0; for $row (@seqs) { @strs[$i++] = join "", @$row; $templen = length @strs[$i-1]; if ($templen > $maxlen) { $maxlen = $templen; } } $foundseq=0; if ($usebase) { foreach $s (@keys) { $foundseq = ($s eq $baseseq) || $foundseq; } if (!$foundseq) { die "Could not find Base Sequence: <$baseseq>\n\n"; } } # preprocessing for counts if ($usecounts) { foreach $s (@keys) { $_ = @strs[$list{$s}]; $ls = tr/ATCGNatcgn/ATGCNatcgn/; @tot[$list{$s}] = $ls; } } # length of sequence display $l=$maxlen; if ((!$listend) || ($listend>$maxlen)) { $listend = $maxlen; } if ($maxlen < $liststart) { die "Starting out of bounds...\b\b"; } if ($usebase) { # find base sequence position $i=0; $j=0; while ($j<$liststart) { if (substr(@strs[$list{$baseseq}], $i, 1) ne "-") { $j++; } $i++; } $liststart = $i; while ($j<$listend) { if (substr(@strs[$list{$baseseq}], $i, 1) ne "-") { 
$j++; } $i++; } $listend = $i; } # pretty print if ($usecounts) { foreach $s (@keys) { $_ = substr(@strs[$list{$s}], 0, $liststart-1); $lc = tr/ATCGN/ATGCN/; @count[$list{$s}]+=$lc; } } for ($i=$liststart-1; $i<$listend; $i+=$linelen) { if ($listend-$i<$linelen) { $linelen = $listend-$i;} foreach $s (@keys) { if ($uselabels) { print "@label[$list{$s}] : "; } $p = substr(@strs[$list{$s}], $i, $linelen); print "$p"; if ($usecounts) { $_ = $p; $lc = tr/ATCGN/ATGCN/; @count[$list{$s}]+=$lc; print " @ @count[$list{$s}]/@tot[$list{$s}]"; } print "\n"; } if ($useintervals) { if ($uselabels) { print "$labtail = "; } for ($j=$i+1; $j<=$i+$linelen && $j<=$l; $j+=$interval) { $ct = "$j"; print $ct; for ($k=0; $k<($interval-(length $ct)); $k++) { print " "; } } print "\n"; } print "\n"; } lagan20/utils/mproject.pl0000755000076500007650000000245310502337061016444 0ustar brudnobrudno00000000000000
#!/usr/bin/env perl

# mproject.pl -- project a multi-FASTA alignment onto a subset of its sequences.
# Prints the named sequences with every column removed in which all of them
# have a gap ("-"), wrapped at 60 characters per line.

if (@ARGV < 2) {
    print ("usage:\n mproject.pl filename seqname1 [seqname2 ... ]\n");
    exit(1);
}

$filename = $ARGV[0];
@targets = @ARGV[1 .. $#ARGV];

open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n";

# The first line must be a header; a sequence is named by the first run of
# word characters after ">".
$line = readline(FASTAFILE);
chomp $line;
$i=0;
%list=();
@seqs=(());
if (substr($line, 0, 1) eq ">") {
    $_ = substr($line, 1);
    /\w+/g;
    $keys[$i] = $&;
    $list{$keys[$i]}=$i;
}
else {
    print ("$filename is NOT a Multi-FASTA file...\n");
    exit(1);
}

# %list maps sequence name to its index; @seqs collects the lines of each sequence.
while ($line = readline(FASTAFILE)) {
    chomp $line;
    if (substr($line, 0, 1) eq ">") {
        $i++;
        $_ = substr($line,1);
        /\w+/g;
        $keys[$i] = $&;
        $list{$keys[$i]}=$i;
        push @seqs, ();
    }
    else {
        push @{$seqs[$i]}, "$line";
    }
}

$i=0;
for $row (@seqs) {
    $strs[$i++] = join "", @$row;
}

# An unknown name used to fall through to index 0 and silently print the first
# sequence under the wrong label.
foreach $s (@targets) {
    exists $list{$s} or die "Could not find sequence \"$s\" in $filename.\n\n";
}

$seqlen = length $strs[0];
# $seqlen--;

# A column is kept as soon as one target has a non-gap character in it.
for ($i=0; $i<$seqlen; $i++) {
    $isgap[$i] = 1;
    foreach $s (@targets) {
        if (substr($strs[$list{$s}], $i, 1) ne "-") {
            $isgap[$i] = 0;
            last;
        }
    }
}

foreach $s (@targets) {
    print ">$keys[$list{$s}]\n";
    $j=0;
    for ($i=0; $i<$seqlen; $i++) {
        if(!$isgap[$i]) {
            print substr($strs[$list{$s}], $i, 1);
            $j++;
            if (($j % 60) == 0) {
                print "\n";
            }
        }
    }
    print "\n";
}
lagan20/utils/mrun.pl0000755000076500007650000001412410502337061015600 0ustar brudnobrudno00000000000000#!/usr/bin/env perl # This script requires the environment variables: # LAGAN_DIR and VISTA_DIR # VISTA .plotfile defaults ($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set"; $paregmin = 75; $paregmax = 100; $pamin = 50; $pbases = 10000; $ptickdist = 2000; $presolution = 25; $pwindow = 40; $pnumwindows = 4; if (@ARGV < 1) { print ("usage:\n mrun.pl filename -tree \"(tree...)\"\n"); print ("options: [base sequence name [sequence pairs]]\n"); print ("default: [base sequence name = first sequence]\n"); print ("other MLAGAN parameters...\n"); print ("other VISTA parameters...\n"); exit(1); } $filename = $ARGV[0]; $i = 1; $j = 0; $k = 0; $l = 0; $treespec = 0; while ($i < @ARGV) { if ($ARGV[$i] eq "-tree") { @params[$j] = "-tree"; @params[++$j] = "\"$ARGV[++$i]\""; $_ = @params[$j]; $topen = tr/"\("/"\("/; $tclose = tr/"\)"/"\)"/; $treespec = ($topen == $tclose); } else { if (substr($ARGV[$i],0,1) eq "-") { if (substr($ARGV[$i],0,2) eq "--") { @vparams[$l++] = $ARGV[$i++]; @vparams[$l++] = $ARGV[$i]; } else { $j++; @params[$j] = $ARGV[$i]; if ((@params[$j] eq "-gapstart") || (@params[$j] eq "-gapend") || (@params[$j] eq "-gapcont") || (@params[$j] eq "-gapperseq") || (@params[$j] eq "-match") || (@params[$j] eq "-mismatch") || (@params[$j] eq "-overlap") || (@params[$j] eq "-translate") || (@params[$j] eq "-gfc") || (@params[$j] eq "-ext") || (@params[$j] eq "-glwidth")) { @params[++$j] = $ARGV[++$i]; } } } else { @targets[$k++] = $ARGV[$i]; } } $i++; } for ($i=0; $i<@vparams; $i+=2) { if (@vparams[$i] eq "--regmin") { $paregmin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--regmax") { $paregmax = @vparams[$i+1]; } elsif (@vparams[$i] eq "--min") { $pamin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--bases") { $pbases = @vparams[$i+1]; } elsif (@vparams[$i] eq "--tickdist") { $ptickdist = @vparams[$i+1]; } elsif 
(@vparams[$i] eq "--resolution") { $presolution = @vparams[$i+1]; } elsif (@vparams[$i] eq "--window") { $pwindow = @vparams[$i+1]; } elsif (@vparams[$i] eq "--numwindows") { $pnumwindows = @vparams[$i+1]; } } if (!$treespec) { print ("Must specify valid phylogenetic tree...\n"); exit(1); } if ($lagandir eq "") { print ("Must specify environment variable LAGAN_DIR\n"); exit(1); } $mextstr = "$lagandir/utils/mextract.pl $filename"; print "$mextstr\n"; if(!`$mextstr`) { print "\nMulti-FASTA extraction failure...\n"; exit(1); } if (-e "$filename.masked") { $mextstr = "$lagandir/utils/mextract.pl $filename.masked -masked"; print "$mextstr\n"; if(!`$mextstr`) { print "\nMasked Multi-FASTA extraction failure...\n"; exit(1); } } open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n"; $line = ; chomp $line; while (substr($line, 0, 1) ne ">") { $line = ; chomp $line; } $i=0; %list=(); if (substr($line, 0, 1) eq ">") { $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; if (@targets == 0) { @targets[0] = @keys[$i]; print "Setting Base Sequence: @targets[0]\n"; } } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { $i++; $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; } } $prefix = substr $filename, 0, (rindex $filename, "."); $prefix = "$prefix\_"; foreach $s (@keys) { @fnames[$list{$s}] = "$prefix$keys[$list{$s}].fa"; } if ((@targets > 1)) { if (@targets %2 != 1) { $c = @targets; print ("$c sequences: "); print ("Must specify single base sequence\n"); print (" OR base sequence and pairs of sequences.\n"); exit(1); } } $mfiles = ""; foreach $s (@fnames) { $mfiles = "$mfiles $s"; } $mparams = ""; foreach $s (@params) { $mparams = "$mparams $s"; } $mlagan = "$lagandir/mlagan$mfiles$mparams > $prefix.out"; print STDERR "\n$mlagan\n\n"; if(`$mlagan`) { print "\n\n"; exit(1); } $i=0; if (@targets == 1) { foreach $s (@keys) { if ($s ne @targets[0]) 
{ @targets[++$i] = @targets[0]; @targets[++$i] = $s; } } } $prjhead = "$lagandir/utils/mproject.pl $prefix.out"; $binhead = "$lagandir/utils/mf2bin.pl"; $j=0; for($i=1; $i<@targets; $i+=2) { $outprefix = "$prefix@targets[$i]\_@targets[$i+1]"; $pargs = "$targets[$i]_aligned $targets[$i+1]_aligned"; $pstr = "$prjhead $pargs > $outprefix.prj"; print "$pstr\n"; if(`$pstr`) { print "\nprojection failure...\n"; exit(1); } $bstr = "$binhead $outprefix.prj -out $outprefix.bin"; print "$bstr\n"; if(`$bstr`) { print "\npacking failure...\n"; exit(1); } @bins[$j++] = "$outprefix.bin"; print "\n"; } %distinct=(); foreach $s (@targets) { $distinct{$s} = 0; } @dseqs = keys %distinct; $plotfile = "$prefix.plotfile"; open (PLOTFILE, ">$plotfile"); print PLOTFILE "TITLE $prefix.fa - mlagan\n\n"; print PLOTFILE "OUTPUT $prefix.pdf\n\n"; print PLOTFILE "SEQUENCES "; foreach $s (@dseqs) { print PLOTFILE "$s "; } print PLOTFILE "\n\n"; $i=1; foreach $s (@bins) { print PLOTFILE "ALIGN $s BINARY\n"; print PLOTFILE " SEQUENCES @targets[$i] @targets[$i+1]\n"; print PLOTFILE " REGIONS $paregmin $paregmax\n"; print PLOTFILE " MIN $pamin\n"; print PLOTFILE "END\n\n"; $i+=2; } print "touch $prefix.ann\n\n"; `touch $prefix.ann`; print PLOTFILE "GENES $prefix.ann\n\n"; print PLOTFILE "LEGEND on\n\n"; print PLOTFILE "COORDINATE @targets[0]\n\n"; print PLOTFILE "PAPER letter\n\n"; print PLOTFILE "BASES $pbases\n\n"; print PLOTFILE "TICK_DIST $ptickdist\n\n"; print PLOTFILE "RESOLUTION $presolution\n\n"; print PLOTFILE "WINDOW $pwindow\n\n"; print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n"; #$vistadir = `echo \$VISTA_DIR`; #chomp $vistadir; #if ($vistadir eq "") { # print ("Must specify environment variable VISTA_DIR\n"); # exit(1); #} #$vistastr = "$vistadir/RunVista $plotfile"; #print "$vistastr\n"; #if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); } print "\n\nmrun.pl -- end.\n\n"; lagan20/utils/mrunfile.pl0000755000076500007650000000337310502337061016444 0ustar 
brudnobrudno00000000000000
#!/usr/bin/env perl
# mrunfile.pl -- run mrun.pl (or mrunpairs.pl with -pairwise) from a
# parameter file, then optionally run VISTA on the resulting plot file
# (-vista).
#
# Parameter file, as parsed below: empty lines and lines starting with "#"
# are ignored; the first remaining line is the sequence file; lines starting
# with "--" and lines starting with "-" are collected separately; every other
# line is a sequence name.  The command line is built as
#   <script> <seqfile> <"-" lines> <sequence names> <"--" lines>
#
# This script requires the environment variables:
# LAGAN_DIR and VISTA_DIR (VISTA_DIR is only read when -vista is given)

if (@ARGV < 1) {
    print ("usage:\n mrunfile.pl filename [-pairwise] [-vista]\n\n");
    exit(1);
}

($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set";

$filename = $ARGV[0];
# Three-argument open: the file name can never be read as an open mode.
open(PARAMFILE, "<", $filename) || die "Could not open $filename.\n\n";

$pairwise = 0;
$dovista  = 0;
for ($l = 1; $l < @ARGV; $l++) {
    if    ($ARGV[$l] eq "-pairwise") { $pairwise = 1; }
    elsif ($ARGV[$l] eq "-vista")    { $dovista  = 1; }
}

$i = 0;
$j = 0;
$k = 0;
$filespec = 0;    # set once the sequence-file line has been seen
while (defined($line = readline(*PARAMFILE))) {
    chomp $line;
    next if (substr($line, 0, 1) eq "#") || ($line eq "");
    if (!$filespec) {
        $seqfile  = $line;
        $filespec = 1;
    }
    elsif (substr($line, 0, 2) eq "--") { $vparams[$j++] = $line; }
    elsif (substr($line, 0, 1) eq "-")  { $params[$i++]  = $line; }
    else                                { $seqs[$k++]    = $line; }
}
close(PARAMFILE);

# Without a sequence file there is nothing to run.
if (!$filespec) {
    die "No sequence file specified in $filename.\n\n";
}

if ($pairwise) { $mexecs = "mrunpairs.pl"; }
else           { $mexecs = "mrun.pl"; }

$mstr = join " ", "$lagandir/utils/$mexecs", $seqfile, @params, @seqs, @vparams;
print "$mstr\n";
`$mstr`;

if ($dovista) {
    # The prefix must be cut from the sequence file name itself.  It used to
    # be cut at the position of the last "." in the parameter file name,
    # which gives a wrong plot file name whenever the two names differ.
    $prefix = substr $seqfile, 0, (rindex $seqfile, ".");
    $prefix = "${prefix}_";
    if ($pairwise) { $prefix = "${prefix}pairwise_"; }
    $plotfile = "$prefix.plotfile";
    ($vistadir = $ENV{VISTA_DIR}) or die "VISTA_DIR not set";
    $vistastr = "$vistadir/RunVista $plotfile";
    print "$vistastr\n";
    # Empty output from RunVista is treated as failure.
    if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); }
}

print "\nmrunfile.pl -- end.\n\n";
lagan20/utils/mrunpairs.pl0000755000076500007650000001363210502337061016642 0ustar brudnobrudno00000000000000#!/usr/bin/env perl # This script requires the environment variables: # LAGAN_DIR and VISTA_DIR # VISTA .plotfile defaults ($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set"; $paregmin = 75; $paregmax = 100; $pamin = 50; $pbases = 10000; $ptickdist = 2000;
$presolution = 25; $pwindow = 40; $pnumwindows = 4; if (@ARGV < 1) { print ("usage:\n mrunpairs.pl filename\n"); print ("options: [base sequence name [sequence pairs]]\n"); print ("default: [base sequence name = first sequence]\n"); print ("other MLAGAN parameters...\n"); print ("other VISTA parameters...\n"); exit(1); } $filename = $ARGV[0]; $i = 1; $j = 0; $k = 0; $l = 0; $treespec = 0; while ($i < @ARGV) { if ($ARGV[$i] eq "-tree") { $treepos = $j+1; @params[$j] = "-tree"; @params[++$j] = "\"$ARGV[++$i]\""; $_ = @params[$j]; $topen = tr/"\("/"\("/; $tclose = tr/"\)"/"\)"/; $treespec = ($topen == $tclose); } else { if (substr($ARGV[$i],0,1) eq "-") { if (substr($ARGV[$i],0,2) eq "--") { @vparams[$l++] = $ARGV[$i++]; @vparams[$l++] = $ARGV[$i]; } else { $j++; @params[$j] = $ARGV[$i]; if ((@params[$j] eq "-gapstart") || (@params[$j] eq "-gapend") || (@params[$j] eq "-gapcont") || (@params[$j] eq "-gapperseq") || (@params[$j] eq "-match") || (@params[$j] eq "-mismatch") || (@params[$j] eq "-overlap") || (@params[$j] eq "-glwidth")) { @params[++$j] = $ARGV[++$i]; } } } else { @targets[$k++] = $ARGV[$i]; } } $i++; } for ($i=0; $i<@vparams; $i+=2) { if (@vparams[$i] eq "--regmin") { $paregmin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--regmax") { $paregmax = @vparams[$i+1]; } elsif (@vparams[$i] eq "--min") { $pamin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--bases") { $pbases = @vparams[$i+1]; } elsif (@vparams[$i] eq "--tickdist") { $ptickdist = @vparams[$i+1]; } elsif (@vparams[$i] eq "--resolution") { $presolution = @vparams[$i+1]; } elsif (@vparams[$i] eq "--window") { $pwindow = @vparams[$i+1]; } elsif (@vparams[$i] eq "--numwindows") { $pnumwindows = @vparams[$i+1]; } } if (!$treespec) { $j++; $treepos = $j+1; @params[$j] = "-tree"; @params[++$j] = "\"()\""; } if ($lagandir eq "") { print ("Must specify environment variable LAGAN_DIR\n"); exit(1); } $mextstr = "$lagandir/mextract.pl $filename"; print "$mextstr\n"; if(!`$mextstr`) { print "\nMulti-FASTA 
extraction failure...\n"; exit(1); } if (-e "$filename.masked") { $mextstr = "$lagandir/mextract.pl $filename.masked -masked"; print "$mextstr\n"; if(!`$mextstr`) { print "\nMasked Multi-FASTA extraction failure...\n"; exit(1); } } open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n"; $line = ; chomp $line; while (substr($line, 0, 1) ne ">") { $line = ; chomp $line; } $i=0; %list=(); $i=0; %list=(); if (substr($line, 0, 1) eq ">") { $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; if (@targets == 0) { @targets[0] = @keys[$i]; print "Setting Base Sequence: @targets[0]\n"; } } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { $i++; $_ = substr($line, 1); /\w+/g; @keys[$i] = $&; $list{@keys[$i]}=$i; } } $fprefix = substr $filename, 0, (rindex $filename, "."); $prefix = "$fprefix\_"; $pprefix = "$fprefix\_pairwise\_"; foreach $s (@keys) { @fnames[$list{$s}] = "$prefix$keys[$list{$s}].fa"; } if ((@targets > 1)) { if (@targets %2 != 1) { $c = @targets; print ("$c sequences: "); print ("Must specify single base sequence\n"); print (" OR base sequence and pairs of sequences.\n"); exit(1); } } $i=0; if (@targets == 1) { foreach $s (@keys) { if ($s ne @targets[0]) { @targets[++$i] = @targets[0]; @targets[++$i] = $s; } } } $j=0; for($i=1; $i<@targets; $i+=2) { $outprefix = "$pprefix@targets[$i]\_@targets[$i+1]"; $mfiles = " @fnames[$list{@targets[$i]}] @fnames[$list{@targets[$i+1]}]"; @params[$treepos]="\"(@targets[$i] @targets[$i+1])\""; $mparams = ""; foreach $s (@params) { $mparams = "$mparams $s"; } $mlagan = "$lagandir/mlagan$mfiles$mparams > $outprefix.out"; print "\n$mlagan\n\n"; if(`$mlagan`) { print "\n\n"; exit(1); } $binhead = "$lagandir/mpack.pl"; $bstr = "$binhead $outprefix.out -out $outprefix.bin"; print "$bstr\n"; if(`$bstr`) { print "\npacking failure...\n"; exit(1); } @bins[$j++] = "$outprefix.bin"; print "\n"; } %distinct=(); foreach $s 
(@targets) { $distinct{$s} = 0; } @dseqs = keys %distinct; $plotfile = "$pprefix.plotfile"; open (PLOTFILE, ">$plotfile"); print PLOTFILE "TITLE $prefix.fa - mlagan\n\n"; print PLOTFILE "OUTPUT $pprefix.pdf\n\n"; print PLOTFILE "SEQUENCES "; foreach $s (@dseqs) { print PLOTFILE "$s "; } print PLOTFILE "\n\n"; $i=1; foreach $s (@bins) { print PLOTFILE "ALIGN $s BINARY\n"; print PLOTFILE " SEQUENCES @targets[$i] @targets[$i+1]\n"; print PLOTFILE " REGIONS $paregmin $paregmax\n"; print PLOTFILE " MIN $pamin\n"; print PLOTFILE "END\n\n"; $i+=2; } print "touch $prefix.ann\n\n"; `touch $prefix.ann`; print PLOTFILE "GENES $prefix.ann\n\n"; print PLOTFILE "LEGEND on\n\n"; print PLOTFILE "COORDINATE @targets[0]\n\n"; print PLOTFILE "PAPER letter\n\n"; print PLOTFILE "BASES $pbases\n\n"; print PLOTFILE "TICK_DIST $ptickdist\n\n"; print PLOTFILE "RESOLUTION $presolution\n\n"; print PLOTFILE "WINDOW $pwindow\n\n"; print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n"; #$vistadir = `echo \$VISTA_DIR`; #chomp $vistadir; #if ($vistadir eq "") { # print ("Must specify environment variable VISTA_DIR\n"); # exit(1); #} #$vistastr = "$vistadir/RunVista $plotfile"; #print "$vistastr\n"; #if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); } print "\n\nmrunpairs.pl -- end.\n\n"; lagan20/utils/msplit.pl0000755000076500007650000000344510502337061016133 0ustar brudnobrudno00000000000000#!/usr/bin/env perl if (@ARGV < 1) { print ("usage:\n msplit.pl filename [-masked]\n"); exit(1); } $masked=0; $filename = $ARGV[0]; if(@ARGV==2) { if ($ARGV[1] eq "-masked") { $masked = 1; } } open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n"; #$prefix = substr $filename, 0, (rindex $filename, "."); #if ($masked || index ($filename, ".masked") != -1) { # $prefix = substr $filename, 0, (rindex $prefix, "."); #} $line = ; chomp $line; while (substr($line, 0, 1) ne ">") { $line = ; chomp $line; } $suffix = "fa"; if ($masked) { $suffix = "$suffix.masked"; } if (substr($line, 0, 1) eq ">") { 
$name = substr($line, 1); if (index ($name, " ") != -1){ $name = substr($name, 0, index ($name, " ")); } if (substr ($name, length ($name) - 1) eq ","){ $name = substr($name, 0, length ($name) - 1); } # $name = substr($line, 1); # $_ = substr($line, 1); # /\w+/g; # $name = $&; # substr($line, 1)." " =~ /(.+)[,]\s+/g; # $name = $1; $fname = "$name.$suffix"; print("$fname\n"); open(OUTFILE, ">$fname"); print OUTFILE ">$name\n"; } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { close OUTFILE; # substr($line, 1)." " =~ /(.+)[,]\s/g; # $name = $1; $name = substr($line, 1); if (index ($name, " ") != -1){ $name = substr($name, 0, index ($name, " ")); } if (substr ($name, length ($name) - 1) eq ","){ $name = substr($name, 0, length ($name) - 1); } # $_ = substr($line, 1); # /\w+/g; # $name = $&; $fname = "$name.$suffix"; print("$fname\n"); open(OUTFILE, ">$fname"); print OUTFILE ">$name\n"; } else { print OUTFILE "$line"; } } close OUTFILE; lagan20/utils/mviz.pl0000755000076500007650000001171710502337061015611 0ustar brudnobrudno00000000000000#!/usr/bin/env perl # This script requires the environment variables: # LAGAN_DIR and VISTA_DIR ($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set"; $paregmin = 75; $paregmax = 100; $pamin = 50; $pbases = 10000; $ptickdist = 2000; $presolution = 25; $pwindow = 40; $pnumwindows = 4; if (@ARGV < 2) { print ("usage:\n mviz.pl data_file param_file [plotfile]\n\n"); exit(1); } $pfspec = 0; if (@ARGV==3) { $pfspec = 1; $plotfile=@ARGV[2]; print "Using VISTA plotfile: $plotfile\n"; } $filename = $ARGV[1]; open(PARAMFILE, "$filename") || die "Could not open $filename.\n\n"; $i=0; $j=0; $k=0; $filespec = 0; while ($line = ) { chomp $line; if ((substr($line, 0, 1) ne "#") && ($line ne "")) { if (!$filespec) { $seqfile = $line; $filespec = 1; } elsif (substr($line,0,1) eq "-") { if (substr($line,0,2) eq "--") { @vparams[$j++] = $line; } else { 
@params[$i++] = $line; } } else { @targets[$k++] = $line; } } } $seqfile = @ARGV[0]; if ($lagandir eq "") { print ("Must specify environment variable LAGAN_DIR\n"); exit(1); } for ($i=0; $i<@vparams; $i+=2) { if (@vparams[$i] eq "--regmin") { $paregmin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--regmax") { $paregmax = @vparams[$i+1]; } elsif (@vparams[$i] eq "--min") { $pamin = @vparams[$i+1]; } elsif (@vparams[$i] eq "--bases") { $pbases = @vparams[$i+1]; } elsif (@vparams[$i] eq "--tickdist") { $ptickdist = @vparams[$i+1]; } elsif (@vparams[$i] eq "--resolution") { $presolution = @vparams[$i+1]; } elsif (@vparams[$i] eq "--window") { $pwindow = @vparams[$i+1]; } elsif (@vparams[$i] eq "--numwindows") { $pnumwindows = @vparams[$i+1]; } } open(FASTAFILE, "$seqfile") || die "Could not open $seqfile.\n\n"; $prefix = substr $seqfile, 0, (rindex $seqfile, "."); if (substr($prefix, -1, 1) ne "_") {$prefix = "$prefix\_";} $line = ; chomp $line; while (substr($line, 0, 1) ne ">") { $line = ; chomp $line; } $i=0; %list=(); if (substr($line, 0, 1) eq ">") { @keys[$i] = substr($line, 1); $list{@keys[$i]}=$i; if (@targets == 0) { @targets[0] = @keys[$i]; print "Setting Base Sequence: @targets[0]\n"; } } else { print ("$filename is NOT a Multi-FASTA file...\n"); exit(1); } while ($line = ) { chomp $line; if (substr($line, 0, 1) eq ">") { $i++; @keys[$i] = substr($line, 1); $list{@keys[$i]}=$i; } } if ((@targets > 1)) { $j=0; for ($i=1; $i<@targets; $i++) { $_ = @targets[$i]; @bp[$j++]=/\w+/g; $_=$&; @bp[$j++]=/\w+/g; } $j=1; foreach $s (@bp) { @targets[$j++]=$s; } if (@targets %2 != 1) { $c = @targets; print ("$c sequences: "); print ("Must specify single base sequence\n"); print (" OR base sequence and pairs of sequences.\n"); exit(1); } } $i=0; if (@targets == 1) { foreach $s (@keys) { # $s = substr $s, 0, (rindex $s, "_aligned"); if ($s ne @targets[0]) { @targets[++$i] = @targets[0]; @targets[++$i] = $s; } } } print "TARGETS:\n";foreach $s (@targets) { print "\"$s\"\n"; } 
$prjhead = "$lagandir/utils/mproject.pl $seqfile"; $binhead = "$lagandir/utils/mf2bin.pl"; $j=0; for($i=1; $i<@targets; $i+=2) { $outprefix = "$prefix@targets[$i]\_@targets[$i+1]"; $pargs = "$targets[$i] $targets[$i+1]"; $pstr = "$prjhead $pargs > $outprefix.prj"; print "$pstr\n"; if(`$pstr`) { print "\nprojection failure...\n"; exit(1); } $bstr = "$binhead $outprefix.prj -out $outprefix.bin"; print "$bstr\n"; if(`$bstr`) { print "\npacking failure...\n"; exit(1); } @bins[$j++] = "$outprefix.bin"; print "\n"; } %distinct=(); foreach $s (@targets) { $distinct{$s} = 0; } @dseqs = keys %distinct; if (!$pfspec) { $plotfile = "$prefix.plotfile"; open (PLOTFILE, ">$plotfile"); print PLOTFILE "TITLE $prefix.fa - mlagan\n\n"; print PLOTFILE "OUTPUT $prefix.pdf\n\n"; print PLOTFILE "SEQUENCES "; foreach $s (@dseqs) { print PLOTFILE "$s "; } print PLOTFILE "\n\n"; $i=1; foreach $s (@bins) { print PLOTFILE "ALIGN $s BINARY\n"; print PLOTFILE " SEQUENCES @targets[$i] @targets[$i+1]\n"; print PLOTFILE " REGIONS $paregmin $paregmax\n"; print PLOTFILE " MIN $pamin\n"; print PLOTFILE "END\n\n"; $i+=2; } print "touch $prefix.ann\n\n"; `touch $prefix.ann`; print PLOTFILE "GENES $prefix.ann GFF\n\n"; print PLOTFILE "LEGEND on\n\n"; print PLOTFILE "COORDINATE @targets[0]\n\n"; print PLOTFILE "PAPER letter\n\n"; print PLOTFILE "BASES $pbases\n\n"; print PLOTFILE "TICK_DIST $ptickdist\n\n"; print PLOTFILE "RESOLUTION $presolution\n\n"; print PLOTFILE "WINDOW $pwindow\n\n"; print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n"; } ($vistadir = $ENV{VISTA_DIR}) or die "VISTA_DIR not set"; $vistastr = "$vistadir/RunVista $plotfile"; print "$vistastr\n"; if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); } print "\n\nmviz.pl -- end.\n\n"; lagan20/utils/Utils.pm0000644000076500007650000003072510502337061015722 0ustar brudnobrudno00000000000000#!/usr/bin/env perl package Utils; require 5.000; use strict; use Exporter; use Cwd; use IO::File; use POSIX qw(setsid); use Sys::Syslog qw(:DEFAULT 
setlogsock); sub Trim( @ ); sub Lock_File( $ ; $ $ $ ); sub Unlock_File( $ ); sub Write_Log( $ $ ; $ $ ); sub Parse_Filename( $ ); sub Get_Abs_Path( $ ); sub Expand_Path( $ ); sub Get_Random_Key( ; $ ); sub Hex2Ascii( $ ); sub Ascii2Hex( $ ); sub Get_Config_Record( $ $ ); sub Round( $ ); sub Set_Log( $ $ ); sub Log( $ $ ); sub Min( $ $ ); sub Max( $ $ ); sub Reg_Diff( $ $ ; $ $ $ $ $ ); sub Reg_Rem_Overlap( $ ; $ $ $ ); sub Reg_Sort( $ ; $ $ $ ); sub Reg_Intersect( $ $ ; $ $ $ $ $ ); sub Reg_Merge( $ ; $ $ $ ); use vars qw(@ISA @EXPORT $VERSION $JOB $Error $Syslog $Facility $Msg_Prefix); @ISA = qw(Exporter); @EXPORT = qw(Trim Lock_File Unlock_File Write_Log Parse_Filename Get_Abs_Path Expand_Path Hex2Ascii Ascii2Hex Get_Config_Record Get_Random_Key Round Set_Log Log Min Max Reg_Diff Reg_Rem_Overlap Reg_Sort Reg_Intersect Reg_Merge redirect_err2log openlogs safe_glob daemon wr_log wr_err start_watcher confirm $JOB); my $Id = '$Id: Utils.pm,v 1.21 2005/01/07 23:08:59 poliakov Exp $'; ($VERSION) = ($Id =~ /,v\s+(\d+\S+)/o); $JOB = '^(\S+)\@(\S+?)_(\d{4})(?:_(.+)|)$'; $Error = 0; $Syslog = 0; $Facility = "user"; $Msg_Prefix = undef; my $E_FORK = "cannot fork"; my @LOG_FILE = (); my %Locks = (); sub Trim( @ ) { for (my $i = 0; $i <= $#_; ++$i) { $_[$i] =~ s/^\s+//; $_[$i] =~ s/\s+$// } } sub Lock_File( $ ; $ $ $ ) { my ($file, $retry, $timeout, $max_mtime) = @_; my ($lock_fh, $start_time, $mtime); if (!$file || ($file =~ /\/$/o)) { $Error = "Invalid filename"; return 0; } $file = Get_Abs_Path("$file.lock"); if (exists($Locks{$file})) { $Error = "Already locked"; return 1; } if (!-w (Parse_Filename($file))[0]) { $Error = "Permission denied"; return 0; } if (!defined($retry)) { $retry = 1; } if (!defined($timeout)) { $timeout = 1200; } if (!defined($max_mtime)) { $max_mtime = ($timeout > 0) ? 
int($timeout / 2) : 0; } $start_time = time(); LOCK: { if (!($lock_fh = IO::File->new($file, O_RDWR|O_CREAT|O_EXCL))) { if (!$retry || (($timeout > 0) && ((time() - $start_time) > $timeout))) { $Error = "Locked by someone else"; return 0; } if ($max_mtime > 0) { $mtime = (stat($file))[9]; if ($mtime && ((time() - $mtime) > $max_mtime)) { unlink($file); } } redo LOCK; } } $lock_fh->close(); $Locks{$file} = 1; return 1; } sub Unlock_File( $ ) { my ($file) = @_; if (!$file) { $Error = "Invalid filename"; return 0; } $file = Get_Abs_Path("$file.lock"); if (!exists($Locks{$file})) { $Error = "Not locked"; return 0; } if (!unlink($file)) { $Error = "Cannot unlock"; return 0; } delete($Locks{$file}); return 1; } { my $Uname; foreach my $dir ('/bin', '/sbin', '/usr/bin', '/usr/sbin') { -x "$dir/uname" and $Uname = "$dir/uname", last; } my $Host = $Uname ? `$Uname -n` : 'localhost'; chomp($Host); ($Host) = ($Host =~ /^([^\.]+)(\..*)?$/); sub Write_Log( $ $ ; $ $ ) { no strict "refs"; my ($log_file, $msg, $name, $pid) = @_; my $error = 0; my $date; local *LOG; if (!defined($log_file) || !defined($msg)) { return 0; } if (*{$log_file}{IO}) { *LOG = *{$log_file}{IO}; } elsif ($log_file eq '/dev/null') { return 1; } else { if (!Lock_File($log_file)) { return 0; } if (!open(LOG, ">> $log_file")) { $error = 1; } } if (!$error) { chomp($msg); $date = localtime(time()); if (!$name) { $name = $0; } if (!$pid) { $pid = $$; } if (!print LOG "$date $Host $name\[$pid\]: $msg\n") { $error = 1; } if (!*{$log_file}{IO}) { close(LOG); } } if ($error && $!) 
{ $Error = "$!"; } if (!*{$log_file}{IO}) { Unlock_File($log_file); } return !$error; }}

# Split a path at its last "/".
# Returns (directory including the trailing "/", file name); the directory is
# "" when the path contains no "/".  Returns an empty list for undef.
sub Parse_Filename( $ ) {
    my ($name) = @_;
    defined($name) or return ();
    my $last_slash_pos = rindex($name, "/");
    if ($last_slash_pos < 0) {
        return ("", $name);
    }
    return (substr($name, 0, $last_slash_pos + 1),
            substr($name, $last_slash_pos + 1));
}

# Expand a leading "~" or "~user" to a home directory.
# "~user" is looked up with getpwnam; a bare "~" uses $HOME, then $LOGDIR,
# then the passwd entry of the effective uid.  Paths that do not start with
# "~" (and false values) are returned unchanged, as is the path when no home
# directory can be determined.
sub Expand_Path( $ ) {
    my ($path) = @_;
    $path && ($path =~ /^~/o) or return $path;
    my ($user, $rest) = ($path =~ /^~([^\/]*)(.*)$/o);
    my $home_dir = $user
        ? (getpwnam($user))[7]
        : ($ENV{"HOME"} || $ENV{"LOGDIR"} || (getpwuid($>))[7]);
    defined($home_dir) and $path = "$home_dir$rest";
    return $path;
}

# Make a path absolute and normalise it textually: expand a leading "~",
# prefix the current directory when the path does not start with "/",
# squeeze runs of "/" and resolve "/./" and "/../" components.  Only the
# string is rewritten; nothing is looked up in the filesystem apart from the
# current directory.  Returns undef for undef.
sub Get_Abs_Path( $ ) {
    my ($path) = @_;
    defined($path) or return $path;
    $path = Expand_Path($path);
    $path =~ /^\//o or $path = getcwd() . "/$path";
    $path =~ s(/{2,})(/)g;
    # get rid of "/./"
    while ($path =~ /^(.*?)\/\.(?:|\/(.*))$/o) {
        # defined(), not truth: a remainder of "0" is a real path component
        $path = "$1/" . (defined($2) ? $2 : "");
    }
    # get rid of "/../"
    while ($path =~ /^(((?:.*?\/)*?)[^\/]+){0,1}?\/\.\.(?:|\/(.*))$/o) {
        $path = ($1 ? $2 : "/") . (defined($3) ? $3 : "");
    }
    return $path;
}

{
    my @Chars = ("A" .. "Z", "a" .. "z", 0 .. 9);
    srand();

    # Return a random key of $len characters drawn from A-Z, a-z and 0-9.
    # $len defaults to 8, also when it is not an integer between 2 and 1024.
    # The requested length used to be validated and then ignored: every key
    # came out 8 characters long.
    sub Get_Random_Key( ; $ ) {
        my ($len) = @_;
        if (!defined($len) || ($len !~ /^\d+$/o) || ($len < 2) || ($len > 1024)) {
            $len = 8;
        }
        return join("", @Chars[map { rand @Chars } (1 .. $len)]);
    }
}

# Decode %XX escapes (two hex digits) into the characters they stand for.
# False values (undef, "", "0") are returned unchanged.
sub Hex2Ascii( $ ) {
    my ($str) = @_;
    if ($str) {
        $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
    }
    return $str;
}

{
    # Characters that Ascii2Hex escapes, with their %XX form.
    my $a2h = {
        "\t" => "%09",    # TAB is 0x09; this entry used to read "%29", i.e. ")"
        "+"  => "%2B",
        ","  => "%2C",
        "."  => "%2E",
        ";"  => "%3B",
        "/"  => "%2F",
        "?"  => "%3F",
        ":"  => "%3A",
        "@"  => "%40",
        "="  => "%3D",
        "&"  => "%26",
        " "  => "%20",
        "<"  => "%3C",
        ">"  => "%3E",
        "\"" => "%22",
        "%"  => "%25",
        "#"  => "%23",
        "["  => "%5B",
        "]"  => "%5D",
        "{"  => "%7B",
        "}"  => "%7D",
        "|"  => "%7C",
        "\\" => "%5C",
        "^"  => "%5E",
        "~"  => "%7E",
        "`"  => "%60",
    };

    # Replace every character listed in the table above by its %XX escape;
    # all other characters are copied through.  False values are returned
    # unchanged.
    sub Ascii2Hex( $ ) {
        my ($str) = @_;
        if (!$str) { return $str; }
        return join("",
            map { exists($a2h->{$_}) ? $a2h->{$_} : $_ } split(//, $str));
    }
}

# Read one record from a configuration file through the Registry class.
# NOTE(review): Registry is not loaded anywhere in the part of this file
# visible here -- presumably the caller loads it; confirm.
# Returns (dir, users, log, max_down, grace_period), or an empty list with
# $Error set.  "dir" and "users" are required; all three path fields must be
# absolute after ~ expansion, except that a missing "log" becomes "".
# "max_down" and "grace_period" become 0 unless they are all digits.
sub Get_Config_Record( $ $ ) {
    my ($conf_file, $rec) = @_;
    my ($db, $value);
    my @result = ();

    if (!($db = Registry->New($conf_file, "r", 1))) {
        $Error = "$Registry::Error";
        return ();
    }
    if (!$db->Record_Exists($rec)) {
        $Error = qq("$rec" record not found);
        return ();
    }
    foreach my $field (qw(dir users log)) {
        if (!($value = Expand_Path($db->Get_Val($rec, $field)))) {
            if ($field eq "log") {
                $value = "";
            }
            else {
                $Error = qq("$field" field of "$rec" record is missing);
                return ();
            }
        }
        elsif ($value !~ /^\//o) {
            $Error = qq("$field" field of "$rec" record should be absolute path);
            return ();
        }
        push(@result, $value);
    }
    foreach my $field (qw(max_down grace_period)) {
        if (!($value = $db->Get_Val($rec, $field)) || ($value !~ /^\d+$/o)) {
            $value = 0;
        }
        push(@result, $value);
    }
    return @result;
}

# Round to the nearest integer, halves away from zero.
# int() truncates toward zero, so the former int($num + 0.5) was wrong for
# negative input (Round(-1.2) gave 0); results for $num >= 0 are unchanged.
sub Round( $ ) {
    my ($num) = @_;
    return ($num < 0) ? -int(-$num + 0.5) : int($num + 0.5);
}

# Write $msg to the log file registered under number $log_num with Set_Log.
# Does nothing when no file is registered for that number.
sub Log( $ $ ) {
    my ($log_num, $msg) = @_;
    (defined($log_num) && ($log_num >= 0) && $LOG_FILE[$log_num])
        and Write_Log($LOG_FILE[$log_num], $msg);
}

# Register $file as log number $log_num, for later use by Log.
sub Set_Log( $ $ ) {
    my ($log_num, $file) = @_;
    (defined($log_num) && ($log_num >= 0) && $file)
        and $LOG_FILE[$log_num] = $file;
}

# Numeric minimum of two values.
sub Min( $ $ ) {
    my ($i, $j) = @_;
    return ($i < $j) ? $i : $j;
}

sub Max( $ $ ) { my ($i, $j) = @_; return ($i > $j) ?
$i : $j; } sub Reg_Diff( $ $ ; $ $ $ $ $ ) { my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_; my (@new_regs, $start, $end, $new_reg); $regs1 && $regs2 or return $regs1; $s1 ||= 0; defined($e1) or $e1 = 1; $s2 ||= 0; defined($e2) or $e2 = 1; for (my $i = 0; $i < @$regs1; ++$i) { $start = $$regs1[$i][$s1]; $end = $$regs1[$i][$e1]; for (my $j = 0; $j < @$regs2; ++$j) { $$regs2[$j][$s2] > $end and last; $$regs2[$j][$e2] < $start and next; if (($$regs2[$j][$s2] <= $start) && ($$regs2[$j][$e2] >= $end)) { undef($start), last; } if (($$regs2[$j][$s2] > $start) && ($$regs2[$j][$e2] >= $end)) { $end = $$regs2[$j][$s2] - 1, last; } if (($$regs2[$j][$s2] <= $start) && ($$regs2[$j][$e2] < $end)) { $start = $$regs2[$j][$e2] + 1, next; } ($start < ($$regs2[$j][$s2] - 1)) || !$strict and $new_reg = [@{$$regs1[$i]}], $$new_reg[$s1] = $start, $$new_reg[$e1] = $$regs2[$j][$s2] - 1, push(@new_regs, $new_reg); $start = $$regs2[$j][$e2] + 1; } !defined($start) || ($start > $end) and next; ($start < $end) || !$strict and $new_reg = [@{$$regs1[$i]}], $$new_reg[$s1] = $start, $$new_reg[$e1] = $end, push(@new_regs, $new_reg); } return \@new_regs; } sub Reg_Rem_Overlap( $ ; $ $ $ ) { my ($regs, $strict, $s, $e) = @_; my (@new_regs); $regs or return $regs; $s ||= 0; defined($e) or $e = 1; for (my $i = 0; $i < @$regs; ++$i) { push(@new_regs, [@{$$regs[$i]}]); } for (my $i = 0; $i < @new_regs; ++$i) { if (($i < $#new_regs) && ($new_regs[$i + 1][$s] <= $new_regs[$i][$e])) { $new_regs[$i + 1][$e] <= $new_regs[$i][$e] and splice(@new_regs, $i + 1, 1), --$i, next; $new_regs[$i + 1][$s] = $new_regs[$i][$e] + 1; } ($new_regs[$i][$s] < $new_regs[$i][$e]) || !$strict and next; splice(@new_regs, $i, 1); --$i; } return \@new_regs; } sub Reg_Sort( $ ; $ $ $ ) { my ($regs, $rev, $s, $e) = @_; my (@new_regs); $regs or return $regs; $s ||= 0; defined($e) or $e = 1; if ($rev) { @new_regs = sort { ($$b[$s] <=> $$a[$s]) || ($$b[$e] <=> $$a[$e]) } @$regs; } else { @new_regs = sort { ($$a[$s] <=> $$b[$s]) 
|| ($$a[$e] <=> $$b[$e]) } @$regs; } return \@new_regs; } sub Reg_Intersect( $ $ ; $ $ $ $ $ ) { my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_; $regs1 && $regs2 or return undef; $s1 ||= 0; defined($e1) or $e1 = 1; $s2 ||= 0; defined($e2) or $e2 = 1; return Reg_Diff($regs1, Reg_Diff($regs1, $regs2, $strict, $s1, $e1, $s2, $e2), $strict, $s1, $e1, $s1, $e1); } sub Reg_Merge( $ ; $ $ $ ) { my ($regs, $strict, $s, $e) = @_; my (@new_regs); $regs or return $regs; $s ||= 0; defined($e) or $e = 1; for (my $i = 0; $i < @$regs; ++$i) { push(@new_regs, [@{$$regs[$i]}]); } for (my $i = 0; $i < @new_regs; ++$i) { ($i < $#new_regs) && ($new_regs[$i + 1][$s] == ($new_regs[$i][$e] + 1)) and $new_regs[$i][$e] = $new_regs[$i + 1][$e], splice(@new_regs, $i + 1, 1), --$i, next; } for (my $i = 0; $i < @new_regs; ++$i) { ($new_regs[$i][$s] < $new_regs[$i][$e]) || !$strict and next; splice(@new_regs, $i, 1); --$i; } return \@new_regs; } sub safe_glob { my ($regexp, $dir) = @_; my (@files); local (*DIR); $dir ||= "."; $regexp ||= ".*"; opendir(DIR, $dir) or return; @files = grep { /$regexp/ } readdir(DIR); closedir(DIR); return wantarray() ? 
@files : scalar(@files); }

# redirect_err2log($facility)
#
# Stores $facility as the module-wide syslog facility and re-routes STDERR
# through stderr2log().
sub redirect_err2log {
    my ($facility) = @_;

    $Facility = $facility;
    stderr2log();
}

# stderr2log()
#
# Re-opens STDERR as a pipe to the "logger" command so that everything written
# to STDERR is logged with priority "$Facility.err" and a "name[pid]" tag, and
# turns on autoflush for STDERR.
# NOTE(review): the logger command line goes through the shell with $0
# interpolated inside single quotes; a program name containing a single quote
# would break the command.
sub stderr2log {
    my ($oldfh);

    open(STDERR, '>', '/dev/null');
    open(STDERR, "| logger -p $Facility.err -t '$0\[$$\]'");
    $oldfh = select(STDERR); $| = 1; select($oldfh);
}

# openlogs($facility)
#
# Optionally overrides the syslog facility, redirects STDERR to logger, opens
# a syslog connection over the "unix" socket and switches wr_log() to syslog
# by setting $Syslog.
sub openlogs {
    my ($facility) = @_;

    $facility and $Facility = $facility;
    stderr2log();
    setlogsock("unix");
    openlog($0, "pid", $Facility);
    $Syslog = 1;
}

# daemon($facility)
#
# Detaches the process: the parent exits with status 0, the child starts a
# new session, closes STDIN, points STDOUT at /dev/null and calls openlogs().
# Dies after reporting the error if fork() fails.
sub daemon {
    my ($facility) = @_;
    my ($pid);

    if ($pid = fork()) {
        exit(0);
    }
    elsif (!defined($pid)) {
        wr_err("$E_FORK: $!");
        die;
    }
    else {
        setsid();
        close(STDIN);
        close(STDOUT);
        open(STDOUT, '>', '/dev/null');
        openlogs($facility);
    }
}

# start_watcher(\&watcher, $facility, @params)
#
# Forks a child that detaches like daemon(), appends "_watcher" to $0 and
# calls &$watcher with the parent's pid followed by @params.  The parent
# returns immediately.  Dies after reporting the error if fork() fails.
sub start_watcher {
    my ($watcher, $facility, @params) = @_;
    my ($pid, $parent);

    $parent = $$;
    if ($pid = fork()) {
        return;
    }
    elsif (!defined($pid)) {
        wr_err("$E_FORK: $!");
        die;
    }
    else {
        setsid();
        close(STDIN);
        close(STDOUT);
        open(STDOUT, '>', '/dev/null');
        $0 .= "_watcher";
        openlogs($facility);
        &$watcher($parent, @params);
    }
}

# wr_log($msg)
#
# Writes $msg, prefixed with the result of the $Msg_Prefix callback when one
# is set, to syslog at level "info" if $Syslog is true, otherwise prints it
# followed by a newline.
sub wr_log {
    my $msg = shift;

    chomp($msg);
    $msg = ( $Msg_Prefix ? &$Msg_Prefix : "") . $msg;
    if ($Syslog) {
        syslog("info", "%s", $msg);
    }
    else {
        print "$msg\n";
    }
}

# wr_err($msg)
#
# Prints $msg (with the optional $Msg_Prefix prefix) and a newline to STDERR.
# Always returns 1.
sub wr_err {
    my $msg = shift;

    chomp($msg);
    print STDERR (( $Msg_Prefix ? &$Msg_Prefix : ""), "$msg\n");
    return 1;
}

# confirm($msg)
#
# Prints the prompt $msg, reads one line from STDIN and evaluates to 1 when
# the reply is "y" or "yes" (case-insensitive), 0 otherwise.  End of input
# counts as "no".
sub confirm {
    my ($msg) = @_;
    my ($ans);

    print $msg;
    $ans = <STDIN>;
    defined($ans) or return 0;
    chomp($ans);

    return ($ans =~ /^(y|yes)$/io) ?
1 : 0; }

# Remove every lock file still registered in %Locks when the process exits.
END {
    foreach my $lock (keys(%Locks)) { unlink($lock); }
}

1;
lagan20/Utils.pm0000644000076500007650000003072510502337064014565 0ustar brudnobrudno00000000000000
#!/usr/bin/env perl

package Utils;

require 5.000;

use strict;
use Exporter;
use Cwd;
use IO::File;
use POSIX qw(setsid);
use Sys::Syslog qw(:DEFAULT setlogsock);

# Forward declarations so the prototypes are known before the first call.
sub Trim( @ );
sub Lock_File( $ ; $ $ $ );
sub Unlock_File( $ );
sub Write_Log( $ $ ; $ $ );
sub Parse_Filename( $ );
sub Get_Abs_Path( $ );
sub Expand_Path( $ );
sub Get_Random_Key( ; $ );
sub Hex2Ascii( $ );
sub Ascii2Hex( $ );
sub Get_Config_Record( $ $ );
sub Round( $ );
sub Set_Log( $ $ );
sub Log( $ $ );
sub Min( $ $ );
sub Max( $ $ );
sub Reg_Diff( $ $ ; $ $ $ $ $ );
sub Reg_Rem_Overlap( $ ; $ $ $ );
sub Reg_Sort( $ ; $ $ $ );
sub Reg_Intersect( $ $ ; $ $ $ $ $ );
sub Reg_Merge( $ ; $ $ $ );

use vars qw(@ISA @EXPORT $VERSION $JOB $Error $Syslog $Facility $Msg_Prefix);

@ISA = qw(Exporter);
@EXPORT = qw(Trim Lock_File Unlock_File Write_Log Parse_Filename
             Get_Abs_Path Expand_Path Hex2Ascii Ascii2Hex Get_Config_Record
             Get_Random_Key Round Set_Log Log Min Max
             Reg_Diff Reg_Rem_Overlap Reg_Sort Reg_Intersect Reg_Merge
             redirect_err2log openlogs safe_glob daemon wr_log wr_err
             start_watcher confirm $JOB);

# $VERSION is extracted from the RCS keyword string.
my $Id = '$Id: Utils.pm,v 1.21 2005/01/07 23:08:59 poliakov Exp $';
($VERSION) = ($Id =~ /,v\s+(\d+\S+)/o);

# Exported pattern string; its meaning is defined by the callers that use it.
$JOB = '^(\S+)\@(\S+?)_(\d{4})(?:_(.+)|)$';

$Error = 0;              # last error message set by the functions below
$Syslog = 0;             # true once openlogs() has switched wr_log to syslog
$Facility = "user";      # syslog facility used by stderr2log()/openlogs()
$Msg_Prefix = undef;     # optional code ref producing a message prefix

my $E_FORK = "cannot fork";
my @LOG_FILE = ();       # log targets registered with Set_Log()
my %Locks = ();          # lock files currently held by this process

# Trim(@strings)
#
# Strips leading and trailing whitespace from every argument IN PLACE (the
# elements of @_ alias the caller's variables).
sub Trim( @ ) {
    for (my $i = 0; $i <= $#_; ++$i) {
        $_[$i] =~ s/^\s+//;
        $_[$i] =~ s/\s+$//
    }
}

# Lock_File($file [, $retry, $timeout, $max_mtime])
#
# Creates the lock file "$file.lock" (absolute path) exclusively.
#   $retry     - keep trying while the lock is held elsewhere (default 1)
#   $timeout   - seconds to keep retrying; not positive means no limit
#                (default 1200)
#   $max_mtime - age in seconds after which an existing lock file is treated
#                as stale and removed (default: half of a positive $timeout)
# Returns 1 when the lock is held by this process (also when it already was,
# in which case $Error is set to "Already locked"), 0 on failure with the
# reason in $Error.
sub Lock_File( $ ; $ $ $ ) {
    my ($file, $retry, $timeout, $max_mtime) = @_;
    my ($lock_fh, $start_time, $mtime);

    if (!$file || ($file =~ /\/$/o)) {
        $Error = "Invalid filename"; return 0;
    }
    $file = Get_Abs_Path("$file.lock");
    if (exists($Locks{$file})) {
        $Error = "Already locked"; return 1;
    }
    # The directory that will hold the lock file must be writable.
    if (!-w (Parse_Filename($file))[0]) {
        $Error = "Permission denied"; return 0;
    }
    if
(!defined($retry)) { $retry = 1; }
    if (!defined($timeout)) { $timeout = 1200; }
    if (!defined($max_mtime)) {
        $max_mtime = ($timeout > 0) ? int($timeout / 2) : 0;
    }
    $start_time = time();

    LOCK: {
        if (!($lock_fh = IO::File->new($file, O_RDWR|O_CREAT|O_EXCL))) {
            if (!$retry || (($timeout > 0) && ((time() - $start_time) > $timeout))) {
                $Error = "Locked by someone else"; return 0;
            }
            # Break a lock file that has not been touched for $max_mtime
            # seconds.
            if ($max_mtime > 0) {
                $mtime = (stat($file))[9];
                if ($mtime && ((time() - $mtime) > $max_mtime)) { unlink($file); }
            }
            # Pause briefly between attempts instead of spinning on the
            # file system at full CPU speed.
            select(undef, undef, undef, 0.1);
            redo LOCK;
        }
    }
    $lock_fh->close();
    $Locks{$file} = 1;

    return 1;
}

# Unlock_File($file)
#
# Removes the lock file "$file.lock" created by Lock_File() in this process.
# Returns 1 on success, 0 on failure with the reason in $Error.
sub Unlock_File( $ ) {
    my ($file) = @_;

    if (!$file) {
        $Error = "Invalid filename"; return 0;
    }
    $file = Get_Abs_Path("$file.lock");
    if (!exists($Locks{$file})) {
        $Error = "Not locked"; return 0;
    }
    if (!unlink($file)) {
        $Error = "Cannot unlock"; return 0;
    }
    delete($Locks{$file});

    return 1;
}

{
# Short host name (text before the first dot of "uname -n"), determined once
# when the module is loaded; "localhost" if no uname binary is found.
my $Uname;
foreach my $dir ('/bin', '/sbin', '/usr/bin', '/usr/sbin') {
    -x "$dir/uname" and $Uname = "$dir/uname", last;
}
my $Host = $Uname ? `$Uname -n` : 'localhost';
chomp($Host);
($Host) = ($Host =~ /^([^\.]+)(\..*)?$/);

# Write_Log($log_file, $msg [, $name, $pid])
#
# Appends the line "date host name[pid]: msg" to a log.  $log_file is either
# the name of an already open file handle (detected through the IO slot of
# the glob of that name) or a file path; a path is locked with Lock_File()
# and opened for appending, and "/dev/null" is accepted without writing.
# $name defaults to $0 and $pid to $$.  Returns true on success; on failure
# returns false, with $Error set when $! holds an error.
sub Write_Log( $ $ ; $ $ ) {
    no strict "refs";
    my ($log_file, $msg, $name, $pid) = @_;
    my $error = 0;
    my $date;
    local *LOG;

    if (!defined($log_file) || !defined($msg)) { return 0; }

    if (*{$log_file}{IO}) {
        *LOG = *{$log_file}{IO};
    }
    elsif ($log_file eq '/dev/null') {
        return 1;
    }
    else {
        if (!Lock_File($log_file)) { return 0; }
        # Three-argument open: the path is never parsed for mode characters.
        if (!open(LOG, '>>', $log_file)) { $error = 1; }
    }
    if (!$error) {
        chomp($msg);
        $date = localtime(time());
        if (!$name) { $name = $0; }
        if (!$pid) { $pid = $$; }
        if (!print LOG "$date $Host $name\[$pid\]: $msg\n") { $error = 1; }
        # Only close handles opened here, never a caller-supplied one.
        if (!*{$log_file}{IO}) { close(LOG); }
    }
    if ($error && $!)
{ $Error = "$!"; }
    if (!*{$log_file}{IO}) { Unlock_File($log_file); }

    return !$error;
}}

# Parse_Filename($name)
#
# Splits a path at its last "/".  Returns ($dir, $file) where $dir keeps the
# trailing slash (and is "" when $name has no slash); returns an empty list
# for an undefined $name.
sub Parse_Filename( $ ) {
    my ($name) = @_;
    my ($last_slash_pos, $dir, $file);

    if (!defined($name)) { return (); }

    $last_slash_pos = rindex($name, "/");
    if ($last_slash_pos >= 0) {
        $dir = substr($name, 0, $last_slash_pos + 1);
        $file = substr($name, $last_slash_pos + 1);
    }
    else {
        $dir = "";
        $file = $name;
    }

    return ($dir, $file);
}

# Expand_Path($path)
#
# Expands a leading "~" or "~user" to a home directory: the passwd entry of
# "user", or $HOME, $LOGDIR or the passwd entry of the effective uid for a
# bare "~".  Paths that do not start with "~", and paths whose home directory
# cannot be determined, are returned unchanged.
sub Expand_Path( $ ) {
    my ($path) = @_;
    my $home_dir;

    $path && ($path =~ /^~/o) or return $path;

    $path =~ /^~([^\/]*)(.*)$/o;
    $home_dir = $1 ? (getpwnam($1))[7] :
                     ($ENV{"HOME"} || $ENV{"LOGDIR"} || (getpwuid($>))[7]);
    defined($home_dir) and $path = "$home_dir$2";

    return $path;
}

# Get_Abs_Path($path)
#
# Returns $path as an absolute path: "~" is expanded, a relative path is
# prefixed with the current directory, repeated slashes are collapsed and
# "/./" and "/../" components are resolved textually.  An undefined $path is
# returned as is.
sub Get_Abs_Path( $ ) {
    my ($path) = @_;

    defined($path) or return $path;

    $path = Expand_Path($path);
    $path =~ /^\//o or $path = getcwd() . "/$path";
    $path =~ s(/{2,})(/)g;

    # get rid of "/./"
    # (captures are tested with defined() so a component named "0" survives)
    while ($path =~ /^(.*?)\/\.(?:|\/(.*))$/o) {
        $path = "$1/" . (defined($2) ? $2 : "");
    }

    # get rid of "/../"
    while ($path =~ /^(((?:.*?\/)*?)[^\/]+){0,1}?\/\.\.(?:|\/(.*))$/o) {
        $path = (defined($1) ? $2 : "/") . (defined($3) ? $3 : "");
    }

    return $path;
}

{
my @Chars = ("A" .. "Z", "a" .. "z", 0 .. 9);

srand();

# Get_Random_Key([$len])
#
# Returns a string of $len random characters drawn from A-Z, a-z and 0-9.
# $len must be an integer between 2 and 1024; anything else selects the
# default length of 8.
sub Get_Random_Key( ; $ ) {
    my ($len) = @_;

    if (!defined($len) || ($len !~ /^\d+$/o) || ($len < 2) || ($len > 1024)) {
        $len = 8;
    }

    # Honour the validated length (it was previously fixed at 8).
    return join("", @Chars[map {rand @Chars } (1 .. $len)]);
}}

# Hex2Ascii($str)
#
# Decodes every "%XX" hexadecimal escape in $str into the corresponding
# character.  A false $str is returned unchanged.
sub Hex2Ascii( $ ) {
    my ($str) = @_;

    if ($str) {
        $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
    }

    return $str;
}

{
# Characters escaped by Ascii2Hex() and their "%XX" codes.  A tab is 0x09
# (it was previously mapped to "%29", which is the code of ")").
my $a2h = { "\t" => "%09", "+" => "%2B", "," => "%2C", "." => "%2E",
            ";" => "%3B", "/" => "%2F",
            "?"
=> "%3F", ":" => "%3A", "@" => "%40", "=" => "%3D",
            "&" => "%26", " " => "%20", "<" => "%3C", ">" => "%3E",
            "\"" => "%22", "%" => "%25", "#" => "%23", "[" => "%5B",
            "]" => "%5D", "{" => "%7B", "}" => "%7D", "|" => "%7C",
            "\\" => "%5C", "^" => "%5E", "~" => "%7E", "`" => "%60"};

# Ascii2Hex($str)
#
# Replaces every character of $str that has an entry in the escape table with
# its "%XX" code and returns the result.  A false $str is returned unchanged.
sub Ascii2Hex( $ ) {
    my ($str) = @_;
    my $new_str = "";

    if (!$str) { return $str; }

    foreach my $char (split(//, $str)) {
        if (exists($a2h->{$char})) { $char = $a2h->{$char}; }
        $new_str .= $char;
    }

    return $new_str;
}}

# Get_Config_Record($conf_file, $rec)
#
# Reads record $rec from the registry file $conf_file and returns the list
# (dir, users, log, max_down, grace_period).  "dir" and "users" are required
# and, like a non-empty "log", must be absolute paths after "~" expansion;
# a missing "log" becomes "".  "max_down" and "grace_period" become 0 unless
# they are non-zero unsigned integers.  Returns an empty list with the reason
# in $Error on failure.
# NOTE(review): relies on a Registry class that this file does not load --
# presumably the caller loads it; confirm.
sub Get_Config_Record( $ $ ) {
    my ($conf_file, $rec) = @_;
    my ($db, $value);
    my @result = ();

    if (!($db = Registry->New($conf_file, "r", 1))) {
        $Error = "$Registry::Error", return ();
    }
    if (!$db->Record_Exists($rec)) {
        $Error = qq("$rec" record not found); return ();
    }
    foreach my $field (qw(dir users log)) {
        if (!($value = Expand_Path($db->Get_Val($rec, $field)))) {
            if ($field eq "log") {
                $value = "";
            }
            else {
                $Error = qq("$field" field of "$rec" record is missing), return ();
            }
        }
        elsif ($value !~ /^\//o) {
            $Error = qq("$field" field of "$rec" record should be absolute path);
            return ();
        }
        push(@result, $value);
    }
    foreach my $field (qw(max_down grace_period)) {
        if (!($value = $db->Get_Val($rec, $field)) || ($value !~ /^\d+$/o)) {
            $value = 0;
        }
        push(@result, $value);
    }

    return @result;
}

# Round($num)
#
# Rounds $num to the nearest integer, halves away from zero.  Negative
# numbers are handled separately because int() truncates toward zero, so
# int($num + 0.5) alone would turn -1.6 into -1.
sub Round( $ ) {
    my ($num) = @_;

    return ($num < 0) ? -int(0.5 - $num) : int($num + 0.5);
}

# Log($log_num, $msg)
#
# Writes $msg with Write_Log() to the log registered under index $log_num;
# does nothing if no log is registered there.
sub Log( $ $ ) {
    my ($log_num, $msg) = @_;

    (defined($log_num) && ($log_num >= 0) && $LOG_FILE[$log_num]) and
        Write_Log($LOG_FILE[$log_num], $msg);
}

# Set_Log($log_num, $file)
#
# Registers $file as the log target for the non-negative index $log_num.
sub Set_Log( $ $ ) {
    my ($log_num, $file) = @_;

    (defined($log_num) && ($log_num >= 0) && $file) and
        $LOG_FILE[$log_num] = $file;
}

# Min($i, $j): the numerically smaller of the two arguments.
sub Min( $ $ ) {
    my ($i, $j) = @_;

    return ($i < $j) ? $i : $j;
}

# Max($i, $j): the numerically larger of the two arguments.
sub Max( $ $ ) {
    my ($i, $j) = @_;

    return ($i > $j) ?
$i : $j; }

# Reg_Diff(\@regs1, \@regs2 [, $strict, $s1, $e1, $s2, $e2])
#
# Returns a reference to a new list holding the parts of the regions in
# @$regs1 that are not covered by a region in @$regs2.  Every region is an
# array ref; $s1/$e1 and $s2/$e2 are the column indexes of the start and end
# coordinates inside the rows of @$regs1 and @$regs2 (start index defaults to
# 0, end index defaults to 1).  Each output row is a shallow copy of the
# @$regs1 row it came from with only its start/end columns rewritten.
# With a true $strict, pieces whose start is not less than their end are
# dropped.  If either list argument is false, $regs1 itself is returned.
# NOTE(review): the early "last" in the inner loop presumes @$regs2 is ordered
# by start coordinate -- confirm with callers.
sub Reg_Diff( $ $ ; $ $ $ $ $ ) {
    my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_;
    my (@new_regs, $start, $end, $new_reg);

    $regs1 && $regs2 or return $regs1;
    $s1 ||= 0; defined($e1) or $e1 = 1;
    $s2 ||= 0; defined($e2) or $e2 = 1;

    for (my $i = 0; $i < @$regs1; ++$i) {
        # $start/$end track the still-uncovered remainder of region $i.
        $start = $$regs1[$i][$s1]; $end = $$regs1[$i][$e1];
        for (my $j = 0; $j < @$regs2; ++$j) {
            # Region $j begins after the remainder: stop scanning.
            $$regs2[$j][$s2] > $end and last;
            # Region $j ends before the remainder: irrelevant, try the next.
            $$regs2[$j][$e2] < $start and next;
            # Region $j covers the whole remainder: nothing is left.
            if (($$regs2[$j][$s2] <= $start) && ($$regs2[$j][$e2] >= $end)) {
                undef($start), last;
            }
            # Region $j covers the right-hand side: trim the end and stop.
            if (($$regs2[$j][$s2] > $start) && ($$regs2[$j][$e2] >= $end)) {
                $end = $$regs2[$j][$s2] - 1, last;
            }
            # Region $j covers the left-hand side: advance the start.
            if (($$regs2[$j][$s2] <= $start) && ($$regs2[$j][$e2] < $end)) {
                $start = $$regs2[$j][$e2] + 1, next;
            }
            # Region $j lies strictly inside the remainder: emit the piece in
            # front of it, then continue with what follows it.
            ($start < ($$regs2[$j][$s2] - 1)) || !$strict and
                $new_reg = [@{$$regs1[$i]}],
                $$new_reg[$s1] = $start,
                $$new_reg[$e1] = $$regs2[$j][$s2] - 1,
                push(@new_regs, $new_reg);
            $start = $$regs2[$j][$e2] + 1;
        }
        # Fully covered (or emptied) region: emit nothing.
        !defined($start) || ($start > $end) and next;
        ($start < $end) || !$strict and
            $new_reg = [@{$$regs1[$i]}],
            $$new_reg[$s1] = $start,
            $$new_reg[$e1] = $end,
            push(@new_regs, $new_reg);
    }

    return \@new_regs;
}

# Reg_Rem_Overlap(\@regs [, $strict, $s, $e])
#
# Returns a reference to a copy of @$regs (rows are shallow-copied, the input
# is not modified) in which a region that begins at or before the end of the
# region in front of it is either removed (when it also ends at or before that
# end) or has its start moved to one past that end.  $s/$e are the start/end
# column indexes (defaults 0 and 1).  With a true $strict, regions whose start
# is not less than their end are removed as well.  A false $regs is returned
# unchanged.
# NOTE(review): only neighbouring rows are compared, so the input is
# presumably expected to be sorted already -- confirm with callers.
sub Reg_Rem_Overlap( $ ; $ $ $ ) {
    my ($regs, $strict, $s, $e) = @_;
    my (@new_regs);

    $regs or return $regs;
    $s ||= 0; defined($e) or $e = 1;

    for (my $i = 0; $i < @$regs; ++$i) {
        push(@new_regs, [@{$$regs[$i]}]);
    }
    for (my $i = 0; $i < @new_regs; ++$i) {
        if (($i < $#new_regs) && ($new_regs[$i + 1][$s] <= $new_regs[$i][$e])) {
            # Next region is contained in this one: delete it and look at the
            # same index again (--$i cancels the loop's ++$i).
            $new_regs[$i + 1][$e] <= $new_regs[$i][$e] and
                splice(@new_regs, $i + 1, 1), --$i, next;
            $new_regs[$i + 1][$s] = $new_regs[$i][$e] + 1;
        }
        ($new_regs[$i][$s] < $new_regs[$i][$e]) || !$strict and next;
        # Strict mode: drop a region whose start is not below its end.
        splice(@new_regs, $i, 1); --$i;
    }

    return \@new_regs;
}

# Reg_Sort(\@regs [, $rev, $s, $e])
#
# Returns a reference to a new array with the rows of @$regs ordered
# numerically by start column, ties broken by end column; descending when
# $rev is true.  $s/$e are the start/end column indexes (defaults 0 and 1).
# A false $regs is returned unchanged.
sub Reg_Sort( $ ; $ $ $ ) {
    my ($regs, $rev, $s, $e) = @_;
    my (@new_regs);

    $regs or return $regs;
    $s ||= 0; defined($e) or $e = 1;

    if ($rev) {
        @new_regs = sort { ($$b[$s] <=> $$a[$s]) || ($$b[$e] <=> $$a[$e]) } @$regs;
    }
    else {
        @new_regs = sort { ($$a[$s] <=> $$b[$s])
|| ($$a[$e] <=> $$b[$e]) } @$regs;
    }

    return \@new_regs;
}

# Reg_Intersect(\@regs1, \@regs2 [, $strict, $s1, $e1, $s2, $e2])
#
# Computes the parts of @$regs1 that are covered by @$regs2 as
# regs1 - (regs1 - regs2), using two Reg_Diff calls.  The index arguments have
# the same meaning and defaults as in Reg_Diff; the outer call uses $s1/$e1
# for both lists because the inner result has the column layout of @$regs1.
# Returns undef if either list argument is false.
sub Reg_Intersect( $ $ ; $ $ $ $ $ ) {
    my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_;

    $regs1 && $regs2 or return undef;
    $s1 ||= 0; defined($e1) or $e1 = 1;
    $s2 ||= 0; defined($e2) or $e2 = 1;

    return Reg_Diff($regs1,
        Reg_Diff($regs1, $regs2, $strict, $s1, $e1, $s2, $e2),
        $strict, $s1, $e1, $s1, $e1);
}

# Reg_Merge(\@regs [, $strict, $s, $e])
#
# Returns a reference to a copy of @$regs (rows are shallow-copied) in which
# a region that starts exactly one past the end of the region in front of it
# is folded into that region.  $s/$e are the start/end column indexes
# (defaults 0 and 1).  With a true $strict, regions whose start is not less
# than their end are then removed.  A false $regs is returned unchanged.
sub Reg_Merge( $ ; $ $ $ ) {
    my ($regs, $strict, $s, $e) = @_;
    my (@new_regs);

    $regs or return $regs;
    $s ||= 0; defined($e) or $e = 1;

    for (my $i = 0; $i < @$regs; ++$i) {
        push(@new_regs, [@{$$regs[$i]}]);
    }
    for (my $i = 0; $i < @new_regs; ++$i) {
        # Adjacent neighbour: extend this region, delete the neighbour and
        # re-check the same index so that chains of regions collapse.
        ($i < $#new_regs) && ($new_regs[$i + 1][$s] == ($new_regs[$i][$e] + 1)) and
            $new_regs[$i][$e] = $new_regs[$i + 1][$e],
            splice(@new_regs, $i + 1, 1),
            --$i, next;
    }
    for (my $i = 0; $i < @new_regs; ++$i) {
        ($new_regs[$i][$s] < $new_regs[$i][$e]) || !$strict and next;
        splice(@new_regs, $i, 1); --$i;
    }

    return \@new_regs;
}

# safe_glob($regexp, $dir)
#
# Lists the entries of directory $dir (default ".") whose names match the
# regular expression $regexp (default ".*").  Returns nothing when the
# directory cannot be opened.  The return expression is context dependent:
# the matching names in list context, their count otherwise.
sub safe_glob {
    my ($regexp, $dir) = @_;
    my (@files);
    local (*DIR);

    $dir ||= "."; $regexp ||= ".*";
    opendir(DIR, $dir) or return;
    @files = grep { /$regexp/ } readdir(DIR);
    closedir(DIR);

    return wantarray() ?
@files : scalar(@files); }

# redirect_err2log($facility)
#
# Stores $facility as the module-wide syslog facility and re-routes STDERR
# through stderr2log().
sub redirect_err2log {
    my ($facility) = @_;

    $Facility = $facility;
    stderr2log();
}

# stderr2log()
#
# Re-opens STDERR as a pipe to the "logger" command so that everything written
# to STDERR is logged with priority "$Facility.err" and a "name[pid]" tag, and
# turns on autoflush for STDERR.
# NOTE(review): the logger command line goes through the shell with $0
# interpolated inside single quotes; a program name containing a single quote
# would break the command.
sub stderr2log {
    my ($oldfh);

    open(STDERR, '>', '/dev/null');
    open(STDERR, "| logger -p $Facility.err -t '$0\[$$\]'");
    $oldfh = select(STDERR); $| = 1; select($oldfh);
}

# openlogs($facility)
#
# Optionally overrides the syslog facility, redirects STDERR to logger, opens
# a syslog connection over the "unix" socket and switches wr_log() to syslog
# by setting $Syslog.
sub openlogs {
    my ($facility) = @_;

    $facility and $Facility = $facility;
    stderr2log();
    setlogsock("unix");
    openlog($0, "pid", $Facility);
    $Syslog = 1;
}

# daemon($facility)
#
# Detaches the process: the parent exits with status 0, the child starts a
# new session, closes STDIN, points STDOUT at /dev/null and calls openlogs().
# Dies after reporting the error if fork() fails.
sub daemon {
    my ($facility) = @_;
    my ($pid);

    if ($pid = fork()) {
        exit(0);
    }
    elsif (!defined($pid)) {
        wr_err("$E_FORK: $!");
        die;
    }
    else {
        setsid();
        close(STDIN);
        close(STDOUT);
        open(STDOUT, '>', '/dev/null');
        openlogs($facility);
    }
}

# start_watcher(\&watcher, $facility, @params)
#
# Forks a child that detaches like daemon(), appends "_watcher" to $0 and
# calls &$watcher with the parent's pid followed by @params.  The parent
# returns immediately.  Dies after reporting the error if fork() fails.
sub start_watcher {
    my ($watcher, $facility, @params) = @_;
    my ($pid, $parent);

    $parent = $$;
    if ($pid = fork()) {
        return;
    }
    elsif (!defined($pid)) {
        wr_err("$E_FORK: $!");
        die;
    }
    else {
        setsid();
        close(STDIN);
        close(STDOUT);
        open(STDOUT, '>', '/dev/null');
        $0 .= "_watcher";
        openlogs($facility);
        &$watcher($parent, @params);
    }
}

# wr_log($msg)
#
# Writes $msg, prefixed with the result of the $Msg_Prefix callback when one
# is set, to syslog at level "info" if $Syslog is true, otherwise prints it
# followed by a newline.
sub wr_log {
    my $msg = shift;

    chomp($msg);
    $msg = ( $Msg_Prefix ? &$Msg_Prefix : "") . $msg;
    if ($Syslog) {
        syslog("info", "%s", $msg);
    }
    else {
        print "$msg\n";
    }
}

# wr_err($msg)
#
# Prints $msg (with the optional $Msg_Prefix prefix) and a newline to STDERR.
# Always returns 1.
sub wr_err {
    my $msg = shift;

    chomp($msg);
    print STDERR (( $Msg_Prefix ? &$Msg_Prefix : ""), "$msg\n");
    return 1;
}

# confirm($msg)
#
# Prints the prompt $msg, reads one line from STDIN and returns 1 when the
# reply is "y" or "yes" (case-insensitive), 0 otherwise.  End of input counts
# as "no".
sub confirm {
    my ($msg) = @_;
    my ($ans);

    print $msg;
    $ans = <STDIN>;
    defined($ans) or return 0;
    chomp($ans);

    return ($ans =~ /^(y|yes)$/io) ? 1 : 0;
}

# Remove every lock file still registered in %Locks when the process exits.
END {
    foreach my $lock (keys(%Locks)) { unlink($lock); }
}

1;
lagan20/xmfa2mfa.pl0000755000076500007650000000315210502337064015162 0ustar brudnobrudno00000000000000
#!/usr/bin/perl

# Filters the alignment blocks of an XMFA stream read from STDIN for one base
# genome (first argument: 1 or 2) and passes the result through the Glue
# utility found under $LAGAN_DIR/utils.

use strict;

# Reduce $0 to the bare program name for use in messages.
$0 = rindex($0, "/") > -1 ? substr($0, rindex($0, "/")+1) : $0;

my (@lines, @filt_lines);
my ($line, $line_in, $type);

# Block type that is kept in addition to "DM" blocks.
my $mode = ($ARGV[0] eq "1" ? "M1" :
           ($ARGV[0] eq "2" ? "M2" :
            die("$0: Invalid base genome argument (expected 1 or 2)")));
die("$0: LAGAN_DIR not defined.
Stopped") unless defined $ENV{"LAGAN_DIR"};

# Collect the input into blocks.  A block ends at a line that starts with "="
# and ends with a type tag (DM, M1 or M2); only "DM" blocks and blocks of the
# selected mode are kept.
while (<STDIN>) {
    $line_in = $_;
    if ($line_in =~ /^\=.*(DM|M1|M2)$/) {
        $type = $1;
        $line .= $line_in;
        push(@lines, $line) if ($type eq "DM" || $type eq $mode);
        undef $line;
        undef $type;
    }
    else {
        $line .= $line_in;
    }
}

# In M2 mode the two sequences of every block are swapped so that the second
# one comes first; sequences of "-" strand blocks are additionally reversed
# and re-wrapped at 80 columns.
foreach my $line (@lines) {
    if ($mode eq "M2") {
        $line =~ /(\>[^\s\n]+\s([\+\-])[^\n]+)\n(.+)\n(\>[^\s\n]+\s([\+\-])[^\n]+)\n(.+)\n(\=.+?)\n/s
            or die("$0: Malformed alignment block (expected two sequences and a footer)\n");
#       $line =~ /(\>[^\s\n]+\s([\+\-])[^\n]+)\n([^\n]+)\n(\>[^\s\n]+\s([\+\-])[^\n]+)\n([^\n]+)\n(\=.+?\n)/s;
        my ($head1, $strand1, $seq1, $head2, $strand2, $seq2, $foot) =
            ($1, $2, $3, $4, $5, $6, $7);
        die("$0: Strand mismatch between the two sequences of a block\n")
            if $strand1 ne $strand2;
        if ($strand1 eq "-") {
            $seq1 =~ s/\n//g;
            $seq2 =~ s/\n//g;
            $seq1 = reverse($seq1);
            $seq2 = reverse($seq2);
            $seq1 =~ s/(.{80})/$1\n/g;
            $seq2 =~ s/(.{80})/$1\n/g;
        }
        $line = $head2."\n".$seq2."\n".$head1."\n".$seq1."\n".$foot."\n";
    }
    push @filt_lines, $line;
}

# Hand the filtered blocks to Glue through a temporary file in the current
# directory.
open(my $out_fh, '>', "tmp.xmfa") or die("$0: Cannot write tmp.xmfa: $!\n");
foreach my $line (@filt_lines) {
    print {$out_fh} $line;
}
close($out_fh) or die("$0: Cannot write tmp.xmfa: $!\n");

system($ENV{"LAGAN_DIR"}."/utils/Glue tmp.xmfa > glue.out 2> glue.err");

# Read back what Glue wrote.  A missing capture file is reported but does not
# stop the clean-up below.
my (@glue_out, @glue_err);
if (open(my $in_fh, '<', "glue.out")) {
    @glue_out = <$in_fh>;
    close($in_fh);
}
else {
    warn("$0: Cannot read glue.out: $!\n");
}
if (open(my $in_fh, '<', "glue.err")) {
    @glue_err = <$in_fh>;
    close($in_fh);
}
else {
    warn("$0: Cannot read glue.err: $!\n");
}

unlink("tmp.xmfa");
unlink("glue.out");
unlink("glue.err");

# Forward Glue's output streams to our own.
print STDOUT @glue_out;
print STDERR @glue_err;