murasaki/0000755000177700001440000000000011434752243011757 5ustar krispusersmurasaki/repeats2anchors.pl0000755000177700001440000000535211434752242015426 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use Getopt::Long; use Pod::Usage; use File::Basename; use IO::Handle; use strict; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; our ($help,$man); our ($kogfile,@kogmap); GetOptions('help|?' 
=> \$help, man => \$man); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; foreach my $inf (@ARGV){ die "Input file not found: $inf" unless -f $inf; my ($prefix)=$inf=~m/^(.*)\.repeats/; die "Invalid file name for $inf" unless $prefix; my $seqs="$prefix.seqs"; die "Seqs file ($seqs) not found" unless -f $seqs; my $outf="$inf.anchors"; my $seqsDat=slurp($seqs); our $seqCount=scalar(split(/\n/,$seqsDat)); blit($seqsDat,"$inf.seqs"); #mmm cp open(my $ofh,">$outf"); open(my $fh,$inf); my ($length,@locs); while(<$fh>){ chomp; unless($_){ createAnchors($ofh,$length,@locs); #start fresh undef $length; @locs=(); next; } my ($seq,$dat)=m/^([^:]+): (.*)$/ or die "Weird line: $_"; if($seq eq 'R'){ $length=length($dat); }else{ $locs[$seq]=[split(/\s/,$dat)]; } } } sub slurp { local $/; open(my $fh,"@_") or return; return <$fh>; } sub blit { my ($dat,$file)=@_; open(my $fh,">$file"); print $fh $dat; } sub createAnchors { my ($ofh,$len,@locs)=@_; my @firsts=map {$locs[$_][0]} (0..$#locs); foreach my $focus (0..($#locs-1)){ #create anchors across each of these combinations foreach my $x (@{$locs[$focus]}){ foreach my $y (@{$locs[$focus+1]}){ createAnchor($ofh,$len,@firsts[0..($focus-1)],$x,$y,@firsts[($focus+2)..$#locs]); } } } } sub createAnchor { my ($ofh,$len,@locs)=@_; my @printLocs=(map {[$_,$_+$len,($_ > 0 ? '+':'-')]} @locs); print $ofh join("\t",(map {join("\t",@$_)} @printLocs))."\n"; } __END__ =head1 NAME repeats2anchors.pl -- generates a phony alignment (suitable for graphing) out of a .repeats file =head1 SYNOPSIS repeats2anchors.pl =head1 OPTIONS No options yet. =cut murasaki/repeatviz.pl0000755000177700001440000002507111434752241014333 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use strict; use Getopt::Long; use Pod::Usage; use File::Basename; use POSIX qw{floor}; #use Data::Dump qw{dump}; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; use Murasaki::Ticker qw{resetTick tick}; our $root; my $geneparse="$root/geneparse"; $geneparse="$root/geneparse.pl" unless -x $geneparse; die "Need geneparse to be available" unless -x $geneparse; my ($nodeTotal,$edgeTotal); my ($help,$man); my ($width,$height,$res)=(8,6,96); my ($echo,$colors,$connect,$maxDraw,$colorby,$colorOpts,$title,$pointCex,$plotOrder,$density,$placeLegend,$legendPts,$squeeze,$pch); $pointCex=.25; $legendPts=8; $pch=19; $plotOrder='big'; my $err= GetOptions('help|?' 
=> \$help, man => \$man, 'res=f'=>\$res,'width=f'=>\$width,'height=f'=>\$height, echo=>\$echo, 'colors|palette=s'=>\$colors, 'colorby=s'=>\$colorby, connect=>\$connect,'maxlines=i'=>\$maxDraw, 'colorOpts=s'=>\$colorOpts,'title=s'=>\$title, 'size=f'=>\$pointCex, bigfirst=>sub {$plotOrder='big';}, smallfirst=>sub {$plotOrder='small';}, 'density=i'=>\$density, 'legend=s'=>\$placeLegend, 'legendpts=i'=>\$legendPts, 'squeeze=s'=>\$squeeze ); pod2usage(1) if !$err or $help or @ARGV<1 or !$colorby=~m/^size|id|density$/ or $legendPts<2; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my $inf=shift @ARGV; die "File not found: $inf" unless -f $inf; die "Input must be a .repeats file" unless $inf=~m/^(.*)\.repeats$/; my ($prefix)=$1; die "Invalid file name for $inf" unless $prefix; my $seqs="$prefix.seqs"; die "Seqs file ($seqs) not found" unless -f $seqs; my @seqs=split(/\n/,slurp($seqs)); our $seqCount=scalar(@seqs); our @seqLengths=map {my $l=`$geneparse -l $_`;chomp $l; $l} @seqs; if($density){ $colorby='density'; }elsif($colorby eq 'density'){ $density=floor(max(@seqLengths)/($width*$res)*.8*($pointCex*40)); #approximate, but ok. } my $outf="$inf.png"; $outf="$inf.".join("-",$colorby ? ($colorby):(),$colors ? ($colors):()).".png" if $colors or $colorby; if(@ARGV>0){ $outf=shift @ARGV; } print "Writing output to $outf\n"; my $Rfh; my ($type)= $outf=~m/\.(\w+)$/; my $outputter=$type ne 'pdf' ? ($type=~/^e?ps$/ ? qq!postscript(file="$outf",width=$width,height=$height)!: qq!bitmap(file="$outf",type="png16m",width=$width,height=$height,res=$res)!): qq!pdf(file="$outf",width=$width,height=$height)!; my $Ropts="--slave" unless $echo; open($Rfh,"|R --vanilla $Ropts"); dor($outputter); open(my $infh,"$inf"); $title=(fileparse($inf))[0] unless $title; my $subtitle="Colored according to repeat ".($colorby ? 
$colorby:"id"); my ($fg,$bg)=("'black'","'white'"); if($colors eq 'heat'){ #because heat ends in white, which obviously doesn't show up well on white ($fg,$bg)=($bg,$fg); dor("par(bg=$bg,fg=$fg,col.axis=$fg,col.lab=$fg,col.main=$fg,col.sub=$fg)"); } dor("par(yaxt='n',las=1)"); do { my ($squeezeLeft,$squeezeRight)=(($squeeze=~m/(\S+),(\S+)/) ? ($1,1-$2):($squeeze,1)); dor("par('fig'=c($squeezeLeft,$squeezeRight,0,1))") } if $squeeze; my $seqLines="c(".join(",",map {(0,$seqLengths[$_])} (0..$#seqs))."),c(". join(",",map {($_,$_)} (0..$#seqs)).")"; dor("plot($seqLines,'n',ylim=c(-.5,".($#seqs+.5)."),col='black',xlab='Sequence Position',ylab='',main='$title',sub='$subtitle')"); #sets the range automatically foreach my $i (0..$#seqs){ my $y=($#seqs-$i); dor("lines(c(0,$seqLengths[$i]),c($y,$y),lwd=2)"); } dor("par(yaxt='s')"); do { my $num=$seqCount+1; dor("axis(side=2,at=c(".join(",",0..$#seqs)."),labels=c(". join(",",map {$num--;"\"#$num: ".((fileparse($_,qr/\.[^.]+$/))[0]).'"'} reverse(@seqs))."),tick=TRUE)"); }; #start the parsing! my (@clusters,@sizes); my $maxPos; my @seqMax; my $clusterId; my ($length,@locs,$size); while (<$infh>) { chomp; unless($_){ push(@clusters,{size=>$size,locs=>[@locs],id=>$clusterId}); push(@sizes,$size); #start fresh undef $length; undef $size; @locs=(); $clusterId++; next; } my ($seq,$dat)=m/^([^:]+): (.*)$/ or die "Weird line: $_"; if ($seq eq 'R') { $length=length($dat); } else { my @poses=split(/\s/,$dat); $size+=scalar @poses; $locs[$seq]=[@poses]; } } @sizes=sort @sizes; resampleDensity() if($colorby eq 'density'); my $paletteMaker='rainbow'; $paletteMaker=$colors if $colors; if($paletteMaker eq 'rainbow'){ $colorOpts="end=5/6" unless $colorOpts or $colorby eq 'id'; }else{ if($colors=~m/^rainbow|heat|terrain|topo|cm$/){ #built in palettes $paletteMaker.=".colors" }else{ $colors="blue,yellow,red" if $colors eq 'cool'; dor("require(graphics);custom.colors=colorRampPalette(c(". join(',',map {'"'.$_.'"'} split(/,/,$colors)). 
"),space='Lab')"); $paletteMaker="custom.colors"; } } $colorOpts=','.$colorOpts if $colorOpts; print "Using $paletteMaker to make colors for ".(scalar @clusters)." clusters\n"; dor("pal<-${paletteMaker}(".scalar(@clusters).($colorOpts ? ",$colorOpts":"").")"); print "Initial set up completed. Plotting clusters.\n"; resetTick(scalar @clusters); my ($totalLines,$linesDrawn,$totalPoints,$clustersLined); foreach my $cluster (sort {$plotOrder eq 'big' ? $b->{size} <=> $a->{size}:$a->{size} <=> $b->{size}} @clusters){ local $"=','; my $lines; my @locs=@{$cluster->{locs}}; my (@xlist,@ylist); my (@prevx,@prevy); my $color=$cluster->{id}; $color=getSizeRank($cluster->{size}) if $colorby eq 'size'; foreach my $seq (0..$#locs) { my (@myx,@myy); foreach my $loci (0..$#{$locs[$seq]}) { my $x=abs($locs[$seq][$loci]); my $y=$#seqs-($seq+($locs[$seq][$loci]>0 ? -.25:.25)); push(@myx,$x); push(@myy,$y); $totalPoints++; } if($connect and $seq>0){ #draw lines between all pairs in each adjacent seq my (@cx,@cy); foreach my $i (0..$#myx){ foreach my $j (0..$#prevx){ push(@cx,$myx[$i],$prevx[$j],'NA'); push(@cy,$myy[$i],$prevy[$j],'NA'); $totalLines++; $lines++; } } if(!$maxDraw or $lines<=$maxDraw){ dor("lines(c(@cx),c(@cy),col=pal[$color])"); $linesDrawn+=$lines; $clustersLined++; } @prevx=@myx; @prevy=@myy; } push(@xlist,@myx); push(@ylist,@myy); } dor("points(c(@xlist),c(@ylist),col=pal[$color],cex=$pointCex,pch=$pch)"); tick(); } if($placeLegend){ dor("legend('$placeLegend',c(".join(",",map {'"'.$_.'"'} ('high',(('') x ($legendPts-2)),'low'))."),pch=$pch,col=rev(${paletteMaker}($legendPts)))"); } print "\nDone\n"; print "Drew $totalPoints points.\n"; if($connect){ print "Drew $linesDrawn lines (out of $totalLines possible (".percent($linesDrawn,$totalLines).") for $clustersLined (".percent($clustersLined,scalar(@clusters)*($seqCount-1)).") clusters-sequences\n"; } close $Rfh; exit; do { my %rankMemo; sub getSizeRank { my ($size)=@_; return $rankMemo{$size} if exists 
$rankMemo{$size}; my $rank; foreach my $i (0..$#sizes){ # $rank=$i+1 and last if $sizes[$i]==$size; $rank=($paletteMaker eq 'rainbow' ? $#sizes-$i:$i)+1 and last if $sizes[$i]==$size; } $rankMemo{$size}=$rank; return $rank; } }; sub percent { return 'N/A%' unless $_[1]; return sprintf("%.2f%%",$_[0]/$_[1]*100); } sub dor { my @cmds=@_; foreach my $cmd (@cmds){ $cmd.=";" unless $cmd=~m/;\s+$/; $cmd.="\n" unless $cmd=~m/\n$/; print $Rfh $cmd; print $cmd if $echo; } } sub max { my $max=$_[0]; foreach (@_){ $max=$_ if $_>$max; } return $max; } sub slurp { local $/; open(my $fh,"@_") or return; return <$fh>; } sub resampleDensity { my @pixels; my $pixels=floor(max(@seqLengths)/$density); print "Calculating density every $density bp (into $pixels points) from ".scalar(@clusters)." clusters.\n"; foreach my $cluster (@clusters){ my @locs=@{$cluster->{locs}}; foreach my $seq (0..$#locs){ foreach my $loci (0..$#{$locs[$seq]}) { my $pi=round($locs[$seq][$loci]/$density); $pixels[$seq]->{$pi}++; } } } my %sizes; my $tick; foreach my $seq (0..$#seqs){ my $size; foreach my $pixel (ref $pixels[$seq] ? keys(%{$pixels[$seq]}):()){ #it's possible we don't have any repeats $size+=$pixels[$seq]->{$pixel}; $sizes{$size}[$seq]=[] unless $sizes{$size}; push(@{$sizes{$size}[$seq]},$pixel*$density); $tick++; } } @clusters=(); @sizes=(); #reset existing clusters and sizes my $id=0; foreach my $size (sort {$plotOrder eq 'big' ? $b<=>$a : $a<=>$b} keys %sizes){ push(@clusters,{size=>$size,locs=>$sizes{$size},id=>$id}); }continue{$id++} } sub round { my ($number)=@_; return int($number + .5 * ($number <=> 0)); } __END__ =head1 NAME repeatviz.pl -- visualize a bunch of repeats using graphviz =head1 SYNOPSIS repeatviz.pl [output] =head1 OPTIONS output is defined based on the input name unless otherwise specified Options: --echo shows all input to R (very spammy). --colors={rainbow,heat,terrain,topo,cm,cool} uses an alternate color palette (default is rainbow). 
Custom palettes can also be created by providing a list of colors (like red,yellow,blue). --colorby={size,id,density} specify what to select colors based on (default is id) --density={N} sample repeat density at every N bp (default calculates a value based on width*res and genome size) --coloropts={extra options to pass to R's color generator} --connect draw connecting edges for points in adjacent sequences --maxlines={N} don't draw lines to connect cluster-sequence pairs with more than N edges --size={N} size of dots --bigfirst plot big clusters first --smallfirst plot small clusters first(default) --res={DPI} defaults to 96 --width|height={N} graph size in inches --legend={position} position to place legend (R keywords like "right" or "topright" are good) --legendpts={N} number of dots to put in the legend (default is 8). must be at least 2 --squeeze={[0,1)} Y axis labels not fitting in the frame? squeeze the graph over with this. --squeeze={[0,1),F} an amount to change the right margin can also be specified by F like this. murasaki/names2kog.pl0000755000177700001440000000717411434752241014214 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . 
use Getopt::Long; use Pod::Usage; use File::Basename; #use Data::Dump qw {dump}; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; use Murasaki::KOG; use strict; my @kogmap; our $podusageOpts={-message=>'Use --help or --man for more detailed help.',-verbose => 0,-exitval=>2}; my ($help,$man); my $output; my $opterr= GetOptions('help|?' => \$help, man => \$man, 'kog=s%' => sub {pod2usage($podusageOpts) unless($_[1] and $_[2] and $_[1]>0 and $_[2]=~m/^...$/); $kogmap[$_[1]]=$_[2]}, 'output=s' => \$output, ); unless($output){ $output=pop(@ARGV); if(!$output or -f $output){ push(@ARGV,$output) if $output; $output=">-"; } } pod2usage({-verbose=>1,-exitval=>2,-message=>'Need some input file...'}) if $#ARGV<0 or !$opterr; open(our $outf,">$output") or die "Couldn't open $output for writing."; setupKogmap(@ARGV); my %kogs; my $si=0; foreach my $file (@ARGV){ my $specs=$kogmap[$si]; warn "Couldn't identify a species for input $file" if(grep(/\?/,$specs)); unless($file=~m/\.cds$/){ my $newf="$file.cds"; system("$root/getcds.pl $file") unless -f $newf; $file=$newf; } open(my $inf,$file) or die "Couldn't open $file\n"; while(<$inf>){ chomp; my @a=split(/\t/); my $name="[?] $a[0] name2kog autogenerated"; my $content=" $specs: $a[4]"; push(@{$kogs{$name}},$content); } }continue{$si++} my @usable=grep {scalar(@{$kogs{$_}})>1} (keys %kogs); print $outf (map {$_."\n"} (map {($_,@{$kogs{$_}},undef)} @usable)); sub setupKogmap { my $kogre=join("|",Murasaki::KOG->knownCogs,Murasaki::KOG->knownKogs); my $i=0; my %aliases=Murasaki::KOG->commonAliases; foreach(@_){ next if $kogmap[$i]; my ($id)=m/($kogre)/; unless($id){ #try harder... foreach my $alias (keys(%aliases)){ $id=$aliases{$alias} if (m/$alias/i); } } next unless $id; print STDERR "Identified $_ as KOG member $id\n"; $kogmap[$i]=$id; }continue{$i++} } sub guessSpecies { my $file=pop; my @bits=split(/\./,$file); my $clue=$#bits>0 ? 
$bits[$#bits-1]:$file; return qw{hsa ptr mac mmu rno cfa bov pos avi} if $clue eq 'hcmmrdcoc'; #hacchy's alias for: human chimp macaq mouse rat dog cow oposs chick (note: i made up mac bov pos and avi) my @tokens=split(//,$clue); my %conv=(h=>'hsa',m=>'mmu',r=>'rno'); my $unknown=0; return map {exists($conv{$_}) ? $conv{$_}:"??".(++$unknown)} @tokens; } __END__ =head1 NAME names2kog.pl - builds a kog-set based only on gene "names" =head1 SYNOPSIS names2kog.pl [options] [input2 ... ] [output file] =head1 OPTIONS If "output file" exists, it is considered an input file. Note: you can redefine what is considered a "name" from the .overrides file (like say, to "product" or something). Other options: --kog = => for manually specifying 3 letter cog species -o =>specify the output file. murasaki/simplegraph.pl0000755000177700001440000003272711434752242014644 0ustar krispusers#!/usr/bin/perl -w #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . ################## ## dot plotting mojo (originally built for Mauve, but that might not work anymore) -- krisp ################## use File::Basename; use Getopt::Long; use Pod::Usage; #use Data::Dump qw{dump}; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki qw{getProg writeOut $root max}; use strict; our $geneparser=getProg('geneparse'); warn "Couldn't find geneparse. 
Finding lengths will fail" unless $geneparser; my ($help,$man,$opt_prefix,$align_type,$keepGnuplot,$interactive,$seqOrder,$fullRange); our $flexible=0; our $signed=0; our ($no_stitch); our %useFormats; sub addFormat { $useFormats{$_[0]}=1; } GetOptions('help|?' => \$help, man => \$man, 'output=s' => \$opt_prefix, 'type=s' => \$align_type, 'signed' => \$signed, 'nostitch' => \$no_stitch, png=>\&addFormat, pdf=>\&addFormat, ps=>\&addFormat, 'keepplot!'=>\$keepGnuplot, 'interactive:s'=>sub{$interactive=($_[1] ? $_[1]:'all')}, 'order=s'=>\$seqOrder, 'fullrange'=>\$fullRange, ); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; $useFormats{png}=1 unless keys(%useFormats); our @formats=keys(%useFormats); @formats=grep {$_ ne 'ps'} @formats if $useFormats{ps} and $useFormats{pdf}; our $out_prefix=$opt_prefix; my $alignment_src=shift(@ARGV); if(-d $alignment_src){ #output from quickrun $alignment_src=~m!^(.*?/?)([^/]+)/?$!; $alignment_src="$1$2/$2"; $align_type='mauve' if !$align_type and -e $alignment_src; if(!-e $alignment_src){ $alignment_src="$1$2/$2.anchors"; $align_type='murasaki' if !$align_type and -e $alignment_src; } die "Alignment not found" unless -e $alignment_src; print "Derived $alignment_src as source\n"; } $out_prefix="$alignment_src.graph" unless $opt_prefix; $align_type="murasaki" unless $align_type; our %mauve=%{loadMurasakiAlignment($alignment_src)}; our @seqs=@{$mauve{seqs}}; our @LCBs=@{$mauve{LCBs}}; our @seqOrder=$seqOrder ? ( ($seqOrder=~m/\D/) ? (split(/\D+/,$seqOrder)) :split(//,$seqOrder) ):(0..$#seqs); print "Sequence order: @seqOrder\n" if $seqOrder; die "Need at least 2 sequences to plot a graph" unless @seqOrder>1; die "Invalid order specification" if grep {$_>$#seqs} @seqOrder; #because of laziness our (%allGenes,%quickOrder); print "$align_type file describes ".($#seqs+1)." sequences and ".($#LCBs+1)." LCBs\n"; die "No LCBs?" 
unless $#LCBs>=0; our $datafile="$out_prefix.data.LCB"; writeOut($datafile,join("\n\n",map { #all LCBs join("\n", join("\t", map {#all sequences inside LCBs join("\t",$$_{start}); } @{$_}), join("\t", map {#all sequences inside LCBs join("\t",$$_{stop}); } @{$_}) ); } @LCBs)); our %formats=(png => 'png transparent size 800,800', ps => 'postscript', pdf =>'postscript'); for($interactive ? mclassToI($interactive):(1..mclassToI('all'))){ my @m=mclass($_); next unless magnitude(@m)==2 or $interactive or magnitude(@m)==scalar(@seqOrder); my $m=mclassToStr(@m); my @slice=map {$seqOrder[$_]} mclassToSlice(@m); my @col=map {$_+1} @slice; #gnuplot starts at 1 my @names=map {${$seqs[$_]}{seqName}} @slice; our $divfile="$out_prefix.data.divs"; my $divPlots=addDivPlots($divfile,@slice) unless $no_stitch; my $plotfile="$out_prefix.$m"; my $xlab=$names[0]; my $ylab=join(", ",@names[1..$#names]); my @ranges; my @lengths=map {$_->{length}} @seqs[@slice]; if($fullRange and !grep {!defined $_} @lengths){ @ranges=map {"[1:$_]"} ($lengths[0],max(@lengths[1..$#lengths])); } my @series=map {"'$datafile' using $col[0]:$col[$_] with lp lw 1 pointtype 6 pointsize 1 title '$names[0]-$names[$_]'"} (1..$#slice); my $plotline=join(", \\\n",@series,($divPlots ? 
($divPlots):())); writeOut($plotfile,<$quickOrder{$b}; } sub pow { my ($b,$p)=(shift,(shift)-1); my $r=$b; for (1..$p){ $r*=$b; } return $r; } sub digits { my ($a,$b)=@_; my @l; while($a>0){ unshift(@l,$a % $b); $a=int($a / $b); } return @l; } sub pad { my $fill=shift; my $target=shift; unshift(@_,$fill) while($#_+1<$target); return @_; } sub mclassToStr { return join("",@_); } sub mclass { my $i=shift; $i=pow(2,$#seqOrder+1)-1 if $i=~m/all/i; return pad(0,$#seqOrder+1,digits($i,2)); } sub mclassToI { my $i=0; return pow(2,$#seqOrder+1)-1 if $_[0]=~m/all/i; $i=($i+pop) << 1 while(@_); return $i>>1; } sub mclassMember { my $usedr=shift; return !grep {!(($#{${$usedr}[$_]}>=0 and $_[$_]) or !$_[$_])} (0..$#_); } sub toMauveCoords { my @coords=($_[0]->start,$_[0]->end); return @coords; } sub findLCBs { my ($genome,$start,$stop)=($_[0],toMauveCoords($_[1])); # print "Searching ".($#LCBs+1)." LCBs on genome $genome\n"; # print join("\n",map {"$_ -> ".ref($LCBs[$_])."=".join(" ",@{$LCBs[$_]})} 0..$#LCBs); return grep {grep {$_} $$_{partial}} (map {my @LCBl=@$_; {partial=>coversLCB($start,$stop,$LCBl[$genome]), id=>$LCBl[$genome]{LCBId}} } @LCBs); } #returns 0 or which side of pair 1 (gene coords) overhangs pair 2 (LCB) sub coversLCB { my ($start,$stop,$LCBr)=@_; my @a=($start,$stop,$$LCBr{start},$$LCBr{stop}); return covers(@a); } sub covers { my ($a,$b,$c,$d)=@_; $a<=$b or die "start<=stop assertion failed"; $c<=$d or die "LCB start<=stop assertion failed"; return 0 if $a>$d or $c>$b; # total mismatch # if($a<=$d and $c<=$b){ #some degree of hit (already guranteed by above) return "none" if $c<=$a and $b<=$d; return "left" if $c<=$a and $d<=$b; return "right" if $a<=$c and $b<=$d; return "both" if $a<=$c and $d<=$b; } sub loadMauveAlignment { my $alignment=shift; open(MAUVE,"<$alignment"); =~m/FormatVersion\s+(\d+)/ or die "Not a mauve file: $alignment"; my $version=$1; my @seqs=(); do {print "This program is written for Mauve Format Version 4.\n This file is version 
$version. Weird stuff may happen.\n"; $flexible=1;} if $version!=4; =~m/SequenceCount\s+(\d+)/ or die "Unknown sequence count\n"; my $seqCount=$1; while(){ next unless m/Sequence(\d+)File\s+(\S.*)/; my ($seqId,$seqFile)=($1,$2); $_=; m/Sequence${seqId}Length\s+(\d+)/ or $flexible or die "Input file is weird: $_"; my $seqLength=$1; $seqs[$seqId]={'seqId' => $seqId,'seqFile' => $seqFile,'seqLength'=>$seqLength, 'seqName' => getName($seqFile) }; last if $seqId==$seqCount-1; } @LCBs=(); $_=; m/IntervalCount\s(\d+)/ or $flexible or die "Interval Count line weird: $_"; my $LCBCount=$1; while(){ m/Interval\s(\d+)/ or next; my $LCBId=$1; $_=; chomp; my ($length,@start)=split(/\s+/); @start=map(abs,@start); my @stop=map {$_+$length} @start; my @segs; while(){ chomp; last if $_ eq ''; next if $_ eq 'GappedAlignment' or m/^[A-Z-]+$/; #skip gapped lines ($length,@segs)=split(/\s+/); @stop=map {$_+$length} @segs; } next if (grep {$_==0} @start)>0; my @LCB=map { # ($start[$_],$stop[$_])=map(abs,($stop[$_],$start[$_])) if $start[$_]>$stop[$_]; #is rev strand? ($start[$_],$stop[$_])=map(abs,($start[$_],$stop[$_])) if $start[$_]>$stop[$_]; #is rev strand? 
{ start => $start[$_], stop => $stop[$_], LCBId => $LCBId }} 0..$#stop; push(@LCBs,\@LCB); } return {'seqs' => \@seqs, 'LCBs' => \@LCBs}; } sub loadMurasakiAlignment { my $filename=pop; my $basename=getName($filename); my $path=getPath($filename); my @seqs; if(-e "$path$basename.seqs"){ my $seqId=0; open(SEQS,"$path$basename.seqs") or die "Could not open seqs file"; while(){ chomp; push(@seqs,{seqId => $seqId++, seqFile => $_, seqName => getName($_), divs=>getBreaks($_,$path), length=>getLength($_,$path)}); } close(SEQS); }else{ my @seqnames=split(/-/,getName($filename)); my $seqId=0; foreach(@seqnames){ my $name="$_.gbk"; -e $name or die "Sequence $name not found.\nEither use correct naming style, or create $path$basename.seqs file specifying sequence locations."; push(@seqs,{seqId => $seqId++, seqFile => $name}); } } my @LCBs; my $LCBId=0; open(BLOCKS,$filename) or die "Blocks file not found...??"; while(){ chomp; $LCBId++; my @LCB=(); while(m/(-?\d+)\s+(-?\d+)\s+([+-])/g){ my ($start,$stop,$back)=($1,$2,($3 eq '-')); # ($start,$stop)=map(abs,($stop,$start)) if $start<0; #is rev strand ($start,$stop,$back)=(map(abs,($start,$stop)),1) if $start<0 and !$signed; ($start,$stop)=($stop,$start) if $back and $start<$stop; push(@LCB,{start => $start, stop => $stop, LCBId => $LCBId, back => $back}); } push(@LCBs,\@LCB); } return {'seqs' => \@seqs, 'LCBs' => \@LCBs}; } sub getName { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $name } @_; return @ret if $#_; return $ret[0]; } sub getPath { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $path } @_; return @ret if $#_; return $ret[0]; } sub getLength { my ($file,$path)=@_; my $real=$file if -f $file; $real=$path.$file if !$real and -f $path.$file; return undef unless $real and -f $real; my $length=`$geneparser -l $real`; chomp $length; return $length; } sub getBreaks { my ($file,$path)=@_; return undef unless $file=~m/\.stitch$/; my $fh; open($fh,$file) or open($fh,"$path$file") 
or return (warn "Couldn't find file: $file"); my @ret; local $_; while (<$fh>){ chomp; my ($src,$length,$start,$stop)=split(/\t/,$_); push(@ret,[$start,$stop]); } return [@ret]; } sub addDivPlots { my ($divfile,$x,$y)=@_; my $cmds=""; foreach my $s ($x,$y){ my $op=($s == $y ? $x:$y); my $name=$seqs[$s]->{seqName}; my $sdivfile="$divfile.$s-$op"; my $oplen=$seqs[$op]->{length}; next unless $oplen; my $divsr=$seqs[$s]->{divs}; if(ref $divsr and scalar(@$divsr)){ writeOut($sdivfile,join("\n\n",map { #for each div... join("\n", join("\t", ($s==$x ? ($$_[0],1):(1,$$_[0]))), join("\t", ($s==$x ? ($$_[0],$oplen):($oplen,$$_[0]))) ,"", join("\t", ($s==$x ? ($$_[1],1):(1,$$_[1]))), join("\t", ($s==$x ? ($$_[1],$oplen):($oplen,$$_[1]))) ) } @$divsr)); $cmds.=qq!'$sdivfile' with l lw 1 title '$name'!; } } return $cmds; } __END__ =head1 NAME simplegraph.pl - Provides dotplot/chaos style plot of murasaki alignments =head1 SYNOPSIS simplegraph.pl [-output=] =head1 OPTIONS =over 8 =item B The main one outputted by murasaki. =item --output Prefix for output files. =item --nostitch Don't draw lines for stitch file breaks. =item --signed Leave signedness alone (otherwise absolute coordinates are drawn) =item --keepplot Keep the file containing the gnuplot commands (helpful if you want to run gnuplot interactively). =item --interactive= Run gnuplot interactively for some set of sequences (default all) =item --order= Reorder input sequences on the fly (or consider only a subset) can be specified as a simple permutation like 201 (which would mean sequence 2, then 0, then 1), or if you have more than 10 sequences you can separate digits with any non-digit character (eg: "2,0,1"). =back =head1 DESCRIPTION Draws graphs of murasaki alignments. =cut murasaki/COPYING0000644000177700001440000010451311434752241013014 0ustar krispusers GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. 
Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. 
For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. 
To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 
The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. 
The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. 
You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. 
A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. 
d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. 
A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. 
Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . murasaki/doc/0000755000177700001440000000000011434752243012524 5ustar krispusersmurasaki/doc/murasaki.txt0000644000177700001440000005272511434752236015116 0ustar krispusersNAME murasaki - compute anchors between multiple sequences SYNOPSIS murasaki [OPTIONS] -p[pattern] seq1.fa seq2.gbk [seq3.raw ...] #compute anchors between seq1.fa and seq2.gbk using [pattern] mpirun murasaki [OPTIONS] -p[pattern] seq1.fa seq2.gbk [seq3.raw ...] #compute anchors between seq1.fa and seq2.gbk using [pattern] in parallel via MPI DESCRIPTION Murasaki generates anchors based on all supplied sequences based on the user supplied pattern and hash tables. Essentially each base of each sequence is masked by the pattern, forming a seed that is used to generate a hash. The location of the seed is stored in the hash table. Once all seeds have been hashed and stored, Murasaki scans the hash table, generating anchors for all matching seeds. An anchor refers to a set intervals across a subset of the input sequences. These are stored in name.anchors files, and described in "FILE FORMATS". 
By default anchors are maximally extended until their minimum pairwise ungapped alignment score drops below a threshold in the same fashion as the X-drop parameter in BLAST and BLAST-like searches. PATTERNS Murasaki uses spaced seed patterns in considering seeds. A spaced seed pattern is typically expressed as a string of 1s and 0s necessarily starting and ending with a 1. 1s indicate that this base is considered part of the seed, while bases at 0 positions are not. For example with a pattern "1011" the sequence "ACGT" would match sequences "AGGT" and "ATGT" but not "ACTT". The number of 1s in the pattern is known as the "weight" of the pattern, and the number of 1s and 0s combined is the "length" of the pattern. Murasaki allows the use of any arbitrary pattern expressed as a string of 1s and 0s, and also interprets patterns of the form "x:y" to mean a "random pattern of weight *x* and length *y*." The choice of pattern obviously has an impact on sensitivity and specificity, but whether one pattern is "better" than another depends on the application and the input sequences under consideration. Calculating "maximally sensitive spaced seed patterns" is a computationally difficult problem and there are a number of research papers describing various methods for approximation ("RELATED READING"). In general, however, "heavier" spaced seed patterns are less sensitive, but more specific, than lighter seeds. Anecdotally we find that seeds with weights approximately 60% to 75% (with lengths around 24 for bacteria, and 36 to 48 for mammals) are good for most applications. Extremely similar species (for example human and chimp) benefit from longer, heavier, seeds. HASH FUNCTIONS Hash functions (as well as hash parameters) are generated automatically based on the system environment and input sequences. There are essentially two types of hash functions available in Murasaki: adaptive and cryptographic hashes. 
The adaptive hashes are XOR combinations of various bitwise shifts of the seed designed by analyzing the spaced seed pattern to maximize the entropy of the resulting hash. Cryptographic hashes are available via the CryptoPP library and use the *entire* spaced seed pattern to generate a hash using one of the common cryptographic hashes like MD5 or SHA-1. The adaptive hash functions are almost always faster and more efficient than MD5 and SHA-1, but the cryptographic functions are available for reference and may be useful as an alternative in the unlikely event you're dealing with an environment where the adaptive hasher is unsuitable (for example a sequence consisting of only A and T (leaving 1 out of every 2 bits unutilized)). MEMORY SCALING Murasaki can take a lot of memory. Storing the location of each seed in the hash table is the most costly part of the operation, requiring approximately "ceil(log_2(N))" bits per seed where "N" is the total sequence length. Locations are, by default, stored in a bitpacked format to approach the theoretical minimum. The second most costly element is the hash table structure, where each bucket carries a small overhead and unused buckets are simply wasted space. More hash table buckets (i.e. a longer hash table) decreases the expected number of collisions, leading to faster execution time. Therefore Murasaki tries to use as many buckets as possible by inspecting the available system memory and using as much as it can while still storing all the seed locations. If this automatic scaling is ineffective, setting the hash table size directly via the --hashbits|-b options can force a specific hash table size. If the memory of one computer is insufficient to store the desired hash table, PARALLELIZATION can be used to distribute the hash table across multiple computers. PARALLELIZATION Murasaki is designed to run in parallel using MPI. 
Consult the documentation for the specific variations of your MPI implementation, however in general the execution method looks like: mpirun [MPI options] murasaki [murasaki options] -p[pattern] [seq1 ...] Murasaki in parallel divides the number of processors available (NP) into two groups: hasher nodes and storage nodes. The storage nodes divide the hash table between themselves, each being responsible for a different part of the table. Hasher nodes divide the input sequence between themselves, each hashing a separate portion of the input sequence, and passing the seed location to the appropriate storage node for storage. When all the hasher nodes are finished hashing, the storage nodes scan their portion of the hash table and pass matching sets of seeds to a hasher node where they are assembled into anchors and extended. Finally all the hasher nodes combine their independent anchor sets into one final set in "ceil(log_2(H))" iterations (where "H" is the number of hasher nodes), with each hasher node number 2h passing its anchors to hasher number 2h-1 at each iteration. Because almost none of the parallelization steps require communication between *all* nodes, and each seed and each anchor can be processed in parallel, Murasaki scales very well in parallel, running approximately twice as fast when twice as many nodes are available. Furthermore, the hash table is automatically grown to take advantage of the combined memory from multiple machines. OPTIONS Most options can be specified in their long form (e.g. "--directory out" or "--directory=out") or short form (e.g. "-dout"). Options marked by expect a string, an integer, a float, and a boolean value ("yes/on/true/1" for true, "no/off/false/0" for false). Most booleans can omit the value, toggling the value from whatever it was to the opposite. Murasaki has a lot of options. 
Here we've separated them into categories to help distinguish the scope of the various options, however in certain situations certain option choices may have unforeseen consequences, and of course ultimately if the specified output is *huge*, the required runtime will necessarily be *long*. It is a mistake to think that everything outside of the "tuning options" in the Performance section has no bearing on performance. Anchor parameter related options These options shape what is considered an "anchor". --pattern|-p specifies the seed pattern (eg. 11101001010011011). using the format C<[:]> automatically generates a random pattern of weight and length --repeatmask|-r Skip repeat masked data (ie: lowercase atgc). Be aware that some sequence files are distributed purely in lower case. --seedfilter|-f Skip seeds that occur more than N times. Exceptionally slow. See --hashfilter for a faster approximation. --hashfilter|-m Like --seedfilter but works on hash keys instead of seeds. May cause some collateral damage to otherwise unique seeds, but it's faster. --skipfwd|-F Don't hash/match the forward strands. --skiprev|-R Don't hash/match the reverse complement strands. --skip1to1|-1 Skip matches along the 1:1 line (good for comparing to self). --hashonly|-Q Hash Only. No anchor output, just statistics. --hashskip|-S Hashes every n bases. Default is 1 (i.e. hashing all positions). Not supplying any argument increments the skip amount by 1. --join|-j Join anchors within n bases of each other (default: 0). Specifying a negative D implies -D*patternLength. --bitscore|-B toggles computation of a bitscore for all anchors (default is on). --seedterms|-T toggles retention of seed terms (defaults to off). These are necessary for computing TF-IDF scores. --sectime|-e Always display times in seconds as opposed to human readable "1d 3h 45m 5s" style times. --mergefilter|-Y Filter out matches which would cause more than *D* many anchors to be generated from 1 seed (default -Y100). 
Use -Y0 to disable. --scorefilter Set a minimum ungapped score for seeds. --rifts|-/ Allow anchors to skip D sequences (default 0). --islands|-% Same as --rifts=S-D (where S is number of input seqs). --fuzzyextend|-z Enable (default) or disable fuzzy extension (i.e. ungapped alignment) of anchors. --fuzzyextendlosslimit|-Z Set the cutoff at which to stop extending fuzzy hits (ie. the BLAST X parameter). --gappedanchors Use gapped (true) or ungapped (false (default)) anchors. --scorebyminimumpair Do anchor scoring by minimum pair when appropriate (default). Alternative is arithmetic mean (seldom useful, but theoretically faster). --rifts|-/ Allow anchors to skip D sequences (default 0). --islands|-% Same as --rifts=S-D (where S is number of input seqs). --fuzzyextend|-z Enable (default) or disable fuzzy extension (i.e. ungapped alignment) of anchors. --fuzzyextendlosslimit|-Z Set the cutoff at which to stop extending fuzzy hits (ie. the BLAST X parameter). --gappedanchors Use gapped (true) or ungapped (false (default)) anchors. --scorebyminimumpair Do anchor scoring by minimum pair when appropriate (default). Alternative is arithmetic mean (seldom useful, but theoretically faster). Output options These options primarily affect what data is output where. --directory|-d output directory (default: output) --name|-n alignment name (default: test) --repeatmap|-i Toggles keeping of a repeat map when --mergefilter is used (defaults to yes). --histogram|-H Histogram computation level: (-H alone implies -H1) 0 - no histogram (default) 1 - basic bucketsize/bucketcount histogram data 2 - bucket-based scores to anchors.details 3 - perbucket count data 4 - perbucket + perpattern count data Any values above 2 are purely exploratory and can result in massive output files. --tfidf|-k Perform accurate tfidf scoring from within murasaki (requires extra memory at anchor generation time). 
Performance/tuning options These options primarily affect performance, and don't (in general) impact output. --quickhash|-q specify a hashing function: 0 - adaptive with S-boxes (default when there's plenty of hash table to spare) 1 - don't pack bits to make hash (use first word only) 2 - naively use the first hashbits worth of pattern 3 - adaptively find a good hash (default) **experimental CryptoPP hashes** 4 - MD5 5 - SHA1 6 - Whirlpool 7 - CRC-32 8 - Adler-32 Note: 3 and 0 are the only "recommended" hash functions, and the only ones automatically selected. The others are provided merely for reference. 1, 7, and 8 aren't even expected to utilize the entire hash space. --hashbits|-b use D bit hashes (for n's of 1 to WORDSIZE. default 26) --hashtype|-t select hash table data structure to use: OpenHash - open sub-word packing of hashbits (default when there's plenty of hash table to spare) EcoHash - chained sub-word packing of hashbits (default) ArrayHash - malloc/realloc (fast but fragmentation-prone) MSetHash - memory exorbitant, almost pointless. --probing 0 - linear, 1 - quadratic (default). Only applicable for --hashtype=OpenHash. --hitfilter|-h Minimum number of hits to be outputted as an anchor (default 1). In PatternHunter this is 2. --rseed|-s Random number seed for non-deterministic algorithms (ie: adaptive hash function generation). If you're doing any performance comparisons, it's probably imperative that you use the same seed for each run of the same settings. Default is obtained from time() (ie: seconds since 1970). --memory|-M [|] Set the target amount of total memory (either in gb or as % total memory). --reverseotf|-o Generate reverse complement on the fly (defaults to on). Turning this off precomputes all the reverse complement strands and stores them in memory, which rarely provides a measurable performance improvement. 
--binaryseq Enable (default) or disable binary sequence read/write Adaptive hash function related: Performance options related to adaptive hash function generation. --hasherFairEntropy Use more balanced entropy estimation (default: yes). --hasherCorrelationAdjust Adjust entropy estimates for nearby sources assuming some correlation (default: yes). --hasherTargetGACycles Adaptive hash function generation genetic algorithm cycle cutoff. --hasherEntropyAgro How aggressive to be about pursuing maximum entropy hash functions (takes a real. default is 1). MPI Specific: --hashers|-A [|] Specify the number of processes to be used as hashers (only applies to MPI. If a number between 0 and 1 it refers to a ratio of np). --localhash|-K Perform hashing locally on each storage node rather than sending it over the network (helpful for slow networks). --mpidistro|-L Toggles use of MPI to distribute sequence data over (if the sequence is available on local disk on each node then turning this off may potentially accelerate the initial sequence loading). --waittoanchor|-w Postpone actual anchor computation until all location sets have been received (as opposed to trying to work between receiving seed packets). --buffers|-u Maximum number of unfinished buffers to allow while message passing (0 means unlimited). Default is set based on the number of nodes participating. MPI can crash or perform *very* poorly if this value is too high. --nobuffers|-U Same as --buffers=1. --bigfirst|-I Assign hashers to large memory nodes first. --hostbalance|-l If yes (default): spread out hashers evenly among all nodes. If no: ignore host name when assigning jobs. --memorybalance|-a If yes (default): balance hash storage between nodes based on the amount of available ram. If no: distribute storage evenly. This is more likely to achieve optimal run times, but might not utilize memory as efficiently. --distmerge|-< if yes (default): during the merge step, storage nodes send seeds to any available hasher. 
if no: send all seeds to one node only. --distcollect|-> if yes (default): collect anchor data from all hashers. if no: send all seeds to the final assembly node only. --mpiredirectoutput if yes (default): each rank redirects its stdout/stderr to a separate file (murasaki-mpiout-*N*). if no: do what comes naturally (ie: managed by mpirun (for OpenMPI see --output-filename and --tag-output in mpirun(1))). --keepstdoe Don't erase the murasaki-mpiout files on success. --sysvipc|-V Use System V IPC to negotiate shared memory regions (saves memory when one host runs multiple nodes). Default is true. Universal options: --verbose|-v Increases verbosity. --version|-V Prints version information and quits. --help|-? Prints a help message and quits. FILE FORMATS Murasaki has a wide array of output files, the formats of most of which are intended to be intuitive. All output files are prefixed by the value of the --name parameter. The primary output file formats are described here. Files are line based and tab delimited unless otherwise specified. .seqs The .seqs shows what sequences were used as input, 1 per line. This file gets used by various programs in conjunction with the .anchors file, so it's generally important that the contents reflect the correct sequence files. Moving anchor results between computers might result in a change of paths, requiring the user to update the .seqs file. As an alternative, always using relative paths can alleviate this problem. .anchors files These files are 1 anchor per line, with a 3-tuple per sequence. Each tuple represents the start and stop coordinates and strand of the anchored interval on each sequence. The sequence order matches that of the order in the .seqs file. The coordinates are structured such that 1 refers to the first base in the sequence, 2 to the second, etc. 
Negative values refer to the reverse complement sequence where -1 is the *last* base of the reverse complement sequence (ie: the complement of the first base in the forward sequence). The "strand" element is a '+' or '-' that merely matches the sign of the coordinates (this is redundant information, but kept to make parsing or filtering simpler). For example: 1 18 + -1 -18 - This line describes an anchor where the first 18 bases of the first sequence match the first 18 bases of the reverse complement of the second sequence. .anchors.details This is an antiquated file format, but used by GMV to calculate statistics like TF-IDF scores, and has been kept around for that reason. The .anchors.details file has the same format and information as the .anchors file, however after the anchor tuples are two more terms: a score, and a comma (,) delimited list of term and count pairs (written "term:count"). The score and count data might be varied depending on the "--histogram" option choices. .anchors.bitscore The term "bitscore" here is a misnomer, but maintained for historical reasons. In reality, this file contains the mean number of matching bases and length of each anchor (corresponding line by line to the .anchors file). .stats.tfidf Contains anchor TF-IDF scores (corresponding line by line to the .anchors file). .histogram Contains a simple histogram of the hash table usage. The first field is the bucket size, and the second is the frequency. For example a .histogram file like this: 1 24 2 1 Would indicate that there were 24 hash buckets that stored only 1 location (i.e. 24 unique seeds), and 1 hash bucket stored 2 locations (i.e. 1 seed that matched 2 locations (or 2 non-matching seeds that resulted in a hash collision)). .options Maintains a record of the options used when running Murasaki. .repeats The .repeats file stores a record of "repeats" as defined by the --mergefilter option (i.e. seeds that would have induced more anchors than permitted). 
In this file, each repeat record is separated by a blank line. A repeat record looks like this: R: G.GCCTTT.T.ACT.CACAA..AT 0: 2145540494 -425039256 -113794380 1998323403 1: 2480929222 -1874514626 2543723555 -2550045172 The first line (always prefixed "R:") shows the repeating seed itself (where the . are the bases masked by the pattern). The subsequent lines show where these seeds occurred in the input sequences (in the first (0) and second (1) sequences). Note that if there are no hits in a particular sequence, it doesn't include a blank line for that sequence. For example: R: G.GCCTTT.T.ACT.CACAA..AT 0: 2145540494 -425039256 -113794380 1998323403 2: 2480929222 -1874514626 2543723555 -2550045172 is also a valid .repeats file. LICENSE GNU General Public License, version 3 (GPLv3) AVAILABILITY AUTHOR Kris Popendorf SEE ALSO mbfa(1), geneparse(1) RELATED READING M. Csuros and B. Ma, "Rapid Homology Search with Two-Stage Extension and Daughter Seeds" (2005). F. P. Preparata and L. Zhang and K. W. Choi, "Quick, practical selection of effective seeds for homology search" (2005). KP Choi, et al., "Good spaced seeds for homology search" (2004). murasaki/doc/mbfa.txt0000644000177700001440000000350211434752236014174 0ustar krispusersNAME mbfa - Murasaki Binary FASTA format converter SYNOPSIS mbfa [options...] [input] [input2 ...] #convert [input] and [input2 ...] to Murasaki Binary FASTA files DESCRIPTION Murasaki processes sequence data using a 2-bit format where each base is represented using 2 bits. There's a number of pre-existing formats that do similar things, however in particular Murasaki needs to know about the metadata that can't be expressed in just 2 bits (eg. where sequences of NNNNs are, sequence breaks when multiple sequences are included in a FASTA file, etc.), therefore the MBFA format includes this data as well. Ordinarily these files are generated automatically by Murasaki when first run on a new sequence. 
Because the file format is designed mesh closely with Murasaki, the actual file extension will vary to reflect your architecture. It will generally be some form of .mbfa[48][48] (e.g. ".mbfa88" (the default gcc build on an amd64)). OPTIONS --info|-i Show metadata about each MBFA specified. --force|-f By default mbfa will skip files that already have recent .mbfa files. This option forces the regeneration of these files. --fatal|-F Makes errors fatal. Ordinarily if you specify multiple files, mbfa will try to convert all of them even if one fails emitting a warning. With --fatal it will stop and exit with an error if there's a problem. --fasta|-A Geneates FASTA output corresponding based on the MBFA data to stdout. --help|-h, --version|-V, --verbose|-V What you'd expect. LICENSE GNU General Public License, version 3 (GPLv3) AVAILABILITY AUTHOR Kris Popendorf SEE ALSO murasaki(1), geneparse(1) murasaki/doc/murasaki.10000644000177700001440000007444211434752236014437 0ustar krispusers.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.07) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . 
ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .ie \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . nr % 0 . rr F .\} .el \{\ . de IX .. .\} .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . 
\" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "MURASAKI 1" .TH MURASAKI 1 "2010-05-31" "perl v5.10.1" "User Contributed Perl Documentation" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l .nh .SH "NAME" murasaki \- compute anchors between multiple sequences .SH "SYNOPSIS" .IX Header "SYNOPSIS" .Vb 2 \& murasaki [OPTIONS] \-p[pattern] seq1.fa seq2.gbk [seq3.raw ...] #compute anchors between seq1.fa and seq2.gbk using [pattern] \& mpirun murasaki [OPTIONS] \-p[pattern] seq1.fa seq2.gbk [seq3.raw ...] #compute anchors between seq1.fa and seq2.gbk using [pattern] in parallel via MPI .Ve .SH "DESCRIPTION" .IX Header "DESCRIPTION" Murasaki generates anchors based on all supplied sequences based on the user supplied \fBpattern\fR and hash tables. Essentially each base of each sequence is masked by the pattern, forming a \fBseed\fR that is used to generate a hash. The location of the seed is stored in the \&\fBhash table\fR. Once all seeds have been hashed and stored, Murasaki scans the hash table, generating anchors for all matching seeds. An anchor refers to a set intervals across a subset of the input sequences. These are stored in \fBname\fR.anchors files, and described in \&\*(L"\s-1FILE\s0 \s-1FORMATS\s0\*(R". By default anchors are maximally extended until their minimum pairwise ungapped alignment score drops below a threshold in the same fashion the X\-drop parameter in \s-1BLAST\s0 and BLAST-like searches. 
.SS "\s-1PATTERNS\s0" .IX Subsection "PATTERNS" Murasaki uses \fBspaced seed patterns\fR to in considering seeds. A \&\fBspaced seed pattern\fR is typically expressed as a string of 1s and 0s necessarily starting and ending with a 1. 1s indicate that this base is considered part of the seed, while bases at 0 positions are not. For example with a pattern \*(L"1011\*(R" the sequence \*(L"\s-1ACGT\s0\*(R" would match sequences \*(L"\s-1AGGT\s0\*(R" and \*(L"\s-1ATGT\s0\*(R" but not \*(L"\s-1ACTT\s0\*(R". The number of 1s in the pattern is known as the \*(L"weight\*(R" of the pattern, and the number of 1s and 0s combined is the \*(L"length\*(R" of the pattern. Murasaki allows the use of any arbitrary pattern expressed as a string of 1s and 0s, and also interprets patterns of the form \*(L"x:y\*(R" to mean a "random pattern of weight \fIx\fR and length \fIy\fR." .PP The choice of pattern obviously has an impact on sensitivity and specificity, but whether one pattern is \*(L"better\*(R" than another depends on the application and the input sequences under consideration. Calcuating \*(L"maximally sensitive spaced seed patterns\*(R" is a computationally difficult problem and there are a number of research papers describing various methods for approximation (\*(L"\s-1RELATED\s0 \s-1READING\s0\*(R"). In general, however, \*(L"heavier\*(R" spaced seed patterns are less sensitive, but more specific, than lighter seeds. Anecdotally we find that seeds with weights approximately 60% to 75% (with lengths around 24 for bacteria, and 36 to 48 for mammals) are good for most applications. Extremely similar species (for example human and chimp) benefit from longer, heavier, seeds. .SS "\s-1HASH\s0 \s-1FUNCTIONS\s0" .IX Subsection "HASH FUNCTIONS" Hash functions (as well as hash parameters) are generated automatically based the system environment and input sequences. There are essentially two types of hash functions available in Murasaki: adaptive and cryptoraphic hashes. 
The adaptive hashes are \s-1XOR\s0 combinations of various bitwise shifts of the seed designed by analyzing the \fBspaced seed pattern\fR to maximize the entropy of the resulting hash. Cryptographic hashes are available via the CryptoPP library and use the \fIentire\fR spaced seed pattern to generate a hash using one of the common cryptographic hashes like \s-1MD5\s0 or \s-1SHA\-1\s0. The adaptive hash functions are almost always faster and more efficient than \s-1MD5\s0 and \s-1SHA\-1\s0, but the cryptographic functions are available for reference and may be useful as an alternative in the unlikely event you're dealing with an environment where the adaptive hasher is unsuitable (for example a sequence consisting of only A and T (leaving 1 out of every 2 bits unitilized)). .SS "\s-1MEMORY\s0 \s-1SCALING\s0" .IX Subsection "MEMORY SCALING" Murasaki can take a lot of memory. Storing the location of each seed in the hash table is the most costly part of the operation, requiring approximately \f(CW\*(C`ceil(log_2(N))\*(C'\fR bits per seed where \f(CW\*(C`N\*(C'\fR is the total sequence length. Locations are, by default, stored in a bitpacked format to approach theoretical minimum. The second most costly element is the hash table structure, where each bucket carries a small overhead and unused are simply wasted space. More hash table buckets (i.e. a longer hash table) decreases the expected number of collisions, leading to faster executation time. Therefore Murasaki tries to use as many buckets as possible by inspecting the available system memory and using as much as it can while still storing all the seed locations. If this automatic scaling is ineffective, setting the hash table size directly via the \-\-hashbits|\-b options can force a specific hash table size. If the memory of one computer is insufficient to store the desired hash table, \s-1PARALLELIZATION\s0 can be used to distribute the hash table across multiple computers. 
.SS "\s-1PARALLELIZATION\s0" .IX Subsection "PARALLELIZATION" Murasaki is designed to run in parallel using \s-1MPI\s0. Consult the documentation for the specific variations of your \s-1MPI\s0 implementation, however in general the executation method looks like: .PP .Vb 1 \& mpirun [MPI options] murasaki [murasaki options] \-p[pattern] [seq1 ...] .Ve .PP Murasaki in parallel divides the number of processors available (\s-1NP\s0) into two groups: hasher nodes and storage nodes. The storage nodes divide the hash table between each themselves, each being responsible for a different part of the table. Hasher nodes divide the input sequence in between themselves, each hashing a separate portion of the input sequence, and passing the seed location to the appropriate storage node for storage. When all the hasher nodes are finished hashing, the storage nodes scan their portion of hash table and pass matching sets of seeds to a hasher node where they are assembled into anchors and extended. Finally all the hasher nodes combine their independent anchor sets into one final set in \f(CW\*(C`ceil(log_2(H))\*(C'\fR iterations (where \f(CW\*(C`H\*(C'\fR is the number of hasher nodes), with each hasher node number 2h passing its anchors to hasher number 2h\-1 at each iteration. .PP Because almost none of the parallelization steps require communication between \fIall\fR nodes, and each seed and each anchor can be processed in parallel, Murasaki scales very well in parallel, running approximately twice as fast when twice as many nodes are available. Furthermore, the hash table is automatically grown to take advantage of the combined memory from multiple machines. .SH "OPTIONS" .IX Header "OPTIONS" Most options can be specified in their long form (e.g. \*(L"\-\-directory out\*(R" or \*(L"\-\-directory=out\*(R") or short form (e.g. \*(L"\-dout\*(R"). 
Options marked by expect a string, an integer, a float, and a boolean value (\*(L"yes/on/true/1\*(R" for true, \*(L"no/off/false/0\*(R" for false). Most booleans can omit the value, toggling the value from whatever it was to the opposite. .PP Murasaki has a lot of options. Here we've separated them into categories to help distinguish the scope of the various options, however in certain situations certain option choices may have onforseen consequences, and of course ultimately if the specified output is \fIhuge\fR, the required runtime will necessarily be \fIlong\fR. It is a mistake to think that everything outside of the \&\*(L"tuning options\*(R" in Performance section has no bearing on performance. .SS "Anchor parameter related options" .IX Subsection "Anchor parameter related options" These options shape what is considered an \*(L"anchor\*(R". .IP "\-\-pattern|\-p " 4 .IX Item "--pattern|-p " .Vb 3 \& specifies the seed pattern (eg. 11101001010011011). using the format \& C<[:]> automatically generates a random pattern of weight \& and length .Ve .IP "\-\-repeatmask|\-r " 4 .IX Item "--repeatmask|-r " Skip repeat masked data (ie: lowercase atgc). Be aware that some sequence files are distributed purely in lower case. .IP "\-\-seedfilter|\-f " 4 .IX Item "--seedfilter|-f " Skip seeds that occur more than N times. Exceptionally slow. See \&\-\-hashfilter for a faster approximation. .IP "\-\-hashfilter|\-m " 4 .IX Item "--hashfilter|-m " Like \-\-seedfilter but works on hash keys instead of seeds. May cause some collateral damage to otherwise unique seeds, but it's faster. .IP "\-\-skipfwd|\-F " 4 .IX Item "--skipfwd|-F " Don't hash/match the forward strands. .IP "\-\-skiprev|\-R " 4 .IX Item "--skiprev|-R " Don't hash/match the reverse complement strands. .IP "\-\-skip1to1|\-1 " 4 .IX Item "--skip1to1|-1 " Skip matches along the 1:1 line (good for comparing to self). .IP "\-\-hashonly|\-Q " 4 .IX Item "--hashonly|-Q " Hash Only. No anchor output, just statistics. 
.IP "\-\-hashskip|\-S " 4 .IX Item "--hashskip|-S " Hashes every n bases. Default is 1 (i.e. hashing all positions). Not supplying any argument increments the skip amount by 1. .IP "\-\-join|\-j " 4 .IX Item "--join|-j " Join anchors within n bases of eachother (default: 0). Specifying a negative D implies \-D*patternLength. .IP "\-\-bitscore|\-B " 4 .IX Item "--bitscore|-B " toggles compututation of a bitscore for all anchors (default is on). .IP "\-\-seedterms|\-T " 4 .IX Item "--seedterms|-T " toggles retention of seed terms (defaults to off). These are necessary for computing TF-IDF scores). .IP "\-\-sectime|\-e " 4 .IX Item "--sectime|-e " Always display times in seconds as opposed to human readable \*(L"1d 3h 45m 5s\*(R" style times. .IP "\-\-mergefilter|\-Y " 4 .IX Item "--mergefilter|-Y " Filter out matches which would would cause more than \fID\fR many anchors to be generated from 1 seed (default \-Y100). Use \-Y0 to disable. .IP "\-\-scorefilter " 4 .IX Item "--scorefilter " Set a minimum ungapped score for seeds. .IP "\-\-rifts|\-/ " 4 .IX Item "--rifts|-/ " Allow anchors to skip D sequences (default 0). .IP "\-\-islands|\-% " 4 .IX Item "--islands|-% " Same as \-\-rifts=S\-D (where S is number of input seqs). .IP "\-\-fuzzyextend|\-z " 4 .IX Item "--fuzzyextend|-z " Enable (default) or disable fuzzy extension (i.e. ungapped alignment) of anchors. .IP "\-\-fuzzyextendlosslimit|\-Z " 4 .IX Item "--fuzzyextendlosslimit|-Z " Set the cutoff at which to stop extending fuzzy hits (ie. the \s-1BLAST\s0 X parameter). .IP "\-\-gappedanchors " 4 .IX Item "--gappedanchors " Use gapped (true) or ungapped (false (default)) anchors. .IP "\-\-scorebyminimumpair " 4 .IX Item "--scorebyminimumpair " Do anchor scoring by minimum pair when appropriate (default). Alternative is arithmatic mean (seldom useful, but theoretically faster). =item \-\-rifts|\-/ .Sp Allow anchors to skip D sequences (default 0). 
.IP "\-\-islands|\-% " 4 .IX Item "--islands|-% " Same as \-\-rifts=S\-D (where S is number of input seqs). .IP "\-\-fuzzyextend|\-z " 4 .IX Item "--fuzzyextend|-z " Enable (default) or disable fuzzy extension (i.e. ungapped alignment) of anchors. .IP "\-\-fuzzyextendlosslimit|\-Z " 4 .IX Item "--fuzzyextendlosslimit|-Z " Set the cutoff at which to stop extending fuzzy hits (ie. the \s-1BLAST\s0 X parameter). .IP "\-\-gappedanchors " 4 .IX Item "--gappedanchors " Use gapped (true) or ungapped (false (default)) anchors. .IP "\-\-scorebyminimumpair " 4 .IX Item "--scorebyminimumpair " Do anchor scoring by minimum pair when appropriate (default). Alternative is arithmatic mean (seldom useful, but theoretically faster). .SS "Output options" .IX Subsection "Output options" These options primarily affect what data is output where. .IP "\-\-directory|\-d " 4 .IX Item "--directory|-d " .Vb 1 \& output directory (default: output) .Ve .IP "\-\-name|\-n " 4 .IX Item "--name|-n " .Vb 1 \& alignment name (default: test) .Ve .IP "\-\-repeatmap|\-i " 4 .IX Item "--repeatmap|-i " Toggles keeping of a repeat map when \-\-mergefilter is used (defaults to yes). .IP "\-\-histogram|\-H " 4 .IX Item "--histogram|-H " Histogram computation level: (\-H alone implies \-H1) .RS 4 .IP "0 \- no histogram (default)" 4 .IX Item "0 - no histogram (default)" .PD 0 .IP "1 \- basic bucketsize/bucketcount histogram data" 4 .IX Item "1 - basic bucketsize/bucketcount histogram data" .IP "2 \- bucket-based scores to anchors.detils" 4 .IX Item "2 - bucket-based scores to anchors.detils" .IP "3 \- perbucket count data" 4 .IX Item "3 - perbucket count data" .IP "4 \- perbucket + perpattern count data" 4 .IX Item "4 - perbucket + perpattern count data" .RE .RS 4 .PD .Sp Any values above 2 are purely explorartory and can result in massive output files. .RE .IP "\-\-tfidf|\-k " 4 .IX Item "--tfidf|-k " Perform accurate tfidf scoring from within murasaki (requires extra memory at anchor generation time). 
Default is no. .SS "Performance/tuning options" .IX Subsection "Performance/tuning options" These options primarily affect performance, and don't (in general) impact output. .IP "\-\-quickhash|\-q " 4 .IX Item "--quickhash|-q " .Vb 1 \& specify a hashing function: .Ve .RS 4 .IP "0 \- adaptive with S\-boxes (default when there's plenty of hash table to spare)" 4 .IX Item "0 - adaptive with S-boxes (default when there's plenty of hash table to spare)" .PD 0 .IP "1 \- don't pack bits to make hash (use first word only)" 4 .IX Item "1 - don't pack bits to make hash (use first word only)" .IP "2 \- naively use the first hashbits worth of pattern" 4 .IX Item "2 - naively use the first hashbits worth of pattern" .IP "3 \- adaptivevely find a good hash (default)" 4 .IX Item "3 - adaptivevely find a good hash (default)" .IP "**experimental CryptoPP hashes**" 4 .IX Item "**experimental CryptoPP hashes**" .IP "4 \- \s-1MD5\s0" 4 .IX Item "4 - MD5" .IP "5 \- \s-1SHA1\s0" 4 .IX Item "5 - SHA1" .IP "6 \- Whirlpool" 4 .IX Item "6 - Whirlpool" .IP "7 \- \s-1CRC\-32\s0" 4 .IX Item "7 - CRC-32" .IP "8 \- Adler\-32" 4 .IX Item "8 - Adler-32" .RE .RS 4 .PD .Sp Note: 3 and 0 are the only \*(L"recommended\*(R" hash functions, and the only ones automatically selected. The others are provided merely for reference. 1, 7, and 8 aren't even expected to utilize the entire hash space. .RE .IP "\-\-hashbits|\-b " 4 .IX Item "--hashbits|-b " use D bit hashes (for n's of 1 to \s-1WORDSIZE\s0. 
default 26) .IP "\-\-hashtype|\-t " 4 .IX Item "--hashtype|-t " select hash table data structure to use: .RS 4 .IP "OpenHash \- open sub-word packing of hashbits (default when there's plenty of hash table to spare)" 4 .IX Item "OpenHash - open sub-word packing of hashbits (default when there's plenty of hash table to spare)" .PD 0 .IP "EcoHash \- chained sub-word packing of hashbits (default)" 4 .IX Item "EcoHash - chained sub-word packing of hashbits (default)" .IP "ArrayHash \- malloc/realloc (fast but fragmentation-prone)" 4 .IX Item "ArrayHash - malloc/realloc (fast but fragmentation-prone)" .IP "MSetHash \- memory exorbanant, almost pointless." 4 .IX Item "MSetHash - memory exorbanant, almost pointless." .RE .RS 4 .RE .IP "\-\-probing " 4 .IX Item "--probing " .PD 0 \- linear, 1 \- quadratic (default). Only applicable for \-\-hashtype=OpenHash. .IP "\-\-hitfilter|\-h " 4 .IX Item "--hitfilter|-h " Minimum number of hits to be outputted as an anchor (default 1). In PatternHunter this is 2. .IP "\-\-rseed|\-s " 4 .IX Item "--rseed|-s " Random number seed for non-deterministic algorithms (ie: adative hash function generation). If you're doing any performance comparisons, it's probably imperative that you use the same seed for each run of the same settings. Default is obtained from \fItime()\fR (ie: seconds since 1970). .IP "\-\-memory|\-M [|]" 4 .IX Item "--memory|-M [|]" Set the target amount of total memory (either in gb or as % total memory). .IP "\-\-reverseotf|\-o " 4 .IX Item "--reverseotf|-o " Generate reverse complement on the fly (defaults to on). Turning this off precomputes the all reverse complement strands and stores them in memory, which rarely provides a measurable performance improvement. .IP "\-\-binaryseq " 4 .IX Item "--binaryseq " Enable (default) or disable binary sequence read/write .PP \fIAdaptive hash function related:\fR .IX Subsection "Adaptive hash function related:" .PP Performance options related to adaptive hash function generation. 
.IP "\-\-hasherFairEntropy " 4 .IX Item "--hasherFairEntropy " Use more balanced entropy estimation (default: yes). .IP "\-\-hasherCorrelationAdjust " 4 .IX Item "--hasherCorrelationAdjust " Adjust entropy estimates for nearby sources assuming some correlation (default: yes). .IP "\-\-hasherTargetGACycles " 4 .IX Item "--hasherTargetGACycles " Adaptive hash function generation genetic algorithm cycle cutoff. .IP "\-\-hasherEntropyAgro " 4 .IX Item "--hasherEntropyAgro " How aggressive to be about pursuing maximum entropy hash functions (takes a real. default is 1). .SS "\s-1MPI\s0 Specific:" .IX Subsection "MPI Specific:" .IP "\-\-hashers|\-A [|]" 4 .IX Item "--hashers|-A [|]" Specify the number of processes to be used as hashers (only applies to \s-1MPI\s0. If a number between 0 and 1 it refers to a ratio of np). .IP "\-\-localhash|\-K " 4 .IX Item "--localhash|-K " Perform hashing locally on each storage node rather than sending it over the network (helpful for slow networks). .IP "\-\-mpidistro|\-L " 4 .IX Item "--mpidistro|-L " Toggles use of \s-1MPI\s0 to distribute sequence data over (if the sequence is available on local disk on each node then turning this off may potentially accerlate the intial sequence loading). .IP "\-\-waittoanchor|\-w " 4 .IX Item "--waittoanchor|-w " Postpone actual anchor computation until all location sets have been received (as opposed to trying to work between receiving seed packets). .IP "\-\-buffers|\-u " 4 .IX Item "--buffers|-u " Maximum number of unfinished buffers to allow while message passing (0 means unlimited). Default is set based on the number of nodes participating. \s-1MPI\s0 can crash or perform \fIvery\fR poorly if this value is too high. .IP "\-\-nobuffers|\-U " 4 .IX Item "--nobuffers|-U " Same as \-\-buffers=1. .IP "\-\-bigfirst|\-I " 4 .IX Item "--bigfirst|-I " Assign hashers to large memory nodes first. 
.IP "\-\-hostbalance|\-l " 4 .IX Item "--hostbalance|-l " .RS 4 .PD 0 .IP "If yes (default): spread out hashers evenly among all nodes." 4 .IX Item "If yes (default): spread out hashers evenly among all nodes." .IP "If no: ignore host name when assigning jobs." 4 .IX Item "If no: ignore host name when assigning jobs." .RE .RS 4 .RE .IP "\-\-memorybalance|\-a " 4 .IX Item "--memorybalance|-a " .RS 4 .IP "If yes (deafult): balance hash storage between nodes based on the amount of available ram." 4 .IX Item "If yes (deafult): balance hash storage between nodes based on the amount of available ram." .IP "If no: distribute storage evently. This more likely to achieve optimal run times, but might not utilize memory as efficiently." 4 .IX Item "If no: distribute storage evently. This more likely to achieve optimal run times, but might not utilize memory as efficiently." .RE .RS 4 .RE .IP "\-\-distmerge|\-< " 4 .IX Item "--distmerge|-< " .RS 4 .IP "if yes (default): during the merge step, storage nodes send seeds to any available hasher." 4 .IX Item "if yes (default): during the merge step, storage nodes send seeds to any available hasher." .IP "if no: send all seeds to one node only." 4 .IX Item "if no: send all seeds to one node only." .RE .RS 4 .RE .IP "\-\-distcollect|\-> " 4 .IX Item "--distcollect|-> " .RS 4 .IP "if yes (default): collect anchor data from all hashers." 4 .IX Item "if yes (default): collect anchor data from all hashers." .IP "if no: send all seeds to the final assembly node only." 4 .IX Item "if no: send all seeds to the final assembly node only." .RE .RS 4 .RE .IP "\-\-mpiredirectoutput " 4 .IX Item "--mpiredirectoutput " .RS 4 .IP "if yes (default): each rank redirects its stdout/stderr to a separate file (murasaki\-mpiout\-\fIN\fR)." 4 .IX Item "if yes (default): each rank redirects its stdout/stderr to a separate file (murasaki-mpiout-N)." 
.IP "if no: do what comes naturally (ie: managed by mpirun (for OpenMPI see \-\-output\-filename and \-\-tag\-output in \fImpirun\fR\|(1)))." 4 .IX Item "if no: do what comes naturally (ie: managed by mpirun (for OpenMPI see --output-filename and --tag-output in mpirun))." .RE .RS 4 .RE .IP "\-\-keepstdoe " 4 .IX Item "--keepstdoe " .PD Don't erase the murasaki-mpiout files on success. .IP "\-\-sysvipc|\-V " 4 .IX Item "--sysvipc|-V " Use System V \s-1IPC\s0 to negotiate shared memory regions (saves memory when one host runs multiple nodes). Default is true. .SS "Universal options:" .IX Subsection "Universal options:" .IP "\-\-verbose|\-v" 4 .IX Item "--verbose|-v" Increases verbosity. .IP "\-\-version|\-V" 4 .IX Item "--version|-V" Prints version information and quits. .IP "\-\-help|\-?" 4 .IX Item "--help|-?" Prints a help message and quits. .SH "FILE FORMATS" .IX Header "FILE FORMATS" Murasaki has a wide array of output files, the formats of most of which are intended to be intuitive. All output files are prefixed by the value of the \-\-name parameter. The primary output file formats are described here. Files are line based and tab delimited unless otherwise specified. .SS ".seqs" .IX Subsection ".seqs" The .seqs shows what sequences were used as input, 1 per line. This file gets used by various programs in conjunction with the .anchors file, so it's generally important that the contents reflect the correct sequence files. Moving anchor results between computers might result in a change of paths, requiring the user to update the .seqs file. As an alternative, always using relative paths can alleviate this problem. .SS ".anchors files" .IX Subsection ".anchors files" These files are 1 anchor per line, with a 3\-tuple per sequence. Each touple represents the start and stop coordinates and strand of the anchored interval on each sequence. The sequence order matches that of the order in the .seqs file. 
The coordinates are structured such that 1 refers to the first base in the sequence, 2 to the second, etc. Negative values refer to the reverse complement sequence where \-1 is the \fIlast\fR base of the reverse complement sequence (ie: the complement first base in the forward sequence). The \*(L"strand\*(R" element is a '+' or '\-' that merely matches the sign of the coordinates (this is redundant information, but kept to make parsing or filtering simpler). .PP For examle: .PP .Vb 1 \& 1 18 + \-1 \-18 \- .Ve .PP This line describes an anchor where the first 18 bases of the first sequence match the first 18 bases of the reverse complement of the second sequence. .SS ".anchors.details" .IX Subsection ".anchors.details" This is an antiquated file format, but used by \s-1GMV\s0 to calculate statistics like TF-IDF scores, and has been kept around for that reason. The .anchors.details file has the same format and information as the .anchors file, however after the anchor touples are two more terms: a score, and a comma (,) delimited list of term and count pairs (written \*(L"term:count\*(R"). The score and count data might be varied depending on the \f(CW\*(C`\-\-histogram\*(C'\fR option choices. .SS ".anchors.bitscore" .IX Subsection ".anchors.bitscore" The term \*(L"bitscore\*(R" here is a misnomer, but maintained for historical reasons. In reality, this file contains the mean number of matching bases and length of each anchor (corresponding line by line to the \&.anchors file). .SS ".stats.tfidf" .IX Subsection ".stats.tfidf" Contains anchor TF-IDF scores (corresponding line by line to the \&.anchors file). .SS ".histogram" .IX Subsection ".histogram" Contains a simple histogram of the hash table usage. The first field is the bucket size, and the second is the frequency. For example a .histogram file like this: .PP .Vb 2 \& 1 24 \& 2 1 .Ve .PP Would indicate that there were 24 hash buckets that stored only 1 location (i.e. 
24 unique seeds), and 1 hash bucket stored 2 locations (i.e. 1 seed that matched 2 locations (or 2 non-matching seeds that resulted in a hash collision)). .SS ".options" .IX Subsection ".options" Maintains a record of the options used when running Murasaki. .SS ".repeats" .IX Subsection ".repeats" The .repeats file stores a record of \*(L"repeats\*(R" as defined by the \&\-\-mergefilter option (i.e. seeds that would have have induced more anchors than permitted). In this file, each repeat record is separated by a blank line. A repeat record looks like this: .PP .Vb 3 \& R: G.GCCTTT.T.ACT.CACAA..AT \& 0: 2145540494 \-425039256 \-113794380 1998323403 \& 1: 2480929222 \-1874514626 2543723555 \-2550045172 .Ve .PP The first line (always prefixed \*(L"R:\*(R") shows the repeating seed itself (where the . are the bases masked by the pattern). The subsequent lines show where these seeds occured in the input sequences (in the first (0) and second (1) sequences). Note that if there are no hits in a particular sequence, it doesn't include a blank line for that sequence. For example: .PP .Vb 3 \& R: G.GCCTTT.T.ACT.CACAA..AT \& 0: 2145540494 \-425039256 \-113794380 1998323403 \& 2: 2480929222 \-1874514626 2543723555 \-2550045172 .Ve .PP is also a valid .repeats file. .SH "LICENSE" .IX Header "LICENSE" \&\s-1GNU\s0 General Public License, version 3 (GPLv3) .SH "AVAILABILITY" .IX Header "AVAILABILITY" .SH "AUTHOR" .IX Header "AUTHOR" Kris Popendorf .SH "SEE ALSO" .IX Header "SEE ALSO" \&\fImbfa\fR\|(1), \fIgeneparse\fR\|(1) .SS "\s-1RELATED\s0 \s-1READING\s0" .IX Subsection "RELATED READING" .ie n .IP "M. Csuros and B. Ma, ""Rapid Homology Search with Two-Stage Extension and Daughter Seeds"" (2005)." 4 .el .IP "M. Csuros and B. Ma, ``Rapid Homology Search with Two-Stage Extension and Daughter Seeds'' (2005)." 4 .IX Item "M. Csuros and B. Ma, Rapid Homology Search with Two-Stage Extension and Daughter Seeds (2005)." .PD 0 .ie n .IP "F. P. Preparata and L. Zhang and K. W. 
Choi, ""Quick, practical selection of effective seeds for homology search"" (2005)." 4 .el .IP "F. P. Preparata and L. Zhang and K. W. Choi, ``Quick, practical selection of effective seeds for homology search'' (2005)." 4 .IX Item "F. P. Preparata and L. Zhang and K. W. Choi, Quick, practical selection of effective seeds for homology search (2005)." .ie n .IP "\s-1KP\s0 Choi, et. al., ""Good spaced seeds for homology search"" (2004)." 4 .el .IP "\s-1KP\s0 Choi, et. al., ``Good spaced seeds for homology search'' (2004)." 4 .IX Item "KP Choi, et. al., Good spaced seeds for homology search (2004)." murasaki/doc/INSTALLATION0000644000177700001440000000507711434752236014363 0ustar krispusers __ __ _ _ | \/ |_ _ _ __ __ _ ___ __ _| | _(_) | |\/| | | | | '__/ _` / __|/ _` | |/ / | | | | | |_| | | | (_| \__ \ (_| | <| | |_| |_|\__,_|_| \__,_|___/\__,_|_|\_\_| * Requirements Boost (http://www.boost.org/) to build/run the core Murasaki algorithm ** Optional requirements Perl (http://www.perl.org/) to assist in the build process and to interface with some of the optional packages described below. CryptoPP (http://www.cryptopp.com/) (optional, but enabled by default) provides CPU specific enhancements. If you don't want to use CryptoPP you can disable it any of the following ways: compiling via a command like "make WITH_LIBCRYPTOPP=NO" * setting WITH_LIBCRYPTOPP=NO as an environment variable before running make * setting WITH_LIBCRYPTOPP=NO somewhere in the Makefile MPI. To use Murasaki in a cluster, you'll need some implementation of MPI. While Murasaki should be implementation agnostic, we've done most of our testing and tuning on OpenMPI (http://www.openmpi.org/) MPICH (http://www.mcs.anl.gov/mpi/mpich/) and MPICH-MX (http://www.myri.com/scs/download-mpichmx.html) are also tested and known to work. Murasaki interfaces with a lot of other free software to generate graphs and statistical information. 
To use all the features of Murasaki, you should also have: - BioPerl (http://www.bioperl.org/) is required by the annotation reading parts of the perl scripts. - R (http://www.r-project.org/) -- ROCR (http://rocr.bioinf.mpi-sb.mpg.de/) - gnuplot (http://www.gnuplot.info/) - ImageMagick (http://www.imagemagick.org/) * Build instructions Building under a debian based system (the intended audience) is very easy. Make sure you have the appropriate packages installed: -On Debian lenny: aptitude install libboost-dev libcrypto++-dev g++ make perl -On Debian squeeze: aptitude install libboost-all-dev libcrypto++-dev g++ make perl -On Mac OS X: port install gcc42 boost libcryptopp -- We recommend using Mac Ports (http://www.macports.org/) to get/build these libraries/packages If your system is already set up, once you've download one of the above packages, the following should work: cd murasaki make In general, the included Makefile should find everything it needs automatically (including detecting whether or not you have Crypto++ and an MPI compiler available), but if something fails, or something isn't detected automatically, feel free to edit the Makefile accordingly. If all else fails, email us (murasaki-users@lists.sourceforge.net) and we'll see what we can do to help. murasaki/doc/geneparse.pod0000644000177700001440000000250011434752236015200 0ustar krispusers=head1 NAME geneparse - sequence file loader frontend =head1 SYNAPSE geneparse [options...] [input] [input2 ...] #read input and write to stdout =head1 DESCRIPTION Reads a sequence file and writes it somewhere (by default to stdout). A specific range within an input file can be specified by file[start,stop]. The square brackets can be interchanged for any of {[()]}, (eg. "genome.fa[3000,4000]" or "genome.fa{3000~4000}"). Be aware that all of those might be parsed by your shell. Also any non-word character can be used to separate the numbers. 
=head1 OPTIONS =over =item --repeatmask|-r use soft-repeatmasked sequences (ie: replace lowercase bases with N's). =item --upper|--unmask|-U uppercase all bases. =item --length|-l just print the length and exit =item --clean|-c don't append a new line when finished =item --version|-V prints the program version =item --help|-h prints a help message =item --output|-o send output to a file (otherwise use stdout). --output implies --clean. =item --quiet|-q silence all warnings =item --verbose|-v prints lots of extra details =back =head1 LICENSE GNU General Public License, version 3 (GPLv3) =head1 AVAILABILITY L =head1 AUTHOR Kris Popendorf =head1 SEE ALSO murasaki(1), geneparse(1) murasaki/doc/mbfa.html0000644000177700001440000000674611434752236014336 0ustar krispusers mbfa

NAME

mbfa - Murasaki Binary FASTA format converter


SYNOPSIS

 mbfa [options...] [input] [input2 ...] #convert [input] and [input2 ...] to Murasaki Binary FASTA files


DESCRIPTION

Murasaki processes sequence data using a 2-bit format where each base is represented using 2 bits. There's a number of pre-existing formats that do similar things, however in particular Murasaki needs to know about the metadata that can't be expressed in just 2 bits (eg. where sequences of NNNNs are, sequence breaks when multiple sequences are included in a FASTA file, etc.), therefore the MBFA format includes this data as well. Ordinarily these files are generated automatically by Murasaki when first run on a new sequence.

Because the file format is designed to mesh closely with Murasaki, the actual file extension will vary to reflect your architecture. It will generally be some form of .mbfa[48][48] (e.g. .mbfa88 (the default gcc build on an amd64)).


OPTIONS

--info|-i

Show metadata about each MBFA specified.

--force|-f

By default mbfa will skip files that already have recent .mbfa files. This option forces the regeneration of these files.

--fatal|-F

Makes errors fatal. Ordinarily if you specify multiple files, mbfa will try to convert all of them even if one fails emitting a warning. With --fatal it will stop and exit with an error if there's a problem.

--fasta|-A

Generates FASTA output based on the MBFA data to stdout.

--help|-h, --version|-V, --verbose|-v

What you'd expect.


LICENSE

GNU General Public License, version 3 (GPLv3)


AVAILABILITY

http://murasaki.sourceforge.net


AUTHOR

Kris Popendorf <krisp@dna.bio.keio.ac.jp>


SEE ALSO

murasaki(1), geneparse(1)

murasaki/doc/murasaki.pod0000644000177700001440000005077311434752236015062 0ustar krispusers=head1 NAME murasaki - compute anchors between multiple sequences =head1 SYNOPSIS murasaki [OPTIONS] -p[pattern] seq1.fa seq2.gbk [seq3.raw ...] #compute anchors between seq1.fa and seq2.gbk using [pattern] mpirun murasaki [OPTIONS] -p[pattern] seq1.fa seq2.gbk [seq3.raw ...] #compute anchors between seq1.fa and seq2.gbk using [pattern] in parallel via MPI =head1 DESCRIPTION Murasaki generates anchors based on all supplied sequences based on the user supplied B and hash tables. Essentially each base of each sequence is masked by the pattern, forming a B that is used to generate a hash. The location of the seed is stored in the B. Once all seeds have been hashed and stored, Murasaki scans the hash table, generating anchors for all matching seeds. An anchor refers to a set intervals across a subset of the input sequences. These are stored in B.anchors files, and described in L. By default anchors are maximally extended until their minimum pairwise ungapped alignment score drops below a threshold in the same fashion the X-drop parameter in BLAST and BLAST-like searches. =head2 PATTERNS Murasaki uses B to in considering seeds. A B is typically expressed as a string of 1s and 0s necessarily starting and ending with a 1. 1s indicate that this base is considered part of the seed, while bases at 0 positions are not. For example with a pattern "1011" the sequence "ACGT" would match sequences "AGGT" and "ATGT" but not "ACTT". The number of 1s in the pattern is known as the "weight" of the pattern, and the number of 1s and 0s combined is the "length" of the pattern. Murasaki allows the use of any arbitrary pattern expressed as a string of 1s and 0s, and also interprets patterns of the form "x:y" to mean a "random pattern of weight I and length I." 
The choice of pattern obviously has an impact on sensitivity and specificity, but whether one pattern is "better" than another depends on the application and the input sequences under consideration. Calcuating "maximally sensitive spaced seed patterns" is a computationally difficult problem and there are a number of research papers describing various methods for approximation (L). In general, however, "heavier" spaced seed patterns are less sensitive, but more specific, than lighter seeds. Anecdotally we find that seeds with weights approximately 60% to 75% (with lengths around 24 for bacteria, and 36 to 48 for mammals) are good for most applications. Extremely similar species (for example human and chimp) benefit from longer, heavier, seeds. =head2 HASH FUNCTIONS Hash functions (as well as hash parameters) are generated automatically based the system environment and input sequences. There are essentially two types of hash functions available in Murasaki: adaptive and cryptoraphic hashes. The adaptive hashes are XOR combinations of various bitwise shifts of the seed designed by analyzing the B to maximize the entropy of the resulting hash. Cryptographic hashes are available via the CryptoPP library and use the I spaced seed pattern to generate a hash using one of the common cryptographic hashes like MD5 or SHA-1. The adaptive hash functions are almost always faster and more efficient than MD5 and SHA-1, but the cryptographic functions are available for reference and may be useful as an alternative in the unlikely event you're dealing with an environment where the adaptive hasher is unsuitable (for example a sequence consisting of only A and T (leaving 1 out of every 2 bits unitilized)). =head2 MEMORY SCALING Murasaki can take a lot of memory. Storing the location of each seed in the hash table is the most costly part of the operation, requiring approximately C bits per seed where C is the total sequence length. 
Locations are, by default, stored in a bitpacked format to approach theoretical minimum. The second most costly element is the hash table structure, where each bucket carries a small overhead and unused are simply wasted space. More hash table buckets (i.e. a longer hash table) decreases the expected number of collisions, leading to faster executation time. Therefore Murasaki tries to use as many buckets as possible by inspecting the available system memory and using as much as it can while still storing all the seed locations. If this automatic scaling is ineffective, setting the hash table size directly via the --hashbits|-b options can force a specific hash table size. If the memory of one computer is insufficient to store the desired hash table, L can be used to distribute the hash table across multiple computers. =head2 PARALLELIZATION Murasaki is designed to run in parallel using MPI. Consult the documentation for the specific variations of your MPI implementation, however in general the executation method looks like: mpirun [MPI options] murasaki [murasaki options] -p[pattern] [seq1 ...] Murasaki in parallel divides the number of processors available (NP) into two groups: hasher nodes and storage nodes. The storage nodes divide the hash table between each themselves, each being responsible for a different part of the table. Hasher nodes divide the input sequence in between themselves, each hashing a separate portion of the input sequence, and passing the seed location to the appropriate storage node for storage. When all the hasher nodes are finished hashing, the storage nodes scan their portion of hash table and pass matching sets of seeds to a hasher node where they are assembled into anchors and extended. Finally all the hasher nodes combine their independent anchor sets into one final set in C iterations (where C is the number of hasher nodes), with each hasher node number 2h passing its anchors to hasher number 2h-1 at each iteration. 
Because almost none of the parallelization steps require communication between I nodes, and each seed and each anchor can be processed in parallel, Murasaki scales very well in parallel, running approximately twice as fast when twice as many nodes are available. Furthermore, the hash table is automatically grown to take advantage of the combined memory from multiple machines. =head1 OPTIONS Most options can be specified in their long form (e.g. "--directory out" or "--directory=out") or short form (e.g. "-dout"). Options marked by expect a string, an integer, a float, and a boolean value ("yes/on/true/1" for true, "no/off/false/0" for false). Most booleans can omit the value, toggling the value from whatever it was to the opposite. Murasaki has a lot of options. Here we've separated them into categories to help distinguish the scope of the various options, however in certain situations certain option choices may have onforseen consequences, and of course ultimately if the specified output is I, the required runtime will necessarily be I. It is a mistake to think that everything outside of the L section has no bearing on performance. =head2 Anchor parameter related options These options shape what is considered an "anchor". =over =item --pattern|-p specifies the seed pattern (eg. 11101001010011011). using the format C<[:]> automatically generates a random pattern of weight and length =item --repeatmask|-r Skip repeat masked data (ie: lowercase atgc). Be aware that some sequence files are distributed purely in lower case. =item --seedfilter|-f Skip seeds that occur more than N times. Exceptionally slow. See --hashfilter for a faster approximation. =item --hashfilter|-m Like --seedfilter but works on hash keys instead of seeds. May cause some collateral damage to otherwise unique seeds, but it's faster. =item --skipfwd|-F Don't hash/match the forward strands. =item --skiprev|-R Don't hash/match the reverse complement strands. 
=item --skip1to1|-1 Skip matches along the 1:1 line (good for comparing to self). =item --hashonly|-Q Hash Only. No anchor output, just statistics. =item --hashskip|-S Hashes every n bases. Default is 1 (i.e. hashing all positions). Not supplying any argument increments the skip amount by 1. =item --join|-j Join anchors within n bases of eachother (default: 0). Specifying a negative D implies -D*patternLength. =item --bitscore|-B toggles compututation of a bitscore for all anchors (default is on). =item --seedterms|-T toggles retention of seed terms (defaults to off). These are necessary for computing TF-IDF scores). =item --sectime|-e Always display times in seconds as opposed to human readable "1d 3h 45m 5s" style times. =item --mergefilter|-Y Filter out matches which would would cause more than I many anchors to be generated from 1 seed (default -Y100). Use -Y0 to disable. =item --scorefilter Set a minimum ungapped score for seeds. =item --rifts|-/ Allow anchors to skip D sequences (default 0). =item --islands|-% Same as --rifts=S-D (where S is number of input seqs). =item --fuzzyextend|-z Enable (default) or disable fuzzy extension (i.e. ungapped alignment) of anchors. =item --fuzzyextendlosslimit|-Z Set the cutoff at which to stop extending fuzzy hits (ie. the BLAST X parameter). =item --gappedanchors Use gapped (true) or ungapped (false (default)) anchors. =item --scorebyminimumpair Do anchor scoring by minimum pair when appropriate (default). Alternative is arithmatic mean (seldom useful, but theoretically faster). =item --rifts|-/ Allow anchors to skip D sequences (default 0). =item --islands|-% Same as --rifts=S-D (where S is number of input seqs). =item --fuzzyextend|-z Enable (default) or disable fuzzy extension (i.e. ungapped alignment) of anchors. =item --fuzzyextendlosslimit|-Z Set the cutoff at which to stop extending fuzzy hits (ie. the BLAST X parameter). =item --gappedanchors Use gapped (true) or ungapped (false (default)) anchors. 
=item --scorebyminimumpair Do anchor scoring by minimum pair when appropriate (default). Alternative is arithmatic mean (seldom useful, but theoretically faster). =back =head2 Output options These options primarily affect what data is output where. =over =item --directory|-d output directory (default: output) =item --name|-n alignment name (default: test) =item --repeatmap|-i Toggles keeping of a repeat map when --mergefilter is used (defaults to yes). =item --histogram|-H Histogram computation level: (-H alone implies -H1) =over =item 0 - no histogram (default) =item 1 - basic bucketsize/bucketcount histogram data =item 2 - bucket-based scores to anchors.detils =item 3 - perbucket count data =item 4 - perbucket + perpattern count data =back Any values above 2 are purely explorartory and can result in massive output files. =item --tfidf|-k Perform accurate tfidf scoring from within murasaki (requires extra memory at anchor generation time). Default is no. =back =head2 Performance/tuning options These options primarily affect performance, and don't (in general) impact output. =over =item --quickhash|-q specify a hashing function: =over =item 0 - adaptive with S-boxes (default when there's plenty of hash table to spare) =item 1 - don't pack bits to make hash (use first word only) =item 2 - naively use the first hashbits worth of pattern =item 3 - adaptivevely find a good hash (default) =item **experimental CryptoPP hashes** =item 4 - MD5 =item 5 - SHA1 =item 6 - Whirlpool =item 7 - CRC-32 =item 8 - Adler-32 =back Note: 3 and 0 are the only "recommended" hash functions, and the only ones automatically selected. The others are provided merely for reference. 1, 7, and 8 aren't even expected to utilize the entire hash space. =item --hashbits|-b use D bit hashes (for n's of 1 to WORDSIZE. 
default 26) =item --hashtype|-t select hash table data structure to use: =over =item OpenHash - open sub-word packing of hashbits (default when there's plenty of hash table to spare) =item EcoHash - chained sub-word packing of hashbits (default) =item ArrayHash - malloc/realloc (fast but fragmentation-prone) =item MSetHash - memory exorbanant, almost pointless. =back =item --probing 0 - linear, 1 - quadratic (default). Only applicable for --hashtype=OpenHash. =item --hitfilter|-h Minimum number of hits to be outputted as an anchor (default 1). In PatternHunter this is 2. =item --rseed|-s Random number seed for non-deterministic algorithms (ie: adative hash function generation). If you're doing any performance comparisons, it's probably imperative that you use the same seed for each run of the same settings. Default is obtained from time() (ie: seconds since 1970). =item --memory|-M [|] Set the target amount of total memory (either in gb or as % total memory). =item --reverseotf|-o Generate reverse complement on the fly (defaults to on). Turning this off precomputes the all reverse complement strands and stores them in memory, which rarely provides a measurable performance improvement. =item --binaryseq Enable (default) or disable binary sequence read/write =back =head3 Adaptive hash function related: Performance options related to adaptive hash function generation. =over =item --hasherFairEntropy Use more balanced entropy estimation (default: yes). =item --hasherCorrelationAdjust Adjust entropy estimates for nearby sources assuming some correlation (default: yes). =item --hasherTargetGACycles Adaptive hash function generation genetic algorithm cycle cutoff. =item --hasherEntropyAgro How aggressive to be about pursuing maximum entropy hash functions (takes a real. default is 1). =back =head2 MPI Specific: =over =item --hashers|-A [|] Specify the number of processes to be used as hashers (only applies to MPI. If a number between 0 and 1 it refers to a ratio of np). 
=item --localhash|-K Perform hashing locally on each storage node rather than sending it over the network (helpful for slow networks). =item --mpidistro|-L Toggles use of MPI to distribute sequence data over (if the sequence is available on local disk on each node then turning this off may potentially accerlate the intial sequence loading). =item --waittoanchor|-w Postpone actual anchor computation until all location sets have been received (as opposed to trying to work between receiving seed packets). =item --buffers|-u Maximum number of unfinished buffers to allow while message passing (0 means unlimited). Default is set based on the number of nodes participating. MPI can crash or perform I poorly if this value is too high. =item --nobuffers|-U Same as --buffers=1. =item --bigfirst|-I Assign hashers to large memory nodes first. =item --hostbalance|-l =over =item If yes (default): spread out hashers evenly among all nodes. =item If no: ignore host name when assigning jobs. =back =item --memorybalance|-a =over =item If yes (deafult): balance hash storage between nodes based on the amount of available ram. =item If no: distribute storage evently. This more likely to achieve optimal run times, but might not utilize memory as efficiently. =back =item --distmerge|-< =over =item if yes (default): during the merge step, storage nodes send seeds to any available hasher. =item if no: send all seeds to one node only. =back =item --distcollect|-> =over =item if yes (default): collect anchor data from all hashers. =item if no: send all seeds to the final assembly node only. =back =item --mpiredirectoutput =over =item if yes (default): each rank redirects its stdout/stderr to a separate file (murasaki-mpiout-I). =item if no: do what comes naturally (ie: managed by mpirun (for OpenMPI see --output-filename and --tag-output in L)). =back =item --keepstdoe Don't erase the murasaki-mpiout files on success. 
=item --sysvipc|-V Use System V IPC to negotiate shared memory regions (saves memory when one host runs multiple nodes). Default is true. =back =head2 Universal options: =over =item --verbose|-v Increases verbosity. =item --version|-V Prints version information and quits. =item --help|-? Prints a help message and quits. =back =head1 FILE FORMATS Murasaki has a wide array of output files, the formats of most of which are intended to be intuitive. All output files are prefixed by the value of the --name parameter. The primary output file formats are described here. Files are line based and tab delimited unless otherwise specified. =head2 .seqs The .seqs shows what sequences were used as input, 1 per line. This file gets used by various programs in conjunction with the .anchors file, so it's generally important that the contents reflect the correct sequence files. Moving anchor results between computers might result in a change of paths, requiring the user to update the .seqs file. As an alternative, always using relative paths can alleviate this problem. =head2 .anchors files These files are 1 anchor per line, with a 3-tuple per sequence. Each touple represents the start and stop coordinates and strand of the anchored interval on each sequence. The sequence order matches that of the order in the .seqs file. The coordinates are structured such that 1 refers to the first base in the sequence, 2 to the second, etc. Negative values refer to the reverse complement sequence where -1 is the I base of the reverse complement sequence (ie: the complement first base in the forward sequence). The "strand" element is a '+' or '-' that merely matches the sign of the coordinates (this is redundant information, but kept to make parsing or filtering simpler). For examle: 1 18 + -1 -18 - This line describes an anchor where the first 18 bases of the first sequence match the first 18 bases of the reverse complement of the second sequence. 
=head2 .anchors.details This is an antiquated file format, but used by L to calculate statistics like TF-IDF scores, and has been kept around for that reason. The .anchors.details file has the same format and information as the .anchors file, however after the anchor touples are two more terms: a score, and a comma (,) delimited list of term and count pairs (written "term:count"). The score and count data might be varied depending on the C<--histogram> option choices. =head2 .anchors.bitscore The term "bitscore" here is a misnomer, but maintained for historical reasons. In reality, this file contains the mean number of matching bases and length of each anchor (corresponding line by line to the .anchors file). =head2 .stats.tfidf Contains anchor TF-IDF scores (corresponding line by line to the .anchors file). =head2 .histogram Contains a simple histogram of the hash table usage. The first field is the bucket size, and the second is the frequency. For example a .histogram file like this: 1 24 2 1 Would indicate that there were 24 hash buckets that stored only 1 location (i.e. 24 unique seeds), and 1 hash bucket stored 2 locations (i.e. 1 seed that matched 2 locations (or 2 non-matching seeds that resulted in a hash collision)). =head2 .options Maintains a record of the options used when running Murasaki. =head2 .repeats The .repeats file stores a record of "repeats" as defined by the --mergefilter option (i.e. seeds that would have have induced more anchors than permitted). In this file, each repeat record is separated by a blank line. A repeat record looks like this: R: G.GCCTTT.T.ACT.CACAA..AT 0: 2145540494 -425039256 -113794380 1998323403 1: 2480929222 -1874514626 2543723555 -2550045172 The first line (always prefixed "R:") shows the repeating seed itself (where the . are the bases masked by the pattern). The subsequent lines show where these seeds occured in the input sequences (in the first (0) and second (1) sequences). 
Note that if there are no hits in a particular sequence, it doesn't include a blank line for that sequence. For example: R: G.GCCTTT.T.ACT.CACAA..AT 0: 2145540494 -425039256 -113794380 1998323403 2: 2480929222 -1874514626 2543723555 -2550045172 is also a valid .repeats file. =head1 LICENSE GNU General Public License, version 3 (GPLv3) =head1 AVAILABILITY L =head1 AUTHOR Kris Popendorf =head1 SEE ALSO mbfa(1), geneparse(1) =head2 RELATED READING =over =item M. Csuros and B. Ma, "Rapid Homology Search with Two-Stage Extension and Daughter Seeds" (2005). =item F. P. Preparata and L. Zhang and K. W. Choi, "Quick, practical selection of effective seeds for homology search" (2005). =item KP Choi, et. al., "Good spaced seeds for homology search" (2004). =back murasaki/doc/geneparse.10000644000177700001440000001266611434752236014574 0ustar krispusers.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.07) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . 
ds R" '' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .ie \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . nr % 0 . rr F .\} .el \{\ . de IX .. .\} .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . 
\" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "GENEPARSE 1" .TH GENEPARSE 1 "2010-05-31" "perl v5.10.1" "User Contributed Perl Documentation" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l .nh .SH "NAME" geneparse \- sequence file loader frontend .SH "SYNAPSE" .IX Header "SYNAPSE" geneparse [options...] [input] [input2 ...] #read input and write to stdout .SH "DESCRIPTION" .IX Header "DESCRIPTION" Reads a sequence file and writes it somewhere (by default to stdout). .PP A specific range within an input file can be specified by file[start,stop]. The square brackets can be interchanged for any of {[()]}, (eg. \*(L"genome.fa[3000,4000]\*(R" or \*(L"genome.fa{3000~4000}\*(R"). Be aware that all of those might be parsed by your shell. Also any non-word character can be used to separate the numbers. .SH "OPTIONS" .IX Header "OPTIONS" .IP "\-\-repeatmask|\-r" 4 .IX Item "--repeatmask|-r" use soft-repeatmasked sequences (ie: replace lowercase bases with N's). .IP "\-\-upper|\-\-unmask|\-U" 4 .IX Item "--upper|--unmask|-U" .Vb 1 \& uppercase all bases. .Ve .IP "\-\-length|\-l" 4 .IX Item "--length|-l" just print the length and exit .IP "\-\-clean|\-c" 4 .IX Item "--clean|-c" don't append a new line when finished .IP "\-\-version|\-V" 4 .IX Item "--version|-V" prints the program version .IP "\-\-help|\-h" 4 .IX Item "--help|-h" prints a help message .IP "\-\-output|\-o" 4 .IX Item "--output|-o" send output to a file (otherwise use stdout). \-\-output implies \-\-clean. 
.IP "\-\-quiet|\-q" 4 .IX Item "--quiet|-q" silence all warnings .IP "\-\-verbose|\-v" 4 .IX Item "--verbose|-v" prints lots of extra details .SH "LICENSE" .IX Header "LICENSE" \&\s-1GNU\s0 General Public License, version 3 (GPLv3) .SH "AVAILABILITY" .IX Header "AVAILABILITY" .SH "AUTHOR" .IX Header "AUTHOR" Kris Popendorf .SH "SEE ALSO" .IX Header "SEE ALSO" \&\fImurasaki\fR\|(1), \fIgeneparse\fR\|(1) murasaki/doc/mbfa.pod0000644000177700001440000000341311434752236014140 0ustar krispusers=head1 NAME mbfa - Murasaki Binary FASTA format converter =head1 SYNOPSIS mbfa [options...] [input] [input2 ...] #convert [input] and [input2 ...] to Muraaski Binary FASTA files =head1 DESCPRIPTION Murasaki processes sequence data using a 2-bit format where each base is reprsented using 2 bits. There's a number of pre-existing formats that do similar things, however in particular Murasaki needs to know about the metadata that can't be expressed in just 2 bits (eg. where sequences of NNNNs are, sequence breaks when multiple sequences are included in a FASTA file, etc.), therefore the MBFA format includes this data as well. Ordinarily these files are generated automatically by Murasaki when first run on a new sequence. Because the file format is designed mesh closely with Murasaki, the actual file extension will vary to reflect your architecture. It will generally be some form of .mbfa[48][48] (e.g. C<.mbfa88> (the default gcc build on an amd64)). =head1 OPTIONS =over =item --info|-i Show metadata about each MBFA specified. =item --force|-f By default B will skip files that already have recent .mbfa files. This option forces the regeneration of these files. =item --fatal|-F Makes errors fatal. Ordinarily if you specify multiple files, mbfa will try to convert all of them even if one fails emitting a warning. With --fatal it will stop and exit with an error if there's a problem. =item --fasta|-A Geneates FASTA output corresponding based on the MBFA data to stdout. 
=item --help|-h, --version|-V, --verbose|-V What you'd expect. =back =head1 LICENSE GNU General Public License, version 3 (GPLv3) =head1 AVAILABILITY L =head1 AUTHOR Kris Popendorf =head1 SEE ALSO murasaki(1), geneparse(1) murasaki/doc/geneparse.html0000644000177700001440000000645311434752236015375 0ustar krispusers geneparse

NAME

geneparse - sequence file loader frontend


SYNOPSIS

geneparse [options...] [input] [input2 ...] #read input and write to stdout


DESCRIPTION

Reads a sequence file and writes it somewhere (by default to stdout).

A specific range within an input file can be specified by file[start,stop]. The square brackets can be interchanged for any of {[()]}, (eg. "genome.fa[3000,4000]" or "genome.fa{3000~4000}"). Be aware that all of those might be parsed by your shell. Also any non-word character can be used to separate the numbers.


OPTIONS

--repeatmask|-r

use soft-repeatmasked sequences (ie: replace lowercase bases with N's).

--upper|--unmask|-U
 uppercase all bases.
--length|-l

just print the length and exit

--clean|-c

don't append a new line when finished

--version|-V

prints the program version

--help|-h

prints a help message

--output|-o

send output to a file (otherwise use stdout). --output implies --clean.

--quiet|-q

silence all warnings

--verbose|-v

prints lots of extra details


LICENSE

GNU General Public License, version 3 (GPLv3)


AVAILABILITY

http://murasaki.sourceforge.net


AUTHOR

Kris Popendorf <krisp@dna.bio.keio.ac.jp>


SEE ALSO

murasaki(1), geneparse(1)

murasaki/doc/geneparse.txt0000644000177700001440000000252511434752236015244 0ustar krispusersNAME geneparse - sequence file loader frontend SYNAPSE geneparse [options...] [input] [input2 ...] #read input and write to stdout DESCRIPTION Reads a sequence file and writes it somewhere (by default to stdout). A specific range within an input file can be specified by file[start,stop]. The square brackets can be interchanged for any of {[()]}, (eg. "genome.fa[3000,4000]" or "genome.fa{3000~4000}"). Be aware that all of those might be parsed by your shell. Also any non-word character can be used to separate the numbers. OPTIONS --repeatmask|-r use soft-repeatmasked sequences (ie: replace lowercase bases with N's). --upper|--unmask|-U uppercase all bases. --length|-l just print the length and exit --clean|-c don't append a new line when finished --version|-V prints the program version --help|-h prints a help message --output|-o send output to a file (otherwise use stdout). --output implies --clean. --quiet|-q silence all warnings --verbose|-v prints lots of extra details LICENSE GNU General Public License, version 3 (GPLv3) AVAILABILITY AUTHOR Kris Popendorf SEE ALSO murasaki(1), geneparse(1) murasaki/doc/murasaki.html0000644000177700001440000007721411434752236015243 0ustar krispusers murasaki

NAME

murasaki - compute anchors between multiple sequences


SYNOPSIS

 murasaki [OPTIONS] -p[pattern] seq1.fa seq2.gbk [seq3.raw ...] #compute anchors between seq1.fa and seq2.gbk using [pattern]
 mpirun murasaki [OPTIONS] -p[pattern] seq1.fa seq2.gbk [seq3.raw ...] #compute anchors between seq1.fa and seq2.gbk using [pattern] in parallel via MPI


DESCRIPTION

Murasaki generates anchors across all supplied sequences based on the user-supplied pattern and hash tables. Essentially each base of each sequence is masked by the pattern, forming a seed that is used to generate a hash. The location of the seed is stored in the hash table. Once all seeds have been hashed and stored, Murasaki scans the hash table, generating anchors for all matching seeds. An anchor refers to a set of intervals across a subset of the input sequences. These are stored in name.anchors files, and described in FILE FORMATS. By default anchors are maximally extended until their minimum pairwise ungapped alignment score drops below a threshold, in the same fashion as the X-drop parameter in BLAST and BLAST-like searches.

PATTERNS

Murasaki uses spaced seed patterns in considering seeds. A spaced seed pattern is typically expressed as a string of 1s and 0s necessarily starting and ending with a 1. 1s indicate that this base is considered part of the seed, while bases at 0 positions are not. For example with a pattern "1011" the sequence "ACGT" would match sequences "AGGT" and "ATGT" but not "ACTT". The number of 1s in the pattern is known as the "weight" of the pattern, and the number of 1s and 0s combined is the "length" of the pattern. Murasaki allows the use of any arbitrary pattern expressed as a string of 1s and 0s, and also interprets patterns of the form "x:y" to mean a "random pattern of weight x and length y."

The choice of pattern obviously has an impact on sensitivity and specificity, but whether one pattern is "better" than another depends on the application and the input sequences under consideration. Calculating "maximally sensitive spaced seed patterns" is a computationally difficult problem and there are a number of research papers describing various methods for approximation (RELATED READING). In general, however, "heavier" spaced seed patterns are less sensitive, but more specific, than lighter seeds. Anecdotally we find that seeds with weights approximately 60% to 75% (with lengths around 24 for bacteria, and 36 to 48 for mammals) are good for most applications. Extremely similar species (for example human and chimp) benefit from longer, heavier, seeds.

HASH FUNCTIONS

Hash functions (as well as hash parameters) are generated automatically based on the system environment and input sequences. There are essentially two types of hash functions available in Murasaki: adaptive and cryptographic hashes. The adaptive hashes are XOR combinations of various bitwise shifts of the seed designed by analyzing the spaced seed pattern to maximize the entropy of the resulting hash. Cryptographic hashes are available via the CryptoPP library and use the entire spaced seed pattern to generate a hash using one of the common cryptographic hashes like MD5 or SHA-1. The adaptive hash functions are almost always faster and more efficient than MD5 and SHA-1, but the cryptographic functions are available for reference and may be useful as an alternative in the unlikely event you're dealing with an environment where the adaptive hasher is unsuitable (for example a sequence consisting of only A and T (leaving 1 out of every 2 bits unutilized)).

MEMORY SCALING

Murasaki can take a lot of memory. Storing the location of each seed in the hash table is the most costly part of the operation, requiring approximately ceil(log_2(N)) bits per seed where N is the total sequence length. Locations are, by default, stored in a bitpacked format to approach the theoretical minimum. The second most costly element is the hash table structure, where each bucket carries a small overhead and unused buckets are simply wasted space. More hash table buckets (i.e. a longer hash table) decreases the expected number of collisions, leading to faster execution time. Therefore Murasaki tries to use as many buckets as possible by inspecting the available system memory and using as much as it can while still storing all the seed locations. If this automatic scaling is ineffective, setting the hash table size directly via the --hashbits|-b option can force a specific hash table size. If the memory of one computer is insufficient to store the desired hash table, PARALLELIZATION can be used to distribute the hash table across multiple computers.

PARALLELIZATION

Murasaki is designed to run in parallel using MPI. Consult the documentation for the specific variations of your MPI implementation, however in general the execution method looks like:

 mpirun [MPI options] murasaki [murasaki options] -p[pattern] [seq1 ...]

Murasaki in parallel divides the number of processors available (NP) into two groups: hasher nodes and storage nodes. The storage nodes divide the hash table between themselves, each being responsible for a different part of the table. Hasher nodes divide the input sequence between themselves, each hashing a separate portion of the input sequence, and passing the seed location to the appropriate storage node for storage. When all the hasher nodes are finished hashing, the storage nodes scan their portion of hash table and pass matching sets of seeds to a hasher node where they are assembled into anchors and extended. Finally all the hasher nodes combine their independent anchor sets into one final set in ceil(log_2(H)) iterations (where H is the number of hasher nodes), with each hasher node number 2h passing its anchors to hasher number 2h-1 at each iteration.

Because almost none of the parallelization steps require communication between all nodes, and each seed and each anchor can be processed in parallel, Murasaki scales very well in parallel, running approximately twice as fast when twice as many nodes are available. Furthermore, the hash table is automatically grown to take advantage of the combined memory from multiple machines.


OPTIONS

Most options can be specified in their long form (e.g. "--directory out" or "--directory=out") or short form (e.g. "-dout"). Options marked by <S> expect a string, <D> an integer, <F> a float, and <B> a boolean value ("yes/on/true/1" for true, "no/off/false/0" for false). Most booleans can omit the value, toggling the value from whatever it was to the opposite.

Murasaki has a lot of options. Here we've separated them into categories to help distinguish the scope of the various options, however in certain situations certain option choices may have unforeseen consequences, and of course ultimately if the specified output is huge, the required runtime will necessarily be long. It is a mistake to think that everything outside of the Performance/tuning options section has no bearing on performance.

Anchor parameter related options

These options shape what is considered an "anchor".

--pattern|-p <S>
 specifies the seed pattern (eg. 11101001010011011). using the format
 C<[<w>:<l>]> automatically generates a random pattern of weight <w>
 and length <l>
--repeatmask|-r <B>

Skip repeat masked data (ie: lowercase atgc). Be aware that some sequence files are distributed purely in lower case.

--seedfilter|-f <D>

Skip seeds that occur more than N times. Exceptionally slow. See --hashfilter for a faster approximation.

--hashfilter|-m <D>

Like --seedfilter but works on hash keys instead of seeds. May cause some collateral damage to otherwise unique seeds, but it's faster.

--skipfwd|-F <B>

Don't hash/match the forward strands.

--skiprev|-R <B>

Don't hash/match the reverse complement strands.

--skip1to1|-1 <B>

Skip matches along the 1:1 line (good for comparing to self).

--hashonly|-Q <B>

Hash Only. No anchor output, just statistics.

--hashskip|-S <D>

Hashes every n bases. Default is 1 (i.e. hashing all positions). Not supplying any argument increments the skip amount by 1.

--join|-j <D>

Join anchors within n bases of each other (default: 0). Specifying a negative D implies -D*patternLength.

--bitscore|-B <B>

toggles computation of a bitscore for all anchors (default is on).

--seedterms|-T <B>

toggles retention of seed terms (defaults to off). These are necessary for computing TF-IDF scores).

--sectime|-e <B>

Always display times in seconds as opposed to human readable "1d 3h 45m 5s" style times.

--mergefilter|-Y <D>

Filter out matches which would cause more than D many anchors to be generated from 1 seed (default -Y100). Use -Y0 to disable.

--scorefilter <D>

Set a minimum ungapped score for seeds.

--rifts|-/ <D>

Allow anchors to skip D sequences (default 0).

--islands|-% <D>

Same as --rifts=S-D (where S is number of input seqs).

--fuzzyextend|-z <B>

Enable (default) or disable fuzzy extension (i.e. ungapped alignment) of anchors.

--fuzzyextendlosslimit|-Z <D>

Set the cutoff at which to stop extending fuzzy hits (ie. the BLAST X parameter).

--gappedanchors <B>

Use gapped (true) or ungapped (false (default)) anchors.

--scorebyminimumpair <B>

Do anchor scoring by minimum pair when appropriate (default). Alternative is arithmetic mean (seldom useful, but theoretically faster).

Allow anchors to skip D sequences (default 0).

--islands|-% <D>

Same as --rifts=S-D (where S is number of input seqs).

--fuzzyextend|-z <B>

Enable (default) or disable fuzzy extension (i.e. ungapped alignment) of anchors.

--fuzzyextendlosslimit|-Z <D>

Set the cutoff at which to stop extending fuzzy hits (ie. the BLAST X parameter).

--gappedanchors <B>

Use gapped (true) or ungapped (false (default)) anchors.

--scorebyminimumpair <B>

Do anchor scoring by minimum pair when appropriate (default). Alternative is arithmetic mean (seldom useful, but theoretically faster).

Output options

These options primarily affect what data is output where.

--directory|-d <S>
 output directory (default: output)
--name|-n <S>
 alignment name (default: test)
--repeatmap|-i <B>

Toggles keeping of a repeat map when --mergefilter is used (defaults to yes).

--histogram|-H <D>

Histogram computation level: (-H alone implies -H1)

  0 - no histogram (default)
  1 - basic bucketsize/bucketcount histogram data
  2 - bucket-based scores to anchors.details
  3 - perbucket count data
  4 - perbucket + perpattern count data

Any values above 2 are purely exploratory and can result in massive output files.

--tfidf|-k <B>

Perform accurate tfidf scoring from within murasaki (requires extra memory at anchor generation time). Default is no.

Performance/tuning options

These options primarily affect performance, and don't (in general) impact output.

--quickhash|-q <D>
 specify a hashing function:
  0 - adaptive with S-boxes (default when there's plenty of hash table to spare)
  1 - don't pack bits to make hash (use first word only)
  2 - naively use the first hashbits worth of pattern
  3 - adaptively find a good hash (default)
  4 - MD5
  5 - SHA1
  6 - Whirlpool
  7 - CRC-32
  8 - Adler-32

Note: 3 and 0 are the only "recommended" hash functions, and the only ones automatically selected. The others are provided merely for reference. 1, 7, and 8 aren't even expected to utilize the entire hash space.

--hashbits|-b <D>

use D bit hashes (for n's of 1 to WORDSIZE. default 26)

--hashtype|-t <S>

select hash table data structure to use:

OpenHash - open sub-word packing of hashbits (default when there's plenty of hash table to spare)
EcoHash - chained sub-word packing of hashbits (default)
ArrayHash - malloc/realloc (fast but fragmentation-prone)
MSetHash - memory exorbanant, almost pointless.
--probing <D>

0 - linear, 1 - quadratic (default). Only applicable for --hashtype=OpenHash.

--hitfilter|-h <D>

Minimum number of hits to be outputted as an anchor (default 1). In PatternHunter this is 2.

--rseed|-s <D>

Random number seed for non-deterministic algorithms (ie: adative hash function generation). If you're doing any performance comparisons, it's probably imperative that you use the same seed for each run of the same settings. Default is obtained from time() (ie: seconds since 1970).

--memory|-M [<F>|<S>]

Set the target amount of total memory (either in gb or as % total memory).

--reverseotf|-o <B>

Generate reverse complement on the fly (defaults to on). Turning this off precomputes all the reverse complement strands and stores them in memory, which rarely provides a measurable performance improvement.

--binaryseq <B>

Enable (default) or disable binary sequence read/write

Adaptive hash function related:

Performance options related to adaptive hash function generation.

--hasherFairEntropy <B>

Use more balanced entropy estimation (default: yes).

--hasherCorrelationAdjust <B>

Adjust entropy estimates for nearby sources assuming some correlation (default: yes).

--hasherTargetGACycles <D>

Adaptive hash function generation genetic algorithm cycle cutoff.

--hasherEntropyAgro <F>

How aggressive to be about pursuing maximum entropy hash functions (takes a real. default is 1).

MPI Specific:

--hashers|-A [<F>|<D>]

Specify the number of processes to be used as hashers (only applies to MPI. If a number between 0 and 1 it refers to a ratio of np).

--localhash|-K <B>

Perform hashing locally on each storage node rather than sending it over the network (helpful for slow networks).

--mpidistro|-L <B>

Toggles use of MPI to distribute sequence data (if the sequence is available on local disk on each node, then turning this off may potentially accelerate the initial sequence loading).

--waittoanchor|-w <B>

Postpone actual anchor computation until all location sets have been received (as opposed to trying to work between receiving seed packets).

--buffers|-u <D>

Maximum number of unfinished buffers to allow while message passing (0 means unlimited). Default is set based on the number of nodes participating. MPI can crash or perform very poorly if this value is too high.

--nobuffers|-U <B>

Same as --buffers=1.

--bigfirst|-I <B>

Assign hashers to large memory nodes first.

--hostbalance|-l <B>
If yes (default): spread out hashers evenly among all nodes.
If no: ignore host name when assigning jobs.
--memorybalance|-a <B>
If yes (default): balance hash storage between nodes based on the amount of available ram.
If no: distribute storage evenly. This is more likely to achieve optimal run times, but might not utilize memory as efficiently.
--distmerge|-< <B>
if yes (default): during the merge step, storage nodes send seeds to any available hasher.
if no: send all seeds to one node only.
--distcollect|-> <B>
if yes (default): collect anchor data from all hashers.
if no: send all seeds to the final assembly node only.
--mpiredirectoutput <B>
if yes (default): each rank redirects its stdout/stderr to a separate file (murasaki-mpiout-N).
if no: do what comes naturally (ie: managed by mpirun (for OpenMPI see --output-filename and --tag-output in mpirun(1))).
--keepstdoe <B>

Don't erase the murasaki-mpiout files on success.

--sysvipc|-V <B>

Use System V IPC to negotiate shared memory regions (saves memory when one host runs multiple nodes). Default is true.

Universal options:

--verbose|-v

Increases verbosity.

--version|-V

Prints version information and quits.

--help|-?

Prints a help message and quits.


FILE FORMATS

Murasaki has a wide array of output files, the formats of most of which are intended to be intuitive. All output files are prefixed by the value of the --name parameter. The primary output file formats are described here. Files are line based and tab delimited unless otherwise specified.

.seqs

The .seqs shows what sequences were used as input, 1 per line. This file gets used by various programs in conjunction with the .anchors file, so it's generally important that the contents reflect the correct sequence files. Moving anchor results between computers might result in a change of paths, requiring the user to update the .seqs file. As an alternative, always using relative paths can alleviate this problem.

.anchors files

These files are 1 anchor per line, with a 3-tuple per sequence. Each tuple represents the start and stop coordinates and strand of the anchored interval on each sequence. The sequence order matches that of the order in the .seqs file. The coordinates are structured such that 1 refers to the first base in the sequence, 2 to the second, etc. Negative values refer to the reverse complement sequence where -1 is the last base of the reverse complement sequence (i.e. the complement of the first base in the forward sequence). The "strand" element is a '+' or '-' that merely matches the sign of the coordinates (this is redundant information, but kept to make parsing or filtering simpler).

For example:

 1       18     +       -1      -18       -

This line describes an anchor where the first 18 bases of the first sequence match the first 18 bases of the reverse complement of the second sequence.

.anchors.details

This is an antiquated file format, but used by GMV to calculate statistics like TF-IDF scores, and has been kept around for that reason. The .anchors.details file has the same format and information as the .anchors file, however after the anchor tuples are two more terms: a score, and a comma (,) delimited list of term and count pairs (written "term:count"). The score and count data might be varied depending on the --histogram option choices.

.anchors.bitscore

The term "bitscore" here is a misnomer, but maintained for historical reasons. In reality, this file contains the mean number of matching bases and length of each anchor (corresponding line by line to the .anchors file).

.stats.tfidf

Contains anchor TF-IDF scores (corresponding line by line to the .anchors file).

.histogram

Contains a simple histogram of the hash table usage. The first field is the bucket size, and the second is the frequency. For example a .histogram file like this:

 1  24
 2  1

Would indicate that there were 24 hash buckets that stored only 1 location (i.e. 24 unique seeds), and 1 hash bucket stored 2 locations (i.e. 1 seed that matched 2 locations (or 2 non-matching seeds that resulted in a hash collision)).

.options

Maintains a record of the options used when running Murasaki.

.repeats

The .repeats file stores a record of "repeats" as defined by the --mergefilter option (i.e. seeds that would have induced more anchors than permitted). In this file, each repeat record is separated by a blank line. A repeat record looks like this:

 R: G.GCCTTT.T.ACT.CACAA..AT
 0: 2145540494 -425039256 -113794380 1998323403
 1: 2480929222 -1874514626 2543723555 -2550045172

The first line (always prefixed "R:") shows the repeating seed itself (where the . are the bases masked by the pattern). The subsequent lines show where these seeds occurred in the input sequences (in the first (0) and second (1) sequences). Note that if there are no hits in a particular sequence, it doesn't include a blank line for that sequence. For example:

 R: G.GCCTTT.T.ACT.CACAA..AT
 0: 2145540494 -425039256 -113794380 1998323403
 2: 2480929222 -1874514626 2543723555 -2550045172

is also a valid .repeats file.


LICENSE

GNU General Public License, version 3 (GPLv3)


AVAILABILITY

http://murasaki.sourceforge.net


AUTHOR

Kris Popendorf <krisp@dna.bio.keio.ac.jp>


SEE ALSO

mbfa(1), geneparse(1)

RELATED READING

M. Csuros and B. Ma, "Rapid Homology Search with Two-Stage Extension and Daughter Seeds" (2005).
F. P. Preparata and L. Zhang and K. W. Choi, "Quick, practical selection of effective seeds for homology search" (2005).
KP Choi, et. al., "Good spaced seeds for homology search" (2004).
murasaki/doc/mbfa.10000644000177700001440000001351611434752236013523 0ustar krispusers.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.07) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .ie \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . nr % 0 . rr F .\} .el \{\ . de IX .. .\} .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . 
ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "MBFA 1" .TH MBFA 1 "2010-05-31" "perl v5.10.1" "User Contributed Perl Documentation" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l .nh .SH "NAME" mbfa \- Murasaki Binary FASTA format converter .SH "SYNOPSIS" .IX Header "SYNOPSIS" .Vb 1 \& mbfa [options...] [input] [input2 ...] #convert [input] and [input2 ...] 
to Muraaski Binary FASTA files .Ve .SH "DESCPRIPTION" .IX Header "DESCPRIPTION" Murasaki processes sequence data using a 2\-bit format where each base is reprsented using 2 bits. There's a number of pre-existing formats that do similar things, however in particular Murasaki needs to know about the metadata that can't be expressed in just 2 bits (eg. where sequences of NNNNs are, sequence breaks when multiple sequences are included in a \s-1FASTA\s0 file, etc.), therefore the \s-1MBFA\s0 format includes this data as well. Ordinarily these files are generated automatically by Murasaki when first run on a new sequence. .PP Because the file format is designed mesh closely with Murasaki, the actual file extension will vary to reflect your architecture. It will generally be some form of .mbfa[48][48] (e.g. \f(CW\*(C`.mbfa88\*(C'\fR (the default gcc build on an amd64)). .SH "OPTIONS" .IX Header "OPTIONS" .IP "\-\-info|\-i" 4 .IX Item "--info|-i" Show metadata about each \s-1MBFA\s0 specified. .IP "\-\-force|\-f" 4 .IX Item "--force|-f" By default \fBmbfa\fR will skip files that already have recent .mbfa files. This option forces the regeneration of these files. .IP "\-\-fatal|\-F" 4 .IX Item "--fatal|-F" Makes errors fatal. Ordinarily if you specify multiple files, mbfa will try to convert all of them even if one fails emitting a warning. With \-\-fatal it will stop and exit with an error if there's a problem. .IP "\-\-fasta|\-A" 4 .IX Item "--fasta|-A" Geneates \s-1FASTA\s0 output corresponding based on the \s-1MBFA\s0 data to stdout. .IP "\-\-help|\-h, \-\-version|\-V, \-\-verbose|\-V" 4 .IX Item "--help|-h, --version|-V, --verbose|-V" What you'd expect. 
.SH "LICENSE" .IX Header "LICENSE" \&\s-1GNU\s0 General Public License, version 3 (GPLv3) .SH "AVAILABILITY" .IX Header "AVAILABILITY" .SH "AUTHOR" .IX Header "AUTHOR" Kris Popendorf .SH "SEE ALSO" .IX Header "SEE ALSO" \&\fImurasaki\fR\|(1), \fIgeneparse\fR\|(1) murasaki/doc/make.pl0000755000177700001440000000103611434752236014003 0ustar krispusers#!/usr/bin/perl use strict; use File::Basename; my ($foo,$base)=fileparse($0); my @files=@ARGV; unless(@files){ @files=<$base/*.pod>; } foreach my $file (@files){ my ($name)=fileparse($file,qr/\.pod/); use Pod::Man; my $podman=Pod::Man->new(); $podman->parse_from_file($file,"$name.1"); use Pod::Text; my $podtext=Pod::Text->new(); $podtext->parse_from_file($file,"$name.txt"); use Pod::Html; pod2html("--infile=$file","--outfile=$name.html","--title=$name"); foreach my $tmpfile (){unlink $tmpfile} } murasaki/overlay.pl0000755000177700001440000000525611434752242014007 0ustar krispusers#!/usr/bin/perl -w #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . 
use File::Basename; use Getopt::Long; use Pod::Usage; #use Data::Dump qw{dump}; use POSIX qw{floor}; use strict; my ($help,$man,$opt_prefix); my ($opt_output); my @validModes=qw{clear src dst src-over dst-over src-in dst-in src-out dst-out src-atop dst-atop xor plus multiply screen overlay darken lighten color-dodge color-burn hard-light soft-light difference exclusion change-mask}; my $mode="multiply"; GetOptions('help|?' => \$help, man => \$man, 'o|output=s'=>\$opt_output, 'mode=s'=>\$mode); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man or scalar(@ARGV)<2 or !grep {$mode ne $_} @validModes; my @bad=grep {!-r $_} @ARGV; die "File(s) not found: @bad" if @bad; my (@paths,@names,@extensions); foreach my $inf (@ARGV){ my @dat=fileparse($inf,qr/\..+/); push(@names,$dat[0]); push(@paths,$dat[1]); push(@extensions,$dat[2]); } my $path=$paths[0]; #use first path. my $outf=($opt_output ? $opt_output:$path."overlay-".join("-",@names).".png"); print "Writing output to $outf\n"; my @hues=(scalar(@names)>2 ? (map {$_/$#names*200} (0..$#names)):(100,30)); my @bits=map {"\\( $ARGV[$_] -modulate 100,100,$hues[$_] \\)"} (0..$#names); my $cmd="convert -compose $mode @bits -composite $outf"; print "Running: $cmd\n"; print "Merging using $mode\n"; system($cmd); __END__ =head1 NAME overlay.pl - overlay a couple of images, offsetting the hue in each =head1 SYNOPSIS overlay.pl [file1] [file2 [file3 ... ]] =head1 OPTIONS --output - force output to go to some particular file (otherwise it's automatically derived from the input filenames) --mode - specify a mode to use for compose. 
Valid modes are: clear src dst src-over dst-over src-in dst-in src-out dst-out src-atop dst-atop xor plus multiply screen overlay darken lighten color-dodge color-burn hard-light soft-light difference exclusion change-mask For information on each mode, consult http://www.imagemagick.org/script/command-line-options.php#compose murasaki/getgene.pl0000755000177700001440000000707411434752241013743 0ustar krispusers#!/usr/bin/perl -w #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use File::Basename; use Getopt::Long qw(:config pass_through); use Pod::Usage; #use Data::Dump qw{dump}; use strict; my ($help,$man,$opt_prefix); our ($seqhome,$root,$flexible); BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my $toRNA; my ($outprefix,$echo); GetOptions('help|?' => \$help, man => \$man, flexible => \$flexible, 'rna' => \$toRNA, 'outprefix=s'=>\$outprefix, echo=>\$echo); pod2usage(1) if $help or $#ARGV<1; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my (%names,%locii); my $filename=shift(@ARGV); die "There's no file $filename" unless -f $filename; print "Loading annotation...\n"; if(!-f "$filename.cds"){ print "CDS file not found for $filename. 
Generating...\n"; my $res=system("$root/getcds.pl $filename"); do { print "Generation of CDS file for $filename failed\n"; } unless -f "$filename.cds"; } if(open(CDS,"$filename.cds")){ print "Loading annotation for $filename..."; my @cds; LoadCDS: while(){ my ($name,$start,$stop,$strand,$locus) = split(/\s+/,$_); my $cd={name => $name,locus=>$locus,start=>$start,stop=>$stop,strand=>$strand}; push(@{$names{$name}},$cd); push(@{$locii{$locus}},$cd); } print "Done.\n"; }else{ print "Note: couldn't load any annotation...\n"; } print "Loading $filename...\n"; my $genome=`$root/geneparse.pl $filename`; while(@ARGV){ my $gene=shift(@ARGV); if($gene=~m/^(-?\d+)\D{1,3}?(-?\d+)/){ #fake gene. i want a region! my ($start,$stop)=map(abs,($1,$2)); ($start,$stop)=$start<$stop ? ($start,$stop):($stop,$start); push(@{$names{$gene}},{name => $gene,start=>$start,stop=>$stop, strand=>$1<0 || $2<0 ? -1:1}); } my @cds=(map {ref($_) ? (@$_):()} ($names{$gene},$locii{$gene})); print "$gene not found in annotation\n" unless scalar(@cds); foreach my $cds (@cds){ my $title="$filename: ".join(" ",@{$cds}{qw{name locus start stop strand}}); my $outfile=($outprefix and $outprefix ne "-") ? ($outprefix ? $outprefix:$filename).".$gene.fa":"-"; open(my $outfh,"|$root/faformat.pl --title='$title' - $outfile") unless $echo; print STDERR "Writing $gene data to $outfile\n" unless ($outfile eq "-" or $echo); my $dna=uc substr($genome,$cds->{start}-1,$cds->{stop}-$cds->{start}+1); if($cds->{strand}<0){ $dna=~tr/ACGT/TGCA/; $dna=reverse $dna; } if($toRNA){ $dna=~tr/ACGT/UGCA/; # $dna=reverse $dna; } if($echo){ print "$dna\n"; }else{ print $outfh "$dna\n"; } } } __END__ =head1 NAME getgene.pl - grab the dna for a gene using annotation =head1 SYNOPSIS getgene.pl [ ...] =head1 OPTIONS --rna specifies recoding to rna --outprefix=s directs output to some file s.gene_i.fa (default is ). setting outprefix=- sends output to stdout. 
--echo sends output directly to stdout without formatting murasaki/kogalign.pl0000755000177700001440000001360011434752242014111 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use Getopt::Long; use Pod::Usage; use File::Basename; use IO::Handle; use Data::Dump qw {dump}; use strict; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; use Murasaki::Ticker qw{resetTick tick}; use Murasaki::KOG; our $root; our $seqhome="$root/seq"; our $koglist; our ($help,$man); our ($kogfile,@kogmap,%kogpref); GetOptions('help|?' => \$help, man => \$man, 'kogfile=s' => \$kogfile, 'kog=s%' => sub {pod2usage(-msg => "Invalid argument for --kog ($_[1] is not a valid index)") unless(defined $_[1] and $_[1]>=0); pod2usage(-msg => "Invalid argument for --kog $_[1] ('$_[2]' doesn't look like a kog name)") unless($_[2] and $_[2]=~m/^...$/); $kogmap[$_[1]]=$_[2]}, 'kogpref=s%' => \%kogpref, 'koglist=s' => \$koglist); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; die "Need to specify kogfile with --kogfile" unless $kogfile; if($koglist){ my $count; open(my $koglistfh,$koglist) or die "Couldn't open $koglist"; $koglist={}; while(<$koglistfh>){ chomp; $koglist->{$_}=1; $count++; } print "\"Used members\" list loaded. 
$count approved locs.\n"; } my $outfile=shift @ARGV; my $usedSeqFile; if($outfile=~m/\.seqs$/){ #alignment file my $basename=$`; print "Loading sequence list from $outfile\n"; open(INF,$outfile) or die "Couldn't open $outfile"; while(){ chomp; push(@ARGV,$_); } close(INF); $usedSeqFile=$outfile; $outfile="$basename.kogaligned"; print "Storing output to $outfile\n"; }else{ #append a .anchors unless it already has one $outfile="$outfile.anchors" unless $outfile=~m/\.anchors$/; if(-f $outfile){ open(my $peek,$outfile); my $line=<$peek>; unless($line=~m/^(-?\d+\t-?\d+\t[-+]\t?)+$/){ die "$outfile already exists and doesn't look like an anchor file. I don't want to break something important..."; } } } our (@locusLists,%knownLocs); my ($i,$learnedKogSpecs); LoadSeqs: foreach my $seq (@ARGV){ unless($kogmap[$i]){ local $_=$seq; my ($id)=Murasaki::KOG->guessKogMember($seq); unless($id or $learnedKogSpecs){ print "$seq not identifable from basic Kog list, checking $kogfile contents...\n"; #pre-load our kogfile and see if it knows any more species names Murasaki::KOG->learnKogSpecs($kogfile); $learnedKogSpecs; $id=Murasaki::KOG->guessKogMember($seq); } if($id){ print "Identified $_ as KOG member $id\n"; $kogmap[$i]=$id; } } die "$seq isn't bound to a kog. Use --kog (eg. --kog $i=MtC) to set it.\n" unless $kogmap[$i]; if(!-f "$seq.cds"){ print "CDS file not found for $seq. 
Generating...\n"; my $res=system("$root/getcds.pl $seq"); do { die "Generation of CDS file for $seq failed" unless -f "$seq.cds\n"; next LoadSeqs; } unless -f "$seq.cds"; } open(CDS,"$seq.cds") or die "Couldn't read $seq.cds"; print "Loading annotation for $seq..."; my ($cdscount); LoadCDS: while(){ my ($name,$start,$stop,$strand,$locus) = split(/\s/,$_); $locus=$name if $name && !$locus; next unless $locus; $locus=uc $locus; my $cd={name => $name,start=>$start,stop=>$stop,strand=>$strand,locus=>$locus}; print "Uh oh, $locus occurs again!\n" if $locusLists[$i]->{$locus}; $locusLists[$i]->{$locus}=$cd; $knownLocs{$locus}=1; $cdscount++; } print("$cdscount CDS's loaded.\n"); }continue{$i++} my $kogs=KOG->kogFrom($kogfile,\@kogmap,\%knownLocs,\%kogpref); open(OUTF,">$outfile") or die "Couldn't open $outfile for writing\n"; our $count=0; resetTick(scalar(keys(%{$locusLists[0]}))); foreach my $locus (keys(%{$locusLists[0]})){ my $allmembers=$kogs->memberMap($locus); my @bitsToPlot; my $i=0; foreach my $spec (@kogmap){ $bitsToPlot[$i]=[grep {$allmembers->{$_} eq $spec} keys(%$allmembers)]; }continue{$i++} plotBits(0,"",@bitsToPlot); tick(); } close OUTF; print "\nDone. $count psuedo-anchors.\n"; unless($usedSeqFile or $outfile eq "-"){ my $prefix=(fileparse($outfile,qr/\.[^.]+/))[0]; print "Writing $prefix.seqs\n"; open(my $seqfh,">$prefix.seqs"); print $seqfh $_."\n" foreach @ARGV } sub plotBits { my ($ite,$leaderBits,$mybits,@others)=@_; unless($mybits){ print OUTF $leaderBits."\n"; $count++; return; } foreach my $bit (@$mybits){ my $nextLeader="$leaderBits\t" if $leaderBits; local $_=$locusLists[$ite]->{$bit}; if($koglist){ next unless $koglist->{$_->{locus}}; } my $wasNasty=($leaderBits=~m/\t\t/ or $leaderBits=~m/\t$/); my @bits=@{$_}{qw{start stop}}; die "Uh oh. Missing coordinates from: @bits from ".dump($bit)." (maybe you need --kogpref underbar=1?)" unless $bits[0] and $bits[1]; my $nextLeader=($nextLeader.join("\t",@bits,$_->{strand}<0 ? 
"-":"+")); plotBits($ite+1,$nextLeader,@others); } } __END__ =head1 NAME kogalign.pl -- produce (untested) alignments from cds files =head1 SYNOPSIS kogalign.pl --kogfile [ ...] kogalign.pl --kogfile =head1 OPTIONS The kogalign.pl execution style writes results to align.kogaligned The --kogfile option is necessary. =cut murasaki/mauve2anchors.pl0000755000177700001440000000765411434752242015107 0ustar krispusers#!/usr/bin/perl -w #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . ################## ## convert mauve alignments to Murasaki anchor files -- krisp ################## use File::Basename; use Getopt::Long; use Pod::Usage; #use Data::Dump qw{dump}; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; use strict; my ($help,$man,$align_type); my $autoout=1; our $flexible=0; our $signed=1; my $useAlignment; GetOptions('help|?' 
=> \$help, man => \$man, 'autoout!'=>\$autoout); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my ($filename,$outfile)=@ARGV; my ($basename,$path,$suffix) = fileparse($filename); die "Input file not found: $filename" unless -e $filename; $outfile="$path/$basename.anchors" if $autoout; print "Outfile is $outfile\n" if $autoout; if($outfile){ open(OUTF,">$outfile"); } else { open(OUTF,">-"); } my %mauvedat=%{loadMauveAlignment($filename)}; foreach my $lcb (@{$mauvedat{LCBs}}){ print OUTF (join("\t",map {($_->{start},$_->{stop},($_->{start}<0 ? "-":"+"))} (@$lcb))."\n"); } if($basename){ #we can also make the seqs file open(my $seqfh,">$basename.seqs"); local $,="\n"; local $\="\n"; print $seqfh (map {$_->{seqFile}} @{$mauvedat{seqs}}); } sub loadMauveAlignment { my $alignment=shift; open(MAUVE,"<$alignment"); =~m/FormatVersion\s+(\d+)/ or die "Not a mauve file: $alignment"; my $version=$1; my @seqs=(); do {print "This program is written for Mauve Format Version 4.\n This file is version $version. 
Weird stuff may happen.\n"; $flexible=1;} if $version!=4; =~m/SequenceCount\s+(\d+)/ or die "Unknown sequence count\n"; my $seqCount=$1; while(){ next unless m/Sequence(\d+)File\s+(\S.*)/; my ($seqId,$seqFile)=($1,$2); $_=; m/Sequence${seqId}Length\s+(\d+)/ or $flexible or die "Input file is weird: $_"; my $seqLength=$1; $seqs[$seqId]={'seqId' => $seqId,'seqFile' => $seqFile,'seqLength'=>$seqLength, 'seqName' => getName($seqFile) }; last if $seqId==$seqCount-1; } my @LCBs=(); $_=; m/IntervalCount\s(\d+)/ or $flexible or die "Interval Count line weird: $_"; my $LCBCount=$1; while(){ m/Interval\s(\d+)/ or next; my $LCBId=$1; $_=; chomp; my ($length,@start)=split(/\s+/); my @stop=map {$_+$length} @start; # print "Start is at : ".join("\t",@start)."\n"; # print "Updating stop to: ".join("\t",@stop)."\n"; my @segs; while(){ chomp; last if $_ eq ''; next if $_ eq 'GappedAlignment' or m/^[A-Z-]+$/; #skip gapped lines ($length,@segs)=split(/\s+/); @stop=map {$_+$length} @segs; } next if (grep {$_==0} @start)>0; my @LCB=map { { start => $start[$_], stop => $stop[$_], LCBId => $LCBId }} 0..$#stop; push(@LCBs,\@LCB); } return {'seqs' => \@seqs, 'LCBs' => \@LCBs}; } sub getName { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $name } @_; return @ret if $#_; return $ret[0]; } __END__ =head1 NAME mauve2anchors - converts a Mauve alignment into Murasaki anchor format =head1 SYNOPSIS mauve2anchors [output] =head1 OPTIONS --autoout -- automatically pick an output name (otherwise output goes to stdout) =over 8 =back murasaki/roc-at-cutoff.pl0000755000177700001440000002243611434752242014776 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. 
# #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use File::Basename; use Getopt::Long; use Pod::Usage; #use Data::Dump qw{dump}; use strict; my ($help,$man,$opt_prefix); BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my $opt_pred="tfidf"; my $samples; my $fn; my $format="png"; my $lwd=3; my ($opt_log,$opt_clean,$opt_nofstats,$fstats,%avg,$drawAvg,$maxsamples,$nofn,$opt_pointsize,$opt_title,$opt_notitle,$opt_titlescale,$opt_legendscale); GetOptions('help|?' => \$help, man => \$man, "predictor|x|stat=s"=>\$opt_pred, 'samples|n=i' => sub { pod2usage(-verbose=>1,-exitstatus => 0,-msg => "Error: $_[0] must be at least 2") unless $_[1]>1; $samples=$_[1]; }, 'fn=i'=>\$fn,'log=s'=>\$opt_log, clean=>\$opt_clean, 'format=s'=>\$format, 'legendscale=f'=>\$opt_legendscale, pdf=>sub{$format='pdf'},lwd=>\$lwd, nofstats=>\$opt_nofstats, 'pointsize=f'=>\$opt_pointsize,'title=s'=>\$opt_title, 'notitle'=>\$opt_notitle,'titlescale=f',\$opt_titlescale, fstats=>\$fstats, 'avg:s%'=>sub{ $drawAvg=1; $avg{$_[1]}=$_[2] if $#_>1}, 'noavg'=>sub{$drawAvg=0},'maxsamples|maxn=i'=>\$maxsamples,nofn=>\$nofn ) or pod2usage(-exitstatus => 1);; pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; if(defined($samples)){ $samples=int $samples; $samples>1 or die "Samples must be greater than 1"; } if(defined($maxsamples)){ $maxsamples=int $maxsamples; $maxsamples>1 or die "Samples must be greater than 1"; } foreach my $file (@ARGV){ my ($basename,$path,$suffix)=fileparse($file,qr/\.rocr?/); $path=~s!/$!!; #kill trailing / if any my $basefile="$path/$basename"; my $rocrfile="$basefile.rocr"; #attempt to grab a FN count from a filterstats file if we don't have 
one if(!$opt_nofstats){ unless($fstats and -f $fstats){ my @fstats=<$basefile.*.filterstat*>; $fstats=$#fstats>0 ? pickOne("Multiple filterstats files found:","Which should I use? ",@fstats):$fstats[0]; } goto STARTSAMPLE unless -f $fstats; open(my $fstat_fh,$fstats); while(<$fstat_fh>){ if(m/Initial stats:/){ <$fstat_fh>; #junk line about # of anchors GETMEANS: while(<$fstat_fh>){ $avg{$1}=$2 if m/^(\w+) mean: (\d+\.?\d*)/; last GETMEANS unless m/^\w+ mean:/; } } if(m/^Experimental Orthos:/){ $_=<$fstat_fh>; $fn=$1 if m/FN: (\d+)/; last; } } print "Got FN=$fn from $fstats\n" if $fn; print "Found averages for: ".join(", ",sort keys %avg)."\n"; } STARTSAMPLE: my (@fields); open(my $fh,$rocrfile) or die "Couldn't open $rocrfile"; $_=<$fh>; chomp; @fields=split(/\t/,$_); print "Fields available: ".join(", ",@fields)."\n"; my @preds=($opt_pred); if ($opt_pred eq 'all') { @preds=grep {$_ ne 'label'} @fields; } print "Making ROC graphs for @preds\n"; foreach my $pred (@preds) { my $rocfile="$basefile.$pred.roc"; my $rsrc="$basefile.$pred.R"; my @outfields=($pred,qw{tpr fpr anchors spec}); unless(-f $rocfile and !$opt_clean){ #reuse rocfile if there is one print "Sampling $rocrfile against $pred statistics\n"; die "No rocr file: $rocrfile? Run filter.pl --rocr to make one..." unless -f $rocrfile; seek($fh,0,0); <$fh>; #chomp field header my ($predi)=grep {$fields[$_] eq $pred} (0..$#fields); die "Stats for $pred not in file..." unless defined $predi; my (@tp,@fp); while (<$fh>) { chomp; my @dat=split(/\t/,$_); my $lr=($dat[0]==1 ? \@tp : \@fp); push(@$lr,$dat[$predi]); } my @x; if (defined($samples)) { my ($min,$max)=(min(@tp,@fp),max(@tp,@fp)); my $range=$max-$min; @x=map {$min+$_/$samples*$range} (0..$samples); } else { my %h=(); foreach (@tp,@fp) { $h{$_}=1; } my $n=scalar(keys %h); if(defined $maxsamples and $n>$maxsamples){ print "Maxsamples ($maxsamples) exceeded ($n found). 
Sampling linearly across $maxsamples.\n"; my ($min,$max)=(min(@tp,@fp),max(@tp,@fp)); my $range=$max-$min; @x=map {$min+$_/$maxsamples*$range} (0..$maxsamples); }else{ @x=sort {$a<=>$b} keys %h; } } push(@outfields,qw{sens}) if defined $fn; open(my $ofh,">$rocfile") or die "Couldn't write to $rocfile"; print $ofh join("\t",@outfields),"\n"; my ($stp,$sfp)=(scalar(@tp),scalar(@fp)); die "Can't calculate rates without tp ($stp) and fp ($sfp).\n" unless ($stp and $sfp); foreach my $x (@x) { my ($ftp,$ffp)=map {scalar(grep {$_>=$x} @$_)} (\@tp,\@fp); my ($tpr,$fpr,$anchors,$spec)=($ftp/$stp,$ffp/$sfp,($ftp+$ffp)/($stp+$sfp),($ftp/($ftp+$ffp))); my @outbits=($tpr,$fpr,$anchors,$spec); if (defined $fn) { push(@outbits,($ftp/($stp+$fn))); } print $ofh join("\t",$x,@outbits) , "\n"; } close $ofh; } my $legendpos=($opt_log=~m/x/) ? #if x is logscale "min(roc[,'$pred'])+(max(roc[,'$pred'])-min(roc[,'$pred']))/10":"max(roc[,'$pred'])*.8"; my $pointsize=$opt_pointsize ? ",pointsize=$opt_pointsize":undef; my $outputter=$format ne 'pdf' ? qq!bitmap(file="$rocfile.$format",type="png16m",width=10,height=7,res=96$pointsize)!: qq!pdf(file="$rocfile.$format",width=10,height=7$pointsize)!; my $title=$opt_title ? "'$opt_title'":"'$basename $pred ROC vs cutoff'"; my @pars; if($opt_notitle){ push(@pars,'mai=c(op$mai[1:2],rep(op$mai[4],2))'); $title=""; }elsif($opt_titlescale){ push(@pars,"cex.main=op\$cex.main*$opt_titlescale"); push(@pars,'mai=c(op$mai[1:2],op$mai[3]*'.$opt_titlescale.',op$mai[4])'); } my ($paron,$paroff); if(scalar(@pars)){ $paron="op <- par(no.readonly = TRUE);par(".join(",",@pars).")"; $paroff="par(op)"; } my $legendcex=$opt_legendscale ? $opt_legendscale:1; #do the R output open(my $R,">$rsrc"); print $R <$best; } return $best; } sub pickOne { my ($ps1,$ps2,@opts)=@_; print $ps1."\n"; print map {($_==0 ? 
"[$_]":" $_ ").": $opts[$_]\n"} 0..$#opts; my $res; do{ print $ps2; $res=; chomp $res; }while($res && ($res<0 or $res>$#opts)); return $opts[$res]; } __END__ =head1 NAME roc-at-cutoff.pl - computes ROC/sensitivity/specificity at various/all thresholds =head1 SYNOPSIS cbtest.pl [input2 ...] =head1 OPTIONS Input file should be some alignment that has as a .rocr file (presumably generated by filter.pl --rocr). Output is graphed to .cutoff.roc.png. sensitivity/specificity requires a false negative count, and as such requires a .filterstats file. Other options: --stat|predictor|x specifies what stat to use as a predictor --samples|n sets number of samples (default is sample at each possible cutoff) --maxsamples|maxn sets a max for n/samples --clean forces a re-sampling of the .rocr file --log can apply log scale to x or y or xy axes --fn can specify a FN count for calculating sensitivity --avg preload averages (normally found from filterstats) (note: averages no drawn by default, so specify --avg (with no args) to enable them) --noavg don't plot the line for averages --lwd can specify line weight --format can specify output file format (default png) --pdf set format to pdf --nofstats disable searching for filterstats statistics --nofn pretend not to have fn statistics (ie: skip sensitivity) --fstats specify the filterstats file to use murasaki/maf2anchors.pl0000755000177700001440000001772611434752242014536 0ustar krispusers#!/usr/bin/perl -w #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . ################## ## convert mauve alignments to Murasaki anchor files -- krisp ################## use File::Basename; use Getopt::Long; use Pod::Usage; use Data::Dump qw{dump}; use IO::Handle; use strict; my ($help,$man,$align_type,$opt_prefix,$fakeSeqs,$trustTBA,$sortSeqs); $trustTBA=1; my $rifts=0; GetOptions('help|?' => \$help, man => \$man, 'prefix=s'=>\$opt_prefix, 'rifts=i'=>\$rifts,'fakeseqs'=>\$fakeSeqs, 'trust!'=>\$trustTBA, 'sort!'=>\$sortSeqs ); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my ($filename,$outfile)=@ARGV; my ($basename,$path,$suffix) = fileparse($filename); open(my $infh,"<",$filename) or die "Couldn't open $filename"; my $prefix=$opt_prefix ? $opt_prefix:$filename; my %seqs; my @allSeqs; local $\="\n"; local $,="\n"; my $state=0; our (%seqsSeen,%seqsLen,$seqsLocked); my $stanzaCount; if(!$trustTBA or !peekSeqs($infh,\%seqs,\@allSeqs)){ print "Reading MAF once to determine sequences used..."; while(<$infh>){ chomp; next if m/^\#/; if($state==0){ next unless m/^(\w) ?(.*)/; my %header=parseHeader($1,$2); my %stanza=parseStanza($infh,%header); $stanzaCount++ if $header{type} eq 'a'; #don't care about anything but alignment blocks } } @allSeqs=sort keys %seqsSeen; print "Read $stanzaCount stanzas about ".scalar(@allSeqs)." sequences"; }else{ print "Anticipating ".scalar(@allSeqs)." 
sequences (@allSeqs) based on TBA tag."; %seqsSeen=map {$_=>1} @allSeqs; } $seqsLocked=1; our $minSeqs=(scalar @allSeqs)-$rifts; my $anchorCount=0; print "Rereading and writing anchor data..."; seek($infh,0,0); $infh->input_line_number(1); open(my $outfh,">","$prefix.anchors"); my @stanzas; while(<$infh>){ chomp; next if m/^\#/; if($state==0){ next unless m/^(\w) ?(.*)/; my %header=parseHeader($1,$2); my %stanza=parseStanza($infh,%header); next unless $header{type} eq 'a'; #don't care about anything but alignment blocks foreach my $anchor (makeAnchors(\%stanza,grep {exists $stanza{$_}} @allSeqs)){ print $outfh $anchor; }continue{$anchorCount++} } } print "Made $anchorCount anchors."; print "Generating sequence data."; open(my $seqfh,">","$prefix.seqs"); if(!$fakeSeqs){ print $seqfh @allSeqs; }else{ my $seqprefix="$prefix.seqs"; foreach my $s (@allSeqs){ my $name=$s; my $len=$seqsLen{$s}; $name=~s/[^a-zA-Z0-9]/_/g; my $fakefile="$seqprefix.$name.fa"; print $seqfh $fakefile; open(my $fakeseqfh,">",$fakefile) unless -f $fakefile; #doesn't actually have to be used if($fakeseqfh){ print $fakeseqfh ">fake version of $s ($len)"; } foreach my $lenfile ("$fakefile.length","$fakefile.len"){ open(my $fakeseqlenfh,">",$lenfile) or die "Couldn't create $lenfile"; print $fakeseqlenfh $len; if($lenfile eq "$fakefile.len"){ print $fakeseqlenfh join("\t",$len,$s); } } } } sub makeAnchors { my ($stanza,@seqs)=@_; return () if scalar @seqs < $minSeqs; my %starts=map {$_=>$stanza->{$_}->{start}} @seqs; my %inGap=map {$_=>(substr($stanza->{$_}->{alignment},0,1) eq '-')} @seqs; my @anchors; my $done; my $length=0; my $alignLength=min(map {length($stanza->{$_}->{alignment}) } @seqs); die "Whacky incomplete alignment" if grep {length($stanza->{$_}->{alignment})!=$alignLength} @seqs; foreach my $offset (0..$alignLength){ my %oldGap=%inGap; %inGap=map {$_ =>(substr($stanza->{$_}->{alignment},$offset,1) eq '-')} @seqs; if(grep {$oldGap{$_} ne $inGap{$_}} @seqs){ #change in rift members 
push(@anchors,makeAnchor($stanza,\%starts,$length,\%oldGap,@seqs)); #start a new one foreach my $s (@seqs) { $starts{$s}+=$length unless $oldGap{$s}; } $length=0; } $length++; } #and finally we should be left the with the last trailing anchor push(@anchors,makeAnchor($stanza,\%starts,$length,\%inGap,@seqs)); return @anchors; } sub makeAnchor { my ($stanza,$starts,$length,$inGap,@seqs)=@_; our $minSeqs; return () unless (scalar grep {!$inGap->{$_}} @seqs)>=$minSeqs; return join("\t",map { (!exists $inGap->{$_} or $inGap->{$_}) ? (0,0,'+'):do { my $fStart=$starts->{$_}+1; #MAF is 0 based, so +1 my $fStop=$fStart+$length-1; #when MAF indicates a - strand, the coords are from the start of the reverse comp'd sequence, so requires messy calculation ($fStart,$fStop)=(($fStart-1)-$stanza->{$_}->{totalLen},($fStop-1)-$stanza->{$_}->{totalLen}) if $stanza->{$_}->{strand} eq '-'; ($fStart,$fStop,$stanza->{$_}->{strand})} } @allSeqs ); } sub min { my ($m,@l)=@_; foreach my $x (@l){ $m=$x if $x<$m; } return $m; } sub max { my ($m,@l)=@_; foreach my $x (@l){ $m=$x if $x>$m; } return $m; } sub getName { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $name } @_; return @ret if $#_; return $ret[0]; } sub parseHeader { my ($type,$bits)=@_; my %bits=map {m/(^[^=]+)=(.*)$/ ? ($1=>$2):()} split(/ /,$bits); return (%bits,type=>$type); } sub parseStanza { my ($infh,%header)=@_; my %alignments; our %seqsSeen; while(<$infh>){ chomp; last if length() <=0; my ($type,$bits)=m/^(\w) (.*)$/; next unless $type eq 's'; #don't care about anything but sequence lines my ($seq,$start,$len,$strand,$totalLen,$alignment)=split(/\s+/,$bits); $seqsLen{$seq}=$totalLen unless exists $seqsLen{$seq}; if($seqsLocked){ unless($seqsSeen{$seq}){ warn "Unexpected sequence ($seq) in MAF. 
Ignoring."; next; } }else{ $seqsSeen{$seq}++ unless $seqsSeen{$seq}; } $alignments{$seq}={start=>$start,len=>$len,strand=>$strand,alignment=>$alignment,totalLen=>$totalLen,%header}; } return %alignments; } sub peekSeqs { my ($infh,$seqs,$allSeqs)=@_; seek($infh,0,0); #reset file pointer my $l1=<$infh>; my $l2=<$infh>; seek($infh,0,0); #reset file pointer return undef unless $l1=~m/scoring=tba\.v\d+/; #this probably won't work unless it's TBA and I don't know about prior versions return undef unless $l2=~m/^\# tba\.v12 \((.+)\)/; my $tree=$1; $tree=~s/\(|\)//g; #strip actual tree data. we just want to know what seqs to expect @$allSeqs=split(/\s+/,$tree); @$allSeqs=sort @$allSeqs if $sortSeqs; return undef unless @$allSeqs; %$seqs=map {$_=>1} @$allSeqs; return @$allSeqs; } __END__ =head1 NAME maf2anchors -- Converts maf files into Murasaki anchor sets =head1 SYNOPSIS maf2anchors =head1 OPTIONS =over =item --prefix= use as a prefix for output file names (otherwise uses ) =item --rifts= allow up to rifts. =item --fakeseqs generate fake sequence length files and sequence stubs based on sequence length data in MAF file =item --trust trust the comment annotation added by TBA on the second line of the file (if present) that it lists tree used in the alignment and all of the sequences involved. Defaults to on. use --notrust to distrust TBA tags (technically this data isn't part of the file format specification and is just crammed into comments that TBA produces, but might not be present or accurate on all MAF files, thus we give you the option to distrust it). =item --sort Sort the sequences found by --trust (so as to match the output that would have happened with --notrust). =back murasaki/dna2binary.pl0000755000177700001440000000527111434752241014353 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use Getopt::Std; use File::Basename; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; $Getopt::Std::STANDARD_HELP_VERSION=true; getopts('DCncBrl'); if($opt_h){HELP_MESSAGE();exit(0);} ($filename,$outfile)=@ARGV; if($filename and -e $filename){ $inseq=`$root/geneparse.pl -c $filename`; my ($basename,$path,$suffix) = fileparse($filename); $name="$basename-revcomp"; } else { print STDERR "File $filename not found. Waiting for input from stdin.\n" unless !$filename or $filename eq "-"; while($_=){ chomp; $inseq.=$_; } $name="stdin"; } $outfile="-" unless $outfile; if(!$opt_C){ open(OUTF,">$outfile"); }else{ open(OUTF,"|$root/faformat.pl --name=\"$name\" - $outfile"); } if($opt_r or $inseq=~m/^[10]+$/){ %bases=("00" => "A", "01" => "C", "10" => "G", "11" => "T"); while($inseq=~m/([10]{2})/g){ # print "Parsing $` - $& - $'\n"; $outseq.=$bases{$1}; } $outseq=lc($outseq) if $opt_l; print OUTF "$outseq"; }else{ $inseq=~s/[ \.a-]/00/gi; $inseq=~s/c/01/gi; $inseq=~s/[gu]/10/gi; $inseq=~s/[t]/11/gi; $inseq=~s/[^01]//g unless $opt_n; print OUTF "Binary: " unless $opt_c; print OUTF "$inseq"; unless($opt_D){ print "\n" unless $opt_B; $inseq=bin2dec($inseq); print OUTF "Decimal: " unless $opt_c; print OUTF "$inseq"; } } close(OUTF); print "\n"; sub main::HELP_MESSAGE { print <] [] If you don't specify an infile or outfile, stdin/stdout is used. 
-C specifies pretty formatted output (can't imagine why...but ok) -c specifies clean output. (ie: no labels) -D skip decimal output -B skip binary output -n don't erase non-nucleotides (default is set blasters to kill) -r reverse (binary to bases) -l output lowercase ENDTEXT ; } sub main::VERSION_MESSAGE { } sub bin2dec { my $arg=shift; my ($pos,$ret)=(0,0); foreach (split(//,reverse($arg))){ $ret+=$_<<$pos; $pos++; } return $ret; } murasaki/phAlignment2anchors.pl0000755000177700001440000000726511434752241016235 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . ############### ## convert Pattern Hunter alignment to Murasaki Anchors # by krisp use File::Basename; use Getopt::Long; use Pod::Usage; #use Data::Dump qw{dump}; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; use strict; my ($help,$man,$align_type); our $flexible=0; our $signed=1; my $useAlignment; GetOptions('help|?' 
=> \$help, man => \$man); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; foreach my $file (@ARGV){ local $\="\n"; my ($basename,$path,$suffix) = fileparse($file); die "Input file not found: $file" unless -e $file; open(my $infh,$file) or die "Couldn't open $file"; my $prefix="$path$basename.aligned"; my $outf="$prefix.anchors"; open(my $otfh,">$outf") or die "Couldn't write to $outf"; my ($state,$strand,$length,$qi,$si,$links,$in,$qinc,$starts,$startq,$qx,$sx); $state='findStrand'; my $count=0; while(<$infh>){ if($state eq 'findStrand'){ next unless m!Strand = (\w+) / (\w+)!; $strand=$1; $qinc=($strand=~m/Minus/i ? -1:1); $state='readQuery'; $in=0; }elsif($state eq 'readQuery'){ chomp; # print "Query:$_" if $_=~m/\S/; if(m/Score =/i){ #ended a stanza # print "Hit the end!\n"; if($in){#were we midway through an anchor? # print "Making a final anchor: ".join(" ",$starts,$sx,'+',$startq,$qx,($strand=~m/Minus/i ? '-':'+'))."\n"; print $otfh join("\t",$startq,$qx,($strand=~m/Minus/i ? '-':'+'),$starts,$sx,'+'); $count++; } $state='findStrand'; next; } next unless m!Query:\s+(\d+)\s+(\S+)!; my ($qdat,$sdat)=($2); $qi=$1; $links=<$infh>; $_=<$infh>; m!Sbjct:\s+(\d+)\s+(\S+)! or die "Wtf? $_"; $si=$1; $sdat=$2; # print "Query $qi: $qdat\n", # "Subjt $si: $sdat"; #parse link bits chomp $links; $links=~s/^ //; #stupid white space based header my $inre=qr/[^\-]/; foreach my $i (0..(length($qdat)-1)){ my ($sc,$lc,$qc)=map {substr($_,$i,1)} ($sdat,$links,$qdat); $qx=$qi+$i*$qinc if $qc=~m/$inre/; $sx=$si+$i if $sc=~m/$inre/; if($qc=~m/$inre/ and $sc=~m/$inre/){ if($in){ #nothin to see here... }else{ # start a new anchor $startq=$qx; $starts=$sx; $in=1; } }else{ if($in){ #end of an anchor my $stopq=$qx; my $stops=$sx; # print "Making an anchor midway ($i -> '$qc,$lc,$sc): ".join(" ",$starts,$stops,'+',$startq,$stopq,($strand=~m/Minus/i ? '-':'+'))."\n"; print $otfh join("\t",$startq,$stopq,($strand=~m/Minus/i ? 
'-':'+'),$starts,$stops,'+'); $count++; $in=0; }else{ } #move along... } } }else{ die "Uh oh. Broken FSM!"; } } print "Done with $outf ($count anchors)"; } __END__ =head1 NAME phAlignment2anchors - converts a PatternHunter alignment into Murasaki anchor format =head1 SYNOPSIS mauveAlignment2anchors [input2 ...] =head1 OPTIONS Nothing. This is a very simple program. murasaki/antisense.pl0000755000177700001440000000366511434752242014321 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use Getopt::Std; use File::Basename; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; $Getopt::Std::STANDARD_HELP_VERSION=true; getopts('chRC'); if($opt_h){HELP_MESSAGE();exit(0);} ($filename,$outfile)=@ARGV; if($filename and -e $filename){ $inseq=`$root/geneparse.pl -c $filename`; my ($basename,$path,$suffix) = fileparse($filename); $name="$basename-revcomp"; } else { print STDERR "File $filename not found. 
Waiting for input from stdin.\n" unless !$filename or $filename eq "-"; while($_=){ chomp; $inseq.=$_; } $name="stdin"; } $outfile="-" unless $outfile; if($opt_c){ open(OUTF,">$outfile"); }else{ open(OUTF,"|$root/faformat.pl --name=\"$name\" - $outfile"); } $inseq=reverse($inseq) unless $opt_C; $inseq=~y/agtcAGTC/tcagTCAG/ unless $opt_R; print OUTF $inseq; close(OUTF); sub main::HELP_MESSAGE(){ print < [] ] If you don't specify an infile or outfile, stdin/stdout is used. -c specifies clean output (ie: just the sequence) -R specifies reverse ONLY -C specifies complement ONLY Now if you're wondering what mixing -R and -C do... ENDTEXT ; } sub main::VERSION_MESSAGE(){ } murasaki/cgr-image.pl0000755000177700001440000001574211434752242014162 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . ############### ## view histogram details as a CGR graph ############### use strict; use Image::Magick; use Getopt::Long; use Getopt::Std; use Pod::Usage; $Getopt::Std::STANDARD_HELP_VERSION=1; our ($highest,$lowest); our $res; our $lasttick; our $samples; our $detailfile; our $seedcount; our $noIntermediates; my ($help,$man); my $err=GetOptions('help|?' 
=> \$help, man => \$man,"samples=i"=>\$samples,"noIntermediates|1"=>\$noIntermediates); pod2usage(1) if $help or $#ARGV<0 or !$err; pod2usage(-exitstatus => 0, -verbose => 2) if $man; foreach $detailfile (@ARGV){ print "Loading seedlist from $detailfile...\n"; open(INF,"<$detailfile"); my @pixels; my $sample; my @keyl; while(){ my ($keyn,$keya,$count)=split(/\s/,$_); $keya=~s/\.//g; push(@keyl,[$keya,$count]); $seedcount+=$count; } print "$seedcount seeds total. Mode switch predicted at sample ".log(sqrt($seedcount))/log(2)."\n"; close(INF); die "No keys!" unless $#keyl>=0; # print join("\n",@keyl),"\n"; my $sample=$keyl[0][0]; print "Available samples: ".(length($sample))."\n"; $samples=(length($sample)) unless $samples and $samples<=(length($sample)); $res=2<<($samples-1); my @graph=growGraph(1,\@keyl,0); print "Writing $res x $res image to file $detailfile.cgr.png...\n"; sneakyImage("$detailfile.cgr.png",\@graph); } sub grabCount { my ($key,$keylr)=@_; my $count; foreach(grep {$_[0]=~m/$key\$/} @$keylr){ $count+=$_[1]; } return $count; } sub pngGraph { print "(using imagemagick pixels may be slow)\n"; my ($filename,$graph)=@_; my $image=Image::Magick->new(); $image->Set(size=>join("x",$res,$res)); $image->Read('xc:white'); my $err; # $err=$image->set(type=>'Grayscale'); warn $err if $err; my ($i,$max)=(0,$res*$res); $lasttick=time; foreach my $y (0..($res-1)){ foreach my $x (0..($res-1)){ my $v=255-int(255*(($$graph[$y][$x])/$highest)); $v=sprintf('%02x',$v); $v="#".$v.$v.$v; $err=$image->Set("pixel[$x,$y]"=>$v); warn $err if $err; ticker($i,$max); $i++; } } $err=$image->Write(filename=>$filename); warn $err if $err; } sub ticker { if(time>$lasttick+5){ $lasttick=time; print int($_[0]/$_[1]*100)."%...\n"; } } sub growGraph { my ($level,$keylr,$old_graphr)=@_; print "Subsample level $level\n"; my @graph=doubleGraph($old_graphr); $highest=0; foreach my $keyd (@$keylr){ my ($key,$count)=(substr($$keyd[0],-$level,$level),$$keyd[1]); my @coords=key2coords($key); 
$graph[$coords[0]][$coords[1]]+=$count; $highest=$graph[$coords[0]][$coords[1]] if $graph[$coords[0]][$coords[1]]>$highest; } sneakyImage("$detailfile.cgr.$level.png",\@graph) unless $noIntermediates; return (length($$keylr[0][0])>$level and $level<$samples) ? growGraph($level+1,$keylr,\@graph):@graph; } sub doubleGraph { my $src=pop; return ([0,0],[0,0]) unless ref($src) eq "ARRAY"; my @ret; my $buckets=@$src; $buckets*=$buckets; my $redist=$buckets<$seedcount; print $redist ? "Doubling graph and distributing...\n":"Doubling graph...\n"; foreach my $row (@$src){ if($row){ my @drow; foreach(@$row){ $_>>=2 if $redist; push(@drow,$_,$_); } push(@ret,\@drow,[@drow]); }else{ push(@ret,[],[]); } } @{$src}=(); #we can clear the old map after that. return @ret; } sub selectnext { my @ret; foreach(qw{G C A T}){ push(@ret,[map {substr($_,0,length($_)-1)} grep(/$_\$/,@_)]); } return @ret; } sub findLow { my $graph=shift; $lowest=$highest; foreach my $row (@$graph){ if($row){ foreach my $val (@$row){ $lowest=$val if $val<$lowest; } } } return $lowest; } sub firstOver { my $limit=shift; for(0..$#_){ return $_ if $limit<$_[$_]; } return -1; } sub firstUnder { my $limit=shift; for(0..$#_){ return $#_-$_ if $limit>$_[$#_-$_]; } return -1; } sub findNiceRange { my $graph=shift; my @uberlist; foreach my $row (@$graph){ push(@uberlist,@$row) if $row; } @uberlist=sort {$a<=>$b} @uberlist; $highest=$uberlist[$#uberlist]; $lowest=$uberlist[0]; return ($uberlist[int($#uberlist*.01)],$uberlist[int($#uberlist*.99)+1]); } sub writePGM { #holy crap PGMs are easy my ($filename,$graph)=@_; open(PGM,">$filename") or die "Couldnt write to $filename"; my $localres=@$graph; my $maxout=65535; print PGM <$maxout; print PGM $v." 
"; } print PGM "\n"; ticker($y,$localres-1); } close PGM; } sub sneakyImage { my ($filename,$graph)=@_; #make PGM version print "Writing uncompressed PGM version...\n"; writePGM("$filename.pgm",$graph); print "Converting PGM to $filename with ImageMagick.\n"; #make PNG version my $image=Image::Magick->new(); my $err=$image->Set(depth=>16); #this seems to be ignored anyway warn $err if $err; # $image->Set(size=>join("x",$res,$res)); $image->Read("$filename.pgm"); $image->Write($filename); print "Erasing temporary PGM file.\n"; system("rm $filename.pgm"); } sub showmap { return map {join(" ",@$_)."\n"} @_; } sub key2coords { my $key=shift; $key=reverse $key; my @coord=(0,0); my %field=(C => [0,0], G => [0,1], A => [1,0], T => [1,1]); foreach my $bit (split(//,$key)){ next if $bit eq '.'; $coord[0]<<=1; $coord[1]<<=1; my @add=@{$field{$bit}}; $coord[0]+=$add[0]; $coord[1]+=$add[1]; } return @coord; } sub HELP_MESSAGE(){ pod2usage(1) } __END__ =head1 NAME cgr-image.pl -- Chaos Graph Representation histogram viewer =head1 SYNOPSIS cgr-iamge.pl [options] [alignment2.histogram.details> ...] =head1 OPTIONS --samples=n -> limits to n subsamples (remember resolution is 2^n) --nointermediates|1 => skip all the intermediate outputs =cut murasaki/Makefile0000755000177700001440000000043511434752242013423 0ustar krispusers# Handy Makefile # Kris Popendorf 2005/10/14 # Murasaki project #one Makefile to rule them all... all: $(MAKE) -C src all #one Makefile to bring them all... clean: -rm -rf *.o *~ .*~ core $(MAKE) -C src clean #and in the darkness... depend: $(MAKE) -C src depend #bind themmurasaki/align2kog.pl0000755000177700001440000000755411434752241014205 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use Getopt::Long; use Pod::Usage; use File::Basename; #use Data::Dump qw {dump}; use strict; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; use Murasaki::KOG; my ($man,$help,$makecds); my (@kogmap,@seqnames); my $opterr= GetOptions('help|?' => \$help, man => \$man,makecds=>\$makecds, 'kog=s%' => sub {pod2usage({-msg=>"Need values for --kog $_[1]=$_[2] (lhs and rhs!)"}) unless(defined $_[1] and $_[2]); pod2usage({-msg=>"Bad --kog $_[1]= (lhs)"}) unless(defined $_[1] and $_[2] and $_[1]>=0); pod2usage({-msg=>"Bad --kog $_[1]=$_[2] (rhs)"}) unless(defined $_[1] and $_[2] and $_[1]>=0 and $_[2]); my $newname=scalar(Murasaki::KOG->guessKogMember($_[2])); $kogmap[$_[1]]=$newname ? $newname:$_[2]} ); pod2usage(1) if $help; pod2usage(-exitstatus => 0, -verbose => 2) if $man; pod2usage({-verbose=>1,-exitval=>2,-message=>'Need some input file...'}) if $#ARGV<0; my ($inf,$output)=@ARGV; my ($name,$path,$suffix) = fileparse($inf, qr{\.[^.]*}); $output="$path/$name.kog" if !$output; warn "Warning: going to overwrite $output" if -f $output; open(our $outf,">$output") or die "Couldn't open $output for writing."; makeNames("$path/$name.seqs"); print "Using these kognames: @kogmap\n"; open(my $infh,$inf) or die "Couldn't open $inf"; my $line=0; while(<$infh>){ chomp; my @dats=split(/\t/); print $outf "[?] 
$line Alignment-derived kog\n"; foreach my $seqid (0..$#kogmap){ my ($start,$stop,$strand)=@dats[map {$seqid*3+$_} (0..2)]; ($start,$stop)=(0-$stop,0-$start) if $start<0; $strand=($strand eq '+' ? 1:-1); my $locii="$name:$seqid:$line"; my $spec=$kogmap[$seqid]; print $outf " $spec: $locii\n"; } print $outf "\n"; }continue{$line++} close $outf; our $root; if($makecds){ my $seqf=repSuffix($inf,".seqs"); open(my $seqfh,$seqf); foreach my $file (<$seqfh>){ chomp $file; print "Making CDS for $file...\n"; system("$root/getcds.pl --redirect murasaki_synth=$inf $file"); } } exit; sub makeNames { my $seqdata=shift; open(my $seqfh,$seqdata) or die "Couldn't read sequence file"; my $kogre=join("|",(Murasaki::KOG->knownKogs,Murasaki::KOG->knownCogs)); my $i=0; my %aliases=Murasaki::KOG->commonAliases; foreach(<$seqfh>){ chomp; push(@seqnames,getName($_)); next if $kogmap[$i]; my ($id)=m/($kogre)/; unless($id){ #try harder... foreach my $alias (keys(%aliases)){ $id=$aliases{$alias} if (m/$alias/i); } } print "Identified $_ as KOG member $id\n" if $id; $id="a$i" unless $id; $kogmap[$i]=$id; }continue{$i++} @kogmap=@kogmap[0..($i-1)] unless scalar(@kogmap)==$i; } sub getName { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $name } @_; return @ret if $#_; return $ret[0]; } __END__ =head1 NAME align2kog.pl - converts an alignment into a kog file =head1 SYNOPSIS align2kog.pl [options] [output file] (make sure to use the murasaki_synth redirect in getcds to generate the corresponding cds) =head1 OPTIONS [Output file] defaults to input.kog (for an input of input.anchors). No options exist yet. murasaki/dnashuffle.pl0000755000177700001440000000362311434752241014440 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use Getopt::Std; use List::Util shuffle; use File::Basename; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; $Getopt::Std::STANDARD_HELP_VERSION=true; getopts('chn'); if($opt_h){HELP_MESSAGE();exit(0);} ($filename,$outfile)=@ARGV; if($filename and -e $filename){ $inseq=`$root/geneparse.pl -c $filename`; my ($basename,$path,$suffix) = fileparse($filename); $name="$basename-shuffled"; } else { print STDERR "File $filename not found. Waiting for input from stdin.\n" unless !$filename or $filename eq "-"; while($_=){ chomp; $inseq.=$_; } $name="stdin"; } $outfile="-" unless $outfile; if($opt_c){ open(OUTF,">$outfile"); }else{ open(OUTF,"|$root/faformat.pl --name=\"$name\" - $outfile"); } $inseq=~s/[^atgc]//gi unless $opt_n; print OUTF join("",shuffle(split(//,$inseq))); sub main::HELP_MESSAGE(){ print <] [] If you don't specify an infile or outfile, stdin/stdout is used. -c specifies clean output (ie: just the sequence) -n specifies to preserve non-atgc data. (Default behaviour is to eliminate them). ENDTEXT ; } sub main::VERSION_MESSAGE(){ } murasaki/histocomp.pl0000755000177700001440000001252211434752241014324 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use File::Basename; use Getopt::Long; use Pod::Usage; #use Data::Dump qw{dump}; use strict; my ($help,$man,$opt_prefix); BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my $samples; my $fn; my $format="png"; my $lwd=3; my ($opt_log,$opt_clean,$opt_nofstats,$fstats,%avg,$drawAvg,$maxsamples,$nofn,$outfile,$opt_names,$opt_xlab,$opt_ylab,$opt_title); $opt_log='xy'; my ($width,$height,$res)=(10,7,96); GetOptions('help|?' => \$help, man => \$man, 'log=s'=>\$opt_log, clean=>\$opt_clean, 'format=s'=>\$format, pdf=>sub{$format='pdf'},'lwd=f'=>\$lwd, 'output=s'=>\$outfile, 'names=s'=>\$opt_names, 'xlab'=>\$opt_xlab,'ylab=s'=>\$opt_ylab, 'res=f'=>\$res,'width=f'=>\$width,'height=f'=>$height, 'title=s'=>\$opt_title ) or pod2usage(1); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my @names=getName(@ARGV); my $type=(scalar(grep(/rank/i,@ARGV))==scalar(@ARGV)) ? 
"rank":"buckets"; $outfile=((fileparse($ARGV[0]))[1]).join("-",@names).".histogram.$type.$format" unless $outfile; my ($basename,$path,$suffix)=fileparse($outfile,qr/\.[^.]/); $path=~s!/$!!; #kill trailing / if any my $basefile="$path/$basename"; print "Writing output to $basefile\n"; my %infh; foreach my $file (@ARGV){ open($infh{$file},$file) or die "Couldn't open $file"; } my $datafile="$basefile.combined"; if($opt_clean or !-f $datafile){ print "Merging input files...\n"; open(my $datafh,">$datafile"); print $datafh join("\t",@names)."\n"; my $rank=1; while(scalar(keys(%infh))){ my $first=1; foreach my $file (@ARGV){ my ($junk,$good); if(defined($infh{$file})){ my $line=readline($infh{$file}); do{print "$file done\n";delete $infh{$file}} unless $line; chomp $line; # print "Got $line from $file\n" if $line; ($junk,$good)=split(/\t/,$line); } $good="0" unless $good; if($first){ print $datafh $good; }else{ print $datafh "\t".$good; } $first=undef; } }continue{$rank++;print $datafh "\n";} close($datafile); }else{ print "Reusing existing data file\n"; } my @legendTerms=($opt_names ? split(/,/,$opt_names):@names); my $legendpos="1,max(yl)/22+min(yl)"; my $outputter=$format ne 'pdf' ? qq!bitmap(file="$outfile",type="png16m",width=$width,height=$height,res=$res)!: qq!pdf(file="$outfile",width=$width,height=$height)!; my $rsrc="$outfile.R"; my $title=$opt_title ? $opt_title:(join(" ",@names)." $type histogram"); my $xlab=$opt_xlab ? $opt_xlab:($type eq "rank" ? "Rank":"Bucket size"); my $ylab=$opt_ylab ? $opt_ylab:"Frequency"; my $type='l'; #do the R output my $pch="'o'"; open(my $R,">$rsrc"); print $R <0) { #if only 1 x... $color=1; } push(@colors,$color); if($i==1){ print $R <$best; } return $best; } sub pickOne { my ($ps1,$ps2,@opts)=@_; print $ps1."\n"; print map {($_==0 ? 
"[$_]":" $_ ").": $opts[$_]\n"} 0..$#opts; my $res; do{ print $ps2; $res=; chomp $res; }while($res && ($res<0 or $res>$#opts)); return $opts[$res]; } sub getName { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\..*}); $name } @_; return @ret if $#_; return $ret[0]; } __END__ =head1 NAME histocomp.pl -- plot multiple histograms on one graph =head1 SYNOPSIS histocomp.pl [input2 ...] =head1 OPTIONS Plot a couple histograms on one graph Other options: --log can apply log scale to x or y or xy axes --lwd can specify line weight --format can specify output file format (default png) --pdf set format to pdf --output specify a different output (otherwise it's autonamed from the inptus) murasaki/rearrange.pl0000755000177700001440000000575311434752241014275 0ustar krispusers#!/usr/bin/perl -w #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use File::Basename; use Getopt::Long; use Pod::Usage; #use Data::Dump qw{dump}; use POSIX qw{floor}; use strict; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my ($help,$man,$opt_output,$noRifts); GetOptions('help|?' => \$help, man => \$man, 'output|o=s'=>\$opt_output,'norifts'=>\$noRifts); pod2usage(1) if $help or $#ARGV<1; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my ($infname,@permute)=@ARGV; my $base=fileparse($infname,qr/\.anchors/); die "$infname not found." 
unless -f $infname; die "Only works on .anchor files." unless $infname=~m/^(.*)\.anchors$/; my $prefix=$1; my $seqs="$prefix.seqs"; die "No .seqs file found.\n" unless -f $seqs; my $oprefix=$opt_output ? $opt_output:"$prefix.".join("-",@permute); my @seqs=split("\n",slurp($seqs)); #check existing @permute to make sure values are valid foreach my $i (0..$#permute){ die "Bad permutation value $permute[$i] at permutation[$i]" unless $permute[$i]<=(scalar @seqs) and $permute[$i]>0; } @permute=map {$_-1} @permute; #switch from 1-index to 0-indexed my ($oseqs,$outf)=("$oprefix.seqs","$oprefix.anchors"); foreach my $f ($oseqs,$outf){print "Warning: overwriting existing $f\n" if -f $f;} open(my $infh,$infname) or die "Couldn't open $infname for reading."; open(my $outfh,">$outf") or die "Couldn't open $outf for writing."; open(my $oseqsh,">$oseqs") or die "Couldn't open $oseqs for writing."; foreach my $p (0..$#permute){ print $oseqsh $seqs[$permute[$p]]."\n"; } close $oseqsh; print "Wrote $oseqs\n"; while(my $line=<$infh>){ my @anchors; while($line=~m/(\S+\t\S+\t\S+)/g){ my $anchor=$1; if($noRifts){ my ($start,$stop,$sign)=(split(/\t/,$anchor)); goto NEXTLINE if $start==0 or $stop==0; } push(@anchors,$anchor); } print $outfh join("\t",map {$anchors[$_]} @permute)."\n"; NEXTLINE: } close $outfh; print "Wrote $outf\n"; sub slurp { local $/; open(my $fh,"@_") or return; return <$fh>; } __END__ =head1 NAME rearrange.pl -- rearrange an anchors file according to some rules =head1 SYNOPSIS rearrange.pl eg. rearrange.pl myalign.anchors 2 3 1 4 =head1 OPTIONS --output - force output to go to some particular file (otherwise it's automatically derived from the input filenames) murasaki/interpolate.pl0000755000177700001440000001110311434752242014640 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . ############### ## chop anchors into tiny bits ## by krisp ###### use Getopt::Long; use Pod::Usage; use File::Basename; use strict; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my ($help,$man,$maxSize); my $opt_res=GetOptions('help|?' => \$help, man => \$man, 'size=i' => \$maxSize); pod2usage(-exitstatus => 0, -verbose => 2) if $man; pod2usage(-exitstatus => 0, -verbose => 99, -section=>"SYNOPSIS|OPTIONS|DESCRIPTION") if $help; pod2usage() if !@ARGV or !$opt_res; my $geneparse="$root/geneparse"; $geneparse="$root/geneparse.pl" unless -x $geneparse; foreach my $file (@ARGV){ my ($name,$dir,$ext)=fileparse($file,qr/\.[^\.]+/); die "Need an .anchors file" unless $ext eq '.anchors'; local $\="\n"; open(my $infh,$file) or die "Couldn't open $file"; open(my $seqinfh,"$dir$name.seqs") or die "No seqfile $dir$name.seqs"; my @seqs=grep {m/\S/} (map {chomp;$_} <$seqinfh>); unless($maxSize>0){ $maxSize=int(min(map {`$geneparse -l -c $_`/1000} @seqs)); print "MaxSize set automatically to $maxSize"; } my $prefix="$dir$name.interp-$maxSize"; print "Writing to $prefix.anchors"; open(my $otseqfh,">$prefix.seqs") or die "Couldn't open $prefix.seqs"; print $otseqfh join("\n",@seqs); open(my $otfh,">$prefix.anchors") or die "Couldn't open $prefix.seqs"; while(my $line=<$infh>){ chomp $line; my @bits=split(/\t/,$line); my @meta=@bits[scalar(@seqs)*3..$#bits]; 
@bits=@bits[0..(scalar(@seqs)*3-1)]; my @anchors=toAnchors(scalar(@seqs),@bits); my @lens=map {ancLength($_)} @anchors; # print join(" ",map {ancToString($_)} @anchors)." -> ".max(@lens); if(max(@lens)>$maxSize){ #break into chunks my $factor=$maxSize/max(@lens); # print "Splitting @bits into ".int(1/$factor)." chunks (each: ".join(",",map {int($_*$factor)} @lens).")\n"; my $pi=0; while($pi<1){ $pi=1 if($pi>1); my @rescaled=(map { my $len=$lens[$_]; my $anc=$anchors[$_]; # print "$pi: Seq $_: offset: ".($pi*$len)."\n"; { start=>int($anc->{start}+$pi*$len), stop=>int($anc->{start}+(min($pi+$factor,1)*$len)), sign=>($anc->{sign}) } } 0..$#anchors); foreach my $i (0..$#rescaled){ my $test=$rescaled[$i]; # die "Anchor bit $i too big ($pi+$factor):".ancToString($anchors[$i])."(".ancLength($anchors[$i]).")->".ancToString($test)."(".ancLength($test).")" if ancLength($test)>$maxSize; # die "Anchor bit $i too small ($pi+$factor):".ancToString($anchors[$i])."(".ancLength($anchors[$i]).")->".ancToString($test)."(".ancLength($test).")" if ancLength($test)<=1; } print $otfh join("\t",map {(@{$_}{qw{start stop sign}})} @rescaled); }continue{$pi+=$factor} }else{ #output as is print $otfh join("\t",@bits,@meta); } } } sub ancToString { my ($a)=@_; return '['.($a->{start}.'~'.$a->{stop}).']'; } sub ancLength { my ($a)=@_; return ($a->{stop}-$a->{start}); } sub toAnchors { my ($count,@bits)=@_; my @res; foreach my $si (0..($count-1)){ my ($start,$stop,$sign)=map {$bits[$_+$si*3]} 0..2; die "Not an anchor $si: (@bits) -> ($start $stop $sign)" unless $start=~m/\d+/ and $stop=~m/\d+/ and $sign=~m/^[+-]$/; ($start,$stop)=($stop,$start) if $stop<$start; push(@res,{start=>$start,stop=>$stop,sign=>$sign}); } return @res; } sub sum { my $r=0; foreach my $i (@_){ $r+=$i; } return $r; } sub max { my ($r,@l)=@_; foreach my $v (@l){ $r=$v if $v>$r; } return $r; } sub min { my ($r,@l)=@_; foreach my $v (@l){ $r=$v if $v<$r; } return $r; } __END__ =head1 NAME interpolate.pl - chop anchors into 
arbitrarily sized tiny anchor bits, interpolate anchor break points based on individual anchor lengths. =head1 SYNOPSIS interpolate.pl [input2] =head1 OPTIONS --size= Specify max size for an anchor. By default, it's set 1/1000 of the longest input sequence. murasaki/mfasplit.pl0000755000177700001440000000270111434752241014134 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use File::Basename; use Getopt::Long; use Pod::Usage; GetOptions('help|?' => \$help, man => \$man); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 1) if $man; foreach $file (@ARGV){ my ($name,$path,$suffix) = fileparse($file, qr{\.[^.]*$}); system("mkdir $name") unless -d $name; open(INF,"<$file"); while(){ if(m/^>(.+)/){ $filename=$1; $filename.='.fa' unless $filename=~m/\.fa$/; close(OF) if OF; open(OF,">$name/$filename"); } print OF $_ if OF; } } __END__ =head1 NAME mfasplit.pl -- splits mfa (MultiFASTA) files into individual fa files =head1 SYNOPSIS mfasplit.pl [file2.mfa ...] =head1 OPTIONS =over 8 =back murasaki/chromostitch.pl0000755000177700001440000001046111434752242015026 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use File::Basename; use Getopt::Long; use Pod::Usage; use Roman; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my $forceRoman; my $noRoman; GetOptions('help|?' => \$help, man => \$man, 'output=s' => \$outfile, 'sortall' => \$sortall, 'noroman'=>\$noRoman,'roman'=>\$forceRoman); pod2usage(1) if $help or $#ARGV<1; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my $geneparse="$root/geneparse"; $geneparse="$root/geneparse.pl" unless -x $geneparse; $geneparse=`geneparse.pl` unless -x $geneparse; if(!$outfile){ #not yet specified by getopt $outfile=shift(@ARGV); #snag first arg } foreach $arg (@ARGV){ die "File not found: $arg" if !-e $arg; die "File not readable: $arg" if !-R $arg; if(-d $arg) { #directory parsing mojo print "Reading directory: $arg\n"; chop($arg) if $arg=~m!/$!; #chomp final / if there is one opendir(DIR,$arg); @files=grep {!/^\.|~$|^#.*#$|\.sml$|^README|\.stitch$|\.bin|\.p?hmask$/ and -T "$arg/$_"} readdir(DIR); @files=sort chrNameCmp @files; push(@srcfiles,map {"$arg/$_"} @files); closedir(DIR); }if(-r $arg) { push(@srcfiles,$arg); }else{ die "Can't read file: $arg"; } } $noRoman=grep {numbered($_)=~m/\d+/} @srcfiles unless $forceRoman; @srcfiles=sort chrNameCmp @srcfiles if $sortall; open(OUTF,">$outfile"); $pos=1; foreach $file (@srcfiles){ $length=genomeLength($file); print OUTF join("\t",$file,$length,$pos,$pos+$length)."\n"; $pos+=$length+10; } 
sub chrNameCmp { my ($ca,$cb)=(map {numbered($_)} $a,$b); my ($na,$nb)=(chrNum($ca),chrNum($cb)); return -1 if $na and !$nb; return 1 if $nb and !$na; return $na<=>$nb unless !($ca<=>$b); return $ca cmp $cb; #last resort is lexical compare } sub parseEnsemblName { my ($file)=@_; my ($filename,$dir)=fileparse($file); my ($species,$assembly,$release,$type,$chrom,$gz)=($filename=~m/([^.]+)\.(.+)\.(\d+)\.(dna(?:_rm)?)\.chromosome\.([^.]+)\.fa(\.gz)?/) or return undef; return {file=>$filename, species=>$species, assembly=>$assembly, release=>$release, type=>$type, chrom=>$chrom, compressed=>$gz}; } sub chrNum { my ($chrom)=@_; return arabic($chrom) if !$noRoman and isroman($chrom); return $1 if $chrom=~m/^(\d+)/; } sub numbered { local $_=pop; my $ensembl=parseEnsemblName($_); return $ensembl->{chrom} if parseEnsemblName($_); return $1 if m/(\d+)(?:\.[a-zA-Z]+)+?$/; return $1 if m/chr(?:omosome)(?:\W)*([ivxIVX0-9]+)/; return undef; } sub genomeLength { $file=pop; print $indent."Finding length of $file ...\n"; # print $indent."Command: $geneparse -c -l $file\n"; open(PARSE,"-|","$geneparse -c -l $file"); $seq=; close(PARSE); chomp($seq); #thou beist an int! return $seq; #omph } __END__ =head1 NAME chromostitch.pl - Makes stitch files for feeding to geneparse.pl to allow feeding of multiple chromosomes to programs like murasaki =head1 SYNOPSIS chromostitch.pl [options] =item Options: =item --sort sorts everything =item --output= altnerate way of specifying an output file =head1 PARAMETERS =over 8 =item B output file (overwrites) =item B List of chromosome input files =item B--output= Alternate way of specifying an output file =item B--sort Always sort EVERYTHING. 
(Allows for nicely sorted lists from sloppy input of filenames from like multiple directories of *.gbk or something) =back =head1 DESCRIPTION Makes stitch files for feeding to geneparse.pl to allow feeding of multiple chromosomes to programs like murasaki =cut murasaki/mbgd-to-kog.pl0000755000177700001440000000556311434752242014436 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . 
use Getopt::Long; use Pod::Usage; use File::Basename; use Data::Dump qw {dump}; use strict; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my $output=pop(@ARGV); if(!$output or (-f $output and formatCheck($output))){ push(@ARGV,$output) if $output; $output=">-"; } pod2usage({-verbose=>1,-exitval=>2,-message=>'Need some input file...'}) if $#ARGV<0; open(our $outf,">$output") or die "Couldn't open $output for writing."; foreach my $file (@ARGV){ open(my $inf,$file); my $line=<$inf>; chomp $line; my $headerline=$line; #keep this around because (as of recently) it occurs again at the end of the file for some dumb reason my @cols=split(/\t/,$line); unless($cols[$#cols]){ print STDERR "Removing stupid extra tab from header line.\n"; pop(@cols); } my @specs=@cols[1..($#cols-2)]; print STDERR "Using species: @specs\n"; while(<$inf>){ chomp; next if m/^$headerline/; my @dat=split(/\t/); die "Invalid number of dat ($#dat vs $#cols)! $_" unless $#dat==$#cols or $#dat+1==$#cols; my @locii=@dat[1..($#cols-2)]; my $func=$dat[$#cols-1]; my $gene=$dat[$#cols]; $gene="MBGD gene" unless $gene; print $outf "[$func] $dat[0] $gene\n"; foreach my $i (0..$#locii){ my $spec=$specs[$i]; foreach my $locii (split(/ /,$locii[$i])){ die "Weird entry: $locii" unless $locii=~m/$spec:(.*)/; $locii=$1; $locii=~s/MG(\d+)/MG_$1/; $locii=~s/MPN(\d+)/MPN_$1/; $locii=~s/\(\d+\)//; $locii=~s/\.\d+//; print $outf " $spec: $locii\n"; } } print $outf "\n"; } } sub formatCheck { my $file=pop; open(my $inf,$file) or return undef; my ($line1,$line2)=map {readline($inf)} (0,0); my @fields=map {scalar(split(/\t/,$line1))} ($line1,$line2); return 1 if $line1=~m/^ClusterID/ and ($fields[0]==$fields[1] or $fields[0]==($fields[1]+1)); return undef; } __END__ =head1 NAME mbgd-to-kog.pl - converts mbgd cluster files to kog files =head1 SYNOPSIS mbgd-to-kog.pl [options] [input2 ... 
] [output file] =head1 OPTIONS If "output file" exists, and is of the right format, it is considered an input file. No other options exist yet. murasaki/lav2murasaki.pl0000755000177700001440000001321211434752242014716 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use strict; use Getopt::Long; use Pod::Usage; use File::Basename; #use Data::Dump qw{dump}; our ($inf,$ungapped,$verbose); my ($help,$man); GetOptions('help|?' => \$help, man => \$man, ungapped => \$ungapped, 'verbose+'=>\$verbose); pod2usage(1) if $help or scalar(@ARGV)<1; pod2usage(-exitstatus => 0, -verbose => 4) if $man; $inf=shift(@ARGV); die "File ($inf) not found.\n" unless open(my $infh,$inf); my ($in_name,$in_path,$in_suffix) = fileparse($inf, qr{\.[^.]*}); our $basename="$in_path/$in_name"; our %outfh; open($outfh{anchors},">$basename.anchors"); open($outfh{details},">$basename.anchors.details"); open($outfh{seqs},">$basename.seqs"); open($outfh{bitscore},">$basename.anchors.bitscore") if $ungapped; unlink("$basename.anchors.bitscore") if !$ungapped and -f "$basename.anchors.bitscore"; our %info; our @seqs; our $anchorsWritten=0; my $state=0; while(<$infh>){ chomp; next unless $_; next if m/^\#/ && $state ne 0; if($state eq 0){ die "Not a LAV file." unless $_ eq "#:lav"; $state++; }elsif($state eq 1){ die "Expected a stanza..." 
unless m/(\w) \{/; print "Parsing a '$1' stanza...\n" if $verbose>2 or ($verbose and $1 ne 'a');; parseStanza($1,$infh); } } print "Writing sequence data...\n" if $verbose; writeSeqs(); print "Done. Anchors written: $anchorsWritten\n"; exit; sub parseStanza { my ($type,$infh)=@_; my $subline=0; local $_; my $state; my %alignment; while(<$infh>){ chomp $_; if($_ eq "}"){ writeAnchor($alignment{chunks},$alignment{score}) if($type eq 'a' and !$ungapped); last; } if($type eq 'd'){ s/^\s+//; $info{commandline}=$_; foreach my $i (0..4){ $info{matrix}.=<$infh>; } $info{vals}=<$infh>; chomp $info{matrix}; chomp $info{vals}; }elsif($type eq 's'){ m/^\s+\"([^"]*)" (\d+) (\d+) (\d+) (\d+)/ or die "$type stanza line $subline: Doesn't parse..."; push(@{$info{seqs}},$1); $seqs[$subline]={file=>$1,start=>$2,stop=>$3,strand=>$4,contig=>$5}; $seqs[$subline]->{file}=~s/-$// if $seqs[$subline]->{strand}>0; warn "Warning: $type stanza line $subline: Specified file not found..." unless -f $seqs[$subline]->{file}; }elsif($type eq 'h'){ m/^\s+"([^"]*)"/ or die "$type stanza line $subline: Couldn't find a description..."; push(@{$info{description}},$1); }elsif($type eq 'a'){ m/^\s+([sbel]) ((?:\d+ ?)+)/ or die "$type stanza line $subline: Weird alignment line: $_"; $alignment{score}=$2 if($1 eq 's'); if($ungapped){ next unless $1 eq 'l'; my @bits=split(/\s+/,$2); my (@chunks,$p); foreach my $i (0..$#seqs) { $chunks[$i]={start=>shift(@bits)}; } foreach my $i (0..$#seqs) { my $c=shift(@bits); $chunks[$i]->{stop}=$c; if($seqs[$i]->{strand}>0){ ($chunks[$i]->{start},$chunks[$i]->{stop})= ($seqs[$i]->{stop}-$chunks[$i]->{stop}, $seqs[$i]->{stop}-$chunks[$i]->{start}); } } $p=shift(@bits); writeAnchor(\@chunks,$p); }else{ next if $1 eq 'l'; my $field='start' if $1 eq 'b'; $field='stop' if $1 eq 'e'; my @bits=split(/\s+/,$2); $alignment{chunks}=[] unless exists($alignment{chunks}); foreach my $i (0..$#seqs) { my $c=shift(@bits); my $f=$field; ($c,$f) = ($seqs[$i]->{stop}-$c,$f eq 'start' ? 
'stop':'start') if $seqs[$i]->{strand}>0; $alignment{chunks}[$i]={} unless ref $alignment{chunks}[$i]; $alignment{chunks}[$i]->{$f}=$c; } } }elsif($type=~m/x|m/){ #dynamically repeatmasked regions. don't care. }else{ warn "Unknown stanza type $type\n"; } }continue{ $subline++; } print "Done parsing $type stanza\n" if $verbose>2 or ($verbose>1 and $type ne 'a'); } sub writeAnchor { my ($chunksr,$score)=@_; $anchorsWritten++; my @dats=map {($_->{start},$_->{stop},$_->{start}<0 ? "-":"+")} @$chunksr; my $bitscore=(int(($score/100*chunkLength(${$chunksr}[0])))); print {$outfh{anchors}} join("\t",@dats)."\n"; print {$outfh{details}} join("\t",@dats,$score)."\n"; print {$outfh{bitscore}} $bitscore."\n" if $ungapped; } sub chunkLength { my ($chunk)=@_; my $len=($$chunk{stop}-$$chunk{start})+1; warn "negative length? what's this crap? from ($$chunk{start} ~ $$chunk{stop})" unless $len>0; return $len; } sub writeSeqs { foreach my $seq (@seqs){ print {$outfh{seqs}} $seq->{file}."\n"; } } __END__ =head1 NAME lav2murasaki.pl -- convert a LAV file (ie: from Blastz) into a Murasaki set of files =head1 SYNOPSIS lav2murasaki.pl [options] =head1 OPTIONS --ungapped|-u => LAV files have "gapped regions" composed of "ungapped regions". Using this option creates 1 anchor for each ungapped region (this also provides a .bitscore file, as each ungapped region contains a percent identity field). --verbose|-v => More verbose output to stdout. Can be applied up to 3 times (anything beyond 1 gets messy though, as each anchor gets 1 line of output). =head1 COMMENTS You and Toadofsky make beautiful music! =cut murasaki/geneparse.pl0000755000177700001440000002027411434752241014273 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . ############### ## loads/reads generic sequences -- krisp ############### use File::Basename; use Getopt::Std; use POSIX qw{pow}; $Getopt::Std::STANDARD_HELP_VERSION=true; getopts('eclrfmhqsv'); #use -c/l my $verbose=$opt_v; my $ignoreBogus=getYesNo($ENV{MURASAKI_SR_IGNOREBOGUS}) if defined getYesNo($ENV{MURASAKI_SR_IGNOREBOGUS}); my $preserveIUPAC=getYesNo($ENV{MURASAKI_GP_KEEPIUPAC}) if defined getYesNo($ENV{MURASAKI_GP_KEEPIUPAC}); if($opt_h){HELP_MESSAGE();exit(0);} ($filename,$outfile)=@ARGV; $inpath=(fileparse($filename))[1] if $filename; ($filename,$start,$stop,$rangeonly)=($1,$2-1,$3,1) if $filename=~m/^(.*?)\[(\d+)\D(\d+)\]$/ and !-e $filename; if($filename and -e $filename){ my $openfile=$filename=~m/\.gz$/ ? "zcat $filename|":"<$filename"; print STDERR "Opening $openfile\n" if $verbose; open(INF,$openfile) or die "Couldn't read $filename"; } else { if(!($opt_s or $filename eq '-')){ print STDERR "Could not find file $filename.\nUse - as filename or -s to allow input from stdin\n"; exit(-1); } print STDERR "File $filename not found. Waiting for input from stdin.\n" unless $opt_q; open(INF,"-"); } if($outfile){ open(OUTF,">$outfile"); $opt_c=1; #never stick a newline on files... } else { open(OUTF,">-"); } $linenum=0; $nogenome=1 if $opt_l or $opt_e; my $lengthCache="$filename.length"; if($opt_l && -f $lengthCache){ #shortcut route! 
if(-M $filename > -M $lengthCache){ open(my $lenfh,$lengthCache); print OUTF <$lenfh>; exit; }else{ print STDERR "Length cache is old, refreshing...\n"; } } $length=sneakyLength($filename) if $opt_f; if(!$nogenome or ($nogenome and (!$opt_f or !$length))){ $rbytes=read(INF,$_,300); push(@toCheck,split(/\n/,$_)) unless (seek(INF,0,0)); #try to return to start, otherwise keep data if(m/^[AGTCUN]+$/i){ #its a raw print STDERR "Loading raw file...\n"; push(@toCheck,$_); goto LINES; } if(m/(\S+)\t(\d+)\t(\d+)\t(\d+)/){ #stitch format my ($startdir,$startcmd)=($0=~m!^(.*)/([^/]+)!); $startdir=$ENV{PWD}."/".$startdir unless $startdir=~m!^/!; #make absolute my $start=($startcmd ? "$startdir/$startcmd":"$0"); my $localdir=(fileparse($filename, qr{\.[^.]*}))[1]; my $totallength=0; print STDERR "Stitch file --- " unless $opt_q; push(@lines,); print STDERR " containing ".($#lines+1)." entries\n" unless $opt_q; @files=map {my ($name,$length,$start,$stop)=split(/\s+/,$_);$name} @lines; @files_dat=map {my ($name,$length,$start,$stop)=split(/\s+/,$_); {name=>$name, length=>$length, start=>$start, stop=>$stop}} @lines; $launch_opts=" -c" if $opt_c; $launch_opts.=" -l" if $opt_l; $launch_opts.=" -f" if $opt_f; $launch_opts.=" -e" if $opt_e; $launch_opts.=" -r" if $opt_r; $launch_opts.=" -q" if $opt_q; $launch_opts.=" -s" if $opt_s; $remaining=$#files; foreach my $filei (0..$#files){ my $file=$files[$filei]; my $file_dat=$files_dat[$filei]; # $file=join("",$inpath,$file) if $inpath; chdir $localdir unless -f $file; die "File $file not found in $localdir or $startdir..." 
unless -f $file; my $cmd="$start $launch_opts $file"; # print STDERR "Read $file via: $cmd\n"; if($opt_l){ $totallength+=$file_dat->{length}; $totallength+=10 if $filei!=$#files; }else{ print OUTF `$cmd`; print OUTF join('n' x 10,@parts) if $filei!=$#files; } } # print OUTF "nnnnnnnnnn" if $remaining--; #stick 10 n's between chromosomes if($opt_l){ cacheLength($totallength); print $totallength."\n"; }else{ print OUTF "\n" unless $opt_c } exit(0); } my $subseq=1; #doing it this way, we skip the first header here in this block while(1){#skip gbk/fasta header info while(@toCheck){ $_=shift(@toCheck); goto LINES if(m/^(ORIGIN\s*[^\w]?)$|^(\>.*?)$/); #signals start of genome }continue{$linenum++} $_=; push(@toCheck,split(/\n/,$_)); die "File ended before genome started?" unless defined $_; } LINES: print STDERR "Extracting genome starting at line $linenum\n" unless $opt_q; $length=0; { do { $rbytes=read(INF,$_,1024); push(@toCheck,split(/\n/,$_)); while(@toCheck){ $_=shift(@toCheck); goto DONE if m!^//!; if(m/(^>).*/){ s!(^>).*!NNNNNNNNNN!smg if $subseq; #insert 'N' x 10 at each new subsequence (to mimic stitch files) $subseq++; } s!(^;).*!!smg; #erase comments $_=~y/uU/tT/; if($ignoreBogus){ $_=~s/[^atgcn]+//ig; }else{ $_=~s/[^a-z]+//ig; #any non-alphabet codes willl stil be culled (technically IUPAC is only atgcnurymkwsbdhv but, whatever. this provides some extra flexibility). 
$_=~s/[^atgcn]/n/gi if(!$preserveIUPAC); } $_=~s/[atgc]/n/g if $opt_r; $chars=length($_); if(!$nogenome){ if($rangeonly){ if($length>=$start){ if($length+$chars<$stop){ print OUTF $_; }else{ print OUTF substr($_,0,$stop-$length); } } elsif($length+$chars>=$start){ print OUTF substr($_,$start-$length,$stop-$start); } }else{ print OUTF $_; } } $length+=$chars; goto DONE if $rangeonly and $length>=$stop; } } until(!$rbytes); } DONE: close(INF); } cacheLength($length); if($opt_l){ print "$length\n"; }else{ print OUTF "\n" if $ENV{TERM} and !$opt_c and !$opt_e; } if($opt_e){ $el=int(pow($length,.3)/2.5); $ew=int($el*.65); if($opt_m){ print "[$ew:$el]"; }else{ print "$ew $el"; } print "\n" unless $opt_c; } sub cacheLength { return if $rangeOnly; my $len=pop; open(my $lenfh,">$lengthCache") or return; print $lenfh "$len\n"; close $lenfh; } sub sneakyLength(){ ($filename)=@_; my ($name,$path,$suffix) = fileparse($filename, qr{\.[^.]*}); return 0 unless $suffix eq ".fa"; open(FILE,"<$filename"); $line1=; $line2=; chomp($line2); seek(FILE,-length($line2)*2,SEEK_END); while(){ $lastline=$_; chomp($lastline); } $lines=`wc $filename -l`; $len=length($line2)*($lines-2)+length($lastline);; print STDERR "Fast length detection succeeded: $len\n" unless $opt_q; return $len; } sub main::HELP_MESSAGE(){ print <:] format -q quiet operation (no notifications at all) -s allow input from stdin A neat trick is to use -m to specify a random pattern directly to Mursaki. For example: ./murasaki -p`./geneparse.pl -m -f -e -c seq/humanY.fa` Another trick: using the filename you can specify a start/stop point to extract from. for example: humanX.gbk[1025,3033] -- extracts humanX.gbk from base 1025 to 3033 (the range is inclusive and starts at 1) Behavior regarding what to do with non-ACGT codes can be set from the MURASAKI_SR_IGNOREBOGUS environment variable. If MURASAKI_SR_IGNOREBOGUS can be some value like "true/yes/1" or "false/no/0". If enabled non-ACGT are removed from the output stream. 
Default is disabled. ENDTEXT ; } sub main::VERSION_MESSAGE(){ } sub getYesNo { my ($v)=@_; return 1 if($v=~m/^(true|yes|1)$/); return 0 if($v=~m/^(false|no|0)$/); return undef; } murasaki/humantime.pl0000755000177700001440000000160211434752242014304 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use File::Basename; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my ($arg)=@ARGV; print readableTime($arg)."\n"; murasaki/faformat.pl0000755000177700001440000000443311434752242014121 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . 
############### ## formats sequences to pretty Fasta -- krisp ############### use Getopt::Long; use Pod::Usage; use File::Basename; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my $geneparse=getProg("geneparse"); die "Couldn't find useable geneparse program" unless $geneparse; our ($name,$width)=(undef,75); GetOptions('help|?' => \$help, man => \$man, "title|name=s" => \$name, "width=i" => \$width, ); pod2usage(1) if $help; pod2usage(-exitstatus => 0, -verbose => 4) if $man; ($filename,$outfile)=@ARGV; if($filename and -e $filename){ open(INF,'-|',"$geneparse -c $filename") or die "Couldn't load geneparse.pl"; my ($basename,$path,$suffix) = fileparse($filename); $name=$basename unless $name; } else { print STDERR "File $filename not found. Waiting for input from stdin.\n" unless !$filename or $filename eq "-"; open(INF,"-"); $name="stdin" unless $name; } if($outfile){ open(OUTF,">$outfile"); } else { open(OUTF,">-"); } print OUTF ">$name\n"; do{ $rbytes=read(INF,$_,$width); goto DONE unless $rbytes; s/\s//gmi; #kill all whitespace print OUTF $_."\n"; }until(!$rbytes); DONE: __END__ =head1 NAME faformat.pl -- Reformat a sequence into fasta format =head1 SYNOPSIS faformat.pl [options] [ []] =head1 OPTIONS =over 8 =item B<--name|--title> Sets the name to be provided on the first line of the fasta file. =item B<--width=> Column width to line wrap at (default 75). =back =head1 DESCRIPTION Reads in a sequence, and puts it out in FastA format. =cut murasaki/perlmodules/0000755000177700001440000000000011434752243014312 5ustar krispusersmurasaki/perlmodules/Murasaki.pm0000644000177700001440000000574711434752240016436 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . package Murasaki; use File::Basename; use strict; use base 'Exporter'; our ($root,@EXPORT,@EXPORT_OK); BEGIN { our $root; unless($root){ my $runby=$0; $runby=`which $runby` unless $runby=~m!/!; #unless absolute path, grab real path this way. unpleasant hack. open to suggestions. $root=(fileparse($runby))[1]; $root=$ENV{MURASAKI} if $ENV{MURASAKI}; } @EXPORT=qw{$root readableTime parseHumanTime getProg}; @EXPORT_OK=qw{$root repSuffix writeOut min max median readfile}; } sub repSuffix { my ($name,$path,$suffix) = fileparse($_[0], qr{\.[^.]*}); return $path.$name.$_[1]; } sub writeOut { my $filename=shift; open(my $ofh,">$filename") or die "Ack! 
Couldn't open $filename for writing!\n"; print $ofh @_; } sub readableTime { my $t=shift; my $str=""; my %r = ( d => int($t/(60*60*24)), h => int($t%(60*60*24)/60/60), m => int($t%(60*60)/60), s => $t % 60); join(" ",map { "$r{$_}$_" } grep { $r{$_} } qw{d h m s}); } sub parseHumanTime { my $total; my %r=( d => 24*60*60, day => 24*60*60, days => 24*60*60, hour => 60*60, hours => 60*60, h => 60*60, 'minute' => 60, 'minutes' => 60, 'm' => 60, 'second' => 1, 'seconds' => 1, 's' => 1, ); my $line=pop; my $tre=join("|",keys(%r)); if($line=~m/\d+($tre)/){ foreach (split(/\s+/,$line)){ m/(\d+\.?\d*)(\w)/ or warn "Not a time bit: $_"; warn "Unknown time unit: $2" unless exists($r{$2}); $total+=$1*$r{$2}; } }else{ while($line=~m/(\d+\.?\d*) ($tre)/g){ $total+=$1*$r{$2}; } } return $total; } sub getProg { my (@l)=@_; our $root; my (@r)=map {my $name=$_; my $p="$root/$name"; $p="$root/$name.pl" unless -x $p; -x $p ? $p:undef } @l; return @r if wantarray; return $r[0]; } sub readfile { my ($file)=@_; open(my $fh,$file); local $/; my $dat=<$fh>; if(wantarray){ return split(/\n/,$dat); }else{ return $dat; } } sub max { my ($m,@l)=@_; foreach my $x (@l){ $m=$x if $x>$m; } return $m; } sub min { my ($m,@l)=@_; foreach my $x (@l){ $m=$x if $x<$m; } return $m; } sub median { my (@l)=sort {$a<=>$b} @_; return $l[int($#l/2)]if(@l%2==1); return ($l[int($#l/2)]+$l[int($#l/2)+1])/2; } 1; murasaki/perlmodules/Murasaki/0000755000177700001440000000000011434752243016066 5ustar krispusersmurasaki/perlmodules/Murasaki/Ticker.pm0000644000177700001440000000266311434752240017651 0ustar krispusers#!/usr/bin/perl #Murasaki - multiple genome global alignment program #Copyright (C) 2006-2007 Keio University #(Kris Popendorf) (2006) # #This program is free software; you can redistribute it and/or #modify it under the terms of the GNU General Public License #as published by the Free Software Foundation; either version 2 #of the License, or (at your option) any later version. 
# #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program; if not, write to the Free Software #Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. package Murasaki::Ticker; use IO::Handle; use Exporter 'import'; @EXPORT_OK=qw{resetTick tick}; our $screenwidth=75; $screenwidth=$ENV{COLUMNS} if(defined($ENV{COLUMNS}) and $ENV{COLUMNS}>0); sub resetTick { my ($total,$div)=@_; our ($anchorCount,$screenwidth); $div=$screenwidth unless $div; $total=$anchorCount unless $total; our ($ticksper,$ticksleft)=(int($total/$div),int($total/$div)); print "|",(map {"-"} (3..$div)),"|\n"; } sub tick { our ($ticksper,$ticksleft); $ticksleft--; if(!$ticksleft){ print STDOUT '.'; STDOUT->flush(); $ticksleft=$ticksper; } } 1; murasaki/perlmodules/Murasaki/KOG.pm0000644000177700001440000001754511434752240017055 0ustar krispusers#!/usr/bin/perl #Murasaki - multiple genome global alignment program #Copyright (C) 2006-2007 Keio University #(Kris Popendorf) (2006) # #This program is free software; you can redistribute it and/or #modify it under the terms of the GNU General Public License #as published by the Free Software Foundation; either version 2 #of the License, or (at your option) any later version. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program; if not, write to the Free Software #Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
use strict; package Murasaki::KOG; #use Data::Dump qw{dump}; use Murasaki::Ticker qw{resetTick tick}; sub knownKogs {qw{ath ath dme hsa sce spo ecu osa aga ptr cfa mmu rno mgr ncr}} sub knownCogs {qw{Mth Mja Hbs Tac Tvo Pho Pab Afu Mka Mac Pya Sso Ape Sce Spo Ecu Aae Tma Nos Syn Dra Fnu Tpa Bbu Ctr Cpn Cgl Mtu MtC Mle Cac Sau Lin Bsu Bha Lla Spy Spn Uur Mpu Mpn Mge Pae Eco EcZ Ecs Ype Sty Buc Xfa Vch Hin Pmu Rso Nme NmA Hpy jHp Cje Ccr Atu Sme Bme Mlo Rpr Rco mga mge mpn}} sub commonAliases { (human=>'hsa',rice=>'osa',mosquito=>'aga',chimp=>'ptr',dog=>'cfa',mouse=>'mmu',mus=>'mmu',rat=>'rno', #official ones homo=>'hsa',sapien=>'hsa', #semi-official rhesus=>'mac', macaq=>'mac', cow=>'bov', oposs=>'pos', chicken=>'avi', #made up by me mx=>'mmu',hx=>'hsa',cx=>'ptr',qx=>'mac', #also made up by me (for chromosome x special case) avium=>'mta', bovis=>'mtb', m3=>'mpn',m6=>'mge',m8=>'mga' #made up by osana )} sub empty { my ($class,$kogname)=@_; $kogname="**empty**" unless $kogname; bless {kogs=>{},kogSrc=>$kogname,locToKog=>{}},$class; } sub kogFrom { my ($class,$kogSrc,$srcspec,$knownLocusr,$kogprefr)=@_; my $srcre=qr/.../; #default to all if($srcspec){ $srcre=join("|",@$srcspec); $srcre=qr/$srcre/; } my %kogs; open(my $infh,$kogSrc) or die "Couldn't open $kogSrc\n"; my $state=0; my $kog; my %unknownLocs; my %locToKog; while(<$infh>){ if($state==0){ chomp; m/\[(\S+)\] (\S+) (.*)/ or die "$kogSrc: unparsable KOG description at line $.: $_"; # print "loading kog: func $1 name $2 desc $3\n"; $kog={}; $kog->{func}=$1; $kog->{name}=$2; $kog->{desc}=$3; $state++; }elsif($state==1){ goto KOGDONE if m/^$/; m/^ ($srcre):\s+(.+)/ or next; my ($spec,$allloc)=($1,uc $2); foreach my $loc (split(/s+/,$allloc)){ do{$unknownLocs{$loc}++;next} unless (!$knownLocusr or exists $knownLocusr->{$loc}); #only load for known locuses $loc=~s/_\d+// unless $kogprefr->{underbar}; #this is some KOG crazy talk for domains or something $kog->{members}->{$loc}=$spec; } } next; KOGDONE: $state=0; next 
unless $kog->{members}; #skip unless it has some contents if($srcspec){ #got a set of species to require? my %met; foreach my $spec (values(%{$kog->{members}})){ $met{$spec}=1; } next if grep {!$met{$_}} @$srcspec; #no soup if you're missing one } #good to go! $kogs{$kog->{name}}=$kog; foreach my $loc (keys(%{$kog->{members}})){ $locToKog{$loc}=$kog; } } my $this=bless {kogs=>\%kogs,kogSrc=>$kogSrc,locToKog=>\%locToKog},$class; print "$kogSrc loaded. ".$this->summary."\n".(keys(%unknownLocs))." unknown locs\n"; print "Unknown locs: ".join(" ",keys(%unknownLocs))."\n" if $kogprefr->{echoUnknown}; return $this; } sub memberList { my ($self,$ref)=@_; my $kog=$self->isIn($ref); return () unless $kog; return keys(%{$kog->{members}}); } sub memberMap { my ($self,$ref)=@_; my $kog=$self->isIn($ref); return () unless $kog; return $kog->{members}; } sub isIn { my ($self,$ref)=@_; my $kog=$self->{locToKog}->{$ref}; return undef unless $kog; return $self->{locToKog}->{$ref}=$self->flatten($kog); } sub isOrtho { my ($self,$l1,$l2)=@_; my ($k1,$k2)=map {$self->isIn($_)} ($l1,$l2); return undef if !$k1 && !$k2; return $k1==$k2; } sub flatten { my ($self,$kog)=@_; return $kog unless $kog->{remapped}; return $kog->{remapped}=$self->flatten($kog->{remapped}); } sub pushHash { $_[0]={} unless ref $_[0] eq "HASH"; my $h1=shift; foreach my $h2 (@_){ while(my ($k,$v)=each(%$h2)){ $h1->{$k}=$v; } } } sub countKogs { my ($self)=@_; return scalar(keys(%{$self->{kogs}})); } sub countGenes { my ($self)=@_; return scalar(keys(%{$self->{locToKog}})); } sub summary { my ($self)=@_; return $self->countKogs." KOGs of ".$self->countGenes." genes"; } sub merge { my ($self,$kog1,$kog2)=@_; return $kog1 if($kog1 && !$kog2); return $kog2 if($kog2 && !$kog1); #no need to merge if only 1 exists return $self->anonKog if(!$kog1 and !$kog2); #no kogs at all? well make one! return $kog1 if ($kog1 == $kog2); #if they're the same kog, pick one... #ok, two different kogs exist and need to be merged. 
my ($smaller,$bigger)=($kog1->{members}<$kog2->{members} ? ($kog1,$kog2):($kog2,$kog1)); $smaller->{remapped}=$bigger; pushHash($bigger->{members},$smaller->{members}); delete $self->{kogs}->{$smaller->{name}}; #dont need that anymore return $bigger; } sub anonKog { my ($self)=@_; my $kogid=$self->{nextAnonKog}++; my $name="ANON$kogid"; my $anonkog={name=>$name}; $self->{kogs}->{$name}=$anonkog; return $anonkog; } sub add { my ($self,$p1,$p2)=@_; my ($l1,$l2)=map {(keys(%$_))[0]} ($p1,$p2); my ($kog1,$kog2)=map {$self->isIn($_)} ($l1,$l2); return if $kog1==$kog2 and $kog1 and $kog2; #be thee not silly! my $kog3=$self->merge($kog1,$kog2); pushHash($kog3->{members},$p1,$p2); $self->{locToKog}->{$l1}=$kog3; $self->{locToKog}->{$l2}=$kog3; } sub compare { my ($self,$other)=@_; my %loclist; print "Comparing KOGs from $self->{kogSrc} and $other->{kogSrc}\n"; foreach my $kog (values(%{$self->{kogs}}),values(%{$other->{kogs}})){ foreach my $member (keys(%{$kog->{members}})){ $loclist{$member}=1; } } print "Comparing KOG members...\n"; resetTick(scalar(keys(%loclist))); my @unmatched; my %res=(in1=>0,in2=>0,inboth=>0); foreach my $node (keys(%loclist)){ my %matching; foreach my $linked ($self->memberList($node)){ next if $linked eq $node; $matching{$linked}=1; } foreach my $linked ($other->memberList($node)){ next if $linked eq $node; if($matching{$linked}){ $res{inboth}++; delete $matching{$linked}; }else{ $res{in2}++; } } $res{in1}+=keys(%matching); # push(@unmatched,map {"$node~$_"} keys(%matching)); tick(); } # print "\nUnmatched:\n".join("\n",@unmatched)."\n"; $res{in0}=(scalar(keys %loclist)*(scalar(keys %loclist)-1))-$res{in1}-$res{in2}-$res{inboth}; print "\n"; return %res; } sub guessKogMember { my ($class,@src)=@_; our %learnedKogSpecs; my $kogre=join("|",knownCogs,knownKogs,keys %learnedKogSpecs); my $i=0; my %aliases=commonAliases; my @res; foreach(@src){ my ($id)=m/($kogre)/i; unless($id){ #try harder... 
foreach my $alias (keys(%aliases)){ $id=$aliases{$alias} if (m/$alias/i); } } return $id unless wantarray; push(@res,$id); } return @res; } sub learnKogSpecs { our %learnedKogSpecs; our %learnedFrom; my ($class,$kogSrc)=@_; return if exists $learnedFrom{$kogSrc}; my $state=0; open(my $infh,$kogSrc) or die "Couldn't open $kogSrc\n"; while(<$infh>){ if($state==0){ chomp; m/\[(\S+)\] (\S+) (.*)/ or die "$kogSrc: unparsable KOG description at line $.: $_"; $state++; }elsif($state==1){ goto KOGDONE if m/^$/; m/^ (\w+):\s+(.+)/ or next; $learnedKogSpecs{$1}->{$kogSrc}=1; $learnedFrom{$kogSrc}->{$1}=1; } next; KOGDONE: $state=0; } } 1; murasaki/perlmodules/Murasaki/OrthoList.pm0000644000177700001440000001043711434752240020355 0ustar krispusers#!/usr/bin/perl #Murasaki - multiple genome global alignment program #Copyright (C) 2006-2007 Keio University #(Kris Popendorf) (2006) # #This program is free software; you can redistribute it and/or #modify it under the terms of the GNU General Public License #as published by the Free Software Foundation; either version 2 #of the License, or (at your option) any later version. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program; if not, write to the Free Software #Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
use strict; package Murasaki::OrthoList; #use Data::Dump qw{dump}; use Murasaki::Ticker qw{resetTick tick}; sub empty { my ($class,$name,$kogmap,$knownLocs)=@_; bless {orthos=>{},name=>$name,knownLocs=>$knownLocs,spec2pos=>reverseMap($kogmap)},$class; } sub canonicalize { my ($members)=@_; return join("~",map {keys(%$_)} (sort {(values(%$a))[0] cmp (values(%$b))[0]} @$members)); } sub addBits { my ($basis,$next,@others)=@_; if($next){ my @bits; foreach my $bit (@$next){ push(@bits,addBits([@$basis,$bit],@others)); } return @bits; }else{ return $basis; } } sub addAll { my ($self,$memr)=@_; our @unfolded=addBits([],(ref $memr eq 'ARRAY') ? (@$memr): (values(%{reverseMap($memr)}))); foreach my $subset (@unfolded){ $self->{orthos}->{canonicalize($subset)}=1; } } sub reverseMap { my $arg=pop; my $ret={}; if(ref $arg eq 'ARRAY'){ for(0..$#$arg){ $ret->{$$arg[$_]}=$_; } }elsif(ref $arg eq 'HASH'){ for my $k (keys(%$arg)){ push(@{$ret->{$arg->{$k}}},{$k=>$arg->{$k}}); } }else{ print "Non-reversable: ".dump($arg)."....\n"; die "Non-reversable arg called on reverseMap...\n"; $ret=$arg; #ummmm } return $ret; } sub toSpecList { my ($self,$members)=@_; my @ret; foreach my $k (keys(%$members)){ do{print "Unknown species $k.\n"; return undef} unless exists($self->{spec2pos}->{$k}); $ret[$self->{spec2pos}->{$k}]=$members->{$k}; } return @ret; } sub orthosFromKog { my ($class,$kogs,$kogmap,$knownLocs)=@_; my $this=empty($class,$kogs->{kogSrc},$kogmap,$knownLocs); #this isnt C++. we can get our object up and going now ^^ print "Filling ortholist from KOGset\n"; foreach my $kogname (keys(%{$kogs->{kogs}})){ $this->addAll($kogs->{kogs}->{$kogname}->{members}) } print $kogs->{kogSrc}." converted to ortholog list. ".$this->summary."\n"; return $this; } sub summary { my ($self)=@_; my $count=scalar(keys %{$self->{orthos}}); return $count.($count==1 ? 
" ortholog":" orthologs"); } sub possibleOrthosCount { my ($self)=@_; my %count; foreach my $s (values(%{$self->{knownLocs}})){ $count{$s}++; } return 0 unless keys(%count); my $possible=1; foreach my $s (keys(%count)){ $possible*=$count{$s}; } return $possible; } sub addOrtho { my ($self,@set)=@_; foreach my $subset (addBits([],@set)){ $self->{orthos}->{canonicalize($subset)}=1; } } sub isOrtho { my ($self,@set)=@_; my %res; foreach my $subset (addBits([],@set)){ $res{($self->{orthos}->{canonicalize($subset)}) ? "yes":"no"}++; } return %res; } sub compare { my ($self,$other)=@_; my %done; my %res=(in1=>0,in2=>0,inboth=>0); print "Comparing ortholist members...\n"; @done{keys(%{$self->{orthos}})}=(1) x scalar keys(%{$self->{orthos}}); my @inboth; resetTick(scalar(keys(%{$other->{orthos}}))); # print "Done: ".join("\n",keys(%done))."\n"; foreach my $ortho (keys(%{$other->{orthos}})){ # print "Compare: $ortho\n"; if($done{$ortho}){ $res{inboth}++; push(@inboth,$ortho); delete $done{$ortho}; }else{ $res{in2}++; } tick; } $res{in1}+=scalar keys %done; # print "\nOrtho inboth:\n".join("\n+",@inboth)."\n"; # print "\nOrtho in source:\n".join("\n*",keys %{$self->{orthos}})."\n"; if($self->{knownLocs} == $other->{knownLocs}){#impossible to know otherwise $res{in0}=$self->possibleOrthosCount()-$res{in1}-$res{in2}-$res{inboth}; } print "\n"; return %res; } 1; murasaki/perlmodules/Murasaki/SeqFeatures.pm0000644000177700001440000000551511434752240020656 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. 
# #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . package Murasaki::SeqFeatures; use Exporter 'import'; @EXPORT=qw{readCds writeCds}; use Bio::SeqFeature::Generic; sub readCds { my $file=shift; open(my $infh,"<",$file); my $offset=shift; $offset=0 unless $offset; my @features; while(<$infh>){ my @a=split(/\s/,$_); unless($#a>2){ print "Couldn't parse line $file $.: $_\n"; next; } push(@features,new Bio::SeqFeature::Generic (-primary => CDS, -tag => {($a[0] ? (gene => $a[0]):()), ($a[4] ? (locus_tag=>$a[4]):())}, -start => $offset+$a[1], -end => $offset+$a[2], -strand => $a[3]) ); } close($infh); return @features; } sub writeCds { my $outf=shift; my ($gene_src,$locus_src)=("gene","locus_tag"); $locus_src=$common_overrides{locus} if $common_overrides{locus}; $gene_src=$common_overrides{gene} if $common_overrides{gene}; open(my $outfh,">",$outf) or die "Couldn't open $outf for writing"; foreach(@_){ my ($name,$locus); $name=join(";",$_->get_tag_values($gene_src)) if $_->has_tag($gene_src); $name=join(";",$_->get_tag_values('gene')) if $_->has_tag('gene') && !$name; $name=join(";",$_->get_tag_values('protein_id')) if $_->has_tag('protein_id') && !$name; $name=join(";",$_->get_tag_values('product')) if $_->has_tag('product') && !$name; $name=$common_overrides{setName} if exists $common_overrides{setName}; $locus=join(";",$_->get_tag_values($locus_src)) if $_->has_tag($locus_src); if($common_overrides{extract} and $locus){ my $pat=$common_overrides{extract}; $locus=~m/$pat/; $locus=$1 if $1; } if($common_overrides{prefix} and $locus){ #I'm looking at you EcZ.... $locus=$common_overrides{prefix}.$locus; } if($common_overrides{autoenum}){ #Cgl tRNA. 
no locus names our $lastLocusId; $locus.=$common_overrides{autoenum}.++$lastLocusId; } ($name,$locus)=map { $_ ? $_:"" } ($name,$locus); foreach($name,$locus){ $_=~s/\s/_/g }; print $outfh join("\t",$name, $_->location->start, $_->location->end, $_->location->strand, $locus),"\n"; } close($outfh); } 1; murasaki/perlmodules/Murasaki/OrthoConsistency.pm0000644000177700001440000001575211434752240021750 0ustar krispusers#!/usr/bin/perl #Murasaki - multiple genome global alignment program #Copyright (C) 2006-2007 Keio University #(Kris Popendorf) (2006) # #This program is free software; you can redistribute it and/or #modify it under the terms of the GNU General Public License #as published by the Free Software Foundation; either version 2 #of the License, or (at your option) any later version. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program; if not, write to the Free Software #Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
use strict; package Murasaki::OrthoConsistency; #use Data::Dump qw{dump}; use Murasaki::Ticker qw{resetTick tick}; #no Carp::Assert; sub empty { my ($class,$name,$kogmap,$knownLocs)=@_; bless {orthos=>{},name=>$name,knownLocs=>$knownLocs,spec2pos=>reverseMap($kogmap)},$class; } sub clone { my ($class,$base,$newName)=@_; my %content=%{$base}; $content{name}=$newName if $newName; bless {%content},$class; } sub canonicalize { my ($members)=@_; return join("~",map {keys(%$_)} (sort {(values(%$a))[0] cmp (values(%$b))[0]} @$members)); } sub canonParts { my ($canon)=@_; return split(/~/,$canon); } sub addBits { my ($basis,$next,@others)=@_; if($next){ my @bits; foreach my $bit (@$next){ push(@bits,addBits([@$basis,$bit],@others)); } return @bits; }else{ return $basis; } } sub getLocPairs { my @bits=@_; my @res; foreach my $i (0..$#bits){ foreach my $j (($i+1)..$#bits){ push(@res,[@{$bits[$i]},@{$bits[$j]}]); } } return @res; } sub getPairs { my @bits=@_; my @res; foreach my $i (0..$#bits){ foreach my $j (($i+1)..$#bits){ push(@res,[$bits[$i],$bits[$j]]); } } return @res; } sub isKnownLoc { my ($self,$loc)=@_; die "Bad loc: ".dump($loc) unless ref $loc eq 'HASH' and scalar(keys %$loc)==1; my $l=(keys %$loc)[0]; return exists $self->{knownLocs}->{$l} and $self->{knownLocs}->{$l} eq $loc->{$l}; } sub addAll { my ($self,$memr)=@_; our @unfolded=addBits([],(ref $memr eq 'ARRAY') ? 
(@$memr): (values(%{reverseMap($memr)}))); foreach my $subset (@unfolded){ my $canon=canonicalize($subset); my $content={ortho=>1}; if(!grep {!$self->isKnownLoc($_)} @$subset){ $content->{known}=1; my ($p1,$p2)=map {canonicalize([$_])} @$subset; my $canon=canonicalize($subset); $self->{knownOrthos}->{$p1}->{$canon}=1; $self->{knownOrthoList}->{$canon}=1; } $self->{orthos}->{$canon}=$content; } } sub hitAllKnown { my ($self,@perSeqLocs)=@_; my @locs=map {@$_} @perSeqLocs; #flatten my %hits=map {canonicalize([$_])=>1} @locs; #hash; my %res; #for each loc that we hit, if it didn't meet its intended partners, we record a miss (otherwise it's a hit) my %counted; foreach my $loc (@locs){ next unless $self->isKnownLoc($loc); my $p1=canonicalize([$loc]); foreach my $canon (keys %{$self->{knownOrthos}->{$p1}}){ next if $counted{$canon}; $counted{$canon}=1; my @parts=canonParts($canon); if(grep {!$hits{$_}} @parts){ #hit all constituent parts? $self->{orthos}->{$canon}->{missed}++; $res{misses}++; }else{ $self->{orthos}->{$canon}->{hit}++; $self->{orthos}->{$canon}->{hitonce}=1; $res{hits}++; } } } return %res; } sub reverseMap { my $arg=pop; my $ret={}; if(ref $arg eq 'ARRAY'){ for(0..$#$arg){ $ret->{$$arg[$_]}=$_; } }elsif(ref $arg eq 'HASH'){ for my $k (keys(%$arg)){ push(@{$ret->{$arg->{$k}}},{$k=>$arg->{$k}}); } }else{ print "Non-reversable: ".dump($arg)."....\n"; die "Non-reversable arg called on reverseMap...\n"; $ret=$arg; #ummmm } return $ret; } sub toSpecList { my ($self,$members)=@_; my @ret; foreach my $k (keys(%$members)){ do{print "Unknown species $k.\n"; return undef} unless exists($self->{spec2pos}->{$k}); $ret[$self->{spec2pos}->{$k}]=$members->{$k}; } return @ret; } sub orthosFromKog { my ($class,$kogs,$kogmap,$knownLocs)=@_; my $this=empty($class,$kogs->{kogSrc},$kogmap,$knownLocs); #this isnt C++. 
we can get our object up and going now ^^ print "Filling OrthoPairs from KOGset\n"; foreach my $kogname (keys(%{$kogs->{kogs}})){ $this->addAll($kogs->{kogs}->{$kogname}->{members}) } print $kogs->{kogSrc}." converted to OrthoPairs list. ".$this->summary."\n"; return $this; } sub summary { my ($self)=@_; my $count=scalar(keys %{$self->{orthos}}); return $count.($count==1 ? " ortholog":" orthologs"); } sub possibleOrthosCount { my ($self)=@_; my %count; foreach my $s (values(%{$self->{knownLocs}})){ $count{$s}++; } return 0 unless keys(%count); my $possible=1; foreach my $s (keys(%count)){ $possible*=$count{$s}; } return $possible; } sub addOrtho { my ($self,@set)=@_; foreach my $subset (addBits([],@set)){ $self->{orthos}->{canonicalize($subset)}=1; } } sub isOrtho { my ($self,@set)=@_; my %res; foreach my $subset (addBits([],@set)){ $res{($self->{orthos}->{canonicalize($subset)}) ? "yes":"no"}++; } return %res; } sub stats { #both of those should just be one function my ($self)=@_; my %counted; my ($conCount,$inconCount,$recallCount)=(0,0,0); foreach my $hitloc (keys %{$self->{knownOrthos}}){ foreach my $canon (keys %{$self->{knownOrthos}->{$hitloc}}){ next if $counted{$canon}; #thou shalt not double count $conCount+=$self->{orthos}->{$canon}->{hit}; $inconCount+=$self->{orthos}->{$canon}->{missed}; $recallCount+=$self->{orthos}->{$canon}->{hitonce}; $counted{$canon}=1; } } my $totalOrthos=keys %{$self->{knownOrthoList}}; my %res=(recall=>$recallCount, con=>$conCount, incon=>$inconCount, total=>$totalOrthos); $res{sens}=$res{recall}/$res{total} if $res{total}; $res{spec}=$conCount/($conCount+$inconCount) if ($conCount+$inconCount); return %res; } sub compare { my ($self,$other)=@_; my %done; my %res=(in1=>0,in2=>0,inboth=>0); print "Comparing ortholist members...\n"; @done{keys(%{$self->{orthos}})}=(1) x scalar keys(%{$self->{orthos}}); my @inboth; resetTick(scalar(keys(%{$other->{orthos}}))); # print "Done: ".join("\n",keys(%done))."\n"; foreach my $ortho 
(keys(%{$other->{orthos}})){ # print "Compare: $ortho\n"; if($done{$ortho}){ $res{inboth}++; push(@inboth,$ortho); delete $done{$ortho}; }else{ $res{in2}++; } tick; } $res{in1}+=scalar keys %done; # print "\nOrtho inboth:\n".join("\n+",@inboth)."\n"; # print "\nOrtho in source:\n".join("\n*",keys %{$self->{orthos}})."\n"; if($self->{knownLocs} == $other->{knownLocs}){#impossible to know otherwise $res{in0}=$self->possibleOrthosCount()-$res{in1}-$res{in2}-$res{inboth}; } print "\n"; return %res; } 1; murasaki/perlmodules/Murasaki/OrthoPairs.pm0000644000177700001440000001752311434752240020523 0ustar krispusers#!/usr/bin/perl #Murasaki - multiple genome global alignment program #Copyright (C) 2006-2007 Keio University #(Kris Popendorf) (2006) # #This program is free software; you can redistribute it and/or #modify it under the terms of the GNU General Public License #as published by the Free Software Foundation; either version 2 #of the License, or (at your option) any later version. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program; if not, write to the Free Software #Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
use strict; package Murasaki::OrthoPairs; #use Data::Dump qw{dump}; use Murasaki::Ticker qw{resetTick tick}; no Carp::Assert; sub empty { my ($class,$name,$kogmap,$knownLocs)=@_; bless {orthos=>{},name=>$name,knownLocs=>$knownLocs,spec2pos=>reverseMap($kogmap)},$class; } sub clone { my ($class,$base,$newName)=@_; my %content=%{$base}; $content{name}=$newName if $newName; bless {%content},$class; } sub canonicalize { my ($members)=@_; return join("~",map {keys(%$_)} (sort {(values(%$a))[0] cmp (values(%$b))[0]} @$members)); } sub addBits { my ($basis,$next,@others)=@_; if($next){ my @bits; foreach my $bit (@$next){ push(@bits,addBits([@$basis,$bit],@others)); } return @bits; }else{ return $basis; } } sub getLocPairs { my @bits=@_; my @res; foreach my $i (0..$#bits){ foreach my $j (($i+1)..$#bits){ push(@res,[@{$bits[$i]},@{$bits[$j]}]); } } return @res; } sub getPairs { my @bits=@_; my @res; foreach my $i (0..$#bits){ foreach my $j (($i+1)..$#bits){ push(@res,[$bits[$i],$bits[$j]]); } } return @res; } sub isKnownLoc { my ($self,$loc)=@_; die "Bad loc: ".dump($loc) unless ref $loc eq 'HASH' and scalar(keys %$loc)==1; my $l=(keys %$loc)[0]; return exists $self->{knownLocs}->{$l} and $self->{knownLocs}->{$l} eq $loc->{$l}; } sub addAll { my ($self,$memr)=@_; my @bits=(ref $memr eq 'ARRAY') ? 
(@$memr): (values(%{reverseMap($memr)})); our @pairs=getLocPairs(@bits); foreach my $pair (@pairs){ assert(scalar(@{$pair})==2); my $content={ortho=>1}; if($self->isKnownLoc($$pair[0]) and $self->isKnownLoc($$pair[1])){ $content->{known}=1; my ($p1,$p2)=map {canonicalize([$_])} @$pair; my $canon=canonicalize($pair); $self->{knownPairs}->{$p1}->{$p2}=$canon; $self->{knownPairs}->{$p2}->{$p1}=$canon; $self->{knownPairList}->{$canon}=1; } $self->{orthos}->{canonicalize($pair)}=$content; } } sub hitAllKnown { my ($self,@perSeqLocs)=@_; my @locs=map {@$_} @perSeqLocs; #flatten my %hits=map {canonicalize([$_])=>1} @locs; #hash; my %res; #for each loc that we hit, if it didn't meet its intended partner, we record a miss (otherwise it's a hit) my %counted; foreach my $loc (@locs){ next unless $self->isKnownLoc($loc); my $p1=canonicalize([$loc]); foreach my $p2 (keys %{$self->{knownPairs}->{$p1}}){ my $canon=$self->{knownPairs}->{$p1}->{$p2}; next if $counted{$canon}; $counted{$canon}=1; unless($hits{$p1} and $hits{$p2}){ $self->{orthos}->{$canon}->{missed}++; $res{misses}++; }else{ $self->{orthos}->{$canon}->{hit}++; $self->{orthos}->{$canon}->{hitonce}=1; $res{hits}++; } } } return %res; } sub reverseMap { my $arg=pop; my $ret={}; if(ref $arg eq 'ARRAY'){ for(0..$#$arg){ $ret->{$$arg[$_]}=$_; } }elsif(ref $arg eq 'HASH'){ for my $k (keys(%$arg)){ push(@{$ret->{$arg->{$k}}},{$k=>$arg->{$k}}); } }else{ print "Non-reversable: ".dump($arg)."....\n"; die "Non-reversable arg called on reverseMap...\n"; $ret=$arg; #ummmm } return $ret; } sub toSpecList { my ($self,$members)=@_; my @ret; foreach my $k (keys(%$members)){ do{print "Unknown species $k.\n"; return undef} unless exists($self->{spec2pos}->{$k}); $ret[$self->{spec2pos}->{$k}]=$members->{$k}; } return @ret; } sub orthosFromKog { my ($class,$kogs,$kogmap,$knownLocs)=@_; my $this=empty($class,$kogs->{kogSrc},$kogmap,$knownLocs); #this isnt C++. 
we can get our object up and going now ^^ print "Filling OrthoPairs from KOGset\n"; foreach my $kogname (keys(%{$kogs->{kogs}})){ $this->addAll($kogs->{kogs}->{$kogname}->{members}) } print $kogs->{kogSrc}." converted to OrthoPairs list. ".$this->summary."\n"; return $this; } sub summary { my ($self)=@_; my $count=scalar(keys %{$self->{orthos}}); return $count.($count==1 ? " ortholog":" orthologs"); } sub possibleOrthosCount { my ($self)=@_; my %count; foreach my $s (values(%{$self->{knownLocs}})){ $count{$s}++; } return 0 unless keys(%count); my $possible=1; foreach my $s (keys(%count)){ $possible*=$count{$s}; } return $possible; } sub addOrtho { my ($self,@set)=@_; foreach my $subset (addBits([],@set)){ $self->{orthos}->{canonicalize($subset)}=1; } } sub isOrtho { my ($self,@set)=@_; my %res; foreach my $subset (addBits([],@set)){ $res{($self->{orthos}->{canonicalize($subset)}) ? "yes":"no"}++; } return %res; } sub sensitivity { my ($self)=@_; my ($consistent); my $total=keys %{$self->{knownPairList}}; return undef unless $total; foreach my $canonPair (keys %{$self->{knownPairList}}){ $consistent++ if $self->{orthos}->{$canonPair}->{hit}; } return $consistent/$total; } sub specificity { my ($self)=@_; my %counted; my ($conCount,$inconCount)=(0,0); foreach my $hitloc (keys %{$self->{hitLocs}}){ foreach my $partner (keys %{$self->{knownPairs}->{$hitloc}}){ my $canon=$self->{knownPairs}->{$hitloc}->{$partner}; next if $counted{$canon}; #thou shalt not double count $conCount+=$self->{orthos}->{$canon}->{hit}; $inconCount+=$self->{orthos}->{$canon}->{missed}; } } return undef unless ($conCount+$inconCount); return $conCount/($conCount+$inconCount); } sub stats { #both of those should just be one function my ($self)=@_; my %counted; my ($conCount,$inconCount,$recallCount)=(0,0,0); foreach my $hitloc (keys %{$self->{knownPairs}}){ foreach my $partner (keys %{$self->{knownPairs}->{$hitloc}}){ my $canon=$self->{knownPairs}->{$hitloc}->{$partner}; next if $counted{$canon}; 
#thou shalt not double count $conCount+=$self->{orthos}->{$canon}->{hit}; $inconCount+=$self->{orthos}->{$canon}->{missed}; $recallCount+=$self->{orthos}->{$canon}->{hitonce}; $counted{$canon}=1; } } my $totalPairs=keys %{$self->{knownPairList}}; my %res=(recall=>$recallCount, con=>$conCount, incon=>$inconCount, total=>$totalPairs); $res{sens}=$res{recall}/$res{total} if $res{total}; $res{spec}=$conCount/($conCount+$inconCount) if ($conCount+$inconCount); return %res; } sub compare { my ($self,$other)=@_; my %done; my %res=(in1=>0,in2=>0,inboth=>0); print "Comparing ortholist members...\n"; @done{keys(%{$self->{orthos}})}=(1) x scalar keys(%{$self->{orthos}}); my @inboth; resetTick(scalar(keys(%{$other->{orthos}}))); # print "Done: ".join("\n",keys(%done))."\n"; foreach my $ortho (keys(%{$other->{orthos}})){ # print "Compare: $ortho\n"; if($done{$ortho}){ $res{inboth}++; push(@inboth,$ortho); delete $done{$ortho}; }else{ $res{in2}++; } tick; } $res{in1}+=scalar keys %done; # print "\nOrtho inboth:\n".join("\n+",@inboth)."\n"; # print "\nOrtho in source:\n".join("\n*",keys %{$self->{orthos}})."\n"; if($self->{knownLocs} == $other->{knownLocs}){#impossible to know otherwise $res{in0}=$self->possibleOrthosCount()-$res{in1}-$res{in2}-$res{inboth}; } print "\n"; return %res; } 1; murasaki/stat-vs-stat.pl0000755000177700001440000001533311434752242014675 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. 
# #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use File::Basename; use Getopt::Long; use Pod::Usage; #use Data::Dump qw{dump}; use strict; my ($help,$man,$opt_prefix); our ($seqhome,$root,$flexible); $root=(fileparse($0))[1]; $root=$ENV{MURASAKI} if $ENV{MURASAKI}; require "${root}/config.pl"; require "$root/common.pl"; my ($opt_x,$opt_y)=("tfidf","hits"); my $samples; my $fn; my $format="png"; my $lwd=3; my ($opt_log,$opt_clean,$opt_nofstats,$fstats,%avg,$drawAvg,$maxsamples,$nofn); GetOptions('help|?' => \$help, man => \$man, "x=s"=>\$opt_x, 'y=s' =>\$opt_y, 'log=s'=>\$opt_log, clean=>\$opt_clean, 'format=s'=>\$format, pdf=>sub{$format='pdf'},'lwd=f'=>\$lwd, fstats=>\$fstats, 'avg:s%'=>sub{ $drawAvg=1; $avg{$_[1]}=$_[2] if $#_>1}, 'noavg'=>sub{$drawAvg=0},nofstats=>\$opt_nofstats ) or pod2usage(1); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; foreach my $file (@ARGV){ my ($basename,$path,$suffix)=fileparse($file,qr/\.rocr?/); $path=~s!/$!!; #kill trailing / if any my $basefile="$path/$basename"; my $rocrfile="$basefile.rocr"; #attempt to grab a FN count from a filterstats file if we don't have one if(!$opt_nofstats){ unless($fstats and -f $fstats){ my @fstats=<$basefile.*.filterstat*>; $fstats=$#fstats>0 ? pickOne("Multiple filterstats files found:","Which should I use? 
",@fstats):$fstats[0]; } goto STARTSAMPLE unless -f $fstats; open(my $fstat_fh,$fstats); print "Reading stats from $fstats\n"; while(<$fstat_fh>){ if(m/Initial stats:/){ <$fstat_fh>; #junk line about # of anchors GETMEANS: while(<$fstat_fh>){ $avg{$1}=$2 if m/^(\w+) mean: (\d+\.?\d*)/; last GETMEANS unless m/^\w+ mean:/; } } } print "Found averages for: ".join(", ",sort keys %avg)."\n"; } STARTSAMPLE: my (@fields); open(my $fh,$rocrfile); $_=<$fh>; chomp; @fields=split(/\t/,$_); print "Fields available: ".join(", ",@fields)."\n"; my (@x,@y); @x=split(/\W/,$opt_x); @y=split(/\W/,$opt_y); if ($opt_x eq 'all') { @x=grep {$_ ne 'label'} @fields; } if ($opt_y eq 'all') { @y=grep {$_ ne 'label'} @fields; } print "Making graphs for $opt_x vs $opt_y\n"; my %done; foreach my $x (@x) { foreach my $y (@y) { next if(exists($done{"$x-$y"}) or $x eq $y); $done{"$x-$y"}=1; my $tpfile="$basefile.stats.tp"; my $fpfile="$basefile.stats.fp"; my $rout="$basefile.$x.$y.$format"; my $rsrc="$basefile.$x.$y.R"; unless(-f $tpfile and -f $fpfile and !$opt_clean){ #reuse rocfile if there is one print "Splitting $rocrfile into TP and FP...\n"; die "No rocr file: $rocrfile? Run filter.pl --rocr to make one..." unless -f $rocrfile; open(my $tpfh,">$tpfile") or die "Couldn't write to $tpfile\n"; open(my $fpfh,">$fpfile") or die "Couldn't write to $fpfile\n"; seek($fh,0,0); $_=<$fh>; #they each need a copy of the header print $tpfh $_; print $fpfh $_; while (<$fh>) { my $target=(m/^1/ ? $tpfh : $fpfh); print $target $_; } close($tpfh); close($fpfh); } my $legendpos="min(tp[,'$x'],fp[,'$x']),max(tp[,'$y'],fp[,'$y'])"; my $outputter=$format ne 'pdf' ? qq!bitmap(file="$rout",type="png16m",width=10,height=7,res=96)!: qq!pdf(file="$rout",width=10,height=7)!; #do the R output my $tppch='1'; my $fppch='16'; open(my $R,">$rsrc"); print $R <$best; } return $best; } sub pickOne { my ($ps1,$ps2,@opts)=@_; print $ps1."\n"; print map {($_==0 ? 
"[$_]":" $_ ").": $opts[$_]\n"} 0..$#opts; my $res; do{ print $ps2; $res=; chomp $res; }while($res && ($res<0 or $res>$#opts)); return $opts[$res]; } __END__ =head1 NAME stat-vs-stat.pl -- draws graphs of 1 stat vs another stat =head1 SYNOPSIS cbtest.pl [input2 ...] =head1 OPTIONS Input file should be some alignment that has as a .rocr file (presumably generated by filter.pl --rocr). Output is graphed to .cutoff.roc.png. sensitivity/specificity requires a false negative count, and as such requires a .filterstats file. Other options: --stat|predictor|x specifies what stat to use as a predictor --samples|n sets number of samples (default is sample at each possible cutoff) --maxsamples|maxn sets a max for n/samples --clean forces a re-sampling of the .rocr file --log can apply log scale to x or y or xy axes --fn can specify a FN count for calculating sensitivity --avg preload averages (normally found from filterstats) (note: averages no drawn by default, so specify --avg (with no args) to enable them) --noavg don't plot the line for averages --lwd can specify line weight --format can specify output file format (default png) --pdf set format to pdf --nofstats disable searching for filterstats statistics --nofn pretend not to have fn statistics (ie: skip sensitivity) --fstats specify the filterstats file to use murasaki/.svnignore0000644000177700001440000000001311434752242013763 0ustar krispusersoutput seq murasaki/stat-histo.pl0000755000177700001440000001534411434752241014423 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. 
# #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use File::Basename; use Getopt::Long; use Pod::Usage; #use Data::Dump qw{dump}; use strict; my ($help,$man,$opt_prefix); my ($opt_x)=("auc.tfidf"); my $samples; my $fn; my $format="png"; my $lwd=3; my ($opt_log,$opt_clean,$opt_nofstats,$fstats,%avg,$drawAvg,$maxsamples,$nofn,$opt_title,$opt_names,$cleanup,$opt_noxlab,%defs,@def_order,$opt_n,$opt_bw,$opt_denopts,$opt_xlab,$opt_special); GetOptions('help|?' => \$help, man => \$man, "x=s"=>\$opt_x, 'log=s'=>\$opt_log, clean=>\$opt_clean, 'format=s'=>\$format, pdf=>sub{$format='pdf'},'lwd=f'=>\$lwd, avg=>\$drawAvg, 'noavg'=>sub{$drawAvg=0},nofstats=>\$opt_nofstats, 'title=s'=>\$opt_title, 'names=s'=>\$opt_names,cleanup=>\$cleanup,noxlab=>\$opt_noxlab, 'def=s%'=>sub{$_[1]=~s/\W/./g; print "Adding definition for $_[1]\n"; $defs{$_[1]}=$_[2];push(@def_order,$_[1])}, 'n=i'>\$opt_n,'bw=f'=>\$opt_bw,'denopts=s'=>\$opt_denopts, 'xlab=s'=>sub{unless($_[1]){$opt_noxlab=1;return 1;} $opt_xlab=$_[1]}, special=>\$opt_special ) or pod2usage(1); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; foreach my $file (@ARGV){ my ($basename,$path,$suffix)=fileparse($file,qr/\.[^.]+/); $path=~s!/$!!; #kill trailing / if any my $basefile="$path/$basename"; $basefile.="$suffix" if $suffix; print "Basefile: $basefile\n"; my (@fields); open(my $fh,$basefile); $_=<$fh>; chomp; @fields=split(/\t/,$_); print "Fields available: ".join(", ",@fields)."\n"; my @x; @x=split(/,/,$opt_x); if ($opt_x eq 'all') { @x=grep {$_ ne 'label'} @fields; } foreach(@x){ s/\W/./g; } my $plotname=join("-",@x); print "Making histogram graph for $plotname\n"; my 
$rout="$basefile.$plotname.hist.$format"; my $rsrc="$basefile.$plotname.hist.R"; my $outputter=$format ne 'pdf' ? qq!bitmap(file="$rout",type="png16m",width=10,height=7,res=96)!: qq!pdf(file="$rout",width=10,height=7)!; my $densityOpts=",bw=$opt_bw" if $opt_bw; $densityOpts.=",n=$opt_n" if $opt_n; $densityOpts.=",$opt_denopts" if $opt_denopts; #do the R output open(my $R,">$rsrc"); print $R <0) { #if only 1 x... $color=1; $xlab_val=$x; } $xlab_val=$opt_xlab if $opt_xlab; my $xlab="xlab='$xlab_val'," unless $opt_noxlab; print $R <= 1: ",100*round(length(x[x>=1])/length(x),4),"%")); ENDTEXT } }continue{$i++} print $R "abline(v=1,lty=2,lwd=$lwd/2,col=1)\n" if $opt_special; if($#x>0){ #draw a legend my $legendpos="xl[1],yl[2]"; my $legendcols=join(",",2..($#x+2)); my $names=join(",",map {qq!"$_"!} @legendTerms); print $R "legend($legendpos,c($names),col=c($legendcols),lwd=$lwd);\n"; } close($R); system("R --vanilla < $rsrc"); } sub sum { my $sum=0; grep {$sum+=$_} @_; return $sum; } sub mean { my $total; foreach(@_){ $total+=$_; } return $total/($#_+1); } sub min { my $best=$_[0]; foreach(@_){ $best=$_ if $_<$best; } return $best; } sub max { my $best=$_[0]; foreach(@_){ $best=$_ if $_>$best; } return $best; } sub pickOne { my ($ps1,$ps2,@opts)=@_; print $ps1."\n"; print map {($_==0 ? "[$_]":" $_ ").": $opts[$_]\n"} 0..$#opts; my $res; do{ print $ps2; $res=; chomp $res; }while($res && ($res<0 or $res>$#opts)); return $opts[$res]; } __END__ =head1 NAME stat-histo.pl -- density plot of some given stat =head1 SYNOPSIS stat-histo.pl [input2 ...] 
=head1 OPTIONS Input file should be some stat summary (like generated by gatherstats.pl) default stat is: auc-tfidf Other options: --stat|predictor|x specifies what stat to use as a predictor --log can apply log scale to x or y or xy axes --fn can specify a FN count for calculating sensitivity --avg enable plotting of averages --noavg don't plot the line for averages --lwd can specify line weight --format can specify output file format (default png) --pdf set format to pdf --cleanup forces cleanup of values (ie: removes % and reparses as numeric) --bw set bandwidth for density function calculation --n samples for density function calculation --denopts add extra opts for density function (separate with commas) --def define custom fields (input fields from other fields can be taken by quoting with {} (eg: --def tough={size}/{speed}) murasaki/substitch.pl0000755000177700001440000003521011434752241014326 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . 
use strict; use File::Basename; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; use Getopt::Long; use Pod::Usage; use Term::ANSIColor qw{:constants colored}; $Term::ANSIColor::AUTORESET=1; #use Data::Dump qw{dump}; our $root; our %units=(gb => 1024*1024*1024, mb => 1024*1024, kb=>1024, b=>1, tb=>1024*1024*1024*1024); my ($chunks,$projectSeqs,$opt_out,$faketfidf,$reduction); $faketfidf=1; #for now my ($help,$man,$verbose); my $err=GetOptions('help|?' => \$help, man => \$man, 'verbose+'=>\$verbose, 'split=i'=>\$chunks, 'project=s'=>\$projectSeqs, 'faketfidf'=>\$faketfidf, 'o=s'=>\$opt_out); pod2usage(1) if $help or !$err or @ARGV<1; pod2usage(-exitstatus => 0, -verbose => 4) if $man; my $action='split' if $chunks and $chunks>0; pod2usage(-msg=>'Can only specify only 1 action',-exitval=>2) if $projectSeqs and $action; $action='project' if $projectSeqs; pod2usage(-msg=>'Must specify one action',-exitval=>2) unless $action; if($action eq 'split'){ foreach my $file (@ARGV){ my ($name,$dir,$ext)=fileparse($file,qr/\.[^\.]+/); my @files=readStitchFile($file); my @ids=makeIds(map {$_->{file}} @files); foreach my $fi (0..$#ids){ $files[$fi]->{id}=$ids[$fi]; } my @sets=makeSets($chunks,@files); my $prefix="$name.sub"; foreach my $chunk (1..$chunks){ my $outfile="$prefix.$chunk-$chunks.stitch"; open(my $ofh,">$outfile") or die "Couldn't write to $outfile"; print "Writing $outfile...\n"; my $pos=1; my @parts=@{$sets[$chunk-1]}; foreach my $part (@parts) { print $ofh join("\t",$part->{file},$part->{len},$pos,$pos+$part->{len})."\n"; $pos+=$part->{len}+10; } open(my $olfh,">$outfile.length"); print $olfh ($pos-11); } } }elsif($action eq 'project'){ #remap component parts into coordinate space of $projectSeqs locations my ($name,$dir,$ext)=fileparse($projectSeqs,qr/\.(anchors(?:\.details)?|seqs)/); $projectSeqs="$dir$name.seqs" unless $ext eq '.seqs'; open(my $psfh,$projectSeqs) or die "Couldn't read $projectSeqs"; my @targetSeqs=grep {m/\S/} (map 
{chomp;$_} <$psfh>); my %targetRegions=findRegions(@targetSeqs); my $outfile=$opt_out ? $opt_out:"$dir$name.anchors"; my $detailfile="$outfile.details" if grep {/\.details$/} @ARGV; my $scorefile="$dir$name.stats.tfidf" if $detailfile and $faketfidf; printf "Storing scores values to $scorefile\n" if $scorefile and $verbose; open(my $ofh,">$outfile") or die "Couldn't write to $outfile"; open(my $odfh,">$detailfile") or die "Couldn't write to $detailfile" if $detailfile; open(my $scorefh,">$scorefile") or die "Couldn't write to $scorefile" if $scorefile; foreach my $file (@ARGV){ my ($name,$dir,$ext)=fileparse($file,qr/\.anchors(?:\.details)?/); die "Need an anchors or seqs file (not $file)" unless $ext; # $file="$dir$name.anchors" if($ext ne '.anchors'); print "Reading anchors from $file\n" if $verbose; open(my $fh,$file) or die "Can't read $file"; my $seqfile="$dir$name.seqs"; open(my $sfh,$seqfile) or die "Can't read $seqfile"; my @srcSeqs=grep {m/\S/} (map {chomp;$_} <$sfh>); my %srcRegions=findRegions(@srcSeqs); my @regionMaps=mapRegions(%srcRegions); while(my $line=<$fh>){ chomp $line; my @bits=split(/\t/,$line); my @meta=@bits[scalar(@srcSeqs)*3..$#bits]; @bits=@bits[0..(scalar(@srcSeqs)*3-1)]; my @outAnchors; my $si=0; while(@bits){ my ($start,$stop,$sign,@rem)=@bits; @bits=@rem; my $anchor=[abs($start),abs($stop),$sign]; my $inRegion=locateRegion($anchor,$regionMaps[$si]); die "\#$si Anchor (".anchorStr($anchor).") in unmapped territory??" unless $inRegion; if(ref $inRegion){ my $outRegion=$targetRegions{$inRegion->{seq}}; unless($outRegion){ warn "Warning: ".anchorStr($anchor)." -> $inRegion->{seq} is not in output set." unless $reduction; goto NextAnchor; } die anchorStr($anchor)." is mapped into the same SI as another anchor?" if $outAnchors[$outRegion->{si}]; $outAnchors[$outRegion->{si}]=remapAnchor($anchor,$inRegion,$outRegion); }elsif($inRegion eq 'rift'){ die anchorStr($anchor)." is mapped into the same SI as another anchor?" 
if $outAnchors[$si]; $outAnchors[$si]=$anchor; #stays a rift }else{ die "Unknown region mapping: $inRegion"; } }continue{$si++} print $ofh join("\t",(map {anchorStr($_)} @outAnchors))."\n"; print $odfh join("\t",(map {anchorStr($_)} @outAnchors),@meta)."\n" if $odfh; print $scorefh $meta[1]."\n" if $scorefh; NextAnchor: #tra la la, nothin to do yet ; } } close($ofh); close($odfh); sleep(1); #gmv and some other programs want to see a score file older than the anchor files close($scorefh); } sub makeSets { my ($chunks,@files)=@_; my @bysize=sort {$files[$a]->{len} <=> $files[$b]->{len}} (0..$#files); my $goal=totalLen(@files)/$chunks; print "Splitting into $chunks chunks. Goal is: ".humanMemory($goal)."\n"; #take a rough stab at it my @sol; my %res; my @typedesc=("underfill","overfill","nofill"); foreach my $type (0..2){ print BOLD GREEN "Making an initial guess using", BOLD CYAN $typedesc[$type], BOLD GREEN "method\n"; my @bins=map {[]} 1..$chunks; my $sofar=0; my $bin=0; foreach my $fi (0..$#files){ my $next=$files[$bysize[$fi]]; my $newTotal=totalLen(@{$bins[$bin]},$next); $bin++ if($type==0 and $newTotal>$goal and $bin<$#bins); push(@{$bins[$bin]},$next); $bin++ if($type==1 and $newTotal>$goal and $bin<$#bins); } @bins=refine($goal,@bins); my $score=score($goal,@bins); $res{$typedesc[$type]}={score=>$score,bins=>[@bins]}; } my @rank=sort {($res{$a}->{score}) <=> ($res{$b}->{score})} (keys %res); print BOLD YELLOW "$rank[0] method did best (rmse: ".humanMemory($res{$rank[0]}->{score}).")\n"; return @{$res{$rank[0]}->{bins}}; } sub refine { my ($goal,@bins)=@_; print "Initial guestimate:\n".summary($goal,@bins); my $initialScore=score($goal,@bins); print "Refining..."; my $count=0; while(my @swaps=allPossibleSwaps(@bins)){ my %scores; foreach my $swap (@swaps){ $scores{$swap}=score($goal,doSwap($swap,@bins)); } my @rank=sort {$scores{$a}<=>$scores{$b}} @swaps; if($scores{$rank[0]}{id}} @files)."): ".humanMemory($total)." 
(error: ".humanMemory($total-$goal).")\n"; } return $r; } sub totalLen { my @files=@_; return sum(map {$_->{len}} @files); } sub rmsd { my ($v,@l)=@_; my $r=0; foreach my $i (@l){ $r+=($i-$v)*($i-$v); } return sqrt($r); } sub sum { my $r=0; foreach my $i (@_){ $r+=$i; } return $r; } sub max { my ($r,@l)=@_; foreach my $v (@l){ $r=$v if $v>$r; } return $r; } sub makeIds { my ($ref,@others)=@_; my $re=qr/\W+/; my @bits=split($re,$ref); my @use=map {0} 0..$#bits; foreach my $other (@others){ my @obits=split($re,$other); for my $i (0..max($#obits,$#bits)){ $use[$i]=1 if $bits[$i] ne $obits[$i]; } } my @ids; my @slice=toSlice(@use); @slice=(0) unless @slice; foreach my $name ($ref,@others){ my @bits=split($re,$name); push(@ids,join(".",@bits[@slice])); } return @ids; } sub toSlice { my @r; foreach my $i (0..$#_){ push(@r,$i) if $_[$i]; } return @r; } sub humanMemory { my @l=@_; @l=map { my $minus=($_<0); $_=abs($_); my $scale=$_/$units{b}<500 ? "b": $_/$units{kb}<500 ? "kb": $_/$units{mb}<500 ? "mb": $_/$units{gb}<500 ? "gb":"tb"; my $str=sprintf("%.3f",$_/$units{$scale}); $str=~s/(\.\d+?)0+$/$1/; $str=~s/\.$//; (($minus ? 
"-":"").$str.$scale)} @l; return @l if wantarray; return $l[0]; } sub readStitchFile { my ($file)=@_; my @files; open(my $fh,$file) or die "Couldn't open $file"; while(my $line=<$fh>){ chomp $line; my ($file,$len,$start,$stop)=split(/\t/,$line); die "Not a valid stitch file" unless ($file and $len and $start and $stop); my %dat; @dat{qw{file len start stop}}=($file,$len,$start,$stop); push(@files,\%dat); } return @files; } ###stuff for project sub findRegions { my @seqs=@_; my %regions; foreach my $si (0..$#seqs){ my $seq=$seqs[$si]; my ($name,$dir,$ext)=fileparse($seq,qr/\.[^\.]+/); if($ext eq '.stitch'){ %regions=(%regions,findStitchRegions($seq,$si)); }else{ $regions{$seq}={seq=>$seq,si=>$si,start=>1}; print "si $si : Raw region: ".regionStr($regions{$seq})."\n" if $verbose>1; } } return %regions; } sub findStitchRegions { my ($stitch,$si,$start)=@_; $start=1 unless $start; my @files=readStitchFile($stitch); my %regions; foreach my $file (@files){ my ($name,$dir,$ext)=fileparse($file->{file},qr/\.[^\.]+/); if($ext eq '.stitch'){ %regions=(%regions,findStitchRegions($file->{file},$si,$file->{start}+$start-1)); }else{ $regions{$file->{file}}={seq=>$file->{file},si=>$si,start=>$start+$file->{start}-1}; print "si $si : Stitch region: ".regionStr($regions{$file->{file}})."\n" if $verbose>1; } } return %regions; } sub regionStr { return join(",",map { "(".join(",",@{$_}{qw{seq si start}}).")"} @_); } sub mapRegions { my %regions=@_; my @maps; foreach my $region (values %regions){ $maps[$region->{si}]=[] unless ref $maps[$region->{si}]; push(@{$maps[$region->{si}]},$region); } foreach my $i (0..$#maps){ $maps[$i]=[sort {$b->{start}<=>$a->{start}} @{$maps[$i]}]; } return @maps; } sub locateRegion { my ($anchor,$mapr)=@_; die "Search for a region without a map? What are you getting at?" unless ref $mapr; return 'rift' if isRift($anchor); foreach my $region (@$mapr){ return $region if $$anchor[0]>=$region->{start}; } die "Region not found for ".shortAnchorStr($anchor)."? 
How is this possible?"; } sub isRift { my ($start,$stop,$sign)=@{$_[0]}; return ($start==0 and $stop==0); } sub anchorStr { my ($start,$stop,$sign)=ref $_[0] ? @{$_[0]}:@_; return join("\t",$sign=~m/-/ ? (-$start,-$stop,$sign):($start,$stop,$sign)); } sub shortAnchorStr { my ($start,$stop,$sign)=ref $_[0] ? @{$_[0]}:@_; return '['.join(",",$sign=~m/-/ ? (-$start,-$stop,$sign):($start,$stop,$sign)).']'; } sub remapAnchor { my ($anchor,$inRegion,$outRegion)=@_; my ($start,$stop,$sign)=@$anchor; my $offset=$outRegion->{start}-$inRegion->{start}; return [$start+$offset,$stop+$offset,$sign]; } __END__ =head1 NAME substitch.pl : Split/merge stitch files into/out of stitch files =head1 SYNOPSIS substitch.pl --split 5 allchromosomes.stitch #split big stitch into 5 roughly equal chunks substitch.pl --project allspecies.seqs sub.anchors #project some anchors into a different coordinate space (as long as the stitch component sequences match) =head1 OPTIONS --verbose => makes more verbose --faketfidf => fake tfidf scores based on score stat in file Note on split: This program does not claim to produce an optimal splitting. It tries a couple heuristics, refines the results, and picks the best arrangement it's found so far. Technically this is a variation on the traditional "trunk packing problem," which is (at least in the abstract case) NP-hard, if I remember 15-251 correctly. This particular variety of trunk packing however, seems like it should be solvable faster (worst case some n^k dynamic programming I think, but I'm betting this way is faster and tons easier to write for 90% of the cases out there). If anyone reading this goes "You moron, this has been solved a thousand times already," please let me know how: krisp@dna.bio.keio.ac.jp murasaki/gatherstats.pl0000755000177700001440000001624711434752241014660 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use strict; use Getopt::Long; use Pod::Usage; use File::Basename; #use Data::Dump qw {dump}; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; use Murasaki::KOG; use Murasaki::Ticker qw{resetTick tick}; my $suffixre=qr/\.(anchors|anchors\.details|options|seqs|stdout|stderr|filterstats?|anchors\.filterstats?|anchors\.details\.filterstats?|murasaki)/; our $podusageOpts={-message=>'Use --help or --man for more detailed help.',-verbose => 0,-exitval=>2}; my ($help,$man,$output,$disp_all,$opt_loadSeqs); our ($completed_checked); GetOptions('help|?' => \$help, man => \$man, 'output=s' => \$output, all => \$disp_all, loadseqs=>\$opt_loadSeqs ) or pod2usage({-verbose=>1,-exitval=>2}); pod2usage({-verbose=>1,-exitval=>2,-message=>'Need some input file...'}) if $#ARGV<0; our %stats; foreach my $arg (@ARGV){ procArg($arg); } $disp_all=1 unless $completed_checked; #unless there's a possibility for confirming completeness, don't check it my %fields=(file=>sub{my ($a)=@_;$a->{name}}, anchors=>sub{my ($a)=@_;$a->{anchors}}, time=>sub{my ($a)=@_;$a->{time}}, weight=>sub{my ($a)=@_;$a->{weight}}, length=>sub{my ($a)=@_;$a->{length}}, members=>sub{my ($a)=@_;join(",",$a->{cogspec})}, size=>sub{my ($a)=@_;$a->{alt_inputsize} ? 
$a->{alt_inputsize}:$a->{inputsize}}, seqs=>sub{my ($a)=@_;$a->{seqcount}}, Ksens=>sub{my ($a)=@_;$a->{rocp}->{KOGs}->{Sensitivity}}, Kspec=>sub{my ($a)=@_;$a->{rocp}->{KOGs}->{Specificity}}, sens=>sub{my ($a)=@_;$a->{rocp}->{KOGs}->{Sensitivity}}, spec=>sub{my ($a)=@_;$a->{rocp}->{KOGs}->{Specificity}}, Osens=>sub{my ($a)=@_;$a->{rocp}->{Orthos}->{Sensitivity}}, Ospec=>sub{my ($a)=@_;$a->{rocp}->{Orthos}->{Specificity}}, 'auc-tfidf'=>sub{my ($a)=@_;$a->{predictor}->{auc}->{tfidf}}, 'auc-hits'=>sub{my ($a)=@_;$a->{predictor}->{auc}->{hits}}, 'auc-length'=>sub{my ($a)=@_;$a->{predictor}->{auc}->{length}} ); my @fieldorder=qw{file weight length anchors time size seqs sens spec Osens Ospec auc-tfidf auc-hits auc-length}; $output="-" unless $output; open(my $outfh,">$output"); print $outfh join("\t",@fieldorder)."\n"; print $outfh (map {my $s=$stats{$_}; join("\t", (map { die "Unknown field $_" unless exists $fields{$_}; my $res=&{$fields{$_}}($s); $res} @fieldorder))."\n" } (grep {$disp_all or exists $stats{$_}->{completed}} sort numberedFilesCmp keys %stats)); sub numberedFilesCmp { my $an=$1 if $a=~m/(\d+)/; my $bn=$1 if $b=~m/(\d+)/; return $an <=> $bn if(defined($an) and defined($bn)); return $a cmp $b; } sub procArg { my ($arg)=@_; if(-d $arg){ opendir(my $dh,$arg); my @files=grep {$_ ne '.' and $_ ne '..'} readdir($dh); print "Scanning ".(@files)." 
files from $arg\n"; resetTick(scalar(@files)); foreach my $f (@files){ procArg("$arg/$f"); tick; } print "\n"; }elsif(-f $arg){ my ($name,$path,$suffix)=fileparse($arg,$suffixre); next unless $suffix; #only process known file types # print "Proc $arg as $name ~~ $suffix\n"; $stats{$name}={name=>$name} unless ref $stats{$name} eq "HASH"; my $statr=$stats{$name}; open(my $fh,$arg); if($suffix eq ".seqs"){ my $inputsize; while(<$fh>){ chomp; push(@{$statr->{cogspec}},Murasaki::KOG->guessKogMember($_)); $statr->{inputsize}+=getInputSize($_) if $opt_loadSeqs; } $statr->{seqcount}=lineCount($fh); }elsif($suffix=~m/^\.anchors(.details)?$/){ $statr->{anchors}=lineCount($fh); $statr->{format}=anchorFormat($fh) if $suffix=~m/\.details$/; }elsif($suffix eq '.options'){ while(<$fh>){ chomp; next unless $_; if(m/(.*)?: (.*)/){ $statr->{options}->{$1}=$2; }else{ $statr->{options}->{$_}=undef; } $statr->{repeatmasked}=1 if $_ eq 'Repeats are: masked'; } }elsif($suffix eq '.stdout' or $suffix eq '.murasaki'){ $completed_checked=1; while(<$fh>){ chomp; if(m/^Total anchors: /){ $statr->{completed}=1; }elsif(m/^Total processing time: (.*)/){ $statr->{time}=parseHumanTime($1); }elsif(m/^Pat: \((\d+) bases long\) contains (\d+) bits/){ $statr->{length}=$1; $statr->{weight}=$2; }elsif(m/^Buckets used: (.*)/){ $statr->{bucketdat}=$1 }elsif(m/^Output writing finished in: (.*)/){ $statr->{outputwrite_time}=parseHumanTime($1); }elsif(m/^Done \((\d+)bp\)/){ $statr->{alt_inputsize}+=$1; } } }elsif($suffix=~m/filterstats?$/){ while(<$fh>){ FILTERSTATCHECK: if(m/^ROCR derived predictor stats:/){ #grab AUC style stuff $_=<$fh>; s/^\s+//; my @types=split(/\s+/); do{ $_=<$fh>; m/^\s*(\S+)\s+([0-9\. 
]*)$/ or goto FILTERSTATCHECK; my @bits=split(/\s+/,$2); foreach my $i (0..$#types){ $statr->{predictor}->{$types[$i]}->{$1}=$bits[$i]; } }while(1); } if(m/^Experimental (\w+):/){ #gather ROC my $type=$1; #get raw TP/FP/etc counts $_=<$fh>; while(m/(\w+: \d+)/g){ my ($stat,$score)=split(/: /,$1); $statr->{roc}->{$type}->{$stat}+=$score; } #get summarized percent form do { $_=<$fh>; chomp; if(m/^(Sensiti.*|Specific.*|Preci.*: .*$)/i){ my ($stat,$score)=split(/: /,$1); $statr->{rocp}->{$type}->{$stat}=$score; }else{ goto FILTERSTATCHECK; } }while(1); } } } }elsif(!-e $arg){ warn "Non-existant file input: $arg"; }else{ die "Don't know how to handle input: $arg"; } } { my %sizeCache; sub getInputSize { my $fn=pop; return $sizeCache{$fn} if exists $sizeCache{$fn}; return unless -f $fn; $sizeCache{$fn}=`$root/geneparse.pl -lfc $fn`; return $sizeCache{$fn}; } } sub lineCount { my $count=0; my $fh=pop; seek($fh,0,0); while(<$fh>){ $count++; } seek($fh,0,0); return $count; } sub anchorFormat { my $fh=pop; my $format=0; seek($fh,0,0); $_=<$fh>; $format=1 if m/^(?:-?\d+\s-?\d+\s[+-]\s*)+\d+$/; $format=2 if m/^(?:-?\d+\s-?\d+\s[+-]\s*)+\d+\s(?:\S+)\s(?:\d+:\d+,?){1,50}$/; #note: for some inputs, the member lists these lines can be craaaazy long. seek($fh,0,0); return $format; } __END__ =head1 NAME gatherstats.pl -- gathers stats from all sorts of sources =head1 SYNOPSIS gatherstats.pl [input2 ...] =head1 OPTIONS Input is any bunch of files, or directories. They'll be scanned for files from the various Murasaki programs, like filterstats and stdout, etc. Options: --loadseqs allows length checking from .seqs files --output outputs to a file instead of STDOUT murasaki/randpat.pl0000755000177700001440000000364011434752242013752 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use Getopt::Std; use List::Util shuffle; $Getopt::Std::STANDARD_HELP_VERSION=true; getopts('chs:t:n'); if($opt_h){HELP_MESSAGE();exit(0);} srand($opt_s) if defined($opt_s); my $repeats=$opt_t ? $opt_t:1; ($weight,$length,$file)=@ARGV; ($weight,$length,$file)=($1,$2,$length) if $weight=~m/(\d+)\D(\d+)/; $zeros=$length-$weight; if($file){ open(OF,">>$file"); select OF; } print "Weight: $weight\nLength: $length\n" unless $opt_c; for my $i (1..$repeats){ $pat=join("",1, shuffle((map {1} (1..$weight-2)),(map {0} (1..$zeros))), 1); print $pat; }continue{print "\n" if $i < $repeats} print "\n" unless $opt_n; sub main::HELP_MESSAGE(){ print < -c specifies "clean output" ie only the pattern -n suppress the final newline -s specifies seed for random number generator -t specifies the number of patterns to generate As an alternative to specifying and you can also use the Murasaki format of :. A neat trick for scripts is to use geneparse.pl to generate this for you. eg: ./murasaki -p`./geneparse.pl -m -f -e -c seq/humanY.fa` ENDTEXT ; } sub main::VERSION_MESSAGE(){ } murasaki/getcds.pl0000755000177700001440000005204011434752242013570 0ustar krispusers#!/usr/bin/perl -w #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use File::Basename; use Getopt::Long; use Pod::Usage; #use Data::Dump qw{dump}; use strict; my ($help,$man,$opt_prefix); our ($seqhome,$root,$flexible,$strict,$beenwarned); BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; use Murasaki::SeqFeatures; our %common_overrides; our %redirect; my $opt_force; my @instantRedirects; my $verbose; GetOptions('help|?' => \$help, man => \$man, flexible => \$flexible, force=>\$opt_force, 'redirect=s' => sub { push(@instantRedirects,$_[1]) }, verbose => \$verbose, 'strict!' => \$strict); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my $startdir=$ENV{PWD}; my %dbConnections; #keep connection up in case we use it later print "Do stuff with @ARGV\n"; while(@ARGV){ my ($filename,$outfile); $filename=shift(@ARGV); $outfile=shift(@ARGV) if @ARGV; if($outfile and -f $outfile){ #if it's another input file, throw it back on! 
unshift(@ARGV,$outfile); $outfile="$filename.cds"; } # print "Make cds for $filename\n"; $outfile="$filename.cds" unless $outfile; my @cds_features; our $src_primary='CDS'; open(my $infh,$filename) or die "Couldn't open source file $filename"; procOverrides("$filename.overrides") if -f "$filename.overrides"; $src_primary=$common_overrides{CDS} if $common_overrides{CDS}; my $redirect="$filename.redirect"; my $globalRedirect=getPath($redirect)."/.redirect"; push(@cds_features,useRedirect($filename,\@instantRedirects,$outfile)) if scalar(@instantRedirects); push(@cds_features,useRedirect($filename,$globalRedirect,$outfile)) if -f $globalRedirect; push(@cds_features,useRedirect($filename,$redirect,$outfile)) if -f $redirect; $_=<$infh>; if($_ and m/(\S+)\t(\d+)\t(\d+)\t(\d+)/){ #stitch format print "$filename is a stitch file. Compiling constituent parts:\n"; do{ my ($src,$len,$start,$stop)=m/(\S+)\t(\d+)\t(\d+)\t(\d+)/; #recursively generate baby! my $localpath=getPath($filename); if(!-f $src and $localpath ne './'){ print "Don't see $src from this directory.\n". "Trying from $localpath\n"; chdir($localpath); } die "Can't find component file: $src\n" unless -f $src; my $runopts=""; $runopts.=" --flexible" if $flexible; $runopts.=" --force" if $opt_force; $runopts.=" --verbose" if $verbose; $runopts.=join(" ",map {"--redirect=$_"} @instantRedirects); system("$root/getcds.pl $runopts $src") unless -f "$src.cds"; die "Couldn't load annotation data for $src" unless -f "$src.cds"; my $offset=$start-1; print "Adding $src.cds with offset $offset -> "; my @cds=readCds("$src.cds",$offset); print scalar(@cds)." 
CDS's\n"; push(@cds_features,@cds); }while(<$infh>); close($infh); }else{ #simple file close($infh); if($filename=~m/\.fa(?:\.gz)?$/){ #bioperl won't get crap out of fasta files #in case we already got some data manually from a .redirect, don't duplicate unless($redirect{ensembl_build} || $redirect{ucsc_build} || $redirect{gtf}){ #hope we can guess a database based on the filename my ($db,$build,$chrom)=guessDbSource($filename); if($db){ die "No chromosome?" unless $chrom; die "Couldn't find $db build for chromosome $chrom\n" unless $build; print "Appears to be $db data (build $build).\n"; push(@cds_features,ensemblFetch($build,$chrom)) if($db=~m/^ensembl$/i); push(@cds_features,ucscFetch($build,$chrom)) if($db=~m/^ucsc$/i); }else{ die "Couldn't find a db source for $filename"; } } }else{ unless($redirect{nobioperl}){ print "Loading $filename\n"; my $seq=Bio::SeqIO->new(-file => $filename)->next_seq; die "Error loading $filename" unless $seq; @cds_features = grep { $_->primary_tag eq $src_primary } $seq->get_SeqFeatures; print "$filename loaded. Contains ".($#cds_features+1)." CDSs\n"; if($#cds_features<0 and !$opt_force){ print "Cowardly refusing to write an empty .cds file (use --force to force this).\n"; exit(2); } } } } writeCds("$outfile",@cds_features); } sub useRedirect { my ($filename,$redirect,$outfile)=@_; our $src_primary; if(ref $redirect){ #oneshot wonders foreach (@$redirect){ s!^(.*?)(#|//|;|%).*$!$1!; #chop off various comments my ($key,$val)=m/^(.*?)=(.*)$/; next unless $key; #nothing? skip... $val=~s/\s+$//; #cleanup any spaces at end print "Redirect: $key = $val\n"; $redirect{$key}=$val; } }else{ print "Loading redirect $redirect...\n"; open(my $infh,$redirect) or return; while(<$infh>){ chomp; s!^(.*?)(#|//|;|%).*$!$1!; #chop off various comments my ($key,$val)=m/^(.*?)=(.*)$/; next unless $key; #nothing? skip... 
$val=~s/\s+$//; #cleanup any spaces at end print "Redirect: $key : $val\n"; $redirect{$key}=$val; } } my ($chromosome,$offset)=(0,0); my $raw_chromosome; #before getting mangled hacchy-style my $out_primary=$src_primary; my ($ucsc_chromosome)=($filename=~m/(chr[^.]+)/); $out_primary=$redirect{primary} if exists $redirect{primary}; $offset=$redirect{offset} if exists $redirect{offset}; $redirect{nobioperl}=1 unless exists $redirect{'+bioperl'}; if(exists($redirect{chromosome})){ $chromosome=$redirect{chromosome}; $raw_chromosome=$chromosome; }else{ $chromosome=(parseEnsemblName($filename))->{chrom} if parseEnsemblName($filename); $chromosome=$1 if $filename=~m/chr(?:omosome\.?)?([^.]*)\..*$/ and !$chromosome; $chromosome=$1 if $filename=~m/(\d+|.)\..*$/ and !$chromosome; $raw_chromosome=$chromosome; my %conv=(X=>-1,Y=>-2,M=>-3,Un=>-2); $chromosome=$conv{$chromosome} if(exists($conv{$chromosome})); } return gtfFetch($redirect{gtf},$raw_chromosome,$filename) if $redirect{gtf}; return ensemblFetch($redirect{ensembl_build},$raw_chromosome) if $redirect{ensembl_build}; return ucscFetch($redirect{ucsc_build},$ucsc_chromosome) if $redirect{ucsc_build}; if($redirect{murasaki_synth}){ my $inf=$redirect{murasaki_synth}; $inf.=getName($inf) if -d $inf; return murasakiSynth($inf,exists $redirect{murasaki_synth_id} ? $redirect{murasaki_synth_id}:getSeqId(repSuffix($inf,".seqs"),$filename)); } #extract from one of hacchy's cooked files print "Identified as hacchy-zome $chromosome\n"; die "Unknown chromosome" unless ($chromosome=~m/-?\d+/ and $chromosome!=0); die "No annotation source file?" unless $redirect{src}; open(my $src,$redirect{src}) or die "Couldn't open indicated src file: $redirect{src}"; my @features; while(<$src>){ chomp; my @a=split(/\t/); next unless($a[1]==$chromosome or $chromosome eq 'any'); push(@features,new Bio::SeqFeature::Generic (-primary => $out_primary, -tag => {($a[0] ? (gene => $a[0]):()), ($a[0] ? 
(locus_tag=>$a[0]):())}, -start => $offset+$a[2], -end => $offset+$a[3], -strand => $a[4]) ); } print "Derived ".scalar(@features)." features from $redirect.\n"; return @features; } sub ensemblFetch { my ($build,$chromosome)=@_; my $out_primary=our $src_primary; $out_primary=$redirect{primary} if exists $redirect{primary}; my $offset=exists $redirect{offset} ? $redirect{offset}:0; use DBI; my $dbh=getDb('ensembl',$build); my $sth = $dbh->prepare("SELECT i.stable_id,s.name,g.seq_region_start,g.seq_region_end,g.seq_region_strand FROM gene g, gene_stable_id i, seq_region s, coord_system c WHERE c.coord_system_id=s.coord_system_id AND c.name='chromosome' AND g.gene_id=i.gene_id and s.seq_region_id=g.seq_region_id AND s.name='$chromosome'"); #weee for ugly long SQL! print "Connected. Running query for chromosome $chromosome...\n"; my $rv=$sth->execute; print "Query returned: $rv rows\n"; my @a; my @features; while(@a=$sth->fetchrow_array){ push(@features,new Bio::SeqFeature::Generic (-primary => $out_primary, -tag => {gene => $a[0], locus_tag=>$a[0]}, -start => $offset+$a[2], -end => $offset+$a[3], -strand => $a[4]) ); } # $dbh->disconnect or warn $dbh->errstr; return @features; } sub gtfFetch { my ($gtf,$chromosome,$filename)=@_; my @features; my $mergeExons=!$redirect{keepExons}; my $offset=exists $redirect{offset} ? $redirect{offset}:0; print "Extracting features for $chromosome from ".(fileparse($gtf))[0]."\n"; my %regions; my $multiseg=$chromosome=~m/seqlevel|nonchromosomal/; if($multiseg and $filename){ #gets much more complicated %regions=loadSegments($filename); } my $length=`$root/geneparse -l -c $filename`; my $out_primary=our $src_primary; $out_primary=$redirect{primary} if exists $redirect{primary}; open(my $gtfh,($gtf=~m/\.gz$/ ? 
"zcat $gtf|":$gtf)) or die "Couldn't open $gtf"; my (@noteOrder,%notes); my $anonId=1; while(<$gtfh>){ my $rawline=$_; my $segoffset=0; chomp; my ($chrom,$family,$feature,$start,$stop,$score,$strand,$frame,$infoString)=split(/\t/); next unless $feature eq $src_primary; if($multiseg){ # warn "Source contig not found for $chrom?" unless exists $regions{$chrom}; next unless exists $regions{$chrom}; $segoffset=$regions{$chrom}->{offset}; }else{ next unless $chrom eq $chromosome; } $infoString=~s/;\s*$//; my @bits=split(/;\s*/,$infoString); my %info; foreach my $bit (@bits){ $bit=~m/^\s*(\S+)\s"(.*)"$/ or die "Malformed info bit: '$bit'"; $info{$1}=$2; } my $id=$info{gene_id}; $id="ANON".($anonId) unless $id; if($multiseg){ if($stop-$start>=$regions{$chrom}->{length}){ # print dump($regions{$chrom})."\n"; possiblyFatal("CDS outside of region: ($stop-$start=".($stop-$start).")>=".($regions{$chrom}->{length})." (from $rawline -> $chrom)\n"); } if($offset+$stop+$segoffset){ possiblyFatal("CDS outside of genome: $offset + $stop + $segoffset > $length (from $chrom)\n"); } } my $note={-primary => $out_primary, -tag => {gene=>$id,locus_tag=>$id}, -start => $offset+$start+$segoffset, -end => $offset+$stop+$segoffset, -strand => $strand}; unless($mergeExons){ push(@features,new Bio::SeqFeature::Generic (%$note)); }else{ if($notes{$id}){ mergeFeature($notes{$id},$note); }else{ push(@noteOrder,$id); $notes{$id}=$note; } } } if($mergeExons){ foreach my $id (@noteOrder){ push(@features,new Bio::SeqFeature::Generic (%{$notes{$id}})); } } print "Got ".(scalar @features)." features.\n"; return @features; } sub mergeFeature { my ($target,$add)=@_; die "Merging two non-matching tags ($target->{-tag}->{gene} and $add->{-tag}->{gene})" unless $target->{-tag}->{gene} eq $add->{-tag}->{gene}; #sadly, this seems to happen. wtf are these genes? warn "Merging two tags on different strands! 
($target->{-strand} and $add->{-strand})" unless $target->{-strand} eq $add->{-strand}; $target->{-start}=$add->{-start} if $target->{-start}>$add->{-start}; $target->{-end}=$add->{-end} if $target->{-end}<$add->{-end}; } sub ucscFetch { my ($build,$chromosome)=@_; my $out_primary=our $src_primary; $out_primary=$redirect{primary} if exists $redirect{primary}; my $offset=exists $redirect{offset} ? $redirect{offset}:0; use DBI; my $dbh=getDb('ucsc',$build); my $sth = $dbh->prepare("SELECT name,chrom,cdsStart,cdsEnd,strand FROM ensGene WHERE chrom='$chromosome'"); #weee for short boring SQL! print "Connected. Running query for chromosome $chromosome...\n"; my $rv=$sth->execute; print "Query returned: $rv rows\n"; my @a; my @features; while(@a=$sth->fetchrow_array){ push(@features,new Bio::SeqFeature::Generic (-primary => $out_primary, -tag => {gene => $a[0], locus_tag=>$a[0]}, -start => $offset+$a[2], -end => $offset+$a[3], -strand => $a[4]) ); } # $dbh->disconnect or warn $dbh->errstr; return @features; } sub murasakiSynth { my ($alignment,$seqid)=@_; my $out_primary=our $src_primary; $out_primary=$redirect{primary} if exists $redirect{primary}; open(my $fh,$alignment) or die "Couldn't open $alignment."; my $alignname=getName($alignment); my $line=0; my @features; while(<$fh>){ chomp; my @dats=split(/\t/); my ($start,$stop,$strand)=@dats[map {$seqid*3+$_} (0..2)]; ($start,$stop)=(0-$stop,0-$start) if $start<0; $strand=($strand eq '+' ? 
1:-1); my $id="$alignname:$seqid:$line"; push(@features,new Bio::SeqFeature::Generic (-primary => $out_primary, -tag =>{gene=>$id,locus_tag=>$id}, -start=>$start, -end=>$stop, -strand=>$strand)); }continue{$line++} return @features; } sub procOverrides { my $src=pop; open(my $infh,$src) or return; while(<$infh>){ chomp; my ($key,$val)=m/^(.*?)=(.*)$/; print "Overriding $key as $val\n"; $common_overrides{$key}=$val; } } sub getSeqId { my ($seqf,$target)=@_; open(my $fh,$seqf) or die "No sequence file $seqf"; my $id=0; local $_; while(<$fh>){ chomp; return $id if m/$target/; }continue{$id++} die "Couldn't find $target in $seqf"; return 0; } sub getPath { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $path } @_; return @ret if $#_; return $ret[0]; } sub getName { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $name } @_; return @ret if $#_; return $ret[0]; } sub slurp { local $/; open(my $fh,"@_") or return; return <$fh>; } sub getDb { my ($db,$build)=@_; return $dbConnections{$db} if $dbConnections{$db}; use DBI; my $dbh; if($db=~m/^ucsc$/i){ my $dsn = "DBI:mysql:$build:genome-mysql.cse.ucsc.edu"; my $db_user_name='genome'; print "Connecting to UCSC MySQL DB...\n"; $dbh = DBI->connect($dsn, $db_user_name) or die "Couldn't connect to $dsn!"; }elsif($db=~m/^ensembl$/i){ my ($release,$version)=$build=~m/_(\d+)(_[^_]+)$/; my $port=$release>47 ? 
5306:3306; my $dsn = "DBI:mysql:$build:ensembldb.ensembl.org:$port"; my $db_user_name='anonymous'; print "Connecting to Ensembl MySQL DB...\n"; $dbh = DBI->connect($dsn, $db_user_name) or die "Couldn't connect!"; }else{ die "Unknown database: $db"; } $dbConnections{$db}=$dbh; return $dbh; } sub parseEnsemblDbName { my @bits=split(/_/,$_[0]); return {species=>"$bits[0] $bits[1]", data=>$bits[2], release=>$bits[3], assembly=>$bits[4]} } sub parseEnsemblName { my ($file)=@_; my ($filename,$dir)=fileparse($file); my ($species,$assembly,$release,$type,$chrom,$gz)=($filename=~m/([^.]+)\.(.+)\.(\d+)\.(dna(?:_rm)?)\.((?:chromosome\.[^.]+)|nonchromosomal|seqlevel)\.fa(\.gz)?/) or return undef; $chrom=$1 if $chrom=~m/chromosome\.(.*)/; return {file=>$filename, species=>$species, assembly=>$assembly, release=>$release, type=>$type, chrom=>$chrom, compressed=>$gz}; } sub guessDbSource { my ($file)=@_; my ($filename,$dir)=fileparse($file); my ($db,$build,$chrom); my $dat; if($dat=parseEnsemblName($filename)){ my ($species,$assembly,$release,$dna,$file_chrom,$gz)=@{$dat}{qw{species assembly release type chrom compressed}}; print "Looks like an Ensembl sequence ($species assembly: $assembly release: $release chromosome: $file_chrom).\n"; $db='ensembl'; my @dbs=getAllDbs($db,$release); # print "Databases available: ".join("\n",@dbs)."\n"; my @similar; do { my $base=lc($species)."_core"; my $tail=$assembly; $tail=~s/\D//g; @similar=grep(/^${base}_\d+_${tail}/,@dbs); unless(@similar){ #try just using release number print "No results for ${base}_\\d+_${tail}\n"; #get by release (should only have one choice available) my $base=lc($species)."_core_$release"; print "Trying $base\n"; @similar=grep(/^$base/,@dbs); } unless(@similar){ #last resort #bos taurus acts a little strange. 
3.1 is in the database as 3,3a,3b,3c,3d print "Still no results!\n"; ($tail)=($assembly=~m/^\D*(\d+)/); #just use the first numeric part if($tail){ print "Trying ${base}_\\d+_${tail}\n"; @similar=grep(/^${base}_\d+_${tail}/,@dbs); } } }; my @best=grep {parseEnsemblDbName($_)->{release}==$release} @similar; #if possible, get same release if(!@best){ #otherwise take the highest release number @best=sort {parseEnsemblDbName($a)->{release}<=>parseEnsemblDbName($b)->{release}} @similar if $#similar>0; } #now take latest version/assembly @best=sort {parseEnsemblDbName($a)->{assembly}<=>parseEnsemblDbName($b)->{assembly}} @best; $build=$best[$#best]; $chrom=$file_chrom; } return ($db,$build,$chrom); } sub getAllDbs { my ($db,$release)=@_; our %dbsAvailable; return @{$dbsAvailable{$db}} if ref $dbsAvailable{$db}; my @dbs; my $dbid=$db; $dbid.=$release>47 ? "_v2":"_v1" if($db eq 'ensembl'); my $cachefile="$ENV{HOME}/.murasaki.dbcache.$dbid"; if(-f $cachefile and -M $cachefile < 1){ print "Loading $db databases from cache ($cachefile)\n"; @dbs=split(/\n/,slurp($cachefile)); }else{ use DBI; print "Loading available data sources for $dbid\n"; my %sources=('ensembl_v1'=> {host=>'ensembldb.ensembl.org',user=>'anonymous',port=>3306}, 'ucsc'=> {host=>'genome-mysql.cse.ucsc.edu',user=>'genome'} ); $sources{'ensembl_v2'}={%{$sources{'ensembl_v1'}},port=>5306}; die "Unknown db: $dbid" unless ref $sources{$dbid}; @dbs=DBI->data_sources("mysql",$sources{$dbid}); @dbs=map {(split(/:/,$_))[2]} @dbs; #only need the names blit(join("\n",@dbs),$cachefile); my $count=@dbs; print "Data sources availabe in $dbid: $count\n"; } $dbsAvailable{$db}=[@dbs]; return @dbs; } sub blit { my ($dat,$file)=@_; open(my $fh,">$file"); print $fh $dat; } sub loadSegments { my ($filename)=(@_); unless(-f "$filename.len"){ print "Extracting subsequence meta-data from $filename\n"; system("$root/getsegments $filename") and die "Error creating segment data for $filename" } open(my $segfh,"$filename.len") or die 
"Couldn't open $filename.len"; my $length=<$segfh>; #first line is length only local $_; my ($at,%regions)=(0,()); while(<$segfh>){ chomp; my ($length,$name)=split(/\t/,$_); my ($id,$meta)=$name=~m/^(.*?)\s+(.*)$/; if($meta=~m/dna(?:_rm)?:.*:.*:/){ #this is ensembl style $id=(split(/:/,$name))[3]; }else{ $id=$name unless $id; } $regions{$id}={offset=>$at, length=>$length}; $at+=$length+10; } print "Loaded meta-data for ".scalar(keys %regions)." subsequences\n"; return %regions; } sub possiblyFatal { our ($strict,$beenwarned); my ($msg)=@_; die $msg if $strict; warn $msg unless $beenwarned; unless($beenwarned or $verbose){ warn "Further warning suppressed (set verbose to see them)."; $beenwarned=1; } } __END__ =head1 NAME getcds.pl - CDS extraction from various annotation formats =head1 SYNOPSIS getcds.pl [options] input file [output file] [input file [outputfile] ...] =head1 OPTIONS Options include: redirect => Enable a redirect at runtime. explained below. force => allows writing empty .cds files If [output file] is unspecificied, "[input file].cds" is used. There's lots of dark magic that can be performed with .redirect files. Redirects are used to modify the ordinary behaviour and can be applied on a global basis (a .redirect file in the same directory as [input file]), and/or on a per-input basis ([input file].redirect). Redirects are read in that order such that per-input effects are applied ontop of global effects. If you've left the filename as-is from an Ensembl downoad, the appropriate Ensembl DB source will be guessed automatically. 
Possible redirects: =item CDS - changes what tag type is searched for when running on bioperl files (default is CDS) =item offset - add some value to all coordinates =item +bioperl - on top of any other redirect-based data, also extract annotation via bioperl (using a direct disables bioperl's parser by default) =item ensembl_build - grab data from the ensembl (in the specified schema) =item ucsc_build - grab data from (in the specified schema) =item murasaki_synth - synthesize annotation data for each anchor from an alignment (if this is a directory, then each input file (eg input.lav) is checked for a corresponding .anchors file (eg. input.anchors). =item primary - what to type of tags to create from data gathered from outside data (eg: ensembl, ucsc, or murasaki). default is the same as the cds redirect. =item chromosome - forces a specific chromosome rather than deriving from filename =item gtf - extract annotation from a .gtf file murasaki/filter.pl0000755000177700001440000015524011434752242013612 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . 
use Getopt::Long; use Pod::Usage; use File::Basename; use IO::Handle; #use Data::Dump qw {dump}; #use Math::BigRat; no Carp::Assert; use strict; #open(DEBUG,">-"); BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; use Murasaki::KOG; use Murasaki::OrthoList; use Murasaki::OrthoPairs; use Murasaki::OrthoConsistency; my $tentative=1; my (@seqlen); my ($help,$man,%min,%max,$fwd,$rev,$noDetails,@plotWhat,$bins,%plotopts,$showall,$noTags,$bwPlot,$noImaginaryExtend,$noQuick,$kogfile,$widen,$randomWiden,$statDump,@dumpStats,$anchorFormat,$opt_pdf,$opt_pointsize,$opt_lwd,$opt_roclegendpos); my ($rocrTarget)='ortholog'; my @validRocrTargets=qw{ortholog paralog tagged miss}; my (@rocrWhat,$seqCount); my @rocrPlot=qw{tpr,fpr}; my @rocrEval=qw{auc}; our @knownKogs=(Murasaki::KOG->knownKogs,Murasaki::KOG->knownCogs); #they're identical for all i care our $infh={}; #global filepointer handler our ($knownKogs,$expKogs,@kogmap,$statsPass,%knownLocs,$knownOList,$expOList,$knownOPairs,$expOPairs,$knownOCons,$expOCons); my ($useOCons,$useOPairs,$useOList,$useKogs)=(1,1,1,1); our @reservedTypes=qw{label all}; #dont get printed/plotted or always available. 
hidden our @mixPlotableTypes=qw{bitscore length tfitf tfidf hits cds score bitrate homology}; our @annotationTypes=qw{cds}; #data that comes from annotation, ie: cheating our @defaultRocrTypes=grep {my $a=$_; grep {$a ne $_} @annotationTypes} @mixPlotableTypes; our @kogTypes=qw{kogtp kogfp oltp olfp}; our @alignTypes=qw{miss untagged distant paralog ortholog}; our @allStatTypes=(@mixPlotableTypes,@alignTypes,@kogTypes); our @statTypes=@allStatTypes; our %kogpref; our %statreqs=(etfidf => sub{!$noDetails}, tfitf => sub{!$noDetails}, ortholog => sub{!$noTags}, miss => sub{!$noTags}, distant => sub{!$noTags}, paralog => sub{!$noTags}, untagged => sub{!$noTags}, cds => sub{!$noTags}, bitscore => sub{$tentative || $infh->{bitscore}}, bitrate => sub{$tentative || $infh->{bitscore}}, homology => sub{$tentative || $infh->{homology}}, kogtp => sub{$kogfile}, kogfp => sub{$kogfile}, score => sub{$anchorFormat>1}, hits => sub{$anchorFormat>0} ); our %userStats; our %externalStats; #statname -> infh name our $searcher=\&binarySearch; resetStatTypes(); $bins=100; $showall='scale'; our $podusageOpts={-message=>'Use --help or --man for more detailed help.',-verbose => 0,-exitval=>2}; my $opterr= GetOptions('help|?' 
=> \$help, man => \$man, 'lwd=f'=>\$opt_lwd,'pointsize=f'=>\$opt_pointsize, 'roclegendpos=s'=>\$opt_roclegendpos, 'min=s%' => sub { pod2usage({%$podusageOpts,-msg=>"Syntax error: $_[0] $_[1]=$_[2]\n"}) unless ($_[1] and $_[2]); # pod2usage({%$podusageOpts,-msg=>"Unknown stat: $_[1]\n"}) unless(grep {$_[1] eq $_} @statTypes); $min{$_[1]}=$_[2]; }, 'max=s%' => sub { pod2usage({%$podusageOpts,-msg=>"Syntax error: $_[0] $_[1]=$_[2]\n"}) unless ($_[1] and $_[2]); # pod2usage({%$podusageOpts,-msg=>"Unknown stat: $_[1]\n"}) unless grep {$_[1] eq $_} @statTypes; $max{$_[1]}=$_[2]; }, 'def=s%' => sub { pod2usage({%$podusageOpts,-msg=>"Syntax error: $_[0] $_[1]=$_[2]\n"}) unless $_[1] and $_[2]; pod2usage({%$podusageOpts,-msg=>"Cannot redefine existing stats: $_[1]\n"}) if grep {$_[1] eq $_} @statTypes; pod2usage({%$podusageOpts,-msg=>"Cannot $_[1] is a reserved type, sorry.\n"}) if grep {$_[1] eq $_} @reservedTypes; $userStats{$_[1]}=userStatFun(@_); addStat($_[1]); my $func="sub {" . join(" && ",(map {"statReqsOk('$_')"} statReqs(@_)))."}"; $statreqs{$_[1]}=eval($func); }, 'minscore=s' => \$min{score}, 'maxscore=s' => \$max{score}, 'minlength=s' => \$min{length}, 'maxlength=s' => \$max{length}, 'mintfidf=s' => \$min{tfidf}, 'maxtfidf=s' => \$max{tfidf}, 'minhits=s' => \$min{hits}, 'maxhits=s' => \$max{hits}, fwd => \$fwd, rev => \$rev, details => sub{$noDetails=0;resetStatTypes()}, nodetails => sub{$noDetails=1;resetStatTypes()}, 'plot:s' => sub { @plotWhat=parseStatTypeList($_[1] ? $_[1]:'all')}, 'bins=i' => \$bins, 'nobins' => sub {$bins=0}, 'plotopts:s%' => sub { my ($type,$target,$data)=('all',$_[1],$_[2] ? 
$_[2]:1); ($type,$target)=($1,$2) if($_[1]=~m/(\w+)\.(\w+)/); my @targets=parseStatTypeList($type); foreach my $t (@targets){ ${$plotopts{$t}}{$target}=$data; # print "Setting $t.$target = $data\n"; } }, 'color' => sub {$bwPlot=0}, 'bw' => \$bwPlot, 'tags' => sub {$noTags=0;resetStatTypes()}, 'notags' => sub {$noTags=1;resetStatTypes()}, 'showall:s' => sub {$showall=0;$showall=$_[1] if $_[1]}, 'noextend' => \$noImaginaryExtend, 'all' => \$noQuick, 'quick' => sub {$noQuick=0}, 'kogfile=s' => sub {$kogfile=$_[1];resetStatTypes()}, 'kog=s%' => sub {pod2usage({%$podusageOpts,-msg=>"Need values for --kog $_[1]=$_[2] (lhs and rhs!)"}) unless(defined $_[1] and $_[2]); pod2usage({%$podusageOpts,-msg=>"Bad --kog $_[1]= (lhs)"}) unless(defined $_[1] and $_[2] and $_[1]>=0); pod2usage({%$podusageOpts,-msg=>"Bad --kog $_[1]=$_[2] (rhs)"}) unless(defined $_[1] and $_[2] and $_[1]>=0 and $_[2]); my $newname=scalar(Murasaki::KOG->guessKogMember($_[2])); $kogmap[$_[1]]=$newname ? $newname:$_[2]}, 'widen=i' => \$widen, 'randomwiden=i' => \$randomWiden, 'statdump' => \$statDump, 'dumpstats=s' => sub{@dumpStats=parseStatTypeList($_[1] ? $_[1]:'all',\@defaultRocrTypes)}, 'ROCR|rocr:s' => sub{pod2usage({%$podusageOpts,-msg=>"ROCR requires kog support (make sure --kogfile comes before --rocr)"}) unless $kogfile; @rocrWhat=parseStatTypeList($_[1] ? 
$_[1]:'all',\@defaultRocrTypes)}, 'rocreval=s' => sub{push(@rocrEval,$_[1])}, 'rocrplot=s' => sub{push(@rocrPlot,$_[1])}, 'rocrtarget=s' => sub{pod2usage({%$podusageOpts,-msg=>"Unknown rocrtarget: $_[1]"}) unless grep {$_ eq $_[1]} @validRocrTargets; $rocrTarget=$_[1]}, 'linear' => sub{$searcher=\&linearSearch}, 'pdf' => \$opt_pdf, 'kogpref=s%' => \%kogpref, 'usekogs!'=>\$useKogs, 'useolist!'=>\$useOList, 'useopairs!'=>\$useOPairs, 'useocons!'=>\$useOCons ); pod2usage(-exitstatus => 0, -verbose => 2) if $man; pod2usage(-exitstatus => 0, -verbose => 99, -section=>"SYNOPSIS|OPTIONS|DESCRIPTION") if $help; pod2usage($podusageOpts) if $#ARGV<0 or !$opterr; sub addStat { my ($stat)=@_; push(@allStatTypes,$stat); push(@statTypes,$stat); push(@defaultRocrTypes,$stat); push(@mixPlotableTypes,$stat); } sub statReqsOk { local $_=pop; my $subres; $subres=&{$statreqs{$_}} if $statreqs{$_}; return ($subres || !$statreqs{$_}); } sub resetStatTypes { #must amend @statTypes if user disabled some of them @statTypes=grep {statReqsOk($_)} @allStatTypes; #how can you not love perl? @plotWhat=grep {statReqsOk($_)} @plotWhat; @rocrWhat=grep {statReqsOk($_)} @rocrWhat; #how can you not love perl? } our $screenwidth=$ENV{COLUMNS}; $screenwidth=75 unless $screenwidth>0; my ($inf,$outf)=@ARGV; open(STDOUT,"|tee $inf.filterstats"); #log output! 
our $basepath=getPath($inf); our $basename=(fileparse($inf,qr{\.anchors\.details?|\.anchors}))[0]; our $basefile=getPath($inf).$basename; #my $hash_inf="$basefile.histogram.details"; open($infh->{anchors},"$inf") or die "Can't open $inf for input"; open($infh->{bitscore},"$basefile.anchors.bitscore") or delete($inf->{bitscore}) if -f "$basefile.anchors.bitscore"; open($infh->{homology},"$basefile.homology") or delete($inf->{homology}) if -f "$basefile.homology"; do{print "Bitscore data found.\n"} if exists $infh->{bitscore}; do{print "Homology data found.\n"} if exists $infh->{homology}; foreach my $statFile (<$inf.stats.*>){ open(my $fh,$statFile) or next; my ($statName)=$statFile=~m/\.stats\.(.*)$/; next unless $statName; print "Found external stat ($statName) file: $statFile\n"; if(exists $infh->{$statName} or (grep {$_ eq $statName} @statTypes)){ print "$statName conflicts with existing stat. Ignoring.\n"; next; } $infh->{$statName}=$fh; addStat($statName); $externalStats{$statName}=$statName; } #our @hashCount=loadHistogramDetails($hash_inf) if -e $hash_inf and !$noDetails; $anchorFormat=anchorFormat($infh->{anchors}); do {print "Disabling 'details' due to insufficient detail in anchors.details file.\n"; $noDetails=1} unless $anchorFormat>2 or $noDetails; resetStatTypes(); @plotWhat=split(/,/,join(',',@plotWhat)); @plotWhat=@statTypes if grep {$_ eq 'all'} @plotWhat; our (@hashCount,@docCount,@seqcds,%commonNames,@usedNames,%namecon); #data used while running statistics our $annotation; our $anchorCount=lineCount($infh->{anchors}); reverseDetails($inf) unless $noDetails; print "Assuming sequence file: $basefile.seqs\n"; setupKogmap("$basefile.seqs") if $kogfile; getTags("$basefile.seqs") unless $noTags; setupKogs("$basefile.seqs") if $kogfile; getLens("$basefile.seqs") if $randomWiden; $tentative=0; #major inits done... 
resetStatTypes(); #make sure each min/max actually has supporting stats foreach my $req (\%max,\%min){ foreach my $k (keys %$req){ do { warn "Requisite stats not found for: $k ($req->{$k})" if $req->{$k}; delete $req->{$k} } unless grep {$k eq $_} @statTypes; } } #can't dump stats for which we're also reading from their file @dumpStats=grep {!exists $externalStats{$_}} @dumpStats; my %userReqs; %userReqs=statReplace(\%max,{},"max","preview"); hashSum(\%userReqs,{statReplace(\%min,{},"min","preview")}); #Gather stats! my $prestats=gatherStats($infh,"Initial stats:\n",0,$anchorCount); %userReqs=statReplace(\%max,$prestats,"max"); hashSum(\%userReqs,{statReplace(\%min,$prestats,"min")}); $userReqs{count}++ if $rev; $userReqs{count}++ if $fwd; if(!$userReqs{count}){ print "No user requirements.\n"; exit(0); } $outf=getPath($inf).getName(getName($inf)).".filtered" if !$outf and $inf=~m/.+\.anchors(?:\.details)?/; die "Could not find an filename to use for output" unless $outf; open(OUTF,">$outf") or die "Can't open $outf for output"; print "Writing output to $outf\n"; print "Filtering $anchorCount anchors....\n"; seekAll($infh,0,0); resetTick(); my $keptCount=0; my $anchor=0; my @kept; my $linedat; LINE: while($linedat=readlineAll($infh)){ tick(); my %stats=getStats($linedat); chomp; my @vals=split(/\t/,$_); do{next unless grep {$_ eq "-"} @vals} if $rev; do{next unless grep {$_ eq "+"} @vals} if $fwd; foreach my $k (@statTypes){ do{next LINE unless $min{$k}<=$stats{$k}} if defined($min{$k}); do{next LINE unless $max{$k}>=$stats{$k}} if defined($max{$k}); } $kept[$anchor]=1; $keptCount++; print OUTF join("\t",@vals[0..($seqCount*3-1)])."\n"; } continue { $anchor++; } close(OUTF); print "\n"; my $poststats=gatherStats($infh,"Retained value stats:\n",\@kept,$keptCount); ###end of main stuff### sub avgLength { my $all=join("\t",@_); my @lens; while($all=~m/(-?\d+)\t(-?\d+)\t(.)/g){ my $length=abs($2-$1+1); push(@lens,$length); } return 0 unless $#lens>=0; return 
mean(@lens); } sub sum { my $sum=0; grep {$sum+=$_} @_; return $sum; } sub mean { my $total; foreach(@_){ $total+=$_; } return $total/($#_+1); } sub min { my $best=$_[0]; foreach(@_){ $best=$_ if $_<$best; } return $best; } sub max { my $best=$_[0]; foreach(@_){ $best=$_ if $_>$best; } return $best; } sub stddev { my $sumosqrs; my $s=mean(@_); foreach(@_){ $_-=$s; $sumosqrs+=$_*$_; } return sqrt($sumosqrs); } sub getName { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $name } @_; return @ret if $#_; return $ret[0]; } sub getPath { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $path } @_; return @ret if $#_; return $ret[0]; } sub loadHistogramDetails { my $inf=pop; print "Loading histogram details from $inf.\n"; my (@hashcount,$format); our $screenwidth; my $fh; open($fh,$inf) or die "Couldn't open $inf for reading"; $_=<$fh>; $format=1 if m/^\d+:\d+$/; $format=2 if m/^\d+\s+[ACGT.]+\s+\d+$/; die "Unknown histogram details format...\n" unless $format; my $stats=lineCount($fh); $stats*=1; print "Histogram details is format: $format and has $stats lines\n"; resetTick($stats,$screenwidth); seek($fh,0,0); my $moo; while(<$fh>){ m/^(\d+):(\d+)$/ if $format=1; m/^(\d+)\s+[ACGT.]+\s+(\d+)$/ if $format=1;; $hashcount[$1]+=$2; tick(); } print "\n"; print "Histogram details loaded.\n"; close($fh); return @hashcount; } sub reverseDetails { print "Deriving term index...\n"; our ($anchorCount,@hashCount); resetTick(); my $inf=pop; my ($infh); open($infh,$inf); while(<$infh>){ my @vals=split(/\t/,$_); my @meta=@vals[($#vals-2)..$#vals]; my ($hits,$score,$members)=@meta; foreach (split(/,/,$members)){ my ($idx,$count)=m/(\d+):(\d+)/; $hashCount[$idx]+=$count; $docCount[$idx]++; } tick(); } print "\n"; } sub tfitf { my $wi; my $dat=pop; our (@hashCount,$anchorCount); foreach (split(/,/,$dat)){ my ($idx,$count)=m/(\d+):(\d+)/; $wi+=$count*log($anchorCount/$hashCount[$idx]); } return $wi; } sub tfidf { my $wi; my $dat=pop; our 
(@hashCount,$anchorCount);
  foreach (split(/,/,$dat)){
    my ($idx,$count)=m/(\d+):(\d+)/;
    $wi+=$count*log($anchorCount/$docCount[$idx]);
  }
  return $wi;
}

# (Re)start the progress bar: $total work items across $div columns;
# defaults to the global anchor count / screen width.
sub resetTick {
  my ($total,$div)=@_;
  our ($anchorCount,$screenwidth);
  ($total,$div)=($anchorCount,$screenwidth) unless $total;
  our ($ticksper,$ticksleft)=(int($total/$div),int($total/$div));
  print "|",(map {"-"} (3..$div)),"|\n";
}

# Advance the progress bar one item; prints a dot every $ticksper items.
sub tick {
  our ($ticksper,$ticksleft);
  $ticksleft--;
  if(!$ticksleft){
    print STDOUT '.';
    STDOUT->flush();
    $ticksleft=$ticksper;
  }
}

# Compute every per-anchor statistic for one line-set (a hashref keyed
# by input file, as returned by readlineAll). Returns a flat stats hash.
# Side effect: sets the global $_ to the anchor line (callers rely on
# this after the call).
sub getStats {
  my $dats=pop;
  $_=$dats->{anchors};
  my (%bitdata,%homdata,%kogorthos,%listOrthos,%pairOrthos,%consOrthos);
  my (%namealign,%namescores,@cds);
  if($dats->{bitscore}){
    my ($bitscore,$maxbits)=split(/\t/,$dats->{bitscore});
    %bitdata=(bitscore=>$bitscore,bitrate=>$bitscore/$maxbits);
  }
  if($dats->{homology}){
    my @homvals=split(/\t/,$dats->{homology});
    %homdata=(homology=>mean(@homvals));
  }
  my @vals=split(/\t/,$_);
  # Anchor lines may carry trailing metadata columns depending on the
  # detected format: >0 adds a hit count, >1 adds score and member list.
  my $lastval=$anchorFormat>0 ? ($#vals-$anchorFormat):$#vals;
  my @meta=@vals[$lastval+1..$#vals];
  @vals=@vals[0..$lastval];
  my ($hits,$score,$members)=@meta;
  $hits=undef if $anchorFormat<1;
  ($score,$members)=(undef,undef) unless $anchorFormat>1; #only use @meta if there -is- meta
  my $length=avgLength(@vals);
  my $tfitf=tfitf($members) if $members && !$noDetails;
  my $tfidf=tfidf($members) if $members && !$noDetails;
  unless($noTags){
    @cds=findIncidentCDS(@vals);
    %namealign=nameComp(@cds);
    %namescores=hashHistogram(%namealign);
    %kogorthos=updateKogs($expKogs,@cds) if $expKogs;
    %listOrthos=updateOList($expOList,@cds) if $expOList;
    %pairOrthos=updateOPairs($expOPairs,@cds) if $expOPairs; #we don't construct new pair lists, just score old.
    %consOrthos=updateOPairs($expOCons,@cds) if $expOCons; #orthoCons work just like opairs so we can actually reuse the same func
    %namescores=(untagged=>1) unless %namescores; #if no tags, untagged!
  }
  my %stats=(length=>$length, hits=>$hits, score=>$score, tfitf=>$tfitf, tfidf=>$tfidf,
             cds=>(scalar(values(%namealign))),
             %namescores, #isnt perl spiffy?
             %bitdata,%homdata,%kogorthos,%listOrthos, %pairOrthos,
             namealign=>\%namealign);
  # Mix in externally supplied stats and user-defined derived stats.
  foreach my $estat (keys %externalStats){
    $stats{$estat}=$dats->{$externalStats{$estat}};
  }
  foreach my $ustat (keys(%userStats)){
#    print "Run $ustat = ";
    $stats{$ustat}=&{$userStats{$ustat}}(%stats);
#    print "$stats{$ustat}\n";
  }
  return %stats;
}

# Score this anchor's loci against a known ortholog-pair database via
# $olist->hitAllKnown; returns consistent (opc) / inconsistent (opic)
# pair counts.
sub updateOPairs {
  my ($olist,@cds)=@_;
  my @bits;
  for my $i (0..$#cds){
    next unless $kogmap[$i];
    my @locs=map {{$_->{locus}=>$kogmap[$i]}} (grep {$_->{locus}} @{$cds[$i]});
    push(@bits,\@locs);
  }
  my %res=$olist->hitAllKnown(@bits);
  return (opc=>$res{hits},opic=>$res{misses});
}

# Record this anchor's loci as an experimental ortholog set and score
# them against the known ortholog list (oltp/olfp counts).
sub updateOList {
  my ($olist,@cds)=@_;
  my @bits;
  for my $i (0..$#cds){
    next unless $kogmap[$i];
    my @locs=map {{$_->{locus}=>$kogmap[$i]}} (grep {$_->{locus}} @{$cds[$i]});
    return unless @locs; #cant use if missing a locus in a genome
    push(@bits,\@locs);
  }
  $olist->addOrtho(@bits);
  my %res=$knownOList->isOrtho(@bits);
  return (oltp=>$res{yes},olfp=>$res{no});
}

# Pairwise KOG scoring across all KOG-mapped sequences of this anchor:
# counts known-KOG true/false positives and, on the first stats pass,
# records the experimental pairs into $expKogs.
sub updateKogs {
  my ($kogs,@cds)=@_;
  my @koggedbits=grep {$kogmap[$_]} (0..$#cds);
  my %res=(kogtp=>0,kogfp=>0);
  for my $i (@koggedbits){
    for my $j (@koggedbits){
      next if $i==$j;
      foreach my $l1 (map {$_->{locus}} @{$cds[$i]}){
        foreach my $l2 (map {$_->{locus}} @{$cds[$j]}){
          #oh dear god, the combinatorial explosion of it!
          $expKogs->add({$l1=>$kogmap[$i]},{$l2=>$kogmap[$j]}) if $statsPass<2; #only do this once
          $res{($knownKogs->isOrtho($l1,$l2)) ?
'kogtp':'kogfp'}++;
        }
      }
    }
  }
  return %res;
}

# Histogram of the VALUES of a hash: value => occurrence count.
sub hashHistogram {
  my (%hist,%in);
  %in=@_;
  foreach(values(%in)){
    $hist{$_}++;
  }
  return %hist;
}

# seek() every filehandle in the supplied hashref of handles.
sub seekAll {
  my ($fhr,$pos,$whence)=@_;
  foreach my $file (keys(%{$fhr})){
    seek($fhr->{$file},$pos,$whence);
  }
}

# Read one line from every handle in the hashref; returns a hashref of
# lines keyed like the input, or undef once any file is exhausted.
sub readlineAll {
  my ($fh)=(@_);
  my %res;
  my %failed;
  foreach my $key (keys(%$fh)){
    $res{$key}=readline($$fh{$key});
    $failed{$key}=1 unless $res{$key};
  }
  if(keys %failed){#if one file fails, fail all
    warn join(",",keys %failed)." file(s) ended before other files" unless keys(%failed)==keys(%res);
    return undef;
  }
  return \%res;
}

# The heavy lifter: stream all anchors, compute per-anchor stats, dump
# requested stat/ROCR files, plot histograms, and report summary
# statistics (means, stddevs, ortholog ROC numbers). $kept (arrayref of
# flags) restricts the pass to anchors retained by the filter.
sub gatherStats {
  my ($fh,$subtitle,$kept,$input_count)=@_;
  our ($anchorCount);
  my ($anchor,$anchors)=(0,0);
  my (%sum,%avg,%variance,%stddev);
  my (%histo);
  my ($tp,$fp,$fn);
  @usedNames=();%namecon=();
  my %rocrUsedLabels;
  # Phase 1 when called on the unfiltered input, 2 on the retained set.
  my $plotPhase=($subtitle=~m/initial/i ? 1:2);
  $expKogs=Murasaki::KOG->empty($plotPhase<2 ? "unfiltered experimental":"filtered experimental") if $knownKogs and $useKogs;
  $expOList=Murasaki::OrthoList->empty($plotPhase<2 ? "unfiltered experimental":"filtered experimental",\@kogmap,\%knownLocs) if $knownOList and $useOList;
  $expOPairs=Murasaki::OrthoPairs->clone($knownOPairs,$plotPhase<2 ? "unfiltered experimental":"filtered experimental",\@kogmap,\%knownLocs) if $knownOPairs and $useOPairs;
  $expOCons=Murasaki::OrthoConsistency->clone($knownOCons,$plotPhase<2 ? "unfiltered experimental":"filtered experimental",\@kogmap,\%knownLocs) if $knownOCons and $useOCons;
  my $prefix=$plotPhase<2 ? "":"filtered";
  my %outfhs;
  if($statDump){
    my $statdumpPath="$basefile.".($prefix ? "$prefix.":"")."stats";
    open($outfhs{statdump},'>',$statdumpPath) or die "Couldn't write to $statdumpPath";
    print "Dumping per-anchor stats to $statdumpPath\n";
    print {$outfhs{statdump}} join("\t",@statTypes),"\n";
  }
  foreach my $stat (@dumpStats){
    my $statdumpPath="$basefile.".($prefix ? "$prefix.":"")."filterstats.$stat";
    open($outfhs{"dump-$stat"},'>',$statdumpPath);
    print "Dumping per-anchor $stat stats to $statdumpPath\n";
  }
  my $rocrPath;
  if(@rocrWhat){
    $rocrPath="$basefile.".($prefix ? "$prefix.":"")."rocr";
    open($outfhs{rocr},'>',$rocrPath) or die "Couldn't write to $rocrPath";
    print {$outfhs{rocr}} join("\t","label",@rocrWhat),"\n";
  }
  my @business=qw{averages};
  push(@business,'histogram') if @plotWhat;
  push(@business,'classification') unless $noTags;
  push(@business,'KOGs') if $knownKogs;
  $statsPass=1;
  print "Analyzing $input_count anchors (phase 1: ". (join(", ",@business)).")\n";
  resetTick();
  seekAll($fh,0,0);
  my $linedat;
  while($linedat=readlineAll($fh)){
    next if($kept and !$$kept[$anchor]);
    my %stats=getStats($linedat);
    foreach my $k (@statTypes){
      next unless $stats{$k};
      my $kval=$stats{$k};
      $sum{$k}+=$kval;
      if(grep {$_ eq $k} @plotWhat){
        if($stats{cds}){ #has annotation data
          foreach my $aligned (@alignTypes){
            next unless $stats{$aligned};
            ${$histo{$k}}{$kval}->{$aligned}+=$stats{$aligned}/$stats{cds}; #mmm fractional histograms
          };
        }else{ #belongs to "unknown"
          ${$histo{$k}}{$kval}->{untagged}++;
        }
      }
      # NOTE(review): this tp/fp accumulation sits INSIDE the per-stat
      # loop, so lengths are counted once per non-zero stat type --
      # looks like a weighting quirk; confirm it is intentional.
      if($stats{cds}){ #hit some annotation data
        my %aligned=%{$stats{namealign}};
        foreach my $name (keys(%aligned)){
          $namecon{$name}->{$aligned{$name}}++;
          ($aligned{$name} eq "miss" ? $fp:$tp)+=$stats{length}/scalar(keys(%aligned));
        }
      }else{
        $fp+=$stats{length};
      }
    }
    $anchors++;
    print {$outfhs{statdump}} join("\t",map {$stats{$_}} @statTypes),"\n" if $statDump;
    foreach my $stat (@dumpStats){
      print {$outfhs{"dump-$stat"}} $stats{$stat}."\n";
    }
    if(@rocrWhat){
      foreach my $label (rocrLabels(\%stats)){
        foreach my $stat (@rocrWhat){
          ${$rocrUsedLabels{$stat}}[$label]=1;
        }
        print {$outfhs{rocr}} join("\t",$label, map {$stats{$_}} @rocrWhat),"\n";
      }
    }
  } continue { $anchor++; tick();}
  my %imaginary=hitCdsSize();
  $fn=$imaginary{fn};
  $imaginary{fp}=$fp;
  print "\n";
  foreach my $k (@plotWhat){
    plotHistogram($histo{$k},$k,$plotPhase);
  }
  # Only ROCR-plot stats whose label columns saw both classes.
  my $rocrRes=plotRocr($rocrPath,$plotPhase, [grep {${$rocrUsedLabels{$_}}[0] && ${$rocrUsedLabels{$_}}[1]} @rocrWhat]) if @rocrWhat;
  #averages
  foreach my $k (keys(%sum)){
    $avg{$k}=$sum{$k}/$anchors;
  }
  $statsPass=2;
  # Phase 2 (a second full pass) is only needed when a user expression
  # actually references stddev (s) or variance (v).
  if($noQuick or $userReqs{s} or $userReqs{v}){
    print "Analyzing $input_count anchors (phase 2: stddev)\n";
    seekAll($fh,0,0);
    $anchor=0;
    resetTick();
    while($linedat=readlineAll($fh)){
      next if($kept and !$$kept[$anchor]);
      my %stats=getStats($linedat);
      foreach my $k (@statTypes){
        next unless $stats{$k};
        my $dif=$stats{$k}-$avg{$k};
        $variance{$k}+=$dif*$dif;
      }
    } continue { $anchor++ ; tick();}
    foreach my $k (keys(%variance)){
      $variance{$k}/=$anchors;
      $stddev{$k}=sqrt($variance{$k});
    }
  }else{
    print "Skipping Phase 2 (standard deviation/variation) as it's not necessary.\n";
  }
  seekAll($fh,0,0);
  my %kogres=$knownKogs->compare($expKogs) if $knownKogs and $expKogs;
  my %olres=$knownOList->compare($expOList) if $knownOList and $expOList;
  print "\n";
  print $subtitle if $subtitle;
  print "Total anchors: $anchors\n";
  foreach my $k (@statTypes){
    print "$k mean: $avg{$k}".($stddev{$k} ? " stddev $stddev{$k}":"")."\n";
  }
  print "ROCR derived predictor stats:\n$rocrRes" if $rocrRes;
  unless($noTags){
    print "CDS/".($knownKogs ? "KOG/Name":"Name")."-based Paralog ROC:\n";
    printf("Sensitivity: %s (with extend: %s)\n", percent($tp,$tp+$fn), percent($imaginary{tp},$imaginary{tp}+$imaginary{fn}));
    printf("Specificity: %s (with extend: %s)\n", percent($tp,($tp+$fp)), percent($imaginary{tp},$imaginary{tp}+$imaginary{fp}));
  }
  if($knownKogs and $expKogs){
    print "KOG-based Ortholog ROC:\n";
    print "Known KOGs: ".$knownKogs->summary."\n";
    print "Experimental KOGs: ".$expKogs->summary."\n";
    my %kogroc=(tp=>$kogres{inboth},fn=>$kogres{in1},fp=>$kogres{in2},tn=>$kogres{in0});
    print(join(" ",map {uc($_).": $kogroc{$_}"} qw{tp fp fn tn}),"\n");
    printf("Sensitivity: %s\n", percent($kogroc{tp},$kogroc{tp}+$kogroc{fn}));
    printf("Specificity: %s\n", percent($kogroc{tp},$kogroc{tp}+$kogroc{fp}));
    printf("Precision: %s\n", percent($kogroc{tp}+$kogroc{tn},$kogroc{tp}+$kogroc{fp}+$kogroc{tn}+$kogroc{fn}));
    printf("MCC: %f\n",
           (($kogroc{tp}*$kogroc{tn})/sqrt(($kogroc{tp}+$kogroc{fp})*
                                           ($kogroc{tp}+$kogroc{fn})*
                                           ($kogroc{tn}+$kogroc{fp})*
                                           ($kogroc{tn}+$kogroc{fn}))))
      if (($kogroc{tp}+$kogroc{fp})* #no dividing by zero!
          ($kogroc{tp}+$kogroc{fn})*
          ($kogroc{tn}+$kogroc{fp})*
          ($kogroc{tn}+$kogroc{fn}));
  }
  if($knownOList and $expOList){
    print "Non-transitive Ortholog ROC:\n";
    print "Known Orthos: ".$knownOList->summary."\n";
    print "Experimental Orthos: ".$expOList->summary."\n";
    my %olroc=(tp=>$olres{inboth},fn=>$olres{in1},fp=>$olres{in2},tn=>$olres{in0});
    print(join(" ",map {uc($_).": $olroc{$_}"} qw{tp fp fn tn}),"\n");
    printf("Sensitivity: %s\n", percent($olroc{tp},$olroc{tp}+$olroc{fn}));
    printf("Specificity: %s\n", percent($olroc{tp},$olroc{tp}+$olroc{fp}));
    printf("Precision: %s\n", percent($olroc{tp}+$olroc{tn},$olroc{tp}+$olroc{fp}+$olroc{tn}+$olroc{fn}));
    printf("MCC: %f\n",
           (($olroc{tp}*$olroc{tn})/sqrt(($olroc{tp}+$olroc{fp})*
                                         ($olroc{tp}+$olroc{fn})*
                                         ($olroc{tn}+$olroc{fp})*
                                         ($olroc{tn}+$olroc{fn}))))
      if (($olroc{tp}+$olroc{fp})* #no dividing by zero!
($olroc{tp}+$olroc{fn})* ($olroc{tn}+$olroc{fp})* ($olroc{tn}+$olroc{fn})); } if($knownOPairs and $expOPairs){ print "Orhtolog-pair based statistics:\n"; print "Known pairs: ".$knownOPairs->summary."\n"; my %opres=($expOPairs->stats); #gives sens and spec assert(defined $opres{con}); assert(defined $opres{incon}); $opres{fscore}=(2*$opres{sens}*$opres{spec})/($opres{sens}+$opres{spec}) if ($opres{sens}+$opres{spec}); local $,="\n"; local $\="\n"; print("Consistent: ".$opres{con}, "Inconsistent: ".$opres{incon}, "Recall: ".$opres{sens}, "Precision: ".$opres{spec}, "F-Score: ".$opres{fscore}); } if($knownOCons and $expOCons){ print "Orhtolog-constitency based statistics:\n"; print "Known orthologs: ".$knownOCons->summary."\n"; my %opres=($expOCons->stats); #gives sens and spec assert(defined $opres{con}); assert(defined $opres{incon}); $opres{fscore}=(2*$opres{sens}*$opres{spec})/($opres{sens}+$opres{spec}) if ($opres{sens}+$opres{spec}); local $,="\n"; local $\="\n"; print("Consistent: ".$opres{con}, "Inconsistent: ".$opres{incon}, "Recall: ".$opres{sens}, "Precision: ".$opres{spec}, "F-Score: ".$opres{fscore}); } return {anchors=>$anchors,avg=>\%avg,variance=>\%variance,stddev=>\%stddev}; } sub rocrLabels { my $stats=pop; local $_=$rocrTarget; if(m/^ortholog$/){ return ((map {1} (1..$stats->{oltp})), (map {0} (1..$stats->{olfp}))) }elsif(m/^kogolog$/){ return ((map {1} (1..$stats->{kogtp})), (map {0} (1..$stats->{kogfp}))) }elsif(m/^tagged$/){ return ($stats->{cds} ? 
1:0);
  }elsif(m/^paralog$/){
    my $hits=sum(@{$stats}{qw{paralog ortholog}});
    my $misses=sum(@{$stats}{qw{miss untagged distant}});
    return ((1) x $hits, (0) x $misses);
  }elsif(m/^miss$/){
    my $hits=sum(@{$stats}{qw{paralog ortholog untagged distant}});
    my $misses=sum(@{$stats}{qw{miss}});
    return ((1) x $hits, (0) x $misses);
  }
  return ();
}

# Write one value per line to the named file.
sub writeList {
  open(my $outfh,'>',shift);
  foreach(@_){
    print $outfh ($_,"\n");
  }
}

# Format $_[0]/$_[1] as a percentage string; 'N/A%' when the denominator
# is zero or undefined.
sub percent {
  return 'N/A%' unless $_[1];
  return sprintf("%.2f%%",$_[0]/$_[1]*100);
}

# List the other statistics referenced by a user-supplied expression.
sub statReqs {
  my ($opt,$stat,$exp)=@_;
  my @available=grep {$_ ne $stat} @statTypes;
  my @reqs;
  foreach my $ustat (@available){
    push(@reqs,$ustat) if $exp=~m/(?<=\W)$ustat(?=\W|$)/;
    push(@reqs,$ustat) if $exp=~m/^$ustat(?=\W|$)/;
  }
  return @reqs;
}

# Compile a user-supplied stat expression into a sub over the %stats
# hash (stat names are rewritten to $stats{name} lookups, then the
# result is string-eval'd -- user input only, by design).
sub userStatFun {
  my ($opt,$stat,$exp)=@_;
  my @available=grep {$_ ne $stat} @statTypes;
  my @reqs;
  foreach my $ustat (@available){
    my $rep='$stats{'.$ustat.'}';
    $exp=~s/(?<=\W)$ustat(?=\W|$)/$rep/g;
    $exp=~s/^$ustat(?=\W|$)/$rep/g;
  }
  return eval('sub { my %stats=@_; return '.$exp.'}');
}

# Substitute gathered statistics (x=mean, s=stddev, v=variance,
# a=anchor count) into the user's min/max expressions and evaluate them.
# In "preview" mode only counts which substitutions would be used.
# Returns the usage counts plus count/converts totals.
sub statReplace {
  my ($reqsr,$statsr,$desc,$preview)=@_;
  my %reqs=%{$reqsr};
  my %stats=%{$statsr};
  my %using;
  my ($count,$first,$converts)=(0,1,0);
  foreach my $k (keys(%reqs)){
    next unless defined($reqs{$k});
    $count++;
    next unless $reqs{$k}=~m/x|s|v|a/; #only fiddle with things that have an x or an s
    if($first and !$preview){print "\n";$first=0}
    if($preview){
      $using{x}+=($reqs{$k}=~m/x/g);
      $using{s}+=($reqs{$k}=~m/s/g);
      $using{v}+=($reqs{$k}=~m/v/g);
      $using{a}+=($reqs{$k}=~m/a/g);
    }else{
      print "Converting $desc$k: $reqs{$k} -> ";
      $reqs{$k}=~s/(\d\.?\d*)(x|s|v|a)/\1*\2/g;
      $using{x}+=($reqs{$k}=~s/x/${$stats{avg}}{$k}/g);
      $using{s}+=($reqs{$k}=~s/s/${$stats{stddev}}{$k}/g);
      $using{v}+=($reqs{$k}=~s/v/${$stats{variance}}{$k}/g);
      $using{a}+=($reqs{$k}=~s/a/$stats{anchors}/g);
      print $reqs{$k}." = ";
      $reqs{$k}=eval($reqs{$k});
      print $reqs{$k}."\n";
    }
    $converts++;
  }
  print "\n" if $converts and !$preview;
  %{$reqsr}=%reqs;
  return (%using,count=>$count,converts=>$converts);
}

# Detect the anchor file format from its first line and count the number
# of (start,stop,strand) triples per line into the global $seqCount.
#   1: triples + hit count;  2: also a score;  3: also a member list.
sub anchorFormat {
  my $fh=pop;
  my $format=0;
  $seqCount=0;
  seek($fh,0,0);
  $_=<$fh>;
  $format=1 if m/^(-?\d+\s-?\d+\s[+-]\s*)+\d+$/;
  $format=2 if m/^(-?\d+\s-?\d+\s[+-]\s*)+\d+\s(\d+.?\d*)$/;
  $format=3 if m/^(-?\d+\s-?\d+\s[+-]\s*)+\d+\s(\S+)\s(\d+:\d+,?)+$/;
  while(m/(-?\d+\s-?\d+\s[+-]\s*)/g){
    $seqCount++;
  }
  seek($fh,0,0);
  return $format;
}

# Count the lines of a filehandle, rewinding it before and after.
sub lineCount {
  my $count=0;
  my $fh=pop;
  seek($fh,0,0);
  while(<$fh>){
    $count++;
  }
  seek($fh,0,0);
  return $count;
}

# Render one statistic's histogram: optionally rebin (log-scaled x by
# default), optionally rescale/crop the axes, write the data file, and
# hand it to gnuplot via makePlot.
sub plotHistogram {
  my ($histr,$type,$phase) = @_;
  my (@bins,%hist);
  unless(keys(%$histr)){
    print "Cowardly refusing to graph an empty $type histogram.\n";
    return;
  }
  if($bins){
    ${$plotopts{$type}}{with}="boxes" unless ${$plotopts{$type}}{with}; #lines for bins doesnt make sense
    if($bwPlot){
      ${$plotopts{$type}}{style}="fill pattern 3 border 6" unless ${$plotopts{$type}}{style}; #and we like our bins visible
    }else{
      ${$plotopts{$type}}{style}="fill solid border 7" unless ${$plotopts{$type}}{style}; #and we like our bins visible (and in multiple colors!)
    }
    my @skeys=sort {$a <=> $b} keys(%$histr);
#    print "Rebinning $type data into $bins ".
#      (${$plotopts{$type}}{flatx} ? "":"logscaled ")."bins...\n";
    my ($min,$max)=($skeys[0],$skeys[$#skeys]);
    my $range=$max-$min;
    unless(${$plotopts{$type}}{flatx}){
      ($min,$max)=map(log,($min!=0 ? $min:0.1,$max!=0 ? $max:0.1));
      $range=$max-$min;
      @bins=map {exp($min+$range*($_/$bins))} (0..($bins-1));
    }else{
      @bins=map {$min+$range*($_/$bins)} (0..($bins-1));
    }
    my $i=0;
    foreach my $k (@skeys){
      while($k>$bins[$i] && $i<$#bins){
        $i++;
      }
      foreach my $aligntype (keys(%{$$histr{$k}})){
        $hist{$bins[$i]}->{$aligntype}+=$$histr{$k}->{$aligntype};
      }
    }
  }else{
    @bins=sort {$a <=> $b} keys(%$histr);
    %hist=%$histr;
  }
  my $hdata="$basefile.$type.".($phase>1 ?
"filtered.":"")."histogram.data"; open(HDATA,">$hdata") or die "Couldn't open $hdata for writing"; if(!${$plotopts{$type}}{flaty}){ if($showall eq "scale"){ my ($ymin,$xmin)=(min(map {sum(values(%$_))} values(%$histr)),min(keys(%$histr))); my ($ymax,$xmax)=(max(map {sum(values(%$_))} values(%$histr)),max(keys(%$histr))); $ymin-=$ymin/10;$ymax+=$ymax/10; $xmin-=$xmin/10;$xmax+=$xmax/10; $ymin=.9 unless $xmin; $xmin=.9 unless $xmin; ${$plotopts{$type}}{yrange}="[$ymin:$ymax]"; ${$plotopts{$type}}{xrange}="[$xmin:$xmax]"; ${$plotopts{$type}}{tics}="out"; }elsif($showall eq 'crop'){ my ($lastvisible,$i)=(0,0); foreach my $k (@bins){ $lastvisible=$i if $hist{$k}>1; $i++; } @bins=@bins[0..$lastvisible]; } } # print "Writing $type histogram data to $hdata: @alignTypes\n"; foreach my $k (@bins){ print HDATA join("\t",$k, map { $_ ? $_:0 } ($hist{$k} ? (@{$hist{$k}}{@alignTypes}): map {0} @alignTypes))."\n"; } close(HDATA); makePlot($hdata,$type); } sub makePlot { my ($datafile,$type)=@_; my $title="$basename anchor $type histogram"; my %opts; my $plotwith='linespoints'; %opts=%{$plotopts{$type}} if ref $plotopts{$type} eq "HASH"; my $extra_opts; while(my ($opt,$val)=each %opts){ if($opt eq 'with'){ $plotwith=$val; }elsif($opt=~m/^flat(x|y)$/){ $extra_opts.="unset logscale $1\n"; } else{ $extra_opts.="set $opt $val\n"; } } my @serieslist; my $hiIdx=$#alignTypes+2; for(my $i=0;$i<=$#alignTypes;$i++){ my $align=$alignTypes[$i]; my $y=($plotwith eq 'boxes') ? 
join("+",map {"\$$_"} (reverse($i+2..$hiIdx))) : '\$'.($i+2); push(@serieslist, qq!"$datafile" using 1:($y) title '$align' with $plotwith!); } my $series=join(",\\\n",@serieslist); my ($xlabel,$ylabel)=("$type value","count"); my $outbase=getPath($datafile).getName($datafile); my $gnuplot_cmds=<$outbase.plot"); # print "Writing plot commands to $outbase.plot\n"; print PLOTCMDS $gnuplot_cmds; close(PLOTCMDS); print ">Gnuplotting $outbase.png\n"; open(GNUPLOT,"|gnuplot"); print GNUPLOT $terminal_setup; print GNUPLOT $gnuplot_cmds; close(GNUPLOT); } sub plotRocr { my ($rocrPath,$plotPhase,$targetStats)=@_; my @rocrWhat=@$targetStats; #make local screened copy my ($pointsize_str,$lwd); $lwd=$opt_lwd ? $opt_lwd:1; $pointsize_str=",pointsize=$opt_pointsize" if $opt_pointsize; unless(@rocrWhat){ print "Nothing to ROCR...\n"; return; } print "ROCR-$rocrTarget: @rocrWhat\n"; open(my $R,'>',"$rocrPath.R") or die "Couldn't write to $rocrPath.R"; print $R <1){ foreach my $plot (@rocrPlot){ my ($yax,$xax)=split(/,/,$plot); my ($perfname,$perfcmd)=((join("-",$yax,$xax)), (join(",",(map {qq!"$_"!} ($yax,$xax))))); if($opt_pdf){ print $R <& $rocrPath.Rout"); my $Rres=slurp("$rocrPath.Rout"); if($RresCode){ print "R had an error! 
($RresCode) R said:\n$Rres--END--\n"; } my (%evalRes,$i); while($Rres=~m/^\[1\]\s+(\d+\.?\d+)/gsm){ my ($eval,$stat)=($rocrEval[$i%(@rocrEval)], $rocrWhat[int($i/(@rocrEval))]); $evalRes{$stat}->{$eval}=$1; $i++; } return makeTable(\%evalRes,@rocrEval); } sub slurp { local $/; open(my $fh,"@_") or return; return <$fh>; } sub makeTable { my ($data,@cols)=@_; return unless @cols; my %longest; $longest{label}=max(map(length,keys(%$data))); foreach my $col (@cols){ $longest{$col}=max(map {length($data->{$_}->{$col})} keys(%$data)); } my $res=join(" ",sprintf("%$longest{label}s",''),map{sprintf("%-$longest{$_}s",$_)} @cols)."\n"; foreach my $stat (keys(%$data)){ $res.=join(" ",sprintf("%$longest{label}s",$stat), (map {sprintf("%-$longest{$_}f",$data->{$stat}->{$_})} @cols))."\n"; } return $res; } sub parseStatTypeList { my ($types,$all)=@_; $all=\@mixPlotableTypes unless $all; my @l; foreach my $t (split(/,/,$types)){ unless(grep {$_ eq $t} (@mixPlotableTypes,'all')){ pod2usage({%$podusageOpts,-msg=>qq!"$t" is not a recognized statistic to plot!}); } if($t eq 'all'){ push(@l,@$all); }else{ push(@l,$t); } } return @l; } sub fixCoords { my $cd=pop; #actually nothing to do return unless $cd->{strand}; delete $cd->{strand}; } sub getLens { my ($seqsfile)=@_; open(SEQSFH,$seqsfile) or do {print "Couldn't open sequence list to find sequence length.\nDisabling randomwiden.\n"; undef $randomWiden}; #no seqs file for reference print "Reading sequence lengths...\n"; my $i=0; LoadLenSeqs: while(){ chomp; my $seq=$_; my $length=`$root/geneparse.pl -lf $seq`; chomp $length; $seqlen[$i]=$length; }continue{$i++} close SEQSFH; } sub getTags { my ($seqsfile)=@_; our @seqcds=(); open(SEQSFH,$seqsfile) or return; #no seqs file for reference print "Checking for annotation...\n"; my ($seqcount,$cdscount); $annotation=0; LoadSeqs: while(){ chomp; my $seq=$_; $cdscount=0; $seqcount++; if(!-f "$seq.cds"){ print "CDS file not found for $seq. 
Generating...\n";
      my $res=system("$root/getcds.pl $seq");
      unless(-f "$seq.cds"){
        print "Generation of CDS file for $seq failed\n" unless -f "$seq.cds\n";
        push(@seqcds,undef);
        next LoadSeqs;
      }
    }
    open(CDS,"$seq.cds") or die "Couldn't read $seq.cds";
    our ($overlaps,$messyoverlaps)=(0,0);
    print "Loading annotation for $seq...\n";
    my @cds;
    my $kogSpec=$seqcount-1;
    $kogSpec=$kogmap[$kogSpec] if $kogmap[$kogSpec];
    # NOTE(review): the read target of this loop (presumably <CDS>)
    # appears to have been lost from this copy of the file -- confirm
    # against the upstream source.
    LoadCDS: while(){
      chomp;
      my ($name,$start,$stop,$strand,$locus) = split(/\t/,$_);
      $locus=$name if $name && !$locus;
      $locus=uc $locus;
      my $cd={name => $name,start=>$start,stop=>$stop,strand=>$strand,locus=>$locus};
      $commonNames{$name}++;
      $knownLocs{$locus}=$kogSpec;
      fixCoords($cd);
      inorderInsert(\@cds,$cd);
#      print "CD-tree: ".dump(@cds)."\n";
      $cdscount++;
    }
    close CDS;
    cdsSanityCheck(@cds);
    print("$cdscount CDS's loaded into ",($#cds+1)," regions ($overlaps overlaps, $messyoverlaps of them out of order).\n");
    push(@seqcds,\@cds);
    $annotation++;
  }
  close SEQSFH;
  unless($annotation){
    print "No annotation data found. Disabling annotation based analysis.\n";
    $noTags=1;
    resetStatTypes();
    return;
  }
  print "Establishing shared names list from $annotation sequences... ";
  # Keep only names that appear in every annotated sequence.
  foreach my $name (keys(%commonNames)){
    do{ delete $commonNames{$name}} unless $commonNames{$name}==$annotation;
  }
  print (scalar(keys(%commonNames))," shared name",(keys(%commonNames)==1 ? "":"s"),".\n");
}

# Verify each chunk's cached bounds match the min/max of its regions and
# that chunks are strictly ordered.
# NOTE(review): $lastStart/$lastStop are never advanced to
# $nowStart/$nowStop inside the comparison loop, so ordering is only
# checked against the FIRST chunk -- looks suspicious; confirm.
sub cdsSanityCheck {
  my ($first,@cds)=@_;
  foreach my $chunkr (@_){
    die "Bad start bounds (should be ".(min(map {$_->{start}} @{$chunkr->{regions}})).") on ".dump($chunkr) if min(map {$_->{start}} @{$chunkr->{regions}})!=$chunkr->{bounds}->{start};
    die "Bad stop bounds (should be ".(max(map {$_->{stop}} @{$chunkr->{regions}})).") on ".dump($chunkr) if max(map {$_->{stop}} @{$chunkr->{regions}})!=$chunkr->{bounds}->{stop};
  }
  my ($lastStart,$lastStop);
  foreach (@{$first->{regions}}){
    $lastStart=min($_->{start},defined($lastStart) ? $lastStart:());
    $lastStop=max($_->{stop},defined($lastStop) ? $lastStop:());
  }
  foreach (map {$_->{regions}} @cds){
    my ($nowStart,$nowStop);
    foreach (@$_){
      if($_->{start}<=$lastStart || $_->{stop}<=$lastStop){
        print dump($first,@cds);
        die "Out of order cds! ".dump($_);
      }
      if(defined($nowStart)){
        $nowStart=min($_->{start},$nowStart);
        $nowStop=max($_->{stop},$nowStop);
      }else{
        $nowStart=$_->{start};
        $nowStop=$_->{stop};
      }
    }
  }
}

# Chunk ordering for the binary search: chunks compare equal when they
# overlap, otherwise by stop coordinate.
sub regionCmp {
#  print DEBUG "Comp: ($_[0]->{stop}<=>$_[1]->{stop}) --> ";
  return 0 if overlaps(@_);
#  print DEBUG (($_[0]->{stop}<=>$_[1]->{stop}),"\n");
  return ($_[0]->{stop}<=>$_[1]->{stop})
}

# True when two {start,stop} regions intersect.
sub overlaps {
  return ($_[0]->{start}<=$_[1]->{stop} and $_[1]->{start}<=$_[0]->{stop});
}

# Normalize an anchor hashref so start<=stop.
sub fixAnchor {
  my ($r)=@_;
  ($r->{start},$r->{stop})=($r->{stop},$r->{start}) if($r->{stop} < $r->{start}); #dumb dumb dumb
}

# For each sequence's (start,stop,strand) triple in the argument list,
# collect the annotated CDS entries the (optionally widened) anchor
# overlaps; returns one arrayref of CDS hits per sequence.
sub findIncidentCDS {
  my $seqi=0;
  my @allhits;
  while(@_){
    do{shift;shift;shift;next} unless $seqcds[$seqi]; #skip sequences for which we don't have annotation
    my %anchor=(start=>abs(shift)-$widen,stop=>abs(shift)+$widen,strand=>shift);
    fixAnchor(\%anchor);
    my @hits=&$searcher(\%anchor,@{$seqcds[$seqi]});
    if($randomWiden){
      # Also probe a random window of the same width (control sampling).
      my $base=int($randomWiden+rand(max(0,$seqlen[$seqi]-$randomWiden)));
      my @newhits=&$searcher({start=>$base-$randomWiden,stop=>$base+$randomWiden,strand=>'+'},@{$seqcds[$seqi]});
      my %done=map {$_=>1} @hits;
      push(@hits,grep {!$done{$_}} @newhits); # dont read the same CDS twice
    }
    foreach my $cds (@hits){
      $usedNames[$seqi]->{$cds->{name}}=1;
    }
    push(@{$allhits[$seqi]},@hits);
  }continue{
    $seqi++;
  }
  return @allhits;
}

# Classify the CDS names hit by an anchor relative to the first
# annotated sequence: ortholog / paralog / distant / miss / untagged.
# NOTE(review): both loops run while $i<$#_, so the LAST sequence in the
# argument list is never compared -- confirm whether intentional.
sub nameComp {
  my (%ref,@ref,$inited);
  my %res;
  my $i=0;
  for(;$i<$#_;$i++){ #make intial set
    next unless $seqcds[$i];
    foreach my $cds (@{$_[$i]}){
      push(@ref,$cds->{name});
      $ref{$cds->{name}}=$cds;
      $res{$cds->{name}}='unknown';
      $inited=1;
    }
    last if $inited;
  }
  for(;$i<$#_;$i++){ #compare others
    unless(@{$_[$i]}){ #if it didnt hit any CDS
      if(($knownKogs && $kogmap[$i]) || $seqcds[$i]){ #and that seq has annotation...
        #then this anchor is a miss
        foreach my $k (keys(%res)){
          $res{$k}='miss';
        }
      }
    }
    foreach my $cds (@{$_[$i]}){
      my $name=$cds->{name};
      if($knownKogs && $kogmap[$i]){ #if KOG is available, use that
        my $loc=$cds->{locus};
        #if it's not in the KOG database, it has no known relatives
        #and is likely only distantly related at best...
        $res{$name}='distant' unless $knownKogs->isIn($loc);
        if($res{$name} eq 'ortholog'){
          #must be an ortho to something on %ref or downgrade to paralog
          $res{$name}='paralog' unless grep {$knownKogs->isOrtho($loc,$_->{locus})} values(%ref);
        }elsif($res{$name} eq 'unknown'){
          #ok, it's similar so at least paralog
          $res{$name}='paralog';
          #if KOG aligns it, then ortholog
          $res{$name}='ortholog' if grep {$knownKogs->isOrtho($loc,$_->{locus})} values(%ref);
        }
      }else{ #no KOGs, guess based on names (wee!)
        unless($ref{$name}){ #name not present in ref seq?
          if($commonNames{$name}){ #name exists elsewhere...
            $res{$name}='distant';
          }else{ #it's not a common name. might just be unannotated in other seq
            $res{$name}='paralog';
          }
        }else{ #names match!
          $res{$name}='ortholog' if($res{$name} eq 'unknown');
        }
      }
    }
  }
  foreach my $k (keys(%res)){
    $res{$k}='miss' if $res{$k} eq 'unknown'; #never hit another tag
  }
  return %res;
}

# Brute-force region search over every chunk's regions.
sub linearSearch {
  my $needle=shift;
  return map {grep {overlaps($needle,$_)} @{$_->{regions}}} @_;
}

# Binary search over sorted CDS chunks for regions overlapping $needle,
# then scan outward in both directions to catch neighboring chunks that
# also overlap.
sub binarySearch { #(needle,haystack)
  my ($needle,$low,$high)=($_[0],1,$#_);
  my $mid;
  my @hits;
#  print "Looking for ".dump($needle)."\n";
  do {
    $mid=int(($low+$high)/2);
    my $res=regionCmp($_[$mid]->{bounds},$needle);
    goto MONKIES if $res==0;
    $low=$mid+1 if $res<0;
    $high=$mid-1 if $res>0;
  }while($low<$high);
 MONKIES:
  $mid++ if $low==$mid;
  my ($start,$stop)=($mid,$mid);
  #check for needle overlapping lots of hay
  @hits=grep {overlaps($needle,$_)} (@{$_[$mid]->{regions}});
  my $shits=$#hits;
  while(--$start>=1){
    my @new=grep {overlaps($needle,$_)} (@{$_[$start]->{regions}});
    last unless @new;
    push(@hits,@new);
  }
  my $lhits=$#hits;
  while(++$stop<=$#_){
    my @new=grep {overlaps($needle,$_)} (@{$_[$stop]->{regions}});
    last unless @new;
    push(@hits,@new);
  }
  my $rhits=$#hits;
#  print "Got bits! ".join(",",map {$_->{locus}} @hits)."\n" if $rhits>=0;
#  print "Got bits! ".scalar(@hits)."\n" if $rhits>=0;
  return @hits;
}

# Insert a CDS into the ordered chunk list, merging overlapping regions
# into existing chunks. (Definition continues past this excerpt.)
sub inorderInsert { #(haystack,needle) too match push/unshift
  #ewww remind me to write recursively...
  our ($overlaps,$messyoverlaps);
  my ($hayr,$needle)=(@_);
  my ($low,$high)=(0,$#$hayr);
#  print "Insert $needle->{start} ~ $needle->{stop}\n";
  return push(@$hayr,newChunk($needle)) if($high<0); #first element (supergreen! baboomching!)
return push(@$hayr,newChunk($needle)) if ($$hayr[$high]->{bounds}->{stop}<=>$needle->{start})<0 and !overlaps($needle,$$hayr[$high]->{bounds}); #biggest needle return unshift(@$hayr,newChunk($needle)) if ($$hayr[$low]->{bounds}->{start}<=>$needle->{stop})>0 and !overlaps($needle,$$hayr[$low]->{bounds}); #smallest needle my $mid; do { # print "Looping at $low ~ $high\n"; #check current $mid=int(($low+$high)/2); my $hits=overlaps($needle,$$hayr[$mid]->{bounds}); # print "Overlaps in $mid!\n" if $hits; if($hits){ #ok, does it overlap neighbors? $overlaps++; my ($start,$stop)=($mid,$mid); #check for needle overlapping lots of hay my ($lefthits,$righthits); while(--$start>=0){ my $new=overlaps($needle,$$hayr[$start]->{bounds}); last unless $new; $lefthits+=$new; } $start++; while(++$stop<=$#{$hayr}){ my $new=grep {overlaps($needle,$_)} (@{$$hayr[$stop]->{regions}}); last unless $new; $righthits+=$new; } $stop--; if($lefthits || $righthits){ #oh fuck... $messyoverlaps++; # print "Messy insert of ".dump($needle)."!\n"; #everything from $start to $stop now goes into 1 blob my (@pre,@mid,@post); if($start>0){ @pre=@{$hayr}[0..$start-1]; } if($stop<$#{$hayr}){ @post=@{$hayr}[$stop+1..$#{$hayr}]; } # $start..$stop should contain multiple anons that need to be flattened foreach (@{$hayr}[$start..$stop]){ push(@mid,(map {@{$_->{regions}}} $_)); } push(@mid,$needle); #the new guy #now reassemble... @$hayr=(@pre,newChunk(@mid),@post); }else{ #simple overlap. yay! addRegion($$hayr[$mid],$needle); } return; #either way, resolved. 
} $_=regionCmp($$hayr[$mid]->{bounds},$needle); $low=$mid+1 if $_<0; $high=$mid-1 if $_>0; }while($low<=$high); for(regionCmp($$hayr[$mid]->{bounds},$needle)){ @$hayr=(@$hayr[0..$mid],newChunk($needle),@$hayr[($mid+1)..$#$hayr]) and last if $_<0; @$hayr=(@$hayr[0..($mid-1)],newChunk($needle),@$hayr[$mid..$#$hayr]) and last if $_>0; last; } } sub newChunk { if(@_==1){ my ($needle)=@_; return {bounds=>{start=>$needle->{start},stop=>$needle->{stop}},regions=>[$needle]}; }else{ my $chunkr={bounds=>{},regions=>[@_]}; resetBounds($chunkr); return $chunkr; } } sub addRegion { my ($chunkr,$needle)=@_; push(@{$chunkr->{regions}},$needle); resetBounds($chunkr); } sub resetBounds { my ($chunkr)=@_; $chunkr->{bounds}->{start}=min(map {$_->{start}} @{$chunkr->{regions}}); $chunkr->{bounds}->{stop}=max(map {$_->{stop}} @{$chunkr->{regions}}); } sub hitCdsSize { my ($tp,$fp,$fn); for(my $i=0;$i<@seqcds;$i++){ foreach my $cdsl (@{$seqcds[$i]}){ foreach my $cds (@{$cdsl->{regions}}){ next unless $commonNames{$cds->{name}}; my ($consensus,$conscore); foreach my $align (sort {$a cmp $b} (keys(%{$namecon{$cds->{name}}}))){ my $thisScore=$namecon{$cds->{name}}->{$align}; if($thisScore>$conscore){ $consensus=$align; $conscore=$thisScore; } } unless($usedNames[$i]->{$cds->{name}} or !$consensus){ $fn+=($cds->{stop}-$cds->{start}+1); }elsif($consensus ne 'miss' && $consensus ne 'untagged'){ $tp+=($cds->{stop}-$cds->{start}+1); } } } } return (tp=>$tp,fp=>$fp,fn=>$fn); } sub hashSum { my $r=shift; foreach(@_){ foreach my $k (keys(%{$_})){ $r->{$k}+=$_->{$k}; } } } sub setupKogmap { my $seqdata=shift; open(my $seqfh,$seqdata) or die "Couldn't read sequence file"; my $i=0; my %aliases=Murasaki::KOG->commonAliases; foreach(<$seqfh>){ next if $kogmap[$i]; chomp; my ($id)=Murasaki::KOG->guessKogMember($_); unless($id){ #load kogfile and try again Murasaki::KOG->learnKogSpecs($kogfile); $id=Murasaki::KOG->guessKogMember($_); } next unless $id; print "Identified $_ as KOG member $id\n"; 
$kogmap[$i]=$id; }continue{$i++} } sub setupKogs { our @kogmap; if(!@kogmap){ print "Kogfile supplied, but no input sequences associated with Kogs?\n"; print " >> Maybe try --kog 1= (eg. --kog 0=sce) to\n", " >> force associations?\n"; $kogfile=undef; print "Disabling ROCR...\n"; @rocrWhat=undef; return; } print "knownlocs contains: ".scalar(keys %knownLocs)." entries\n"; $knownKogs=Murasaki::KOG->kogFrom($kogfile,[grep {$_} @kogmap],\%knownLocs,\%kogpref); $knownOList=Murasaki::OrthoList->orthosFromKog($knownKogs,\@kogmap,\%knownLocs) if $useOList; $knownOPairs=Murasaki::OrthoPairs->orthosFromKog($knownKogs,\@kogmap,\%knownLocs) if $useOPairs; $knownOCons=Murasaki::OrthoConsistency->orthosFromKog($knownKogs,\@kogmap,\%knownLocs) if $useOCons; unless($knownKogs){ print "Disabling ROCR..."; @rocrWhat=undef; } } __END__ =head1 NAME filter.pl - filters output from murasaki based on various filters =head1 SYNOPSIS filter.pl [options] [output file] =head1 OPTIONS Filtering options: Filtering can be performed on "length", "hits", "tfidf", or "score". You can filter by either setting either a "min" or "max". For example: "--minlength 50" filters out any anchors with an average length shorter than 50. (average length is used because an anchor may be different sizes on different sequences). Filters can also use the average(x), standard deviation(s), variance(v), or anchor count(a) in an expression to set a filter. For example: "--minscore=2x" sets the minimum score threshold to twice the average score. "--minlength=x+s/2" sets the minimum length threshold to the average length plus half a standard deviation. 
Input modification: "--widen=500" => extends each anchor 500 bases in both directions "--randomwiden=500" => extends each anchor 500 bases _at random_ (ie: to compare to --widen) Output options: "--statdump" => dump all available stats into 1 file with a header row "--dumpstats tfidf,length" => dumps stats tfidf and length to separate files one line per anchor "--dumpstats" => assumes "--dumpstats all" Plot options: filter.pl can plot any or all of the statistics gathered by using the "--plot" option. Examples: "--plot hits,length" => plots both hits and length "--plot" => assumes "--plot all" "--plotopts" allows setting of various gnuplot and special plot options. Different statistics can be targeted separately by prefixing the setting name with the statistic(s) of your choice followed by a "." For example: "--plotopts hits,length.flatx" => disables the log scale on the x axis of the hits and length plots only "--plotopts with=points" => uses points instead of bars on all plots "--plotopts =" => can be used to set arbitrary gnuplot options of the form "set " "--bins" can be used to specify the number of bins. "--nobins" turns off binning and plots a raw (likely bumpy) histogram "--showall=crop" sends all data to gnuplot even on logscale plots (by default for logscale plots values <=1 at the extreme right end are chopped off because they dont show up in gnuplot (1 is the baseline) but they do affect the visible range, and thus causes some scrunching. "--showall=scale" manually sets to the X and Y ranges to the range of the data (so 1's are visibly different from 0's). 
"--showall=" disables scaling/cropping The Gnuplot commands for generating the plots are also dumped to ..plot and can be run interactively in gnuplot by typing: load "..plot" Statistical Options: "--all/--quick" => standard deviation calculations require a second pass through the data, and as histogram plots are generally much more useful than a standard deviation statistic (especially considering not all of these statistics may be gaussian), so unless one of your constraints calls for standard deviation, this calculation is skipped. It can be forced by applying --all. (--quick is the default) "--nodetails" => disables reconstruction of term indicies (this will disable tfidf stats). "--tags" => enables reading of annotation. This produces "good, miss, shuffle" stats (which can also be plotted) and specificity/sensitivity information "--notags" => disables reading of annotation (by default) COG/KOG Statistics: At any rate: "--kogfile=path/to/kog" => enables kog-based alignment "--kogmap 3=hsa" => forces sequence 3 (note: sequences are 0 indexed) to be assigned to the "hsa" kog. If the file name includes one of the kog species abbreviations, it is assumed to belong to that kog. Debugging: "--linear" => forces linear scans for CDS's instead of binary searches (if this returns different results, it means something is very wrong) [output file]: If the input filename is of the form .anchors.details, then the [output file] defaults to .filtered. Incidentally, if you don't provide a .anchors.details file, it probably won't work anyway... =head1 DESCRIPTION Filters murasaki alignments based on various statistics. Various statistics can be plotted using --plot. Annotation data is processed from input files using BioPerl. ROC data can be calculated using KOG data (which is much more reliable than just gene names). 
To do so you need to specify a KOG data file which can be downloaded from the COG database at: http://www.ncbi.nlm.nih.gov/COG/ You'll be looking for either the "whog" file for COGs or the "kog" file on the KOG side. KOG locus naming sometimes differs from GBK file to file, and locus names are sometimes missing, so KOG based assessment is currently best effort (capitalization is ignored, locii which don't appear in the annotation are ignored, and domain-specific _x endings are ignored). =cut murasaki/osfinder2anchors.pl0000755000177700001440000000777211434752241015603 0ustar krispusers#!/usr/bin/perl -w #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use strict; use Getopt::Long; use Pod::Usage; use File::Basename; #use Data::Dump qw {dump}; use strict; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my ($man,$help,$makecds,$seqfile); my (@seqnames,$output_opt); my $opterr= GetOptions('help|?' 
=> \$help, man => \$man, 'seq=s'=>\$seqfile, 'output=s'=>\$output_opt); pod2usage(1) if $help; pod2usage(-exitstatus => 0, -verbose => 2) if $man; pod2usage({-verbose=>1,-exitval=>2,-message=>'Need some input file...'}) if $#ARGV<0; my $round; foreach my $inf (@ARGV){ my ($iprefix,$dir,$ext)=fileparse($inf,qr{\..*}); $seqfile="$dir/$iprefix.seqs" unless $seqfile; die "Need $seqfile (or specify manually)" unless -f $seqfile; open(my $infh,$inf) or die "Couldn't open $inf"; my @seqs=map { {file=>$_} } (split(/\n/,slurp($seqfile))); foreach my $seq (@seqs){ print "Loading regions for $seq->{file}\n"; $seq->{regions}=[getRegions($seq->{file})]; } my $outf=($output_opt and $round) ? "$output_opt.$round":$output_opt; $outf="$inf.anchors" unless $outf; my ($oprefix,$odir,$oext)=fileparse($outf,qr/\.anchors/); my $oseq="$dir$oprefix.seqs"; print STDERR "Warning: Overwriting $oseq\n" if -f $outf; open(my $oseqfh,">$oseq"); foreach my $seq (@seqs){ print $oseqfh "$seq->{file}\n"; } print STDERR "Warning: Overwriting $outf\n" if -f $outf; open(my $ofh,">$outf"); while(my $line=<$infh>){ chomp $line; my @anchors=getAnchors($line); die "Illegal number of anchors (".scalar(@anchors).")" unless scalar(@anchors)==scalar(@seqs); print $ofh join("\t", map {fixInRegion($anchors[$_],$seqs[$_]->{regions})} 0..$#anchors)."\n"; } } sub fixInRegion { my ($anchor,$regions)=@_; return ($anchor->{start},$anchor->{stop},$anchor->{sign}) unless $regions; die "programming error ftl!" 
unless ref($regions) eq 'ARRAY'; my $region=$$regions[$anchor->{chrom}-1]; die "Invalid region: $anchor->{chrom}" unless $region; return ($anchor->{start}+$region->{start}-1,$anchor->{stop}+$region->{start}-1,$anchor->{sign}); } sub anchorToString { my ($anchor)=@_; return join("\t",$anchor->{start},$anchor->{stop},$anchor->{sign}); } sub getAnchors { my ($line)=@_; my @d=split(/\s+/,$line); my @anchors; while(@d){ my ($chrom,$start,$stop,$sign)=map {shift(@d)} (1..4); ($start,$stop)=(-$start,-$stop) if($start<0); ($start,$stop)=($stop,$start) if $stop<$start; #be flexible push(@anchors,{chrom=>$chrom,start=>$start,stop=>$stop,sign=>$sign}); } return @anchors } sub getRegions { my ($file)=@_; die "No file specified?" unless $file; my @res; if($file=~m/\.stitch$/){ open(my $fh,$file) or die "Couldn't open stitch sub-file $file"; while(<$fh>){ my @dat=split(/\t/); push(@res,{file=>$dat[0],length=>$dat[1],start=>$dat[2],stop=>$dat[3]}); } }else{ unless(-f "$file.len"){ system("$root/getsegments $file"); } open(my $fh,"$file.len") or die "Couldn't open segment data for $file"; my $length=<$fh>; #first line is length only my $at=1; while(<$fh>){ my ($length,$meta)=m/^(\d+)\t(.*)/; push(@res,{length=>$length,start=>$at,stop=>$at+$length}); $at+=10; } } return @res; } sub slurp { local $/; open(my $fh,"@_") or return; return <$fh>; } murasaki/anchors2maf.pl0000755000177700001440000000751511434752242014531 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use strict; use Getopt::Long; use Pod::Usage; use File::Basename; use Data::Dump qw{dump}; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; our ($inf,$pairs,$verbose); local $\="\n"; my ($help,$man); GetOptions('help|?' => \$help, man => \$man, 'pairs+' => \$pairs, 'verbose+'=>\$verbose); pod2usage(1) if $help or scalar(@ARGV)<1; pod2usage(-exitstatus => 0, -verbose => 4) if $man; $inf=shift(@ARGV); die "File ($inf) not found.\n" unless open(my $infh,$inf); my ($in_name,$in_path,$in_suffix) = fileparse($inf, qr{\.anchors}); $in_path=~s!/$!!; our $basename="$in_path/$in_name"; my %infiles=(anchors=>$inf); $infiles{score}="$basename.anchors.stats.ungappedscore" if -f "$basename.anchors.stats.ungappedscore"; my @seqs=split("\n",slurp("$basename.seqs")); die "Lack of sequences?" unless @seqs; my %lengths=map {$_=>`$root/geneparse -l -c $_`} @seqs; if($pairs){ foreach my $i (0..$#seqs){ foreach my $j (($i+1)..$#seqs){ my $mafName="$basename.anchors.$i-$j.maf"; makeMafFromSeqs($mafName,$i,$j); } } }else{ my $mafName="$basename.anchors.all.maf"; makeMafFromSeqs($mafName,0..$#seqs); } sub makeMafFromSeqs { my ($outf,@seqi)=@_; open(my $outfh,">$outf"); local $,="\n"; print "Writing $outf for sequences: @seqi"; print $outfh "##maf version=1 scoring=blastz program=murasaki","# murasaki"; my %infh=map {open(my $fh,$infiles{$_}) or die "Failed to open $infiles{$_}"; $_=>$fh} keys %infiles; while(my %lines=getLines(%infh)){ last unless keys(%lines); my @anchors=getAnchors($lines{anchors}); my $score=$lines{score} if exists $lines{score}; print $outfh defined $score ? "a score=$score":"a"; foreach my $i (@seqi){ my $src=$seqs[$i]; my $length=$lengths{$seqs[$i]}; my $anchor=$anchors[$i]; my $start=$anchor->{strand} eq '+' ? 
($anchor->{start}-1):($length-$anchor->{stop}); my $extract="$root/geneparse -c $seqs[$i]\[$anchor->{start},$anchor->{stop}\]"; my $content=$anchor->{strand} eq '+' ? `$extract`:`$extract | $root/antisense.pl -c`; print $outfh sprintf("s %-10s %9d %4d %1s %9d ",$seqs[$i],$start,$anchor->{length},$anchor->{strand},$length).$content; } print $outfh ""; } } sub getAnchors { my ($line)=@_; my @anchors; my @bits=split(/\t/,$line); foreach my $i (map {$_*3} 0..(scalar(@bits)/3)){ push(@anchors,($bits[$i+2] eq '+') ? {start=>$bits[$i], stop=>$bits[$i+1], strand=>$bits[$i+2], length=>$bits[$i+1]-$bits[$i]+1} :{ start=>abs($bits[$i+1]), stop=>abs($bits[$i]), strand=>$bits[$i+2], length=>abs($bits[$i+1]-$bits[$i])+1 } ); } return @anchors; } sub slurp { local $/; open(my $fh,"<",@_) or die "Couldn't open @_"; my $r=<$fh>; return $r; } sub getLines { my %fhs=@_; my %lines; my $ended; foreach my $k (keys %fhs){ $lines{$k}=readline($fhs{$k}); if(length $lines{$k}){ if($ended){ die "$ended ($infiles{$ended}) ended before $k ($infiles{$k})"; } chomp $lines{$k}; }else{ unless($ended){ $ended=$k; } } } return () if $ended; return %lines; } murasaki/analyze-pat.pl0000755000177700001440000001164011434752242014545 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . 
use strict; use Getopt::Long; use Pod::Usage; use File::Basename; use Data::Dump qw{dump}; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my ($verbose,$matrix_file); my ($help,$man); GetOptions('help|?' => \$help, man => \$man, 'verbose+'=>\$verbose, 'matrix=s'=>\$matrix_file); pod2usage(1) if $help or scalar(@ARGV)<1; pod2usage(-exitstatus => 0, -verbose => 4) if $man; my %scoring=grabScoreMatrix($matrix_file); print "Using matrix:\n"; print prettyMatrix(\%scoring)."\n"; my %stats=(minMatch=>minMatching(\%scoring), maxMatch=>maxMatching(\%scoring), minMiss=>minMiss(\%scoring), maxMiss=>maxMiss(\%scoring), minAll=>min(map {values %$_} values %scoring), maxAll=>max(map {values %$_} values %scoring) ); print "Matrix stats:\n".prettyHash(\%stats)."\n"; foreach my $inf (@ARGV) { my $pat; if($inf=~m/^\[\d+\D\d+\]/){ $pat=`$root/randpat.pl -c $inf`; analyzePat($pat); }elsif($inf=~m/[01]+$/){ $pat=$inf; analyzePat($pat); }else{ open(my $fh,$inf) or die "Couldn't open file $inf"; while(<$fh>){ chomp; analyzePat($_) if m/^[01]+$/; } } } exit; sub analyzePat { my ($pat)=@_; my ($zeros,$ones,$length)=(zeros($pat),ones($pat),length($pat)); print "Pattern: $pat\n" if $verbose; print "Analyzing [$ones,$length] ($zeros zeros) pattern.\n"; my %pstats=(minMatch=>$zeros*$stats{minAll}+$ones*$stats{minMatch}, maxMatch=>$zeros*$stats{maxAll}+$ones*$stats{maxMatch}, minMiss=>max(0,$ones+$zeros-1)*$stats{minAll}+$stats{minMiss}, maxMiss=>max(0,$ones+$zeros-1)*$stats{maxAll}+$stats{maxMiss} ); print "Pattern stats:\n".prettyHash(\%pstats)."\n"; } exit; sub prettyHash { my ($hr)=@_; my $r; my @k=sort {my ($sa,$sb)=map {my $t=$_;$t=~s/^(min|max)(.*)$/$2/;$t} ($a,$b); return ($sb cmp $sa) if ($sb cmp $sa); return $b cmp $a} keys %$hr; foreach my $k (@k){ $r.="$k: $hr->{$k}\n"; } return $r; } sub prettyMatrix { my ($mr)=@_; my $r; my @k=sort keys(%$mr); $r.=join("\t",undef,@k)."\n"; foreach my $row (@k){ $r.=join("\t",$row,map {$mr->{$row}->{$_}} @k)."\n"; } return 
$r; } sub zeros { my ($pat)=@_; return scalar(grep(/^0$/,split(//,$pat))); } sub ones { my ($pat)=@_; return scalar(grep(/^1$/,split(//,$pat))); } sub weight { return ones @_; } sub minMatching { my ($mr)=@_; return min(map {$mr->{$_}->{$_}} (keys %$mr)); } sub maxMatching { my ($mr)=@_; return max(map {$mr->{$_}->{$_}} (keys %$mr)); } sub minMiss { my ($mr)=@_; my @k=keys %$mr; return min(map {my $row=$_;@{$mr->{$row}}{grep{$row ne $_} @k}} (@k)); } sub maxMiss { my ($mr)=@_; my @k=keys %$mr; return max(map {my $row=$_;@{$mr->{$row}}{grep{$row ne $_} @k}} (@k)); } sub grabScoreMatrix { my ($file)=@_; my $matrix_text; if($file){ open(my $fh,$file) or die "Couldn't open $file"; for(1..8){ $matrix_text.=; } }else{ $matrix_text=<{$cols[$i]}=$row[$i]; } $row++; } die "Never found a matrix?" unless $go; return %matrix; } sub slurp { local $/; open(my $fh,"@_") or return; return <$fh>; } sub min { my $best=$_[0]; foreach(@_){ $best=$_ if $_<$best; } return $best; } sub max { my $best=$_[0]; foreach(@_){ $best=$_ if $_>$best; } return $best; } sub sum { my $sum=0; grep {$sum+=$_} @_; return $sum; } sub mean { my $total; foreach(@_){ $total+=$_; } return $total/($#_+1); } __END__ =head1 NAME analyze-pat.pl -- analyzes a pattern =head1 SYNOPSIS analyze-pat.pl =head1 OPTIONS --verbose|-v => More verbose output to stdout. Can be applied multiple times. --matrix => Where to find the scoring matrix. =head1 DESCRIPTION =cut murasaki/histoplot.pl0000755000177700001440000001036511434752242014350 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. 
# #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . use Getopt::Long; use Pod::Usage; use strict; my ($help,$man,$title,%opts,$extra_cmd,$echo_on,$byrank,$nopercent,$clean,$scale,$notitle); my ($xres,$yres)=(800,800); my $font="/usr/share/fonts/truetype/ttf-bitstream-vera/Vera.ttf"; $font="/usr/share/fonts/truetype/ttf-dejavu/DejaVuSans.ttf" unless -e $font; $font=undef unless -e $font; my $fontsize; my $opt_res=GetOptions('help|?' => \$help, man => \$man, 'title=s' => \$title, 'notitle'=>\$notitle, 'opt=s%'=>\%opts,'cmd=s'=>\$extra_cmd,"echo"=>\$echo_on, "rank"=>\$byrank,"nopercent"=>\$nopercent,"clean"=>\$clean, 'font=s'=>\$font,'fontsize=f'=>\$fontsize, 'scale=s'=>\$scale,'xres=i'=>\$xres,'yres=i'=>\$yres, 'size=s'=>sub {($xres,$yres)=$_[1]=~m/(\d+)\D+(\d+)/ or die "Bad size format ($_[1])"}); pod2usage(1) if $help or $#ARGV<0 or !$opt_res; pod2usage(-exitstatus => 0, -verbose => 1) if $man; my ($file)=@ARGV; die "File not found: $file" unless -f $file; $byrank=1 if $file=~m/histogram\.details$/; my $xlabel="bucket size"; my $ylabel="keys"; my $xlabel="rank" if $byrank; my $ylabel="frequency" if $byrank; if($byrank){ print "Histogram by rank selected...\n"; my $srcfile=$file; my @src; $file="$file.rank"; if($clean or !-e $file){ #gotta make it! print "Rebuilding histogram file by rank\n"; my $total; open(SRC,"$srcfile"); while(){ my @quick=split(/\D+/,$_); my $val=pop(@quick); next unless $val; $total+=$val; push(@src,$val); } close(SRC); print "Sorting $#src values...\n"; @src=sort {$b <=> $a} @src; print "Writing $file...\n"; open(RANK,">$file"); for(0..($#src)){ my $val=$nopercent ? 
$src[$_]:$src[$_]/$total; $_+=1; print RANK "$_\t$val\n"; } close(RANK); print "Done building rank file $file\n"; } } my $fontset=qq!font "$font" $fontsize! if $fontsize and $font; $title=$file unless $title; my $extra_opts=join("\n",map {"set $_ $opts{$_}"} keys(%opts)); my $gnuplot_cmds=<] [--opt=] [--echo] =head1 OPTIONS =over 8 =item B<--title> Sets an optional graph title. Default is histogram file name. =item B<--opt=[,...]> Sends additional "set" options to gnuplot. For example, say you want to force the x axis to [1:1000], you can use --opt=xrange=[1:1000] =item B<--cmd=[;...]> Sends extra arbitrary commands to gnuplot. =item B<--echo> Echos a copy of the commands sent to gnuplot. =item B<--rank> Sort words into rank =item B<--nopercent> Preserve raw frequency, not percent. =item B<--clean> Forces a remake of the gnuplot data file (good if you've switched to --nopercent) =item B<--font> Specify a font (and optional pointsize) to use in PNG =item B<--font> Just specify font size (and use bitstream vera as font) =item B<--scale> scale for default font family: tiny, small, mediu, large, or giant =back =head1 DESCRIPTION Draws histogram plots. =cut murasaki/qlob.pl0000755000177700001440000001400611434752242013254 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . 
use File::Basename; use Pod::Usage; use Sys::Hostname; use Cwd; use strict; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; our %mca; our $mem=$ENV{QLOB_MEM} ? $ENV{QLOB_MEM}:"4gb"; our $dir="output"; our $localroot=cwd; chomp $localroot; if($#ARGV<0 or grep(/--help/,@ARGV)){ pod2usage(1); exit; } our ($name,$target,$local,$kogfile,$mpinodes,$myrinet,$username,$cm); our $ppn=$ENV{QLOB_PPN} ? $ENV{QLOB_PPN}:4; my @seqs=grep(/^[^-]/,@ARGV); @ARGV=map {$_="-p11111100100001111100000100101111" if $_ eq "-P"; $_} @ARGV; @ARGV=grep {my $res=0; if(m/^(-n|--name=)(.*)/){ $username=$2; $res=1; } if(m/^(--ppn=)(\d+)/){ $ppn=$2; $res=1; } if(m/^(--cm)/){ $cm=1; $myrinet=1; $res=1; } if(m/^(--target=)(.*)/){ $target=$2; $res=1; } if(m/^(-m|--memo?r?y?=)(.*)/){ $mem=$2; $res=1; } if(m/^--?l(ocal)?$/){ $local=1; $res=1; } if(m/^(--kogfile=)(.*)/){ $kogfile=$2; $res=1; } if(m/^(--mpi(?:nodes)?=)(\d+)/){ $mpinodes=$2; $res=1; } if(m/^(--myrinet)/){ $myrinet=1; $res=1; } if(m/^(--dir=)(.*)/){ $dir=$2; $res=1; } !$res } @ARGV; my $runstring=join(" ",@ARGV); print "Got Sequences: ".join(",",@seqs)."\n"; if(!$username){ $name=join("-",getName(@seqs)); $name=quotemeta($name); $name=~s!\\[^\\\-/]!_!g; $name=~s!\\-!-!g; foreach my $seq (@seqs){ die "sequence file not found: $seq" unless -e $seq; } my $basename=$name; my $round; while(-e "$dir/$name.job"){ $round++; $name=$basename."_$round"; } }else{ $name=$username; } print "Using name: $name\n"; my $prefix="$dir/$name"; print "Storing output to: $prefix\n"; die "No name?" unless $name; #print "Murasaki Args: $runstring"; my %requirements; my @cmds; push(@cmds,$ENV{QLOB_PROLOG}) if $ENV{QLOB_PROLOG}; #$requirements{mem}=$mem; $requirements{vmem}=$mem; #$requirements{pmem}=$mem; #$requirements{nodes}="1:bigmem"; $requirements{nodes}="$target" if $target; $requirements{nodes}=join(":",($mpinodes ? ($mpinodes,"ppn=$ppn"):($target ? $target:())),($myrinet ? 
"myrinet":())); if($ENV{QLOB_BADMX}){ $mca{btl}='^mx'; }else{ $mca{pml}='cm' if($myrinet and $cm); if($ENV{MX_CSUM}){ #use debugging library my $debuglib='/usr/local/lib64/debug'; $debuglib=$ENV{QLOB_DEBUG_LD_LIBRARY_PATH} if $ENV{QLOB_DEBUG_LD_LIBRARY_PATH}; unshift(@cmds, 'export LD_LIBRARY_PATH='.$debuglib.':$LD_LIBRARY_PATH', "export MX_CSUM=$ENV{MX_CSUM}" ); } } my $reqs=join(",",map {"$_=$requirements{$_}"} (grep {$requirements{$_}} keys(%requirements))); my $mpiopts=join(" ",map {"-mca $_ $mca{$_}"} keys %mca); my $qsub_args=qq!-d $localroot -mae -N "murasaki:$name" -e $prefix.stderr -o $prefix.stdout -l $reqs!; open(JOB,">$prefix.job"); print JOB "#!/bin/sh\n\n"; if(`which qsub` and !$local){ open(QSUB,"|qsub $qsub_args"); # print JOB "qsub $qsub_args\n"; print "Launching via qsub\n";# $qsub_args\n"; print QSUB "echo \$HOSTNAME > $prefix.host\n"; #quite annoying not to know that }else{ open(QSUB,">-"); $local=1; } my $perl=""; if($local){ # chdir($root); $perl=`which perl`; chomp $perl; } foreach my $cmd ("set -e", @cmds, "time ".($mpinodes ? "mpirun $mpiopts ":"")."$root/murasaki -n$name $runstring | tee $prefix.murasaki", "time $perl $root/simplegraph.pl $prefix.anchors", "time $perl $root/histoplot.pl $prefix.histogram", "time $perl $root/histoplot.pl $prefix.histogram.details", ($kogfile ? ("time $perl $root/filter.pl --kogfile $kogfile $prefix.anchors"):()), ){ my ($file)=$cmd=~m/(\S*)$/; do {print "Necessary file not found for histogram graph.\n"; next;} if $local and $file=~m/histogram/ and !($file=~m/murasaki$/) and !-e "$file"; $cmd=~s/^time // if $local; #local does not like timing perl programs print QSUB "$cmd\n"; print JOB "$cmd\n"; print "Local run. Exit result: ".system($cmd)."\n" if $local; } close QSUB; close JOB; sub getName { my @ret=map { # m!.*?/?([^./]+)\..*?! or print "Eek. 
Couldn't extract name from $_\n"; # $1 my ($name,$path,$suffix) = fileparse($_,qr{\.[^.]*(?:\.gz)?}); $name } @_; return @ret if $#_; return $ret[0]; } __END__ =head1 NAME qlob.pl: run quick mursaki compare =head1 SYNOPSIS qlob.pl [options] {genome list...} qlob is a frontend to murasaki (through qsub if it's available). It automatically generates an output name for you based on the input files, sets a more reasonable memory limit, runs the job, and generates dot-plot outputs. In addition to the standard Murasaki options (see murasaki --help), qlob supports some some options of its own: Options: -P uses an old pattern of 11111100100001111100000100101111 --target= requests the job be run on hostname -n|--name name for the job (allows overwriting of old output) -m|--mem specifies a memory requirement to pass to qsub (default 4gb) --mpi=N use MPI across N machines (sets ppn=4 also) --myrinet require myrinet nodes --kogfile= Run filter.pl to score alignment using =head1 DESCRIPTION B runs several sequences through murasaki. When run on an hpc machine it submits the job via qsub. =cut murasaki/homology-score.pl0000755000177700001440000001567111434752242015276 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . 
use Getopt::Long; use Pod::Usage; use File::Basename; use IO::Handle; #use Data::Dump qw {dump}; #use Math::BigRat; use strict; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; my %aligners=(clustalw=>'clustalw -outfile=/dev/null', muscle=>'muscle -out /dev/null'); my $aligner='muscle'; my $widen=0; our $maxChunkSize=64000; my $tmp='/tmp'; my ($help,$man); GetOptions('help|?' => \$help, man => \$man, 'aligner=s' => \$aligner, 'widen=i'=>\$widen); pod2usage(1) if $help or $#ARGV<0 or !$aligners{$aligner}; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my ($inf,$outf)=@ARGV; our $screenwidth=$ENV{COLUMNS}; $screenwidth=75 unless $screenwidth>0; our ($infh,$outfh,@seqlist,%seqdata); our $basepath=getPath($inf); our $basename=(fileparse($inf,qr{\.[^\.]+}))[0]; our $basefile=getPath($inf).$basename; $outf="$basefile.homology" unless $outf; open($infh->{anchors},"$inf") or die "Can't open $inf for input"; open($infh->{seqs},"$basefile.seqs") or die "Couldn't load sequence file list"; open($outfh,">$outf"); our $anchorCount=lineCount($infh->{anchors}); @seqlist=readline($infh->{seqs}); chomp @seqlist; $maxChunkSize/=@seqlist; #if there's lots of seqs, have to shorten chunks foreach my $seq (@seqlist){ print "Loading $seq...\n"; $seqdata{$seq}=`$root/geneparse.pl -c $seq`; } my $anchor=0; print "Aligning $anchorCount anchors\n"; resetTick(); while(my $line=readline($infh->{anchors})){ my $i=0; my (%res,@chunks); #make all the chunks while($line=~m/(-?\d+)\s(-?\d+)\s([+-])\s*/g){ my ($start,$stop,$strand)=($1-$widen,$2+$widen,$3); ($start,$stop)=(abs($stop),abs($start)) if $strand eq '-'; my $dna=substr($seqdata{$seqlist[$i]},$start-1,$stop-$start+1); if($strand eq '-'){ $dna=~tr/ACGT/TGCA/; $dna=reverse $dna; } my ($chunkid,$chunkstart,$chunksize)=(0,0,min(length($dna),$maxChunkSize)); while($chunkstart',$tmpfile); my $i=0; foreach my $dna (@$chunks){ print $ofh ">$i\n",$dna,"\n"; } close $ofh; my @treeDat; if($aligner eq 'clustalw'){ my 
$res=`$aligners{$aligner} -infile=$tmpfile -tree`; $res=~m/Phylogenetic tree file created:\s+\[(.+)\]/m or die "Uh oh, aligner puked:\n$res"; my $rawtree=slurp($1); @treeDat=tokenizeTree($rawtree); }elsif($aligner eq 'muscle'){ my $cmd="$aligners{$aligner} -in $tmpfile -quiet -tree1 $tmpfile.tree1 -tree2 $tmpfile.tree2 -maxiters 2"; my $res=`$cmd`; my $useTree="$tmpfile.tree2"; $useTree="$tmpfile.tree1" unless -f $useTree; open(my $resh,$useTree) or die "No treefile? (Chunk $chunkname)\n"; @treeDat=<$resh>; chomp @treeDat; @treeDat=@treeDat[0..($#treeDat-1)]; #dont need last line } my %res; parseTree(\%res,0,@treeDat); system("rm $tmpbase*"); #cleanup return %res; } sub tokenizeTree { my $num='\d+.?\d*'; my @tokens; foreach(split(/\s/,join("\t",@_))){ while(m/\(|\)(:$num)?|,|\S+:$num|;/g){ push(@tokens,$&) } } return @tokens; } sub sum { my $sum=0; grep {$sum+=$_} @_; return $sum; } sub min { my $best=$_[0]; foreach(@_){ $best=$_ if $_<$best; } return $best; } sub parseTree { my $resr=shift; my $root=shift; return unless @_; my ($first)=(shift); if($first eq ','){ return parseTree($resr,$root,@_); }elsif($first=~m/^\(/){ #subtree my ($in,$last,$out)=findPair(@_); my $dist=$1 if $last=~m/\):(\S+)/; parseTree($resr,$root+$dist,@$in); return parseTree($resr,$root,@$out); }elsif($first=~m/(\S+):(\S+)/){ #leaf $resr->{$1}=$root+$2; return parseTree($resr,$root,@_); } } sub findPair { my (@in,$inside); while($_=shift){ if($_ eq '('){ $inside++; } elsif(m/^\)/){ $inside--; return (\@in,$_,\@_) if $inside<0; }else{ } push(@in,$_); } die "Uh oh. 
Tree is missing $inside )'s"; } sub getName { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $name } @_; return @ret if $#_; return $ret[0]; } sub getPath { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $path } @_; return @ret if $#_; return $ret[0]; } sub anchorCountSeqs { my $fh=pop; my $count=0; seek($fh,0,0); $_=<$fh>; while(m/^(-?\d+\s-?\d+\s[+-]\s*)$/){ $count++; } seek($fh,0,0); return $count; } sub anchorFormat { my $fh=pop; my $format=0; seek($fh,0,0); $_=<$fh>; $format=1 if $_=~m/^(-?\d+\s-?\d+\s[+-]\s*)+\d+$/; $format=2 if $_=~m/^(-?\d+\s-?\d+\s[+-]\s*)+\d+\s(\S+)\s(\d+:\d+,?)+$/; seek($fh,0,0); return $format; } sub lineCount { my $count=0; my $fh=pop; seek($fh,0,0); while(<$fh>){ $count++; } seek($fh,0,0); return $count; } sub slurp { local $/; open(my $fh,"@_") or return; return <$fh>; } sub resetTick { my ($total,$div)=@_; our ($anchorCount,$screenwidth); ($total,$div)=($anchorCount,$screenwidth) unless $total; our ($ticksper,$ticksleft)=(int($total/$div),int($total/$div)); print "|",(map {"-"} (3..$div)),"|\n"; } sub tick { our ($ticksper,$ticksleft); $ticksleft--; if(!$ticksleft){ print STDOUT "."; STDOUT->flush(); $ticksleft=$ticksper; } } __END__ =head1 NAME homology-score.pl - generates homology scorse from murasaki anchor files =head1 SYNOPSIS filter.pl [options] [output file] =head1 OPTIONS --aligner= => Selects the aligner program. Options are clustalw (slow but won't die on large anchors), or muscle (fast, but dies on anchors longer than 20k). Default is Muscle. murasaki/mauveAlignment2anchors.pl0000755000177700001440000001210111434752242016725 0ustar krispusers#!/usr/bin/perl -w #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. 
# #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . ################## ## convert mauve alignments to Murasaki anchor files -- krisp ################## use File::Basename; use Getopt::Long; use Pod::Usage; #use Data::Dump qw{dump}; use strict; my ($help,$man,$align_type); my $autoout=1; our $flexible=0; our $signed=1; my $useAlignment; GetOptions('help|?' => \$help, man => \$man, 'autoout!'=>\$autoout); pod2usage(1) if $help or $#ARGV<0; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my ($filename,$outfile)=@ARGV; my ($basename,$path,$suffix) = fileparse($filename); die "Input file not found: $filename" unless -e $filename; $outfile="$path/$basename.aligned.anchors" if $autoout; print "Outfile is $outfile\n" if $autoout; if($outfile){ open(OUTF,">$outfile"); } else { open(OUTF,">-"); } my %mauvedat=%{loadMauveAlignment($filename)}; foreach my $lcb (@{$mauvedat{LCBs}}){ print OUTF (join("\t",map {($_->{start},$_->{stop},($_->{start}<0 ? "-":"+"))} (@$lcb))."\n"); } if($basename){ #we can also make the seqs file open(my $seqfh,">$basename.aligned.seqs"); local $,="\n"; local $\="\n"; print $seqfh (map {$_->{seqFile}} @{$mauvedat{seqs}}); } sub loadMauveAlignment { my $alignment=shift; open(MAUVE,"<$alignment"); =~m/FormatVersion\s+(\d+)/ or die "Not a mauve file: $alignment"; my $version=$1; my @seqs=(); do {print "This program is written for Mauve Format Version 4.\n This file is version $version. 
Weird stuff may happen.\n"; $flexible=1;} if $version!=4; =~m/SequenceCount\s+(\d+)/ or die "Unknown sequence count\n"; my $seqCount=$1; while(){ next unless m/Sequence(\d+)File\s+(\S.*)/; my ($seqId,$seqFile)=($1,$2); $_=; m/Sequence${seqId}Length\s+(\d+)/ or $flexible or die "Input file is weird: $_"; my $seqLength=$1; $seqs[$seqId]={'seqId' => $seqId,'seqFile' => $seqFile,'seqLength'=>$seqLength, 'seqName' => getName($seqFile) }; last if $seqId==$seqCount-1; } my @LCBs=(); $_=; m/IntervalCount\s(\d+)/ or $flexible or die "Interval Count line weird: $_"; my $LCBCount=$1; while(){ chomp; if(m/^Interval\s(\d+)/){ my $LCBId=$1; while($_ ne '') { $_=; chomp; last if $_ eq ''; if(m/^GappedAlignment/){ $_=; my ($length,@start)=split(/\s+/); my @alignments=map {local $_=;chomp;$_} 0..$#start; my @subi=map {0} @start; my @substart=map {0} @subi; my $in; my $inre=qr/[^\-]/; local $|=1; while(max(@subi)<$length){ my @c=map {substr($alignments[$_],$subi[$_],1)} 0..$#subi; if(!defined $in){ $in=(grep($inre,@c)==scalar(@c)); } if(grep($inre,@c)==scalar(@c)){ #all in? if($in){ #nothing to do really... }else{ @substart=@subi; #starting new region $in=1; } foreach my $si (@subi){$si++} #advance them all! }else{ #someone's out if($in){ #region ended, add anchor my @lstart=map {$start[$_]+$substart[$_]} 0..$#start; my @stop=map {$start[$_]+$subi[$_]} 0..$#start; my @LCB=map { { start => $lstart[$_], stop => $stop[$_], LCBId => $LCBId }} 0..$#stop; $in=0; push(@LCBs,\@LCB); foreach my $si (@subi){$si++} #advance them all! 
}else{ #in a gap advance iterators currently pointed at gaps foreach my $i (0..$#subi){ $subi[$i]++ if !(substr($alignments[$i],$subi[$i],1)=~m/$inre/); } } } } }else{ #simple mum region (perfect match) my ($length,@start)=split(/\s+/); next if (grep {$_==0} @start)>0; my @stop=map {$_+$length} @start; my @LCB=map { { start => $start[$_], stop => $stop[$_], LCBId => $LCBId }} 0..$#stop; push(@LCBs,\@LCB); } } } } return {'seqs' => \@seqs, 'LCBs' => \@LCBs}; } sub max { my ($r,@l)=@_; foreach my $v (@l){ $r=$v if $v>$r; } return $r; } sub getName { my @ret=map { my ($name,$path,$suffix) = fileparse($_, qr{\.[^.]*}); $name } @_; return @ret if $#_; return $ret[0]; } __END__ =head1 NAME mauveAlignment2anchors - converts a Mauve alignment into Murasaki anchor format based on gapped alignment data (each gap triggers a new anchor). Note this refers to the Mauve alignment format, not the LAGAN .alignment file that Mauve also outputs. =head1 SYNOPSIS mauveAlignment2anchors [output] =head1 OPTIONS --autoout -- create output file name automatically (on by default) --noautoout -- disable the above "autoout" option murasaki/anchors2osfinder.pl0000755000177700001440000001232611434752241015572 0ustar krispusers#!/usr/bin/perl -w #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . 
use strict; use Getopt::Long; use Pod::Usage; use File::Basename; #use Data::Dump qw {dump}; use strict; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; use Murasaki::Ticker qw{resetTick tick}; our $root; my ($man,$help,$makecds); my (@seqnames,$output_opt); my $opterr= GetOptions('help|?' => \$help, man => \$man, 'output=s'=>\$output_opt); pod2usage(1) if $help; pod2usage(-exitstatus => 0, -verbose => 2) if $man; pod2usage({-verbose=>1,-exitval=>2,-message=>'Need some input file...'}) if $#ARGV<0; my $round; foreach my $inf (@ARGV){ my ($iprefix,$dir,$ext)=fileparse($inf,qr{\.anchors}); die "Need a .anchors file." unless $ext eq ".anchors"; my $seqfile="$dir/$iprefix.seqs"; die "Need $seqfile" unless -f $seqfile; open(my $infh,$inf) or die "Couldn't open $inf"; my @seqs=map { {file=>$_} } (split(/\n/,slurp($seqfile))); foreach my $seq (@seqs){ print "Loading regions for $seq->{file}\n"; $seq->{regions}=[getRegions($seq->{file})]; } my $outf=($output_opt and $round) ? 
"$output_opt.$round":$output_opt; $outf="$inf.osfinder-in" unless $outf; print STDERR "Warning: Overwriting $outf\n" if -f $outf; my ($lines)=`wc -l $inf`=~m/(\d+)/ if -f $inf; resetTick($lines) if $lines; open(my $ofh,">$outf") or die "Can't write $outf"; while(my $line=<$infh>){ my @anchors=getAnchors($line); die "Illegal number of anchors (".scalar(@anchors).")" unless scalar(@anchors)==scalar(@seqs); print $ofh join("\t", map {fixInRegion($anchors[$_],$seqs[$_]->{regions})} 0..$#anchors)."\n"; tick() if $lines; } print STDERR "\nDone with $inf\n"; } sub mean { my $s; foreach my $e (@_){ $s+=$e; } return $s/scalar(@_); } sub cmpRegion { my ($needle,$region)=@_; return 0 if ($needle->{stop}<=$region->{stop} and $needle->{start}>=$region->{start}); return $needle->{start}<=>$region->{start}; #this only works non-overlapping regions, which these regions are } sub shortAnchor { my ($anchor)=@_; return $anchor->{start}."~".$anchor->{stop}; } sub binaryRegionSearch { my ($needle,$haystack)=@_; my ($min,$max)=(0,$#{$haystack}); # print "Looking for ".shortAnchor($needle)." out of $max regions\n"; do { my $p=int(mean($min,$max)); # print "min: $min (".shortAnchor($$haystack[$min]).") p: $p(".shortAnchor($$haystack[$p]).") max: $max(".shortAnchor($$haystack[$max]).")"; my $cmp=cmpRegion($needle,$$haystack[$p]); # print " cmp: $cmp\n"; if($cmp==0){ return ($p,$$haystack[$p]); }elsif($cmp<0){ #in left half $max=$p-1; }else{ $min=$p+1; } }while($min<=$max); return undef; } sub fixInRegion { my ($anchor,$regions)=@_; return (1,$anchor->{start},$anchor->{stop}) unless $regions; die "programming error ftl!" unless ref($regions) eq 'ARRAY'; my ($i,$region)=binaryRegionSearch($anchor,$regions); die "Uh oh. No matching region found for anchor: ".anchorToString($anchor) unless $region; if($anchor->{start}>=$region->{start} and $anchor->{stop}<=$region->{stop}){ return ($i+1,$anchor->{start}-$region->{start}+1,$anchor->{stop}-$region->{start}+1,$anchor->{sign}); }else{ die "Uh oh. 
my binary search is very broken"; } } sub anchorToString { my ($anchor)=@_; return join("\t",$anchor->{start},$anchor->{stop},$anchor->{sign}); } sub getAnchors { my ($line)=@_; chomp $line; my @d=split(/\t/,$line); my @anchors; while(@d){ my ($start,$stop,$sign)=map {shift(@d)} (1..3); ($start,$stop)=(-$start,-$stop) if($start<0); ($start,$stop)=($stop,$start) if $stop<$start; #be flexible push(@anchors,{start=>$start,stop=>$stop,sign=>$sign}); } return @anchors } sub getRegions { my ($file)=@_; die "No file specified?" unless $file; my @res; if($file=~m/\.stitch$/){ open(my $fh,$file) or die "Couldn't open stitch sub-file $file"; while(<$fh>){ my @dat=split(/\t/); push(@res,{file=>$dat[0],length=>$dat[1],start=>$dat[2],stop=>$dat[3]}); } }else{ unless(-f "$file.len"){ system("$root/getsegments $file"); } open(my $fh,"$file.len") or die "Couldn't open segment data for $file"; my $length=<$fh>; #first line is length only my $at=1; while(<$fh>){ my ($length,$meta)=m/^(\d+)\t(.*)/; push(@res,{length=>$length,start=>$at,stop=>$at+$length}); $at+=10+$length; } } print "Loaded ".scalar(@res).(scalar(@res)==1 ? " region":" regions")." from $file\n"; return @res; } sub slurp { local $/; open(my $fh,"@_") or return; return <$fh>; } murasaki/src/0000755000177700001440000000000011434752243012546 5ustar krispusersmurasaki/src/align-and.cc0000644000177700001440000001234311434752234014712 0ustar krispusers//compare 2 alignments /* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ #include #include #include #include #include #include #include #include #include #include "dinkymath.h" #include "itree.h" #include "alignments.h" #include using namespace std; namespace fs = boost::filesystem; string program_help(); string program_version(); //function decl double compareAlignment(Alignment &ref,Alignment &test); void printResults(vector > &results); //globals ProgressTicker ticker(100); bool debug=false; int main(int argc,char** argv){ int optc; uint growAnchors=0; ostream *os=&cout; string outfile; using boost::lexical_cast; using boost::bad_lexical_cast; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"help",0,0,'?'}, {"version",0,0,'v'}, {"grow",1,0,'g'}, {"output",1,0,'o'}, {"debug",0,0,'d'}, {0,0,0,0} }; int longindex=0; optc=getopt_long(argc,argv,"?vg:o:",long_options,&longindex); if(optc==-1)break; switch(optc){ case 'v': cout << program_version();exit(-1);break; case '?': cout << program_help();exit(-1);break; case 'g': if(optarg){ string optstr(optarg); if(optstr==string("0") || optstr==string("false")){ growAnchors=0; break; } try{ growAnchors=lexical_cast(optstr); } catch(bad_lexical_cast& e){ cerr << "Bad argument to --grow"< files; for(int i=optind;ianchors.size()<<" anchors with existing " << ref->anchors.size() <<" anchors."<::iterator ai=loading->anchors.begin();ai!=loading->anchors.end();++ai){ //foreach anchor in ref that overlaps ai set overlaps; ref->findOverlaps(*ai,overlaps); if(overlaps.empty()) continue; //no overlaps //generate new anchors from each overlapping region for(set::iterator oi=overlaps.begin();oi!=overlaps.end();++oi){ Anchor* o=*oi; vector parts; if(debug)cout << "Merging "<< *ai << " with "<< *o<parts.size();s++){ region r=o->parts[s].key(); region rref=ai->parts[s].key(); if(debug)cout 
<< s << " "<< r << " && " << rref << " = "; if(isZero(r)!=isZero(rref) || (!isZero(r) && !regionsOverlap(r,rref))){ if(debug)cout << "incompatible"<mergeAdd(parts); AnchorEnd: ; } } delete ref; delete loading; }else{ cerr << "Initial set: "<anchors.size()<<" anchors."<anchors.size()<<" anchors."<flush(); delete os; // i dont like this, but it'll do... } delete result; return 0; } string program_help(){ return string("\ Usage: align-or [options] alignment1 [alignment2 ... ]\n\ \n\ Options\n\ *Takes an argument (like --maxres 3 or -m3)\n\ --grow|v = grow all anchors by some amount\n\ --output|o = store output to a separate file (otherwise, it's stdout)\n\ \n\ *Toggles: (just --merge or -b)\n\ --help|h = this message\n\ --version|v = version string\n\ "); } string program_version(){ return string("align-and v0.11"); } murasaki/src/align-coverage.cc0000644000177700001440000001340211434752234015740 0ustar krispusers//compare 2 alignments /* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ #include #include #include #include #include #include #include #include #include #include #include "dinkymath.h" #include "itree.h" #include "alignments.h" #include "seqread.h" #include using namespace std; namespace fs = boost::filesystem; string program_help(); string program_version(); //function decl string naPercent(double x,double y); void printHeader(ostream& os); void printResults(ostream& os,string filename,long detLength,long seqLength,long coverLength); //globals ProgressTicker ticker(100); bool debug=false; bool countNCovers=false; int main(int argc,char** argv){ int optc; uint growAnchors=0; ostream *os=&cout; string outfile; bool noHeader=false; using boost::lexical_cast; using boost::bad_lexical_cast; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"help",0,0,'?'}, {"version",0,0,'v'}, {"grow",1,0,'g'}, {"output",1,0,'o'}, {"debug",0,0,'d'}, {"ncover",0,0,'n'}, {"noheader",0,0,'H'}, {0,0,0,0} }; int longindex=0; optc=getopt_long(argc,argv,"?vg:o:",long_options,&longindex); if(optc==-1)break; switch(optc){ case 'v': cout << program_version();exit(-1);break; case '?': cout << program_help();exit(-1);break; case 'g': if(optarg){ string optstr(optarg); if(optstr==string("0") || optstr==string("false")){ growAnchors=0; break; } try{ growAnchors=lexical_cast(optstr); } catch(bad_lexical_cast& e){ cerr << "Bad argument to --grow"< files; for(int i=optind;i covers(seqCount); for(uint i=0;i parts;reader.readline(parts);parts.clear()){ anchors++; for(uint si=0;sici.key().stop) ++ci; if(ci!=covers[si].end()) if(idx>=ci.key().start && idx<=ci.key().stop && (countNCovers || determined)){ assert(c!='n' && c!='N'); coverLength++; } } if(debug)cerr << "Finished reading "< (2006) This file is part of Murasaki. 
Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ////////////// // murasaki project // scoring.cc // provides scoring functionality for working on sequences ////////////// #include "murasaki.h" #include "scoring.h" #include "sequence.h" #include "dinkymath.h" #include "murasaki_mpi.h" #include "options.h" #include #include #include #include #include #include #include bool scoreMatrixInited=false; double worstCaseEntropy=-log(1.0/(double)BASES)/M_LN2; Score ScoreMatrix[5][5]={ //borrowed from blastz, augmented with (arbitrary) match scores for matching to Ns { 91, -114, -31, -123 , 0 }, { -114, 100, -125, -31 , 0 }, { -31, -125, 100, -114 , 0 }, { -123, -31, -114, 91 , 0 }, { 0, 0, 0, 0 , 0 } //this N behaviour should perform somehwat similar to the entropy scoring method }; Score scoreColumn(const vector &column){ //sum of pairs score Score totalScore=0; if(column.size()<5){ //this way's plenty fast for small numbers of sequences for(vector::const_iterator i=column.begin();i!=column.end();++i) for(vector::const_iterator j=column.begin();i!=j;++j) totalScore+=ScoreMatrix[**i][**j]; }else{ //this is linear time in number of sequences (as opposed to n^2 as above) unsigned int baseCount[BASES]; memset(baseCount,0,sizeof(unsigned int)*BASES); for(vector::const_iterator i=column.begin();i!=column.end();++i) baseCount[**i]++; for(BASE_TYPE i=0;i &column,vector > &output){ //full set of pairs //there's no escaping s^2 in this case, so just go for it. 
output.resize(column.size(),vector(column.size(),0)); //in case any are uninited... for(size_t i=0;i > &pairs){ assert(!pairs.empty()); assert(!pairs.front().empty()); Score x=MAXSCORE; for(size_t i=0;i column,SeqPos max,int direction,long* scoreOut){ Score score=0; Score maxScore=0,maxScoreOffset=0,loss=0,scoreDelta; vector > pairScores; SeqPos offset=1; for(;loss0) //go-go branch prediction! ++column[i]; else --column[i]; } if(opt.scoreByMinimumPair){ addColumnPairScores(column,pairScores); Score prev=score; score=minimumPairScore(pairScores); //because pairScores is actually maintaining additive _row pair scores_ in this case, here score simply the minimum additive score scoreDelta=score-prev; }else{ scoreDelta=scoreColumn(column); score+=scoreDelta; } if(score>maxScore){ maxScore=score; maxScoreOffset=offset; } loss-=scoreDelta; if(loss<0) loss=0; } *scoreOut=maxScore; return maxScoreOffset; } murasaki/src/config.pl.example0000644000177700001440000000017011434752235016001 0ustar krispusers#!/usr/bin/perl #config file for the murasaki perl scripts our $root='/home/krisp/murasaki'; our $seqhome="$root/seq"; murasaki/src/hashing.h0000644000177700001440000000441611434752234014345 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ #ifndef _HASHING_H_ #define _HASHING_H_ #include "globaltypes.h" #include "dinkymath.h" #include #include #include template class SBox { public: static const size_t SBOX_BOXBYTES=(sizeof(T)); static const size_t SBOX_BOXSIZE=(((size_t)1)<<(SBOX_BOXBYTES*8)); static const size_t SBOX_BOXBITS=(SBOX_BOXBYTES*8); // typedef T (*BoxType_p)[SBOX_BOXSIZE]; //craziest syntax evar, eh? T typedef BoxType[SBOX_BOXSIZE]; protected: int boxCount; BoxType *boxes; // T (*boxes)[SBOX_BOXSIZE]; public: SBox(int hashbits) : boxCount((hashbits+SBOX_BOXBITS-1)/SBOX_BOXBITS), //number of bytes rounded up boxes(new T[boxCount][SBOX_BOXSIZE]) { using namespace std; assert(boxes); for(int b=boxCount-1;b>=0;b--){ size_t boxbits=((b+1)*SBOX_BOXBITS>(size_t)hashbits) ? (hashbits%SBOX_BOXBITS):SBOX_BOXBITS; size_t boxsize=(boxbits==SBOX_BOXBITS) ? SBOX_BOXSIZE:(1<> shift )]) << shift; } return out; } }; #endif murasaki/src/cgr-random.cc0000644000177700001440000001375011434752234015114 0ustar krispusers/* Murasaki - multiple genome global alignment program Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ ////////////////// // generate a random sequence with markov probabilities derived from // a CGR sequence ///////////////// #include #include #include #include #include #include #include #include #include //for uint defs #include #include #include #include "dinkymath.h" #include "cgr.h" using namespace std; using boost::lexical_cast; using boost::bad_lexical_cast; boost::mt19937 randgen; //mersenne twister. long cycles, fast = wee string program_help(); string program_version(); void genLots(size_t targetLength,vector &mip,ostream *outs=&cout); void genMipmap(Cgr &src,vector &mip); typedef uint64_t superpix; //should hold 4 pix bool outRaw=false; ostream* msg=&cout; int main(int argc,char **argv){ int optc; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"help",0,0,'?'}, {"version",0,0,'v'}, {"raw",0,0,'r'}, //dont faformat {"quiet",0,0,'q'}, {0,0,0,0} }; int longindex=0; optc=getopt_long(argc,argv,"?v",long_options,&longindex); if(optc==-1)break; switch(optc){ case 'v': cout << program_version();exit(-1);break; case 'r': outRaw=true;break; case 'q': msg=new ostringstream();break; default: cout << program_help();exit(1);break; } } vector args; for(int seqi=optind;seqi mip(src.iter); *msg << "Generating mipmap..."<(args[1]); *msg << "Making a "<2){ *msg << "Writing output to: "< argv; argv.push_back(string("--name=random-")+args[0]); argv.push_back("-"); argv.push_back(args[2]); outp=new redi::opstream("./faformat.pl",argv); } } genLots(targetLength,mip,outp); if(outp==&cout) cout << endl; else { *msg << "Done!\n"; delete outp; //needs to flush/close for files } } void genMipmap(Cgr &src,vector &mip) { int i=src.iter-1; mip.resize(i+1); mip[i--]=&src; Cgr *prev=&src; for(;i>=0;i--){ Cgr *subCgr=new Cgr(*prev,true); pair lowhigh(subCgr->findNiceRange()); assert(cout << "Low: " << lowhigh.first << " High: "<< lowhigh.second << endl); mip[i]=subCgr; prev=subCgr; } #ifndef NDEBUG for(unsigned i=0;i 
&mip,ostream *outs){ string context; vector opts(4); opts[0]=string("C"); opts[1]=string("G"); opts[2]=string("A"); opts[3]=string("T"); for(size_t done=0;doneiter==(int)context.length()+1); prob[i]=(*sampler)[context+opts[i]]; } assert(cout <<"Probabilities: "); #ifndef NDEBUG copy(prob,prob+4,ostream_iterator(cout," ")); #endif assert(cout << endl); boost::uniform_int range(0,prob[0]+prob[1]+prob[2]+prob[3]); boost::variate_generator > chooser(randgen,range); superpix choice=chooser(); //1 random int from 0 to that sum assert(cout << "Rand is "<=0); assert(cout << "new char: "<=mip.size()){ context.erase(0,1); //chop one off the front } assert(!(context.length()>=mip.size())); //dont have to loop (by IH) } } string program_help(){ return string("\ Usage: cgr-random [options] [output]\n\ \n\ Generates a string random DNA sequence based on the markov\n\ model described by the inputted CGR.\n\ \n\ Options\n\ *Takes an option (like --maxres 3 or -m3)\n\ \n\ *Toggles: (just --merge or -b)\n\ "); } string program_version(){ return string("0.1"); } murasaki/src/align-or.cc0000644000177700001440000001004111434752234014561 0ustar krispusers//compare 2 alignments /* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ #include #include #include #include #include #include #include #include #include #include "dinkymath.h" #include "itree.h" #include "alignments.h" #include using namespace std; namespace fs = boost::filesystem; string program_help(); string program_version(); //function decl double compareAlignment(Alignment &ref,Alignment &test); void printResults(vector > &results); //globals ProgressTicker ticker(100); int main(int argc,char** argv){ int optc; uint growAnchors=0; ostream *os=&cout; string outfile; using boost::lexical_cast; using boost::bad_lexical_cast; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"help",0,0,'?'}, {"version",0,0,'v'}, {"grow",1,0,'g'}, {"output",1,0,'o'}, {0,0,0,0} }; int longindex=0; optc=getopt_long(argc,argv,"?vg:o:",long_options,&longindex); if(optc==-1)break; switch(optc){ case 'v': cout << program_version();exit(-1);break; case '?': cout << program_help();exit(-1);break; case 'g': if(optarg){ string optstr(optarg); if(optstr==string("0") || optstr==string("false")){ growAnchors=0; break; } try{ growAnchors=lexical_cast(optstr); } catch(bad_lexical_cast& e){ cerr << "Bad argument to --grow"< files; for(int i=optind;ianchors.size()<<" anchors with existing " << result->anchors.size() <<" anchors."<::iterator ai=loading->anchors.begin();ai!=loading->anchors.end();++ai){ addCount++; result->mergeAdd(*ai); } delete loading; }else{ cerr << "Initial set: "<anchors.size()<<" anchors."<anchors.size()<<" anchors."<flush(); delete os; // i dont like this, but it'll do... } delete result; return 0; } string program_help(){ return string("\ Usage: align-or [options] alignment1 [alignment2 ... 
]\n\ \n\ Options\n\ *Takes an argument (like --maxres 3 or -m3)\n\ --grow|v = grow all anchors by some amount\n\ --output|o = store output to a separate file (otherwise, it's stdout)\n\ \n\ *Toggles: (just --merge or -b)\n\ --help|h = this message\n\ --version|v = version string\n\ "); } string program_version(){ return string("align-compare v0.1"); } murasaki/src/arrayhash.cc0000644000177700001440000001136611434752234015046 0ustar krispusers/* Murasaki - multiple genome global alignment program Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ ////////////// // murasaki project // arrayhash.cc // provides arrayhash class ////////////// #include "murasaki.h" #include "sequence.h" #include "dinkymath.h" #include "cmultiset.h" #include "arrayhash.h" bool ArrayHash::emptyAt(const HashKey key){ return fasta[key] == NULL; } word ArrayHash::sizeAt(const HashKey key){ if(fasta[key] == NULL) return 0; else return fasta[key]->size(); } word ArrayHash::sizeAt(const HashKey key,const HashVal &val){ pair range=fasta[key]->equal_range(val); return range.second-range.first; } void ArrayHash::add(const HashKey key, const HashVal &a){ if(fasta[key]==NULL) fasta[key] = new cmultiset; fasta[key]->insert(a); } ArrayHash::ArrayHash(BitSequence *pat) : Hash(pat), //holy crap C++ is cool activeRange(0,0) { fasta=new MHash[hash_size]; memset(fasta,0,sizeof(MHash)*hash_size); } void ArrayHash::clear(){ /* // this is too slow! for(int i=0; iclear(); } void ArrayHash::pickup(LocList &locList, pair range){ for(HashIte i=range.first; i!=range.second; i++) locList[i->seqId()].push_back(*i); } void ArrayHash::getMatchingSets(HashKey key,list &sets){ if(!fasta[key]) return; fasta[key]->sort(); HashIte start(fasta[key]->begin()),stop(fasta[key]->end()); for(HashIte si=start;si!=stop;){ sets.push_back(LocList(seq_count)); HashIte ref=si; (sets.back())[si->seqId()].push_back(*si); for(++si;si!=stop && !(*ref<*si);++si){ (sets.back())[si->seqId()].push_back(*si); } } } void ArrayHash::lookup(HashKey key,LocList &locList){ if(!fasta[key]) return; fasta[key]->sort(); //the list returned by this _MUST_ be sorted. 
pair range(fasta[key]->begin(), fasta[key]->end()); pickup(locList, range); } void ArrayHash::dump(ostream &os){ HashKey key; unsigned s; for(key=0; key<(unsigned)hash_size; key++){ s = sizeAt(key); // cout << "dump: size=" << s<< endl; if( s==0 ) continue; os.write( (char*)&key, sizeof(HashKey) ); os.write( (char*)&s, sizeof(unsigned) ); pair p = fasta[key]->block_read(); HashVal* l = p.second; os.write( (char*)l, sizeof(HashVal) * s); } key = 0; s = 0; os.write( (char*)&key, sizeof(HashKey) ); os.write( (char*)&s, sizeof(unsigned) ); } void ArrayHash::load(istream &is){ HashKey key; unsigned s; HashVal *buf; int bufsize; bufsize = 1024*1024; buf = (HashVal*)malloc(bufsize); do{ is.read( (char*)&key, sizeof(HashKey) ); is.read( (char*)&s, sizeof(unsigned)); if (key==0 && s==0) break; if(fasta[key]==NULL) fasta[key] = new cmultiset; int readbytes = sizeof(HashVal)*s; if(bufsize > readbytes){ bufsize = readbytes; buf = (HashVal*)realloc(buf, bufsize); } is.read( (char*)buf, sizeof(HashVal)*s ); fasta[key]->block_append(buf, (int)s); } while(true); free(buf); } bool ArrayHash::rawSanityCheck(){ for(word base=0;base<(word)hash_size;base++){ if(emptyAt(base)) continue; for(HashIte i=(*fasta[base]).begin();i!=(*fasta[base]).end();i++){ Location l(*i); Window w(l,hashpat); word here=w.hash(); // cout << "hash for "< (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ////////////////////////// // krisp - CGR image builder. 
from raw sequence ////////////////////// #include #include #include #include #include "dinkymath.h" #include "cgr-image.h" #include "cgr.h" #include "seqread.h" using namespace std; int maxiter=8,startiter=1; pix junk; //used as a throw-away pixel for N-containing chunks int repeatmask=0,interskip=0; bool domerge=false; extern unsigned long seedcount; bool binary=true; extern bool fullWidthBinary; int main(int argc,char** argv){ int optc; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"help",0,0,'?'}, {"version",0,0,'v'}, {"maxiter",1,0,'m'}, {"start",1,0,'s'}, {"merge",0,0,'b'}, {"interskip",0,0,'i'}, {"repeatmask",0,0,'r'}, {"only",1,0,'1'}, {"plain",0,0,'p'}, {"8bit",0,0,'8'}, {0,0,0,0} }; int longindex=0; optc=getopt_long(argc,argv,"m:?pvris:1:b8",long_options,&longindex); if(optc==-1)break; switch(optc){ case '1': interskip=1; if(!optarg || !sscanf(optarg,"%d",&maxiter)){ cerr << "Could not parse resolution.\n"; cerr << program_help(); exit(-1); } maxiter=max(1,maxiter); startiter=maxiter; startiter=max(1,startiter); cout << "Making only 1 CGR-resolution:"<(1,maxiter); cout << "Set maxiter to "<(1,startiter); cout << "Set maxiter to "< argv(1); if(repeatmask) argv.push_back(string("-r")); argv.push_back(string("-c")); argv.push_back(inname); cout << "Loading "<iter<=maxiter){ cgr->load(fwd.data(),fwd.length()); cgr->load(rev.data(),rev.length()); if(old){ if(domerge) cgr->merge(*old); delete old; } if(!interskip) cgr->savePGM(outname+"."+dstring(cgr->iter),binary); if(cgr->iteriter==maxiter) break; } cgr->savePGM(outname,binary); if(old)delete old; if(cgr)delete cgr; } cout << "All done!\n"; } string program_help(){ return string("\ Usage: cgr-image [options] [ ...]\n\ \n\ Options\n\ *Takes an option (like --maxres 3 or -m3)\n\ --maxres|-m = specify max number of subsamples\n\ --startiter|-s = start from iteration n\n\ --only|-1 = only do 1 iteration\n\ \n\ *Toggles: (just --merge or -b)\n\ 
--merge|-b = merge between layers\n\ --interskip|-i = skip intermediate output\n\ --repeatmask|-r = mask repeats (ie: lowercase letters)\n\ --plain|-p = save as plain PGMs (not binary)\n\ --8bit|-8 = use 8bit binary PGMs (some programs (like gimp) can't use 16bit)\n\ "); } string program_version(){ return string("0.1"); } murasaki/src/openhash.cc0000644000177700001440000001320711434752234014665 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ //////// // super-memory-miserly hash list storage (def.) // Kris Popendorf /////// #include "openhash.h" #include "ecolist.h" #include "murasaki.h" #include "exceptions.h" #include "options.h" //for performance measurement purposes... #include #include using namespace std; OpenHash::OpenHash(BitSequence *pat) : Hash(pat), keysFree(hash_size), keysFreeWarning(hash_size/2) //if we're over 50% full, we're probably in big trouble #ifdef HASHPROFILE ,perfProbeCount(0),perfFindAddrCount(0) #endif { fasta=new Ecolist[hash_size]; } OpenHash::~OpenHash(){ delete[] fasta; } void OpenHash::clear(){ for(word i=0;i &sets){ if(emptyAt(key)) return; sets.push_back(LocList(seq_count)); //we don't have any non-matching entries in each key, so, throw em all in! 
lookup(key,sets.back()); } bool OpenHash::lessthan(const val_type& a,const val_type& b){ Location la(val2loc(a)),lb(val2loc(b)); Window wina(lb,mfh->hashpat),winb(la,mfh->hashpat); return winahashpat); static bool warningIssued=false; while(1){ if(emptyAt(probe)){ keysFree--; if(keysFreehashpat); if(probeWin.equals(startWin)) goto FOUNDADDR; switch(opt.probingMethod){ case 1: //quadratic probe+=1+(probeCount*probeCount); break; default: //linear probe++; } while(probe>=hash_size) probe-=hash_size; probeCount++; if(probeCount>hash_size && !keysFree){ throw MurasakiException("Out of hash keys. Rerun Murasaki using either more hashbits, or a different hash table that supports chaining (eg. EcoHash)"); } } FOUNDADDR: #ifdef HASHPROFILE perfProbeHisto[probeCount]++; perfProbeCount+=probeCount; #endif return probe; } void OpenHash::writePerformanceData(string prefix){ #ifdef HASHPROFILE { cout << "Openhash: Writing probe histogram..."<::iterator i=perfProbeHisto.begin();i!=perfProbeHisto.end();++i) hfh << i->first << "\t" << i->second < disthist; for(size_t dist=1,i=first+1;i!=first;i=(i+1>=hash_size ? 0:i+1),dist++){ if(!emptyAt(i)){ disthist[dist]++; dist=0; } } for(map::iterator i=disthist.begin();i!=disthist.end();++i) hfh << i->first << "\t" << i->second < (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ ////////////// // murasaki project // globaltypes.h // provides global typing glue to keep other .h files sane ////////////// #ifndef _GLOBALTYPES_H_ #define _GLOBALTYPES_H_ #ifdef LARGESEQ_SUPPORT typedef long SeqPos; #else typedef int SeqPos; #endif //this might change on different environments...ideally would be autoconf'd typedef unsigned long word; //originally defined in sequence.h #ifdef __linux__ #define WORDSIZE __WORDSIZE #else #if (defined(__FreeBSD__) && defined(__i386__)) || (defined (__APPLE__) && (defined (__ppc__) || defined (__i386__))) || defined(__MINGW32__) #define WORDSIZE 32 #endif #if (defined (__APPLE__) && (defined (__ppc64__) || defined (__x86_64__))) || (defined(__FreeBSD__) && defined(__x86_64__)) #define WORDSIZE 64 #endif #endif class BaseIterator; class BitSequence; typedef word HashKey; class Location; class Sequence; typedef Location HashVal; typedef unsigned char SeqIdx; //from cryptohasher class CryptoHasher; #endif murasaki/src/timing.h0000644000177700001440000000472711434752234014220 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ //////////////// // timing.h // defs for timing tools //////////////// #ifndef __TIMING_H #define __TIMING_H #include #include #include #include #define MILLION_INT 1000000 //it's happened before that i miss a zero. 
that's just silly #define MILLION_DOUBLE 1000000.0 #define TIME_DOUBLE(a) ((double)(a.tv_sec)+(double)(a.tv_usec)/MILLION_DOUBLE) extern bool timing_useHumanTime; using namespace std; class Timer { public: struct timeval tv; Timer(); Timer(int sec,int usec); Timer(double t); void reset(); //reset timer to now double asDouble() const; friend Timer operator+ (const Timer &a,const Timer &b); Timer& operator+=(const Timer &a); friend Timer operator- (const Timer &a,const Timer &b); Timer& operator-= (const Timer &b); friend double diff(const Timer &a,const Timer &b); friend Timer operator*(const Timer &a,double factor); friend Timer operator*(double factor,const Timer &a); friend Timer operator/(const Timer &a,double factor); friend Timer operator/(double factor,const Timer &a); friend ostream& operator <<(ostream &os,const Timer &obj); }; string elapsed(Timer a,Timer b); double diff(const Timer &a,const Timer &b); string humanTime(double dur); class Stopwatch { public: Timer accum; Timer runStart; bool running; Stopwatch() : accum(0,0),runStart(0,0),running(false) {} inline void start(){assert(!running);runStart.reset();running=true;} inline void stop(){assert(running);accum+=(Timer()-runStart);running=false;} inline string asString() const {return humanTime(TIME_DOUBLE(accum.tv));} inline double asDouble() const {return TIME_DOUBLE(accum.tv);} friend ostream& operator <<(ostream &os,const Stopwatch &obj){return os << obj.asString();} }; #endif murasaki/src/msethash.h0000644000177700001440000000274211434752234014540 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ #include "sequence.h" class MSetHash : public Hash { public: //my own lovely bunch of coconuts typedef multiset* MHash; //multihash typedef multiset::iterator HashIte; void clear(); void add(const HashKey key,const HashVal &val); void getMatchingSets(HashKey key,list &sets); void lookup(HashKey key,LocList &locList); bool next(HashVal *v); bool emptyAt(const HashKey key); word sizeAt(const HashKey key); word sizeAt(const HashKey key,const HashVal &val); MSetHash(BitSequence *pat); static const word linear_cost(word c) { return sizeof(Location)*c; } static const word bucket_prep_cost(word c) { return sizeof(multiset)*c; } protected: MHash *fasta; pair activeRange; }; murasaki/src/options.h0000644000177700001440000000727411434752233014423 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ ///////////////////// // murasaki config options manager //////////////////// #ifndef __OPTIONS_H #define __OPTIONS_H #include "sequence.h" #include "scoring.h" #include #include #include using namespace std; enum HashType {ArrayHash_t, MSetHash_t, EcoHash_t,OpenHash_t,Last_t}; enum OptSrcType {OPT_SRC_DEFAULT=0,OPT_SRC_AUTO=1,OPT_SRC_MANUAL=2}; extern int mpi_id; class Options { public: //config options word verbose; int quick_hash,randomHashInputs; word hitfilter; int histogram; bool user_seed; int rand_seed; bool repeatMask; bool skip1to1,skipFwd,skipRev; bool hashOnly; bool hashCache; bool bitscore; int probingMethod; HashType hashPref; string alignment_name; string output_dir; string config_file; string cache_dir; word seedfilter,hashfilter,mergeFilter; Score scoreFilter; bool joinNeighbors; float joinSpec; int joinDist; int hashSkip; bool auto_hashbits; bool dumpRegions,leaveRecords; unsigned long targetMemory; bool userSetMemory; bool retainMembers; bool useHumanTime; bool repeatMap; word anchorProgressCheck; bool tfidf,reverseOtf,inplaceDf; int rifts,set_islands; bool fuzzyExtend,gappedAnchors; Score fuzzyExtendLossLimit; bool scoreByMinimumPair; bool measureHashCollisions; bool ecolistFrugal; bool hasherFairEntropy,hasherCorrelationAdjust; word hasherTargetGACycles; double hasherEntropyAgro; bool useSeqBinary; int mpi_hashers; bool mpi_noCake; bool mpi_fileDistro; bool mpi_anchorInSpareTime; word mpi_maxBuffers; bool mpi_bigFirst,mpi_hostbalance,mpi_memoryBalance,mpi_distMerge,mpi_distCollect; bool mpi_outputRedirect,mpi_keepstdoe; bool use_shm_mmap,mmap_writePerHost; //mmap has a potential performance hit because it requires sequence distribution via nfs/disk, //so we leave it as an option. sysv requires no disk backing, so if we have it, always use it. 
bool use_shm_sysv; //computed dynamically in solidfy() string prefix; string hashStatus_record; string hashHisto_record; string hashDetailed_record; string seq_record; string anchor_record; string repeat_record; string pat_record; string region_record; string status_record; string options_record; string anchorProgress_record; int seqreadOptions; //functions Options(); void preInit(); void loadConfig(string config); void solidify(); void commandline_opts(int argc,char **argv); void environment_opts(string prefix); string hashPref_s() const; friend ostream& operator<<(ostream& os,const Options &a); int getOptSource(string name); protected: void staticInit(); static bool staticInited; map optionSource; static map optionSwitch; static map optionSwitchToInt; static struct option long_options[]; }; string program_help(bool longhelp=false); void parseYesNo(const char* str,bool& toggle,const char *name); template void parseLexical(const char* str,T& ref,const char *name); #endif murasaki/src/alignments.h0000644000177700001440000001640711434752235015071 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ //Kris Popendorf (2007) //post-processing alignment management stuff #ifndef __ALIGNMENTS_H_ #define __ALIGNMENTS_H_ #include #include #include #include #include #include #include #include "itree.h" #include "dinkymath.h" #if defined(__FreeBSD__) typedef u_long ulong; //silly bsd'rs #endif #if defined(__APPLE__) typedef unsigned long ulong; typedef unsigned int uint; #endif using namespace std; // >_<;;; call me lazy... class Anchor; class Alignment; class RegionSet; typedef Itree::Interval region; typedef Itree::itree usedtree; extern uint seqCount; extern vector seqs; extern bool seqinit; inline bool isZero(const region& r){return r==region(0,0);} inline bool isZero(const usedtree::iterator& r){return r.key()==region(0,0);} inline bool isRev(const region& r){return r.start<0;} inline region invert(const region& r){return region(-r.stop,-r.start);} inline region& invertMe(region& r){long t=r.stop; r.stop=-r.start; r.start=-t; return r;} inline region abs(const region& r){if(isRev(r))return invert(r);else return r;} inline long length(const region& r){return r.stop-r.start+1;} bool regionsOverlap(const region &a,const region &b); bool canonicalize(vector &vr); ostream& operator<<(ostream& os,Anchor& a); ostream& operator<<(ostream& os,const RegionSet& a); ostream& operator<<(ostream& os,Alignment& a); class RegionSet { public: vector parts; RegionSet(const Anchor&); double avgLength() const; long riftCount() const; inline long islandCount() const {return parts.size()-riftCount();} bool isGapped() const; RegionSet getIslands() const; RegionSet setAbs() const; protected: RegionSet(); }; class Anchor { static int nextId; public: vector parts; list::iterator backref; int id; inline Anchor() : parts(seqCount),id(nextId++) { assert(parts.size()); } bool operator==(const Anchor& a); bool overlaps(const vector ®ions) const; }; inline region grow(const region& r,long amount){ if(r.start>0) return region(max(1,r.start-amount),r.stop+amount); else return 
region(r.start-amount,min(-1,r.stop+amount)); } inline region growCopy(const region& a,ulong amount){ region r(a); if(r.start>0) return region(max(1,r.start-amount),r.stop+amount); else return region(r.start-amount,min(-1,r.stop+amount)); return r; } inline void coalesce(region &a,const region &b){ if(isZero(a) || isZero(b)) return; assert((a.start<0) == (b.start<0)); a.start=min(a.start,b.start); a.stop=max(a.stop,b.stop); } class Alignment { public: list anchors; vector trees; int id; static int nextId; inline Alignment(): trees(seqCount),id(nextId++) { assert(trees.size()); } inline bool fetchAnchorIds(int seq,const region &a,set &out){ bool res=false; for(usedtree::range_iterator i(trees[seq].in_range(a));i!=trees[seq].end();++i){ Anchor* anc=*i; out.insert(anc->id); res=true; } return res; } inline bool fetchAnchorIds(int seq,const region &a,set &out,ulong growAnchors){ bool res=false; for(usedtree::range_iterator i(trees[seq].in_range(grow(a,growAnchors)));i!=trees[seq].end();++i){ Anchor* anc=*i; out.insert(anc->id); res=true; } return res; } inline bool contains(const Anchor& a){ set matches,temp,context; if(!fetchAnchorIds(0,a.parts[0].key(),context)) return false; for(uint i=1;i matches,temp,context; if(!fetchAnchorIds(0,growCopy(a.parts[0].key(),extend),context)) return false; for(uint i=1;i &parts){ anchors.push_back(Anchor()); Anchor& a=anchors.back(); a.backref=anchors.end(); --a.backref; assert(a==*(a.backref)); for(uint seqi=0;seqi &parts); void mergeAdd(const Anchor&); void remove(Anchor*); inline bool fetchAnchors(int seq,const region &a,set &out){ bool res=false; for(usedtree::range_iterator i(trees[seq].in_range(a));i!=trees[seq].end();++i){ Anchor* anc=*i; out.insert(anc); res=true; } return res; } bool findOverlaps(const Anchor& a,set &context); bool findOverlaps(const vector parts,set &context); void boolean_or(Alignment& a,Alignment& b); }; class SequenceCover { public: typedef Itree::itree TreeType; SequenceCover(); TreeType::iterator 
merge(region r); long totalLength(); inline TreeType::iterator begin(){return cover.begin();} inline TreeType::iterator end(){return cover.end();} inline void erase(TreeType::iterator &i){cover.erase(i);} TreeType cover; }; class AnchorFileReader { public: string filename; ulong growAnchors; protected: int linenum; ifstream inf; public: AnchorFileReader(string _filename,ulong _growAnchors); bool readline(vector &out); }; //funk decls! string getAnchorPrefix(string str); void checkSeqs(const char*); void writeSeqs(const char*); void loadAnchors(const char* filename,Alignment& tree,ulong growAnchors=0); #endif murasaki/src/cgr-image.h0000644000177700001440000000147011434752233014553 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ #ifndef CGR_IMAGE_H #define CGR_IMAGE_H string program_help(); string program_version(); #endif murasaki/src/hginfo.pl0000755000177700001440000000120711434752234014360 0ustar krispusers#!/usr/bin/perl use strict; my (@hg)=`env LANG=C hg parents 2>/dev/null`; exit 1 if $?; #some hg error? don't return anything. my %hg=kvsplit(@hg); my $date=parseDate($hg{date}); my @hgstatus=`hg status`; my @modified=grep {/^M/ and !/Makefile$/ } @hgstatus; my $modified="*".join("~",map {m!([^/ \t]+?)\s*$!; "$1"} @modified); print join(":","r$hg{'changeset'}",$date,(@modified ? 
($modified):())); sub kvsplit { my %r; foreach my $l (@_){ chomp $l; next unless $l=~m/^([^:]+):\s+(\S+.*)/; $r{$1}=$2; } return %r; } sub parseDate { my ($wday,$mon,$day,$time,$year,$tz)=split(/\s+/,$_[0]); return "$year-$mon-$day"; } murasaki/src/cgr.h0000644000177700001440000000566011434752234013501 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ #ifndef CGR_H #define CGR_H //typedef unsigned gray; //the only useful thign to come out of pam.h //and it turns out it's wrong.... 
=( #include //for uint16_t typedef uint16_t gray; #define PGM_OVERALLMAXVAL 65535 #include #include #include using namespace std; typedef long unsigned pix; class PgmBuffer; class Cgr { public: const int iter,rez; Cgr(); Cgr(int res); Cgr(const Cgr&); Cgr(const Cgr&,bool); Cgr(const PgmBuffer&); ~Cgr(); bool savePGM(string file,bool binary=true); bool savePNG(string file); pair findNiceRange(); pair sampleHighLow(int low,int high); pair sampleHighLow(double lowp,double highp); pix* operator[](int); //row access pix& operator[](const char*); //point access pix& operator[](const string&); //point access pix get(unsigned i); void load(const char* str,size_t size); void normalize(); void merge(Cgr&); void findEdges(const pair &p,vector *low,vector *high); //output friend ostream& operator<<(ostream&,const Cgr&); friend class PgmBuffer; protected: void halve(const Cgr&); pix *data; bool normalized; }; class PgmBuffer { public: PgmBuffer(); int loadCgr(Cgr&); int savePGM(string file,bool binary=true); int loadPGM(string file); int loadBinaryPGM(string file); ~PgmBuffer(); friend class Cgr; gray average(); //junk function double difference(vector others); bool allSameRez(vector others); void invert(); protected: vector data; int rez; gray maxval; }; class PGMException { public: PGMException(const string& reason): reason_(reason) {} string reason() const { return reason_; } private: string reason_; }; class PGMFileException { public: PGMFileException( const string& reason ) : reason_(reason) {} PGMFileException( const string& reason, const string& filename, unsigned int line ); string reason() const { return reason_; } private: string reason_; }; string itodna(unsigned long idx,const int &iter,const int &rez); unsigned long dnatoi(const char *dna,const int &iter,const int &rez); void invert(string& str); #endif murasaki/src/seqread.h0000644000177700001440000001041711434752235014347 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This 
file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ////////////// // murasaki project // seqread.h // c++ support for reading various sequence formats ////////////// #ifndef __SEQREAD_H_ #define __SEQREAD_H_ #include #include #include #include #include #include #include #include #include using namespace std; namespace io = boost::iostreams; class SeqReadException { public: SeqReadException(const string& reason): reason_(reason) {} string reason() const { return reason_; } private: string reason_; }; enum SeqReadOptions {SEQO_RMASK=1,SEQO_LEN=2,SEQO_NOREC=4,SEQO_SILENT=8,SEQO_VERBOSE=16}; class SequenceReaderGlobalOptions { //singleton public: static SequenceReaderGlobalOptions* options(); bool ignoreBogusChars; protected: SequenceReaderGlobalOptions(); private: static SequenceReaderGlobalOptions* _instance; }; class SequenceReader { public: size_t readSeqInto(string& dst,istream& is,const string filename = string()); size_t readSeqInto(string& dst,const string &filename); bool repeatMask,lengthOnly,recordLength,silent,verbose; inline int options(){return repeatMask | lengthOnly<<1 | (!recordLength)<<2 | silent<<3 | verbose<<4;} SequenceReader(); }; enum FileFormats {SEQFILE_RAW=0,SEQFILE_FASTA,SEQFILE_STITCH,SEQFILE_GBK,SEQFILE_END}; class SequenceFileReader; class SequenceByteReader { public: SequenceByteReader(istream &is,const string filename=string(),int options=0); char getc(); inline size_t size(){return count;} bool eof(); 
bool repeatMask,lengthOnly,recordLength,silent,verbose; inline int options(){return repeatMask | lengthOnly<<1 | (!recordLength)<<2 | silent<<3 | verbose<<4;} inline FileFormats format(){return filetype;} inline long getSubSeqId(){return subSeqId;} inline string getSubSeqName(){return subSeqName;} protected: //no-init needed const int bufsize; std::list buffers; string subSeqName; size_t linenum; //initialized data bool finished; FileFormats filetype; string filename; istream& is; SequenceFileReader *redirect; size_t count,subcount; long subSeqId; bool bogusCharWarned; void procBuf(string& dst,const char *buf,const int &bufsize); void addRedirect(boost::cmatch results); bool readMore(); bool writeBackSize(); }; class SequenceFileReader { //provides the sugar to run with filenames and subsequences protected: public: SequenceFileReader(istream &is,const string filename=string(),int options=0); SequenceFileReader(const string filename,int options=0); static bool parseRangeSpec(string &filename,pair &range,int options); bool useRangeSpec(int noOpen=0); char getc(); size_t size(); bool eof(); inline const string& get_filename(){return filename;} static size_t getLength(string filename,int options=0); size_t peekLength() const; size_t readLength(); //actually read the file and compute length inline FileFormats format(){return byteReader->format();} string formatString(); ~SequenceFileReader(); //vars int options; protected: string filename; istream* setupInputFilters(istream &file); io::filtering_streambuf in; ifstream ifs; istream *is; SequenceByteReader *byteReader; bool rangeOnly; pair range; size_t outCount; public: inline long getSubSeqId(){return byteReader->getSubSeqId();} inline string getSubSeqName(){return byteReader->getSubSeqName();} }; #endif murasaki/src/genopts.cc0000644000177700001440000000354311434752234014541 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. 
Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ////////////// // murasaki project // genotps.cc // generic option parsing toys ////////////// #include "genopts.h" #include #include #include bool isYes(const char* str){ return (!strcasecmp(str,"yes") || !strcasecmp(str,"y") || !strcasecmp(str,"true") || !strcasecmp(str,"1")); } bool isNo(const char* str){ return (!strcasecmp(str,"no") || !strcasecmp(str,"n") || !strcasecmp(str,"false") || !strcasecmp(str,"0")); } bool getYesNo(const char* str,bool &toggle,const char* name){ if(isYes(str))toggle=true; else if(isNo(str))toggle=false; else { warnx("Invaild argument for %s. Please specify either yes/t/true or no/n/false",name); return false; } return true; } bool toggleYesNo(const char* str,bool &toggle,const char* name){ if(str) return getYesNo(str,toggle,name); else { toggle=!toggle; return true; } } template bool getLexical(const char* str,T &ref,const char* name){ using namespace boost; try { ref=lexical_cast(optarg); return true; }catch(bad_lexical_cast& e){ warnx("Invaild argument for %s.",name); return false; } } murasaki/src/timing.cc0000644000177700001440000000647411434752234014357 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. 
Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ////////////////// // timing tools // timing.h // Kris Popendorf ///////////////// #include "timing.h" #include #include bool timing_useHumanTime=true; ostream& operator <<(ostream &os,const Timer &obj){ return os << obj.asDouble(); } double Timer::asDouble() const{ return TIME_DOUBLE(tv); } double diff(const Timer &a,const Timer &b){ return a.asDouble()-b.asDouble(); } Timer operator*(const Timer &a,double factor){ //this had better already be derived by difference return Timer(TIME_DOUBLE(a.tv)*factor); } Timer operator*(double factor,const Timer &a){ //this had better already be derived by difference return Timer(factor*TIME_DOUBLE(a.tv)); } Timer operator/(const Timer &a,double factor){ //this had better already be derived by difference return Timer(TIME_DOUBLE(a.tv)/factor); } Timer operator/(double factor,const Timer &a){ //this had better already be derived by difference return Timer(factor/TIME_DOUBLE(a.tv)); } void Timer::reset(){ gettimeofday(&tv,NULL); } Timer::Timer(){ reset(); } Timer operator- (const Timer &a,const Timer &b){ Timer r(a); r-=b; return r; } Timer& Timer::operator-= (const Timer &a){ tv.tv_sec-=a.tv.tv_sec; tv.tv_usec-=a.tv.tv_usec; if(tv.tv_usec<0){ --tv.tv_sec; tv.tv_usec+=MILLION_INT; } return *this; } Timer operator+ (const Timer &a,const Timer &b){ Timer r(a); r+=b; return r; } Timer& Timer::operator+= (const Timer &a){ tv.tv_sec+=a.tv.tv_sec; tv.tv_usec+=a.tv.tv_usec; 
if(tv.tv_usec>=MILLION_INT){ tv.tv_usec-=MILLION_INT; ++tv.tv_sec; } return *this; } Timer::Timer(double t){ tv.tv_sec=(int)t; tv.tv_usec=(int)(t*MILLION_DOUBLE); } Timer::Timer(int sec,int usec) { tv.tv_sec=sec; tv.tv_usec=usec; } string elapsed(Timer start,Timer stop){ return humanTime((stop-start).asDouble()); } string humanTime(double dur){ char buf[300]; string str; if(timing_useHumanTime){ int days=(int)(dur/60/60/24); dur-=days*60*60*24; int hours=(int)(dur/60/60); dur-=hours*60*60; int mins=(int)(dur/60); dur-=mins*60; if(days){ sprintf(buf,days==1 ? "%d day ":"%d days ",days); str.append(string(buf)); } if(hours){ sprintf(buf,hours==1 ? "%d hour ":"%d hours ",hours); str.append(string(buf)); } if(mins){ sprintf(buf,mins==1 ? "%d minute ":"%d minutes ",mins); str.append(string(buf)); } if(dur || (!hours && !days && !mins)){ sprintf(buf,dur==1 ? "%.3f second ":"%.3f seconds",dur); str.append(string(buf)); } return str; }else{ sprintf(buf,"%lf",dur); return string(buf); } } murasaki/src/geneparse.cc0000644000177700001440000001253511434752233015033 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ ////////////// // murasaki project // geneparse.cc // c++ implementation of the geneparse.pl program ////////////// #include #include #include #include "seqread.h" #include using namespace std; string program_version(); string program_help(); bool toUpper=false; char baseMorph(char c) { if(!toUpper) return c; if(c>='a' && c<='z') return c-'a'+'A'; return c; } int main(int argc,char **argv){ SequenceReader reader; bool clean=false; bool verbose=false; bool typeOnly=false; bool fatalErrors=false; ostream* os=&cout; string customOutput; int optc; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"repeatmask",0,0,'r'}, {"length",0,0,'l'}, {"clean",0,0,'c'}, {"help",0,0,'h'}, {"version",0,0,'v'}, {"output",1,0,'o'}, {"quiet",0,0,'q'}, {"type",0,0,'t'}, {"upper",0,0,'U'}, {"unmask",0,0,'U'}, {"fatal",0,0,'F'}, {0,0,0,0} }; int longindex=0; string prefreq; optc=getopt_long(argc,argv,"trlchvVo:UF",long_options,&longindex); if(optc==-1)break; switch(optc){ case 'r': reader.repeatMask=true;break; case 'l': reader.lengthOnly=true;break; case 'h': cout << program_help();exit(0);break; case 'V': cout << program_version();exit(0);break; case 'v': verbose=true;reader.verbose=true;break; case 'q': reader.silent=true;break; case 'c': clean=true;break; case 'o': os=new ofstream(optarg); if(!os->good()){ cerr << "Couldn't open "< (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. 
If not, see . */ ////////////////// // sample some cgr for statistics // (ie: find most common/least common patterns, print) ///////////////// #include #include #include #include #include #include #include #include "dinkymath.h" #include "cgr.h" #include "seqread.h" using namespace std; string program_help(); string program_version(); Cgr *cgr=0; struct lidx { bool operator()(const int idx1, const int idx2){ return cgr->get(idx1)get(idx2); } }; int main(int argc,char** argv){ int optc; int high=5,low=-1,res=8; double highp=-1,lowp=-1; bool repeatmask=false,save=false; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"help",0,0,'?'}, {"version",0,0,'v'}, {"high",1,0,'h'}, {"low",1,0,'l'}, {"res",1,0,'i'}, {"highp",1,0,'H'}, {"lowp",1,0,'L'}, {"repeatmask",0,0,'r'}, {"save",0,0,'s'}, {0,0,0,0} }; int longindex=0; optc=getopt_long(argc,argv,"?vh:l:i:H:L:rs",long_options,&longindex); if(optc==-1)break; using boost::lexical_cast; using boost::bad_lexical_cast; switch(optc){ case 'h': try{ high=lexical_cast(optarg);} catch(bad_lexical_cast& e){ cerr << "Bad argument to --high ("<(optarg);} catch(bad_lexical_cast& e){ cerr << "Bad argument to --low."<(optarg);} catch(bad_lexical_cast& e){ cerr << "Bad argument to --highp."<(optarg);} catch(bad_lexical_cast& e){ cerr << "Bad argument to --lowp."<(optarg);} catch(bad_lexical_cast& e){ cerr << "Bad argument to --res."<load(fwd.data(),fwd.length()); cgr->load(rev.data(),rev.length()); } cout << "Cgr loaded: rez "<rez<<" iter "<iter<iter,cgr->rez); unsigned check=dnatoi(dna.c_str(),cgr->iter,cgr->rez); if(check!=i){ cerr << "Oh no! broken "<rez*cgr->rez; if(highp>0) high=(int)(size-size*highp); if(lowp>0) low=(int)(size*lowp); pair ends=cgr->sampleHighLow(low,high); vector lowl,highl; if(low>=0) cout << "Low "<=0) cout << "High "<findEdges(ends,(low>=0 ? &lowl:0),(high>=0 ? 
&highl:0)); sort(lowl.begin(),lowl.end(),lidx()); sort(highl.begin(),highl.end(),lidx()); reverse(highl.begin(),highl.end()); int rank=0,count=0; if(highl.size()){ cout << "High values: "<<(highl.size())<<"\n"; pix last=cgr->get(highl[0]); for(unsigned i=0;iget(highl[i])iter,cgr->rez) << " ("<get(highl[i])<<")"<get(highl[i]); } } rank=count=0; if(lowl.size()){ cout << "Low values: "<<(lowl.size())<<"\n"; pix last=cgr->get(lowl[0]); for(unsigned i=0;iget(lowl[i])iter,cgr->rez) << " ("<get(lowl[i])<<")"<get(lowl[i]); } } if(save){ string outname=inname+".cgr"; cout << "Saving pgm to: "<savePGM(outname); } } string program_help(){ return string("\ Usage: cgr-sample [options] \n\ \n\ Options\n\ *Takes an option (like --high 3 or -h3)\n\ --high (-h) = find the highest frequency n patterns (default 5)\n\ --low (-l) = find the lowest frequency n patterns\n\ --res (-i) = res to use when generating CGRs from scratch (default 8)\n\ \n\ *Toggles: (just --repeatmask or -r)\n\ --repeatmask (-r) = mask repeats\n\ --save (-s) = save a PGM when done\n\ "); } string program_version(){ return string("0.1"); } murasaki/src/options.cc0000644000177700001440000010432011434752233014547 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ ///////////////////// // murasaki config options manager //////////////////// #include "genopts.h" #include "options.h" #include "murasaki.h" #include "murasaki_mpi.h" #include "timing.h" #include "dinkymath.h" #include "sequence.h" #include //graceful handling of directories #include #include #include #include #include #include #include #include #include #include #include #ifdef MURASAKI_MPI #include "mpi.h" #endif using namespace std; using namespace boost::filesystem; //stupid static inits bool Options::staticInited=false; map Options::optionSwitch; map Options::optionSwitchToInt; void optErr(const char *infmt, ...){ va_list argp; char buff[2048]; char fmt[1024]; va_start(argp,infmt); sprintf(fmt,"Options: %s",infmt); vsprintf(buff,fmt,argp); throw MurasakiException(string(buff)); } void Options::staticInit(){ if(staticInited) return; //Actually this should probably be all wrapped in a singleton, but this avoids the mac os static init problem for now. map ma; map mb; for(static struct option *oi=long_options;oi->name!=0;++oi){ ma[oi->val]=oi->name; mb[oi->name]=oi->val; } optionSwitch = ma; optionSwitchToInt = mb; staticInited=true; } int Options::getOptSource(string name){ assert(optionSwitchToInt.count(name)); return optionSource[optionSwitchToInt[name]]; } Options::Options(): verbose(0),quick_hash(3),randomHashInputs(0), hitfilter(0),histogram(0), user_seed(false),rand_seed(time(NULL)),repeatMask(false), skip1to1(false),skipFwd(false),skipRev(false), hashOnly(false),hashCache(false),bitscore(true), probingMethod(1), hashPref(EcoHash_t),alignment_name("test"),output_dir("output"), seedfilter(0),hashfilter(0),mergeFilter(100),scoreFilter(0), joinNeighbors(false),joinSpec(-1),joinDist(-1), hashSkip(1), auto_hashbits(true), dumpRegions(false),leaveRecords(false), targetMemory(sysinfo.totalMemory*90/100),userSetMemory(false), retainMembers(true),useHumanTime(true), repeatMap(true),anchorProgressCheck(0), tfidf(true),reverseOtf(true),inplaceDf(false), 
rifts(0),set_islands(0), fuzzyExtend(true), gappedAnchors(false), //sounds attractive, but ungapped are actually much more useful fuzzyExtendLossLimit(0), //blastz uses 10*(A-to-A match score) as a default, we probably should too scoreByMinimumPair(true), measureHashCollisions(false), ecolistFrugal(false), hasherFairEntropy(true),hasherCorrelationAdjust(true),hasherTargetGACycles(1000),hasherEntropyAgro(1), useSeqBinary(true), //mpi related mpi_hashers(0),mpi_noCake(false),mpi_fileDistro(true), mpi_anchorInSpareTime(true),mpi_maxBuffers(0),mpi_bigFirst(false), mpi_hostbalance(true),mpi_memoryBalance(true),mpi_distMerge(true),mpi_distCollect(true), mpi_outputRedirect(true), mpi_keepstdoe(false), use_shm_mmap(false),mmap_writePerHost(false), #if defined(MURASAKI_MPI) && defined(USE_SHM_SYSV) use_shm_sysv(true) #else use_shm_sysv(false) #endif { staticInit(); try { if(getenv("HOME")) config_file=string(getenv("HOME"))+"/.murasaki"; if(exists(config_file)){ loadConfig(config_file); //load both! } }catch(exception &e){} try { config_file="murasaki.cfg"; if(exists(config_file)){ loadConfig(config_file);//load both! 
} }catch(exception &e){} } void Options::preInit(){ #ifdef MURASAKI_MPI if(mpi_capable) mpi_maxBuffers=1024/mpi_procs+1; #endif } struct option Options::long_options[] = { {"verbose",0,0,'v'}, {"help",0,0,'h'}, {"directory",1,0,'d'}, {"pattern",1,0,'p'}, {"name",1,0,'n'}, {"version",0,0,'V'}, {"quickhash",1,0,'q'}, {"hashbits",1,0,'b'}, {"histogram",2,0,'H'}, {"hashtype",1,0,'t'}, {"repeatmask",2,0,'r'}, {"rseed",1,0,'s'}, {"skip1to1",2,0,'1'}, {"skipfwd",2,0,'F'}, {"skiprev",2,0,'R'}, {"hashonly",2,0,'Q'}, {"hashcache",2,0,'c'}, {"seedfilter",1,0,'f'}, {"hashfilter",1,0,'m'}, {"bitscore",2,0,'B'}, {"join",2,0,'j'}, {"hashskip",2,0,'S'}, {"dumpregions",2,0,'G'}, {"memory",1,0,'M'}, {"seedterms",2,0,'T'}, {"hashers",1,0,'A'}, {"nocake",0,0,'K'}, {"localhash",0,0,'K'}, {"mpidistro",0,0,'L'}, {"sectime",0,0,'e'}, {"waittoanchor",2,0,'w'}, {"mergefilter",1,0,'Y'}, {"mmap",2,0,'P'}, {"mmapperhost",2,0,'W'}, {"sysvipc",2,0,'y'}, {"nobuffer",0,0,'U'}, {"buffers",1,0,'u'}, {"bigfirst",2,0,'I'}, {"repeatmap",2,0,'i'}, {"hostbalance",2,0,'l'}, {"tfidf",2,0,'k'}, {"reverseotf",2,0,'o'}, {"anchorprogress",1,0,'g'}, {"memorybalance",2,0,'a'}, {"leaverecords",2,0,'J'}, //J for junk {"distmerge",2,0,'<'}, //running out of letters!!!! {"distcollect",2,0,'>'}, {"rifts",1,0,'/'}, {"islands",1,0,'%'}, {"fuzzyextend",2,0,'z'}, {"fuzzyextendlosslimit",1,0,'Z'}, //It's official. 
I've run out of letters {"gappedanchors",2,0,256}, {"frugalecolist",2,0,257}, {"scorefilter",1,0,258}, {"probing",1,0,259}, {"scorebyminimumpair",2,0,260}, {"collisionprofile",2,0,261}, {"mpioutputredirect",2,0,262}, {"hitfilter",1,0,263}, {"hasherfairentropy",2,0,264}, {"hashercorrelationadjust",2,0,265}, {"hashertargetgacycles",1,0,266}, {"hasherentropyagro",1,0,267}, {"keepstdoe",2,0,268}, {"binaryseq",2,0,269}, //uppercased {"hasherFairEntropy",2,0,264}, {"hasherCorrelationAdjust",2,0,265}, {"hasherTargetGACycles",1,0,266}, {"hasherEntropyAgro",1,0,267}, //testing purposes only {"randomHashInputs",1,0,1024}, {0,0,0,0} }; void Options::commandline_opts(int argc,char **argv){ using namespace boost; int optc; char msgbuf[80]; //hate to do this... opterr=0; while(1){ //options struct: // name, has_arg, store_pointer, return_value int longindex=0; string prefreq; optc=getopt_long(argc,argv,":>::<::%:/:1::A:B::C::F::G::H::I::J::L::M:P::Q::R::S:T::V?W::Y:Z:a::b:c::d:e::f:g:hi::j:k::l::m:n:o::p:q:r::s:t:u:vw::y::z::",long_options,&longindex); if(optc==-1)break; switch(optc){ case 'h': cout << program_help(true);//ran --help or -h throw MurasakiAbortException("Help",0); case 'j': if(optarg && (strcmp(optarg,"0"))){ string optstr(optarg); if(optstr==string("0") || isNo(optarg)){ joinNeighbors=false; break; } } joinNeighbors=true; joinSpec=-1; if(optarg && !sscanf(optarg,"%f",&joinSpec)) optErr("Could not parse join specification."); break; case 'S': if(optarg){ if(!sscanf(optarg,"%d",&hashSkip)) optErr("Could not parse hashskip specification."); }else{ hashSkip++; } if(hashSkip<=0) optErr("Skip has to be at least 1."); break; case 'A': #ifdef MURASAKI_MPI if(mpi_capable){ if(optarg){ double percent; if(sscanf(optarg,"%lf",&percent)){ if(percent>=0 && percent<1){ mpi_hashers=(int)(percent*(double)mpi_procs); if(mpi_hashers==0) mpi_hashers=1; }else{ if(!sscanf(optarg,"%d",&mpi_hashers)) optErr("Could not parse hashers specification."); } } } if(hashSkip<=0) optErr("Hashers 
has to be at least 1."); }else{ #endif cerr << "Option --hashers has no effect without MPI"<8 || quick_hash<0) #else (quick_hash>3 || quick_hash<0) #endif ){ cerr << "Invalid quickhash specification ("<WORDSIZE || max_hash_bits<1){ cout << "*** Warning *** "<4)){ cerr << histogram << " is not a valid choice for histogram.\n"; histogram=2; cerr << "Using "<(optarg); }catch(bad_lexical_cast& e){ optErr("--anchorprogress needs a non-negative integer argument"); } break; case 'U': #ifdef MURASAKI_MPI if(!mpi_capable) #endif cout << "Warning: --nobuffer has no effect without MPI."<(optarg); }catch(bad_lexical_cast& e){ cerr << "Error: --buffers needs a non-negative integer argument"<': #ifdef MURASAKI_MPI parseYesNo(optarg,mpi_distCollect,"distcollect"); if(!mpi_distCollect && mpi_distMerge){ cerr << "Warning: --distmerge requires --distcollect (disabling distmerge)"<(optarg); }catch(bad_lexical_cast& e){ optErr("Error: --rifts needs a non-negative integer argument"); } break; case '%':try { set_islands=lexical_cast(optarg); }catch(bad_lexical_cast& e){ optErr("Error: --rifts needs a non-negative integer argument"); } break; case 'Z':try { fuzzyExtendLossLimit=lexical_cast(optarg); }catch(bad_lexical_cast& e){ optErr("Error: --fuzzyextendlosslimit needs a non-negative integer argument"); } break; case 266:try { hasherTargetGACycles=lexical_cast(optarg); }catch(bad_lexical_cast& e){ optErr("Error: --hasherTargetGACycles needs a non-negative integer argument"); } break; case 267:try { hasherEntropyAgro=lexical_cast(optarg); }catch(bad_lexical_cast& e){ optErr("Error: --hasherEntropyAgro needs a real numeric argument"); } break; case 256: parseYesNo(optarg,gappedAnchors,"gappedanchors");break; //testing case 1024: parseLexical(optarg,randomHashInputs,"randomHashInputs");break; //error cases: case ':': sprintf(msgbuf,"Option -%c requires an argument",optopt); optErr(msgbuf);break; case '?': snprintf(msgbuf,sizeof(msgbuf),"Unknown option %s",argv[optind-1]); 
optErr(msgbuf);break; default: optErr("Unknown argument. Try --help");break; } optionSource[optc]=OPT_SRC_MANUAL; } if(tfidf && !retainMembers){ cout << "Warning: tfidf requires --retainmembers=yes, disablign tfidf."< void parseLexical(const char* str,T& ref,const char *name){ using namespace boost; try { ref=lexical_cast(optarg); }catch(bad_lexical_cast& e){ optErr("Invaild argument for %s.",name); } } void parseYesNo(const char* str,bool& toggle,const char *name){ if(str){ if(isYes(str))toggle=true; else if(isNo(str))toggle=false; else { optErr("Please specify either yes/t/true or no/n/false as an argument to --%s",name); } }else{ toggle=!toggle; } } void Options::solidify(){ //finalize annoying names and crap #ifdef MURASAKI_MPI #ifdef USE_SHM_SYSV if(!mpi_capable) use_shm_sysv=false; #endif #endif //transfer options to other modules timing_useHumanTime=useHumanTime; if((int)set_islands>1 && (int)set_islandsseq_count-2){ cerr << "Warning: User supplied rifts ("< seqpos is "<<(sizeof(SeqPos))<<" bytes"<MPI is: "<<(mpi_capable ? "enabled":"disabled")<MMAP is: "<<(a.use_shm_mmap ? "enabled":"disabled")<System V IPC shared memory is: "<<(a.use_shm_sysv ? "enabled":"disabled")<Hashers is: "<<(a.mpi_hashers)<np is: "<MPI-based file distribution is: "<<(a.mpi_fileDistro ? "enabled":"disabled")<Distributed hashing? "<<(a.mpi_noCake ? "no":"yes")<Wait to anchor? "<<(a.mpi_anchorInSpareTime ? "no":"yes")<Max buffers: "<<(a.mpi_maxBuffers ? boost::lexical_cast(a.mpi_maxBuffers):string("unlimited"))<Allocate hashers on "<<(a.mpi_bigFirst ? "big":"small")<<" nodes first"<Balance hasher allocation? "<<(a.mpi_hostbalance ? "yes":"no")<Balance storage allocation? "<<(a.mpi_memoryBalance ? "yes":"no")<Distributed merging? "<<(a.mpi_distMerge ? "yes":"no")<Distributed collection? "<<(a.mpi_distCollect ? 
"yes":"no")< argv; char cmd[]="murasaki"; argv.push_back(cmd); string tmp; char buf[2048]; while(cfg.good()){ cfg.getline(buf,sizeof(buf)); tmp=string(buf); while(cfg.fail()){ //for absurdly long inputs cfg.getline(buf,sizeof(buf)); tmp+=buf; } argv.push_back(new char[tmp.length()]); strcpy(argv.back(),tmp.c_str()); } commandline_opts(argv.size(),&argv.front()); //hoho stl rocks } string program_help(bool longhelp){ string ret("\ Usage: murasaki -p [options] seq1 [seq2 [seq3 ... ]]\n"); if(longhelp) ret+=string("\ Options:\n\ --pattern|-p = seed pattern (eg. 11101001010011011).\n\ using the format [:] automatically generates a\n\ random pattern of weight and length \n\ --directory|-d = output directory (default: output)\n\ --name|-n = alignment name (default: test)\n\ --quickhash|-q = specify a hashing function:\n\ 0 - adaptive with S-boxes\n\ 1 - don't pack bits to make hash (use first word only)\n\ 2 - naively use the first hashbits worth of pattern\n\ 3 - adaptivevely find a good hash (default)\n")+ #ifdef USE_LIBCRYPTOPP string("\ **experimental CryptoPP hashes**\n\ 4 - MD5\n\ 5 - SHA1\n\ 6 - Whirlpool\n\ 7 - CRC-32\n\ 8 - Adler-32\n")+ #endif string("\ --hashbits|-b = use n bit hashes (for n's of 1 to WORDSIZE. 
default 26)\n\ --hashtype|-t = select hash table data structure to use:\n\ OpenHash - open sub-word packing of hashbits\n\ EcoHash - chained sub-word packing of hashbits (default)\n\ ArrayHash - malloc/realloc (fast but fragmenty)\n\ MSetHash - memory exorbanant, almost pointless.\n\ --probing = 0 - linear, 1 - quadratic (default)\n\ --hitfilter|-h = minimum number of hits to be outputted as an anchor\n\ (default 1)\n\ --histogram|-H = histogram computation level: (-H alone implies -H1)\n\ 0 - no histogram (default)\n\ 1 - basic bucketsize/bucketcount histogram data\n\ 2 - bucket-based scores to anchors.detils\n\ 3 - perbucket count data\n\ 4 - perbucket + perpattern count data\n\ --repeatmask|-r= skip repeat masked data (ie: lowercase atgc)\n\ --seedfilter|-f= skip seeds that occur more than N times\n\ --hashfilter|-m= like --seedfilter but works on hash keys instead of\n\ seeds. May cause some collateral damage to otherwise\n\ unique seeds, but it's faster. Also non-sequence-specific\n\ so more like at best 1/N the tolerance of seedfilter.\n\ --rseed|-s = random number seed for non-deterministic algorithms\n\ (ie: the adative hash-finding). If you're doing any\n\ performance comparisons, it's probably imperative that you\n\ use the same seed for each run of the same settings.\n\ Default is obtained from time() (ie: seconds since 1970).\n\ --skipfwd|-F = Skip forward facing matches\n\ --skiprev|-R = Skip reverse facing matches\n\ --skip1to1|-1 = Skip matches along the 1:1 line (good for comparing to self)\n\ --hashonly|-Q = Hash Only. no anchors. just statistics.\n\ --hashskip|-S = Hashes every n bases. (Default is 1. ie all)\n\ Not supplying any argument increments the skip amount by 1.\n\ --hashCache|-c = Caches hash tables in the directory. 
(default: cache/)\n\ --join|-j = Join anchors within n bases of eachother (default: 0)\n\ Specifying a negative n implies -n*patternLength\n\ --bitscore|-B = toggles compututation of a bitscore for all anchors\n\ (default is on)\n\ --memory|-M = set the target amount of total memory\n\ (either in gb or as % total memory)\n\ --seedterms|-T = toggles retention of seed terms (defaults to off)\n\ (these are necessary for computing TF-IDF scores)\n\ --sectime|-e = always display times in seconds\n\ --repeatmap|-i = toggles keeping of a repeat map when --mergefilter\n\ is used (defaults to yes).\n\ --mergefilter|-Y = filter out matches which would would cause more than N\n\ many anchors to be generated from 1 seed (default -Y100).\n\ Use -Y0 to disable.\n\ --scorefilter = set a minimum ungapped score for seeds\n\ --tfidf|-k = perform accurate tfidf scoring from within murasaki\n\ (requires extra memory at anchor generation time)\n\ --reverseotf|-o = generate reverse complement on the fly (defaults to on)\n\ --rifts|-/ = allow anchors to skip N sequences (default 0)\n\ --islands|-% = same as --rifts=S-N (where S is number of seqs)\n\ --fuzzyextend|-z = enable (default) or disable fuzzy extension of hits\n\ --fuzzyextendlosslimit|-Z = set the cutoff at which to stop extending\n\ fuzzy hits (ie. the BLAST X parameter).\n\ --gappedanchors = use gapped (yes) or ungapped (no (default)) anchors.\n\ --scorebyminimumpair = do anchor scoring by minimum pair when appropriate\n\ (default). 
Alternative is mean (somewhat illogical, but\n\ theoretically faster).\n\ --binaryseq = enable (default) or disable binary sequence read/write\n\ \n\ Adaptive has function related:\n\ --hasherFairEntropy = use more balanced entropy estimation (default: yes)\n\ --hasherCorrelationAdjust = adjust entropy estimates for nearby sources\n\ assuming some correlation (default: yes)\n\ --hasherTargetGACycles = GA cycle cutoff\n\ --hasherEntropyAgro = how aggressive to be about pursuing maximum\n\ entropy hash functions (takes a real. default is 1).\n\ ")+ #ifdef MURASAKI_MPI string("MPI Specific: \n\ --hashers|-A = specify the number of processes to be used as hashers\n\ (only applies to MPI. If a number between 0 and 1\n\ it refers to a % of np)\n\ --localhash|-K = perform hashing locally on each storage node rather than\n\ sending it over the network (helpful for slow networks)\n\ --mpidistro|-L = toggles use of MPI to distribute sequence data over\n\ (if the sequence is available on local disk on each\n\ node then turning this off may increase performance)\n\ --waittoanchor|-w = postpone actual anchor computation until all location\n\ sets have been received.\n\ --buffers|-u = maximum number of unfinished buffers to allow while\n\ message passing (0 means unlimited)\n\ --nobuffers|-U = same as --buffers=1\n\ --bigfirst|-I = assign hashers to large memory nodes first\n\ --hostbalance|-l = if yes (default): spread out hashers evenly among hosts\n\ if no: ignore host name when assigning jobs\n\ --memorybalance|-a = if yes (deafult): balance hash storage between nodes\n\ based on the amount of available ram.\n\ if no: distribute storage evently.\n\ --distmerge|-< = if yes (default): during the merge step, send seeds to\n\ all participating hashers.\n\ if no: send all seeds to one node only\n\ --distcollect|-> = if yes (default): collect anchor data from all hashers\n\ if no: send all seeds to the final assembly node only\n\ --mpiredirectoutput = if yes (default): each rank 
redirects its stdout/stderr\n\ to a separate file\n\ if no: do what comes naturally (ie: managed by mpirun).\n\ --keepstdoe = don't erase the murasaki-mpiout files on success.\n\ ")+ #endif #ifdef USE_SHM_MMAP string("\ --mmap|-P = use filebacked mmap() to store sequence data\n\ (saves memory when one host runs multiple nodes)\n\ --mmapperhost|-W = create files for mmap() on each host\n\ (ie: for when sequence sources aren't stored on NFS)\n\ ")+ #endif #ifdef USE_SHM_SYSV string("\ --sysvipc|-V = use System V IPC to negotiate shared memory regions\n\ (saves memory when one host runs multiple nodes)\n\ ")+ #endif #ifndef NDEBUG string("\n\ Debugging options:\n\ --leaverecords|-J= don't erase .status and -mpiout files on successful\n\ completion (default is to erase them)\n\ --dumpregions|-G = dump list of comparable regions for each sequence\n\ (ie: the areas between N's)\n\ --anchorprogress|-g = log the number of anchors in memory and the amount\n\ of free memory every N location-sets processed\n\ --collisionprofile = analyze hash function collision performance (no MPI)\n\ \n")+ #endif string("\ ...and of course\n\ --verbose|-v = increases verbosity\n\ --version|-V = prints version information and quits\n\ --help|-? = prints this help message and quits\n\ \n")+platformInfo()+program_version(); else ret+=string("For more information try --help\n\n")+program_version(); return ret; } murasaki/src/arrayhash.h0000644000177700001440000000354611434752234014711 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ //////// // arrayhash defs // Kris Popendorf /////// #ifndef ARRAYHASH_H_ #define ARRAYHASH_H_ #include "sequence.h" #include "cmultiset.h" class ArrayHash : public Hash { public: //my own lovely bunch of coconuts typedef cmultiset* MHash; //multihash typedef cmultiset::iterator HashIte; void clear(); void add(const HashKey key,const HashVal &val); void getMatchingSets(HashKey key,list &sets); void lookup(HashKey key,LocList &locList); bool emptyAt(const HashKey key); word sizeAt(const HashKey key); word sizeAt(const HashKey key,const HashVal &val); ArrayHash(BitSequence *pat); void dump(ostream &os); void load(istream &is); static const word linear_cost(word c) { return sizeof(Location)*c + sizeof(Location)*c/4; //the +25% term is an estimate of memory lost to external fragmentation } static const word bucket_prep_cost(word c) { return (sizeof(void*)+sizeof(cmultiset))*c; } bool rawSanityCheck(); bool rawSanityCheck(word base); protected: void pickup(LocList &locList, pair); MHash *fasta; pair activeRange; }; #endif murasaki/src/sequence.cc0000644000177700001440000032307711434752234014701 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ////////////// // murasaki project // sequence.cc // provides "sequence" class for doing operations on dna sequences ////////////// #include "globaltypes.h" #include "murasaki.h" #include "sequence.h" #include "dinkymath.h" #include "cmultiset.h" #include "ecohash.h" #include "murasaki_mpi.h" #include "seqread.h" #include "binseq.hh" #include "scoring.h" #include "mingw32compat.h" #include "cryptohasher.h" #include "openhash.h" #include "timing.h" #include "hashing.h" #include #include #include #include #include #include #include #include //need log... #include //for tolower <-- im a lazy lazy man //#include #include #include #include #include //memcpy #ifdef USE_SHM_MMAP #include #include #include #include #include #include namespace fs = boost::filesystem; #endif #ifdef USE_SHM_SYSV #include #include #include #endif #ifdef USE_LIBCRYPTOPP #include #endif //these guys are used in creating the adaptive hash stuff: #include using namespace __gnu_cxx; using namespace std; int next_sequence_=0; int _wordsize=WORDSIZE; int hash_bits=27; word hash_padding=0; int max_hash_bits=27; //keeps the approx 256mb hash as default (or 515mb on 64bit) word hash_mask=lowN(hash_bits); int hash_max=intpow(2,max_hash_bits); //int hash_max=67108879; //first prime over .5gb/8bytes //524288057; //arbitrary default. 
first prime number past .5gb word hash_size; word alternating_mask; word activeHash; //for building hash member lists word totalSequenceMemory=0; word sys_pagesize=getpagesize(); //static inits map HashMethod::popmap; //hasher objects //if we're scrapping the bottom of the barrel for RAM (or cache performance is suffering), we could use a smaller type here typedef SBox SBoxT; //typedef SBox SBoxT; SBoxT *sbox=0; CryptoHasher *cryptoHasher=0; //seqtable bits Sequence** seq_table; SeqIdx seq_table_size; map seq2idx; bool _seq_no_data=false; double globalBaseFreq[4]={0,0,0,0}; word globalBaseCount[4]={0,0,0,0}; word globalCounted=0; const SeqPos maxSeqPos=((unsigned)((SeqPos)-1))>>1; void set_seqtable(vector seqs){ seq_table_size = seqs.size(); seq_table = (Sequence**)malloc(sizeof(Sequence*)*seq_table_size); if(seq_table == NULL) { cerr << "set_seqtable(): malloc() failed." << endl; exit(-1); } for(SeqIdx i=0; i100){ opt.quick_hash=100; } switch(opt.quick_hash){ //nothing to initialize (absurdly naive hashers) case 1:case 2:break; //adaptive hashers case 0: pat->compileHashFunc(); sbox=new SBoxT(hash_bits); break; case 3: pat->compileHashFunc(); break; #ifdef USE_LIBCRYPTOPP //cryptographic hash functions case 4:cryptoHasher=new MD5CryptoHasher();break; case 5:cryptoHasher=new SHACryptoHasher();break; case 6:cryptoHasher=new WhirlpoolCryptoHasher();break; case 7:cryptoHasher=new CRC32CryptoHasher();break; case 8:cryptoHasher=new Adler32CryptoHasher();break; #endif case 100: pat->randomHashFunc(opt.randomHashInputs); //for testing only. 
this is absolutely crazy break; default: assert((cerr << "Bad quick_hash value" << endl) && 0); throw MurasakiDebugException("Unknown hash function"); } switch(opt.hashPref){ case EcoHash_t: EcoHash::init(seq_count,longestSeq,totalHashLength,bits); break; case OpenHash_t: OpenHash::init(seq_count,longestSeq,totalHashLength,bits); break; default: ; } } void initConstants(){ srandom(time(0)); alternating_mask=1; //make a mask like 010101010101010101 for(int bit=2;bit> 1) | ((t & 0x55555555) << 1); t = ((t & 0xCCCCCCCC) >> 2) | ((t & 0x33333333) << 2); t = ((t & 0xF0F0F0F0) >> 4) | ((t & 0x0F0F0F0F) << 4); return CryptoPP::ByteReverse((CryptoPP::word32)(~t)); #elif defined(USE_LIBCRYPTOPP) && WORDSIZE == 64 // t = ((t & W64LIT(0xAAAAAAAAAAAAAAAA)) >> 1) | ((t & W64LIT(0x5555555555555555)) << 1); t = ((t & W64LIT(0xCCCCCCCCCCCCCCCC)) >> 2) | ((t & W64LIT(0x3333333333333333)) << 2); t = ((t & W64LIT(0xF0F0F0F0F0F0F0F0)) >> 4) | ((t & W64LIT(0x0F0F0F0F0F0F0F0F)) << 4); return CryptoPP::ByteReverse((CryptoPP::word64)(~t)); #else //if WORDSIZE isn't 32 or 64, Murasaki will probably break...but //for historical raisins (and in case we don't have crypto++), I'll leave this intact: word res=0; for(int i=0;i>i); return res; #endif } int highestBit(word t){ for(int i=WORDSIZE-4;i>=0;i-=4){ switch((t>>i) & (word)15){ case 1:return i; case 2:case 3:return i+1; case 4:case 5:case 6:case 7:return i+2; case 8:case 9:case 10:case 11:case 12:case 13:case 14:case 15: return i+3; //this could be unrolled arbitrarily for performance } } assert(0); return -1;//wtf? } int lowestBit(word t){ for(int i=0;i>i) & (word)15){ case 1:case 3:case 5:case 7: case 9: case 11: case 13: case 15: return i; case 2:case 6:case 10:case 14:return i+1; case 4:case 12:return i+2; case 8:return i+3; //this could be unrolled arbitrarily for performance } } assert(0); return -1;//wtf? 
} int matchCount(word t){ //count whole-base matches int count=0; for(int i=0;i>i) & (word)15){ //4 bits at a time case 3:case 12:count+=1;break; case 15:count+=2;break; default:break; } } return count; } int popCount(word t){ int count=0; for(int i=0;i>i) & (word)15){ case 1:case 2:case 4:case 8:count+=1;break; case 3:case 5:case 6:case 9:case 10:case 12:count+=2;break; case 7:case 11:case 13:case 14:count+=3;break; case 15:count+=4; //this could be unrolled arbitrarily for performance } } return count; } SeqPos Sequence::length() const{ return fwd->length(); } Sequence::Sequence() : fwd(),rev(),seqID(next_sequence_++) #ifdef USE_SHM_SYSV ,sysv_key(-1),sysv_shmid(-1) #endif { } Sequence::Sequence(string _filename): filename(_filename),name(_filename),baseFilename(_filename), seqID(next_sequence_++) #ifdef USE_SHM_SYSV ,sysv_shmid(-1) #endif { #ifdef MURASAKI_MPI if(mpi_capable){ word length; if(opt.mpi_fileDistro){ if(mpi_id==0){ fwd=new BitSequence(this); length=(word)fwd->length(); MPI_Bcast(&length,1,MPI_UNSIGNED_LONG,0,MPI_COMM_WORLD); cout << "Distributing "< range; //rangeSpec wants to use this SequenceFileReader::parseRangeSpec(baseFilename,range,opt.seqreadOptions); cout << "Waiting for node 0 to finish parsing "<mpi_distribute(); cout << "Done with forward strand ("<length()<<"bp). Preparing reverse complement..."<reverseComplement(); if(mpi_usingShm){ if(!mpi_isHostLeader) cout << "Waiting for leader nodes to finish preparing sequence."<shm_distribute(); rev->shm_distribute(); } }else{ #endif fwd=new BitSequence(this); cout << "Done with forward strand ("<length()<<"bp). 
Preparing reverse complement..."<reverseComplement(); #ifdef MURASAKI_MPI } #endif cout << "Done."<& BitSequence::words(){return _words;} int BitSequence::wordsize(){return WORDSIZE;}; inline int BitSequence::readCode(SeqPos bitpos) const { word w=readWord(bitpos/WORDSIZE); return first2(w<<(MODWORDSIZE(bitpos))); } inline BASE_TYPE BitSequence::readBase(SeqPos basepos) const { word w=readWord(basepos*2/WORDSIZE); return first2(w<<(MODWORDSIZE(basepos*2))); } inline word BitSequence::wordAtBase(SeqPos basepos) const { SeqPos bitpos=basepos*2; word w=readWord(bitpos/WORDSIZE); SeqPos offset=MODWORDSIZE(bitpos); if(offset){ //need to shuffle and append next word in place return ( (w << offset) | (readWord(bitpos/WORDSIZE+1) >> (WORDSIZE-offset) ) ); } return w; } inline word BitSequence::readRawWord(SeqPos wordpos) const{ return _words[wordpos]; } word BitSequence::readWord(SeqPos wordpos) const{ if(wordpos>=(SeqPos)word_count || wordpos<0) return 0; return (opt.reverseOtf && reverse) ? reverseOtf(wordpos):_words[wordpos]; } inline word BitSequence::reverseOtf(SeqPos pos) const{ int bitOffset=MODWORDSIZE(bit_count); if(bitOffset){ if((word)pos>(bitOffset)); } else return (revCompWord(_words[word_count-pos-1])<<(WORDSIZE-bitOffset)); //last word is a lonely muffin. } return revCompWord(_words[word_count-pos-1]); } void BitSequence::shm_distribute(){ //no distribution in non-mpi modes #ifndef MURASAKI_MPI throw MurasakiDebugException("Distribute called without MPI support compiled??"); return; #else if(!mpi_capable) throw MurasakiDebugException("Distribute called without MPI enabled??"); if(!opt.use_shm_sysv && !opt.use_shm_mmap) throw MurasakiDebugException("Distribute called without any shared memory modes enabled."); #ifdef USE_SHM_SYSV if(opt.use_shm_sysv){ if(reverse || !seq) return; //no distribution necessary. w00t. 
//on the forward strand //however now that we know that everyone has their sequence mapped and attached, //if we do the rm on the segment now, we'll prevent memory leaks in a segfaulty disaster if(seq->sysv_shmid==-1){//never been used? assert(cerr << "Alert: "<name<<" never got assigned a shared memory region?"<name<<"'s sysv region for deletion."<sysv_shmid,IPC_RMID,NULL)<0) throw MurasakiException("Couldn't mark System V IPC shared memory region for deletion for sequence "+seq->name+strerror(errno)); } return; } #endif #ifdef USE_SHM_MMAP if(opt.use_shm_mmap){ int fd; void *mem; const int NFS_Timeout=60; fs::path bitFile; switch(shmUsed){ case SHM_MMAP_RW: //nothing, we're done. break; case SHM_MMAP_RO: //open file and mmap it. if(opt.reverseOtf && reverse){ assert(seq);assert(seq->fwd);assert(seq->fwd->_words); _words=seq->fwd->_words; //attach to fwd seq's words, and we're done break; } bitFile=fs::path(genBitFilename()); for(int trial=0;!fs::exists(bitFile);trial++){ if(trial+1>NFS_Timeout){ throw MurasakiException("Gave up waiting for "+bitFile.string()+" to appear in filesystem."); } cerr << bitFile.string() << " not found. Waiting for filesystem to catch up..."<(word_count-sent,INT_MAX>>1); //silly MPI specifies transmission count in ints... cout << (mpi_id==0 ? "Sending a ":"Receiving a ")<filename.empty()); return seq->filename+string(reverse ? ".2bit.rev":".2bit.fwd")+string(opt.repeatMask ? ".repeatmasked":""); } string baseToString(BASE_TYPE b){ switch(b){ case BASE_A:return "A";break; case BASE_C:return "C";break; case BASE_G:return "G";break; case BASE_T:return "T";break; case BASE_N:return "N";break; default: return "*";break; } } string bitToString(word w){ switch(w & (word)3){ case 0:return "A"; break; case 1:return "C"; break; case 2:return "G"; break; case 3:return "T"; break; } return " "; } string wordToString(word w,int bits){ string ret; int start=bits < 0 ? 
bits+WORDSIZE:0; for(int i=start;i=64){ ret+="\n"; li=0; } } } else { const int wordsPerLine=(80/WORDSIZE*2); for(word i=0;i=word_count-1) seg.resize((((bit_count-1)%WORDSIZE)+1)/2); ret+=seg; if(i=_length || base_targetPos>=target._length)return 0; //no can do, mack SeqPos pos=2*base_pos;//to bitspace SeqPos targetPos=2*base_targetPos; SeqPos myOffset=MODWORDSIZE(pos),tOffset=MODWORDSIZE(targetPos); word myFrame=pos/WORDSIZE,tFrame=targetPos/WORDSIZE; word myWord=readWord(myFrame),myNext=readWord(myFrame+1); word tWord=target.readWord(tFrame),tNext=target.readWord(tFrame+1); SeqPos max=min(spaceRight(base_pos),target.spaceRight(base_targetPos)); // assert(cout << "local Spaceright: "<>(WORDSIZE-myOffset))); if(tOffset) tWord=(tWord<>(WORDSIZE-tOffset))); word res=(tWord ^ myWord); // assert(cout << "Cmp right -> "<>(WORDSIZE-myOffset)) << " | " << wordToString(myNext<>(WORDSIZE-myOffset)) | ((myNext<<(myOffset))); if(tOffset) tWord=(tWord>>(WORDSIZE-tOffset)) | ((tNext<<(tOffset))); word res=(tWord ^ myWord); // assert(cout << "Cmp Left <- "<0 ? seq()->fwd->localRegion(pos-1):seq()->rev->localRegion(seq()->length()+pos)); } SeqPosPairArray::iterator Location::localRegionIte(){ //find region local to pos // cout << "Converting "<<(*this)<<" into a local region. pos is "<0 ? 
pos:0-pos)-1; // cout << ", but bitseqp is "<matchRegions; pair target(bitseqp,bitseqp); //for simplicity SeqPosPairArray::iterator result=lower_bound(regions.begin(),regions.end(),target,ltRegion()); if(result==regions.end() || bitseqpfirst){ if(result==regions.end())assert(cout << "bitseqp "<first)assert(cout << "bitseqp "<first<<","<second<* BitSequence::localRegion(SeqPos pos){ //find region local to pos pair target(pos,pos); //for simplicity SeqPosPairArray::iterator result=lower_bound(matchRegions.begin(),matchRegions.end(),target,ltRegion()); if(result==matchRegions.end()){ // assert(cout << "pos "<first){ // assert(cout << "pos "<first<<","<second< target(pos,pos); //for simplicity SeqPosPairArray::iterator result=lower_bound(matchRegions.begin(),matchRegions.end(),target,ltRegion()); return result; } SeqPos BitSequence::spaceRight(SeqPos pos){ //find region local to pos pair target(pos,pos); //for simplicity pair *region(localRegion(pos)); if(!region){ return 0; } return region->second-pos+1; } SeqPos BitSequence::spaceLeft(SeqPos pos){ //find region local to pos pair target(pos,pos); //for simplicity pair *region(localRegion(pos)); if(!region){ return 0; } return pos-region->first+1; } UsedInt BitSequence::subSeqBounds(SeqPos at){ pair target(at,at); SeqPosPairArray::iterator result=lower_bound(subSeqs.begin(),subSeqs.end(),target,ltRegion()); assert(result!=subSeqs.end()); assert(result->first<=at); return UsedInt(result->first,result->second); } UsedInt Sequence::getSubSeqBounds(SeqPos at){ UsedInt subSeqBounds((at>0 ? fwd:rev)->subSeqBounds(bitSeqCoords(at))); if(at<0) swap(subSeqBounds.start,subSeqBounds.stop); return seqCoords(subSeqBounds,at); //coordinate system insanity! 
} UsedInt Sequence::growInBounds(const UsedInt &basis,SeqPos amount){ UsedInt bounds(getSubSeqBounds(basis.start)); return UsedInt(max(bounds.start,basis.start-amount),min(bounds.stop,basis.stop+amount)); } BaseIterator Sequence::iterate(SeqPos at){ return BaseIterator(this,at); } pair Sequence::iterate(const UsedInt &at){ pair r(BaseIterator(this,at.start),BaseIterator(this,at.stop)); if(at.start<0) swap(r.first,r.second); return r; } //mutator void BitSequence::invert(){ //flips to reverse comp sequence for(word i=0;ifwd);assert(seq->fwd->_words); if(opt.verbose)cout << "as mirror of fwd"<fwd->_words; //shares the same data, and we're done. } #ifdef MURASAKI_MPI #ifdef USE_SHM_SYSV if(opt.use_shm_sysv && seq){ if(opt.verbose)cout << "using sysv"; size_t size=word_count*sizeof(word); if(!opt.reverseOtf) size*=2;//fetch double the size because we're storing both fwd and rev in there //sysv segments are ephemereal and local to this mpi set, so we just use the filename as our key shmUsed=SHM_SYSV; if(reverse){ assert(seq->fwd->_words); //MUST allocate fwd before rev. 
if(opt.verbose)cout << " (continued from fwd allocation)"<fwd->_words+word_count; //second half of block } if(opt.verbose)cout << " (new segment)"<sysv_key=ftok(seq->baseFilename.c_str(),mpi_sysv_projid))==-1) throw MurasakiException("Error generating System V IPC key for "+string(seq->filename)+": "+strerror(errno)); if((seq->sysv_shmid=shmget(seq->sysv_key,size,IPC_CREAT | 00600))==-1) throw MurasakiException("Error creating System V IPC shared memory segment (size: "+humanMemory(size)+") for "+string(seq->filename)+": "+strerror(errno)); #else //create a unique key by magics if(mpi_myLocalRank==0){ if((seq->sysv_shmid=shmget(IPC_PRIVATE,size,IPC_CREAT | 00600))==-1) throw MurasakiException("Error creating System V IPC shared memory segment (size: "+humanMemory(size)+") for "+string(seq->filename)+": "+strerror(errno)); } MPI_Bcast(&seq->sysv_shmid,sizeof(key_t),MPI_BYTE,0,mpi_localhost_comm); #endif if((mem=(word*)shmat(seq->sysv_shmid,NULL,mpi_isHostLeader ? O_RDWR:(O_RDONLY | SHM_RDONLY)))==(void*)-1) throw MurasakiException("Error attaching to System V IPC shared memory segment for "+string(seq->filename)+strerror(errno)); cout << "Mapped a "<sysv_key)<<" shmid: "<sysv_shmid<<")"<sysv_shmid,IPC_RMID,NULL)<0) throw MurasakiException("Couldn't mark System V IPC shared memory region for deletion for sequence "+seq->name+strerror(errno)); if(mpi_isHostLeader) //keep track of how much mem is lost to sequence storage totalSequenceMemory+=((((size-1)/sys_pagesize)+1)*sys_pagesize)/1024; return mem; //wee. done. much cleaner than mmap. } #endif #ifdef USE_SHM_MMAP if(opt.use_shm_mmap && seq){ if(opt.verbose)cout << "using mmap"<fwd); shmUsed=SHM_MMAP_RO; //we'll pretend to be RO. return 0; } //prepare to write my bitsequence to file if needed fs::path bitFile(genBitFilename()); bool freshen=(!fs::exists(bitFile) || fs::last_write_time(bitFile)filename)); int fd; //actually we never need this again... 
if(freshen && ((opt.mmap_writePerHost && mpi_hostLeader[mpi_id]==mpi_id) || (mpi_id==0))){ cerr << "Writing 2bit data out to file..."< region(0,0); for(SeqPos i=0;i<_length;i++){ bit-=2; SeqPos inpi=reverse ? (_length-i-1):i; switch(str[inpi]){ case '1': w|=((word)3<filename<filename); loadBinary(binseq); }catch(SeqReadException e){ if(opt.verbose) cerr << "Binary load failed ("<filename<<" directly"<filename,opt.seqreadOptions); SequenceBinary binseq(reader); loadBinary(binseq); if(!binseq.save()) cerr << "#Warning: Save of binary sequence to "<filename+": "+e.reason()); } } }else{ cout << "Parsing "<filename<<" directly"<filename,opt.seqreadOptions); loadReader(reader); } }catch(SeqReadException e){ throw MurasakiException("Error reading "+s->filename+": "+e.reason()); } seqLoadTime.stop(); if(counted<1) throw MurasakiException("Murasaki can't use empty sequences ("+s->filename+")"); cout << "Read "< region(0,0),subSeqBounds(0,0); long prevSubSeqId=reader.getSubSeqId(); for(SeqPos i=0;i<_length;i++){ bit-=2; char c=reader.getc(); switch(c){ case 'A':case 'a': counters[0]++;counted++;matchable=true;break; case 'C':case 'c': w|=((word)1<=subSeqBounds.first); subSeqs.push_back(subSeqBounds); subSeqBounds.first=i+10; //this looks a bit hinky, but between every subseq there _should_ be exactly 10 Ns, so this accounts for that assert(c=='N'); //see? we're looking at an N! prevSubSeqId=reader.getSubSeqId(); } if(bit==0){ if(writeWords) _words[wordsStored++]=(reverse ? 
~w:w); bit=WORDSIZE; w=0; } } if(inRegion){ region.second=_length-1; matchRegions.push_back(region); } //finish up last subseq subSeqBounds.second=_length-1; assert(subSeqBounds.second>=subSeqBounds.first); subSeqs.push_back(subSeqBounds); if(bit_words;src>=_words;--src,++dst){ *dst=revCompWord(*src); if(bitOffset){//have to shuffle bits if(dst!=rev->_words){ //if this wasn't my first, i have to fix the one before me too *(dst-1)|=*dst>>bitOffset; } *dst<<=WORDSIZE-bitOffset; } } //and if we're doing the mmap grunt work, we have to write this out. #ifdef USE_SHM_MMAP if(shmUsed==SHM_MMAP_RW && !opt.reverseOtf) rev->mmap_msync(); #endif } //have to flip regions too #ifdef MURASAKI_MPI if(!(matchRegions.hasShm() && mpi_myLocalRank!=0)){ #endif cout << " <-> Reversing metdata." << endl; rev->matchRegions.reserve(matchRegions.size()); // cout << "(Filling "<matchRegions.size()<<" / "<matchRegions.capacity()<<" array with "<matchRegions.size()<<" == #"<<(i-matchRegions.begin())<<" ~ "<first<<","<second<matchRegions.push_back(pair(_length-i->second-1,_length-i->first-1)); } // cout << "(Reversing array of "<matchRegions.size()<<" at "<<(void*)(&(*rev->matchRegions.begin()))<<" ending "<<(void*)(&(*rev->matchRegions.end()))<matchRegions.begin(),rev->matchRegions.end()); //oops. i eclipsed std::reverse with my variable name. //and subsequences rev->subSeqs.reserve(subSeqs.size()); for(SeqPosPairArray::iterator i=subSeqs.begin();i!=subSeqs.end();++i) rev->subSeqs.push_back(pair(_length-i->second-1,_length-i->first-1)); std::reverse(rev->subSeqs.begin(),rev->subSeqs.end()); //oops. i eclipsed std::reverse with my variable name. #ifdef MURASAKI_MPI } if(matchRegions.hasShm()){ rev->matchRegions.localSync(); rev->subSeqs.localSync(); } #endif globalCounted+=counted; //add this to globalCounted or we'll get confused about our sequence lengths. 
return rev; } void BitSequence::mmap_msync(){ #ifdef USE_SHM_MMAP assert(shmUsed==SHM_MMAP_RW); cout << "Flushing mmap'd region to disk."<filename:string("(anon)"))+strerror(errno)); #endif break; default: assert(cerr << "Impossible case." << endl && 0); } } } int HashMethod::maxSources(){ vector members; vector tmp; transform(inputs.begin(),inputs.end(),back_insert_iterator >(tmp),select1st >()); sort(tmp.begin(),tmp.end()); unique_copy(tmp.begin(),tmp.end(),back_insert_iterator >(members)); int score=0; for(vector::iterator i=members.begin();i!=members.end();i++){ if(!popmap.count(*i)) //new to the area? score+=(popmap[*i]=popCount(seq->readWord(*i))); else score+=popmap[*i]; } return score; } string HashMethod::srcSetToString(const set &a){ ostringstream r; for(set::const_iterator i=a.begin();i!=a.end();++i) r << *i << ";"; return r.str(); } double HashMethod::fitnessCheck(){ //calculate entrop based on pattern and inputs int targetBases=(hash_bits+1)/2; used.clear();entropy.clear(); //reset state entropy.resize(targetBases,0); used.resize(seq->length(),0); size_t srcCount=used.size(),sinkCount=entropy.size(),patWeight=seq->_hashLength; vector > entropySrcToSink(srcCount),entropySinkToSrc(sinkCount); active=0; for(InputList::iterator i=inputs.begin();i!=inputs.end();i++){ word w=seq->readRawWord(i->first); if(i->second > 0) w=w>>(i->second*2); else w=w<<(-i->second*2); w&=hash_mask; for(unsigned b=0;bfirst*WORDSIZE/2+ //word this input is selecting (WORDSIZE/2-1-i->second)- //b=0 would be operating on b; if(srcbase < 0 || srcbase>=(int)used.size()) { continue; } if(w & (word)(3L<<(b*2L))){ //damn int literals assert(srcbaselength() && srcbase>=0); assert(seq->readBase(srcbase)==3); entropySrcToSink[srcbase].insert(b); entropySinkToSrc[b].insert(srcbase); active|=3<<(b*2); used[srcbase]++; } } } //if any any sink uses more than half of the sourcs, flip them so we count the UNUSED ones instead for(unsigned sink=0;sinkpatWeight){ //need to flip set 
realSrcs(entropySinkToSrc[sink]); for(set::iterator src=realSrcs.begin();src!=realSrcs.end();++src){ entropySrcToSink[*src].erase(sink); entropySinkToSrc[sink].erase(*src); } for(unsigned src=0;srcreadCode(src)){ //is in the effective set entropySrcToSink[src].insert(sink); entropySinkToSrc[sink].insert(src); } } } } int totalUsed=0; for(size_t i=0;isink map: "; for(unsigned i=0;i" << "{"; for(set::iterator sinki=entropySrcToSink[i].begin();sinki!=entropySrcToSink[i].end();++sinki) cout << *sinki<<","; cout << "} "; } cout << endl; */ if(opt.hasherFairEntropy){ totalCorrelationPenalty=0; for(unsigned sink=0;sink1){ //sum of distance pair squares long distSum=0,pairs=0; for(set::iterator srcA=entropySinkToSrc[sink].begin();srcA!=entropySinkToSrc[sink].end();++srcA){ set::iterator srcB=srcA; for(++srcB;srcB!=entropySinkToSrc[sink].end();++srcB){ distSum+=(*srcA-*srcB)*(*srcA-*srcB); // cout << " dist @"<::iterator srcA=entropySinkToSrc[sink].begin();srcA!=entropySinkToSrc[sink].end();++srcA){ double weight=(1.0-correlationPenalty)/(double)used[*srcA]; entropy[sink]+=weight; } } //All done in time o(entropy.size()) O(used.size()*entropy.size()) }else{ //now that we've figured out the src->sink graph, assign each sink at least one src. 
//do low degree sinks first so we don't undercount vector > sinkDegrees; for(unsigned i=0;i "<::iterator altsinki=entropySrcToSink[src].begin();altsinki!=entropySrcToSink[src].end();){ set::iterator nextAltsinki=altsinki; ++nextAltsinki; int altsink=*altsinki; assert(sinkDegrees[entropySinkToSrc[altsink].size()].count(altsink)==1); sinkDegrees[entropySinkToSrc[altsink].size()].erase(altsink); entropySinkToSrc[altsink].erase(src); entropySrcToSink[src].erase(altsink); // cout << "Pruning "< "<hashLength()); //kill entropy from any redundant source sets set usedSrcSets; for(int sink=0;sink<(int)entropy.size();sink++){ string srcSet(srcSetToString(entropySinkToSrc[sink])); pair::iterator, bool> res(usedSrcSets.insert(srcSet)); if(!res.second)//already existed entropy[sink]=0; } //evaluate entropy entropyHi=entropy.front(),entropyLo=entropy.front(),entropyTotal=0; empties=0; int J=0; for(vector::iterator i=entropy.begin();i!=entropy.end();i++,J++){ entropyTotal+=*i; if(*i<.4){ // cout << "empty @"<(entropyHi,*i); entropyLo=min(entropyLo,*i); } entropyMean=(double)entropyTotal/(double)entropy.size(); double variance=0; for(vector::iterator i=entropy.begin();i!=entropy.end();i++) variance+=sqr((double)(*i)-entropyMean); variance/=entropy.size(); entropyStd=sqrt(variance); /* cout << *this << ": "; for(unsigned i=0;i_hashLength; //basically we estimate how much entropy to aim for based on how many entropy sources are available fitness=entropyTotal //overall entropy is a good thing -((double)inputs.size()/(double)WORDSIZE) //using more inputs is to a certain extent a bad thing, but hardly the end of the universe. 
-max(1,x/WORDSIZE/2.0+entropyAgro*log(x)*sqrt(x)) //target entropy scaling function, asymtotically sqrt(x) like, weighted to be at about x/2 by 1000-ish -(double)(empties*entropy.size()*entropy.size()) //empties are evil -(entropyStd) //hopefully we're spreading entropy evenly over the whole key (or else we're wasting cycles) // -(empties==0 && targetBases==seq->_hashLength ? totalUsed-seq->_hashLength:0) //special case that we have exactly as many input as outputs, if we don't have any empties prefer to use sources as few times as possible (ie: minimize reuse of sources) ; return fitness; } void BitSequence::randomHashFunc(int n){ cout << "Generating random hash function with "<fitnessCheck(); cout << "Random uses "<sources<<"/"<<(hashLength())<<" bases and has "<empties<<" empties from "<inputlist().size()<<" inputs."<fitness<entropyHi<<" lo: "<entropyLo<< " total: "<<((double)hasher->entropyTotal)<<" bases (mean: "<entropyMean<<" stddev: "<entropyStd<<")"<totalCorrelationPenalty<prettyPrint(); } void BitSequence::compileHashFunc() { assert(isPattern); vector funcs; for(int c=0;c<100;c++){ //spawn 100 population funcs.push_back(HashMethod(this)); } int cycle,lastImprovedCycle=0; const int targetCycles=opt.hasherTargetGACycles; double improved=0; sort(funcs.begin(),funcs.end()); double worstScore=funcs.front().fitness; double bestScore=funcs.back().fitness; ticker.reset(targetCycles); for(cycle=0; bestScore0) lastImprovedCycle=cycle; } ticker.done(); cout << "Acceptable hash function found after "<finalize(); cout << "Best uses "<sources<<"/"<<(hashLength())<<" bases and has "<empties<<" empties from "<inputlist().size()<<" inputs."<fitness<<" (cf. 
worst: "<entropyHi<<" lo: "<entropyLo<< " total: "<<((double)hasher->entropyTotal)<<" bases (mean: "<entropyMean<<" stddev: "<entropyStd<<")"<totalCorrelationPenalty<prettyPrint(); } void HashMethod::finalize(){ removeDuplicates(); pruneUseless(); fitnessCheck(); } void HashMethod::removeDuplicates(){ InputList tmp; sort(inputs.begin(),inputs.end()); unique_copy(inputs.begin(),inputs.end(),insert_iterator(tmp,tmp.begin())); inputs=tmp; } string HashMethod::prettyPrint(){ //each line starts with input tag int leaderSize=10; string spacer=repString(" ",leaderSize); string str=spacer; // |0 |32 |64 static const char alphabet[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; //the base64 alphabet static const int alphasize=sizeof(alphabet)-1; assert(alphasize==64); int lineWidth=70; int hashLen=((int)hash_bits+1)/2; //only show base positions int padLeft=(lineWidth-hashLen)/2; //draw hash positions if(hashLen>=10){ //extra tens digit line str+=repString(" ",padLeft); for(int p=hashLen-1;p>=0;p--){ if(p>=10) str+=dstring((int)(p/10)%10); else str+=" "; } str+="\n";str+=spacer; } str+=repString(" ",padLeft); for(int p=hashLen-1;p>=0;p--) str+=dstring((int)p%10); str+="\n";str+=spacer; str+=repString(" ",padLeft); str+=repString("=",hashLen); str+="\n"; //draw each input HashMethod::InputList::const_iterator last=inputs.end(); last--; for(HashMethod::InputList::const_iterator i=inputs.begin();i!=inputs.end();i++){ char leader[50]; sprintf(leader,"w[%2d]%s%2d:",i->first,(i->second>0 ? ">>":"<<"),abs(i->second)); leader[leaderSize+1]='\0'; str+=leader; int linePos=0; int start=i->first*WORDSIZE/2+ //first base used in key (WORDSIZE/2-1-i->second)- hashLen+1; for(int p=start-padLeft;plength() && linePosfirst){ str+=" "; continue; } bool patOn=(seq->readBase(p)==3); if(linePos=padLeft+hashLen) str+=patOn ? 
"-":"_"; else { if(patOn) str+=alphabet[p%alphasize]; else str+="."; } } str+="\n"; } return str; } ostream& operator<<(ostream &os,const HashMethod &a){ HashMethod::InputList::const_iterator last=a.inputs.end(); last--; for(HashMethod::InputList::const_iterator i=a.inputs.begin();i!=a.inputs.end();i++){ if(i->second > 0) os << "w["<first<<"]>>"<second; else if(i->second < 0) os << "w["<first<<"]<<"<<0-i->second; else os << "w["<first<<"]"; if(i!=last) os << " ^ "; } os << " {"<readRawWord(w); //try all possible alignments multimap score2shift; int bestscore=0; for(int shift=1-WORDSIZE/2;shift0) score=popCount( ((pat>>shift*2)) & hash_mask); else score=popCount( ((pat<<-shift*2)) & hash_mask); bestscore=max(score,bestscore); score2shift.insert(pair(score,shift)); } //choose randomly from among the non-zero scoring options vector options; transform(score2shift.lower_bound(1),score2shift.end(), insert_iterator >(options,options.begin()), select2nd::value_type>()); if(options.empty()) return randint(WORDSIZE)-WORDSIZE/2; //wtf? 
return options[randint(options.size())]; } void HashMethod::pruneUseless(){ vector toErase; for(InputList::iterator i=inputs.begin();i!=inputs.end();i++){ word w=seq->readRawWord(i->first); if(i->second>=0) w=w>>(i->second*2); else w=w<<(-i->second*2); w&=hash_mask; if(!popCount(w)) toErase.push_back(i); } for(vector::iterator i=toErase.begin(); i!=toErase.end();i++) inputs.erase(*i); } void HashMethod::addWord(){ int w; if(!unusedWords.empty()){ int wi=randint(unusedWords.size()); w=unusedWords[wi]; unusedWords.erase(unusedWords.begin()+wi); }else{ w=randint(seq->word_count); } inputs.push_back(pair(w,align(w))); //add a word } void HashMethod::removeWord(){ int wi=randint(inputs.size()); // int w=inputs[wi].first; inputs.erase(inputs.begin()+wi); //pull off inputs //pretend it never existed because we don't want force it to be pushed back on // unusedWords.push_back(w); } void HashMethod::mutate(){ // cerr << "Mutating "<1){ //randomly kill a word removeWord(); }else{ //bump a random word in a random direction int w=randint(inputs.size()); int shift=inputs[w].second+(random()&1) ? 
1:-1; if(shift>=WORDSIZE/2) shift-=WORDSIZE; //wraparound else if(shift<=-WORDSIZE/2) shift+=WORDSIZE; inputs[w].second=shift; } finalize(); // cerr << "->" << *this<< " new score: "< "< usedWords; transform(inputs.begin(),inputs.end(),insert_iterator >(usedWords,usedWords.begin()),select1st >()); for(word w=0;wword_count;w++) if(!usedWords.count(w)) unusedWords.push_back(w); // mutate(); finalize(); } HashMethod::HashMethod(BitSequence *s) : seq(s),active(0) { cout.flush(); for(word w=0;wword_count;w++) unusedWords.push_back(w); addWord(); fitnessCheck(); } HashMethod::HashMethod(BitSequence *s,int n) : //random hash func for n words seq(s),active(0) { if(n){ //craft a random hash function with exactly n inputs cout << "# Generating random hash function with "< > usedInps; while((int)inputs.size() inp(randint(s->wordCount()),randint(WORDSIZE-1)-WORDSIZE/2+1); if(!usedInps.count(inp)){ usedInps.insert(inp); inputs.push_back(inp); } } }else{ //randomly selected hash function out of all possible hash functions cout << "# Selecting hash function purely at random."<wordCount();i++) for(int shift=1-WORDSIZE/2;shift<=WORDSIZE/2-1;shift++) if(random() & 1) inputs.push_back(pair(i,shift)); } } bool operator==(const BitSequence& a,const BitSequence& b){ for(word i=0;i0 ? seq()->fwd:seq()->rev;} word Hash::sizeAt(const HashKey key,const HashVal &a){ //universally functional sizeAt...(slow. should be specialized for each hash) if(emptyAt(key)) return 0; Location aloc(a); Window awin(aloc,hashpat); list fulllist; mfh->getMatchingSets(key,fulllist); for(list::iterator seti=fulllist.begin();seti!=fulllist.end();++seti){ LocList &l=*seti; SeqIdx baseSeq=0; while(l[baseSeq].empty() && baseSeq=seq_count){ assert((cerr << "Empty loclist??") && 0); continue; //wtf? how'd that happen? } Location &loc=l[baseSeq].front(); Window locwin(loc,hashpat); if(locwin.equals(awin)){ //this is the one we care about! 
word sum=0; for(SeqIdx si=0;si histo(1,0); for(word base=0;base<(word)hash_size;base++){ //oh this is fun... if(emptyAt(base)) continue; usedBuckets++; unsigned size=sizeAt(base); if(size>=histo.size()) histo.resize(size+1); histo[size]++; } if(of!=cerr) for(unsigned i=1;i3){ //super freaking detailed LocList ll(seq_count); lookup(base,ll); //fetch all locations... vector locs; back_insert_iterator > ii(locs); for(int i=0;i::iterator i=locs.begin(); Location last(*i); word count=1; usedPats++; //we got at least 1 new one do{ i++; if(i==locs.end()) break; Location here(*i); if(lastworstPat) worstPat=count; count=1; last=here; usedPats++; } count++; } while(1); if(count>worstPat) worstPat=count; of << base << "\t" << Window(last,hashpat).prettyString() << "\t" << count << endl; }else of << base<<":"<sanityCheck()); } } return 1; } bool AnchorSet::sanityCheck() const { for(unsigned i=0;i &out){ bool res=false; for(usedItree::range_iterator i(used[seq].in_range(a));i!=used[seq].end();++i){ out.insert(*i); res=true; } return res; } bool UsedMap::fetchNearbyOverlaps(int seq,const UsedInt &a,SeqPos maxgrow,set &out){ bool res=false; UsedInt bounds(maxgrow ? seqs[seq]->growInBounds(a,maxgrow):a); for(usedItree::range_iterator i(used[seq].in_range(bounds)); i!=used[seq].end();++i){ out.insert(*i); res=true; } return res; } bool UsedMap::alreadyExists(const IntervalSet& a){ set matches,temp,context; int i=0; while(isZero(a.spaces[i])){ i++; assert(i::iterator ite=context.begin(); ite!=context.end(); ite++){ AnchorSet *s=*ite; if(s->contains(a)){ winner=true; if(opt.retainMembers){ //if we're keeping track of members, this counts as a hit. for(HashCount::const_iterator i=a.members.begin();i!=a.members.end();i++) s->members[i->first]+=i->second; } } } return winner; } bool UsedMap::merge(IntervalSet& a){ // assert(a.sanityCheck()); set matches,temp,context; SeqPos joinDist=opt.joinNeighbors ? 
opt.joinDist:1; int i=0; while(isZero(a.spaces[i])){ i++; assert(i::iterator i=context.begin(); i!=context.end(); i++){ if(!a.colinear(**i,joinDist)){ continue; } assert((*i)->sanityCheck()); // cout << "Merging: "<(a)<<")"<spaces.size();i++){ // assert(cerr << "Erasing seq "<spaces[i])<<" from: "<spaces[i]); // assert(sanityCheck()); } // assert(sanityCheck()); delete a; // assert(sanityCheck()); } bool AnchorSet::contains(const IntervalSet &a) const { if(opt.gappedAnchors){ for(unsigned i=0;i(abs(a.stop-b.start),abs(a.start-b.stop)); } OverlapSense UsedInt_overlaps(UsedInt a,UsedInt b){ assert(!isZero(a)); assert(!isZero(b)); if(isRev(a)){ //work in standardized coordinate space (ugh, so crazy) UsedInt_rawInvert(a); UsedInt_rawInvert(b); } if(!a.overlaps(b)) return OVERLAP_NONE; //absolutely no overlap if(opt.gappedAnchors) //if we relied on strict matching overlap types under gapped anchor rules the order of anchor merges would make a difference resulting in icky non-determinism return ~OVERLAP_NONE; //we don't know what kind of overlap, but it's definitely "not none" OverlapSense res=OVERLAP_NONE; //flush cases are both AB and BA if(UsedInt_contains(b,a))res|=OVERLAP_AINB; if(UsedInt_contains(a,b))res|=OVERLAP_BINA; if(a.start<=b.start) res|=OVERLAP_START_AB; if(b.start<=a.start) res|=OVERLAP_START_BA; if(a.stop<=b.stop) res|=OVERLAP_STOP_AB; if(b.stop<=a.stop) res|=OVERLAP_STOP_BA; // assert(cout << a << " overlaps " << b << " "<< OverlapSense2str(res) << endl); return res; } string OverlapSense2str(OverlapSense o){ ostringstream res; vector bits; if(o==OVERLAP_NONE) bits.push_back("nowhere"); else if(o==~OVERLAP_NONE) bits.push_back("somewhere"); else { if(o & OVERLAP_AINB && o & OVERLAP_BINA) bits.push_back("perfectly"); else { if(o & OVERLAP_AINB) bits.push_back("a in b"); else if(o & OVERLAP_BINA) bits.push_back("b in a"); else { if(o & OVERLAP_START_AB && o & OVERLAP_START_BA) bits.push_back("start_flush"); else{ if(o & OVERLAP_START_AB) 
bits.push_back("start_ab"); if(o & OVERLAP_START_BA) bits.push_back("start_ba"); } if(o & OVERLAP_STOP_AB && o & OVERLAP_STOP_BA) bits.push_back("stop_flush"); else{ if(o & OVERLAP_STOP_AB) bits.push_back("stop_ab"); if(o & OVERLAP_STOP_BA) bits.push_back("stop_ba"); } } } } if(o & COLINEAR_AB) bits.push_back("colinear_ab"); if(o & COLINEAR_AB) bits.push_back("colinear_ba"); assert(!bits.empty()); //i should have taken care of all everything possible... copy(bits.begin(),bits.end(),ostream_iterator(res,",")); return res.str(); } OverlapSense IntervalSet::overlaps(const AnchorSet &a) const { OverlapSense ref=OVERLAP_NONE; bool first=true; // cout << "Testing overlap of "<<*this<<" with "< IntervalSet::gapOffset(const AnchorSet &a) const { SeqPos offset; uint i=0; while(isZero(spaces[i])){ assert(isZero(a.spaces[i].key())); i++; assert(igetSubSeqBounds(spaces[i].start).start!=seqs[i]->getSubSeqBounds(a.spaces[i].key().start).start){ // cout << spaces[i]<<" and "<(true,maxSeqPos); } // cout << " Offset "<getSubSeqBounds(spaces[i].start).start!=seqs[i]->getSubSeqBounds(a.spaces[i].key().start).start){ // cout << spaces[i]<<" and "<(true,maxSeqPos); } // cout << " Offset "<(true,maxSeqPos); } return pair(false,offset); } bool IntervalSet::hasGaps(const AnchorSet &a) const { return gapOffset(a).first; } ColinearSense IntervalSet::colinear(const AnchorSet &a,SeqPos maxDist) const { OverlapSense res=overlaps(a); if(!opt.gappedAnchors && hasGaps(a)){ return COLINEAR_NONE; } if(res){ //amend with colinear details if(res & (OVERLAP_START_AB | OVERLAP_STOP_AB | OVERLAP_AINB | OVERLAP_BINA)) res|=COLINEAR_AB; if(res & (OVERLAP_START_BA | OVERLAP_STOP_BA | OVERLAP_AINB | OVERLAP_BINA)) res|=COLINEAR_BA; return res; } if(opt.gappedAnchors){ res=COLINEAR_AB|COLINEAR_BA; //one of those...dunno+dont care which //possible non-overlapping colinearity for(unsigned i=0;igetSubSeqBounds(spaces[i].start).start!=seqs[i]->getSubSeqBounds(a.spaces[i].key().start).start) return COLINEAR_NONE; 
if(opt.gappedAnchors){ //if using gaps, everything just has to be "close enough". actual colinearity is ignored if(UsedInt_distance(spaces[i],a.spaces[i].key())>maxDist) return COLINEAR_NONE; } } return res; }else{ for(uint i=0;imaxDist){ return COLINEAR_NONE; } //because we're gappless, all distances must be the same return (COLINEAR_AB|COLINEAR_BA); //not sure which. doesn't matter though. } } assert(0); //control doesn't reach here. really. return COLINEAR_NONE; //but just to quiet the compiler... } bool UsedInt_contains(const UsedInt &outer,const UsedInt &inner){ return outer.start<=inner.start && outer.stop>=inner.stop; } void IntervalSet::coalesce(const AnchorSet &a){ for(size_t i=0;ifirst]+=i->second; } void UsedInt_coalesce(UsedInt &x,const UsedInt &a){ if(isZero(x) || isZero(a)) return; x.start=x.start<=a.start ? x.start:a.start; x.stop=x.stop>=a.stop ? x.stop:a.stop; } UsedInt UsedInt_coalesced(const UsedInt &a,const UsedInt &b){ assert(!isZero(a)); assert(!isZero(b)); //no forcing coalesces of non-intervals return UsedInt(a.start<=b.start ? a.start:b.start, a.stop>=b.stop ? a.stop:b.stop); } inline UsedInt UsedInt_grown(const UsedInt &a,const SeqPos amt){ if(isZero(a)) //no growing non-intervals return a; if(a.start>0) return UsedInt(max(a.start-amt,1),a.stop+amt); else return UsedInt(a.start-amt,min(a.stop+amt,-1)); } UsedInt UsedInt_inverted(const UsedInt &a,const Sequence& s){ if(isZero(a)) return a; SeqPos length=s.length(); if(a.start>0) //ugh. that took way more reverse engineering than it should have. return UsedInt(0-(length-a.start+1),0-(length-a.stop+1)); else return UsedInt(a.start+length+1,a.stop+length+1); } UsedInt& UsedInt_invert(UsedInt &a,const Sequence& s){ if(isZero(a)) return a; SeqPos length=s.length(); if(a.start>0){ //ugh. that took way more reverse engineering than it should have. 
a.start=0-(length-a.start+1); a.stop=0-(length-a.stop+1); }else{ a.start=a.start+length+1; a.stop=a.stop+length+1; } return a; } UsedInt& UsedInt_rawInvert(UsedInt &x){ //use with care (most of the time sequence coordinates need to be inverted with the sequence-aware form above) swap(x.start,x.stop); x.start=-x.start; x.stop=-x.stop; return x; } bool UsedInt_sanityCheck(const UsedInt &a){ assert(a.start <= a.stop); return true; } AnchorSet::AnchorSet(const IntervalSet &a, UsedMap &usedMap) : members(a.members) //if opt.retainMembers is off, this should be empty anyway { spaces.reserve(seq_count); for(size_t i=0;i0 ? a.pos:a.pos-length, a.pos>0 ? a.pos+length:a.pos)); else //it's a blank in an island spaces.push_back(UsedInt(0,0)); // assert(cout << "Added "<hashpat; word wc=pat->wordCount(); SeqPos aPos=bitSeqCoords(a.pos),bPos=bitSeqCoords(b.pos); for(word wi=0;wireadWord(wi); word aw=(as->wordAtBase(aPos) & pw), bw=(bs->wordAtBase(bPos) & pw); if(aw < bw){ return true; } else if(aw > bw){ return false; } } return false; } bool operator<(const Window &a,const Window &b){ for(SeqPos frame=0;framereadWord(frame); word resa=a.buffer[frame] & pword; word resb=b.buffer[frame] & pword; if(resa0 ? seqs[baseSeq]->fwd : seqs[baseSeq]->rev,*tSeq; SeqPos mstart=bitSeqCoords(spaces[baseSeq].start),tstart; SeqPos mstop=bitSeqCoords(spaces[baseSeq].stop),tstop; SeqPos right=mySeq->length()-mstop, left=mstart; assert(mstart>=0); for(unsigned i=baseSeq+1;i0 ? 
seqs[i]->fwd : seqs[i]->rev; tstart=bitSeqCoords(spaces[i].start); tstop=bitSeqCoords(spaces[i].stop); if(spaces[i].stop<0) swap(tstart,tstop); //sneaky devils assert(tstart>=0); right=min(tSeq->length()-tstop-1,min(right,mySeq->cmpRight(mstop+1,*tSeq,tstop+1))); left=min(tstart,min(left,mySeq->cmpLeft(mstart-1,*tSeq,tstart-1))); } if(right || left){ // assert(cout << *this << "-> Extending "<0) spaces[i].stop+=right; else spaces[i].start-=right; } if(left) for(unsigned i=0;i0) spaces[i].start-=left; else spaces[i].stop+=left; } // assert(cout << *this<<"<--"< leftEdges,rightEdges; for(size_t i=0;igetSubSeqBounds(spaces[i].start)); SeqPos leftPos=isRev(spaces[i]) ? spaces[i].stop:spaces[i].start; SeqPos rightPos=isRev(spaces[i]) ? spaces[i].start:spaces[i].stop; leftEdges.push_back(seqs[i]->iterate(leftPos)); rightEdges.push_back(seqs[i]->iterate(rightPos)); maxExtendLeft=min(maxExtendLeft,isRev(spaces[i]) ? bounds.stop-spaces[i].stop:spaces[i].start-bounds.start); maxExtendRight=min(maxExtendRight,isRev(spaces[i]) ? spaces[i].start-bounds.start:bounds.stop-spaces[i].stop); } SeqPos exRight=0,exLeft=0; Score scoreRight=0,scoreLeft=0; // assert(cout << "** Checking fuzzyExtend on "<<*this<<" up to "< Fuzzy Extending by "< r(seqs[si]->iterate(spaces[si])); while(r.first!=r.second) os << baseToString(*(r.first++)); if(si+1!=spaces.size()) os << delim; } return os.str(); } Score IntervalSet::score(){ if(opt.gappedAnchors)return 0; //no scoring for gapped intervals. too crazy. Score baseScore=0; vector column; vector > pairScores; SeqPos width=0; for(size_t i=0;iiterate(spaces[i]).first); } for(SeqPos i=0;i::iterator r=column.begin(); r!=column.end();++r){ ++(*r); } } if(opt.scoreByMinimumPair) return minimumPairScore(pairScores); return baseScore; } bool operator==(const Location &a,const Location &b) { BitSequence *mySeq=a.pos>0 ? a.seq()->fwd : a.seq()->rev, *tSeq=b.pos>0 ? 
b.seq()->fwd : b.seq()->rev; SeqPos mstop=bitSeqCoords(a.pos), tstop=bitSeqCoords(b.pos); return mySeq->equal(mstop,*tSeq,tstop,*mfh->hashpat); } ostream& operator<<(ostream &os, const Window &a){ for(SeqPos i=0;igetId()<<","<extCoords(a.pos)<<")"; } void BitSequence::maskString(string &str) const{ SeqPos j=0; for(word i=0;i>=2; } } } string wordToMaskedString(word w,word p,int bits){ string foo(wordToString(w,bits)); assert((int)foo.length()<=bits); SeqPos j=0; word mask=highN(2); while(mask && j<(SeqPos)foo.length()){ if(!(p & mask)) foo[j]='.'; j++; mask>>=2; } assert((int)foo.length()<=bits); return foo; } string Window::toRawString(){ string ret; for(unsigned i=0;ilength()); return ret; } string Window::prettyString(){ string ret(toRawString()); pat->maskString(ret); size_t len=ret.length(),patlen=min(pat->length(),hash_bits/2); switch(opt.quick_hash){ case 2: for(size_t i=patlen;ihasher->used[i]) ret[i]=tolower(ret[i]); break; } return ret; } void Window::slide(){ pos+=2; SeqPos i; for(i=0;i>(WORDSIZE-2)); } buffer[i]<<=2; //this should be done at the end. thus frame is always correct and no buffers are ever at empty if(pos/WORDSIZE!=frame){ frame=pos/WORDSIZE; buffer[frames-1]=seq->readWord(frame+frames-1); } } void Window::slideWord(){ //update to new positions pos+=WORDSIZE; frame++; SeqPos i; for(i=0;ireadWord(frame+frames-1); SeqPos offset=MODWORDSIZE(pos); if(offset) //[i-1] needs to get the offset bits that had been chopped off of the previous [i] buffer[i-1]|=buffer[i]>>(WORDSIZE-offset); //and [i] needs to not duplicate those on a subsequent slide... buffer[i]<<=offset; } void Window::slide(SeqPos dist){ //bah. 
this function isnt actually used so, dont care that it's not as efficient as possible while(dist>WORDSIZE){ slideWord(); dist-=WORDSIZE; } while(dist>0){ slide(); dist-=2; } } void Window::initslide(SeqPos dist){ assert(dist<=WORDSIZE-(MODWORDSIZE(pos))); pos+=dist; SeqPos i; for(i=0;i>(WORDSIZE-dist)); } buffer[i]<<=dist; if(pos/WORDSIZE!=frame){ frame=pos/WORDSIZE; buffer[frames-1]=seq->readWord(frame+frames-1); } } bool Window::equals(const Window &a) const { assert(pat==a.pat); //dont be dumb if(pos+pat->bit_count>seq->bit_count) //overruns end of seq return false; for(SeqPos i=0;ireadWord(i)) ^ (a.buffer[i] & pat->readWord(i)); if(result) return false; //a bit's amiss! } return true; } inline word Window::eqbases(const Window &a) const { return ~(buffer[0] ^ a.buffer[0]); } Window::Window(const UsedInt &a,const Sequence *s,BitSequence *p) : seq(a.start>0 ? s->fwd:s->rev),pat(p),pos(bitSeqCoords(a.start>0 ? a.start:a.stop)*2), frames(p->word_count+1),buffer(frames) { SeqPos stop=pos; frame=pos/WORDSIZE; pos=frame*WORDSIZE; for(SeqPos i=0;ireadWord(frame+i); if(pos!=stop) initslide(MODWORDSIZE(stop)); //scuttle right the leftover bits } Window::Window(BitSequence *_seq,SeqPos _pos,BitSequence *_pat): seq(_seq),pat(_pat),pos(_pos*2), frames(_pat->word_count+1),buffer(frames) { frame=pos/WORDSIZE; pos=frame*WORDSIZE; for(SeqPos i=0;ireadWord(frame+i); } if(pos!=_pos*2) initslide(MODWORDSIZE(_pos*2));//scuttle right the leftover bits } Window::Window(Location &v,BitSequence *p): seq(v.bitSeq()),pat(p),pos(bitSeqCoords(v.pos)*2), frames(p->word_count+1),buffer(frames) { SeqPos stop=pos; frame=pos/WORDSIZE; pos=frame*WORDSIZE; for(SeqPos i=0;ireadWord(frame+i); if(pos!=stop) initslide(MODWORDSIZE(stop)); //scuttle right the leftover bits } word Window::hash(){ word hash=hash_padding; word patpos=0; word filled=0,bit=WORDSIZE-2,frame=0; word patword=0,input; SeqPos lastpati=-1; switch(opt.quick_hash){ case 2: //naive hash: use first hash_bits bits 
while(filledbit_count){ if(pat->readCode(patpos)){ hash=hash<<2 | ((buffer[frame]>>bit) & 3); filled+=2; } patpos+=2; if(bit==0){ frame++; bit=WORDSIZE-2; }else{ bit-=2; } } break; case 1: //use first word only (usually very bad waste of hash) hash=buffer[0] & pat->readWord(0); break; case 4:case 5:case 6:case 7:case 8: //use cryptographic hash functions. seems unreasonable, but for comparison's sake, lets give it a shot assert(cryptoHasher); hash^=cryptoHasher->digest(buffer,pat); break; case 0: //adaptive hasher with s-boxes. Hopefully even smarter. for(HashMethod::InputList::const_iterator i=pat->hasher->inputlist().begin(); i!=pat->hasher->inputlist().end();i++){ if(i->first!=lastpati){ patword=pat->readWord(i->first); lastpati=i->first; } input=buffer[i->first] & patword; if(i->second==0) hash^=input; else { if(i->second>=0) hash^=(input>>(word)(i->second*2)); else hash^=(input<<(word)(i->second*2)); } } return sbox->confuse(hash & hash_mask); case 3:case 100: //adaptive hasher. freakin' smart (usually...) 
for(HashMethod::InputList::const_iterator i=pat->hasher->inputlist().begin(); i!=pat->hasher->inputlist().end();i++){ if(i->first!=lastpati){ patword=pat->readRawWord(i->first); lastpati=i->first; } input=(buffer[i->first] & patword); if(i->second==0) hash^=input; else { if(i->second>=0) hash^=(input>>i->second*2); else hash^=(input<second*2); } } } return hash & hash_mask; } ostream& operator<<(ostream &os, const AnchorSet &a){ return os << a.asString(); } string UsedMap::asString() { string ret; for(usedItree::iterator i=used[0].begin(); i!=used[0].end();i++){ ret+= (*i)->asString() + "\n"; } return ret; } ostream& operator<<(ostream &os,const pair &p){ return os << p.first << ":" << p.second; } ostream& UsedMap::writeTfidf(ostream &os){ if(used[0].empty()) return os; if(!dfCount) dfCount=makeDfCount(); for(usedItree::iterator i=used[0].begin(); i!=used[0].end();i++){ assert((*i)); AnchorSet *&s=*i; if(!opt.hitfilter || s->hitCount()>=opt.hitfilter) os << s->uniqueness(this) << endl; } return os; } ostream& UsedMap::writeScores(ostream &os){ if(used[0].empty()) return os; for(usedItree::iterator i=used[0].begin(); i!=used[0].end();i++){ assert((*i)); AnchorSet *&s=*i; if(!opt.hitfilter || s->hitCount()>=opt.hitfilter) os << s->score() << endl; } return os; } ostream& UsedMap::saveDetails(ostream &os,ostream &bitos){ if(used[0].empty()) return os; if(opt.tfidf && !dfCount) dfCount=makeDfCount(); for(usedItree::iterator i=used[0].begin(); i!=used[0].end();i++){ assert((*i)); AnchorSet *&s=*i; if(!opt.hitfilter || s->hitCount()>=opt.hitfilter){ os << s->asString() << "\t" << s->hitCount(); if(opt.retainMembers){ double score; if(opt.tfidf){ if(opt.gappedAnchors) score=s->uniqueness(this); else score=sqrt((s->uniqueness(this))*(s->score())); } else { score=s->score(); } os << "\t" << score << "\t"; HashCount::iterator last=s->members.end(); assert(!s->members.empty()); last--; for(HashCount::iterator i=s->members.begin();i!=last;i++) os << i->first << ":" << 
i->second <<","; os << last->first << ":" << last->second; } if(opt.bitscore && bitos!=cerr){ pair scores(s->bitscore()); bitos << scores.first << "\t" << scores.second << endl; } os << endl; } } return os; } int UsedMap::count() const { return used[0].size(); } ostream& UsedMap::writeOut(ostream &os){ if(used[0].empty()) return os; for(usedItree::iterator i=used[0].begin(); i!=used[0].end();i++){ assert(*i); if(!opt.hitfilter || (*i)->hitCount()>=opt.hitfilter) os << (*i)->asString() << "\n"; } //as these output sets can get huge, this might be a bad idea // return os << asString(); return os; } word* UsedMap::makeDfCount(){ size_t size; word *df; if(opt.tfidf){ assert(!dfCount); #ifdef MURASAKI_MPI if(mpi_capable) size=mpi_total_hash_size; else #endif size=hash_size; cout << "(Creating df table for "<members.begin();i!=s->members.end();i++) df[i->first]+=i->second; } return df; } return 0; } string AnchorSet::asString() const { string ret; for(unsigned i=0;iextCoords(a.stop))+string("\t")+ dstring(s->extCoords(a.start))+string("\t")+ string(((a.start<0) != (a.stop<0)) ? "?":(a.start>=0 ? "+":"-")); else return dstring(s->extCoords(a.start))+string("\t")+ dstring(s->extCoords(a.stop))+string("\t")+ string(((a.start<0) != (a.stop<0)) ? "?":(a.start>=0 ? "+":"-")); } double AnchorSet::uniqueness(UsedMap *context) const { double sum=0; #ifdef MURASAKI_MPI if(mpi_capable){ if(!dfCount)dfCount=context->makeDfCount(); if(dfCount) for(HashCount::const_iterator i=members.begin();i!=members.end();i++){ sum+=i->second*log(anchors->count()/dfCount[i->first]); } return sum; } #endif if(!opt.retainMembers){ //old way. takes into account non-anchored noise. not so cool for(HashCount::const_iterator i=members.begin();i!=members.end();i++){ // wi = tfi*log(D/dfi) by the money if(!mfh->sizeAt(i->first)){ throw MurasakiDebugException("What the monkey? 
"+dstring((long)i->first)+" isn't in hash?"); } sum+=i->second*log((double)usedBuckets/(double)mfh->sizeAt(i->first)); } }else{ //new way. separate counters. cleaner if(!dfCount)dfCount=context->makeDfCount(); if(!dfCount)return 0; for(HashCount::const_iterator i=members.begin();i!=members.end();i++){ if(!dfCount[i->first]) throw MurasakiDebugException("dfCount missing at "+dstring((long)i->first)+" in anchor "+(this->asString())); sum+=i->second*log(anchors->count()/dfCount[i->first]); } } return sum; } pair IntervalSet::entropy() const { double ret=0; vector windows; SeqPos shortest=0,shortlen=length(spaces[0]); for(int i=0;i<(int)spaces.size();i++){ windows.push_back(Window(spaces[i],seqs[i],mfh->hashpat)); if(shortlen>length(spaces[i])){ shortest=i; shortlen=length(spaces[i]); } } int counters[4],offset; for(SeqPos pos=0;pos=0 && pos>offset) & 3]++; //calculate (and add in) entropy for this column for(int b=0;b<4;b++) if(counters[b]){ double count=(double)counters[b]; ret+=count*log(count/globalBaseFreq[b]); } } for(vector::iterator ite=windows.begin();ite!=windows.end();ite++) ite->slideWord(); } return pair(ret,shortlen); } size_t AnchorSet::hitCount() const { size_t ret=0; for(HashCount::const_iterator i=members.begin();i!=members.end();++i) ret+=i->second; return ret; } Score AnchorSet::score() const { return IntervalSet(*this).score(); //theoretically it might be faster to have all the code here, but that's just stupid... } pair AnchorSet::bitscore() const { vector windows; SeqPos shortest=0,shortlen=length(spaces[0].key()); for(int i=0;i<(int)spaces.size();i++){ windows.push_back(Window(spaces[i].key(),seqs[i],globalPat)); if(shortlen>length(spaces[i].key())){ shortest=i; shortlen=length(spaces[i].key()); } } SeqPos eqbits=0; for(SeqPos pos=0;poslength(spaces[comp].key())) eq=(eq & (highN(2*(length(spaces[comp].key())-pos)))); eq=(((eq & alternating_mask)<<1) & eq); ret&=eq; } //count ret and add it up! 
eqbits+=popCount(ret); //shuffle everybody for(vector::iterator ite=windows.begin();ite!=windows.end();ite++) ite->slideWord(); } return pair(eqbits,shortlen); } string hashCacheName(string name, BitSequence *pat){ ostringstream os; char *n = strdup(name.c_str()); string fr; if ( !opt.skipFwd && !opt.skipRev ) fr = "both"; if ( !opt.skipFwd && opt.skipRev ) fr = "fwd"; if ( opt.skipFwd && !opt.skipRev ) fr = "rev"; string rm; if ( opt.repeatMask ) rm="r"; else rm="nr"; for(unsigned int i=0; iwordCount(); p++) os << hex << pat->readWord(p); os << "-" << fr; os << "-" << rm; os << "-" << dec << opt.rand_seed; os << "-" << hash_bits; free(n); return os.str(); } RepeatMap::RepeatMap() : clusters() {} void RepeatMap::add(const LocList &l){ clusters.push_back(l); } void RepeatMap::writeOut(ostream &os){ for(list::iterator i=clusters.begin();i!=clusters.end();++i){ LocList &l=*i; bool first=true; for(int s=0;sseq()->extCoords(lite->pos); os << dstring(p); ++lite; if(lite!=l[s].end()) os << " "; else{ os << endl; break; } }while(1); } os << endl; //extra new line on group end. 
} } IntervalSet::IntervalSet() : spaces() { spaces.reserve(seq_count); if(opt.retainMembers) members[activeHash]++; } IntervalSet::IntervalSet(const AnchorSet& a): spaces() , members(a.members) { spaces.reserve(seq_count); for(vector::const_iterator i=a.spaces.begin(); i!=a.spaces.end();++i) spaces.push_back(i->key()); } IntervalSet::IntervalSet(const _IntervalIterator &intStart, const _IntervalIterator &intStop, const _MemberIterator &memberStart,const _MemberIterator &memberStop){ spaces.reserve(seq_count); for(_IntervalIterator i=intStart;i!=intStop;++i) spaces.push_back(*i); for(_MemberIterator i=memberStart;i!=memberStop;++i) members.insert(*i); } ostream& operator<<(ostream& os,const IntervalSet& a){ os << "("; vector::const_iterator i=a.spaces.begin(); if(i==a.spaces.end()) return os << ")"; vector::const_iterator next=i; int idx; for(next++,idx=0;next!=a.spaces.end();next++,i++,idx++) os << anchorCoords(*i,seqs[idx]) << ","; return os << anchorCoords(*i,seqs[idx]) << ")"; } UsedInt anchorCoords(const UsedInt &b,Sequence *s){ if(b.start<0) return s->length()-b; return b; } UsedInt operator-(SeqPos a,const UsedInt& b){ return UsedInt(-1-(a+b.stop),-1-(a+b.start)); } string SeqPosPair2string(const SeqPosPair& a){ ostringstream os; os << "["<rev:seq->fwd),idx(bitSeqCoords(pos)),here(src->localRegionIte(idx)),prev(here),next(here) { assert(!src->matchRegions.empty()); if(prev!=src->matchRegions.begin()) --prev; else prev=src->matchRegions.end(); if(next!=src->matchRegions.end()) ++next; } BaseIterator::BaseIterator(const BaseIterator& a): rev(a.rev),src(a.src),idx(a.idx),here(a.here),prev(a.prev),next(a.next) { } BaseIterator& BaseIterator::operator++(){ shiftFwd(); return *this; } BaseIterator& BaseIterator::operator--(){ shiftBack(); return *this; } BaseIterator BaseIterator::operator++(int){ BaseIterator old(*this); shiftFwd(); return old; } BaseIterator BaseIterator::operator--(int){ BaseIterator old(*this); shiftBack(); return old; } bool 
BaseIterator::operator==(const BaseIterator &a) const{ return src==a.src && idx==a.idx; } bool BaseIterator::operator!=(const BaseIterator &a) const{ return !(*this==a); } BASE_TYPE BaseIterator::operator*() const { if(idxfirst || idx>here->second) return BASE_N; return src->readBase(idx); } void BaseIterator::shiftFwd(){ idx++; if(idx>here->second && next!=src->matchRegions.end() && idx>=next->first){ prev=here; here=next; ++next; } } void BaseIterator::shiftBack(){ idx--; if(idxfirst && prev!=src->matchRegions.end() && idx<=prev->second){ next=here; here=prev; if(here==src->matchRegions.begin()) prev=src->matchRegions.end(); else --prev; } } string BaseIterator::debugInfo(){ ostringstream os; os <genBitFilename(); os <<"@"<matchRegions.end() ? "??":SeqPosPair2string(*prev)) << " << "; os << SeqPosPair2string(*here) << " << "; os << (next==src->matchRegions.end() ? "??":SeqPosPair2string(*next)); return os.str(); } murasaki/src/openhash.h0000644000177700001440000000456111434752233014531 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ //////// // super-memory-miserly hash list storage // Kris Popendorf /////// #ifndef OPENHASH__H #define OPENHASH__H #include "ecolist.h" #include "bitmap.h" #include "sequence.h" #include "options.h" using namespace std; class OpenHash : public Hash { public: typedef Ecolist::val_type val_type; //abstract parts void clear(); void add(const HashKey key,const HashVal &val); void getMatchingSets(HashKey key,list &sets); void lookup(HashKey key,LocList &locList);//debug only bool emptyAt(const HashKey key); word sizeAt(const HashKey key); word sizeAt(const HashKey key,const HashVal &val); //for computing memory costs static const word linear_cost(word c) { return ((word)(Ecolist::seqbits+Ecolist::idxbits))*(c/8); } static const word bucket_prep_cost(word c) { return (word)(Ecolist::cost(0)*c); } //optional bits void writePerformanceData(string prefix); // dump / load may be overloaded // void dump(ostream &os); // void load(istream &is); //the boring junk OpenHash(BitSequence *pat); ~OpenHash(); //muchos boring inline static Location val2loc(const val_type& a){return Location(a.first,a.second);} inline static val_type loc2val(const Location& a){return val_type(a.seqno,a.pos);} static bool lessthan(const val_type& a,const val_type& b); protected: Ecolist *fasta; HashKey findAddr(const HashKey start,const HashVal &val); HashKey keysFree,keysFreeWarning; #ifdef HASHPROFILE map perfProbeHisto; size_t perfProbeCount,perfFindAddrCount; #endif public: static void init(SeqPos _seq_count,word _longestSeq,word _totalHashLength,SeqPos _hash_bits); }; #endif murasaki/src/wordsize.cc0000644000177700001440000000021711434752234014723 0ustar krispusers#include using namespace std; int main(){ cout << "ul: "< (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ #ifndef __DINKYMATH_H #define __DINKYMATH_H #include #include #include #include #include #include #include //need random() #include "timing.h" using namespace std; //int log2(int x); inline long ceil_log2_loop(long b){ int l=0; while(b>0){ b>>=1; l++; } return l; } inline long intlog(long x,long b){ int l=0; while(x>=b){ x/=b; l++; } return l; } inline unsigned long intpow(unsigned long base,unsigned long pow){ unsigned long b=base; for(unsigned long i=1;i inline T sqr(const T& a){ return a*a; } //int intpow(int base,int pow); inline int abs(int a) throw () {return a<0 ? -a:a;} string littleEndianStr(unsigned int n,int digits); string littleEndianPat(unsigned int n,int digits); string littleEndianStr(unsigned int n); template int writeOutVector(string filename,const vector &v,string delim,int,int,int skipZeros=0); string dstring(int); string dstring(long); string fstring(float); string dhstring(int i); string humanMemory(unsigned long bytes); string humanMemory(unsigned int bytes); string humanMemory(int bytes); string humanMemory(double bytes); string humanMemory(long bytes); string humanScale(double f); string percent(double,double); size_t progressTick(size_t i,size_t max); void writeOut(string filename,string content,bool append=false); class RandomTraverse{ public: int next(); int remaining(); RandomTraverse(int min,int max); protected: int min,max; set hit; }; class ProgressTicker { public: ProgressTicker(size_t _max); size_t tick(size_t i); inline size_t tick(){return tick(lastCall+1);} void reset(size_t _max); void done(); Timer started; protected: size_t max; size_t lineDelay,dotDelay,lineCount,dotCount,lastCall; }; const int 
bercent=1024; class EtaBar { public: EtaBar(int _max); inline int tick(); inline int tick(int to); protected: int max; int updateDelay,untilNextUpdate,lastCall; Timer started; }; unsigned gcd(unsigned a,unsigned b); unsigned lcm(unsigned a,unsigned b); template T sum(const vector &v){ T ret=0; for(typename vector::const_iterator i=v.begin();i!=v.end();++i) ret+=*i; return ret; } inline int randint(int max){ double x=((double)random())/(double)RAND_MAX; return (int)(x*(double)max); } template string joinStrings(const T &l,string d){ string r; for(typename T::const_iterator i(l.begin()); i!=l.end() ;++i){ typename T::const_iterator next(i); ++next; r+=*i; if(next!=l.end()) r+=d; } return r; } template bool writeMap(map m,string filename,string description){ ofstream of(filename.c_str()); if(of){ for(typename map::iterator i=m.begin();i!=m.end();++i) of << i->first << "\t" << i->second << endl; }else{ cerr << "Couldn't write "< (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ #include "ecolist.h" #include "murasaki.h" #include "dinkymath.h" #include #include using namespace std; long bigrand(){ return (long)(((unsigned long)(rand())<<32) + ((unsigned long)(rand()))); } ostream& operator<<(ostream& os,const pair &a){ return os << "(" << a.first << "," << a.second << ")"; } bool lt(const Ecolist::val_type& a,const Ecolist::val_type& b){ // return a.first==b.first ? 
a.second > v; pair keep; srand(0); for(long i=0;i entry(bigrand()%seqs,(bigrand()%(genomesize*2))-genomesize); while(entry.second==0){ entry.second=(bigrand()%(genomesize*2))-genomesize; } cout << "Adding: "< check(bob.getValAt(bob.getSize()-1)); if(entry!=check) cout << (entry == check ? "=":"*")<< "Compare "< check(bob.getValAt(i)); if(v[i]!=check){ cout << ">>>Entry: "< check(*ei); if(v[i]!=check){ cout << ">>>Entry: "<=0)){ cout << "!!!!!!!!!Ran over the end!"< check(*--bei); if(v[i]!=check){ cout << ">>>Entry: "<>>Searching for last inserted entry..."< range(bob.equal_range(keep)); cout << "<<>>Search for first entry..."< first=*ei; range=bob.equal_range(first); cout << "<<>>Search for something smaller than first entry..."<>>Search for last entry..."< final=*bei; range=bob.equal_range(final); cout << "<<>>Search for something bigger than any..."< (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ /////////////// //dinkymath.cc //for all those math functions you want all over and hate rewriting /////////// #include "dinkymath.h" #include "murasaki.h" #include #include #include #include #include #include using namespace std; string littleEndianStr(unsigned int n,int digits){ char str[200]; unsigned int mask=1; int last=0; for(int i=0;mask;i++,mask<<=1){ str[i]=(n & mask) ? 
'1':'0'; if(n & mask) last=i; } str[digits]=0; return string(str); } string littleEndianPat(unsigned int n,int digits){ char str[200]; unsigned int mask=1; int last=0; for(int i=0;mask;i++,mask<<=1){ str[i]=(n & mask) ? '1':' '; if(n & mask) last=i; } str[digits]=0; return string(str); } string littleEndianStr(unsigned int n){ return littleEndianStr(n,32); } template int writeOutVector(string filename,const vector &v,string delim,int from,int to,int skipZeros){ ofstream os(filename.c_str()); for(int i=from;i::iterator next=hit.upper_bound(n); assert(next!=hit.end()); n=*next; } hit.erase(n); return n; } int RandomTraverse::remaining(){ return hit.size(); } void writeOut(string filename,string content,bool append){ ofstream outf(filename.c_str(),append ? ios_base::out:(ios_base::out | ios_base::app)); outf << content; } unsigned gcd(unsigned a,unsigned b){ if(a==0 && b==0) return 0; //handy definition... unsigned t; while(b){ t=b; b=a%b; a=t; } return a; } unsigned lcm(unsigned a,unsigned b){ return a*b/gcd(a,b); } string percent(double a,double b){ char buf[128]; sprintf(buf,"%f%%",a/b*100.0); return string(buf); } EtaBar::EtaBar(int _max): max(_max), updateDelay(_max/bercent), //hey, we can eliminate mod/division by using powers of 2 like 128 (wee) (for sanity's sake we can write /128 because it gets optimized to >>7 by the compiler. We'll this Bercent (get it? binary percent. ahahaha.) untilNextUpdate(_max & (bercent-1)), lastCall(0),started() { } int EtaBar::tick(){ return tick(lastCall+1); } int EtaBar::tick(int at){ int diff=at-lastCall; untilNextUpdate-=(diff); lastCall=at; if(untilNextUpdate<=0){ untilNextUpdate=updateDelay; Timer now; cout << "Bercent done: "<sub{return runtest('which mpicxx') ? 
"YES":"NO";}, HASCRYPTOPP=>sub{return testgcc( < //fastest #include //medium #include //slow //Non-cryptographic checksums #include #include #include int main(){ CryptoPP::Weak::MD5 MD5CryptoHasher; CryptoPP::SHA1 SHACryptoHasher; CryptoPP::Whirlpool WhirlpoolCryptoHasher; CryptoPP::CRC32 CRC32CryptoHasher; CryptoPP::Adler32 Adler32CryptoHasher; return 0; } ENDTEXT , "@_ -lcryptopp -lpthread") ? "YES":"NO";} ); my ($test,@args)=@ARGV; exit 2 unless $test and ref $cfg{$test}; print &{$cfg{$test}}(@args); sub runtest { return !system("@_ 1>/dev/null 2>/dev/null"); } sub testgcc { my ($content,$compiler)=@_; $compiler="gcc" unless $compiler; my $cmd="$compiler -x c++ -o /dev/null -"; $cmd.=" 1>/dev/null 2>/dev/null" unless $debug; print STDERR "Trying to compile using $cmd" if $debug; open(my $gccfh,"|$cmd") or exit 1; #can't test gcc. tough potatoes. print $gccfh "$content"; close $gccfh; return !$?; #errored? } murasaki/src/mbfa.cc0000644000177700001440000001167511434752234013774 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ ////////////// // murasaki project // mbfa.cc // offline MBFA generator ////////////// #include #include #include #include "seqread.h" #include "binseq.hh" #include "genopts.h" #include #include "timing.h" #include #include using namespace std; string program_version(); string program_help(); double sizeRatio(string a,string b); int main(int argc,char **argv){ SequenceReader reader; bool verbose=false,forceCreate=false,fatalErrors=false,info=false,asFasta=false; int optc; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"help",0,0,'h'}, {"version",0,0,'V'}, {"verbose",0,0,'v'}, {"info",2,0,'i'}, {"force",2,0,'f'}, {"fatal",2,0,'F'}, {"fasta",2,0,'A'}, {0,0,0,0} }; int longindex=0; string prefreq; optc=getopt_long(argc,argv,"hVvf::i::F::A::",long_options,&longindex); if(optc==-1)break; switch(optc){ case 'h': cout << program_help();exit(0);break; case 'V': cout << program_version()<save()){ cout << "*Write of "<getBinaryFilename()<<" failed."<getBinaryFilename()<<" completed in "< " << mbfa->getBinaryFilename()<getBinaryFilename(),argv[i])<length()<counters)/sizeof(mbfa->counters[0]);i++) cout << mbfa->counters[i] << " "; cout << endl; cout << " Subsequences: "<regionSizes[0]<regionSizes[1]<regionSizes[2]<asFasta(cout); } delete mbfa; } } return fails; } size_t getSize(string file){ struct stat buf; if(stat(file.c_str(),&buf)) return 0; //if it's a stitch file it's not really fair unless we compare against the full component length try{ SequenceFileReader reader(file); if(reader.format()==SEQFILE_STITCH) return reader.peekLength() + buf.st_size; }catch(SeqReadException e){ //if it fails, so what, who cares } return buf.st_size; } double sizeRatio(string a,string b){ return (double)getSize(a)/(double)getSize(b); } string program_version(){ SequenceBinaryMetadata meta; return string("1.0 MBFA format ")+dstring((long)meta.formatVersion)+" ("+meta.suffix+")"; } string program_help(){ return string("\ 
Usage: mbfa [options...] [input] [input2 ...]\n\ \n\ Generate Murasaki Binary FASTA (mbfa) files for corresponding\n\ input files.\n\ \n\ Options:\n\ --help|-h - this message\n\ --version|-V - show version\n\ --verbose|-v - increase verbosity\n\ --info|-i - show info on each MBFA specified\n\ --force|-f - force (re)creation of MBFA even if it already existed\n\ --fatal|-F - make errors fatal\n\ "); } murasaki/src/mingw32compat.cc0000644000177700001440000000200411434752234015543 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ #ifdef __MINGW32__ #include #include void srandom(unsigned s){ srand(s); } long random(){ return (long)rand(); } int getpagesize(){ return 4096; // maybe } void bzero(void *start, size_t size){ char* p = (char*)start; for(size_t i=0; i (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. 
If not, see . */ //////// // super-memory-miserly hash list storage // Kris Popendorf /////// #ifndef ECOHASH__H #define ECOHASH__H #include "ecolist.h" #include "bitmap.h" #include "sequence.h" using namespace std; class EcoHash : public Hash { public: typedef Ecolist::val_type val_type; //abstract parts void clear(); void add(const HashKey key,const HashVal &val); void getMatchingSets(HashKey key,list &sets); void lookup(HashKey key,LocList &locList); bool emptyAt(const HashKey key); word sizeAt(const HashKey key); word sizeAt(const HashKey key,const HashVal &val); //for computing memory costs static const word linear_cost(word c) { return ((word)(Ecolist::seqbits+Ecolist::idxbits))*(c/8); } static const word bucket_prep_cost(word c) { return (word)(Ecolist::cost(0)*c); } // dump / load may be overloaded // void dump(ostream &os); // void load(istream &is); //the boring junk EcoHash(BitSequence *pat); ~EcoHash(); //muchos boring inline static Location val2loc(const val_type& a){return Location(a.first,a.second);} inline static val_type loc2val(const Location& a){return val_type(a.seqno,a.pos);} static bool lessthan(const val_type& a,const val_type& b); protected: Ecolist *fasta; bitmap sorted; public: static void init(SeqPos _seq_count,word _longestSeq,word _totalHashLength,SeqPos _hash_bits); }; #endif murasaki/src/ecolist.h0000644000177700001440000001200311434752234014355 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ #ifndef ECOLIST__H #define ECOLIST__H #include "sequence.h" #include #define MAXSUBNODES typedef union block_t block; union block_t { word w; block_t* p; }; class Ecolist { public: typedef pair val_type; typedef class BidirectionalEcolistIterator { public: typedef BidirectionalEcolistIterator _Self; block *current; list prev; SeqPos pos,ite,end; BidirectionalEcolistIterator(block *start,list prev,SeqPos p,SeqPos ite); void put(SeqPos seq,SeqPos idx) const; void put(const val_type &v) const; val_type operator*() const; val_type get() const; //identical to operator*, just there for symmetry with put _Self& operator++(); _Self operator++(int); _Self& operator--(); _Self operator--(int); bool operator!=(const _Self&) const; bool operator==(const _Self&) const; inline SeqPos to_i() const {SeqPos ret=pos;for(SeqPos j=ite;j>0;j--)ret+=minChunks<<(j-1);return ret;} } bi_iterator; typedef class EcolistIterator { public: typedef EcolistIterator _Self; block *current; SeqPos pos,ite,end; EcolistIterator(block *start,SeqPos p,SeqPos ite); EcolistIterator(const BidirectionalEcolistIterator&); void put(SeqPos seq,SeqPos idx) const; void put(const val_type &v) const; val_type operator*() const; val_type get() const; //identical to operator*, just there for symmetry with put _Self& operator++(); _Self operator++(int); bool operator!=(const _Self&) const; bool operator==(const _Self&) const; inline SeqPos to_i() const {SeqPos ret=pos;for(SeqPos j=ite;j>0;j--)ret+=minChunks<<(j-1);return ret;} } iterator; //blocks are allocated in blobs (of some 2^n*initBlocks) //blocks are composed of chunks. //chunks might straddle word/block boundaries. 
static SeqPos minBlocks,minChunks; static unsigned seqbits,idxbits,chunkSize; static word idxLongest; static SeqPos initBlocks; static bool inited; static bool (*lessthan)(const pair&,const pair&); static SeqPos externalSortLimit; static size_t cost(size_t chunks); SeqPos size; block *blocks; //inity stuff Ecolist(); ~Ecolist(); static void init(SeqPos seqs,word longest,word total,SeqPos weight,bool (*_lt)(const val_type&,const val_type&),bool frugal); //accessy-stuff pair getValAt(SeqPos idx); iterator begin(); iterator end(); bi_iterator bi_begin(); bi_iterator bi_end(); SeqPos capacity(); inline SeqPos getSize(){return size;} iterator at(SeqPos pos); bi_iterator bi_at(SeqPos pos); //take care only to use on sorted lists... inline iterator lower_bound(const val_type& m){return at(int_lower_bound(m));} inline iterator upper_bound(const val_type& m){return at(int_upper_bound(m));} inline bi_iterator bi_lower_bound(const val_type& m){return bi_at(int_lower_bound(m));} inline bi_iterator bi_upper_bound(const val_type& m){return bi_at(int_upper_bound(m));} SeqPos int_lower_bound(const val_type& m); //first elem not equal_range(const val_type& m){return pair(lower_bound(m),upper_bound(m));} //manipulators void clear(); //erase and reset void putValAt(SeqPos idx,SeqPos seq,SeqPos pos); void putValAt(SeqPos idx,const val_type&); void push(SeqPos seq,SeqPos pos); void swap(iterator &a,iterator &b); inline void sort(){if(size>externalSortLimit)inPlaceSort(); else externalSort();} void inPlaceSort(); //in place sort. 
might be necessary if size becomes crazy-big void externalSort(); //this is faster, but takes more memory //behind the scenes mojo protected: block* findBlockFor(SeqPos &remain); static void eraseAll(block* root,SeqPos ite); static block* allocBlob(SeqPos ite); static void writeBits(block* root,SeqPos loc,SeqPos offset,SeqPos size,SeqPos val); static SeqPos readBits(block* root,SeqPos loc,SeqPos offset,SeqPos size); SeqPos partition(SeqPos left,SeqPos right,SeqPos pivoti); void qsort(SeqPos left,SeqPos right); friend ostream& operator<<(ostream& os,Ecolist& a); }; size_t operator-(const Ecolist::iterator &a,const Ecolist::iterator &b); size_t operator-(const Ecolist::bi_iterator &a,const Ecolist::bi_iterator &b); #endif murasaki/src/murasaki.h0000755000177700001440000000575211434752233014546 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ #ifndef __MURASAKI_H #define __MURASAKI_H #include "sequence.h" #include "options.h" #include "dinkymath.h" #include "exceptions.h" //indirection allows one to quote other #defined things (like say PROGRAMVERSION) #define _QUOTEME(x) #x #define QUOTEME(x) _QUOTEME(x) #define PROGRAMVERSION "1.68.6" //these shoudl all be class-wrapped in some "options" class extern int seq_count; extern word usedBuckets,usedPats,worstPat; extern Options opt; extern Hash* mfh; //multi-fasta-hash (right now only support one instance) extern UsedMap* anchors; extern RepeatMap* repeats; extern string pattern_src; extern int patLength; extern BitSequence* globalPat; extern vector seqs; extern char nodename[81]; extern ofstream *entropy_fh,*anchorProgress_fh; void hashSeq(BitSequence *bitseq,BitSequence *pat,int sign,Sequence *s); void extractAndMatch(BitSequence *pat); void procLocs(LocList &locList,list &use,int skipsLeft,int level=0); bool isFullLocList(const LocList &locList); string program_version(); bool hashSanityCheck(); string platformInfo(); word locList2mclass(LocList &locList); ostream& outputRegions(ostream &of); void hashStatusCheck(int tick,SeqPos pos); void writeHisto(string hashHisto_record,string detailed); void writeAnchors(string anchor_record); void writeRepeats(string record); bool convertPattern(string &pat); void init_tfidfCounters(); void generateCollisionHistogram(map &collisionSizeHistogram,map &seedHistogram); extern word *dfCount; extern word totalAnchoredLocs; extern int verbose; //debug mojo extern word longestSeq,totalSeqLength,totalHashLength; //for eco-list parameter computation class SystemInfo { public: unsigned long wordsize,totalMemory,freeMemory,swapTotal; const int unit; SystemInfo(); string toString() const; friend ostream& operator<<(ostream& of,const SystemInfo& a); }; class MurasakiRuntime { protected: bool cleaned; void init(int argc,char **argv); void prep(); void work(); void success(); void cleanup(int disaster=0); public: char 
mpi_tag[81]; BitSequence *pat; vector args; Timer seqLoadStart; int stdoe; char stdoename[101]; MurasakiRuntime(int argc,char **argv); ~MurasakiRuntime(); }; extern SystemInfo sysinfo; extern ProgressTicker ticker; #endif murasaki/src/cgr.cc0000644000177700001440000004224311434752234013635 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ///////////////////// //Krisp - CGR Support Class ////////////////// #include "cgr.h" #include "dinkymath.h" #include "mingw32compat.h" #include #include #include #include #include #include #include //for fopen for libpgm #include #include #include //multiple versions of spirit floating around there. annoying. 
#include #if BOOST_VERSION >= 103800 //use spirit "classic" #if !defined(BOOST_SPIRIT_USE_OLD_NAMESPACE) #define BOOST_SPIRIT_USE_OLD_NAMESPACE #endif #include #include #include #include #include #include #include #include #else //current is "classic" #include #include #include #include #include #include #include #include #endif using namespace std; int debugswitch=0; void debugme(){debugswitch=!debugswitch;} unsigned long seedcount; const pix maxout=PGM_OVERALLMAXVAL; bool fullWidthBinary=true; //because some programs suck and don't support 16-bit binary PGMs Cgr::Cgr(): iter(1),rez(2),data(new pix[rez*rez]),normalized(false) { memset(data,0,sizeof(pix)*rez*rez); cout << "Making a new CGR of default rez "<>iter; string ret; for(int i=0;i>=1;y>>=1; } reverse(ret.begin(),ret.end()); return ret; } unsigned long dnatoi(const char *dna,const int &iter,const int &rez){ int x=0,y=0; for(int i=0;i>=1;y>>=1; //retry! return ~0; //disaster! } } /* if(debugswitch){ char buf[1024]; strncpy(buf,dna,iter); buf[iter]=0; cout << buf << " maps to (" << x << ","<< y << ")"<>8; of.write((char*)&buf,1); } } }else{ of << maxout << endl; of << *this; } return true; } ostream& operator<<(ostream& of,const Cgr &a){ for(int y=0;y " << idx < lowhigh(findNiceRange()); cout << "Low: "<(val,maxout); val=max(val,0); *i=val; } normalized=true; } pair Cgr::sampleHighLow(double lowp,double highp){ const unsigned size=rez*rez; pix *alt=new pix[size]; if(!alt){ cerr << "Out of memory!\n"<< endl; exit(-3); } memcpy(alt,data,size*sizeof(pix)); sort(alt,alt+size-1); pix low=alt[(unsigned)((float)size*lowp)],high=alt[(unsigned)((float)size*highp)]; delete alt; return pair(low,high); } pair Cgr::sampleHighLow(int lown,int highn){ const unsigned size=rez*rez; pix *alt=new pix[size]; if(!alt){ cerr << "Out of memory!\n"<< endl; exit(-3); } memcpy(alt,data,size*sizeof(pix)); sort(alt,alt+size-1); pix low=(lown>=0 ? alt[lown]:0),high=(highn>=0 ? 
alt[size-highn-1]:~(pix)0); delete alt; return pair(low,high); } void Cgr::findEdges(const pair &p,vector *low,vector *high){ for(pix *i=data,*end=i+rez*rez;ipush_back(i-data); } else if(*i>=p.second) if(high) high->push_back(i-data); } } pix Cgr::get(unsigned idx){ return data[idx]; } pair Cgr::findNiceRange(){ const unsigned size=rez*rez; pix *alt=new pix[size]; if(!alt){ cerr << "Out of memory!\n"<< endl; exit(-3); } memcpy(alt,data,size*sizeof(pix)); sort(alt,alt+size-1); pix low=alt[(unsigned)((float)size*.01)],high=alt[(unsigned)((float)size*.99)]; /* cout << "Source: "<(cout," ")); cout << "sorted: "<(cout," ")); */ for(uint i=(uint)((float)size*.99);low==high && i(low,high); } PgmBuffer::PgmBuffer() : data(0),rez(0) { //boring } int PgmBuffer::loadCgr(Cgr& a){ if(!data.empty()) return -1; //already loaded rez=a.rez; /* in a world where netPBM worked... data=new gray*[rez]; //make rez rows for(int y=0;y info; switch(state){ case 0: //magicnumber inf.read(line,2); line[2]=0; if(!strcmp("P5",line)) return loadBinaryPGM(file); else if(strcmp("P2",line)) throw PGMFileException("Not a (plain) PGM file",file,linenum); state++; inf.getline(line,maxline); //got to chomp that remaining newline break; case 1: //header - dimensions if(line[0]=='#')break; //comment info = parse(line, int_p[assign(width)] >> int_p[assign(height)],space_p); if(!info.full) throw PGMFileException("Couldn't parse dimensions",file,linenum); state++; if(width!=height) throw PGMFileException("Width!=Height???",file,linenum); rez=width; data.reserve(rez*rez); break; case 2: //header - maxval if(line[0]=='#')break; //comment info = parse(line, int_p[assign(maxval)], space_p); if(!info.full) throw PGMFileException("Couldn't parse maxval",file,linenum); if(maxval>255) wordwidth=2; state++; break; case 3: //data info = parse(line,*(uint_p[push_back_a(data)]),space_p); if(!info.full) throw PGMFileException("Couldn't read data...",file,linenum); break; } inf.getline(line,maxline); }while(inf); 
if(data.size()<(uint)(height*width)) throw PGMFileException("Didn't get enough data points...",file,data.size()); return 0; } int PgmBuffer::loadBinaryPGM(string file){ using namespace boost::spirit; int width,height; gray maxval; if(!data.empty()){ throw PGMFileException("I already have data thank you..."); } const int maxline=1024; char line[maxline]; int linenum=0; int state=0; int wordwidth=1; ifstream inf(file.c_str(),ios::in | ios::binary); if(!inf) PGMFileException("Couldn't read file",file,linenum); inf.read(line,2); line[2]=0; do{ if(inf.bad())break; //game over linenum++; parse_info info; switch(state){ case 0: //magicnumber if(strcmp("P5",line)) throw PGMFileException("Not a (binary) PGM file",file,linenum); state++; inf.getline(line,maxline); //kill that remaining last line break; case 1: //header - dimensions inf.getline(line,maxline); if(line[0]=='#')break; //comment info = parse(line, int_p[assign(width)] >> int_p[assign(height)],space_p); if(!info.full) throw PGMFileException("Couldn't parse dimensions",file,linenum); state++; if(width!=height) throw PGMFileException("Width!=Height???",file,linenum); rez=width; data.reserve(rez*rez); break; case 2: //header - maxval inf.getline(line,maxline); if(line[0]=='#')break; //comment info = parse(line, int_p[assign(maxval)], space_p); if(!info.full) throw PGMFileException("Couldn't parse maxval",file,linenum); if(maxval>255) wordwidth=2; state++; break; case 3: //data inf.read(line,wordwidth); if(inf.good()) data.push_back(*(gray*)(line)); switch(maxval){ case maxout:break; case 255: data.back()<<=8; break; //faster case 127: data.back()<<=9; break; default: data.back()*=(maxout/maxval);break;//ugly, but it'll work } break; } }while(inf); if(data.size()<(uint)(height*width)) throw PGMFileException("Didn't get enough data points...",file,data.size()); return 0; } void PgmBuffer::invert(){ for(vector::iterator i=data.begin();i!=data.end();i++){ *i=(maxval-*i); } } int PgmBuffer::savePGM(string file,bool 
binary){ if(data.empty()){ cerr << "No data to write???"<::iterator i=data.begin();i!=data.end();i++){ of.write((char*)&(*i),sizeof(gray)); } }else{ of << 255 << endl; uint8_t buf; for(vector::iterator i=data.begin();i!=data.end();i++){ buf=(*i>>8); of.write((char*)&(buf),1); } } }else{ vector::iterator i=data.begin(); for(int y=0;y::iterator i=data.begin();i!=data.end();i++) sum+=(double)(*i); sum/=(double)data.size(); return (gray)(sum); } double PgmBuffer::difference(vector others){ double avg=0; double sum=0; double variance=0; if(!allSameRez(others)) throw PGMException("Buffers differ in size"); for(unsigned i=0;i::iterator buf=others.begin();buf!=others.end();buf++){ avg+=(*buf)->data[i]; } avg/=others.size()+1; variance=sqr(data[i]-avg); for(vector::iterator buf=others.begin();buf!=others.end();buf++) variance+=sqr((*buf)->data[i]-avg); sum+=sqrt(variance); } return sum/((double)(rez*rez))/(double)maxout; } bool PgmBuffer::allSameRez(vector others){ for(vector::iterator i=others.begin();i!=others.end();i++) if((*i)->rez!=rez) return false; return true; } void invert(string& str){ for(unsigned i=0;i (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ #include "alignments.h" #include #include #include //multiple versions of spirit floating around there. annoying. 
#include #if BOOST_VERSION >= 103800 //use spirit "classic" #if !defined(BOOST_SPIRIT_USE_OLD_NAMESPACE) #define BOOST_SPIRIT_USE_OLD_NAMESPACE #endif #include #include #include #include #include #include #include #include #else //current is "classic" #include #include #include #include #include #include #include #include #endif using namespace std; int Alignment::nextId=0; int Anchor::nextId=0; uint seqCount=0; vector seqs; bool seqinit=false; bool verify(const region& r){ if(r.stop parts;reader.readline(parts);parts.clear()){ if(growAnchors) alignment.mergeAdd(parts); else alignment.add(parts); anchors++; } } void writeSeqs(const char* alignfile){ const boost::regex prefix("^(.*)\\.anchors$"); boost::cmatch m; string seqfile; if(boost::regex_match(alignfile,m,prefix)){ assert(m.size()>=1); string prefix=m[1]; seqfile=prefix+string(".seqs"); }else{ //assume we've been handed a prefix seqfile=string(alignfile)+string(".seqs"); } ofstream seqsfh(seqfile.c_str()); if(!seqsfh.good()){ cerr << "Error opening "<=1); ret=m[1]; } return ret; } void checkSeqs(const char* alignfile){ string prefix=getAnchorPrefix(string(alignfile)); if(!prefix.empty()){ string seqfile=prefix+string(".seqs"); ifstream seqsfh(seqfile.c_str()); if(!seqsfh.good()){ cerr << "Error opening "<=seqCount){ cerr << "Alignments have a different number of members!"<::iterator i=a.anchors.begin();i!=a.anchors.end();++i){ Anchor& anc=*i; for(uint s=0;s &context){ set matches,temp; uint i=0; while(isZero(a.parts[i])){ i++; assert(i parts,set &context){ set matches,temp; uint i=0; while(isZero(parts[i])){ i++; assert(i ®ions) const { uint s=0; for(s=0;s parts(seqCount); for(uint s=0;s &parts){ set overlaps; vector newparts(parts); if(findOverlaps(parts,overlaps)){ for(set::iterator o=overlaps.begin();o!=overlaps.end();++o){ if(!(*o)->overlaps(parts)) //might not overlaps in _all_ sequences continue; // cerr << "Overlap in: "<parts[s].key() << " ->"; assert(verify(newparts[s])); 
assert(verify((*o)->parts[s].key())); coalesce(newparts[s],(*o)->parts[s].key()); // cerr << newparts[s]<parts[s]); anchors.erase(a->backref); } bool canonicalize(vector &vr){ size_t si=0; assert(!vr.empty()); while(si &parts){ using namespace boost::spirit; const int maxline=1024; char line[maxline]; parse_info info; rule<> sign_p(ch_p('+')|'-'); linenum++; inf.getline(line,maxline); if(inf.bad() || strlen(line)<1) return false; const char *parse_start=line; long start,stop; char sign='*'; //mmm invalid sign int_parser long_p; for(uint seqi=0;seqi> long_p[assign(stop)] >> (ch_p('+')[assign(sign)]|ch_p('-')[assign(sign)]) ,space_p); parse_start=info.stop; if(!info.hit){ cerr << "Failed to extract a whole anchor!!!! ("<0){ swap(start,stop); start=-start; stop=-stop; } if(start>stop){ cerr << filename << ":"<(start,stop); } assert(verify(region(start,stop))); parts.push_back(growAnchors ? grow(region(start,stop),growAnchors):region(start,stop)); assert(verify(parts.back())); } canonicalize(parts); return true; } SequenceCover::SequenceCover(){ } SequenceCover::TreeType::iterator SequenceCover::merge(region in){ size_t count=1; for(TreeType::range_iterator i(cover.in_range(in)),next;i!=cover.end();i=next){ next=i; ++next; assert(in.overlaps(i.key())); count++; in.start=min(in.start,i.key().start); in.stop=max(in.stop,i.key().stop); TreeType::iterator ite(i); cover.erase(ite); } return cover.insert(in,count); } long SequenceCover::totalLength() { long total=0; for(TreeType::iterator i=cover.begin();i!=cover.end();++i){ total+=length(i.key()); } return total; } RegionSet::RegionSet(const Anchor &a){ parts.reserve(a.parts.size()); for(size_t i=0;i::const_iterator ite=parts.begin(); ite!=parts.end();++ite) if(!isZero(*ite)){ count++; len+=length(*ite); } return (double)len/(double)count; } bool RegionSet::isGapped() const { vector::const_iterator ite=parts.begin(); while(isZero(*ite) && ++ite!=parts.end()) ; long len=length(*ite); while(++ite!=parts.end()){ 
if(isZero(*ite))continue; if(length(*ite)!=len) return true; } return false; } long RegionSet::riftCount() const { vector::const_iterator ite=parts.begin(); long count=0; while(++ite!=parts.end()) if(isZero(*ite)) count++; return count; } RegionSet::RegionSet() { } RegionSet RegionSet::getIslands() const { RegionSet r; vector::const_iterator ite=parts.begin(); while(++ite!=parts.end()) if(!isZero(*ite)) r.parts.push_back(*ite); assert(!r.parts.empty()); return r; } RegionSet RegionSet::setAbs() const { RegionSet r; for(vector::const_iterator ite=parts.begin(); ite!=parts.end(); ++ite){ r.parts.push_back(abs(*ite)); } return r; } murasaki/src/cmultiset.h0000644000177700001440000003073711434752233014741 0ustar krispusers/* Murasaki - multiple genome global alignment program Copyright (C) 2006-2007 Keio University (Yasunori Osana & Kris Popendorf) (2006) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ // // cmultiset: memory-conservative multiset / murasaki project // #ifndef __CMULTISET_H__ #define __CMULTISET_H__ #define DEFAULT_INITIALSIZE 1 #define DEFAULT_EXPANDSTEP 1 #define DEFAULT_KEEP_SORTED true #include #include #include #include #include "exceptions.h" using namespace std; // typedef char T; template class cmultiset { public: cmultiset(); ~cmultiset(); typedef T *iterator; typedef T &reference; typedef const T *const_iterator; // multiset compatible member fuctions iterator insert(T token); // append a token pair equal_range(T token); iterator begin(){return array;} iterator end(){return &array[arraysize];} int size(){ return arraysize; } bool empty(){ if (arraysize==0) return true; else return false;}; // incompatible but useful... void clear(); int insert(T token, int pos); // insert a token void block_append(T* tokens, int num); // insert bunch of tokens pair block_read(); // for block read int lsearch(T token); // perform linear search int bsearch(T token); // perform binary search int bsearch_position(T token); // find appropriate position by binary search // housekeepers void sort(); // perform quicksort on the array int pack(); // release unused memory area // status / preferences void set_expand_step(int s){ expandstep = s; }; void set_keep_sorted(bool a){ keep_sorted = a; }; int cap(){ return capacity; } // return capacity bool order(){ return sorted; } // sorted or not // and so on... 
int sanityCheck(); string asString() const; T &operator[](int i) const; template friend ostream& operator<<(ostream &os , const cmultiset &v); T* array; protected: bool keep_sorted; bool sorted; int arraysize; int capacity; int initialsize; int expandstep; int insert_anyway(T token, int pos); // insert a token without care of order status void expand(); int nextMatch(T token, int pos); // find next match (linear search) int prevMatch(T token, int pos); // find previous match (linear search) iterator indexToIterator(int i){ if (i!=-1) return &array[i]; else return begin();} int iteratorToIndex(iterator i){ if (i>=begin() && i int compareT(const void* a, const void* b){ const T aa = *((T*)a); const T bb = *((T*)b); if (aa cmultiset::cmultiset(){ arraysize = 0; expandstep = DEFAULT_EXPANDSTEP; initialsize = DEFAULT_INITIALSIZE; capacity = initialsize; keep_sorted = DEFAULT_KEEP_SORTED; array = (T*)malloc(sizeof(T) * initialsize); sorted = false; // assert(sanityCheck()); return; } template cmultiset::~cmultiset(){ free(array); } template void cmultiset::clear(){ arraysize = 0; sorted = false; } // ---------------------------------------------------------------------- // append / insert // ---------------------------------------------------------------------- // append to the array, then returns the index template T* cmultiset::insert(T token){ int pos; if (sorted && keep_sorted) { // keep it sorted pos = bsearch_position(token); insert_anyway(token, pos); } else { // otherwise, just append if (arraysize == capacity) expand(); pos = arraysize; array[pos] = token; arraysize++; sorted = false; } // assert(cout << "add(" << token << " at " << arraysize-1 << ")" << endl); // assert(sanityCheck()); return indexToIterator(pos); } template int cmultiset::insert_anyway(T token, int pos){ if (pos > arraysize) pos = arraysize; // if pos is too large, let's append. 
// assert(cout << "insert " << token << " at " << pos << endl); if (arraysize == capacity) expand(); // if (pos != arraysize-1) if (pos != arraysize) // is this correct? seems to be ok. for (int i=arraysize; i>pos; i--) array[i] = array[i-1]; // memmove() wasn't good... why? why? arraysize++; array[pos] = token; return arraysize; } template int cmultiset::insert(T token, int pos){ arraysize = insert_anyway(token, pos); if (sorted){ sorted = false; if (pos==0) if(compareT((void*)(array+pos), (void*)(array+pos+1)) <= 0) sorted = true; if (pos==arraysize-1) if(compareT((void*)(array+pos-1), (void*)(array+pos)) <= 0) sorted = true; if (!sorted) if (compareT((void*)(array+pos-1), (void*)(array+pos)) <= 0 && compareT((void*)(array+pos), (void*)(array+pos+1)) <= 0) sorted = true; } // assert(sanityCheck()); return arraysize; } // block append template void cmultiset::block_append(T* tokens, int num){ // make enough rooms if( arraysize+num > capacity ){ capacity = arraysize+num; array = (T*)realloc(array, capacity * sizeof(T)); } // then append int dst = arraysize; for(int i=0; i pair cmultiset::block_read(){ return pair(arraysize, array); } // ---------------------------------------------------------------------- // housekeepers // ---------------------------------------------------------------------- template void cmultiset::expand(){ int s; // s = expandstep; s = capacity /2; if ( s==0 ) s = 1; if ( s>20 ) s = 20; capacity += s; array = (T*)realloc(array, capacity * sizeof(T)); if(array == NULL){ throw MurasakiException("cmultiset: Expand() failed (out of memory?)"); } // assert(cout << "array realloced (new capacity: " << capacity << ")" << endl); } template void cmultiset::sort(){ if( !sorted ) qsort(array, arraysize, sizeof(T), compareT); sorted = true; // assert(sanityCheck()); } template int cmultiset::pack(){ // int cap_prev = capacity; capacity = arraysize; array = (T*)realloc(array, capacity * sizeof(T)); if (array == NULL) return -1; // assert( cout << "packed 
from " << cap_prev << " to " << capacity << endl); // assert(sanityCheck()); return capacity; } // ---------------------------------------------------------------------- // linear search functions (private) // ---------------------------------------------------------------------- // find specified token in the array, return the index (linear search) template int cmultiset::lsearch(T token){ // assert(cout << "linear search for key: " << token << endl); return nextMatch(token, 0); } // find next match, return the index (linear search) template int cmultiset::nextMatch(T token, int pos){ // assert(sanityCheck()); // assert(cout << "find next token " << token << " from " << pos <<" in " << arraysize); if (pos >= arraysize || pos < 0 ) return -1; for(int i=pos; i int cmultiset::prevMatch(T token, int pos){ // assert(sanityCheck()); // assert(cout << "find prev token " << token << " from " << pos <<" in " << arraysize); if (pos >= arraysize || pos < 0) return -1; for(int i=pos; i>=0; i--){ if (array[i] == token) { // assert(cout << " / found at: " << i << endl); return i; } } // assert(cout << "not found :(" << endl); return -1; } // ---------------------------------------------------------------------- // binary search functions (private) // ---------------------------------------------------------------------- // perform binary search to find proper insert position template int cmultiset::bsearch_position(T token){ int max = arraysize-1; int min = 0; int c, r; sort(); // the head and tail if (compareT(&token, &array[0]) < 0) return 0; if (compareT(&token, &array[arraysize-1]) > 0) return arraysize; // or insert into the middle do { if (max-min == 1){ if (compareT(&array[min], &token) == 0) return min; return max; } else { c = (min+max)/2; r = compareT(&array[c], &token); if (r == 0) return c; // assert(cout<< "<" << min <<"[" << c << "]" << max << ":" << r<< ">"); if (r < 0) min=c; else max=c; // assert(cout<< "(" << min << "," << max << ")"); } } while ( min int 
cmultiset::bsearch(T token){ int max = arraysize-1; int min = 0; int c, r; sort(); do { if (max-min == 1){ if (compareT(&array[max], &token) == 0) return max; if (compareT(&array[min], &token) == 0) return min; // cout << "possible position is <"<< max <<">"; return -1; } else { c = (min+max)/2; r = compareT(&array[c], &token); if (r == 0) return c; // assert(cout<< "<" << min <<"[" << c << "]" << max << ":" << r<< ">"); if (r < 0) min=c; else max=c; // assert(cout<< "(" << min << "," << max << ")"); } } while ( min pair cmultiset::equal_range(T token){ int p = bsearch(token); if ( p == -1 ) return pair(NULL, NULL); int b = p; int e = p+1; while ( 0 < b ) if (!( array[b-1] < token )) b--; else break; while ( e < arraysize ) if (!( token < array[e] )) e++; else break; // cout << "[" << b << "," << e << "]"; T* bb = indexToIterator(b); T* ee = indexToIterator(e); return pair(bb, ee); } // linear-search version. slow... /* template pair cmultiset::equal_range(T token){ sort(); int b = 0; int e = 1; while ( b < arraysize ) if (array[b] < token) b++; else break; if (b == arraysize) b=e=-1; else{ e = b; while (e < arraysize) if (!(token < array[e])) e++; else break; } if (e > arraysize) e = arraysize; // assert(cout << "[" << b << "," << e << "]"); T* bb = indexToIterator(b); T* ee = indexToIterator(e); return pair(bb, ee); } */ // ---------------------------------------------------------------------- // functions for tests // ---------------------------------------------------------------------- template string cmultiset::asString() const{ ostringstream s; for (int i=0; i int cmultiset::sanityCheck(){ //for(int i=0; i ostream& operator<<(ostream &os,const cmultiset &v){ return os << v.asString(); } template T &cmultiset::operator[](int i) const{ return array[i]; } #endif murasaki/src/binseq.hh0000644000177700001440000000741611434752233014357 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. 
Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ////////////// // murasaki project // seqread.h // c++ support for reading various sequence formats ////////////// #ifndef __BINSEQ_H_ #define __BINSEQ_H_ #include "globaltypes.h" #include "seqread.h" #include "dinkymath.h" #include #include #include #include #include #include #include #include #include #include #define BINSEQ_MAGICNUMBER "\xCC\x33\xFF" //this is purple in rgb hex #define CHARCAST(x) ((const char*)&(x)) #define VOIDOFFSET(x,off) ((const char*)(x)+static_cast(off)) #define VOIDGET(T,x,off) (*(T*)(VOIDOFFSET(x,off))) using namespace std; namespace io = boost::iostreams; class SequenceBinaryMetadata { public: unsigned char formatVersion; string suffix; SequenceBinaryMetadata(); }; class SeqBinException { public: SeqBinException(const string& reason): reason_(reason) {} string reason() const { return reason_; } const char* what() const { return reason_.c_str();} private: string reason_; }; class SequenceBinary { public: typedef pair Region; protected: void init(); SequenceBinaryMetadata formatMetaData; word _length; word bit_count,word_count; word *_words; word* allocWords(); int seqdataMalloc; int fd; void *fdmem; size_t fdmem_len; string baseFilename,binaryFilename; //debugging bits char rawBase(size_t n); word rawBaseBits(size_t n); char bitToBase(word w); public: word counters[8]; //unmasked[4],masked[4] word regionSizes[3]; vector subSeqNamesP; vector subSeqNames; vector 
subSeqs,readableRegions,unmaskedRegions; Region *regions[3]; void fillCopyVectors(); inline string getBinaryFilename(){return binaryFilename;} inline string getBaseFilename(){return baseFilename;} inline const word* words()const {return _words;} inline const word getWordCount()const {return word_count;} inline size_t length()const {return _length;} inline string deriveLocalBinaryFilename(){return deriveLocalBinaryFilename(baseFilename);} string deriveLocalBinaryFilename(string filename); static string deriveBinaryFilename(string filename); static bool exists(string filename); //test if a binary file exists for filename bool save(string filename); bool save(); ostream& asFasta(ostream &os,int nmaskR=1); SequenceBinary(string filename,bool complete=false); SequenceBinary(SequenceFileReader &reader); ~SequenceBinary(); }; class DeSerial { public: void *m; size_t offset; DeSerial(void *_m,size_t _offset=0); template void map(T* &dst,int count){ dst=(T*)VOIDOFFSET(m,offset); int size=sizeof(T); offset+=size*count; } template void get(T* dst,int count){ int size=sizeof(T); for(;count>0;offset+=size,count--,dst++){ *dst=VOIDGET(T,m,offset); } } template T& get(T &dst){ get(&dst,1); return dst; } }; #endif murasaki/src/itree.h0000644000177700001440000007155111434752234014040 0ustar krispusers// Interval RB tree implementation -*- C++ -*- /* Generic Interval Tree Map Copyright (C) 2006-2007 Keio University (Kris Popendorf) (2007) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. This is an implementation of Interval Trees as described by Cormen in "Introduction to Algorithms" implemented using the GCC Red/Black Tree implementation for the ugly RB Tree insertion/deletion. */ #ifndef __ITREE_H_ #define __ITREE_H_ #include #include #include namespace Itree { using namespace std; enum _Rb_tree_color { _S_red = false, _S_black = true }; template class Interval { public: typedef Interval<_IntType> _Interval; _IntType start,stop; Interval(){} Interval(const _IntType& __a,const _IntType& __b) : start(__a),stop(__b) { #ifdef DEBUG_ITREE assert(start bool operator==(const Interval<_IntType>& a, const Interval<_IntType>& b){ return a.start==b.start && a.stop==b.stop; } template std::ostream& operator<<(std::ostream& os,const Interval<_IntType>& i){ return os << "[" << i.start << "~" << i.stop << "]"; } template class itree_node { public: typedef _IntType IntType; typedef Interval<_IntType> key_type; typedef _Value value_type; typedef itree_node<_IntType,_Value> _Base_type; typedef itree_node<_IntType,_Value> node_type; typedef _Base_type* _Base_ptr; typedef pair init_type; key_type key; value_type val; IntType max; //contains the max _Rb_tree_color _M_color; _Base_ptr _M_parent; _Base_ptr _M_left; _Base_ptr _M_right; itree_node(const itree_node& a) : key(a.key),val(a.val),max(a.key.stop), _M_color(a._M_color),_M_parent(a._M_parent),_M_left(0),_M_right(0) { } itree_node(const init_type& a) : key(a.first),val(a.second),max(a.first.stop), _M_color(),_M_parent(0),_M_left(0),_M_right(0) { } itree_node() : key(),val(),max(), _M_color(),_M_parent(0),_M_left(0),_M_right(0) { } static node_type* itree_increment(node_type* __x) { if (__x->_M_right != 0) { __x = __x->_M_right; while (__x->_M_left != 0) __x = __x->_M_left; } else { node_type* __y = 
__x->_M_parent; while (__x == __y->_M_right) { __x = __y; __y = __y->_M_parent; } if (__x->_M_right != __y) __x = __y; } return __x; } static const node_type* itree_increment(const node_type* __x) { return itree_increment(const_cast(__x)); } static node_type* itree_decrement(node_type* __x) { if (__x->_M_color == _S_red && __x->_M_parent->_M_parent == __x) __x = __x->_M_right; else if (__x->_M_left != 0) { node_type* __y = __x->_M_left; while (__y->_M_right != 0) __y = __y->_M_right; __x = __y; } else { node_type* __y = __x->_M_parent; while (__x == __y->_M_left) { __x = __y; __y = __y->_M_parent; } __x = __y; } return __x; } static const node_type* itree_decrement(const node_type* __x) { return itree_decrement(const_cast(__x)); } static _Base_ptr _S_minimum(_Base_ptr __x) { while (__x->_M_left != 0) __x = __x->_M_left; return __x; } static _Base_ptr _S_maximum(_Base_ptr __x) { while (__x->_M_right != 0) __x = __x->_M_right; return __x; } }; template struct itree_range_iterator; //iterators are buildable from range iterators and vice versa, requiring a forward decl here template struct itree_iterator { typedef _Value value_type; typedef _Value& reference; typedef _Value* pointer; typedef Interval<_IntType> key_type; typedef Interval<_IntType>& key_reference; typedef Interval<_IntType>* key_pointer; typedef bidirectional_iterator_tag iterator_category; typedef ptrdiff_t difference_type; typedef itree_iterator<_IntType,_Value> _Self; typedef struct itree_range_iterator<_IntType,_Value> _IntervalIterator_type; typedef typename itree_node<_IntType,_Value>::_Base_ptr _Base_ptr; typedef itree_node<_IntType,_Value>* _Link_type; typedef itree_node<_IntType,_Value> node_type; itree_iterator() : _M_node() { } explicit itree_iterator(_Link_type __x) : _M_node(__x) { } itree_iterator(_IntervalIterator_type __x) : _M_node(__x._M_node) { } key_reference key() const { return static_cast<_Link_type>(_M_node)->key; } reference operator*() const { return 
static_cast<_Link_type>(_M_node)->val; } pointer operator->() const { return &static_cast<_Link_type>(_M_node)->val; } _Self& operator++() { _M_node = node_type::itree_increment(_M_node); return *this; } _Self operator++(int) { _Self __tmp = *this; _M_node = node_type::itree_increment(_M_node); return __tmp; } _Self& operator--() { _M_node = node_type::itree_decrement(_M_node); return *this; } _Self operator--(int) { _Self __tmp = *this; _M_node = node_type::itree_decrement(_M_node); return __tmp; } bool operator==(const _Self& __x) const { return _M_node == __x._M_node; } bool operator!=(const _Self& __x) const { return _M_node != __x._M_node; } _Base_ptr _M_node; }; template struct itree_range_iterator { typedef _Value value_type; typedef _Value& reference; typedef _Value* pointer; typedef Interval<_IntType> key_type; typedef Interval<_IntType>& key_reference; typedef Interval<_IntType>* key_pointer; typedef bidirectional_iterator_tag iterator_category; typedef ptrdiff_t difference_type; typedef itree_range_iterator<_IntType,_Value> _Self; typedef itree_iterator<_IntType,_Value> iterator; typedef typename itree_node<_IntType,_Value>::_Base_ptr _Base_ptr; typedef itree_node<_IntType,_Value>* _Link_type; typedef itree_node<_IntType,_Value> node_type; itree_range_iterator() : _M_node(),target() { } explicit itree_range_iterator(iterator __x,key_type key,_Base_ptr header) : _M_node(__x._M_node),target(key),_M_header(header) { } key_reference key() const { return static_cast<_Link_type>(_M_node)->key; } reference operator*() const { return static_cast<_Link_type>(_M_node)->val; } pointer operator->() const { return &static_cast<_Link_type>(_M_node)->val; } _Self& operator++() { _M_node = range_next_in(_M_node); return *this; } _Self operator++(int) { _Self __tmp = *this; _M_node = range_next_in(_M_node); return __tmp; } bool operator==(const _Self& __x) const { return (_M_node == __x._M_node && target==__x.target); } bool operator!=(const _Self& __x) const { return 
_M_node != __x._M_node || target!=__x.target; } bool operator==(const iterator& __x) const { return _M_node == __x._M_node; } bool operator!=(const iterator& __x) const { return _M_node != __x._M_node; } _Base_ptr range_next_in(_Base_ptr after) { _Base_ptr __x = node_type::itree_increment(after); _Base_ptr __z = _M_header; while(__x != _M_header && !((target.stop)<(__x->key.start))){ //mmm O(log n) if(target.overlaps(__x->key)){ __z=__x; break; } //a sneaky modified increment. can skip left traversals if max is < target.start if (__x->_M_right != 0) { __x = __x->_M_right; while (__x->_M_left != 0 && !((__x->_M_left->max)<(target.start))) __x = __x->_M_left; } else { node_type* __y = __x->_M_parent; while (__x == __y->_M_right) { __x = __y; __y = __y->_M_parent; } if (__x->_M_right != __y) __x = __y; } } return __z; } _Base_ptr _M_node; key_type target; _Base_ptr _M_header; }; template > > class itree { public: typedef itree<_IntType,_Value> _Self; typedef itree_node<_IntType,_Value> node_type; typedef size_t size_type; typedef Interval<_IntType> key_type; typedef _Value value_type; typedef pair init_type; typedef typename _Alloc::template rebind::other _Node_allocator; typedef _Alloc allocator_type; typedef itree_iterator<_IntType,_Value> iterator; typedef itree_range_iterator<_IntType,_Value> range_iterator; typedef itree_node<_IntType,_Value>* _Link_type; typedef const itree_node<_IntType,_Value>* _Const_Link_type; protected: ////////////////////// // in normal gcc RB trees this is precompiled //////////////// inline void itree_reset_max(node_type* __x){ __x->max=__x->key.stop; if(__x->_M_left && (__x->max) < (__x->_M_left->max)) __x->max=__x->_M_left->max; if(__x->_M_right && (__x->max) < (__x->_M_right->max)) __x->max=__x->_M_right->max; } inline void itree_reset_max(node_type* __x,node_type* __z){ //z is to be ignored __x->max=__x->key.stop; if(__x->_M_left && __x->_M_left!=__z && (__x->max) < (__x->_M_left->max)) __x->max=__x->_M_left->max; if(__x->_M_right 
&& __x->_M_right!=__z && (__x->max) < (__x->_M_right->max)) __x->max=__x->_M_right->max; } void itree_rotate_left(node_type* const __x, node_type*& __root) { node_type* const __y = __x->_M_right; __x->_M_right = __y->_M_left; if (__y->_M_left !=0) __y->_M_left->_M_parent = __x; __y->_M_parent = __x->_M_parent; if (__x == __root) __root = __y; else if (__x == __x->_M_parent->_M_left) __x->_M_parent->_M_left = __y; else __x->_M_parent->_M_right = __y; __y->_M_left = __x; __x->_M_parent = __y; itree_reset_max(__x); itree_reset_max(__y); } void itree_rotate_right(node_type* const __x, node_type*& __root) { node_type* const __y = __x->_M_left; __x->_M_left = __y->_M_right; if (__y->_M_right != 0) __y->_M_right->_M_parent = __x; __y->_M_parent = __x->_M_parent; if (__x == __root) __root = __y; else if (__x == __x->_M_parent->_M_right) __x->_M_parent->_M_right = __y; else __x->_M_parent->_M_left = __y; __y->_M_right = __x; __x->_M_parent = __y; itree_reset_max(__x); itree_reset_max(__y); } void itree_insert_and_rebalance(const bool __insert_left, node_type* __x, node_type* __p, node_type& __header) { node_type *& __root = __header._M_parent; // Initialize fields in new node to insert. __x->_M_parent = __p; __x->_M_left = 0; __x->_M_right = 0; __x->_M_color = _S_red; // Insert. // Make new node child of parent and maintain root, leftmost and // rightmost nodes. // N.B. First node is always inserted left. if (__insert_left) { __p->_M_left = __x; // also makes leftmost = __x when __p == &__header if (__p == &__header) { __header._M_parent = __x; __header._M_right = __x; } else if (__p == __header._M_left) __header._M_left = __x; // maintain leftmost pointing to min node } else { __p->_M_right = __x; if (__p == __header._M_right) __header._M_right = __x; // maintain rightmost pointing to max node } if(&__header!=__p) itree_reset_max(__p); // Rebalance. 
while (__x != __root && __x->_M_parent->_M_color == _S_red) { node_type* const __xpp = __x->_M_parent->_M_parent; if (__x->_M_parent == __xpp->_M_left) { node_type* const __y = __xpp->_M_right; if (__y && __y->_M_color == _S_red) { __x->_M_parent->_M_color = _S_black; __y->_M_color = _S_black; __xpp->_M_color = _S_red; __x = __xpp; } else { if (__x == __x->_M_parent->_M_right) { __x = __x->_M_parent; itree_rotate_left(__x, __root); } __x->_M_parent->_M_color = _S_black; __xpp->_M_color = _S_red; itree_rotate_right(__xpp, __root); } } else { node_type* const __y = __xpp->_M_left; if (__y && __y->_M_color == _S_red) { __x->_M_parent->_M_color = _S_black; __y->_M_color = _S_black; __xpp->_M_color = _S_red; __x = __xpp; } else { if (__x == __x->_M_parent->_M_left) { __x = __x->_M_parent; itree_rotate_right(__x, __root); } __x->_M_parent->_M_color = _S_black; __xpp->_M_color = _S_red; itree_rotate_left(__xpp, __root); } } } __root->_M_color = _S_black; } node_type* itree_rebalance_for_erase(node_type* const __z, node_type& __header) { node_type *& __root = __header._M_parent; node_type *& __leftmost = __header._M_left; node_type *& __rightmost = __header._M_right; node_type* __y = __z; node_type* __x = 0; node_type* __x_parent = 0; if (__y->_M_left == 0) // __z has at most one non-null child. y == z. __x = __y->_M_right; // __x might be null. else if (__y->_M_right == 0) // __z has exactly one non-null child. y == z. __x = __y->_M_left; // __x is not null. else { // __z has two non-null children. Set __y to __y = __y->_M_right; // __z's successor. __x might be null. while (__y->_M_left != 0) __y = __y->_M_left; __x = __y->_M_right; } if (__y != __z) { // relink y in place of z. 
y is z's successor __z->_M_left->_M_parent = __y; __y->_M_left = __z->_M_left; if (__y != __z->_M_right) { __x_parent = __y->_M_parent; if (__x) __x->_M_parent = __y->_M_parent; __y->_M_parent->_M_left = __x; // __y must be a child of _M_left __y->_M_right = __z->_M_right; __z->_M_right->_M_parent = __y; } else __x_parent = __y; if (__root == __z) __root = __y; else if (__z->_M_parent->_M_left == __z) __z->_M_parent->_M_left = __y; else __z->_M_parent->_M_right = __y; _Link_type __old_y_parent=__y->_M_parent; __y->_M_parent = __z->_M_parent; std::swap(__y->_M_color, __z->_M_color); itree_reset_max(__x_parent); itree_reset_max(__old_y_parent); _M_update_parents(__old_y_parent); __y = __z; // __y now points to node to be actually deleted } else { // __y == __z __x_parent = __y->_M_parent; if (__x) __x->_M_parent = __y->_M_parent; if (__root == __z) __root = __x; else if (__z->_M_parent->_M_left == __z) __z->_M_parent->_M_left = __x; else __z->_M_parent->_M_right = __x; if (__leftmost == __z) { if (__z->_M_right == 0) // __z->_M_left must be null also __leftmost = __z->_M_parent; // makes __leftmost == _M_header if __z == __root else __leftmost = node_type::_S_minimum(__x); } if (__rightmost == __z) { if (__z->_M_left == 0) // __z->_M_right must be null also __rightmost = __z->_M_parent; // makes __rightmost == _M_header if __z == __root else // __x == __z->_M_left __rightmost = node_type::_S_maximum(__x); } if(__x){ itree_reset_max(__x); _M_update_parents(__x); }else{ itree_reset_max(__x_parent); _M_update_parents(__x_parent); } } if (__y->_M_color != _S_red) { while (__x != __root && (__x == 0 || __x->_M_color == _S_black)) if (__x == __x_parent->_M_left) { node_type* __w = __x_parent->_M_right; if (__w->_M_color == _S_red) { __w->_M_color = _S_black; __x_parent->_M_color = _S_red; itree_rotate_left(__x_parent, __root); __w = __x_parent->_M_right; } if ((__w->_M_left == 0 || __w->_M_left->_M_color == _S_black) && (__w->_M_right == 0 || __w->_M_right->_M_color == 
_S_black)) { __w->_M_color = _S_red; __x = __x_parent; __x_parent = __x_parent->_M_parent; } else { if (__w->_M_right == 0 || __w->_M_right->_M_color == _S_black) { __w->_M_left->_M_color = _S_black; __w->_M_color = _S_red; itree_rotate_right(__w, __root); __w = __x_parent->_M_right; } __w->_M_color = __x_parent->_M_color; __x_parent->_M_color = _S_black; if (__w->_M_right) __w->_M_right->_M_color = _S_black; itree_rotate_left(__x_parent, __root); break; } } else { // same as above, with _M_right <-> _M_left. node_type* __w = __x_parent->_M_left; if (__w->_M_color == _S_red) { __w->_M_color = _S_black; __x_parent->_M_color = _S_red; itree_rotate_right(__x_parent, __root); __w = __x_parent->_M_left; } if ((__w->_M_right == 0 || __w->_M_right->_M_color == _S_black) && (__w->_M_left == 0 || __w->_M_left->_M_color == _S_black)) { __w->_M_color = _S_red; __x = __x_parent; __x_parent = __x_parent->_M_parent; } else { if (__w->_M_left == 0 || __w->_M_left->_M_color == _S_black) { __w->_M_right->_M_color = _S_black; __w->_M_color = _S_red; itree_rotate_left(__w, __root); __w = __x_parent->_M_left; } __w->_M_color = __x_parent->_M_color; __x_parent->_M_color = _S_black; if (__w->_M_left) __w->_M_left->_M_color = _S_black; itree_rotate_right(__x_parent, __root); break; } } if (__x) __x->_M_color = _S_black; } return __y; } unsigned int itree_black_count(const node_type* __node, const node_type* __root) { if (__node == 0) return 0; unsigned int __sum = 0; do { if (__node->_M_color == _S_black) ++__sum; if (__node == __root) break; __node = __node->_M_parent; } while (1); return __sum; } ////////////////////////////////////////////// /////////////////////end of precompiled stuff //////////////////////////////////////////// node_type _M_header; size_type _M_node_count; allocator_type alloc; allocator_type get_allocator() const { return alloc; } node_type* _M_get_node() { return alloc.allocate(1); } void _M_put_node(node_type* __p) { alloc.deallocate(__p, 1); } _Link_type 
_M_create_node(const init_type& __x) { _Link_type __tmp = _M_get_node(); try { __tmp->key=__x.first; __tmp->val=__x.second; __tmp->max=__x.first.stop; } catch(...) { _M_put_node(__tmp); __throw_exception_again; } return __tmp; } _Link_type _M_clone_node(_Const_Link_type __x) { _Link_type __tmp = _M_create_node(*__x); return __tmp; } static const key_type& _S_key(_Const_Link_type __x) { return __x->key; } static const _IntType& _S_max(_Const_Link_type __x) { return __x->max; } void destroy_node(_Link_type __p) { alloc.destroy(__p); _M_put_node(__p); } _Link_type _M_begin() { return static_cast<_Link_type>(this->_M_header._M_parent); } _Link_type _M_end() { return static_cast<_Link_type>(&this->_M_header); } _Link_type& _M_root() { return this->_M_header._M_parent; } _Link_type& _M_leftmost() { return this->_M_header._M_left; } static _Link_type _S_left(_Link_type __x) { return static_cast<_Link_type>(__x->_M_left); } static _Link_type _S_right(_Link_type __x) { return static_cast<_Link_type>(__x->_M_right); } _Link_type& _M_rightmost() { return this->_M_header._M_right; } static _Link_type _S_minimum(_Link_type __x) { while (__x->_M_left != 0) __x = __x->_M_left; return __x; } static _Link_type _S_maximum(_Link_type __x) { while (__x->_M_right != 0) __x = __x->_M_right; return __x; } void _M_erase(_Link_type __x) { // Erase without rebalancing. 
while (__x != 0) { _M_erase(_S_right(__x)); _Link_type __y = _S_left(__x); destroy_node(__x); __x = __y; } } bool _M_key_compare(const key_type& a,const key_type& b){ return a.start < b.start; } bool _M_stop_start_compare(const key_type& a,const key_type& b){ return (a.stop_M_header); _M_update_parents(__z); ++_M_node_count; #ifdef DEBUG_ITREE assert(verify()); #endif return iterator(__z); } void _M_update_parents(_Link_type __x,_Link_type __ignore){ #ifdef DEBUG_ITREE assert(__x); #endif __x=__x->_M_parent; itree_reset_max(__x,__ignore); _Link_type prev=__x; for(__x=__x->_M_parent;__x && __x!=&_M_header;__x=__x->_M_parent){ itree_reset_max(__x); } } void _M_update_parents(_Link_type __x){ #ifdef DEBUG_ITREE assert(__x); #endif for(__x=__x->_M_parent;__x && __x!=&_M_header;__x=__x->_M_parent){ itree_reset_max(__x); } } public: iterator begin() { return iterator(this->_M_header._M_left); } iterator end() { return iterator(&this->_M_header); } itree() : _M_header(),_M_node_count(0),alloc() { this->_M_header._M_color = _S_red; this->_M_header._M_parent = 0; this->_M_header._M_left = &this->_M_header; this->_M_header._M_right = &this->_M_header; } ~itree() { _M_erase(_M_begin()); } size_t size() const{ return _M_node_count; } bool empty() const{ return !_M_node_count; } iterator insert(const key_type& key,const value_type& val){ //add stuff _Link_type __x = _M_begin(); _Link_type __y = _M_end(); while (__x != 0) { __y = __x; __x = _M_key_compare(key, _S_key(__x)) ? 
_S_left(__x) : _S_right(__x); } return _M_insert(__x, __y, key, val); } void clear() { _M_erase(_M_begin()); _M_leftmost() = _M_end(); _M_root() = 0; _M_rightmost() = _M_end(); _M_node_count = 0; } inline void erase(iterator& __position){ //kill it _Link_type __y = static_cast<_Link_type>(itree_rebalance_for_erase (__position._M_node, this->_M_header)); destroy_node(__y); --_M_node_count; #ifdef DEBUG_ITREE assert(verify()); #endif } iterator lower_bound(const key_type& key){ //find first value not less than key _Link_type __x = _M_begin(); // Current node. _Link_type __y = _M_end(); // Last node which is not less than __k. while (__x != 0) if (_M_edge_max_compare(key,__x)) __x = _S_right(__x); else __y = __x, __x = _S_left(__x); if(_M_end() == __y) return iterator(__y); if(key.overlaps(__y->key)){ return iterator(__y); } return next_in(key,__y); } inline iterator next_in(const key_type& key,iterator after){ _Link_type tmp=after._M_node; return next_in(key,tmp); } iterator next_in(const key_type& key,_Link_type after){ _Link_type __x = node_type::itree_increment(after); _Link_type __z = _M_end(); while(__x != _M_end() && !_M_stop_start_compare(key,_S_key(__x))){ //mmm O(log n) if(key.overlaps(__x->key)){ __z=__x; break; } //a sneaky modified increment. 
can skip left traversals if max is < key.start if (__x->_M_right != 0) { __x = __x->_M_right; while (__x->_M_left != 0 && !((__x->_M_left->max)<(key.start))) __x = __x->_M_left; } else { node_type* __y = __x->_M_parent; while (__x == __y->_M_right) { __x = __y; __y = __y->_M_parent; } if (__x->_M_right != __y) __x = __y; } } return iterator(__z); } iterator upper_bound(const key_type& key){ //find first value a where key equal_range(const key_type& key){ return pair(lower_bound(key),upper_bound(key)); } range_iterator in_range(const key_type& key){ return range_iterator(lower_bound(key),key,&_M_header); } std::ostream& drawTree(std::ostream& os){ return drawSubTree(os,_M_root()); } std::ostream& drawSubTree(std::ostream& os,_Link_type __x){ if(__x){ os << "("; drawSubTree(os,__x->_M_left); os << " , " << *__x << " , "; drawSubTree(os,__x->_M_right); os << ")"; }else{ os << "#"; } return os; } bool verify(){ // std::cerr << "Verifying: " << *this<_M_header._M_left == _M_end() && this->_M_header._M_right == _M_end(); unsigned int __len = _itree_black_count(_M_leftmost(), _M_root()); for (iterator __it = begin(); __it != end(); ++__it) { _Link_type __x = __it._M_node; _Link_type __L = _S_left(__x); _Link_type __R = _S_right(__x); if (__x->_M_color == _S_red){ if ((__L && __L->_M_color == _S_red) || (__R && __R->_M_color == _S_red)){ std::cerr << "** red node children both black failed"<_M_color == _S_black) ++__sum; if (__node == __root) break; __node = __node->_M_parent; } while (1); return __sum; } }; template std::ostream& operator<<(std::ostream& os,itree<_IntType,_Value,_Alloc>& t){ return t.drawTree(os); } template std::ostream& operator<<(std::ostream& os,const itree_node<_IntType,_Value>& t){ return os << ((t._M_color == _S_red) ? 
"+":"-") << (t.key) << "%" << (t.max); } } #endif murasaki/src/align-best.cc0000644000177700001440000001271411434752234015107 0ustar krispusers//compare 2 alignments /* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ #include #include #include #include #include #include #include #include #include #include #include "dinkymath.h" #include "itree.h" #include "alignments.h" #include using namespace std; namespace fs = boost::filesystem; string program_help(); string program_version(); //function decl double compareAlignment(Alignment &ref,Alignment &test); void printResults(vector > &results); //globals ProgressTicker ticker(100); bool debug=false; int main(int argc,char** argv){ int optc; uint growAnchors=0; ostream *os=&cout; string outfile; string statTarget="ungappedscore"; using boost::lexical_cast; using boost::bad_lexical_cast; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"help",0,0,'?'}, {"version",0,0,'v'}, {"grow",1,0,'g'}, {"output",1,0,'o'}, {"debug",0,0,'d'}, {"stat",1,0,'s'}, {0,0,0,0} }; int longindex=0; optc=getopt_long(argc,argv,"?vg:o:",long_options,&longindex); if(optc==-1)break; switch(optc){ case 'v': cout << program_version();exit(-1);break; case '?': cout << program_help();exit(-1);break; case 'g': if(optarg){ string optstr(optarg); if(optstr==string("0") || 
optstr==string("false")){ growAnchors=0; break; } try{ growAnchors=lexical_cast(optstr); } catch(bad_lexical_cast& e){ cerr << "Bad argument to --grow"< files; for(int i=optind;i inputs(alignmentCount); typedef multimap::iterator> > BestAnchorMap; BestAnchorMap bestAnchors; for(uint i=0;i parts;reader.readline(parts);parts.clear()){ Anchor& a=inputs[i].add(parts); double stat; statfh >> stat; if(!statfh) err(-2,"Stat file (%s) ended before anchor file (%s)",statfile.c_str(),files[i].c_str()); bestAnchors.insert(BestAnchorMap::value_type(stat,BestAnchorMap::mapped_type(i,a.backref))); anchors++; } cout << "Loaded "< overlaps; list::iterator ai=bami->second.second; vector parts; if(debug)cout << "Confirming "<< *ai; for(size_t s=0;sparts.size();s++){ region r=ai->parts[s].key(); if(debug)cout << s << " "<< r << " = "; if(isZero(r)){ if(debug)cout << "zero (skip)"< overlaps; if(result.fetchAnchors(s,r,overlaps)){ if(debug)cout << "overlaps "<tick(); } cerr << "Done. Writing "<flush(); delete os; // i dont like this, but it'll do... } return 0; } string program_help(){ return string("\ Usage: align-best [options] alignment1 [alignment2 ... ]\n\ \n\ Options\n\ *Takes an argument (like --maxres 3 or -m3)\n\ --grow|v = grow all anchors by some amount\n\ --output|o = store output to a separate file (otherwise, it's stdout)\n\ --stat|s = select anchors based on stat X\n\ \n\ *Toggles: (just --merge or -b)\n\ --help|h = this message\n\ --version|v = version string\n\ "); } string program_version(){ return string("align-best v0.1"); } murasaki/src/ecohash.cc0000644000177700001440000000514411434752233014472 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ //////// // super-memory-miserly hash list storage (def.) // Kris Popendorf /////// #include "ecohash.h" #include "ecolist.h" #include "murasaki.h" #include "options.h" using namespace std; EcoHash::EcoHash(BitSequence *pat) : Hash(pat), sorted(hash_size) { fasta=new Ecolist[hash_size]; } EcoHash::~EcoHash(){ delete[] fasta; } void EcoHash::clear(){ for(word i=0;i &sets){ if(emptyAt(key)) return; if(!sorted.checkAndSet(key)) fasta[key].sort(); Ecolist::iterator start(fasta[key].begin()),stop(fasta[key].end()); for(Ecolist::iterator si=start;si!=stop;){ sets.push_back(LocList(seq_count)); LocList &locList=sets.back(); for(Ecolist::iterator setEnd(fasta[key].upper_bound(*si));si!=setEnd;++si){ val_type l(*si); locList[l.first].push_back(val2loc(l)); } } } void EcoHash::lookup(HashKey key,LocList &locList){ Ecolist::iterator start(fasta[key].begin()),stop(fasta[key].end()); for(Ecolist::iterator i=start;i!=stop;++i){ val_type l(*i); locList[l.first].push_back(val2loc(l)); } } bool EcoHash::lessthan(const val_type& a,const val_type& b){ Location la(val2loc(a)),lb(val2loc(b)); return la (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ /////////////// // murasaki - multiple genome global alignment program // by Kris Popendorf (krisp@dna.bio.keio.ac.jp) ///////////// // murasaki.cc // main program control and such ////////// #include #include #include #include #include #include #include #include #include "murasaki.h" #include "timing.h" #include "dinkymath.h" #include "sequence.h" #include "msethash.h" #include "ecohash.h" #include "openhash.h" #include "murasaki_mpi.h" #include "arrayhash.h" #include "mingw32compat.h" #include #include #include #include //screw "sscanf" =P #include #include #include //for random_shuffle #ifdef __FreeBSD__ #include #include #include #include #endif #ifdef __APPLE__ #include #include #include #endif #ifdef __MINGW32__ #include #include #include #endif #ifdef MURASAKI_MPI #include "mpi.h" #endif using namespace std; int seq_count=0; Hash *mfh; //multi-fasta-hash vector seqs; word usedBuckets=0,usedPats=0,worstPat=0; UsedMap *anchors; RepeatMap *repeats=0; string version_string(PROGRAMVERSION); string pattern_src;//the pattern hunter default 18long int patLength; BitSequence* globalPat; char nodename[81]; int debugbits=-22; SystemInfo sysinfo; Options opt; //runtime option manager ProgressTicker ticker(100); int verbose=0; //ubersneaky debug switch word longestSeq=0; word totalSeqLength=0; word totalHashLength=0; word totalKeysUsed=0,totalSeedsSeen=0; word *dfCount=0; word totalAnchoredLocs=0; ofstream *entropy_fh=0,*anchorProgress_fh=0; int main (int argc,char **argv){ MurasakiRuntime *murasakiRunner=0; murasakiRunner=new MurasakiRuntime(argc,argv); cout << "Murasaki finished!"< Init "<",mpi_id); if(mpi_procs<2){ if(opt.verbose) cerr << "MPI>MPI is useless (and in fact impossible) with fewer than 2 processes. 
Disabling."<murasaki/MPI rank " << mpi_id << "/"<=argc) cout << "Error: Need at least one input sequence."<(pattern_src.data()),pattern_src.size(),MPI_CHAR,0,MPI_COMM_WORLD); //everybody use 0's pattern #endif writeOut(opt.pat_record,pattern_src); //solidfy parameters seq_count=args.size(); if(seq_count==1) opt.hashOnly=true; //can't anchor with only 1 sequence. opt.solidify(); if(opt.anchorProgressCheck) anchorProgress_fh=new ofstream(opt.anchorProgress_record.c_str()); ofstream opf(opt.options_record.c_str()); opf << opt; opf << platformInfo(); opf.close(); cout << opt; #ifdef MURASAKI_MPI //solidfy mpi data if(mpi_capable) mpi_init(); #endif } void MurasakiRuntime::prep(){ writeOut(opt.status_record,"Loading sequences...",true); //load sequences ofstream outf(opt.seq_record.c_str()); seqLoadStart.reset(); for(int i=0;iname<<" loaded. "<length())<(longestSeq,seqs[i]->length()); totalSeqLength+=seqs[i]->length(); totalHashLength+=((opt.skipRev || opt.skipFwd) ? seqs[i]->fwd->getCounted():seqs[i]->fwd->getCounted()*2)/opt.hashSkip; } set_seqtable(seqs); // prepare for hashing: can we skip this process in future version? Timer seqLoadStop; outf.close(); cout << "Sequence loading took: "<length()<<" bases long with weight "<hashLength()<hashLength(); patLength=pat->length(); globalPat=pat; double idealHash=pow((double)4.,(double)(hashLength)); hash_max=intpow(2,max_hash_bits); int hashBits=min(max_hash_bits,hashLength*2); bool memoryFitting=false; #ifdef MURASAKI_MPI if(mpi_capable){ mpi_initJobs(); if(opt.auto_hashbits){ int oldHashBits=hashBits; int storageHosts=mpi_hostLeader_byName.size(); while((storageHosts>>=1)>0) //every time we double the number of hosts we're on, we double hash table size hashBits++; if(hashBits>hashLength*2) hashBits=hashLength*2; if(oldHashBits!=hashBits) cout << "Using "< Input sequence is short, so using OpenHash."< Switching hash function to "<1){ if(!memoryFitting) cout << " ### This is more memory than is available in your system." 
<< endl << " ## Murasaki will try reducing the number of hashbits used..." << endl << " # (You can override this behaviour by specifying --hasbits manually)"<(0,0)); mpi_totalStorage=0; for(int i=0;i(start,stop); prevStop=stop; last=i; } } assert(last!=-1); mpi_storeOwn[last].second=hash_size-1; mpi_storeBrkStop[hash_size+1]=last; mpi_total_hash_size=hash_size; //my own personal hash settings: if(mpi_jobs[mpi_id]==JOB_STORAGE){ mpi_myStoreOffset=mpi_storeOwn[mpi_id].first; hash_size=mpi_storeOwn[mpi_id].second-mpi_storeOwn[mpi_id].first+1; } //split up hashing work (if we had some cpu information about each node maybe //we could split it by speed, but for now, split evenly) word hashChunkSize=globalCounted/(word)mpi_jobCount[JOB_HASHER]; mpi_hashShare.resize(mpi_procs,0); mpi_hashOwn.resize(mpi_procs,pair(0,0)); last=-1; int hasher=0; for(int i=0;i(start,start+hashChunkSize); last=i; hasher++; } } assert(last!=-1); mpi_hashOwn[last].second=globalCounted-1; //display a nice summary of what's going on on each node and why cout << "MPI>Node summary:"< node "<Total world memory: "<Total world storage: "<mpi_totalStorage){ cout << "**** Warning: Using this hash method, it's physically impossible"< Consider using EcoHash."< Consider using more nodes/memory."<mpi_totalStorage){ cout << "****>>>>>> Impending Disaster <<<<<<****" << endl << "****>> Murasaki " << (bestCaseMemory/1024>mpi_totalStorage ? 
"_will_":"may") <<" consume more memory than you have available to MPI.\n" << "****>> Consider changing hash settings (like using fewer hashbits).\n"; } }else{ #endif //obligatory warnings about memory usage if(linearHashCost/1024>sysinfo.totalMemory){ cout << "**** Warning: Using this hash method, it's physically impossible"< Consider using EcoHash or MPI across multiple nodes."< Consider using MPI across multiple nodes."<sysinfo.freeMemory) cout << "**** Warning: This is more memory you have free on your system.\n" << "**** You may encounter massive slowdowns if murasaki\n" << "**** gets swapped to disk.\n" << "****--------> Consider changing hash settings.\n"; if(avgCaseMemory/1024>sysinfo.totalMemory) cout << "****>>>>>> Impending Disaster <<<<<<****" << endl << "****>> Murasaki "<< (bestCaseMemory/1024>sysinfo.totalMemory ? "_will_":"may") <<" consume more memory than you have in your system.\n" << "****>> Consider changing hash settings.\n"; if(avgCaseMemory/1024>opt.targetMemory && avgCaseMemory/1024<=sysinfo.totalMemory){ cout << "** Warning: This is more memory than "<<(opt.userSetMemory ? "specified.":"recommended.")<90) cout << " (however you may run out of memory!)"; cout << "."<clear(); string cachename=hashCacheName(seqs[i]->name, pat); ifstream cachef; cachef.open(cachename.c_str(), ios::binary); if (cachef){ cachef.close(); cout << "cache hit, so skipping to hash " << seqs[i]->name << "." 
<< endl; continue; } } // Hash if(!opt.skipFwd){ cout << "\nHashing "+seqs[i]->name+" forwards\n"; writeOut(opt.status_record,"Hashing "+seqs[i]->name+" forwards\n",true); hashSeq(seqs[i]->fwd,pat,1,seqs[i]); } if(!opt.skipRev){ cout << "\nHashing "+seqs[i]->name+" backwards\n"; writeOut(opt.status_record,"Hashing "+seqs[i]->name+" backwards\n",true); hashSeq(seqs[i]->rev,pat,-1,seqs[i]); } // Save hashCache if(opt.hashCache){ string cachename=hashCacheName(seqs[i]->name, pat); ofstream cachef; cout << endl <<"Saving hash as: " << cachename << endl; cachef.open(cachename.c_str(), ios::trunc|ios::out|ios::binary); if(!cachef){ throw MurasakiException(string("can't write ")+cachename); } mfh->dump(cachef); cachef.close(); } // assert(mfh->sanityCheck()); // load from cache if(opt.hashCache){ mfh->clear(); for(unsigned i=(opt.histogram ? 0:1);iname, pat); ifstream cachef; cout << "Loading hash from: " << cachename << endl; cachef.open(cachename.c_str(), ios::binary); if(!cachef){ throw MurasakiException(string("can't load ")+cachename); } mfh->load(cachef); cachef.close(); } } } Timer hashStop; cout << "\nInitial hashing took: "<=(int)mpi_hasherIds.size()){ cout << " (sender "< senders(1,partner); //for now we only use 1 sender mpi_anchorMergeServer(senders); } } DistMergeDone: ; }else{ if(mpi_id==mpi_finalAssembler){ mpi_merge_client_mode(); } } break; case JOB_STORAGE: if(!opt.mpi_noCake) mpi_storage_client_mode(); else mpi_hashAndStore_client_mode(pat); //phase 1 complete MPI_Barrier(MPI_COMM_WORLD); if(opt.hashOnly) //skip extraction break; mpi_extract_client_mode(pat); break; default: cerr << "Please place the device on the ground and assume the party submission position. 
A Keio Science representative will arrive shortly to escort you to the party."<size()<<" repeats...";cout.flush(); writeRepeats(opt.repeat_record); } if(opt.histogram) mpi_write_histogram(); cout << endl; } } cout << "Counter tallies:"<count(); } MPI_Bcast(&anchorCount,1,MPI_UNSIGNED_LONG,mpi_finalAssembler,MPI_COMM_WORLD); //print them in one go (otherwise mpi tends to block on the endl buffer flushes) for(int i=0;i(mpi_hashCount) << "\t" << sum(mpi_storeCount) << "\t" << sum(mpi_extractLocCount) << "\t" << sum(mpi_extractCount) << "\t" << sum(mpi_mergeCount) << "\t" << sum(mpi_anchorSendCount) << "\t" << sum(mpi_anchorRecvCount) << "\t" << humanTime(sum(mpi_workTime)) << endl; cout << "Total anchors: "<count()< collisionSizeHistogram; map seedHistogram; generateCollisionHistogram(collisionSizeHistogram,seedHistogram); writeMap(collisionSizeHistogram,opt.prefix+".collisions.histogram","hash collision histogram"); writeMap(seedHistogram,opt.prefix+".seed.histogram","seed count histogram"); } #ifdef HASHPROFILE cout << "Writing hash table performance data..."<writePerformanceData(opt.prefix+".hash.performance"); #endif if(!opt.hashOnly || opt.measureHashCollisions){ double optimal=((double)totalSeedsSeen)/((double)hash_size)*100.0; optimal=min(optimal,100.0); double usage=((double)totalKeysUsed)/((double)hash_size)*100.0; cout << "Total seeds used in anchors: "<size()<<" repeats...";cout.flush(); writeRepeats(opt.repeat_record); } Timer totalDone; cout << "Output writing finished in: "<writeOut(of); of.close(); ofstream ancDetails((anchor_record+".details").c_str()); if(opt.bitscore){ ofstream ancBitscore((anchor_record+".bitscore").c_str()); anchors->saveDetails(ancDetails,ancBitscore); ancBitscore.close(); }else{ anchors->saveDetails(ancDetails,cerr); } ancDetails.close(); if(opt.tfidf){ ofstream of((anchor_record+".stats.tfidf").c_str()); anchors->writeTfidf(of); } if(!opt.gappedAnchors){ ofstream of((anchor_record+".stats.ungappedscore").c_str()); 
anchors->writeScores(of); } } void writeRepeats(string record){ ofstream of(record.c_str()); repeats->writeOut(of); } void writeHisto(string hashHisto_record,string details){ ofstream histo(hashHisto_record.c_str()); if(opt.histogram) mfh->writeHistogram(histo); histo.close(); if(opt.histogram>=3){ cout << "Writing detailed histogram...";cout.flush(); ofstream histo(details.c_str()); mfh->writeDetailedHistogram(histo); histo.close(); } } void hashSeq(BitSequence *bitseq,BitSequence *pat,int sign,Sequence *s){ ticker.reset(bitseq->length()-1); for(SeqPosPairArray::iterator region=bitseq->matchRegions.begin(); region!=bitseq->matchRegions.end();region++){ //cout << "Valid region is "<first<<" to "<second<< " - patlen" << pat->length() << endl; SeqPos stop=region->second-pat->length()+2; SeqPos start=region->first+1; Window win(bitseq,region->first,pat); word hash; //don't need to init this int progress; // cout << "Hasing region from "<name<<"."<<(sign>0 ? "fwd":"rev")<<":"<"<emptyAt(base)) continue; totalKeysUsed++; // cout << "Got bits! 
("<sizeAt(base)<<" bits!)"< fulllist; mfh->getMatchingSets(base,fulllist); for(list::iterator seti=fulllist.begin();seti!=fulllist.end();++seti){ totalSeedsSeen++; LocList& loclist=*seti; bool gotAllSeqs=true; word combinations=1; int skips=opt.rifts; for(unsigned i=0;i0) skips--; else{ gotAllSeqs=false; } } } if(gotAllSeqs){ if(opt.mergeFilter && combinations>opt.mergeFilter){ if(repeats){ repeats->add(loclist); } goto ExtractLocDone; } list start; procLocs(loclist,start,opt.rifts,0); } ExtractLocDone: ; } } cout << endl; //for ticker Timer extractDone; cout << "Anchor extraction completed in: "<l.seq()->length())return false; if(l.pos<-l.seq()->length())return false; return true; } bool isFullLocList(const LocList &locList){ word skips=opt.rifts; for(unsigned i=0;i0) skips--; else return false; } return true; } void procLocs(LocList &locList,list &use,int skipsLeft,int level){ if(level0 ){ list temp(use); temp.push_back(Location(SeqIdx(0),SeqPos(0))); procLocs(locList,temp,skipsLeft-1,level+1); }else{ for(list::iterator i=locList[level].begin(); i!=locList[level].end(); i++){ list temp(use); assert(sanityCheck(*i)); temp.push_back(*i); procLocs(locList,temp,skipsLeft,level+1); } } return; } //got a full package! IntervalSet s; while(!use.empty()){ // assert(cout << " -> add "<::iterator i(s.spaces.begin()); while(!(*i).start){ ++i; assert(i!=s.spaces.end()); } SeqPos ref=(*i).start; for(i++;i!=s.spaces.end();i++) if((*i).start && (*i).start!=ref) goto HAS_OTHERS; //finished without finding a non-self reference return; //abort HAS_OTHERS:; //safe ^^ } //s ready! // cout << "inserting.."<insert(s); //check alreadyExists/merge/add whatever. 
do what you need to do } void generateCollisionHistogram(map &collisionSizeHistogram,map &seedHistogram){ ticker.reset(hash_size); for(word idx=0;idxemptyAt(idx)){ collisionSizeHistogram[0]++; continue; }else{ list fulllist; mfh->getMatchingSets(idx,fulllist); collisionSizeHistogram[fulllist.size()]++; if(opt.hashOnly){//we're collecting stats inplace of normal extract func totalKeysUsed++; totalSeedsSeen+=fulllist.size(); } for(list::iterator li=fulllist.begin();li!=fulllist.end();++li){ size_t freq=0; for(LocList::iterator si=li->begin();si!=li->end();++si) freq+=si->size(); seedHistogram[freq]++; } } } ticker.done(); } string platformInfo(){ char buf[1024]; sprintf(buf,"Platform information:\nWordsize: %u bits\n\ sizeof(word): %u bytes\n\ %s\n\ ",WORDSIZE,(unsigned)sizeof(word), sysinfo.toString().c_str()); return string(buf); } string program_version(){ list features; #ifndef NDEBUG features.push_back("DEBUG"); #endif #ifdef MURASAKI_MPI features.push_back("MPI"); #endif #ifdef LARGESEQ_SUPPORT features.push_back("LARGESEQ"); #endif #ifdef USE_SHM_MMAP features.push_back("MMAP"); #endif #ifdef USE_SHM_SYSV features.push_back("SYSV"); #endif #ifdef USE_LIBCRYPTOPP features.push_back("CRYPTOPP"); #endif #ifdef SVNREVISION features.push_back("SVN=" QUOTEME(SVNREVISION)); #endif #ifdef HGREVISION features.push_back("HG=" QUOTEME(HGREVISION)); #endif string featurestring; if(!features.empty()) featurestring=string(" (")+ (joinStrings(features,string(", "))) + string(")"); return "Murasaki version "+version_string+featurestring+"\n"; } #ifdef __FreeBSD__ int getsysctl(char* name){ int mib[4]; size_t len = 4; int size; sysctlnametomib(name, mib, &len); if (sysctl(mib, 4, &size, &len, NULL, 0) != -1) return size; return 0; } #endif SystemInfo::SystemInfo() : wordsize(WORDSIZE), totalMemory(0),freeMemory(0),swapTotal(0), unit(1024) { //platform specific mojo. 
// for Linux: #ifdef __linux__ using namespace boost; ifstream meminfo("/proc/meminfo"); const boost::regex memline("(\\w+):\\s+(\\d+)\\s(\\w+)"); boost::smatch results; string line; while(getline(meminfo,line)){ if(regex_search(line,results,memline)){ if(results[1]=="MemTotal") totalMemory=lexical_cast(results[2]); else if(results[1]=="MemFree") freeMemory+=lexical_cast(results[2]); else if(results[1]=="Buffers") freeMemory+=lexical_cast(results[2]); else if(results[1]=="Cached") freeMemory+=lexical_cast(results[2]); else if(results[1]=="SwapTotal") swapTotal=lexical_cast(results[2]); } } if(totalMemory>swapTotal) //good clue that memTotal is comprised of physical+swap, but sadly we don't have any other clues! =( totalMemory-=swapTotal; //don't count swap. we only want physical memory in total (a bit of a lie, I know) #endif // for FreeBSD #ifdef __FreeBSD__ int pagesize = getpagesize()/unit; unsigned int pa, pi, pw, pc, pf; pa = getsysctl("vm.stats.vm.v_active_count"); pi = getsysctl("vm.stats.vm.v_inactive_count"); pw = getsysctl("vm.stats.vm.v_wire_count"); pc = getsysctl("vm.stats.vm.v_cache_count"); pf = getsysctl("vm.stats.vm.v_free_count"); totalMemory = (pa + pi + pw + pc + pf) * pagesize; freeMemory = (pc + pf) * pagesize; #endif // for MacOS X #ifdef __APPLE__ vm_statistics_data_t page_info; host_basic_info maxmem_info; vm_size_t pagesize; mach_msg_type_number_t count; kern_return_t kret; unsigned int pf; pagesize = 0; kret = host_page_size (mach_host_self(), &pagesize); count = HOST_VM_INFO_COUNT; kret = host_statistics (mach_host_self(), HOST_VM_INFO, (host_info_t)&page_info, &count); if (kret == KERN_SUCCESS){ pf = page_info.free_count*pagesize; freeMemory = pf / unit; } count = HOST_BASIC_INFO_COUNT; kret = host_info (mach_host_self(), HOST_BASIC_INFO, (host_info_t)&maxmem_info, &count); if (kret == KERN_SUCCESS){ totalMemory = maxmem_info.max_mem / unit; } #endif // for Windoze #ifdef __MINGW32__ MEMORYSTATUSEX msex = { sizeof(MEMORYSTATUSEX) }; 
GlobalMemoryStatusEx( &msex ); freeMemory = (unsigned long)(msex.ullAvailPhys/unit); totalMemory = (unsigned long)(msex.ullTotalPhys/unit); #endif } string SystemInfo::toString() const{ char buf[1024]; sprintf(buf,"Total Memory: %s\n\ Available Memory: %s (%.2f%%)\n", humanMemory(totalMemory*unit).c_str(), humanMemory(freeMemory*unit).c_str(), ((double)freeMemory/(double)totalMemory*100.0)); return string(buf); } ostream& operator<<(ostream& of,const SystemInfo& a){ return of << a.toString(); } ostream& outputRegions(ostream &of){ for(unsigned i=0;ifwd; start: of << "Seq "<name)<<" strand "<<(a==seqs[i]->fwd ? "fwd":"rev")<matchRegions.size();j++) of << " Region from: "<matchRegions[j].first <<" to "<matchRegions[j].second<fwd){ a=seqs[i]->rev; goto start; } } return of; } void hashStatusCheck(int tick,SeqPos pos){ #ifdef NDEBUG return; //this function is really slow actually. rely on tee instead #endif ofstream of(opt.hashStatus_record.c_str()); of << "Done "<count()<(results[1]),length=lexical_cast(results[2]); if(weight>length) throw MurasakiException("Pattern weight must be less than or equal to pattern length"); if(weight<1) throw MurasakiException("Can't use patterns with weight <1"); switch(length){//degenerate cases case 2:pat="11";return true; case 1:pat="1";return true; } string randomStr; randomStr.reserve(length-2); for(int i=0;i::iterator i=seqs.begin();i!=seqs.end();i++) delete *i; seqs.clear(); } #ifdef MURASAKI_MPI if(mpi_capable){ cerr << "Terminating MPI"< (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ////////////////// // compare some cgr images // this is basically load, then multiply ///////////////// #include #include #include #include #include #include #include #include "dinkymath.h" #include "cgr.h" using namespace std; string program_help(); string program_version(); typedef vector > CgrSet; void doSubsets(CgrSet base,list remaining); void doNwise(CgrSet base, CgrSet remaining, unsigned n); typedef map Resmap; Resmap results; unsigned nwise=0; int main(int argc,char** argv){ int optc; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"help",0,0,'?'}, {"version",0,0,'v'}, {"nwise",1,0,'n'}, {0,0,0,0} }; int longindex=0; optc=getopt_long(argc,argv,"?vn:",long_options,&longindex); if(optc==-1)break; switch(optc){ case 'n': using boost::lexical_cast; using boost::bad_lexical_cast; try{ nwise=lexical_cast(optarg);} catch(bad_lexical_cast& e){ cerr << "Bad argument to --nwise."< sets; sets.push_back(CgrSet()); //starter set for(int seqi=optind;seqi(new PgmBuffer,inname)); try { sets.back().back().first->loadPGM(inname); } catch(PGMFileException e){ cerr << e.reason() << endl; cerr << inname << " ^^File skipped^^"<first << "\t" << i->second << endl; } } inline void setCmp(CgrSet &base){ CgrSet::iterator i=base.begin(); string resname=i->second; PgmBuffer* home=i->first; vector others; i++; for(;i!=base.end();i++){ others.push_back(i->first); resname+="-"+(i->second); } results[resname]=home->difference(others); } void doNwise(CgrSet base, CgrSet remaining, unsigned n){ if(base.size()==n){ //base step setCmp(base); }else{ //inductive step if(remaining.empty())return; //insufficient to finish for(CgrSet::iterator i=remaining.begin();i!=remaining.end();i++){ CgrSet temp; CgrSet::iterator j=i; j++; 
copy(j,remaining.end(),back_insert_iterator(temp)); base.push_back(*i); doNwise(base,temp,n); base.pop_back(); } } } //muwah! beautiful code void doSubsets(CgrSet base, list remaining){ if(remaining.size()){ //inductive step CgrSet mine=remaining.front(); remaining.pop_front(); for(CgrSet::iterator i=mine.begin(); i!=mine.end();i++){ base.push_back(*i); doSubsets(base,remaining); base.pop_back(); } }else{ //base case setCmp(base); } } string program_help(){ return string("\ Usage: cgr-compare [options] \n\ \n\ Options\n\ *Takes an option (like --maxres 3 or -m3)\n\ \n\ *Toggles: (just --merge or -b)\n\ "); } string program_version(){ return string("0.1"); } murasaki/src/murasaki_mpi.cc0000644000177700001440000014350011434752233015540 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ ////////////// // murasaki project // murasaki_mpi.cc // provides MPI-specific functions ////////////// #ifdef MURASAKI_MPI #include #include #include #include //for getpid #include "murasaki.h" #include "sequence.h" #include "dinkymath.h" #include "timing.h" #include "murasaki_mpi.h" //defined here for sanity's sake typedef struct mpi_req_t{ int type; HashKey key; HashVal val; } mpi_req; //globals int mpi_id; int mpi_procs; bool mpi_capable=false; vector mpi_hostnames; vector mpi_hostIds; map mpi_hostPopulation; map mpi_hostLeader_byName; vector mpi_hostLeader;//lowest id on same host bool mpi_isHostLeader,mpi_usingShm; int mpi_sysv_projid; word mpi_totalMemory,mpi_totalStorage; vector mpi_storeShare; vector > mpi_storeOwn; word mpi_myStoreOffset; map mpi_storeBrk; // start of hash region -> id map mpi_storeBrkStop; // stop+1 of hash region -> id vector mpi_hashShare; vector > mpi_hashOwn; map mpi_hashBrk; //start of sequence -> rank map mpi_hashPoints; //maps each hashBrk to an actual Location vector mpi_hashCount,mpi_storeCount,mpi_extractCount,mpi_extractLocCount,mpi_mergeCount,mpi_anchorSendCount,mpi_anchorRecvCount; vector mpi_workTime; word mpi_total_hash_size; vector mpi_worldMemory; vector mpi_jobs; vector mpi_jobCount(JOB_MAX,0); vector mpi_hasherIds; vector mpi_worldId2jobId; int mpi_myHasherId=-1; vector mpi_assemblerIds; //storage nodes map to assemblers, assembler nodes map to their index in mpi_hasherIds (will be handy for merging assembler -> assembler) int mpi_finalAssembler; //someone still has to write out const char *MPI_jobNames[]={"Hasher","Storage","--MAX--"}; MPI_Comm mpi_leaders_comm,mpi_localhost_comm,mpi_job_comm; int mpi_myHostId,mpi_myLeaderRank,mpi_myLocalRank,mpi_myJobRank; MPI_Datatype MPI_HashMessage_type; struct lt_mpi_worldMemory : binary_function { inline bool operator()(const int &a, const int &b) const { return mpi_worldMemory[a]==mpi_worldMemory[b] ? 
a { inline bool operator()(const int &a, const int &b) const { return mpi_worldMemory[a]==mpi_worldMemory[b] ? a>b:mpi_worldMemory[a]>mpi_worldMemory[b]; } }; void mpi_types_init(){ MPI_HashMessage msg; MPI_Datatype type[3]={MPI_UNSIGNED_LONG,MPI_INT,MPI_LONG}; int blocklen[3]={1,1,1}; MPI_Aint disp[3]; //displacement MPI_Aint base; MPI_Address(&msg.key,disp); MPI_Address(&msg.seqno,disp+1); MPI_Address(&msg.pos,disp+1); base=disp[0]; for(int i=0;i<3;i++)disp[i]-=base; MPI_Type_struct(3,blocklen,disp,type,&MPI_HashMessage_type); MPI_Type_commit(&MPI_HashMessage_type); //make sure we can send with it mpi_hashCount.resize(mpi_procs,0); mpi_storeCount.resize(mpi_procs,0); mpi_extractLocCount.resize(mpi_procs,0); mpi_extractCount.resize(mpi_procs,0); mpi_mergeCount.resize(mpi_procs,0); mpi_workTime.resize(mpi_procs,0); mpi_anchorSendCount.resize(mpi_procs,0); mpi_anchorRecvCount.resize(mpi_procs,0); } void mpi_init(){ mpi_usingShm=(opt.use_shm_mmap || opt.use_shm_sysv); cout << "Synchronizing MPI nodes..."<::iterator, bool> res(mpi_hostLeader_byName.insert(pair(sbuf,i))); if(res.second)//new host! hostId++; mpi_hostIds.push_back(hostId); if(mpi_id==i){ mpi_myHostId=hostId; if(res.second || !mpi_usingShm || (opt.use_shm_mmap && !opt.mmap_writePerHost && mpi_id==0)){ //new entry mpi_isHostLeader=true; }else{ mpi_isHostLeader=false; } } mpi_hostLeader[i]=(mpi_usingShm ? mpi_hostLeader_byName[sbuf]:i); } //especially if we're using shm we have to create new comm objects per host and for all leaders if(opt.verbose)cout << "Setting up leader/drone channel (my color: "< node_memoryRank(mpi_procs); for(int i=0;i jobsAssigned; //keep track of how many storage jobs have been assigned to each node int maxJobs=0; int mid=opt.mpi_hashers ? opt.mpi_hashers:mpi_procs/4; if(mid<=0) mid=1; //need at least 1 to do final assembly! 
mpi_hasherIds.resize(mid,-1); int hasherId=mid; vector > hashersOn(mpi_hostnames.size()); //for later use mpi_assemblerIds.resize(mpi_procs,-1); for(int i=0;i::iterator picki=node_memoryRank.begin(); for(int pick=0;pick<(int)node_memoryRank.size();pick++,++picki) if(!opt.mpi_hostbalance || jobsAssigned[mpi_hostnames[node_memoryRank[pick]]]<=maxJobs){ node=node_memoryRank[pick]; mpi_jobs[node]=job; jobsAssigned[mpi_hostnames[node]]++; node_memoryRank.erase(picki); goto NodePicked; } //couldn't successfully find one, so, up maxJobs and try again maxJobs++; }while(node<0); NodePicked: if(job==JOB_HASHER){ hasherId--; assert(hasherId>=0); if(node==mpi_id) mpi_myHasherId=hasherId; mpi_hasherIds[hasherId]=node; hashersOn[mpi_hostIds[node]].push_back(node); } mpi_jobCount[job]++; } //pick final assemblers if(!opt.mpi_distMerge && opt.mpi_distCollect){ //can only use 1 out of n hashers as merger, so pick carefully multimap assemblingFor; // client count -> server id vector assemblingCount(mpi_procs); for(int i=0;i::iterator hi=hashersOn[mpi_hostIds[i]].begin(); hi!=hashersOn[mpi_hostIds[i]].end(); ++hi) assemblingFor.insert(pair(assemblingCount[*hi],*hi)); if(!assemblingFor.empty()){ int hi=assemblingFor.begin()->second; //most unused on this node mpi_assemblerIds[i]=hi; assemblingCount[hi]++; } } //for the remaining nodes to quickly find the least used node for(vector::iterator hi=mpi_hasherIds.begin();hi!=mpi_hasherIds.end();++hi){ assemblingFor.insert(pair(assemblingCount[*hi],*hi)); } for(int i=0;i::iterator hi=assemblingFor.begin(); mpi_assemblerIds[i]=hi->second; assemblingCount[hi->second]++; assemblingFor.erase(hi); assemblingFor.insert(pair(assemblingCount[mpi_assemblerIds[i]],mpi_assemblerIds[i])); } }else{ //just to make this mpi_assemblerIds data consistent, set it anyway... 
for(int i=0;i location" coordinate map // mpi_fillHashPoints(); //now that we have jobs, also init job channel if(opt.verbose)cout << "Setting up job channel (my color: "<fwd; fillRegionBrks_start: if((a==seqs[seqi]->fwd && !opt.skipFwd) || (a==seqs[seqi]->rev && !opt.skipRev)) for(unsigned j=0;jmatchRegions.size();j++){ SeqPos start=a->matchRegions[j].first+1; if(a==seqs[seqi]->rev) //flip start=0-start; mpi_hashPoints[count]=Location(seqi,start); // cout << "Defining global "<matchRegions[j].second-a->matchRegions[j].first))<<" as a "<<(a->matchRegions[j].second-a->matchRegions[j].first+1)<<" long region at: "<matchRegions[j].second-a->matchRegions[j].first+1; //this needs to be the number of bases -in that region- } if(a==seqs[seqi]->fwd){ a=seqs[seqi]->rev; goto fillRegionBrks_start; } } assert(count==globalCounted); } GlobalHashPointIterator::GlobalHashPointIterator() : _count(0), _seqi(0), _bs(seqs[_seqi]->fwd), _regi(0) { } GlobalHashPointIterator::GlobalHashPointIterator(word x) : _count(0), _seqi(0), _bs(seqs[_seqi]->fwd), _regi(0) { seek(x); } void GlobalHashPointIterator::seek(word x){ while(_count+span()-1Seeking for "<End seek for "<matchRegions.size()<<" span=" <<(_bs->matchRegions[_regi].second-_bs->matchRegions[_regi].first+1) <<" count="<<_count; return os.str(); } word GlobalHashPointIterator::counted() const {return _count;} word GlobalHashPointIterator::span() const { // cout << "Span: "<matchRegions[_regi].second<<"-"<<_bs->matchRegions[_regi].first<<"+1 ->"<<(_bs->matchRegions[_regi].second-_bs->matchRegions[_regi].first+1)<matchRegions[_regi].second-_bs->matchRegions[_regi].first+1; } pair GlobalHashPointIterator::operator*() const { SeqPos start=_bs->matchRegions[_regi].first+1; if(_bs==seqs[_seqi]->rev) //flip start=0-start; return pair(_count,Location(_seqi,start)); } GlobalHashPointIterator& GlobalHashPointIterator::operator++(){ assert(!atLast()); _count+=span(); // cout << "Advancing "<matchRegions.size()){ //still in this bitseq, 
advance region ++_regi; return *this; } //bitseq is over, switch fwd to rev if possible _regi=0; if(_bs==seqs[_seqi]->fwd){ _bs=seqs[_seqi]->rev; return *this; } //sequence is over, advance to next seq ++_seqi; _bs=seqs[_seqi]->fwd; return *this; } bool GlobalHashPointIterator::atLast() const { return _seqi+1>=seqs.size() && _bs!=seqs[_seqi]->fwd && _regi+1>=_bs->matchRegions.size(); } bool GlobalHashPointIterator::sanityCheck() const { GlobalHashPointIterator test(*this); while(!test.atLast()) ++test; cout << "Sanity check: counted="<"<<(test.counted()+test.span())<<"=="<Hasher client starting.",true); //prepare message buffers int peakBufferBlocks=0; vector > sendBuffers(mpi_procs); for(int i=0;iglobalPos); Location loc=(*locIte).second; assert(globalPos>=(*locIte).first); BitSequence *bitseq=loc.bitSeq(); Sequence *s=loc.seq(); int sign=bitseq==s->fwd ? 1:-1; // cout << "Global increment: "<matchRegions.end(); for(;globalPosfirst+offset+1; //location style coords (start at 1), but always positive SeqPos stop=region->second-pat->length()+2; Window win(bitseq,start-1,pat); //bitseq coords word hash; SeqPos p; for(p=start;p<=stop;p+=opt.hashSkip){ hash=win.hash(); mpi_hashCount[mpi_id]++; Location here(s,sign<0 ? 0-p:p); //select target storage node map::iterator ite=mpi_storeBrkStop.upper_bound(hash); assert(ite!=mpi_storeBrkStop.end()); assert(hashfirst); //must be within target's hash range! 
int target=(*ite).second; assert(target=0); assert(mpi_jobs[target]==JOB_STORAGE); assert(mpi_storeOwn[target].first<=hash); assert(mpi_storeOwn[target].second>=hash); sendBuffers[target].back().messages[sendBuffers[target].back().used++].set(hash,here); // cout << "Hashed global "<=mpi_bufferSize){ assert(sendBuffers[target].back().used<=mpi_bufferSize); #ifndef NDEBUG //make sure everything in that buffer is sane for(int mi=0;mi=mpi_storeOwn[target].first); assert(sendBuffers[target].back().messages[mi].key<=mpi_storeOwn[target].second); } // assert(cout << "Sending "<::iterator bite=sendBuffers[target].begin(); bite!=sendBuffers[target].end();){ list::iterator here=bite; ++bite; send_done=0; if(here->stored){ if(opt.mpi_maxBuffers && sendBuffers[target].size()>opt.mpi_maxBuffers){ //if currently overly full... MPI_Wait(&(here->request),&status); send_done=1; }else{ MPI_Test(&(here->request),&send_done,&status); } if(send_done){ sendBuffers[target].erase(here); } } else{break;} //these are guaranteed to be sent in order, so anything after here also hasn't been sent. } }while(opt.mpi_maxBuffers && sendBuffers[target].size()>opt.mpi_maxBuffers); sendBuffers[target].push_back(MessageBlock()); //add a fresh buffer peakBufferBlocks=max(peakBufferBlocks,sendBuffers[target].size()); } // NextHash: ticker.tick(globalPos-globalStart); if(opt.hashSkip>1) win.slide(opt.hashSkip*2); else win.slide(); globalPos+=opt.hashSkip; if(globalPos>=globalEnd){ goto DoneHashing; } } //without any hashSkip p ends a pattern length away from the region end, so need to add that in. //with pattern skips, it might go over, in which case subtraction is correct. 
globalPos+=(region->second-p+2); #ifndef NDEBUG // SeqPos regionLength=(region->second-region->first)+1-initialOffset; // assert(globalPos==regionLength+initialGlobalPos); #endif } } DoneHashing: cout << endl << "Adding 'finish' notifications to the send queue..."<::iterator bite=sendBuffers[target].begin(); bite!=sendBuffers[target].end();++bite){ if(!bite->stored){ assert(bite->used<=mpi_bufferSize); #ifndef NDEBUG //make sure everything in that buffer is sane for(int mi=0;miused;mi++){ assert(bite->messages[mi].key>=mpi_storeOwn[target].first); assert(bite->messages[mi].key<=mpi_storeOwn[target].second); } // assert(cout << "Sending "<used<<" messages to "<messages,bite->used*sizeof(MPI_HashMessage),MPI_BYTE,target,0,MPI_COMM_WORLD,&(bite->request)); bite->stored=1; //tag as sent. } } } } Timer hashDone; cout << "Hashing computations done in: "<::iterator bite=sendBuffers[target].begin(); bite!=sendBuffers[target].end();){ list::iterator here=bite; ++bite; send_done=0; if(here->stored){ MPI_Wait(&(here->request),&status); sendBuffers[target].erase(here); } } } } } Timer hashSendComplete; cout << "Delivery of remaining messages took: "< > ReceiverQueue; list receiverQueues; //local storage after pulling off the mpi stack word packets=0; vector doneNodes(mpi_procs,0); for(int node=0;node(1))); //start with an empty block MessageBlock &block=receiverQueues.back().second.front(); MPI_Irecv(block.messages,mpi_bufferSize*sizeof(MPI_HashMessage),MPI_BYTE,receiverQueues.back().first,0,MPI_COMM_WORLD,&block.request); //start receiving } list::iterator qi(receiverQueues.begin()),qinext=qi; while(nodesLeft){ //until we get a "done" message from everyone. qi=qinext; ++qinext; if(qinext==receiverQueues.end()) qinext=receiverQueues.begin(); list &toStore=qi->second; MPI_Request &recv_req=toStore.back().request; if(toStore.empty() && doneNodes[qi->first]){ cout << "Finished storing all data from "<first <<" early. 
Removing from queue."<(peakBufferBlocks,toStore.size()); int recv_done=0; MPI_Status recv_status; //store more crap in the hash do { if(toStore.front().used){ //if we have crap.. mpi_storage_store(toStore); } }while(opt.mpi_maxBuffers && toStore.size()>opt.mpi_maxBuffers && toStore.front().used); if(doneNodes[qi->first]) //no more messages to receive here continue; //see if we got a message... MPI_Test(&recv_req,&recv_done,&recv_status); if(!recv_done){//no message yet continue; } //got a message packets++; MPI_Get_count(&recv_status,MPI_BYTE,&(toStore.back().used)); #ifndef NDEBUG if(recv_status.MPI_SOURCE!=qi->first){ cout << "Bogon packet! Supposed to be received from "<first<<" but came from "<first); //better have received this from the node we thought was sending! assert((toStore.back().used%sizeof(MPI_HashMessage))==0); toStore.back().used/=sizeof(MPI_HashMessage); if(!toStore.back().used){ //got an empty (ie: done!) cout << "Node "<=mpi_myStoreOffset)){ cerr << "Fresh off the wire from "<=mpi_myStoreOffset); assert(msg.keyfirst,0,MPI_COMM_WORLD,&toStore.back().request); //start receiving } } Timer receiveDone; cout << "Finished receiving data in: "<::iterator mi=qi->second.begin();mi!=qi->second.end();++mi) remaining+=mi->used-mi->stored; if(remaining){ cout << "Finishing up storage of final "< &toStore=qi->second; //final mad dash to finish while(!toStore.empty()){ mpi_storage_store(toStore); ticker.tick(); } } cout << endl; } Timer storageDone; mpi_workTime[mpi_id]+=diff(storageDone,storageStart); cout << "Most message blocks in queue at any one time: "< &toStore){ MPI_HashMessage &msg=toStore.front().messages[toStore.front().stored++]; mpi_storeCount[mpi_id]++; #ifndef NDEBUG if(!(msg.key>=mpi_myStoreOffset)){ cerr << "Oh noes! "<=2){ MPI_HashMessage &prior=toStore.front().messages[toStore.front().stored-2]; cerr << "prior: key="<=mpi_myStoreOffset); //err, why are we receiving someone else's messagse?? 
} #endif msg.key-=mpi_myStoreOffset; //subtract to match my personal offset assert(msg.keyadd(msg.key,Location(msg.seqno,msg.pos)); if(toStore.front().stored>=toStore.front().used){//we're done with that buffer toStore.erase(toStore.begin()); } } void mpi_merge_client_mode(){ list todo; //things with a header & body list pending; //things with just a header int nodesLeft=mpi_jobCount[JOB_STORAGE]; vector doneNodes(mpi_procs,0);//just to make sure word packets=0; word mergeProgressDelay=opt.anchorProgressCheck; word mergeProgressCheck=mergeProgressDelay; if(!opt.mpi_distMerge){ nodesLeft=0; for(uint i=0;i<(uint)mpi_procs;i++) if(mpi_assemblerIds[i]==mpi_id && mpi_jobs[i]==JOB_STORAGE){ nodesLeft++; } } cout << "Hasher "<header,sizeof(MPI_AnchorHeader),MPI_BYTE,MPI_ANY_SOURCE,0,MPI_COMM_WORLD,&incoming->headerReq); spareTime=false; } MPI_Test(&incoming->headerReq,&incoming->headerStored,&status); if(incoming->headerStored){ //is it done? incoming->owner=status.MPI_SOURCE; int payloadSize; MPI_Get_count(&status,MPI_BYTE,&payloadSize); if(!payloadSize){ //done msg cout << "Node "<owner<<" announced being done. 
"<<(nodesLeft-1)<<" left."<owner>=0); assert(incoming->ownerowner]==JOB_STORAGE); assert(!doneNodes[incoming->owner]); doneNodes[incoming->owner]=1; nodesLeft--; delete incoming; incoming=0; }else{ incoming->locs.resize(incoming->header.count); pending.push_back(incoming); MPI_Irecv(&incoming->locs.front(),sizeof(Location)*incoming->header.count,MPI_BYTE,incoming->owner,1,MPI_COMM_WORLD,&incoming->bodyReq); //note: tag 1 incoming=0; spareTime=false; } } } // word checked=1; //for debug only //check status of pending receives for(list::iterator i=pending.begin();i!=pending.end();){ MPI_Test(&(*i)->bodyReq,&(*i)->bodyStored,&status); if((*i)->bodyStored){ int check_count; MPI_Get_count(&status,MPI_BYTE,&check_count); check_count/=sizeof(Location); assert((word)check_count==(*i)->header.count); list::iterator temp=i; todo.push_back(*i); ++i; pending.erase(temp); packets++; spareTime=false; }else{ ++i; } } //process a loc set if we have it. do { if(((opt.mpi_maxBuffers && todo.size()>=opt.mpi_maxBuffers) || spareTime) && !todo.empty()){ mpi_procAnchorBlock(todo); if(anchorProgress_fh && mergeProgressDelay && !(--mergeProgressCheck)){ SystemInfo infoNow; *anchorProgress_fh<count()<<"\t"<=opt.mpi_maxBuffers); } Timer anchorReceiveDone; cout << "Collected "<count()<<"\t"<count()< &todo){ //demux into loclist LocList l(seq_count); AnchorBlock* block=todo.front(); activeHash=block->header.hashKey; //set up variables to pretend we're a non-mpi run // cout <<"Processing a "<header.count<<" block from "<owner<::iterator i=block->locs.begin();i!=block->locs.end();++i){ l[i->seqno].push_back(*i); } if(opt.mergeFilter){ word combinations=1; for(int s=0;sopt.mergeFilter){ if(repeats) repeats->add(l); goto BlockDone; } } } if(1){ list start; //localize this procLocs(l,start,opt.rifts); //and go } BlockDone: delete block; //free that block todo.erase(todo.begin()); //and kill it mpi_mergeCount[mpi_id]++; } void mpi_extract_client_mode(BitSequence *pat){ cout << "Preparing receiver 
scheduler..."< receivers; //list of receivers to choose from if(opt.mpi_distMerge){ receivers.reserve(mpi_hasherIds.size()); for(int i=mpi_myJobRank%mpi_hasherIds.size();i<(int)mpi_hasherIds.size();i++) receivers.push_back(mpi_hasherIds[i]); for(int i=0;i::iterator> bufferItes; //per receiver iterator to receiverQ index> multimap buffers; // buffers -> rank (only used if we have a max_buffer limit) int prevChoice=0; vector > sent(receivers.size()); //need a separate queue for each target if(opt.mpi_maxBuffers){ bufferItes.reserve(receivers.size()); for(uint i=0;i(opt.mpi_maxBuffers,i))); } } cout << "Extracting anchors from hash-space."<emptyAt(base)) continue; list fulllist; mfh->getMatchingSets(base,fulllist); for(list::iterator seti=fulllist.begin();seti!=fulllist.end();++seti){ LocList &loclist=*seti; if(opt.mergeFilter && !opt.repeatMap){ //if we'll be culling repeats, but not storing them, we can eliminate the repeats here before sending word combinations=1; for(int s=0;sopt.mergeFilter){ continue; //abort here before even creating the AnchorBlock pointer. } } AnchorBlock* assemble=new AnchorBlock(base); //dump matches in here bool gotAllSeqs=true; word skips=opt.rifts; for(int s=0;s0) skips--; else{ gotAllSeqs=false; break; } } //(re)mux all the locsublists for(LocSubList::iterator site=loclist[s].begin();site!=loclist[s].end();++site) assemble->locs.push_back(*site); } if(gotAllSeqs){ int receiver; if(opt.mpi_maxBuffers){ while(buffers.empty() || (*(buffers.rbegin())).first<1){ //make sure we have a buffer available /* cout << "Waiting for available buffer"<::iterator ite=buffers.end(); --ite; //can't erase reverse iterators, so find end manually... pair choice(*ite); buffers.erase(ite); //pop assert(choice.first>0); //must have receive buffers available. receiver=choice.second; if(choice.first>1) //any buffers left on that host? 
bufferItes[receiver]=buffers.insert(buffers.begin(),pair(choice.first-1,choice.second)); //if so, put it back on (-1 buffer) (and because it's now -1 buffer, it probably goes at the front, so use that as a hint) else bufferItes[receiver]=buffers.end(); }else{ int choice=++prevChoice; //round robin if((uint)choice>=receivers.size()) choice=0; receiver=choice; } int receiverId=receivers[receiver]; // cout << "Will be sending to "<header.count=assemble->locs.size(); assemble->header.hashKey=base+mpi_myStoreOffset; assert(assemble->header.count>=(unsigned)seq_count-opt.rifts); //must have at least that many... mpi_extractLocCount[mpi_id]+=assemble->header.count; MPI_Issend(&assemble->header,sizeof(MPI_AnchorHeader),MPI_BYTE,receiverId,0,MPI_COMM_WORLD,&(assemble->headerReq)); MPI_Issend(&assemble->locs.front(),sizeof(Location)*assemble->header.count,MPI_BYTE,receiverId,1,MPI_COMM_WORLD,&(assemble->bodyReq)); //note: sent with tag 1 so we can differentiate it from a new header. sent[receiver].push_back(assemble); //tag as sent mpi_extractCount[mpi_id]++; } else { // cout << "Negative! 
"<locs.size()<<" locs"< > &sent,vector::iterator> &bufferItes,multimap &buffers){ MPI_Status status; int bodyStored; bool packetsLeft=0; for(uint receiver=0;receiver::iterator ite=sent[receiver].begin(); ite!=sent[receiver].end();){ packetsLeft++; AnchorBlock& block=**ite; if(!block.headerStored) MPI_Test(&block.headerReq,&block.headerStored,&status); if(block.headerStored){ MPI_Test(&block.bodyReq,&bodyStored,&status); if(bodyStored){ list::iterator temp(ite); ++ite; delete *temp; sent[receiver].erase(temp); if(opt.mpi_maxBuffers){ //give ourselves another buffer for this receiver multimap::iterator mite(bufferItes[receiver]); int count=0; if(mite!=buffers.end()){ count=(*mite).first; buffers.erase(mite); } bufferItes[receiver]=buffers.insert(pair(count+1,receiver)); } }else{ ++ite; } }else{ break; //packages are received in FIFO order } } return packetsLeft; } void mpi_hashAndStore_client_mode(BitSequence *pat){ //for the nocake option cout << "There is no cake. (Launching cakeless hash+storage mode)."<name+" forwards\n"; mpi_hashSeqLocal(seqs[i]->fwd,pat,1,seqs[i]); } if(!opt.skipRev){ cout << "Hashing "+seqs[i]->name+" backwards\n"; mpi_hashSeqLocal(seqs[i]->rev,pat,-1,seqs[i]); } } } void mpi_hashSeqLocal(BitSequence *bitseq,BitSequence *pat,int sign,Sequence *s){ ticker.reset(bitseq->length()-1); for(SeqPosPairArray::iterator region=bitseq->matchRegions.begin(); region!=bitseq->matchRegions.end();region++){ SeqPos stop=region->second-pat->length()+2; SeqPos start=region->first+1; Window win(bitseq,region->first,pat); word hash=win.hash(); for(SeqPos p=start;p<=stop;p+=opt.hashSkip){ hash=win.hash()-mpi_myStoreOffset; if(hashemptyAt(hash)){ if(opt.hashfilter){ if(mfh->sizeAt(hash)>(unsigned)opt.hashfilter) goto NextHash; } if(opt.seedfilter){ if(mfh->sizeAt(hash,here)>(unsigned)opt.seedfilter) goto NextHash; } } mfh->add(hash,here); } NextHash: ticker.tick(p-1); if(opt.hashSkip>1) win.slide(opt.hashSkip*2); else win.slide(); } } cout << endl; //for ticker } 
void mpi_anchorMergeClient(int mergeTarget){ cout << "Hasher "<used[0].size())<<" anchors to hasher "<used[0].empty()){ ticker.reset(anchors->used[0].size()); //send anchor data for(usedItree::iterator i=anchors->used[0].begin(); i!=anchors->used[0].end(); ++i){ //vectors have 2 parts we (might) need to send: intervals (the spans of the anchor, critical), members (what hashvals went into making this anchor (only need if opt.retainMembers is on)) vector intervals; assert((*i)->spaces.size()==(uint)seq_count); intervals.reserve(seq_count); for(vector::iterator j=(*i)->spaces.begin(); j!=(*i)->spaces.end(); ++j) intervals.push_back(j->key()); MPI_Ssend(&intervals.front(),sizeof(UsedInt)*seq_count,MPI_BYTE,mergeTarget,0,mpi_job_comm); mpi_anchorSendCount[mpi_id]++; if(opt.retainMembers){ //send member count int memberCount=(*i)->members.size(); MPI_Ssend(&memberCount,1,MPI_INT,mergeTarget,1,mpi_job_comm); //serialize and send members vector > members; for(HashCount::iterator j=(*i)->members.begin();j!=(*i)->members.end();++j) members.push_back(*j); MPI_Ssend(&members.front(),sizeof(pair)*memberCount,MPI_BYTE,mergeTarget,2,mpi_job_comm); } ticker.tick(); } cout << endl; } Timer anchorSendDone; cout << "Anchor merge completed in: "<clusters.size())<<" repeats."<clusters.size()); for(list::iterator ri=repeats->clusters.begin(); ri!=repeats->clusters.end();++ri){ AnchorBlock assemble; //basically have to reassemble the original block for(int si=0;si &senders){ vector activeReq(senders.size()); vector > intervals(senders.size()); vector memberCount(senders.size()); vector > > members(senders.size()); vector senderStates(senders.size(),0); cout << "Hasher "<(cout, " ")); cout << ")..."<(0,0))*memberCount[sender]); senderStates[sender]++; break; default: throw MurasakiException("Invalid state in anchorMergeServer (recv switch)"); } if(senderStates[sender]==3){ //full set IntervalSet a(intervals[sender].begin(),intervals[sender].end(), 
members[sender].begin(),members[sender].end()); anchors->insert(a); senderStates[sender]=0; //time to start next } switch(senderStates[sender]){ //start next receive case -1:break; //done case 0: MPI_Irecv(&intervals[sender].front(),sizeof(UsedInt)*seq_count,MPI_BYTE,senders[sender],0,mpi_job_comm,&activeReq[sender]); break; case 1: MPI_Irecv(&memberCount[sender],1,MPI_INT,senders[sender],1,mpi_job_comm,&activeReq[sender]); break; case 2: MPI_Irecv(&members[sender].front(),sizeof(pair)*memberCount[sender],MPI_BYTE,senders[sender],2,mpi_job_comm,&activeReq[sender]); break; default: throw MurasakiException("Invalid state in anchorMergeServer (start new receive switch)"); } } } } Timer anchorMergeDone; cout << "Anchor merge completed in: "<used[0].size()< assemble(senders.size()); for(uint sender=0;sender::iterator i=assemble[sender].locs.begin(); i!=assemble[sender].locs.end(); ++i) loclist[i->seqno].push_back(*i); repeats->add(loclist); //start next recv of header senderStates[sender]=0; MPI_Irecv(&assemble[sender].header,sizeof(MPI_AnchorHeader),MPI_BYTE,senders[sender],0,mpi_job_comm,&activeReq[sender]); } break; default:throw MurasakiException("Invalid state in anchorMergeServer (repeat recv switch)"); } } } } Timer repeatRecvDone; cout << "Repeat data send completed in: "<drones, but for now, 0->world will do MPI_Bcast(&longestSeq,1,MPI_UNSIGNED_LONG,0,MPI_COMM_WORLD); MPI_Bcast(&totalSeqLength,1,MPI_UNSIGNED_LONG,0,MPI_COMM_WORLD); MPI_Bcast(&totalHashLength,1,MPI_UNSIGNED_LONG,0,MPI_COMM_WORLD); MPI_Bcast(&globalCounted,1,MPI_UNSIGNED_LONG,0,MPI_COMM_WORLD); MPI_Bcast(globalBaseCount,4,MPI_UNSIGNED_LONG,0,MPI_COMM_WORLD); } MessageBlock::MessageBlock() : used(0),stored(0), messages(new MPI_HashMessage[mpi_bufferSize]) { assert(messages); } MessageBlock::MessageBlock(const MessageBlock &a) : used(0),stored(0), messages(new MPI_HashMessage[mpi_bufferSize]) { assert(!a.used); assert(!a.stored);} //only for copying empty blocks dammit! 
void MessageBlock::reset(){ used=0; stored=0; } MessageBlock::~MessageBlock(){ if(messages) delete[] messages; } AnchorBlock::AnchorBlock() : header(0),headerStored(0),bodyStored(0),owner(-1) {} AnchorBlock::AnchorBlock(word key) : header(key),headerStored(0),bodyStored(0),owner(-1) {} AnchorBlock::AnchorBlock(const AnchorBlock &a) : header(a.header.hashKey),headerStored(0),bodyStored(0),owner(a.owner) { assert(!a.headerStored); assert(!a.bodyStored);} //only for copying empty blocks dammit! MPI_HashMessage::MPI_HashMessage(const word& _key,const Location& l) : key(_key),seqno(l.seqno),pos(l.pos) {} void MPI_HashMessage::set(const word& _key,const Location& l){ key=_key; seqno=l.seqno; pos=l.pos; } MPI_HashMessage::MPI_HashMessage(){} //empty. don't bother initing MPI_AnchorHeader::MPI_AnchorHeader(word k) : hashKey(k) {} MPI_AnchorHeader::MPI_AnchorHeader() : hashKey(0) {} void mpi_write_histogram(){ if(opt.retainMembers){ if(!dfCount) dfCount=anchors->makeDfCount(); //dfCount might still be null if(dfCount){ cout << "Writing histogram..."< histo; for(word i=0;i (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ #ifndef _SCORING_H_ #define _SCORING_H_ #include "globaltypes.h" #include "sequence.h" #include #include typedef long Score; //basic additive logodds-like score const Score MAXSCORE=LONG_MAX; extern Score ScoreMatrix[5][5]; //standard ACGT + N Score scoreColumn(const vector &column); void addColumnPairScores(const vector &column,vector > &output); Score minimumPairScore(const vector > &output); SeqPos fuzzyExtendUntil(vector column,SeqPos max,int direction,long* scoreOut); #endif murasaki/src/cryptohasher.h0000644000177700001440000000266511434752234015443 0ustar krispusers#ifndef __CRYPTOHASHER_H__ #define __CRYPTOHASHER_H__ #include "globaltypes.h" #include "sequence.h" #include class CryptoHasher { protected: public: virtual word digest(const vectorwords, BitSequence *pat)=0; }; #ifdef USE_LIBCRYPTOPP //using Crypto++ library #define CRYPTOPP_ENABLE_NAMESPACE_WEAK 1 #include //fastest #include //medium #include //slow //Non-cryptographic checksums #include #include #include template class GenCryptoHasher : public CryptoHasher { protected: T hashfunc; public: virtual word digest(const vector words, BitSequence *pat){ size_t buflen=pat->wordCount()*sizeof(word); byte bytes[buflen]; for(size_t i=0;iwordCount();i++){ word *dst=((word*)(bytes))+i; *dst=(words[i] & pat->readWord(i)); } byte digest[T::DIGESTSIZE]; hashfunc.CalculateDigest(digest,bytes,buflen); return *((word*)digest);//first word, if it's a good cryptographic function, should be as good as any other word (probably?)... 
} }; typedef GenCryptoHasher MD5CryptoHasher; typedef GenCryptoHasher SHACryptoHasher; typedef GenCryptoHasher WhirlpoolCryptoHasher; typedef GenCryptoHasher CRC32CryptoHasher; typedef GenCryptoHasher Adler32CryptoHasher; #endif #endif murasaki/src/exceptions.h0000644000177700001440000000170311434752235015102 0ustar krispusers#ifndef __EXCEPTIONS_H_ #define __EXCEPTIONS_H_ #include class MurasakiException : public std::exception { //from std::exception public: MurasakiException(const std::string& reason): reason_s(std::string("Murasaki: ")+reason){} virtual const char* what() const throw () { return reason_s.c_str(); } virtual std::string reason() const throw () { return reason_s; } virtual ~MurasakiException() throw () {} protected: std::string reason_s; }; class MurasakiDebugException : public MurasakiException { public: MurasakiDebugException(const std::string& reason): MurasakiException(std::string("Debug: ")+reason) //prepend debug prefix {} }; class MurasakiAbortException : public MurasakiException { protected: int _status; public: int status(){return _status;}; MurasakiAbortException(const std::string& reason,int s): MurasakiException(std::string("Abort: ")+reason), //prepend debug prefix _status(s) {} }; #endif murasaki/src/bitmap.h0000644000177700001440000000235411434752234014177 0ustar krispusers#ifndef _BITMAP_H_ #define _BITMAP_H_ #include "globaltypes.h" class bitmap { public: inline bitmap(word c): map(new word[(c-1)/WORDSIZE+1]), size((c-1)/WORDSIZE+1) { memset(map,0,sizeof(word)*size); } inline void clear(){ memset(map,0,sizeof(word)*size); } inline bool checkAndSet(word hash){ word idx=hash/WORDSIZE; word mask=((word)1U)<<(MODWORDSIZE(hash)); bool res=(map[idx] & mask); if(!res){ //it seems that leaving this check in is actually faster! 
map[idx]|=mask; } return res; } inline bool check(word hash) const{ word idx=hash/WORDSIZE; word mask=((word)1U)<<(MODWORDSIZE(hash)); bool res=(map[idx] & mask); return res; } inline void set(word hash){ word idx=hash/WORDSIZE; word mask=((word)1U)<<(MODWORDSIZE(hash)); if(map[idx] & mask) map[idx]|=mask; } inline void flip(word hash){ word idx=hash/WORDSIZE; word mask=((word)1U)<<(MODWORDSIZE(hash)); map[idx]^=mask; } inline void unset(word hash){ word idx=hash/WORDSIZE; word mask=((word)1U)<<(MODWORDSIZE(hash)); if(map[idx] & mask) map[idx]&=~mask; } inline ~bitmap(){ delete[] map; } protected: word *map; word size; }; #endif murasaki/src/murasaki_mpi.h0000644000177700001440000001207011434752234015400 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ////////////// // murasaki project // murasaki_mpi.h // provides MPI-specific functions ////////////// #ifdef MURASAKI_MPI #ifndef MURASAKI_MPI_H__ #define MURASAKI_MPI_H__ 1 #include "sequence.h" #include "dinkymath.h" #include #include #include using namespace std; //oof. once you start, you can't stop. 
//signals #define MURASAKI_MPI_REQ_SIZE 0 #define MURASAKI_MPI_REQ_LKUP_NOMATCH 1 #define MURASAKI_MPI_REQ_LKUP_MATCH 2 #define MURASAKI_MPI_REQ_QUIT 999 //types enum MPI_Job {JOB_HASHER=0,JOB_STORAGE,JOB_MAX}; class MPI_HashMessage { public: word key; SeqIdx seqno; SeqPos pos; MPI_HashMessage(const word& key,const Location& l); void set(const word& key,const Location& l); MPI_HashMessage(); private: }; //have to tell MPI what these things are extern MPI_Datatype MPI_HashMessage_type; //for some reason large values cause large problems on mx (stops receiving after the first packet) const int mpi_bufferSize=50000; class MessageBlock { public: int used; int stored; MPI_HashMessage *messages; MPI_Request request; void reset(); MessageBlock(); ~MessageBlock(); MessageBlock(const MessageBlock &a); //fake-copy }; class MPI_AnchorHeader { public: word hashKey; word count; MPI_AnchorHeader(word key); MPI_AnchorHeader(); }; class AnchorBlock { public: vector locs; MPI_AnchorHeader header; MPI_Request headerReq,bodyReq; int headerStored,bodyStored; int owner; AnchorBlock(); AnchorBlock(word hashKey); AnchorBlock(const AnchorBlock &a); //fake-copy }; class GlobalHashPointIterator { public: typedef GlobalHashPointIterator _self; GlobalHashPointIterator(); //start at 0 GlobalHashPointIterator(word x); //start at x word counted() const; word span() const; //size of region covered here pair operator*() const; _self& operator++(); bool atLast() const; bool sanityCheck() const; void seek(word x); //goto first global region prior to "globalPos" string debugString() const; protected: word _count; unsigned _seqi; BitSequence* _bs; unsigned _regi; }; //globals extern int mpi_id; extern int mpi_procs; extern bool mpi_capable; extern word mpi_totalMemory,mpi_totalStorage; extern vector mpi_storeShare; extern vector > mpi_storeOwn; extern map mpi_storeBrk; // start of hash region -> mpi rank extern map mpi_storeBrkStop; // stop+1 of hash region -> id extern word mpi_myStoreOffset; extern 
int mpi_finalAssembler; extern vector mpi_hostnames; extern map mpi_hostLeader_byName; extern vector mpi_hostLeader;//lowest id on same host extern bool mpi_isHostLeader,mpi_usingShm; extern vector mpi_hashCount,mpi_storeCount,mpi_extractCount,mpi_mergeCount,mpi_extractLocCount,mpi_anchorSendCount,mpi_anchorRecvCount; extern vector mpi_workTime; extern vector mpi_hashShare; extern vector > mpi_hashOwn; extern map mpi_hashBrk; //start of sequence -> mpi rank extern map mpi_hashPoints; //maps each hashBrk to an actual Location extern word mpi_total_hash_size; extern vector mpi_worldMemory; extern vector mpi_jobs; extern vector mpi_jobCount; extern const char *MPI_jobNames[]; extern vector mpi_hasherIds; extern int mpi_myHasherId; extern vector mpi_worldId2jobId; extern vector mpi_assemblerIds; extern int mpi_finalAssembler; extern int mpi_sysv_projid; extern MPI_Comm mpi_leaders_comm,mpi_localhost_comm,mpi_job_comm; extern int mpi_myHostId,mpi_myLeaderRank,mpi_myLocalRank,mpi_myJobRank; //functions void mpi_init(); void mpi_initJobs(); void mpi_types_init(); void mpi_fillHashPoints(); void mpi_syncSeqCounts(); void mpi_hasher_client_mode(BitSequence *pat); void mpi_storage_client_mode(); void mpi_storage_store(list &toStore); void mpi_merge_client_mode(); void mpi_extract_client_mode(BitSequence *pat); word mpi_extract_checkSentQueue(vector > &sent,vector::iterator> &bufferItes,multimap &buffers); void mpi_procAnchorBlock(list &todo); void mpi_hashAndStore_client_mode(BitSequence *pat); void mpi_hashSeqLocal(BitSequence *bitseq,BitSequence *pat,int sign,Sequence *s); void mpi_anchorMergeServer(const vector &senders); void mpi_anchorMergeClient(int mergeTarget); void mpi_write_histogram(); #endif #endif murasaki/src/include/0000755000177700001440000000000011434752243014171 5ustar krispusersmurasaki/src/include/pstream.h0000755000177700001440000017133111434752231016023 0ustar krispusers/* $Id: pstream.h,v 1.90 2005/06/11 09:25:06 redi Exp $ PStreams - POSIX Process I/O 
for C++ Copyright (C) 2001,2002,2003,2004 Jonathan Wakely This file is part of PStreams. PStreams is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. PStreams is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with PStreams; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /** * @file pstream.h * @brief Declares all PStreams classes. * @author Jonathan Wakely * * Defines classes redi::ipstream, redi::opstream, redi::pstream * and redi::rpstream. */ #ifndef REDI_PSTREAM_H_SEEN #define REDI_PSTREAM_H_SEEN #include #include #include #include #include #include #include // for min() #include // for memcpy(), memmove() etc. #include // for errno #include // for size_t #include // for exit() #include // for pid_t #include // for waitpid() #include // for ioctl() and FIONREAD #if defined(__sun) # include // for FIONREAD on Solaris 2.5 #endif #include // for pipe() fork() exec() and filedes functions #include // for kill() #include // for fcntl() #if REDI_EVISCERATE_PSTREAMS # include // for FILE, fdopen() #endif /// The library version. #define PSTREAMS_VERSION 0x0052 // 0.5.2 /** * @namespace redi * @brief All PStreams classes are declared in namespace redi. * * Like the standard IOStreams, PStreams is a set of class templates, * taking a character type and traits type. As with the standard streams * they are most likely to be used with @c char and the default * traits type, so typedefs for this most common case are provided. 
* * The @c pstream_common class template is not intended to be used directly, * it is used internally to provide the common functionality for the * other stream classes. */ namespace redi { /// Common base class providing constants and typenames. struct pstreams { /// Type used to specify how to connect to the process. typedef std::ios_base::openmode pmode; /// Type used to hold the arguments for a command. typedef std::vector argv_type; /// Type used for file descriptors. typedef int fd_type; static const pmode pstdin = std::ios_base::out; ///< Write to stdin static const pmode pstdout = std::ios_base::in; ///< Read from stdout static const pmode pstderr = std::ios_base::app; ///< Read from stderr protected: enum { bufsz = 32 }; ///< Size of pstreambuf buffers. enum { pbsz = 2 }; ///< Number of putback characters kept. }; /// Class template for stream buffer. template > class basic_pstreambuf : public std::basic_streambuf , public pstreams { public: // Type definitions for dependent types typedef CharT char_type; typedef Traits traits_type; typedef typename traits_type::int_type int_type; typedef typename traits_type::off_type off_type; typedef typename traits_type::pos_type pos_type; /** @deprecated use pstreams::fd_type instead. */ typedef fd_type fd_t; /// Default constructor. basic_pstreambuf(); /// Constructor that initialises the buffer with @a command. basic_pstreambuf(const std::string& command, pmode mode); /// Constructor that initialises the buffer with @a file and @a argv. basic_pstreambuf( const std::string& file, const argv_type& argv, pmode mode ); /// Destructor. ~basic_pstreambuf(); /// Initialise the stream buffer with @a command. basic_pstreambuf* open(const std::string& command, pmode mode); /// Initialise the stream buffer with @a file and @a argv. basic_pstreambuf* open(const std::string& file, const argv_type& argv, pmode mode); /// Close the stream buffer and wait for the process to exit. 
basic_pstreambuf* close(); /// Send a signal to the process. basic_pstreambuf* kill(int signal = SIGTERM); /// Close the pipe connected to the process' stdin. void peof(); /// Change active input source. bool read_err(bool readerr = true); /// Report whether the stream buffer has been initialised. bool is_open() const; /// Report whether the process has exited. bool exited(); #if REDI_EVISCERATE_PSTREAMS /// Obtain FILE pointers for each of the process' standard streams. std::size_t fopen(std::FILE*& in, std::FILE*& out, std::FILE*& err); #endif /// Return the exit status of the process. int status() const; /// Return the error number for the most recent failed operation. int error() const; protected: /// Transfer characters to the pipe when character buffer overflows. int_type overflow(int_type c); /// Transfer characters from the pipe when the character buffer is empty. int_type underflow(); /// Make a character available to be returned by the next extraction. int_type pbackfail(int_type c = traits_type::eof()); /// Write any buffered characters to the stream. int sync(); /// Insert multiple characters into the pipe. std::streamsize xsputn(const char_type* s, std::streamsize n); /// Insert a sequence of characters into the pipe. std::streamsize write(const char_type* s, std::streamsize n); /// Extract a sequence of characters from the pipe. std::streamsize read(char_type* s, std::streamsize n); /// Report how many characters can be read from active input without blocking. std::streamsize showmanyc(); protected: /// Enumerated type to indicate whether stdout or stderr is to be read. enum buf_read_src { rsrc_out = 0, rsrc_err = 1 }; /// Initialise pipes and fork process. pid_t fork(pmode mode); /// Wait for the child process to exit. int wait(bool nohang = false); /// Return the file descriptor for the output pipe. fd_type& wpipe(); /// Return the file descriptor for the active input pipe. 
fd_type& rpipe(); /// Return the file descriptor for the specified input pipe. fd_type& rpipe(buf_read_src which); void create_buffers(pmode mode); void destroy_buffers(pmode mode); /// Writes buffered characters to the process' stdin pipe. bool empty_buffer(); bool fill_buffer(); /// Return the active input buffer. char_type* rbuffer(); buf_read_src switch_read_buffer(buf_read_src); private: basic_pstreambuf(const basic_pstreambuf&); basic_pstreambuf& operator=(const basic_pstreambuf&); void init_rbuffers(); pid_t ppid_; // pid of process fd_type wpipe_; // pipe used to write to process' stdin fd_type rpipe_[2]; // two pipes to read from, stdout and stderr char_type* wbuffer_; char_type* rbuffer_[2]; char_type* rbufstate_[3]; /// Index into rpipe_[] to indicate active source for read operations. buf_read_src rsrc_; int status_; // hold exit status of child process int error_; // hold errno if fork() or exec() fails }; /// Class template for common base class. template > class pstream_common : virtual public std::basic_ios , virtual public pstreams { protected: typedef basic_pstreambuf streambuf_type; /// Default constructor. pstream_common(); /// Constructor that initialises the stream by starting a process. pstream_common(const std::string& command, pmode mode); /// Constructor that initialises the stream by starting a process. pstream_common(const std::string& file, const argv_type& argv, pmode mode); /// Pure virtual destructor. virtual ~pstream_common() = 0; /// Start a process. void do_open(const std::string& command, pmode mode); /// Start a process. void do_open(const std::string& file, const argv_type& argv, pmode mode); public: /// Close the pipe. void close(); /// Report whether the stream's buffer has been initialised. bool is_open() const; /// Return the command used to initialise the stream. const std::string& command() const; /// Return a pointer to the stream buffer. 
streambuf_type* rdbuf() const; #if REDI_EVISCERATE_PSTREAMS /// Obtain FILE pointers for each of the process' standard streams. std::size_t fopen(std::FILE*& in, std::FILE*& out, std::FILE*& err); #endif protected: std::string command_; ///< The command used to start the process. streambuf_type buf_; ///< The stream buffer. }; /** * @class basic_ipstream * @brief Class template for Input PStreams. * * Reading from an ipstream reads the command's standard output and/or * standard error (depending on how the ipstream is opened) * and the command's standard input is the same as that of the process * that created the object, unless altered by the command itself. */ template > class basic_ipstream : public std::basic_istream , public pstream_common , virtual public pstreams { typedef std::basic_istream istream_type; typedef pstream_common pbase_type; using pbase_type::buf_; // declare name in this scope public: /// Type used to specify how to connect to the process. typedef typename pbase_type::pmode pmode; /// Type used to hold the arguments for a command. typedef typename pbase_type::argv_type argv_type; /// Default constructor, creates an uninitialised stream. basic_ipstream() : istream_type(NULL), pbase_type() { } /** * @brief Constructor that initialises the stream by starting a process. * * Initialises the stream buffer by calling do_open() with the supplied * arguments. * * @param command a string containing a shell command. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, pmode) */ basic_ipstream(const std::string& command, pmode mode = pstdout) : istream_type(NULL), pbase_type(command, mode|pstdout) { } /** * @brief Constructor that initialises the stream by starting a process. * * Initialises the stream buffer by calling do_open() with the supplied * arguments. * * @param file a string containing the pathname of a program to execute. * @param argv a vector of argument strings passed to the new program. 
* @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, const argv_type&, pmode) */ basic_ipstream( const std::string& file, const argv_type& argv, pmode mode = pstdout ) : istream_type(NULL), pbase_type(file, argv, mode|pstdout) { } /** * @brief Destructor. * * Closes the stream and waits for the child to exit. */ ~basic_ipstream() { } /** * @brief Start a process. * * Calls do_open( @a %command , @a mode|pstdout ). * * @param command a string containing a shell command. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, pmode) */ void open(const std::string& command, pmode mode = pstdout) { this->do_open(command, mode|pstdout); } /** * @brief Start a process. * * Calls do_open( @a file , @a argv , @a mode|pstdout ). * * @param file a string containing the pathname of a program to execute. * @param argv a vector of argument strings passed to the new program. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, const argv_type&, pmode) */ void open( const std::string& file, const argv_type& argv, pmode mode = pstdout ) { this->do_open(file, argv, mode|pstdout); } /** * @brief Set streambuf to read from process' @c stdout. * @return @c *this */ basic_ipstream& out() { this->buf_.read_err(false); return *this; } /** * @brief Set streambuf to read from process' @c stderr. * @return @c *this */ basic_ipstream& err() { this->buf_.read_err(true); return *this; } }; /** * @class basic_opstream * @brief Class template for Output PStreams. * * Writing to an open opstream writes to the standard input of the command; * the command's standard output is the same as that of the process that * created the pstream object, unless altered by the command itself. 
*/ template > class basic_opstream : public std::basic_ostream , public pstream_common , virtual public pstreams { typedef std::basic_ostream ostream_type; typedef pstream_common pbase_type; using pbase_type::buf_; // declare name in this scope public: /// Type used to specify how to connect to the process. typedef typename pbase_type::pmode pmode; /// Type used to hold the arguments for a command. typedef typename pbase_type::argv_type argv_type; /// Default constructor, creates an uninitialised stream. basic_opstream() : ostream_type(NULL), pbase_type() { } /** * @brief Constructor that initialises the stream by starting a process. * * Initialises the stream buffer by calling do_open() with the supplied * arguments. * * @param command a string containing a shell command. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, pmode) */ basic_opstream(const std::string& command, pmode mode = pstdin) : ostream_type(NULL), pbase_type(command, mode|pstdin) { } /** * @brief Constructor that initialises the stream by starting a process. * * Initialises the stream buffer by calling do_open() with the supplied * arguments. * * @param file a string containing the pathname of a program to execute. * @param argv a vector of argument strings passed to the new program. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, const argv_type&, pmode) */ basic_opstream( const std::string& file, const argv_type& argv, pmode mode = pstdin ) : ostream_type(NULL), pbase_type(file, argv, mode|pstdin) { } /** * @brief Destructor * * Closes the stream and waits for the child to exit. */ ~basic_opstream() { } /** * @brief Start a process. * * Calls do_open( @a %command , @a mode|pstdin ). * * @param command a string containing a shell command. * @param mode the I/O mode to use when opening the pipe. 
* @see do_open(const std::string&, pmode) */ void open(const std::string& command, pmode mode = pstdin) { this->do_open(command, mode|pstdin); } /** * @brief Start a process. * * Calls do_open( @a file , @a argv , @a mode|pstdin ). * * @param file a string containing the pathname of a program to execute. * @param argv a vector of argument strings passed to the new program. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, const argv_type&, pmode) */ void open( const std::string& file, const argv_type& argv, pmode mode = pstdin) { this->do_open(file, argv, mode|pstdin); } }; /** * @class basic_pstream * @brief Class template for Bidirectional PStreams. * * Writing to a pstream opened with @c pmode @c pstdin writes to the * standard input of the command. * Reading from a pstream opened with @c pmode @c pstdout and/or @c pstderr * reads the command's standard output and/or standard error. * Any of the process' @c stdin, @c stdout or @c stderr that is not * connected to the pstream (as specified by the @c pmode) * will be the same as the process that created the pstream object, * unless altered by the command itself. */ template > class basic_pstream : public std::basic_iostream , public pstream_common , virtual public pstreams { typedef std::basic_iostream iostream_type; typedef pstream_common pbase_type; using pbase_type::buf_; // declare name in this scope public: /// Type used to specify how to connect to the process. typedef typename pbase_type::pmode pmode; /// Type used to hold the arguments for a command. typedef typename pbase_type::argv_type argv_type; /// Default constructor, creates an uninitialised stream. basic_pstream() : iostream_type(NULL), pbase_type() { } /** * @brief Constructor that initialises the stream by starting a process. * * Initialises the stream buffer by calling do_open() with the supplied * arguments. * * @param command a string containing a shell command. 
* @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, pmode) */ basic_pstream(const std::string& command, pmode mode = pstdout|pstdin) : iostream_type(NULL), pbase_type(command, mode) { } /** * @brief Constructor that initialises the stream by starting a process. * * Initialises the stream buffer by calling do_open() with the supplied * arguments. * * @param file a string containing the pathname of a program to execute. * @param argv a vector of argument strings passed to the new program. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, const argv_type&, pmode) */ basic_pstream( const std::string& file, const argv_type& argv, pmode mode = pstdout|pstdin ) : iostream_type(NULL), pbase_type(file, argv, mode) { } /** * @brief Destructor * * Closes the stream and waits for the child to exit. */ ~basic_pstream() { } /** * @brief Start a process. * * Calls do_open( @a %command , @a mode ). * * @param command a string containing a shell command. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, pmode) */ void open(const std::string& command, pmode mode = pstdout|pstdin) { this->do_open(command, mode); } /** * @brief Start a process. * * Calls do_open( @a file , @a argv , @a mode ). * * @param file a string containing the pathname of a program to execute. * @param argv a vector of argument strings passed to the new program. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, const argv_type&, pmode) */ void open( const std::string& file, const argv_type& argv, pmode mode = pstdout|pstdin ) { this->do_open(file, argv, mode); } /** * @brief Set streambuf to read from process' @c stdout. * @return @c *this */ basic_pstream& out() { this->buf_.read_err(false); return *this; } /** * @brief Set streambuf to read from process' @c stderr. 
* @return @c *this */ basic_pstream& err() { this->buf_.read_err(true); return *this; } }; /** * @class basic_rpstream * @brief template for Restricted PStreams. * * Writing to an rpstream opened with @c pmode @c pstdin writes to the * standard input of the command. * It is not possible to read directly from an rpstream object, to use * an rpstream as in istream you must call either basic_rpstream::out() * or basic_rpstream::err(). This is to prevent accidental reads from * the wrong input source. If the rpstream was not opened with @c pmode * @c pstderr then the class cannot read the process' @c stderr, and * basic_rpstream::err() will return an istream that reads from the * process' @c stdout, and vice versa. * Reading from an rpstream opened with @c pmode @c pstdout and/or * @c pstderr reads the command's standard output and/or standard error. * Any of the process' @c stdin, @c stdout or @c stderr that is not * connected to the pstream (as specified by the @c pmode) * will be the same as the process that created the pstream object, * unless altered by the command itself. */ template > class basic_rpstream : public std::basic_ostream , private std::basic_istream , private pstream_common , virtual public pstreams { typedef std::basic_ostream ostream_type; typedef std::basic_istream istream_type; typedef pstream_common pbase_type; using pbase_type::buf_; // declare name in this scope public: /// Type used to specify how to connect to the process. typedef typename pbase_type::pmode pmode; /// Type used to hold the arguments for a command. typedef typename pbase_type::argv_type argv_type; /// Default constructor, creates an uninitialised stream. basic_rpstream() : ostream_type(NULL), istream_type(NULL), pbase_type() { } /** * @brief Constructor that initialises the stream by starting a process. * * Initialises the stream buffer by calling do_open() with the supplied * arguments. * * @param command a string containing a shell command. 
* @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, pmode) */ basic_rpstream(const std::string& command, pmode mode = pstdout|pstdin) : ostream_type(NULL) , istream_type(NULL) , pbase_type(command, mode) { } /** * @brief Constructor that initialises the stream by starting a process. * * Initialises the stream buffer by calling do_open() with the supplied * arguments. * * @param file a string containing the pathname of a program to execute. * @param argv a vector of argument strings passed to the new program. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, const argv_type&, pmode) */ basic_rpstream( const std::string& file, const argv_type& argv, pmode mode = pstdout|pstdin ) : ostream_type(NULL), istream_type(NULL), pbase_type(file, argv, mode) { } /// Destructor ~basic_rpstream() { } /** * @brief Start a process. * * Calls do_open( @a %command , @a mode ). * * @param command a string containing a shell command. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, pmode) */ void open(const std::string& command, pmode mode = pstdout|pstdin) { this->do_open(command, mode); } /** * @brief Start a process. * * Calls do_open( @a file , @a argv , @a mode ). * * @param file a string containing the pathname of a program to execute. * @param argv a vector of argument strings passed to the new program. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, const argv_type&, pmode) */ void open( const std::string& file, const argv_type& argv, pmode mode = pstdout|pstdin ) { this->do_open(file, argv, mode); } /** * @brief Obtain a reference to the istream that reads * the process' @c stdout. * @return @c *this */ istream_type& out() { this->buf_.read_err(false); return *this; } /** * @brief Obtain a reference to the istream that reads * the process' @c stderr. 
* @return @c *this */ istream_type& err() { this->buf_.read_err(true); return *this; } }; /// Type definition for common template specialisation. typedef basic_pstreambuf pstreambuf; /// Type definition for common template specialisation. typedef basic_ipstream ipstream; /// Type definition for common template specialisation. typedef basic_opstream opstream; /// Type definition for common template specialisation. typedef basic_pstream pstream; /// Type definition for common template specialisation. typedef basic_rpstream rpstream; /** * When inserted into an output pstream the manipulator calls * basic_pstreambuf::peof() to close the output pipe, * causing the child process to receive the end-of-file indicator * on subsequent reads from its @c stdin stream. * * @brief Manipulator to close the pipe connected to the process' stdin. * @param s An output PStream class. * @return The stream object the manipulator was invoked on. * @see basic_pstreambuf::peof() * @relates basic_opstream basic_pstream basic_rpstream */ template inline std::basic_ostream& peof(std::basic_ostream& s) { typedef basic_pstreambuf pstreambuf; if (pstreambuf* p = dynamic_cast(s.rdbuf())) p->peof(); return s; } /* * member definitions for pstreambuf */ /** * @class basic_pstreambuf * Provides underlying streambuf functionality for the PStreams classes. */ /** Creates an uninitialised stream buffer. */ template inline basic_pstreambuf::basic_pstreambuf() : ppid_(-1) // initialise to -1 to indicate no process run yet. , wpipe_(-1) , wbuffer_(NULL) , rsrc_(rsrc_out) , status_(-1) , error_(0) { init_rbuffers(); } /** * Initialises the stream buffer by calling open() with the supplied * arguments. * * @param command a string containing a shell command. * @param mode the I/O mode to use when opening the pipe. * @see open() */ template inline basic_pstreambuf::basic_pstreambuf(const std::string& command, pmode mode) : ppid_(-1) // initialise to -1 to indicate no process run yet. 
, wpipe_(-1) , wbuffer_(NULL) , rsrc_(rsrc_out) , status_(-1) , error_(0) { init_rbuffers(); open(command, mode); } /** * Initialises the stream buffer by calling open() with the supplied * arguments. * * @param file a string containing the name of a program to execute. * @param argv a vector of argument strings passsed to the new program. * @param mode the I/O mode to use when opening the pipe. * @see open() */ template inline basic_pstreambuf::basic_pstreambuf( const std::string& file, const argv_type& argv, pmode mode ) : ppid_(-1) // initialise to -1 to indicate no process run yet. , wpipe_(-1) , wbuffer_(NULL) , rsrc_(rsrc_out) , status_(-1) , error_(0) { init_rbuffers(); open(file, argv, mode); } /** * Closes the stream by calling close(). * @see close() */ template inline basic_pstreambuf::~basic_pstreambuf() { close(); } /** * Starts a new process by passing @a command to the shell * and opens pipes to the process with the specified @a mode. * * Will duplicate the actions of the shell in searching for an * executable file if the specified file name does not contain a slash (/) * character. * * There is no way to tell whether the shell command succeeded, this * function will always succeed unless resource limits (such as * memory usage, or number of processes or open files) are exceeded. * This means is_open() will return true even if @a command cannot * be executed. * * @param command a string containing a shell command. * @param mode a bitwise OR of one or more of @c out, @c in, @c err. * @return NULL if the shell could not be started or the * pipes could not be opened, @c this otherwise. 
* @see execlp(3) */ template basic_pstreambuf* basic_pstreambuf::open(const std::string& command, pmode mode) { #if 0 const std::string argv[] = { "sh", "-c", command }; return this->open("sh", std::vector(argv, argv+3), mode); #else basic_pstreambuf* ret = NULL; if (!is_open()) { switch(fork(mode)) { case 0 : // this is the new process, exec command ::execlp("sh", "sh", "-c", command.c_str(), (void*)NULL); // can only reach this point if exec() failed // parent can get exit code from waitpid() ::_exit(errno); // using std::exit() would make static dtors run twice case -1 : // couldn't fork, error already handled in pstreambuf::fork() break; default : // this is the parent process // activate buffers create_buffers(mode); ret = this; } } return ret; #endif } /** * @brief Helper function to close a file descriptor. * * Inspects @a fd and calls close(3) if it has a non-negative value. * * @param fd a file descriptor. * @relates basic_pstreambuf */ inline void close_fd(pstreams::fd_type& fd) { if (fd >= 0 && ::close(fd) == 0) fd = -1; } /** * @brief Helper function to close an array of file descriptors. * * Calls @c close_fd() on each member of the array. * The length of the array is determined automatically by * template argument deduction to avoid errors. * * @param fds an array of file descriptors. * @relates basic_pstreambuf */ template inline void close_fd_array(pstreams::fd_type (&fds)[N]) { for (std::size_t i = 0; i < N; ++i) close_fd(fds[i]); } /** * Starts a new process by executing @a file with the arguments in * @a argv and opens pipes to the process with the specified @a mode. * * By convention @c argv[0] should be the file name of the file being * executed. * Will duplicate the actions of the shell in searching for an * executable file if the specified file name does not contain a slash (/) * character. * * Iff @a file is successfully executed then is_open() will return true. 
* Note that exited() will return true if file cannot be executed, since * the child process will have exited. * * @param file a string containing the pathname of a program to execute. * @param argv a vector of argument strings passed to the new program. * @param mode a bitwise OR of one or more of @c out, @c in and @c err. * @return NULL if a pipe could not be opened or if the program could * not be executed, @c this otherwise. * @see execvp(3) */ template basic_pstreambuf* basic_pstreambuf::open( const std::string& file, const argv_type& argv, pmode mode ) { basic_pstreambuf* ret = NULL; if (!is_open()) { // constants for read/write ends of pipe enum { RD, WR }; // open another pipe and set close-on-exec fd_type ck_exec[] = { -1, -1 }; if (-1 == ::pipe(ck_exec) || -1 == ::fcntl(ck_exec[RD], F_SETFD, FD_CLOEXEC) || -1 == ::fcntl(ck_exec[WR], F_SETFD, FD_CLOEXEC)) { error_ = errno; close_fd_array(ck_exec); } else { switch(fork(mode)) { case 0 : // this is the new process, exec command { char** arg_v = new char*[argv.size()+1]; for (std::size_t i = 0; i < argv.size(); ++i) { const std::string& src = argv[i]; char*& dest = arg_v[i]; dest = new char[src.size()+1]; dest[ src.copy(dest, src.size()) ] = '\0'; } arg_v[argv.size()] = NULL; ::execvp(file.c_str(), arg_v); // can only reach this point if exec() failed // parent can get error code from ck_exec pipe error_ = errno; ::write(ck_exec[WR], &error_, sizeof(error_)); ::close(ck_exec[WR]); ::close(ck_exec[RD]); ::_exit(error_); // using std::exit() would make static dtors run twice } case -1 : // couldn't fork, error already handled in pstreambuf::fork() close_fd_array(ck_exec); break; default : // this is the parent process // check child called exec() successfully ::close(ck_exec[WR]); switch (::read(ck_exec[RD], &error_, sizeof(error_))) { case 0: // activate buffers create_buffers(mode); ret = this; break; case -1: error_ = errno; break; default: // error_ contains error code from child // call wait() to clean up 
and set ppid_ to 0 this->wait(); break; } ::close(ck_exec[RD]); } } } return ret; } /** * Creates pipes as specified by @a mode and calls @c fork() to create * a new process. If the fork is successful the parent process stores * the child's PID and the opened pipes and the child process replaces * its standard streams with the opened pipes. * * If an error occurs the error code will be set to one of the possile * errors for @c pipe() or @c fork(). * See your system's documentation for these error codes. * * @param mode an OR of pmodes specifying which of the child's * standard streams to connect to. * @return On success the PID of the child is returned in the parent's * context and zero is returned in the child's context. * On error -1 is returned and the error code is set appropriately. */ template pid_t basic_pstreambuf::fork(pmode mode) { pid_t pid = -1; // Three pairs of file descriptors, for pipes connected to the // process' stdin, stdout and stderr // (stored in a single array so close_fd_array() can close all at once) fd_type fd[] = { -1, -1, -1, -1, -1, -1 }; fd_type* const pin = fd; fd_type* const pout = fd+2; fd_type* const perr = fd+4; // constants for read/write ends of pipe enum { RD, WR }; // N.B. // For the pstreambuf pin is an output stream and // pout and perr are input streams. 
if (!error_ && mode&pstdin && ::pipe(pin)) error_ = errno; if (!error_ && mode&pstdout && ::pipe(pout)) error_ = errno; if (!error_ && mode&pstderr && ::pipe(perr)) error_ = errno; if (!error_) { pid = ::fork(); switch (pid) { case 0 : { // this is the new process // for each open pipe close one end and redirect the // respective standard stream to the other end if (*pin >= 0) { ::close(pin[WR]); ::dup2(pin[RD], STDIN_FILENO); ::close(pin[RD]); } if (*pout >= 0) { ::close(pout[RD]); ::dup2(pout[WR], STDOUT_FILENO); ::close(pout[WR]); } if (*perr >= 0) { ::close(perr[RD]); ::dup2(perr[WR], STDERR_FILENO); ::close(perr[WR]); } break; } case -1 : { // couldn't fork for some reason error_ = errno; // close any open pipes close_fd_array(fd); break; } default : { // this is the parent process, store process' pid ppid_ = pid; // store one end of open pipes and close other end if (*pin >= 0) { wpipe_ = pin[WR]; ::close(pin[RD]); } if (*pout >= 0) { rpipe_[rsrc_out] = pout[RD]; ::close(pout[WR]); } if (*perr >= 0) { rpipe_[rsrc_err] = perr[RD]; ::close(perr[WR]); } if (rpipe_[rsrc_out] == -1 && rpipe_[rsrc_err] >= 0) { // reading stderr but not stdout, so use stderr for all reads read_err(true); } } } } else { // close any pipes we opened before failure close_fd_array(fd); } return pid; } /** * Closes all pipes and calls wait() to wait for the process to finish. * If an error occurs the error code will be set to one of the possible * errors for @c waitpid(). * See your system's documentation for these errors. * * @return @c this on successful close or @c NULL if there is no * process to close or if an error occurs. 
*/ template basic_pstreambuf* basic_pstreambuf::close() { basic_pstreambuf* ret = NULL; if (is_open()) { sync(); destroy_buffers(pstdin|pstdout|pstderr); // close pipes before wait() so child gets EOF/SIGPIPE close_fd(wpipe_); close_fd_array(rpipe_); if (wait() == 1) { ret = this; } } return ret; } /** * Called on construction to initialise the arrays used for reading. */ template inline void basic_pstreambuf::init_rbuffers() { rpipe_[rsrc_out] = rpipe_[rsrc_err] = -1; rbuffer_[rsrc_out] = rbuffer_[rsrc_err] = NULL; rbufstate_[0] = rbufstate_[1] = rbufstate_[2] = NULL; } template void basic_pstreambuf::create_buffers(pmode mode) { if (mode & pstdin) { delete[] wbuffer_; wbuffer_ = new char_type[bufsz]; this->setp(wbuffer_, wbuffer_ + bufsz); } if (mode & pstdout) { delete[] rbuffer_[rsrc_out]; rbuffer_[rsrc_out] = new char_type[bufsz]; if (rsrc_ == rsrc_out) this->setg(rbuffer_[rsrc_out] + pbsz, rbuffer_[rsrc_out] + pbsz, rbuffer_[rsrc_out] + pbsz); } if (mode & pstderr) { delete[] rbuffer_[rsrc_err]; rbuffer_[rsrc_err] = new char_type[bufsz]; if (rsrc_ == rsrc_err) this->setg(rbuffer_[rsrc_err] + pbsz, rbuffer_[rsrc_err] + pbsz, rbuffer_[rsrc_err] + pbsz); } } template void basic_pstreambuf::destroy_buffers(pmode mode) { if (mode & pstdin) { this->setp(NULL, NULL); delete[] wbuffer_; wbuffer_ = NULL; } if (mode & pstdout) { if (rsrc_ == rsrc_out) this->setg(NULL, NULL, NULL); delete[] rbuffer_[rsrc_out]; rbuffer_[rsrc_out] = NULL; } if (mode & pstderr) { if (rsrc_ == rsrc_err) this->setg(NULL, NULL, NULL); delete[] rbuffer_[rsrc_err]; rbuffer_[rsrc_err] = NULL; } } template typename basic_pstreambuf::buf_read_src basic_pstreambuf::switch_read_buffer(buf_read_src src) { if (rsrc_ != src) { char_type* tmpbufstate[] = {this->eback(), this->gptr(), this->egptr()}; this->setg(rbufstate_[0], rbufstate_[1], rbufstate_[2]); for (std::size_t i = 0; i < 3; ++i) rbufstate_[i] = tmpbufstate[i]; rsrc_ = src; } return rsrc_; } /** * Suspends execution and waits for the 
associated process to exit, or * until a signal is delivered whose action is to terminate the current * process or to call a signal handling function. If the process has * already exited wait() returns immediately. * * @param nohang true to return immediately if the process has not exited. * @return 1 if the process has exited. * 0 if @a nohang is true and the process has not exited yet. * -1 if no process has been started or if an error occurs, * in which case the error can be found using error(). */ template int basic_pstreambuf::wait(bool nohang) { int exited = -1; if (is_open()) { int status; switch(::waitpid(ppid_, &status, nohang ? WNOHANG : 0)) { case 0 : // nohang was true and process has not exited exited = 0; break; case -1 : error_ = errno; break; default : // process has exited ppid_ = 0; status_ = status; exited = 1; destroy_buffers(pstdin|pstdout|pstderr); close_fd(wpipe_); close_fd_array(rpipe_); break; } } return exited; } /** * Sends the specified signal to the process. A signal can be used to * terminate a child process that would not exit otherwise. * * If an error occurs the error code will be set to one of the possible * errors for @c kill(). See your system's documentation for these errors. * * @param signal A signal to send to the child process. * @return @c this or @c NULL if @c kill() fails. */ template inline basic_pstreambuf* basic_pstreambuf::kill(int signal) { basic_pstreambuf* ret = NULL; if (is_open()) { if (::kill(ppid_, signal)) error_ = errno; else { // TODO call exited() to check for exit and clean up? leave to user? ret = this; } } return ret; } /** * @return True if the associated process has exited, false otherwise. * @see basic_pstreambuf::close() */ template inline bool basic_pstreambuf::exited() { return ppid_ == 0 || wait(true)==1; } /** * @return The exit status of the child process, or -1 if close() * has not yet been called to wait for the child to exit. 
* @see basic_pstreambuf::close() */ template inline int basic_pstreambuf::status() const { return status_; } /** * @return The error code of the most recently failed operation, or zero. */ template inline int basic_pstreambuf::error() const { return error_; } /** * Closes the output pipe, causing the child process to receive the * end-of-file indicator on subsequent reads from its @c stdin stream. */ template inline void basic_pstreambuf::peof() { sync(); destroy_buffers(pstdin); close_fd(wpipe_); } /** * @return true if a previous call to open() succeeded and wait() has * not been called and determined that the process has exited, * false otherwise. * @warning This function can not be used to determine whether the * command used to initialise the buffer was successfully * executed or not. If the shell command failed this function * will still return true. * You can use exited() to see if it's still open. */ template inline bool basic_pstreambuf::is_open() const { return ppid_ > 0; } /** * Toggle the stream used for reading. If @a readerr is @c true then the * process' @c stderr output will be used for subsequent extractions, if * @a readerr is false the the process' stdout will be used. * @param readerr @c true to read @c stderr, @c false to read @c stdout. * @return @c true if the requested stream is open and will be used for * subsequent extractions, @c false otherwise. */ template inline bool basic_pstreambuf::read_err(bool readerr) { buf_read_src src = readerr ? rsrc_err : rsrc_out; if (rpipe_[src]>=0) { switch_read_buffer(src); return true; } return false; } /** * Called when the internal character buffer is not present or is full, * to transfer the buffer contents to the pipe. * * @param c a character to be written to the pipe. * @return @c traits_type::not_eof(c) if @a c is equal to @c * traits_type::eof(). Otherwise returns @a c if @a c can be * written to the pipe, or @c traits_type::eof() if not. 
*/ template typename basic_pstreambuf::int_type basic_pstreambuf::overflow(int_type c) { if (!empty_buffer()) return traits_type::eof(); else if (!traits_type::eq_int_type(c, traits_type::eof())) return this->sputc(c); else return traits_type::not_eof(c); } template int basic_pstreambuf::sync() { return !exited() && empty_buffer() ? 0 : -1; } /** * @param s character buffer. * @param n buffer length. * @return the number of characters written. */ template std::streamsize basic_pstreambuf::xsputn(const char_type* s, std::streamsize n) { if (n < this->epptr() - this->pptr()) { std::memcpy(this->pptr(), s, n * sizeof(char_type)); this->pbump(n); return n; } else { for (std::streamsize i = 0; i < n; ++i) { if (traits_type::eq_int_type(this->sputc(s[i]), traits_type::eof())) return i; } return n; } } /** * @return true if the buffer was emptied, false otherwise. */ template bool basic_pstreambuf::empty_buffer() { const std::streamsize count = this->pptr() - this->pbase(); const std::streamsize written = this->write(this->wbuffer_, count); if (count > 0 && written == count) { this->pbump(-written); return true; } return false; } /** * Called when the internal character buffer is is empty, to re-fill it * from the pipe. * * @return The first available character in the buffer, * or @c traits_type::eof() in case of failure. */ template typename basic_pstreambuf::int_type basic_pstreambuf::underflow() { if (this->gptr() < this->egptr() || fill_buffer()) return traits_type::to_int_type(*this->gptr()); else return traits_type::eof(); } /** * Attempts to make @a c available as the next character to be read by * @c sgetc(). * * @param c a character to make available for extraction. * @return @a c if the character can be made available, * @c traits_type::eof() otherwise. 
*/ template typename basic_pstreambuf::int_type basic_pstreambuf::pbackfail(int_type c) { if (this->gptr() != this->eback()) { this->gbump(-1); if (!traits_type::eq_int_type(c, traits_type::eof())) *this->gptr() = traits_type::to_char_type(c); return traits_type::not_eof(c); } else return traits_type::eof(); } template std::streamsize basic_pstreambuf::showmanyc() { int avail = 0; #ifdef FIONREAD if (ioctl(rpipe(), FIONREAD, &avail) == -1) avail = -1; else #endif if (const std::ptrdiff_t buflen = this->gptr() - this->eback()) avail += buflen; return std::streamsize(avail); } /** * @return true if the buffer was filled, false otherwise. */ template bool basic_pstreambuf::fill_buffer() { const std::streamsize pb1 = this->gptr() - this->eback(); const std::streamsize pb2 = pbsz; const std::streamsize npb = std::min(pb1, pb2); std::memmove( rbuffer() + pbsz - npb, this->gptr() - npb, npb * sizeof(char_type) ); const std::streamsize rc = read(rbuffer() + pbsz, bufsz - pbsz); if (rc > 0) { this->setg( rbuffer() + pbsz - npb, rbuffer() + pbsz, rbuffer() + pbsz + rc ); return true; } else { this->setg(NULL, NULL, NULL); return false; } } /** * Writes up to @a n characters to the pipe from the buffer @a s. * This currently only works for fixed width character encodings where * each character uses @c sizeof(char_type) bytes. * * @param s character buffer. * @param n buffer length. * @return the number of characters written. */ template inline std::streamsize basic_pstreambuf::write(const char_type* s, std::streamsize n) { return wpipe() >= 0 ? ::write(wpipe(), s, n * sizeof(char_type)) : 0; } /** * Reads up to @a n characters from the pipe to the buffer @a s. * This currently only works for fixed width character encodings where * each character uses @c sizeof(char_type) bytes. * * @param s character buffer. * @param n buffer length. * @return the number of characters read. 
*/ template inline std::streamsize basic_pstreambuf::read(char_type* s, std::streamsize n) { return rpipe() >= 0 ? ::read(rpipe(), s, n * sizeof(char_type)) : 0; } /** @return a reference to the output file descriptor */ template inline typename basic_pstreambuf::fd_type& basic_pstreambuf::wpipe() { return wpipe_; } /** @return a reference to the active input file descriptor */ template inline typename basic_pstreambuf::fd_type& basic_pstreambuf::rpipe() { return rpipe_[rsrc_]; } /** @return a reference to the specified input file descriptor */ template inline typename basic_pstreambuf::fd_type& basic_pstreambuf::rpipe(buf_read_src which) { return rpipe_[which]; } /** @return a pointer to the start of the active input buffer area. */ template inline typename basic_pstreambuf::char_type* basic_pstreambuf::rbuffer() { return rbuffer_[rsrc_]; } /* * member definitions for pstream_common */ /** * @class pstream_common * Abstract Base Class providing common functionality for basic_ipstream, * basic_opstream and basic_pstream. * pstream_common manages the basic_pstreambuf stream buffer that is used * by the derived classes to initialise an IOStream class. */ /** Creates an uninitialised stream. */ template inline pstream_common::pstream_common() : std::basic_ios(NULL) , command_() , buf_() { this->init(&buf_); } /** * Initialises the stream buffer by calling * do_open( @a command , @a mode ) * * @param command a string containing a shell command. * @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, pmode) */ template inline pstream_common::pstream_common(const std::string& command, pmode mode) : std::basic_ios(NULL) , command_(command) , buf_() { this->init(&buf_); do_open(command, mode); } /** * Initialises the stream buffer by calling * do_open( @a file , @a argv , @a mode ) * * @param file a string containing the pathname of a program to execute. * @param argv a vector of argument strings passed to the new program. 
* @param mode the I/O mode to use when opening the pipe. * @see do_open(const std::string&, const argv_type&, pmode) */ template inline pstream_common::pstream_common( const std::string& file, const argv_type& argv, pmode mode ) : std::basic_ios(NULL) , command_(file) , buf_() { this->init(&buf_); do_open(file, argv, mode); } /** * This is a pure virtual function to make @c pstream_common abstract. * Because it is the destructor it will be called by derived classes * and so must be defined. It is also protected, to discourage use of * the PStreams classes through pointers or references to the base class. * * @sa If defining a pure virtual seems odd you should read * http://www.gotw.ca/gotw/031.htm (and the rest of the site as well!) */ template inline pstream_common::~pstream_common() { } /** * Calls rdbuf()->open( @a command , @a mode ) * and sets @c failbit on error. * * @param command a string containing a shell command. * @param mode the I/O mode to use when opening the pipe. * @see basic_pstreambuf::open(const std::string&, pmode) */ template inline void pstream_common::do_open(const std::string& command, pmode mode) { if (!buf_.open((command_=command), mode)) this->setstate(std::ios_base::failbit); } /** * Calls rdbuf()->open( @a file, @a argv, @a mode ) * and sets @c failbit on error. * * @param file a string containing the pathname of a program to execute. * @param argv a vector of argument strings passed to the new program. * @param mode the I/O mode to use when opening the pipe. * @see basic_pstreambuf::open(const std::string&, const argv_type&, pmode) */ template inline void pstream_common::do_open( const std::string& file, const argv_type& argv, pmode mode ) { if (!buf_.open((command_=file), argv, mode)) this->setstate(std::ios_base::failbit); } /** Calls rdbuf->close() and sets @c failbit on error. */ template inline void pstream_common::close() { if (!buf_.close()) this->setstate(std::ios_base::failbit); } /** * @return rdbuf()->is_open(). 
* @see basic_pstreambuf::is_open() */ template inline bool pstream_common::is_open() const { return buf_.is_open(); } /** @return a string containing the command used to initialise the stream. */ template inline const std::string& pstream_common::command() const { return command_; } /** @return a pointer to the private stream buffer member. */ // TODO document behaviour if buffer replaced. template inline typename pstream_common::streambuf_type* pstream_common::rdbuf() const { return const_cast(&buf_); } #if REDI_EVISCERATE_PSTREAMS /** * @def REDI_EVISCERATE_PSTREAMS * If this macro has a non-zero value then certain internals of the * @c basic_pstreambuf template class are exposed. In general this is * a Bad Thing, as the internal implementation is largely undocumented * and may be subject to change at any time, so this feature is only * provided because it might make PStreams useful in situations where * it is necessary to do Bad Things. */ /** * @warning This function exposes the internals of the stream buffer and * should be used with caution. It is the caller's responsibility * to flush streams etc. in order to clear any buffered data. * The POSIX.1 function fdopen(3) is used to obtain the * @c FILE pointers from the streambuf's private file descriptor * members so consult your system's documentation for * fdopen(3). * * @param in A FILE* that will refer to the process' stdin. * @param out A FILE* that will refer to the process' stdout. * @param err A FILE* that will refer to the process' stderr. * @return An OR of zero or more of @c pstdin, @c pstdout, @c pstderr. * * For each open stream shared with the child process a @c FILE* is * obtained and assigned to the corresponding parameter. For closed * streams @c NULL is assigned to the parameter. * The return value can be tested to see which parameters should be * @c !NULL by masking with the corresponding @c pmode value. 
* * @see fdopen(3) */ template std::size_t basic_pstreambuf::fopen(std::FILE*& in, std::FILE*& out, std::FILE*& err) { in = out = err = NULL; std::size_t open_files = 0; if (wpipe() > -1) { if ((in = ::fdopen(wpipe(), "w"))) { open_files |= pstdin; } } if (rpipe(rsrc_out) > -1) { if ((out = ::fdopen(rpipe(rsrc_out), "r"))) { open_files |= pstdout; } } if (rpipe(rsrc_err) > -1) { if ((err = ::fdopen(rpipe(rsrc_err), "r"))) { open_files |= pstderr; } } return open_files; } /** * @warning This function exposes the internals of the stream buffer and * should be used with caution. * * @param in A FILE* that will refer to the process' stdin. * @param out A FILE* that will refer to the process' stdout. * @param err A FILE* that will refer to the process' stderr. * @return A bitwise-or of zero or more of @c pstdin, @c pstdout, @c pstderr. * @see basic_pstreambuf::fopen() */ template inline std::size_t pstream_common::fopen(std::FILE*& in, std::FILE*& out, std::FILE*& err) { return buf_.fopen(in, out, err); } #endif // REDI_EVISCERATE_PSTREAMS } // namespace redi /** * @mainpage PStreams Reference * @htmlinclude mainpage.html */ #endif // REDI_PSTREAM_H_SEEN // vim: ts=2 sw=2 expandtab murasaki/src/ecolist.cc0000644000177700001440000003730311434752234014525 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ #include "ecolist.h" #include "exceptions.h" #include "dinkymath.h" #include #include //a "chunk" is a "location seqbit/idxbit combo" //a "block" is word (in words) that is used to store either chunks (which may straddle up to 2 blocks) //blocks are allocated in "blobs" of initBlocks<&,const pair&)=0; Ecolist::Ecolist() : size(0), blocks(NULL) { assert(inited); } void Ecolist::clear(){ eraseAll(blocks,0); size=0; } ostream& operator<<(ostream& os,Ecolist& a){ return os << "(Ecolist: "< Ecolist::getValAt(SeqPos loc){ assert(loc(seq,idx); } SeqPos Ecolist::capacity(){ SeqPos cap=0; SeqPos ite=0; for(block *root=blocks;root;root=root[initBlocks<=size) return end(); SeqPos ite=0; block* root=blocks; while(remain>=minChunks<(),0,0); } Ecolist::bi_iterator Ecolist::bi_end(){ SeqPos ite=0; list prev; block *root; SeqPos pos=size; for(root=blocks;root;){ block *tmp=root[initBlocks<=size) return bi_end(); SeqPos ite=0; list prev; block* root=blocks; while(remain>=minChunks<WORDSIZE) throw MurasakiException("Can't deal with chunks larger than wordsize. Please use an environment with bigger words, or reduce input size, or use a different hash table"); //now we have about allocation choice. 
We can vow to never waste a single bit (we'll call this "frugal") or //we can make the most out of a block while assuming that each list has zero entries (we'll call this "optimistic") SeqPos expected=ceil_log2_loop(total)-weight; if(frugal){ minBlocks=lcm(ceil_log2_loop(seqs)+ceil_log2_loop((longest+1)*2),WORDSIZE)/WORDSIZE; for(initBlocks=minBlocks;initBlocks=(size_t)(minChunks<=(size_t)(minChunks<0); assert(size>valoff); } } SeqPos Ecolist::readBits(block* root,SeqPos loc,SeqPos offset,SeqPos size){ SeqPos bitloc=loc*chunkSize+offset; SeqPos wloc=bitloc/WORDSIZE; offset=MODWORDSIZE(bitloc); //offset now means "location inside word to start reading from" word mask=lowN(size)<>offset; if(offset && MODWORDSIZE(bitloc+size-1)0); assert(size=minChunks< Ecolist::EcolistIterator::operator*() const { SeqPos seq=readBits(current,pos,0,Ecolist::seqbits); SeqPos idx=readBits(current,pos,Ecolist::seqbits,Ecolist::idxbits)-Ecolist::idxLongest; return std::pair(seq,idx); } std::pair Ecolist::EcolistIterator::get() const { SeqPos seq=readBits(current,pos,0,Ecolist::seqbits); SeqPos idx=readBits(current,pos,Ecolist::seqbits,Ecolist::idxbits)-Ecolist::idxLongest; return std::pair(seq,idx); } Ecolist::EcolistIterator::_Self& Ecolist::EcolistIterator::operator++(){ pos++; if(pos==end && current[initBlocks< &v) const { writeBits(current,pos,0,Ecolist::seqbits,v.first); writeBits(current,pos,Ecolist::seqbits,Ecolist::idxbits,v.second+Ecolist::idxLongest); } //bidirectional-mojo Ecolist::BidirectionalEcolistIterator::BidirectionalEcolistIterator(block *_start,list _prev,SeqPos _p,SeqPos _ite) : current(_start),prev(_prev),pos(_p),ite(_ite),end(Ecolist::minChunks<<_ite) { } std::pair Ecolist::BidirectionalEcolistIterator::operator*() const { SeqPos seq=readBits(current,pos,0,Ecolist::seqbits); SeqPos idx=readBits(current,pos,Ecolist::seqbits,Ecolist::idxbits)-Ecolist::idxLongest; return std::pair(seq,idx); } std::pair Ecolist::BidirectionalEcolistIterator::get() const { SeqPos 
seq=readBits(current,pos,0,Ecolist::seqbits); SeqPos idx=readBits(current,pos,Ecolist::seqbits,Ecolist::idxbits)-Ecolist::idxLongest; return std::pair(seq,idx); } Ecolist::BidirectionalEcolistIterator::_Self& Ecolist::BidirectionalEcolistIterator::operator++(){ pos++; if(pos==end && current[initBlocks< &v) const { writeBits(current,pos,0,Ecolist::seqbits,v.first); writeBits(current,pos,Ecolist::seqbits,Ecolist::idxbits,v.second+Ecolist::idxLongest); } //sort mojo void Ecolist::inPlaceSort(){ assert(lessthan); qsort(0,size); } SeqPos Ecolist::partition(SeqPos left,SeqPos right,SeqPos pivoti){ iterator pi(at(pivoti)),li(at(left)),ri(at(right)); val_type pv=*pi; swap(pi,ri); iterator store(li);SeqPos si=left; //using a store lets us get away with forward-only iterators for(iterator i(li),stop(ri);i!=stop;++i){ if(lessthan(*i,pv)){ swap(store,i); ++store;++si; } } swap(ri,store); return si; } void Ecolist::qsort(SeqPos left,SeqPos right){ if(right<=left) return; //left edge. theoretically no reason to believe that middle would be better. (unless it's already been sorted. then we want middle) as a (slight) bonus left edge is guaranteed to be the fastest one to look up. 
// SeqPos pivoti=left; //Actually, using the seedfilter, we might get a mostly ordered list, which would make us sad, so we'll take middle (which would be optimal in that case anyway) SeqPos pivoti=(left+right)/2; pivoti=partition(left,right,pivoti); qsort(left,pivoti-1); qsort(pivoti+1,right); } void Ecolist::swap(iterator &a,iterator &b){ val_type temp=*a; a.put(*b); b.put(temp); } struct less_map : public binary_function { bool operator()(const Ecolist::val_type& a,const Ecolist::val_type& b) { return Ecolist::lessthan(a,b); } }; void Ecolist::externalSort(){ vector v; v.reserve(size); iterator ite(begin()),stop(end()); for(;ite!=stop;++ite) v.push_back(*ite); std::sort(v.begin(),v.end(),less_map()); ite=begin(); for(SeqPos i=0;i0){ SeqPos half=len/2; SeqPos mid=low+half; high=low+len; val_type here=getValAt(mid); if(lessthan(here,m)){ low=mid+1; len=len-half-1; } else len=half; } return low; } SeqPos Ecolist::int_upper_bound(const val_type &m){ //first elem not 0){ SeqPos half=len/2; SeqPos mid=low+half; high=low+len; val_type here=getValAt(mid); if(lessthan(m,here)) len=half; else{ low=mid+1; len=len-half-1; } } return low; } murasaki/src/svninfo.pl0000755000177700001440000000127511434752234014575 0ustar krispusers#!/usr/bin/perl use strict; local $\="\n"; local $,=","; my @svn=`svn info -rHEAD 2>&1`; exit 1 if $?; #svn not happy? don't return anything. my %svn=kvsplit(@svn); my ($date)=split(/ /,$svn{"Last Changed Date"}); my ($branch)=$svn{URL}; $branch=$1 if $svn{URL}=~m!((?:branches|trunk|svn)/[a-zA-Z/]*)src$!; $branch=~s!/$!!; my @svnstatus=`svn status`; my @modified=grep {/^M/ and !/Makefile$/ } @svnstatus; my $modified="*".join("~",map {m!([^/ \t]+?)\s*$!; "$1"} @modified); print join(":",$branch,"r$svn{'Last Changed Rev'}",$date,(@modified ? 
($modified):())); sub kvsplit { my %r; foreach my $l (@_){ chomp $l; next unless $l=~m/^([^:]+): (.*)/; $r{$1}=$2; } return %r; } murasaki/src/msethash.cc0000644000177700001440000000446011434752235014676 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ///////////////////////// // the original multiset hash buckets ////////////////////// #include "sequence.h" #include "msethash.h" #include "murasaki.h" #include using namespace std; MSetHash::MSetHash(BitSequence *pat) : Hash(pat), activeRange((HashIte)NULL,(HashIte)NULL) { fasta=new MHash[hash_size]; memset(fasta,0,sizeof(MHash)*hash_size); } void MSetHash::clear(){ for(word i=0; i; fasta[key]->insert(val); } void MSetHash::getMatchingSets(HashKey key,list &sets){ if(emptyAt(key)) return; HashIte start(fasta[key]->begin()),stop(fasta[key]->end()); for(HashIte si=start;si!=stop;){ sets.push_back(LocList(seq_count)); LocList &locList=sets.back(); for(HashIte setEnd(fasta[key]->upper_bound(*si));si!=setEnd;++si){ locList[si->seqId()].push_back(*si); } } } void MSetHash::lookup(HashKey key,LocList &locList){ for( pair range(fasta[key]->begin(), fasta[key]->end()); range.first!=range.second; range.first++){ locList[range.first->seqId()].push_back(*range.first); } } bool MSetHash::emptyAt(const HashKey key){ if(!fasta[key] || fasta[key]->empty()) return false; return sizeAt(key)==0; } word 
MSetHash::sizeAt(const HashKey key){ if(!fasta[key]) return 0; else return fasta[key]->size(); } word MSetHash::sizeAt(const HashKey key,const HashVal &val){ return fasta[key]->count(val); } murasaki/src/seqread.cc0000644000177700001440000004015711434752233014507 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ////////////// // murasaki project // seqread.cc // c++ support for reading various sequence formats ////////////// #include "genopts.h" #include "seqread.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef WITH_MINGW #include "mingw32compat.h" #endif using namespace std; namespace fs = boost::filesystem; namespace io = boost::iostreams; const char seqDiv[]="NNNNNNNNNN"; const boost::regex stitchLine("(\\S+)\\t(\\d+)\t(\\d+)\\t(\\d+)"); SequenceReaderGlobalOptions* SequenceReaderGlobalOptions::_instance; SequenceReader::SequenceReader(): repeatMask(false),lengthOnly(false), recordLength(true),silent(false), verbose(false) {} size_t SequenceReader::readSeqInto(string& dst,const string &filename){ SequenceFileReader reader(filename,options()); char c; while((c=reader.getc())){ dst.append(1,c); } return reader.size(); } size_t SequenceReader::readSeqInto(string& dst,istream& is,const string filename){ SequenceFileReader reader(is,filename,options()); char c; 
while((c=reader.getc())){ dst.append(1,c); } return reader.size(); } SequenceByteReader::SequenceByteReader(istream& _is,const string _filename,int options) : repeatMask(options & SEQO_RMASK),lengthOnly(options & SEQO_LEN),recordLength(!(options & SEQO_NOREC)),silent(options & SEQO_SILENT),verbose(options & SEQO_VERBOSE), bufsize(4096),linenum(1), finished(false),filetype(SEQFILE_END), filename(_filename),is(_is), redirect(0),count(0),subcount(0), subSeqId(0), bogusCharWarned(false) { if(!is) throw SeqReadException("Couldn't read from "+filename); boost::regex rawre("[acgtnACGTN]+\\n?"); boost::cmatch results; char buf[bufsize]; is.getline(buf,bufsize); if((is.eof() || (is.fail() && is.gcount()>=(bufsize-1))) || regex_match(buf,results,rawre)){//appears to be a raw sequence. read all and dump is.clear(); if(verbose)cerr << "It's a raw file!"<' || buf[0]==';'){ //it's a fasta file! if(verbose)cerr << "It's a fasta file!"< mean we insert 10 N's subSeqName=string(buf+1); return; } //end of fasta file parsing. if(regex_match(buf,results,stitchLine)){ if(verbose)cerr << "It's a stitch file!"<ignoreBogusChars ? 
"Ignoring.":"Converting to Ns.")); } if(!SequenceReaderGlobalOptions::options()->ignoreBogusChars) dst.append(1,'n'); } } else for(const char *i=buf,*stop=buf+bufsize;i!=stop;i++){ if(!*i) break; switch(*i){ case 'a':case 'c': case 'g': case 't': case 'n': case 'A':case 'C': case 'G': case 'T': case 'N': dst.append(1,*i); break; case 'b': case 'd': case 'e': case 'f': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'o': case 'p': case 'q': case 'r': case 's': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'B': case 'D': case 'E': case 'F': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': if(!bogusCharWarned){ bogusCharWarned=true; warnx("Warning: %s contains illegal (non [acgtn]) characters (%c at %ld (line %ld)). %s",filename.c_str(),*i,(long)size(),(long)linenum,(SequenceReaderGlobalOptions::options()->ignoreBogusChars ? "Ignoring.":"Converting to Ns.")); } if(!SequenceReaderGlobalOptions::options()->ignoreBogusChars) dst.append(1,'n'); } } } bool SequenceByteReader::readMore(){ buffers.push_back(string()); buffers.back().reserve(bufsize); string &dst=buffers.back(); if(is.eof()){ return false; } char buf[bufsize]; bzero(buf,bufsize); switch(filetype){ case SEQFILE_RAW: is.read(buf,bufsize); if(is.bad()) throw SeqReadException("Read() failed mid raw-stream"); procBuf(dst,buf,bufsize); break; case SEQFILE_FASTA: do { is.getline(buf,bufsize); // cout << "Read fasta line "<'){ procBuf(dst,seqDiv,11); subSeqId++; subSeqName=string(buf+1); continue; } if(buf[0]==';')continue; //comment line //otherwise it must be content procBuf(dst,buf,bufsize); if(is.eof()) return true; if(!is.fail()) linenum++; else is.clear(); //just an overly full line break; }while(1); break; case SEQFILE_STITCH: is.getline(buf,bufsize);linenum++; if(is.fail()) //most likely already at eof return false; if(!is.good()) throw 
SeqReadException("read() failed mid stitch file"); if(1){//purely to scope results boost::cmatch results; if(!regex_match(buf,results,stitchLine)) throw SeqReadException("stitch file suddenly contained a non-stitch line"); //ok, can add a new sequence. dst.append(seqDiv); addRedirect(results); subSeqId++; subSeqName=results[1]; } return true; break; case SEQFILE_GBK: is.getline(buf,bufsize);linenum++; if(is.fail()) //most likely already at eof return false; if(!is.good()) throw SeqReadException("read() failed mid sequence"); procBuf(dst,buf,bufsize); return true; break; default: throw SeqReadException("Read on undefined file format"); } return true; } void SequenceByteReader::addRedirect(boost::cmatch results){ assert(redirect==0); fs::path stitchPath; if(!filename.empty()){//commence the annoying parsing of the filename! fs::path stitchfile(filename); stitchPath=stitchfile.branch_path(); } if(verbose)cerr << results[2] << " region from "<< results[3] << " to "<(results[2]); //assume we read the whole thing... }catch(bad_lexical_cast& e){ throw SeqReadException("Invalid stitch member length: "+results[2]+" :"+e.what()); } } } char SequenceByteReader::getc(){ if(finished)return 0; //already finished on previous pass if(!buffers.empty()){ //do i have any unsent data pending? if so, send that if(subcountgetc(); if(c){ count++; return c; }else redirect=0; } while(readMore()){ //read more out of our file (which might be blank lines, thus the while) //file's not over yet, so return that assert(!buffers.empty()); if(buffers.front().length()){ //real data. safe to return. count++; assert(buffers.front()[subcount]); return buffers.front()[subcount++]; }//if not, try the next line. else buffers.erase(buffers.begin()); //but still have to erasethis empty junk buffer! 
} //end of file if(verbose)cerr << "End of file ("< &range,int options){ using namespace boost; static const regex rangeSpec("^(.*)[\\(\\{\\[](\\d+)\\W(\\d+)[\\)\\}\\]]$"); smatch results; if(regex_match(filename,results,rangeSpec)){ if(options & SEQO_VERBOSE) cerr << "Selecting range ["<(results[2])-1; //offset by put in 0-based coords range.second=lexical_cast(results[3])-1; if(range.first>range.second) swap(range.first,range.second); }catch(bad_lexical_cast& e){ throw SeqReadException(string("Bad value in range specification (")+results[0]+"):"+e.what()); } return true; }else return false; } bool SequenceFileReader::useRangeSpec(int noOpen){ if(parseRangeSpec(filename,range,options)){ if(ifs) ifs.close(); if(!noOpen){ ifs.open(filename.c_str()); } rangeOnly=true; return true; }else return false; } SequenceFileReader::SequenceFileReader(const string _filename,int _options) : options(_options), filename(_filename),ifs(_filename.c_str()),is(NULL), byteReader(NULL), rangeOnly(false),range(0,0),outCount(0) { useRangeSpec(); if(!ifs) throw SeqReadException("Couldn't open file: "+filename); is=setupInputFilters(ifs); if(!is) throw SeqReadException("Failed to set up input filters"); byteReader=new SequenceByteReader(*is,filename,_options); } SequenceFileReader::SequenceFileReader(istream &_is,const string _filename,int _options) : options(_options), filename(_filename),ifs(),is(setupInputFilters(_is)), byteReader(NULL), rangeOnly(false),range(0,0),outCount(0) { useRangeSpec(1); if(!_is) throw SeqReadException("Invalid input stream"); if(!is) throw SeqReadException("Failed to set up input filters"); byteReader=new SequenceByteReader(*is,filename,_options); } SequenceFileReader::~SequenceFileReader(){ if(is) delete is; if(byteReader) delete byteReader; } istream* SequenceFileReader::setupInputFilters(istream &file){ if(!file) return NULL; //don't create dud filters if(!filename.empty()){ //if it's not actually a file, we can't seek. 
unsigned char magic_c[2]; file.read((char*)magic_c,2); file.seekg(0); unsigned magic=(magic_c[0]<<8) | magic_c[1]; switch(magic){ case 0x425a: in.push(io::bzip2_decompressor());break; case 0x1f8b: in.push(io::gzip_decompressor());break; case 0x1f9d: in.push(io::zlib_decompressor());break; } } in.push(file); return new istream(&in); } size_t SequenceFileReader::peekLength() const { return getLength(filename,options); } size_t SequenceFileReader::getLength(string filename,int options){ pair range; bool rangeOnly=parseRangeSpec(filename,range,options); size_t length; if(!fs::exists(filename)) throw SeqReadException(string("Sequence file not found: ")+filename); fs::path lengthFile(filename+string(".length")); if(!fs::exists(lengthFile) || (fs::exists(filename) && fs::last_write_time(lengthFile)> size; length=size; is.close(); } if(rangeOnly) return min(length,range.second+1)-range.first; else return length; } char SequenceFileReader::getc(){ if(rangeOnly){ if(byteReader->size()size()getc(); if(byteReader->size()>range.second){ if(options & SEQO_VERBOSE)cerr << "End of subrange"<getc(); if(c)outCount++; return c; } size_t SequenceFileReader::readLength(){ while(byteReader->getc()) ; return byteReader->size(); } size_t SequenceFileReader::size(){ return outCount; } bool SequenceFileReader::eof(){ if(!rangeOnly) return byteReader->eof(); return byteReader->eof() || byteReader->size()>range.second; } string SequenceFileReader::formatString(){ switch(format()){ case SEQFILE_RAW: return "raw";break; case SEQFILE_STITCH: return "stitch";break; case SEQFILE_FASTA: return "fasta";break; case SEQFILE_GBK: return "genebank";break; default: return "unknown";break; } } SequenceReaderGlobalOptions* SequenceReaderGlobalOptions::options(){ if(!_instance) _instance=new SequenceReaderGlobalOptions(); return _instance; } SequenceReaderGlobalOptions::SequenceReaderGlobalOptions(): ignoreBogusChars(false) { if(getenv("MURASAKI_SR_IGNOREBOGUS")) 
getYesNo(getenv("MURASAKI_SR_IGNOREBOGUS"),ignoreBogusChars,"MURASAKI_SR_IGNOREBOGUS"); } murasaki/src/getsegments.cc0000644000177700001440000001426511434752234015412 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ ////////////// // murasaki project // geneparse.cc // c++ implementation of the geneparse.pl program ////////////// #include #include #include #include "seqread.h" #include #include #include #include #include using namespace std; string program_version(); string program_help(); typedef pair SubSeq; size_t getSubSeqs(SequenceFileReader &reader,list &output); size_t getSubSeqs(SequenceFileReader &reader,list &output,string filename); void writeLenFile(ostream &os,size_t size,const list &segments); int main(int argc,char **argv){ SequenceReader seqreader; bool verbose=false; bool useStdOut=false; //send output to stdout instead of disk ostream* os=&cout; string customOutput; int optc; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"help",0,0,'h'}, {"version",0,0,'V'}, {"verbose",0,0,'v'}, {"stdout",0,0,'c'}, {0,0,0,0} }; int longindex=0; string prefreq; optc=getopt_long(argc,argv,"hVvc",long_options,&longindex); if(optc==-1)break; switch(optc){ case 'h': cout << program_help();exit(0);break; case 'V': cout << program_version();exit(0);break; case 'v': 
verbose=true;seqreader.verbose=true;break; case 'c': useStdOut=true;break; default: cerr << "Unknown option: "<<(char)optc< segments; size_t size=0; SequenceFileReader reader(cin); useStdOut=true; //only stdout when using stdin size=getSubSeqs(reader,segments); writeLenFile(cout,size,segments); }catch(SeqReadException e){ cerr << "Error reading from stdin: "< segments; size_t size=0; string file(argv[i]); string outfilename(file+".len"); if(!useStdOut){ os=new ofstream(outfilename.c_str()); if(!os || !*os){ warn("Problem opening %s for writing. Skipping.",outfilename.c_str()); continue; //skip this file } } if(verbose)cerr << "Reading "<close(); delete os; os=&cout; } } } return 0; } size_t getSubSeqs(SequenceFileReader &reader,list &output,string filename){ //if we're given the filename, and it's a stitch file we can sneakily strip the //relevant data right out of the stitch file. if(reader.format()!=SEQFILE_STITCH) return getSubSeqs(reader,output); ifstream inf(filename.c_str()); if(!inf) return getSubSeqs(reader,output); using namespace boost; const regex stitchLine("(\\S+)\\t(\\d+)\t(\\d+)\\t(\\d+)"); string buf; size_t lastCoord=0; for(int linenum=1;inf && getline(inf,buf);linenum++){ smatch results; if(!regex_match(buf,results,stitchLine)){ warnx("Invalid stitch (%s) line %d: '%s'. Aborting stitch-read.",filename.c_str(),linenum,buf.c_str()); output.clear(); return getSubSeqs(reader,output); } try { output.push_back(SubSeq(lexical_cast(results[2]),results[1])); lastCoord=lexical_cast(results[4]); }catch(bad_lexical_cast e){ warnx("Failed to parse stitch (%s) line %d: '%s'. %s",filename.c_str(),linenum,buf.c_str(),e.what()); output.clear(); return getSubSeqs(reader,output); } } return lastCoord; } size_t getSubSeqs(SequenceFileReader &reader,list &output){ size_t count=0,inSeg=0; long prevSubSeqId=reader.getSubSeqId(); string prevSubSeqName=reader.getSubSeqName(); while(reader.getc()){ if(reader.getSubSeqId()!=prevSubSeqId){ //subseq changed! 
assert(inSeg>=1); //murasaki technically forbids 0 length sub-sequences from being recognized output.push_back(SubSeq(inSeg,prevSubSeqName)); inSeg=-10; //this looks a bit hinky, but between every subseq there _should_ be exactly 10 Ns, so this accounts for that // assert(c=='N'); //see? we're looking at an N! prevSubSeqId=reader.getSubSeqId(); prevSubSeqName=reader.getSubSeqName(); } count++; inSeg++; } assert(inSeg>=1); //murasaki technically forbids 0 length sub-sequences from being recognized output.push_back(SubSeq(inSeg,prevSubSeqName)); return count; } void writeLenFile(ostream &os,size_t size,const list &segments){ os << size << endl; for(list::const_iterator i=segments.begin(); i!=segments.end();++i) os << i->first << "\t" << i->second << endl; } string program_help(){ return string("\ Usage: getsegments [options...] [input] [input2 ...]\n\ \n\ Generates a .len file for each input sequence containing the names\n\ and lengths of each subsequence. These files aren't used by Murasaki\n\ itself, but several of the other supporting programs use/require them.\n\ \n\ Options:\n\ --version|-V - program version\n\ --help|-h - this message\n\ --verbose|-v - lots of extra details\n\ --stdout|-c - write output to stdout instead of input.len\n\ "); } string program_version(){ return string("1.0"); } murasaki/src/sequence.h0000644000177700001440000004213511434752234014534 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ //////// // sequence defs // Kris Popendorf /////// #ifndef SEQUENCE__H #define SEQUENCE__H #include "globaltypes.h" #include "cmultiset.h" #include "scoring.h" #include "binseq.hh" #include #include #include #include #include #include #include #include #include "itree.h" #include "seqread.h" #include "shmvector.hh" #ifdef USE_SHM_SYSV #include #include #endif using namespace std; using namespace Itree; //in my world: // A = 00 // C = 01 // G = 10 // T = 11 // isnt that cute? ~G==C and so on (and in alphabetical order!) #define MODWORDSIZE(a) ((a) & (WORDSIZE-1)) extern int _wordsize,hash_max; extern int max_hash_bits;//hash_max limits memory usage extern word hash_size; //actually used size, might be less than max extern word hash_padding; extern int mismatch_penalty,match_score,max_credits; extern bool gappingEnabled; typedef map HashCount; #define BASE_A 0 #define BASE_C 1 #define BASE_G 2 #define BASE_T 3 #define BASE_N 4 #define BASES 5 typedef int BASE_TYPE; //stupid broken enums... #define OVERLAP_NONE 0 #define OVERLAP_START_AB 1 #define OVERLAP_START_BA 2 #define OVERLAP_STOP_AB 4 #define OVERLAP_STOP_BA 8 #define OVERLAP_AINB 16 #define OVERLAP_BINA 32 #define COLINEAR_AB 64 #define COLINEAR_BA 128 #define COLINEAR_NONE 0 typedef int OverlapSense; //doing |= on enums does bad things now apparently typedef int ColinearSense; //to avoid semantic abuse of overlapsense with non-overlapping things extern double globalBaseFreq[4]; extern word globalBaseCount[4],globalCounted; enum SharedMemType {SHM_NONE=0,SHM_MMAP_RO, SHM_MMAP_RW, SHM_SYSV, SHM_END}; void finishGlobalFrequencies(); //forward declarations... 
class HashMethod; class Window; class Anchor; class Location; class AnchorSet; class UsedMap; class Sequence; void set_seqtable(vector seqs); void initHashParams(int bits,BitSequence *pat); void initConstants(); inline word revComp8bit(word t); inline word revCompWord(word t); inline int highestBit(word t); inline word lowN(int i); inline word highN(int i); inline word first2(word w){return w>>(WORDSIZE-2);} int popCount(word w); string baseToString(BASE_TYPE b); string bitToString(word w); string wordToString(word w,int bits=WORDSIZE); string wordToMaskedString(word w,word pat, int bits=WORDSIZE); string wordToPattern(word w,int bits=WORDSIZE); string repString(string s,int count); string hashCacheName(string name, BitSequence *pat); //Sequence base pointers and meta-data typedef pair SeqPosPair; typedef ShmVector SeqPosPairArray; //bits for sequence index table extern map seq2idx; typedef itree usedItree; typedef usedItree::iterator MapIte; typedef Interval UsedInt; extern int next_sequence_; extern int base_patt_len; extern word activeHash; extern word totalSequenceMemory; extern word sys_pagesize; extern CryptoHasher *cryptoHasher; //for managing UsedInts UsedInt UsedInt_grown(const UsedInt &a,const SeqPos amt); UsedInt UsedInt_inverted(const UsedInt &a,const Sequence& s); UsedInt& UsedInt_invert(UsedInt &a,const Sequence& s); UsedInt UsedInt_coalesced(const UsedInt &a,const UsedInt &b); void UsedInt_coalesce(UsedInt &x,const UsedInt &a); bool UsedInt_contains(const UsedInt &outer,const UsedInt &inner); inline bool isRev(const UsedInt &a){return a.start<0;} string asAnchor(const UsedInt& a,const Sequence *s); bool UsedInt_sanityCheck(const UsedInt &a); OverlapSense UsedInt_overlaps(UsedInt a,UsedInt b); string OverlapSense2str(OverlapSense o); UsedInt& UsedInt_rawInvert(UsedInt &x); SeqPos UsedInt_offset(UsedInt a,UsedInt b); SeqPos UsedInt_distance(UsedInt a,UsedInt b); //distance between nearest ends //inline function defs actually have to be in here if 
they're used in other .cc's... const inline SeqIdx lookup_seqtable(Sequence* s) { assert(seq2idx.find(s)!=seq2idx.end()); //must be in there already return seq2idx[s]; } inline SeqPos bitSeqCoords(SeqPos seqCoords){ return abs(seqCoords)-1; } inline SeqPos seqCoords(SeqPos bitSeqCoords,int sign){ return sign>0 ? (bitSeqCoords+1):-(bitSeqCoords+1); } inline UsedInt seqCoords(UsedInt bitSeqCoords,int sign){ return UsedInt(seqCoords(bitSeqCoords.start,sign),seqCoords(bitSeqCoords.stop,sign)); } inline word lowN(int i){ return ~(((word)-1)<>i); } class BitSequence { //for packing dna strings into 2 bit chunks public: //accessor inline word getCounted() const {return counted;} inline const word* words() const {return _words;} inline word wordCount() const {return word_count;} inline word reverseOtf(SeqPos pos) const; inline SeqPos length() const {return _length;} int wordsize(); string asString() const; string asPattern() const; inline int readCode(SeqPos bitpos) const; inline BASE_TYPE readBase(SeqPos basepos) const; word readWord(SeqPos wordpos) const; inline word readRawWord(SeqPos wordpos) const; inline word wordAtBase(SeqPos basepos) const; inline SeqPos readBitCount(){return bit_count;} inline int hashLength(){return _hashLength;} int compHashLength(); SeqPosPair* localRegion(SeqPos pos); //find region local to pos SeqPosPairArray::iterator localRegionIte(SeqPos pos); SeqPos spaceRight(SeqPos pos); //matchable space right of pos SeqPos spaceLeft(SeqPos pos); //matchable space left of pos string genBitFilename(); SeqPos cmpRight(SeqPos pos,BitSequence& target,SeqPos targetPos); SeqPos cmpLeft(SeqPos pos,BitSequence& target,SeqPos targetPos); //manipulate others void maskString(string &str) const; //operator bool equal(SeqPos pos,BitSequence& target,SeqPos targetPos,BitSequence& patt); //mutator void invert(); //flips to comp sequence //bitsequence constructors BitSequence(Sequence *s); //from a file. do the smart thing. 
BitSequence(const string &str); //an anonymous 0101 style pattern (as mask: ie double each bit) BitSequence(SequenceFileReader &reader,Sequence* s); //from either GATC or 0101 (as mask: ie double each bit) BitSequence(const BitSequence& a); //copy BitSequence(const word length,Sequence* s,bool _reverse=false); BitSequence();//uninited ~BitSequence(); BitSequence* reverseComplement(); //generate a reverse complement version friend bool operator==(const BitSequence& a,const BitSequence& b); friend class Window; friend class HashMethod; friend void initHashParams(int bits,BitSequence *pat); friend ostream& operator<<(ostream &os,const BitSequence &a); SeqPosPairArray matchRegions; //match regions contains hashable areas (ie: no Ns) SeqPosPairArray subSeqs; //subSeqs contains all subsequences (contigs, chromosomes, whatever multiple sequences are combined into a single fasta or .stitch file), regions that can be anchored (might contain Ns) UsedInt subSeqBounds(SeqPos at); //mpi specific void mpi_distribute(); //distribute worldwide //shared memory specific SharedMemType shmUsed; string shmAnchor; void shm_distribute(); //distribute to other local nodes void mmap_msync(); protected: //init void initPattern(const string &str); void loadReader(SequenceFileReader &reader); void loadBinary(const SequenceBinary &reader); void init(const BitSequence& a); void compileHashFunc(); void randomHashFunc(int n); word* allocWords(); SeqPos _length; word bit_count,word_count; int _hashLength; bool isPattern; Sequence* seq; word* _words; bool reverse; word counted; //for fast hash computation: HashMethod* hasher; }; class HashMethod { public: typedef vector > InputList; protected: InputList inputs; static map popmap; BitSequence *seq; void addWord(); void removeWord(); string srcSetToString(const set &a); public: HashMethod(const HashMethod &a); HashMethod(const HashMethod &a,const HashMethod &b); //mmm hot function on function action HashMethod(BitSequence *s); HashMethod(BitSequence 
*s,int n); //random hash function for n words. bad idea, but I won't stop you. string prettyPrint(); //scoring state vector entropy; vector used,unusedWords; double sources,empties,fitness; double entropyHi,entropyLo,entropyTotal; double entropyMean,entropyStd; double totalCorrelationPenalty; word active; inline const InputList& inputlist(){return inputs;} friend ostream& operator<<(ostream &os,const HashMethod &a); int align(int w); //propose an offset for a new input void finalize(); int maxSources(); void mutate(); word hash(Window &w); double fitnessCheck(); void removeDuplicates(); void pruneUseless(); }; inline bool operator<(const HashMethod &a,const HashMethod &b){return a.fitness buffer; public: inline word firstWord(){return buffer[0];} }; class Sequence { public: Sequence(); //empty sequence (ie: useless) Sequence(string); //from a file Sequence(const Sequence &obj); // duplicate ~Sequence(); //have to cleanup fwd and rev string filename,name,baseFilename; SeqPos length() const; inline int getId(){return seqID;} inline SeqPos extCoords(SeqPos mcoords) const {return mcoords<0 ? -1-(rev->length()+mcoords):mcoords;} UsedInt getSubSeqBounds(SeqPos at); UsedInt growInBounds(const UsedInt &basis,SeqPos amount); friend bool operator ==(const Sequence&,const Sequence&); BitSequence *fwd,*rev; BaseIterator iterate(SeqPos at); pair iterate(const UsedInt &at); protected: int seqID; #ifdef USE_SHM_SYSV //sysvipc specific key_t sysv_key; int sysv_shmid; friend class BitSequence; //i need you to be my friend!! #endif }; class Location { public: SeqIdx seqno; SeqPos pos; Sequence* seq() const; inline int seqId() const {return seqno;} //umm this should be "return seqno" i think? 
BitSequence* bitSeq() const; bool equal(const Location &a,BitSequence &patt) const; friend bool operator==(const Location &a,const Location &b); friend bool operator<(const Location &a,const Location &b); friend ostream& operator<<(ostream &os,const Location &a); inline SeqPos bitSeqPos(){return (pos>0 ? pos:0-pos)-1;} SeqPosPair* localRegion(); SeqPosPairArray::iterator localRegionIte(); //find region local to pos Location(Sequence*,SeqPos); Location(SeqIdx, SeqPos); Location(); //this is dangerous. be careful with it. }; typedef list LocSubList; typedef vector LocList; bool operator==(const HashVal &a,const HashVal &b); class Hash { public: //abstract parts virtual void clear() = 0; virtual void add(const HashKey key,const HashVal &val) = 0; virtual void lookup(HashKey key,LocList &locList) = 0; virtual void getMatchingSets(HashKey base,list &fulllist) = 0; virtual bool emptyAt(const HashKey key) = 0; virtual word sizeAt(const HashKey key) = 0; virtual word sizeAt(const HashKey key,const HashVal &val) = 0; //optional bits virtual void writePerformanceData(string prefix); //for computing memory costs static const word linear_cost(word); static const word bucket_prep_cost(word); // dump / load may be overloaded virtual void dump(ostream &os); virtual void load(istream &is); //non virtual stuff that will never change void writeHistogram(ostream &of); void writeDetailedHistogram(ostream &of); BitSequence* hashpat; bool sanityCheck(); bool sanityCheck(word base); //the boring junk Hash(BitSequence *pat); virtual ~Hash() = 0; protected: //nada }; class AnchorSet; struct ltRegion : binary_function { inline bool operator()(const SeqPosPair &a, const SeqPosPair &b) const { return a.second::iterator _IntervalIterator; typedef vector >::iterator _MemberIterator; IntervalSet(const _IntervalIterator &intStart, const _IntervalIterator &intStop, const _MemberIterator &memberStart,const _MemberIterator &memberStop); string contents(string delim) const; inline string contents() 
const {return contents("\n");} void invert(); //flip everything from forwards<->backwards void coalesce(const AnchorSet& neighbor); bool contains(const IntervalSet &a) const; OverlapSense overlaps(const AnchorSet &a) const; bool hasGaps(const AnchorSet& a) const; //would merging with a cause gaps? SeqPos offset(const AnchorSet& a) const; pair gapOffset(const AnchorSet& a) const; //no sense in duplicating the above code ColinearSense colinear(const AnchorSet& a,SeqPos MaxDist) const; void add(HashVal&,SeqPos length); void exactExtend(); Score fuzzyExtend(); Score score(); pair entropy() const; bool sanityCheck() const; friend ostream& operator<<(ostream& os,const IntervalSet& a); vector spaces; HashCount members; }; class AnchorSet { //anchorsets are necessarily tied to a set of usedmap entries public: vector spaces; HashCount members; AnchorSet(const IntervalSet &a,UsedMap& used); bool contains(const IntervalSet &a) const; bool sanityCheck() const; //this is straight forward enough size_t hitCount() const; double uniqueness(UsedMap* context) const; SeqPosPair bitscore() const; //though this is outmoded, no reason to actually remove it... 
Score score() const; string asString() const; friend ostream& operator<<(ostream &os, const AnchorSet &a); }; class UsedMap { public: UsedMap(); bool sanityCheck(); string asString(); int count() const; void insert(IntervalSet& a); word* makeDfCount(); //for calculating tfidf scores, may return 0 if out of mem ostream& saveDetails(ostream &os,ostream &bitos); //make second cerr to skip ostream& writeOut(ostream &os); ostream& writeTfidf(ostream &os); ostream& writeScores(ostream &os); friend AnchorSet::AnchorSet(const IntervalSet &a,UsedMap& used); #ifdef MURASAKI_MPI friend void mpi_anchorMergeClient(int mergeTarget); friend void mpi_anchorMergeServer(const vector &senders); #endif protected: vector used; bool alreadyExists(const IntervalSet &a); bool merge(IntervalSet& a); void add(const IntervalSet& a); void remove(AnchorSet *a); //deletes a! bool fetchOverlaps(int seq,const UsedInt &a,set &out); bool fetchNearbyOverlaps(int seq,const UsedInt &a,SeqPos maxgrow,set &out); }; class RepeatMap { public: void add(const LocList &l); void writeOut(ostream &os); inline word size(){return clusters.size();} RepeatMap(int size); RepeatMap(); #ifdef MURASAKI_MPI friend void mpi_anchorMergeClient(int mergeTarget); #endif protected: list clusters; }; class BaseIterator { protected: bool rev; BitSequence *src; SeqPos idx; SeqPosPairArray::iterator here,prev,next; void shiftFwd(); void shiftBack(); public: BaseIterator(Sequence *seq,SeqPos pos); BaseIterator(const BaseIterator&); BaseIterator& operator++(); //pre-inc BaseIterator& operator--(); //pre-dec BaseIterator operator++(int); //post-inc BaseIterator operator--(int); //post-dec bool operator==(const BaseIterator &a) const; bool operator!=(const BaseIterator &a) const; BASE_TYPE operator*() const; //spit out current base inline int getIdx() const {return idx;} inline SeqPosPair region() const {return *here;} string debugInfo(); }; inline bool isZero(const MapIte& i){return i.key()==UsedInt(0,0);} inline bool isZero(const 
UsedInt& i){return i==UsedInt(0,0);} UsedInt anchorCoords(const UsedInt &b,Sequence *s); UsedInt operator-(SeqPos a,const UsedInt& b); string SeqPosPair2string(const SeqPosPair& a); #endif murasaki/src/shmvector.hh0000644000177700001440000002407411434752235015111 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . */ #ifndef SHMVECTOR__H #define SHMVECTOR__H #include #include #include #include #include //for memcpy #include "globaltypes.h" #include "murasaki_mpi.h" #include "dinkymath.h" #include #include #include #include #include #include #if defined(OPT_SHMVERBOSE) || !defined(NDEBUG) #define opt_ShmVerbose (1) #else #define opt_ShmVerbose (0) #endif template class ShmVector { protected: T* _data; size_t _size; size_t _capacity; size_t _byteCapacity; key_t sysv_key; int sysv_shmid; bool useShm; public: bool hasShm(){return useShm;} typedef T val_type; class ShmVectorIterator { protected: T* _p; public: typedef ShmVectorIterator _Self; //required to be an "iterator" typedef T value_type; typedef int distance_type; typedef int difference_type; typedef T* pointer; typedef T& reference; typedef random_access_iterator_tag iterator_category; //sanity void put(const reference a,int idx) const { *_p; } void put(const val_type &v) const { *_p=v; } inline reference get() const { //identical to operator*, just there for symmetry with put return *_p; } 
pointer operator->() const { return &*_p; } //qualify as Iterator inline reference operator*() const { return *_p; } _Self& operator++(){ ++_p; return *this; } _Self operator++(int){ _Self a(*this); ++_p; return a; } _Self& operator--(){ --_p; return *this; } _Self operator--(int){ _Self a(*this); --a._p; return a; } bool operator!=(const _Self& p) const { return _p!=p._p; } bool operator==(const _Self& p) const { return _p==p._p; } //qualify as RandomAccessIterator bool operator<(const ShmVectorIterator &a){ return _p_data+idx) {} }; typedef ShmVectorIterator iterator; void shmInit(){ using namespace std; #if defined(MURASAKI_MPI) && defined(USE_SHM_SYSV) if(useShm){ //mark old for deletion if it exists if(sysv_shmid!=-1 && shmctl(sysv_shmid,IPC_RMID,NULL)<0) ; //that's fine. // throw MurasakiException("Couldn't mark System V IPC shared memory region for deletion for sequence "+string(strerror(errno))); if(mpi_myLocalRank==0){ if((sysv_shmid=shmget(IPC_PRIVATE,_byteCapacity,IPC_CREAT | 00600))==-1) throw MurasakiException("ShmVector: Error creating System V IPC shared memory segment (size: "+humanMemory(_byteCapacity)+"): "+string(strerror(errno))); // MPI_Bcast(&sysv_shmid,sizeof(key_t),MPI_BYTE,0,mpi_localhost_comm); assert(mpi_isHostLeader); if((_data=(T*)shmat(sysv_shmid,NULL,mpi_isHostLeader ? O_RDWR:(O_RDONLY | SHM_RDONLY)))==(void*)-1) throw MurasakiException("ShmVector: Error attaching to System V IPC shared memory segment: "+string(strerror(errno))); if(opt_ShmVerbose) cout << "ShmVector: Mapped a "<_capacity) _size=_capacity; //must always be true //copy old data into new space memcpy(_data,old_data,(oldByteCapacity<_byteCapacity ? 
oldByteCapacity:_byteCapacity)); if(useShm){ if(old_data && shmdt(old_data)<0) throw MurasakiException("Couldn't detach (old) ShmVector System V shared memory region: "+string(strerror(errno))); }else{ delete[] old_data; } } public: ShmVector() : _data(NULL), _size(0), _capacity(16), _byteCapacity(16*sizeof(T)), sysv_key(0), sysv_shmid(-1), #ifdef MURASAKI_MPI useShm(mpi_capable && mpi_usingShm) #else useShm(false) #endif { shmInit(); } ShmVector(size_t size) : _data(NULL), _size(size), _capacity(_size), _byteCapacity(_capacity*sizeof(T)), sysv_key(0), sysv_shmid(-1), #ifdef MURASAKI_MPI useShm(mpi_capable && mpi_usingShm) #else useShm(false) #endif { shmInit(); } ~ShmVector(){ shmClean(); } size_t capacity(){ return _capacity; } void reserve(size_t c){ if(_size>=c) //can't reserve space smaller than that which we've already used return; shmRealloc(c); } void resize(size_t c){ shmRealloc(c); _size=c; } void push_back(const T& a){ if(_size>=_capacity) reserve((_capacity==0 ? 1:_capacity)*2); _data[_size++]=a; } iterator begin(){ return iterator(this,0); } iterator end(){ return iterator(this,_size); } val_type& operator[](int n){ return _data[n]; } size_t size(){ return _size; } bool empty(){ return !_size; } val_type& front(){ assert(_size); return _data[0]; } #ifdef MURASAKI_MPI void sync(MPI_Comm shareComm,const int leader,const bool asLeader){ std::string vtag; if(useShm){ vtag=string(" ShmVector-Sync (as SHM-")+string(asLeader ? 
"leader":"slave")+string("): "); if(mpi_myLocalRank!=leader){ if(_data){ cout << vtag << " dettaching old region: "<<(void*)(_data)<<" -> "< #include using namespace std; namespace fs = boost::filesystem; string program_help(); string program_version(); //function decl double compareAlignment(Alignment &ref,Alignment &test); void printResults(vector > &results); //globals ProgressTicker ticker(100); Alignment *unmatched=0; pair keep(-1,-1); string keepfile; ulong searchRadius=0,mergeDistance=0; vector skipJoin; int main(int argc,char** argv){ using boost::lexical_cast; using boost::bad_lexical_cast; int optc; while(1){ //options struct: // name, has_arg, store_pointer, return_value static struct option long_options[] = { {"help",0,0,'?'}, {"version",0,0,'v'}, {"keep",1,0,'k'}, {"keepfile",1,0,'K'}, {"radius",1,0,'r'}, {"join",1,0,'j'}, {"skipjoin",1,0,'J'}, {0,0,0,0} }; int longindex=0; optc=getopt_long(argc,argv,"?vr:K:k:j:J:",long_options,&longindex); if(optc==-1)break; switch(optc){ case 'v': cout << program_version();exit(-1);break; case '?': cout << program_help();exit(-1);break; case 'k': if(!optarg){ cerr << "Missing argument for --keep"<(m[1]); keep.second=lexical_cast(m[2]); }catch(bad_lexical_cast& e){ cerr << "Bad argument to --keep (need numbers)"<(optarg); }catch(bad_lexical_cast& e){ cerr << "Bad argument to --search (need numbers)"<(optarg); }catch(bad_lexical_cast& e){ cerr << "Bad argument to --join (need numbers)"<(optarg); if(skipJoin.size()<=seq) skipJoin.resize(seq+1,false); skipJoin[seq]=true; }catch(bad_lexical_cast& e){ cerr << "Bad argument to --skipjoin (need numbers)"< files; for(int i=optind;i alignments(alignmentCount); unmatched=new Alignment; for(uint i=0;i > results(alignmentCount,vector(alignmentCount,0)); for(uint s=0;s > &results){ uint size=results.size(); cout << "For amount row i, column j:\n percent of anchors from i found to overlap atleast 1 anchor in j"<::iterator i=ref.anchors.begin();i!=ref.anchors.end();++i){ 
if(searchRadius ? test.contains(*i,searchRadius):test.contains(*i)) inBoth++; else { inRefOnly++; if(ref.id==keep.first && test.id==keep.second) unmatched->add(*i); } } return ((double)inBoth)/((double)total); } string program_help(){ return string("\ Usage: align-compare [options] alignment1 alignment2\n\ \n\ Options\n\ *Takes an argument (like --searchradius 300 or -r300)\n\ --searchradius|r = consider two anchors overlapping if they're within N bp\n\ of each other.\n\ --join|j = joins anchors within N bp of each other\n\ --skipjoins|J = don't perform joins on given sequence (0-indexed,\n\ can be appiled multiple times)\n\ --keep|k = record differences between given pair (eg 3,4)\n\ --keepfile|K = output differences to given file\n\ \n\ *Toggles: (just --merge or -b)\n\ --help|h = this message\n\ --version|v = version string\n\ "); } string program_version(){ return string("align-compare v0.1"); } murasaki/src/mingw32compat.h0000644000177700001440000000337511434752234015421 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ /* Murasaki - multiple genome global alignment program Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifdef __MINGW32__ #ifndef __MINGW32COMPAT_H__ #define __MINGW32COMPAT_H__ typedef unsigned long u_long; typedef unsigned int uint; void srandom(unsigned); long random(); int getpagesize(); void bzero(void*, size_t); #endif #endif murasaki/src/Makefile0000644000177700001440000001623611434752233014215 0ustar krispusers## Murasaki project Makefile # by Kris Popendorf ########################### # End-user configurable parts: # There are a few options here that the end user might want to adjust # if the defaults cause problems. ## ## if you have MPI but don't want to use it, uncomment this line: #USE_MPI=NO ## if have the Crypto++ library but you don't want to use it (or if the autodetect is failing) comment out this line. Or make it YES if you want to force its use despite autodetect failing. 
#WITH_LIBCRYPTOPP = NO #If your libraries (boost/cryptopp) are in another directory, add them here: INCLUDE = -I./include -I/usr/local/include -I/opt/local/include LIBPATH = -L/usr/local/lib -L/usr/lib -L/opt/local/lib LDLIBS = -lboost_regex-mt -lboost_filesystem-mt -lboost_iostreams-mt -lboost_system ##some systems might use this boost naming scheme, but they're a minority at this point #LDLIBS = -lboost_regex -lboost_filesystem -lboost_iostreams CPPFLAGS += $(OPTIM) $(INCLUDE) -Wall $(MARCH) LDFLAGS += $(LIBPATH) ## if you're really strapped for memory, commenting this out may save you a little memory at the cost of losing the ability to handle >2Gbp sequences LARGESEQ_SUPPORT ?= YES ####### # End users can/should stop modifying here ##### ## if you're desperately trying to compile for windows, you'll want to uncomment this line and adjust the section below #WITH_MINGW ?= YES MINGW_HOME ?= /usr/i586-mingw32msvc ifeq "$(WITH_MINGW)" "YES" CPPFLAGS += -D_WIN32_WINNT=0x500 -DWINVER=0x500 # Windows2000 and later INCLUDE += -I$(MINGW_HOME)/include LIBPATH += -L$(MINGW_HOME)/lib -L$(MINGW_HOME)/GTK/lib # GTK/lib for zlib LDLIBS += -lboost_system -lz -lbz2 -lm endif #autodetect MPI/cryptopp availability USE_MPI ?= $(shell perl makeconf.pl HASMPI) WITH_LIBCRYPTOPP ?= $(shell perl makeconf.pl HASCRYPTOPP $(CXX) $(CPPFLAGS) $(LDFLAGS)) ## debug related options. end users shouldn't change #GPROF ?= YES #WITH_SHM_MMAP ?= YES WITH_SHM_SYSV ?= YES OPTIM += -O3 OPTIM += -DNDEBUG OPTIM += -g #sane defaults. when using MPI, the make default doesn't cut it. ifeq "$(USE_MPI)" "YES" DEFAULT_CXX = mpicxx else DEFAULT_CXX := $(CXX) endif ifeq "${origin CXX}" "default" CXX = $(DEFAULT_CXX) else CXX ?= $(DEFAULT_CXX) endif #openhash specific options: #CPPFLAGS += -DHASHPROFILE MACOS_ARCH ?= ppc # ppc64 BINDIR = .. 
DEPEND= gccmakedep TMP = /tmp #------------- Architecture stuff --------- UNAME ?= ${shell uname -m} ifeq "$(UNAME)" "i686" #MARCH ?= -march=k8 endif ifeq "$(UNAME)" "x86_64" #MARCH ?= -march=opteron endif ifeq "$(UNAME)" "Darwin" MARCH ?= -arch $(MACOS_ARCH) endif ifeq "$(UNAME)" "Power Macintosh" MARCH ?= -arch $(MACOS_ARCH) endif #------------- Options -------------------- ifeq "$(GPROF)" "YES" LDFLAGS += -pg CPPFLAGS += -pg CPPFLAGS += -pg endif ifeq "$(USE_MPI)" "YES" CPPFLAGS += -DMURASAKI_MPI #CPPFLAGS += -I/usr/lib/mpich/include -I/usr/local/mpich/include #OPT_LIBS += -L/usr/lib/mpich/lib -L/usr/local/mpich/lib -lmpich #note: MPI requierd to use MMAP/SYSV (thus contained in MPI if) ifeq "$(WITH_SHM_MMAP)" "YES" CPPFLAGS += -DUSE_SHM_MMAP endif ifeq "$(WITH_SHM_SYSV)" "YES" CPPFLAGS += -DUSE_SHM_SYSV endif endif ifeq "$(LARGESEQ_SUPPORT)" "YES" CPPFLAGS += -DLARGESEQ_SUPPORT endif ifeq "$(WITH_LIBCRYPTOPP)" "YES" LDLIBS += -lcryptopp -pthread #the way we use libcrypto actually breaks strict-aliasing rules (we convert bytes to words directly), so, just turn that off CPPFLAGS += -DUSE_LIBCRYPTOPP -fno-strict-aliasing endif #------------- SVN Version info ----------- SVNREVISION=$(shell perl svninfo.pl) ifneq "$(SVNREVISION)" "" CPPFLAGS += -DSVNREVISION='$(SVNREVISION)' endif #------------- HG Version info ----------- HGREVISION=$(shell perl hginfo.pl) ifneq "$(HGREVISION)" "" CPPFLAGS += -DHGREVISION='$(HGREVISION)' endif #------------- Project files -------------- PROGRAM = $(BINDIR)/murasaki SRCS = align-and.cc align-best.cc align-compare.cc align-coverage.cc align-mask.cc alignments.cc align-or.cc align-tc.cc arrayhash.cc cgr.cc cgr-compare.cc cgr-image.cc cgr-random.cc cgr-sample.cc dinkymath.cc ecohash.cc ecolist.cc geneparse.cc genopts.cc getsegments.cc msethash.cc murasaki.cc murasaki_mpi.cc openhash.cc options.cc scoring.cc seqread.cc sequence.cc timing.cc binseq.cc OBJ = murasaki.o timing.o dinkymath.o sequence.o msethash.o options.o ecohash.o 
ecolist.o murasaki_mpi.o arrayhash.o seqread.o scoring.o openhash.o genopts.o binseq.o ifeq "$(WITH_MINGW)" "YES" SRCS += mingw32compat.cc OBJ += mingw32compat.o endif PROG2 = $(BINDIR)/cgr-image PROG3 = $(BINDIR)/cgr-compare PROG4 = $(BINDIR)/cgr-random PROG5 = $(BINDIR)/cgr-sample PROG6 = $(BINDIR)/testeco PROG7 = $(BINDIR)/align-compare PROG8 = $(BINDIR)/align-or PROG9 = $(BINDIR)/geneparse PROG10 = $(BINDIR)/align-and PROG11 = $(BINDIR)/align-best PROG12 = $(BINDIR)/align-coverage PROG13 = $(BINDIR)/getsegments PROG16 = $(BINDIR)/mbfa OBJ2 = cgr-image.o cgr.o dinkymath.o timing.o seqread.o genopts.o OBJ3 = cgr-compare.o cgr.o dinkymath.o timing.o OBJ4 = cgr-random.o cgr.o dinkymath.o timing.o OBJ5 = cgr.o cgr-sample.o seqread.o genopts.o OBJ6 = testeco.o ecolist.o dinkymath.o timing.o OBJ7 = align-compare.o dinkymath.o alignments.o timing.o OBJ8 = align-or.o alignments.o dinkymath.o timing.o OBJ9 = geneparse.o seqread.o genopts.o OBJ10 = align-and.o alignments.o dinkymath.o timing.o OBJ11 = align-best.o alignments.o dinkymath.o timing.o OBJ12 = align-coverage.o alignments.o dinkymath.o timing.o seqread.o genopts.o OBJ13 = getsegments.o seqread.o genopts.o OBJ16 = mbfa.o dinkymath.o timing.o seqread.o binseq.o genopts.o ALLPROGS = $(PROGRAM) $(PROG2) $(PROG3) $(PROG4) $(PROG5) $(PROG6) $(PROG7) $(PROG8) $(PROG9) $(PROG10) $(PROG11) $(PROG12) $(PROG13) $(PROG16) #------------- Make rules ----------------- .cc.o: $(CXX) $(CPPFLAGS) -c $< .cpp.o: $(CXX) $(CPPFLAGS) -c $< #------------- Commands ------------------- all: $(ALLPROGS) Makefile clean: -rm -rf *.o *~ .*~ core $(ALLPROGS) depend: $(SRCS) Makefile $(DEPEND) -- $(CPPFLAGS) -- $(SRCS) 2> $(TMP)/depend-errors $(PROGRAM): $(OBJ) $(CXX) $(CPPFLAGS) -o $(PROGRAM) $(OBJ) $(OPT_LIBS) $(LIBPATH) $(LDLIBS) $(PROG2): $(OBJ2) $(CXX) $(CPPFLAGS) -o $(PROG2) $(OBJ2) $(LIBPATH) $(LIBCGR) $(LDLIBS) $(PROG3): $(OBJ3) $(CXX) $(CPPFLAGS) -o $(PROG3) $(OBJ3) $(LIBPATH) $(LIBCGR) $(LDLIBS) $(PROG4): $(OBJ4) $(CXX) $(CPPFLAGS) 
-o $(PROG4) $(OBJ4) $(LIBPATH) $(LIBCGR) $(LDLIBS) $(PROG5): $(OBJ5) $(CXX) $(CPPFLAGS) -o $(PROG5) $(OBJ5) $(LIBPATH) $(LIBCGR) $(LDLIBS) $(PROG6): $(OBJ6) $(CXX) $(CPPFLAGS) -o $(PROG6) $(OBJ6) $(LIBPATH) $(LDLIBS) $(PROG7): $(OBJ7) $(CXX) $(CPPFLAGS) -o $(PROG7) $(OBJ7) $(LIBPATH) $(LDLIBS) $(PROG8): $(OBJ8) $(CXX) $(CPPFLAGS) -o $(PROG8) $(OBJ8) $(LIBPATH) $(LDLIBS) $(PROG9): $(OBJ9) $(CXX) $(CPPFLAGS) -o $(PROG9) $(OBJ9) $(LIBPATH) $(LDLIBS) $(PROG10): $(OBJ10) $(CXX) $(CPPFLAGS) -o $(PROG10) $(OBJ10) $(LIBPATH) $(LDLIBS) $(PROG11): $(OBJ11) $(CXX) $(CPPFLAGS) -o $(PROG11) $(OBJ11) $(LIBPATH) $(LDLIBS) $(PROG12): $(OBJ12) $(CXX) $(CPPFLAGS) -o $(PROG12) $(OBJ12) $(LIBPATH) $(LDLIBS) $(PROG13): $(OBJ13) $(CXX) $(CPPFLAGS) -o $(PROG13) $(OBJ13) $(LIBPATH) $(LDLIBS) $(PROG16): $(OBJ16) $(CXX) $(CPPFLAGS) -o $(PROG16) $(OBJ16) $(LIBPATH) $(LDLIBS) codesize: wc $(SRCS) ##magical makedepend dependencies murasaki/src/binseq.cc0000644000177700001440000002771711434752235014355 0ustar krispusers/* Copyright (C) 2006-2008 Keio University (Kris Popendorf) (2006) This file is part of Murasaki. Murasaki is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Murasaki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Murasaki. If not, see . 
*/ #include "binseq.hh" #include #include #include #include //memcpy #include #include #include #include #include #include using namespace std; SequenceBinaryMetadata::SequenceBinaryMetadata() : formatVersion(1), suffix(string(".mbfa")+dstring((int)sizeof(word))+dstring((int)sizeof(SeqPos))) {} bool SequenceBinary::save(){ return save(binaryFilename); } bool SequenceBinary::save(string filename){ //sadly ostream::write sucks for even moderately large values of N on 64bit debian 5.0, //so I'm just going to say: suck it ostream, I'm using C ofstream ofh(filename.c_str(),ios::out | ios::binary); try { ofh.exceptions(ofstream::failbit | ofstream::badbit); char wordsize=sizeof(word),seqposSize=sizeof(SeqPos); ofh.write(BINSEQ_MAGICNUMBER,sizeof(BINSEQ_MAGICNUMBER)-1); ofh.write(CHARCAST(formatMetaData.formatVersion),1); ofh.write(CHARCAST(wordsize),1); ofh.write(CHARCAST(seqposSize),1); ofh.write(CHARCAST(_length),sizeof(_length)); ofh.write(CHARCAST(counters),sizeof(counters)); ofh.write(CHARCAST(regionSizes),sizeof(regionSizes)); assert(sizeof(streamsize)>=sizeof(size_t)); //otherwise this next line is in trouble ofh.write((char*)_words,sizeof(word)*word_count); //the big one... hopefully sizeof(streamsize)>=sizeof(size_t) for(size_t ri=0;ri::iterator si=subSeqNames.begin();si!=subSeqNames.end();++si){ word len=si->length()+1; //include null terminator ofh.write(CHARCAST(len),sizeof(len)); ofh.write(si->c_str(),len); } ofh.close(); }catch(ofstream::failure e){ warn("SequenceBinary::save(%s) failure",filename.c_str()); ofh.close(); unlink(filename.c_str()); return false; } return true; } string SequenceBinary::deriveLocalBinaryFilename(string filename){ filename+=formatMetaData.suffix; //eg. on a 64bit machine with LONGSEQ on these are .mbfa88 files return filename; } string SequenceBinary::deriveBinaryFilename(string filename){ SequenceBinaryMetadata def; filename+=def.suffix; //eg. 
on a 64bit machine with LONGSEQ on these are .mbfa88 files return filename; } bool SequenceBinary::exists(string filename){ try { SequenceBinary test(filename,false); return true; }catch(SeqReadException e){ return false; } } SequenceBinary::SequenceBinary(string filename,bool completeLoad) : formatMetaData(), baseFilename(filename), binaryFilename(deriveLocalBinaryFilename()) { init(); if((fd=open(binaryFilename.c_str(),O_RDONLY))<0){ throw SeqReadException("Couldn't open MBFA file for reading: "+binaryFilename+" : "+strerror(errno)); } struct stat fdstat; fstat(fd,&fdstat); struct stat srcStat; if(!stat(baseFilename.c_str(),&srcStat) && srcStat.st_mtime>fdstat.st_mtime) throw SeqReadException("MBFA is older than source file. Cowardly refusing to use outdated MBFA."); fdmem_len=fdstat.st_size; if((fdmem=mmap(NULL,fdmem_len,PROT_READ,MAP_PRIVATE,fd,0))==MAP_FAILED){ throw SeqReadException("Failed mapping listener block: "+string(strerror(errno))); } if(strncmp((const char*)fdmem,BINSEQ_MAGICNUMBER,sizeof(BINSEQ_MAGICNUMBER)-1)) throw SeqReadException("Not a Murasaki Binary FASTA file: "+filename); char fileversion; char wordsize=sizeof(word),seqposSize=sizeof(SeqPos); DeSerial parser(fdmem,3); parser.get(fileversion); if(fileversion!=formatMetaData.formatVersion) throw SeqReadException("Wrong MBFA format version. Expected "+dstring(formatMetaData.formatVersion)+", found "+dstring(fileversion)); parser.get(wordsize); if(wordsize!=sizeof(word)) throw SeqReadException("Wrong MBFA system architecture. Expected "+dstring((long)sizeof(word))+", found "+dstring((long)wordsize)); parser.get(seqposSize); if(seqposSize!=sizeof(SeqPos)) throw SeqReadException("Wrong MBFA murasaki architecture. 
Expected "+dstring((long)sizeof(SeqPos))+", found "+dstring((long)seqposSize)); parser.get(_length); parser.get(counters,sizeof(counters)/sizeof(counters[0])); parser.get(regionSizes,sizeof(regionSizes)/sizeof(regionSizes[0])); bit_count=_length*2; word_count=((bit_count+WORDSIZE-1)/(WORDSIZE)); //rounds up to nearest full word parser.map(_words,word_count); for(size_t ri=0;ri unmaskedRegion(0,0),readableRegion(0,0),subSeqBounds(0,0); long prevSubSeqId=reader.getSubSeqId(); string prevSubSeqName=reader.getSubSeqName(); for(SeqPos i=0;i<(SeqPos)_length;i++){ bit-=2; char c=reader.getc(); unmasked=true; switch(c){ case 'a': unmasked=false;counters[0+4]++; case 'A': counters[0]++;readable=true;break; case 'c': unmasked=false;counters[1+4]++; case 'C': w|=((word)1<=subSeqBounds.first); subSeqs.push_back(subSeqBounds); subSeqNames.push_back(prevSubSeqName); subSeqNamesP.push_back(subSeqNames.back().c_str()); subSeqBounds.first=i+10; //this looks a bit hinky, but between every subseq there _should_ be exactly 10 Ns, so this accounts for that assert(c=='N'); //see? we're looking at an N! prevSubSeqId=reader.getSubSeqId(); prevSubSeqName=reader.getSubSeqName(); } if(bit==0){ assert(wordsStored=subSeqBounds.first); subSeqs.push_back(subSeqBounds); subSeqNames.push_back(prevSubSeqName); subSeqNamesP.push_back(subSeqNames.back().c_str()); regions[0]=&(subSeqs.front()); regions[1]=&(readableRegions.front()); regions[2]=&(unmaskedRegions.front()); regionSizes[0]=subSeqs.size(); regionSizes[1]=readableRegions.size(); regionSizes[2]=unmaskedRegions.size(); } word* SequenceBinary::allocWords(){ if(!word_count) return NULL; // cerr << "Allocing "<>(WORDSIZE-2-offset) & 3); return r; } char SequenceBinary::bitToBase(word w){ switch(w & 3){ case 0: return 'A'; case 1: return 'C'; case 2: return 'G'; case 3: return 'T'; } assert("Oh noes. n & 3 > 3???"); throw SeqBinException("n & 3 > 3. 
Check your CPU's math."); } char SequenceBinary::rawBase(size_t n){ return bitToBase(rawBaseBits(n)); } ostream& SequenceBinary::asFasta(ostream &os,int nmaskR){ Region *ri[3],*rEnd[3]; vector::const_iterator ni=subSeqNamesP.begin(); for(int i=0;i<3;i++){ ri[i]=regions[i]; rEnd[i]=regions[i]+regionSizes[i]; } const word lineWrap=75; word wrapCount=0; for(;ri[0]!=rEnd[0];++ri[0],++ni){ if(ri[0]!=regions[0]) //not the very first one? os << endl; //then we need a linebreak assert(ni!=subSeqNamesP.end()); os << ">"<<*ni<first;n<=ri[0]->second;n++){ //advance regions if necessary for(int i=1;i<3;i++) while(ri[i]!=rEnd[i] && n>ri[i]->second) ri[i]++; char c='?'; assert(ri[0]!=rEnd[0] && n>=ri[0]->first && n<=ri[0]->second); assert(ri[nmaskR]==rEnd[nmaskR] || n<=ri[nmaskR]->second); if(ri[nmaskR]==rEnd[nmaskR] || nfirst) c='N'; else if(nmaskR<2){ //show repeatmask bases, c=rawBase(n); assert(ri[2]==rEnd[2] || n<=ri[2]->second); if(ri[2]==rEnd[2] || nfirst) c-='A'-'a'; } if(wrapCount>lineWrap){ os << endl; wrapCount=0; } os << c; wrapCount++; } } os << endl; return os; } murasaki/simpal.pl0000755000177700001440000001435211434752242013610 0ustar krispusers#!/usr/bin/perl #Copyright (C) 2006-2008 Keio University #(Kris Popendorf) (2006) # #This file is part of Murasaki. # #Murasaki is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #Murasaki is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with Murasaki. If not, see . 
############### ## simpal proof of concept perl implementation ############### use strict; use Getopt::Long; use File::Basename; use Pod::Usage; use Data::Dumper; BEGIN { unshift(@INC,(fileparse($0))[1].'perlmodules'); } use Murasaki; our $seedLength=3; our $maxDistance=300; our $tolerance=1; our $maxLoop=300; my ($help,$man); GetOptions('help|?' => \$help, man => \$man, 'tolerance=i' => \$tolerance, 'maxLoop=i' =>\$maxLoop, 'seedLength=i' => \$seedLength, 'maxDistance=i' => \$maxLoop); pod2usage(1) if $help or $#ARGV<1; pod2usage(-exitstatus => 0, -verbose => 2) if $man; our @seqs; our $srcinit; #find initial palidromes foreach my $seq (@ARGV){ my $start=time; die "File not found $seq" unless -e $seq; my ($fwd,$rev)=(lc `$root/geneparse -c -q $seq`, lc `$root/antisense.pl -c $seq`); my %hash; fillHash($fwd,\%hash); fillHash($rev,\%hash,-1); my @pals=findPals(%hash); my %distmap=palsToDistmap(@pals); push(@seqs,{ pals => \@pals, distmap => \%distmap, file => $seq }); $srcinit=0; print "Total palindromes: ".@pals."\n"; print "Time to hash $seq: ".(time-$start)."\n"; } my @srcvec=countvec($seqs[0]->{distmap}); my $src=$seqs[0]; foreach my $seq (@seqs[1..$#seqs]){ my (@veca,@vecb); my ($namea,$nameb)=($src->{file},$seq->{file}); my $similarity; my $start=time; print "Comparing to $namea to $nameb\n"; if(0){ print "Filtering matches to meet tolerance criteria (misses <= ".($tolerance<0 ? 
"any":$tolerance).")..\n"; filterMatches($src,$seq,\@veca,\@vecb); print "Time spent filtering: ".(time-$start)."\n"; $start=time; print "Vectorizing $namea...\n"; @veca=countvec(@veca); print "Vectorizing $nameb...\n"; @vecb=countvec(@vecb); print "Normalizing...\n"; # @veca=normalize(@veca); # @vecb=normalize(@vecb); print "Dotting ...\n"; $similarity=dot(\@veca,\@vecb); }else{ $similarity=onepassCompute($src,$seq); print "$nameb matched $seq->{matched} times.\n"; } print "Time spent: ".(time-$start)."\n"; print "Similarity of $namea to $nameb: $similarity\n"; } sub shortf(){ my $a=shift; return sprintf("%.2f",$a); } sub onepassCompute(){ my $total=0; my ($src,$target)=@_; foreach my $apal (@{$src->{pals}}){ foreach my $bpal (@{$target->{pals}}){ my $dist=abs($apal->{dist}-$bpal->{dist}); next if $dist>$maxDistance; # print "Compare: ".join(" to ",prettyPal($apal),prettyPal($bpal))."\n"; if($tolerance<0 or palMismatches($apal,$bpal)<=$tolerance){ # if($tolerance<0 or mismatches($apal->{key},$bpal->{key})<=$tolerance){ my $term=1.0/(exp($dist/5)); # print "Term: $term\n"; $total+=$term; $target->{matched}++; } } } return $total; } sub filterMatches(){ my (%a,%b); my ($src,$target,$aout,$bout)=@_; foreach my $dist (keys(%{$src->{distmap}})){ foreach my $apal (@{$src->{distmap}->{$dist}}){ foreach my $bpal (@{$target->{distmap}->{$dist}}){ if($tolerance<0 or mismatches($apal->{key},$bpal->{key})<=$tolerance){ $a{$apal}=$apal; $b{$bpal}=$bpal; } } } } push(@$aout,values(%a)); push(@$bout,values(%b)); } sub dot { #compute dot product my ($a,$b)=@_; my $sum=0; for(0..($#$a<$#$b ? 
$#$a:$#$b)){ $sum+=$$a[$_]*$$b[$_]; } return $sum; } sub magnitude { my $sum; foreach(@_){ $sum+=$_*$_; } return sqrt($sum); } sub vecDiv { my $div=shift; return map {$_/$div} @_; } sub normalize { return vecDiv(magnitude(@_),@_); } sub countvec { my @ret; foreach(@_){ $ret[$_->{dist}]++; } return @ret; } sub max { my $best=pop; foreach(@_){ $best=$_ if $best<$_; } return $best; } sub min { my $best=pop; foreach(@_){ $best=$_ if $best>$_; } return $best; } sub palsToDistmap { my %distmap; foreach(@_){ push(@{$distmap{$_->{dist}}},$_); } return %distmap; } sub palMismatches{ my ($a,$b)=@_; return min(mismatches($a->{key},$b->{key}), mismatches(revcomp($a->{key}),revcomp($b->{key}))); } sub mismatches{ my @a=split(//,pop); my @b=split(//,pop); my $count=@a; $count=@b if @b>@a; for my $i (0..($a<$b ? $#a:$#b)){ $count-- if $a[$i] eq $b[$i]; } return $count; } sub findPals { my %in=@_; my @out; foreach my $key (keys %in){ my @hits=@{$in{$key}}; foreach my $rev (grep {$_<0} @hits){ foreach my $fwd (grep {$_>0} @hits){ my $pal={left=>$rev, right => $fwd, dist => $fwd+$rev, key => $key}; push(@out,$pal) if $pal->{dist}<=$maxLoop and $maxLoop>=0; } } } return @out; } sub prettyPal { my $pal=shift; return $pal->{left}." ".revcomp($pal->{key})." -{".$pal->{dist}."}- ".$pal->{key}." ".$pal->{right}; } sub revcomp { ($_)=@_; y/agtcAGTC/tcagTCAG/; return reverse($_); } sub fillHash { my $seq=shift; my $hashref=shift; my %hash=%{$hashref}; my $mult=(shift() ? -1:1); for(0..(length($seq)-$seedLength-1)){ push(@{$hash{substr($seq,$_,$seedLength)}},($_+1)*$mult); } %{$hashref}=%hash; } __END__ =head1 NAME simpal -- proof of concept =head1 SYNOPSIS simpal.pl [options] [target2 target3 ...] =head1 OPTIONS Set options by something like --tolerance=5 or -t 5 or whatever. 
maxLoop -- specifies max separation for the ends of the palindromes maxDistance -- specifies the maximum difference in palindrome length when comparing palindromes tolerance -- max permissable misses in palindromes between sequences Setting either to -1 means no limit seedlength -- length of seeds to use for finding palindromes =cut