libboulder-perl-1.30.orig/0040755000175000017500000000000007777761464014200 5ustar jojojojolibboulder-perl-1.30.orig/Boulder/0040755000175000017500000000000007777761463015573 5ustar jojojojolibboulder-perl-1.30.orig/Boulder/Blast/0040755000175000017500000000000007777761463016640 5ustar jojojojolibboulder-perl-1.30.orig/Boulder/Blast/NCBI.pm0100644000175000017500000001532507427570357017704 0ustar jojojojopackage Boulder::Blast::NCBI; # NCBI BLAST file format parsing =head1 NAME Boulder::Blast::NCBI - Parse and read NCBI BLAST files =head1 SYNOPSIS Not for direct use. Use Boulder::Blast instead. =head1 DESCRIPTION Specialized parser for NCBI format BLAST output. Loaded automatically by Boulder::Blast. =head1 SEE ALSO L, L, L =head1 AUTHOR Lincoln Stein . Copyright (c) 1998 Cold Spring Harbor Laboratory This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty. =cut use strict; use File::Basename; use Stone; use Boulder::Stream; use Carp; use vars qw($VERSION @ISA); @ISA = 'Boulder::Blast'; $VERSION = 1.02; sub _read_record { my $self = shift; my $fh = shift; my $stone = shift; # we don't find out about the name of the database or the parameters until we # get to the bottom of the file. Too bad. # loop until we find the query name my $line; do { $line = <$fh>; } until ($line && $line=~/^Query=/); if ($line && $line =~ /^Query=\s+(\S+)/) { $stone->insert(Blast_query => $1); } else { croak "Couldn't find query line!"; } do { $line = <$fh>; } until $line=~/([\d,]+) letters/; croak "Couldn't find query length!" unless $1; (my $len = $1) =~ s/,//g; $stone->insert(Blast_query_length => $len); # Read down to the first hit, if any. If we hit /^Parameters/, then we had no # hits. while (<$fh> ) { last if /^(>|\s+Database)/; } if (/^>/) { # we found some hits my $hits = $self->parse_hits($_); $stone->insert(Blast_hits => $hits); } # At this point, one way or another, we're pointing at the Database # line. # Now we should be pointing at statistics while (<$fh>) { if (/Database: (.*)/) { my $db = $1; $stone->insert(Blast_db => $db); $stone->insert(Blast_db_title => basename($db)); } # in case match title, overwrite previous setting of Blast_db_title $stone->insert(Blast_db_title => $1) if /Title: (.*)/; $stone->insert(Blast_db_date => $1) if /Posted date:\s+(.*)/; last if /Lambda/; } # At this point, one way or another, we're pointing at the Matrix # line. We create a parameter stone to hold the results my $parms = new Stone; chomp ($line = <$fh>); if (my ($lambda,$k,$h) = $line =~ /([\d.-]+)/g) { $parms->insert('Lambda' => {Lambda => $lambda, K => $k, H => $h}); } chomp ($line = <$fh>); chomp ($line = <$fh>); if ($line =~ /^Gapped/) { chomp ($line = <$fh>); if (my ($lambda,$k,$h) = $line =~ /([\d.-]+)/g) { $parms->insert('GappedLambda' => {Lambda => $lambda, K => $k, H => $h}); } } while (<$fh>) { chomp; $parms->insert(GapPenalties => $1) if /^Gap Penalties: (.+)/; $parms->insert(Matrix => $1) if /^Matrix: (.+)/; $parms->insert(HSPLength => $1) if /^effective HSP length: (.+)/; $parms->insert(DBLength => $1) if /^length of database: (.+)/; $parms->insert(T => $1) if /T: (.+)/; $parms->insert(A => $1) if /A: (.+)/; $parms->insert(X1 => $1) if /X1: (\d+)/; $parms->insert(X2 => $1) if /X2: (\d+)/; $parms->insert(S1 => $1) if /S1: (\d+)/; $parms->insert(S2 => $1) if /S2: (\d+)/; last if /^S2/; } $stone->insert(Blast_parms => $parms); # finally done! $stone; } # parse the hits and HSPs sub parse_hits { my $self = shift; $_ = shift; my $fh = $self->_fh; my (@hits,@hsps,$accession,$orientation,$hit,$hsp); my ($qstart,$qend,$tstart,$tend); my ($query,$match,$target,$done,$new_hit,$new_hsp); my $signif = 9999; my $expect = 9999; my $ident = 0; while (!$done) { chomp; next unless length($_); $done = /^\s+Database/; # here's how we get out of the loop $new_hit = /^>(\S+)/; $new_hsp = $accession && /Score\s+=\s+([\d.e+]+)\s+bits\s+\((\S+)\)/; # hit a new HSP section if ( $done || $new_hit || $new_hsp ) { if ($hsp) { croak "base alignment is out of whack" unless length($query) == length($target); $hsp->insert(Query => $query, Subject => $target, Alignment => substr($match,0,length($query)), ); $hsp->insert(Query_start => $qstart, Query_end => $qend, Subject_start => $tstart, Subject_end => $tend, Length => 1 + $qend - $qstart, Orientation => $tend > $tstart ? 'plus' : 'minus', ); push(@hsps,$hsp); undef $hsp; } if ($new_hsp) { $hsp = new Stone; $hsp->insert(Score => $2, Bits => $1); ($qstart,$qend,$tstart,$tend) = (undef,undef,undef,undef); # undef all ($query,$match,$target) = (undef,undef,undef); # these too } } # hit a new subject section if ( $done || $new_hit ) { $accession = $1; if ($hit) { $signif = $expect if $signif == 9999; $hit->insert(Hsps => \@hsps, Signif => $signif, Identity => $ident, Expect => $expect, ) if @hsps; undef @hsps; push(@hits,$hit); ($signif,$expect,$ident) = (9999,9999,0); # reset max values } if ($new_hit) { $hit = new Stone; $hit->insert(Name => $accession); next; } } # hit the length = line if (/Length\s*=\s*([\d,]+)/) { (my $len = $1) =~ s/,//g; $hit->insert(Length => $len); next; } # hit the Plus|Minus Strand line if (/(Plus|Minus) Strand HSPs/) { $orientation = lc $1; next; } # None of the following is relevant unless $hsp is defined next unless $hsp; if (/Expect = ([+e\d\.-]+)/) { $hsp->insert(Expect => $1); $expect = $1 < $expect ? $1 : $expect; } if (/P(?:\(\d+\))? = (\S+)/) { $hsp->insert(Signif => $1); $signif = $1 < $signif ? $1 : $signif; } if (/Identities = \S+ \((\d+)%?\)/) { my $idn = $1; $hsp->insert(Identity => "$idn%"); $ident = $idn > $ident ? $idn : $ident; } $hsp->insert(Positives => $1) if /Positives = \S+ \((\S+)\)/; $hsp->insert(Strand => $1) if /Strand =\s+([^,]+)/; $hsp->insert(Frame => $1) if /Frame =\s+([^,]+)/; # process the query sequence if (/^Query:\s+(\d+)\s*(\S+)\s+(\d+)/) { $qstart ||= $1; $qend = $3; $query .= $2; next; } # process the target sequence if (/^Sbjct:\s+(\d+)\s*(\S+)\s+(\d+)/) { $tstart ||= $1; $tend = $3; $target .= $2; next; } # anything else is going to be the match string # this is REALLY UGLY because we have to extract absolute # positions $match .= substr($_,12,60) if $query; } continue { $_ = <$fh>; } return \@hits; } 1; libboulder-perl-1.30.orig/Boulder/Blast/WU.pm0100644000175000017500000001325407117731362017512 0ustar jojojojopackage Boulder::Blast::WU; # WUBLAST file format parsing =head1 NAME Boulder::Blast::WU - Parse and read WU-BLAST files =head1 SYNOPSIS Not for direct use. Use Boulder::Blast instead. =head1 DESCRIPTION Specialized parser for WUBLAST format BLAST output. Loaded automatically by Boulder::Blast. =head1 SEE ALSO L, L, L =head1 AUTHOR Lincoln Stein . Copyright (c) 1998 Cold Spring Harbor Laboratory This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty. =cut use strict; use Stone; use Boulder::Stream; use Carp; use vars qw($VERSION @ISA); @ISA = 'Boulder::Blast'; $VERSION = 1.00; sub _read_record { my $self = shift; my $fh = shift; my $stone = shift; # we don't find out about the name of the database or the parameters until we # get to the bottom of the file. Too bad. # loop until we find the query name my $line; do { $line = <$fh>; } until $line=~/^Query=\s+(\S+)/; croak "Couldn't find query line!" unless $1; $stone->insert(Blast_query => $1); do { $line = <$fh>; } until $line=~/([\d,]+) letters/; croak "Couldn't find query length!" unless $1; (my $len = $1) =~ s/,//g; $stone->insert(Blast_query_length => $len); # Read down to the first hit, if any. If we hit /^Parameters/, then we had no # hits. while (<$fh> ) { last if /^(>|Parameters)/; } if (/^>/) { # we found some hits my $hits = $self->parse_hits($_); $stone->insert(Blast_hits => $hits); } # At this point, one way or another, we're pointing at the Parameters # line. We create a parameter stone to hold the results my $parms = new Stone; while (<$fh>) { chomp; last if /^Statistics/; $parms->insert(Ctxfactor => $1) if /ctxfactor=(\S+)/; $parms->insert(Gapall=>'yes') if /gapall/; $parms->insert(Hspmax=>$1) if /hspmax=(\S+)/; $parms->insert(Expectation=>$1) if /E=(\S+)/; $parms->insert(Matrix=>$1) if /^\s+\+[0-3]\s+0\s+(\S+)/ && !$parms->get('Matrix'); # borscht } $stone->insert(Blast_parms => $parms); # Now we should be pointing at statistics while (<$fh>) { $stone->insert(Blast_db => $1) if /Database: (.*)/; $stone->insert(Blast_db_title => $1) if /Title: (.*)/; $stone->insert(Blast_db_date => $1) if /Posted date:\s+(.*)/; last if /End:/; } # finally done! $stone; } # parse the hits and HSPs sub parse_hits { my $self = shift; $_ = shift; my $fh = $self->_fh; my (@hits,@hsps,$accession,$orientation,$hit,$hsp); my ($qstart,$qend,$tstart,$tend); my ($query,$match,$target,$done,$new_hit,$new_hsp); my $signif = 9999; my $expect = 9999; my $ident = 0; while (!$done) { chomp; next unless length($_); $done = /^Parameters/; # here's how we get out of the loop $new_hit = /^>(\S+)/; $new_hsp = $accession && /Score = (\d+) \((\S+) bits\)/; # hit a new HSP section if ( $done || $new_hit || $new_hsp ) { if ($hsp) { croak "base alignment is out of whack" unless length($query) == length($target); $hsp->insert(Query => $query, Subject => $target, Alignment => substr($match,0,length($query)), ); $hsp->insert(Query_start => $qstart, Query_end => $qend, Subject_start => $tstart, Subject_end => $tend, Length => 1 + $qend - $qstart, Orientation => $tend > $tstart ? 'plus' : 'minus', ); push(@hsps,$hsp); undef $hsp; } if ($new_hsp) { $hsp = new Stone; $hsp->insert(Score => $1, Bits => $2); ($qstart,$qend,$tstart,$tend) = (undef,undef,undef,undef); # undef all ($query,$match,$target) = (undef,undef,undef); # these too } } # hit a new subject section if ( $done || $new_hit ) { $accession = $1; if ($hit) { $hit->insert(Hsps => \@hsps, Signif => $signif, Identity => $ident, Expect => $expect, ) if @hsps; undef @hsps; push(@hits,$hit); ($signif,$expect,$ident) = (9999,9999,0); # reset max values } if ($new_hit) { $hit = new Stone; $hit->insert(Name => $accession); next; } } # hit the length = line if (/Length\s*=\s*([\d,]+)/) { (my $len = $1) =~ s/,//g; $hit->insert(Length => $len); next; } # hit the Plus|Minus Strand line if (/(Plus|Minus) Strand HSPs/) { $orientation = lc $1; next; } # None of the following is relevant unless $hsp is defined next unless $hsp; if (/Expect = ([+e\d\.-]+)/) { $hsp->insert(Expect => $1); $expect = $1 < $expect ? $1 : $expect; } if (/P(?:\(\d+\))? = (\S+)/) { $hsp->insert(Signif => $1); $signif = $1 < $signif ? $1 : $signif; } if (/Identities = \S+ \((\d+%?)\)/) { my $idn = $1; $hsp->insert(Identity => $idn); $ident = $idn > $ident ? $idn : $ident; } $hsp->insert(Positives => $1) if /Positives = \S+ \((\S+)\)/; $hsp->insert(Strand => $1) if /Strand =\s+([^,]+)/; $hsp->insert(Frame => $1) if /Frame =\s+([^,]+)/; # process the query sequence if (/^Query:\s+(\d+)\s+(\S+)\s+(\d+)/) { $qstart ||= $1; $qend = $3; $query .= $2; next; } # process the target sequence if (/^Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)/) { $tstart ||= $1; $tend = $3; $target .= $2; next; } # anything else is going to be the match string # this is REALLY UGLY because we have to extract absolute # positions $match .= substr($_,13,60) if $query; } continue { $_ = <$fh>; } return \@hits; } 1; libboulder-perl-1.30.orig/Boulder/Blast.pm0100644000175000017500000003203507427570036017160 0ustar jojojojopackage Boulder::Blast; # WUBLAST/NCBI BLAST file format parsing =head1 NAME Boulder::Blast - Parse and read BLAST files =head1 SYNOPSIS use Boulder::Blast; # parse from a single file $blast = Boulder::Blast->parse('run3.blast'); # parse and read a set of blast output files $stream = Boulder::Blast->new('run3.blast','run4.blast'); while ($blast = $stream->get) { # do something with $blast object } # parse and read a whole directory of blast runs $stream = Boulder::Blast->new(<*.blast>); while ($blast = $stream->get) { # do something with $blast object } # parse and read from STDIN $stream = Boulder::Blast->new; while ($blast = $stream->get) { # do something with $blast object } # parse and read as a filehandle $stream = Boulder::Blast->newFh(<*.blast>); while ($blast = <$stream>) { # do something with $blast object } # once you have a $blast object, you can get info about it: $query = $blast->Blast_query; @hits = $blast->Blast_hits; foreach $hit (@hits) { $hit_sequence = $hit->Name; # get the ID $significance = $hit->Signif; # get the significance @hsps = $hit->Hsps; # list of HSPs foreach $hsp (@hsps) { $query = $hsp->Query; # query sequence $subject = $hsp->Subject; # subject sequence $signif = $hsp->Signif; # significance of HSP } } =head1 DESCRIPTION The I class parses the output of the B or National Cenber for Biotechnology Information (NCBI) series of BLAST programs and turns them into I records. You may then use the standard Stone access methods to retrieve information about the BLAST run, or add the information to a Boulder stream. The parser works equally well on the contents of a static file, or on information read dynamically from a filehandle or pipe. =head1 METHODS =head2 parse() Method $stone = Boulder::Blast->parse($file_path); $stone = Boulder::Blast->parse($filehandle); The I method accepts a path to a file or a filehandle, parses its contents, and returns a Boulder Stone object. The file path may be absolute or relative to the current directgly. The filehandle may be specified as an IO::File object, a FileHandle object, or a reference to a glob (C<\*FILEHANDLE> notation). If you call I without any arguments, it will try to parse the contents of standard input. =head2 new() Method $stream = Boulder::Blast->new; $stream = Boulder::Blast->new($file [,@more_files]); $stream = Boulder::Blast->new(\*FILEHANDLE); If you wish, you may create the parser first with I I, and then invoke the parser object's I method as many times as you wish to, producing a Stone object each time. =head1 TAGS The following tags are defined in the parsed Blast Stone object: =head2 Information about the program These top-level tags provide information about the version of the BLAST program itself. =over 4 =item Blast_program The name of the algorithm used to run the analysis. Possible values include: blastn blastp blastx tblastn tblastx fasta3 fastx3 fasty3 tfasta3 tfastx3 tfasty3 =item Blast_version This gives the version of the program in whatever form appears on the banner page, e.g. "2.0a19-WashU". =item Blast_program_date This gives the date at which the program was compiled, if and only if it appears on the banner page. =back =head2 Information about the run These top-level tags give information about the particular run, such as the parameters that were used for the algorithm. =over 4 =item Blast_run_date This gives the date and time at which the similarity analysis was run, in the format "Fri Jul 6 09:32:36 1998" =item Blast_parms This points to a subrecord containing information about the algorithm's runtime parameters. The following subtags are used. Others may be added in the future: Hspmax the value of the -hspmax argument Expectation the value of E Matrix the matrix in use, e.g. BLOSUM62 Ctxfactor the value of the -ctxfactor argument Gapall The value of the -gapall argument =back =head2 Information about the query sequence and subject database Thse top-level tags give information about the query sequence and the database that was searched on. =over 4 =item Blast_query The identifier for the search sequence, as defined by the FASTA format. This will be the first set of non-whitespace characters following the ">" character. In other words, the search sequence "name". =item Blast_query_length The length of the query sequence, in base pairs. =item Blast_db The Unix filesystem path to the subject database. =item Blast_db_title The title of the subject database. =back =head2 The search results: the I tag. Each BLAST hit is represented by the tag I. There may be zero, one, or many such tags. They will be presented in reverse sorted order of significance, i.e. most significant hit first. Each I tag is a Stone subrecord containing the following subtags: =over 4 =item Name The name/identifier of the sequence that was hit. =item Length The total length of the sequence that was hit =item Signif The significance of the hit. If there are multiple HSPs in the hit, this will be the most significant (smallest) value. =item Identity The percent identity of the hit. If there are multiple HSPs, this will be the one with the highest percent identity. =item Expect The expectation value for the hit. If there are multiple HSPs, this will be the lowest expectation value in the set. =item Hsps One or more sub-sub-tags, pointing to a nested record containing information about each high-scoring segment pair (HSP). See the next section for details. =back =head2 The Hsp records: the I tag Each I tag will have at least one, and possibly several I tags, each one corresponding to a high-scoring segment pair (HSP). These records contain detailed information about the hit, including the alignments. Tags are as follows: =over 4 =item Signif The significance (P value) of this HSP. =item Bits The number of bits of significance. =item Expect Expectation value for this HSP. =item Identity Percent identity. =item Positives Percent positive matches. =item Score The Smith-Waterman alignment score. =item Orientation The word "plus" or "minus". This tag is only present for nucleotide searches, when the reverse complement match may be present. =item Strand Depending on algorithm used, indicates complementarity of match and possibly the reading frame. This is copied out of the blast report. Possibilities include: "Plus / Minus" "Plus / Plus" -- blastn algorithm "+1 / -2" "+2 / -2" -- blastx, tblastx =item Query_start Position at which the HSP starts in the query sequence (1-based indexing). =item Query_end Position at which the HSP stops in the query sequence. =item Subject_start Position at which the HSP starts in the subject (target) sequence. =item Subject_end Position at which the HSP stops in the subject (target) sequence. =item Query, Subject, Alignment These three tags contain strings which, together, create the gapped alignment of the query sequence with the subject sequence. For example, to print the alignment of the first HSP of the first match, you might say: $hsp = $blast->Blast_hits->Hsps; print join("\n",$hsp->Query,$hsp->Alignment,$hsp->Subject),"\n"; =back See the bottom of this manual page for an example BLAST run. =head1 CAVEATS This module has been extensively tested with WUBLAST, but very little with NCBI BLAST. It probably will not work with PSI Blast or other variants. The author plans to adapt this module to parse other formats, as well as non-BLAST formats such as the output of Fastn. =head1 SEE ALSO L, L =head1 AUTHOR Lincoln Stein . Copyright (c) 1998-1999 Cold Spring Harbor Laboratory This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty. =head1 EXAMPLE BLASTN RUN This output was generated by the I program, which is located in the F subdirectory of the I distribution directory. It is a typical I (nucleotide->nucleotide) run; however long lines (usually DNA sequences) have been truncated. Also note that per the Boulder protocol, the percent sign (%) is escaped in the usual way. It will be unescaped when reading the stream back in. Blast_run_date=Fri Nov 6 14:40:41 1998 Blast_db_date=2:40 PM EST Nov 6, 1998 Blast_parms={ Hspmax=10 Expectation=10 Matrix=+5,-4 Ctxfactor=2.00 } Blast_program_date=05-Feb-1998 Blast_db= /usr/tmp/quickblast18202aaaa Blast_version=2.0a19-WashU Blast_query=BCD207R Blast_db_title= test.fasta Blast_query_length=332 Blast_program=blastn Blast_hits={ Signif=3.5e-74 Expect=3.5e-74, Name=BCD207R Identity=100%25 Length=332 Hsps={ Subject=GTGCTTTCAAACATTGATGGATTCCTCCCCTTGACATATATATATACTTTGGGTTCCCGCAA... Signif=3.5e-74 Length=332 Bits=249.1 Query_start=1 Subject_end=332 Query=GTGCTTTCAAACATTGATGGATTCCTCCCCTTGACATATATATATACTTTGGGTTCCCGCAA... Positives=100%25 Expect=3.5e-74, Identity=100%25 Query_end=332 Orientation=plus Score=1660 Strand=Plus / Plus Subject_start=1 Alignment=||||||||||||||||||||||||||||||||||||||||||||||||||||||||||... } } = =head1 Example BLASTP run Here is the output from a typical I (protein->protein) run. Long lines have again been truncated. Blast_run_date=Fri Nov 6 14:37:23 1998 Blast_db_date=2:36 PM EST Nov 6, 1998 Blast_parms={ Hspmax=10 Expectation=10 Matrix=BLOSUM62 Ctxfactor=1.00 } Blast_program_date=05-Feb-1998 Blast_db= /usr/tmp/quickblast18141aaaa Blast_version=2.0a19-WashU Blast_query=YAL004W Blast_db_title= elegans.fasta Blast_query_length=216 Blast_program=blastp Blast_hits={ Signif=0.95 Expect=3.0, Name=C28H8.2 Identity=30%25 Length=51 Hsps={ Subject=HMTVEFHVTSQSW---FGFEDHFHMIIR-AVNDENVGWGVRYLSMAF Signif=0.95 Length=46 Bits=15.8 Query_start=100 Subject_end=49 Query=HLTQD-HGGDLFWGKVLGFTLKFNLNLRLTVNIDQLEWEVLHVSLHF Positives=52%25 Expect=3.0, Identity=30%25 Query_end=145 Orientation=plus Score=45 Subject_start=7 Alignment=H+T + H W GF F++ +R VN + + W V ++S+ F } } Blast_hits={ Signif=0.99 Expect=4.7, Name=ZK896.2 Identity=24%25 Length=340 Hsps={ Subject=FSGKFTTFVLNKDQATLRMSSAEKTAEWNTAFDSRRGFF----TSGNYGL... Signif=0.99 Length=101 Bits=22.9 Query_start=110 Subject_end=243 Query=FWGKVLGFTL-KFNLNLRLTVNIDQLEWEVLHVSLHFWVVEVSTDQTLSVE... Positives=41%25 Expect=4.7, Identity=24%25 Query_end=210 Orientation=plus Score=65 Subject_start=146 Alignment=F GK F L K LR++ EW S + T +... } } = =cut use strict; use Stone; use Boulder::Stream; use Carp; use vars qw($VERSION @ISA); @ISA = 'Boulder::Stream'; *get = \&read_record; $VERSION = 1.01; sub new { my $self = shift; $self = bless {},$self unless ref $self; $self->_open(@_); return $self; } # parse the contents of filehandle and emit a boulderio stream to stdout sub parse { my $self = shift; $self = $self->new(shift) unless ref($self); $self->read_record(); } sub _fh { my $self = shift; $self->{'fh'} = $_[0] if defined($_[0]); return $self->{'fh'}; } sub read_record { my $self = shift; return if $self->done; my $fh = $self->_fh; my $stone = new Stone; local $/ = "\n"; # normalize input stream return unless defined(my $line = <$fh>); croak "Doesn't look like a BLAST stream to me - top line = '$_'" unless $line=~/BLAST/; return unless my ($program,$version,$date) = $line=~ /^(\S+) (\S+) \[([^\]]+)\]/; $stone->insert ( Blast_version => $version, Blast_program => lc $program, Blast_program_date => $date ); # the date isn't part of the file, so we use the creation date of the file # for this purpose. If not available, then we are reading from a pipe # (maybe) and we use the current time. my $timestamp = -f $fh ? (stat(_))[9] : time; $stone->insert(Blast_run_date => scalar localtime($timestamp)); if ($version =~ /WashU/) { require Boulder::Blast::WU; bless $self,'Boulder::Blast::WU'; } else { require Boulder::Blast::NCBI; bless $self,'Boulder::Blast::NCBI'; } $self->_read_record($fh,$stone); } sub _read_record { croak "unimplemented"; } sub _open { my $self = shift; if (@_ > 1) { push @ARGV,@_; $self->_fh(\*ARGV); return; } my $fh = shift; unless (defined $fh) { # if $fh is null, then set it to ARGV $fh ||= \*ARGV; } elsif (!UNIVERSAL::isa($fh,'GLOB') && !UNIVERSAL::isa($fh,'FileHandle')) { # if $fh isn't a filehandle, then treat it as a filename to open croak "File does not exist" unless -e (my $name = $fh); $fh = Symbol::gensym; open($fh,$name) or croak "Can't open $name: $!\n"; } $self->_fh($fh); } 1; libboulder-perl-1.30.orig/Boulder/Genbank.pm0100644000175000017500000010440307777564337017476 0ustar jojojojopackage Boulder::Genbank; use strict; use Boulder::Stream; use Stone::GB_Sequence; use Carp; use vars qw(@ISA $VERSION); @ISA = qw(Boulder::Stream); $VERSION = 1.10; # Hard-coded defaults - must modify for your site use constant YANK => '/usr/local/bin/yank'; #use constant BATCH_URI => '/cgi-bin/Entrez/qserver.cgi/result'; #use constant BATCH_URI => '/htbin-post/Entrez/query'; use constant HOST => 'www.ncbi.nlm.nih.gov'; use constant BATCH_URI => '/IEB/ToolBox/XML/xbatch.cgi'; # Genbank entry parsing constants # (may need to adjust!) my $KEYCOL=0; my $VALUECOL=12; my $FEATURECOL=5; my $FEATUREVALCOL=21; =head1 NAME Boulder::Genbank - Fetch Genbank data records as parsed Boulder Stones =head1 SYNOPSIS use Boulder::Genbank # network access via Entrez $gb = Boulder::Genbank->newFh( qw(M57939 M28274 L36028) ); while ($data = <$gb>) { print $data->Accession; @introns = $data->features->Intron; print "There are ",scalar(@introns)," introns.\n"; $dna = $data->Sequence; print "The dna is ",length($dna)," bp long.\n"; my @features = $data->features(-type=>[ qw(Exon Source Satellite) ], -pos=>[90,310] ); foreach (@features) { print $_->Type,"\n"; print $_->Position,"\n"; print $_->Gene,"\n"; } } # another syntax $gb = new Boulder::Genbank(-accessor=>'Entrez', -fetch => [qw/M57939 M28274 L36028/]); # local access via Yank $gb = new Boulder::Genbank(-accessor=>'Yank', -fetch=>[qw/M57939 M28274 L36028/]); while (my $s = $gb->get) { # etc. } # parse a file of Genbank records $gb = new Boulder::Genbank(-accessor=>'File', -fetch => '/usr/local/db/gbpri3.seq'); while (my $s = $gb->get) { # etc. } # parse flatfile records yourself open (GB,"/usr/local/db/gbpri3.seq"); local $/ = "//\n"; while () { my $s = Boulder::Genbank->parse($_); # etc. } =head1 DESCRIPTION Boulder::Genbank provides retrieval and parsing services for NCBI Genbank-format records. It returns Genbank entries in L format, allowing easy access to the various fields and values. Boulder::Genbank is a descendent of Boulder::Stream, and provides a stream-like interface to a series of Stone objects. >> IMPORTANT NOTE << As of January 2002, NCBI has changed their Batch Entrez interface. I have modified Boulder::Genbank so as to use a "demo" interface, which fixes things, but this isn't guaranteed in the long run. I have written to NCBI, and they may fix this -- or they may not. >> IMPORTANT NOTE << Access to Genbank is provided by three different I, which together give access to remote and local Genbank databases. When you create a new Boulder::Genbank stream, you provide one of the three accessors, along with accessor-specific parameters that control what entries to fetch. The three accessors are: =over 4 =item Entrez This provides access to NetEntrez, accessing the most recent Genbank information directly from NCBI's Web site. The parameters passed to this accessor are either a series of Genbank accession numbers, or an Entrez query (see http://www.ncbi.nlm.nih.gov/Entrez/linking.html). If you provide a list of accession numbers, the stream will return a series of stones corresponding to the numbers. Otherwise, if you provided an Entrez query, the entries returned will be in the order returned by Entez. =item File This provides access to local Genbank entries by reading from a flat file (typically one of the .seq files downloadable from NCBI's Web site). The stream will return a Stone corresponding to each of the entries in the file, starting from the top of the file and working downward. The parameter in this case is the path to the local file. =item Yank This provides access to local Genbank entries using Will Fitzhugh's Yank program. Yank provides fast indexed access to a Genbank flat file using the accession number as the key. The parameter passed to the Yank accessor is a list of accession numbers. Stones will be returned in the requested order. By default the yank binary lives in /usr/local/bin/yank. To support other locations, you may define the environment variable YANK to contain the full path. =back It is also possible to parse a single Genbank entry from a text string stored in a scalar variable, returning a Stone object. =head2 Boulder::Genbank methods This section lists the public methods that the I class makes available. =over 4 =item new() # Network fetch via Entrez, with accession numbers $gb=new Boulder::Genbank(-accessor => 'Entrez', -fetch => [qw/M57939 M28274 L36028/]); # Same, but shorter and uses -> operator $gb = Boulder::Genbank->new qw(M57939 M28274 L36028); # Network fetch via Entrez, with a query # Network fetch via Entrez, with a query $query = 'Homo sapiens[Organism] AND EST[Keyword]'; $gb=new Boulder::Genbank(-accessor => 'Entrez', -fetch => $query); # Local fetch via Yank, with accession numbers $gb=new Boulder::Genbank(-accessor => 'Yank', -fetch => [qw/M57939 M28274 L36028/]); # Local fetch via File $gb=new Boulder::Genbank(-accessor => 'File', -fetch => '/usr/local/genbank/gbpri3.seq'); The new() method creates a new I stream on the accessor provided. The three possible accessors are B, B and B. If successful, the method returns the stream object. Otherwise it returns undef. new() takes the following arguments: -accessor Name of the accessor to use -fetch Parameters to pass to the accessor -proxy Path to an HTTP proxy, used when using the Entrez accessor over a firewall. Specify the accessor to use with the B<-accessor> argument. If not specified, it defaults to B. B<-fetch> is an accessor-specific argument. The possibilities are: For B, the B<-fetch> argument may point to a scalar, in which case it is interpreted as an Entrez query string. See http://www.ncbi.nlm.nih.gov/Entrez/linking.html for a description of the query syntax. Alternatively, B<-fetch> may point to an array reference, in which case it is interpreted as a list of accession numbers to retrieve. If B<-fetch> points to a hash, it is interpreted as extended information. See L<"Extended Entrez Parameters"> below. For B, the B<-fetch> argument must point to an array reference containing the accession numbers to retrieve. For B, the B<-fetch> argument must point to a string-valued scalar, which will be interpreted as the path to the file to read Genbank entries from. For Entrez (and Entrez only) Boulder::Genbank allows you to use a shortcut syntax in which you provde new() with a list of accession numbers: $gb = new Boulder::Genbank('M57939','M28274','L36028'); =item newFh() This works like new(), but returns a filehandle. To recover each GenBank record read from the filehandle with the <> operator: $fh = Boulder::GenBank->newFh('M57939','M28274','L36028'); while ($record = <$fh>) { print $record->asString; } =item get() The get() method is inherited from I, and simply returns the next parsed Genbank Stone, or undef if there is nothing more to fetch. It has the same semantics as the parent class, including the ability to restrict access to certain top-level tags. The object returned is a L object, which is a descendent of L. =item put() The put() method is inherited from the parent Boulder::Stream class, and will write the passed Stone to standard output in Boulder format. This means that it is currently not possible to write a Boulder::Genbank object back into Genbank flatfile form. =back =head2 Extended Entrez Parameters The Entrez accessor recognizes extended parameters that allow you the ability to customize the search. Instead of passing a query string scalar or a list of accession numbers as the B<-fetch> argument, pass a hash reference. The hashref should contain one or more of the following keys: =over =item B<-query> The Entrez query to process. =item B<-accession> The list of accession numbers to fetch, as an array ref. =item B<-db> The database to search. This is a single-letter database code selected from the following list: m MEDLINE p Protein n Nucleotide s Popset =item B<-proxy> An HTTP proxy to use. For example: -proxy => http://www.firewall.com:9000 If you think you need this, get the correct URL from your system administrator. =back As an example, here's how to search for ESTs from Oryza sativa that have been entered or modified since 1999. my $gb = new Boulder::Genbank( -accessor=>Entrez, -query=>'Oryza sativa[Organism] AND EST[Keyword] AND 1999[MDAT]', -db => 'n' }); =head1 METHODS DEFINED BY THE GENBANK STONE OBJECT Each record returned from the Boulder::Genbank stream defines a set of methods that correspond to features and other fields in the Genbank flat file record. L gives the full details, but they are listed for reference here: =head2 $length = $entry->length Get the length of the sequence. =head2 $start = $entry->start Get the start position of the sequence, currently always "1". =head2 $end = $entry->end Get the end position of the sequence, currently always the same as the length. =head2 @feature_list = $entry->features(-pos=>[50,450],-type=>['CDS','Exon']) features() will search the entry feature list for those features that meet certain criteria. The criteria are specified using the B<-pos> and/or B<-type> argument names, as shown below. =over 4 =item -pos Provide a position or range of positions which the feature must B. A single position is specified in this way: -pos => 1500; # feature must overlap postion 1500 or a range of positions in this way: -pos => [1000,1500]; # 1000 to 1500 inclusive If no criteria are provided, then features() returns all the features, and is equivalent to calling the Features() accessor. =item -type, -types Filter the list of features by type or a set of types. Matches are case-insensitive, so "exon", "Exon" and "EXON" are all equivalent. You may call with a single type as in: -type => 'Exon' or with a list of types, as in -types => ['Exon','CDS'] The names "-type" and "-types" can be used interchangeably. =head2 $seqObj = $entry->bioSeq; Returns a L object from the Bioperl project. Dies with an error message unless the Bio::Seq module is installed. =back =head1 OUTPUT TAGS The tags returned by the parsing operation are taken from the NCBI ASN.1 schema. For consistency, they are normalized so that the initial letter is capitalized, and all subsequent letters are lowercase. This section contains an abbreviated list of the most useful/common tags. See "The NCBI Data Model", by James Ostell and Jonathan Kans in "Bioinformatics: A Practical Guide to the Analysis of Genes and Proteins" (Eds. A. Baxevanis and F. Ouellette), pp 121-144 for the full listing. =head2 Top-Level Tags These are tags that appear at the top level of the parsed Genbank entry. =over 4 =item Accession The accession number of this entry. Because of the vagaries of the Genbank data model, an entry may have multiple accession numbers (e.g. after a merging operation). Accession may therefore be a multi-valued tag. Example: my $accessionNo = $s->Accession; =item Authors The list of authors, as they appear on the AUTHORS line of the Genbank record. No attempt is made to parse them into individual authors. =item Basecount The nucleotide basecount for the entry. It is presented as a Boulder Stone with keys "a", "c", "t" and "g". Example: my $A = $s->Basecount->A; my $C = $s->Basecount->C; my $G = $s->Basecount->G; my $T = $s->Basecount->T; print "GC content is ",($G+$C)/($A+$C+$G+$T),"\n"; =item Blob The entire flatfile record as an unparsed chunk of text (a "blob"). This is a handy way of reassembling the record for human inspection. =item Comment The COMMENT line from the Genbank record. =item Definition The DEFINITION line from the Genbank record, unmodified. =item Features The FEATURES table. This is a complex stone object with multiple subtags. See the L<"The Features Tag"> for details. =item Journal The JOURNAL line from the Genbank record, unmodified. =item Keywords The KEYWORDS line from the Genbank record, unmodified. No attempt is made to parse the keywords into separate values. Example: my $keywords = $s->Keywords =item Locus The LOCUS line from the Genbank record. It is not further parsed. =item Medline, Nid References to other database accession numbers. =item Organism The taxonomic name of the organism from which this entry was derived. This line is taken from the Genbank entry unmodified. See the NCBI data model documentation for an explanation of their taxonomic syntax. =item Reference The REFERENCE line from the Genbank entry. There are often multiple Reference lines. Example: my @references = $s->Reference; =item Sequence The DNA or RNA sequence of the entry. This is presented as a single lower-case string, with all base numbers and formatting characters removed. =item Source The entry's SOURCE field; often giving clues on how the sequencing was performed. =item Title The TITLE field from the paper describing this entry, if any. =back =head2 The Features Tag The Features tag points to a Stone record that contains multiple subtags. Each subtag is the name of a feature which points, in turn, to a Stone that describes the feature's location and other attributes. The full list of feature is beyond this document, but the following are the features that are most often seen: Cds a CDS Intron an intron Exon an exon Gene a gene Mrna an mRNA Polya_site a putative polyadenylation signal Repeat_unit a repetitive region Source More information about the organism and cell type the sequence was derived from Satellite a microsatellite (dinucleotide repeat) Each feature will contain one or more of the following subtags: =over 4 =item DB_xref A cross-reference to another database in the form DB_NAME:accession_number. See the NCBI Web site for a description of these cross references. =item Evidence The evidence for this feature, either "experimental" or "predicted". =item Gene If the feature involves a gene, this will be the gene's name (or one of its names). This subtag is often seen in "Gene" and Cds features. Example: foreach ($s->Features->Cds) { my $gene = $_->Gene; my $position = $_->Position; Print "Gene $gene ($position)\n"; } =item Map If the feature is mapped, this provides a map position, usually as a cytogenetic band. =item Note A grab-back for various text notes. =item Number When multiple features of this type occur, this field is used to number them. Ordinarily this field is not needed because Boulder::Genbank preserves the order of features. =item Organism If the feature is Source, this provides the source organism. =item Position The position of this feature, usually expresed as a range (1970..1975). =item Product The protein product of the feature, if applicable, as a text string. =item Translation The protein translation of the feature, if applicable. =back =head1 SEE ALSO L, L =head1 AUTHOR Lincoln Stein . Copyright (c) 1997-2000 Lincoln D. Stein This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty. =head1 EXAMPLE GENBANK OBJECT The following is an excerpt from a moderately complex Genbank Stone. The Sequence line and several other long lines have been truncated for readability. Authors=Spritz,R.A., Strunk,K., Surowy,C.S.O., Hoch,S., Barton,D.E. and Francke,U. Authors=Spritz,R.A., Strunk,K., Surowy,C.S. and Mohrenweiser,H.W. Locus=HUMRNP7011 2155 bp DNA PRI 03-JUL-1991 Accession=M57939 Accession=J04772 Accession=M57733 Keywords=ribonucleoprotein antigen. Sequence=aagcttttccaggcagtgcgagatagaggagcgcttgagaaggcaggttttgcagcagacggcagtgacagcccag... Definition=Human small nuclear ribonucleoprotein (U1-70K) gene, exon 10 and 11. Journal=Nucleic Acids Res. 15, 10373-10391 (1987) Journal=Genomics 8, 371-379 (1990) Nid=g337441 Medline=88096573 Medline=91065657 Features={ Polya_site={ Evidence=experimental Position=1989 Gene=U1-70K } Polya_site={ Position=1990 Gene=U1-70K } Polya_site={ Evidence=experimental Position=1992 Gene=U1-70K } Polya_site={ Evidence=experimental Position=1998 Gene=U1-70K } Source={ Organism=Homo sapiens Db_xref=taxon:9606 Position=1..2155 Map=19q13.3 } Cds={ Codon_start=1 Product=ribonucleoprotein antigen Db_xref=PID:g337445 Position=join(M57929:329..475,M57930:183..245,M57930:358..412, ... Gene=U1-70K Translation=MTQFLPPNLLALFAPRDPIPYLPPLEKLPHEKHHNQPYCGIAPYIREFEDPRDAPPPTR... } Cds={ Codon_start=1 Product=ribonucleoprotein antigen Db_xref=PID:g337444 Evidence=experimental Position=join(M57929:329..475,M57930:183..245,M57930:358..412, ... Gene=U1-70K Translation=MTQFLPPNLLALFAPRDPIPYLPPLEKLPHEKHHNQPYCGIAPYIREFEDPR... } Polya_signal={ Position=1970..1975 Note=putative Gene=U1-70K } Intron={ Evidence=experimental Position=1100..1208 Gene=U1-70K } Intron={ Number=10 Evidence=experimental Position=1100..1181 Gene=U1-70K } Intron={ Number=9 Evidence=experimental Position=order(M57937:702..921,1..1011) Note=2.1 kb gap Gene=U1-70K } Intron={ Position=order(M57935:272..406,M57936:1..284,M57937:1..599, <1..>1208) Gene=U1-70K } Intron={ Evidence=experimental Position=order(M57935:284..406,M57936:1..284,M57937:1..599, <1..>1208) Note=first gap-0.14 kb, second gap-0.62 kb Gene=U1-70K } Intron={ Number=8 Evidence=experimental Position=order(M57935:272..406,M57936:1..284,M57937:1..599, <1..>1181) Note=first gap-0.14 kb, second gap-0.62 kb Gene=U1-70K } Exon={ Number=10 Evidence=experimental Position=1012..1099 Gene=U1-70K } Exon={ Number=11 Evidence=experimental Position=1182..(1989.1998) Gene=U1-70K } Exon={ Evidence=experimental Position=1209..(1989.1998) Gene=U1-70K } Mrna={ Product=ribonucleoprotein antigen Position=join(M57928:358..668,M57929:319..475,M57930:183..245, ... Gene=U1-70K } Mrna={ Product=ribonucleoprotein antigen Citation=[2] Evidence=experimental Position=join(M57928:358..668,M57929:319..475,M57930:183..245, ... Gene=U1-70K } Gene={ Position=join(M57928:207..719,M57929:1..562,M57930:1..577, ... Gene=U1-70K } } Reference=1 (sites) Reference=2 (bases 1 to 2155) = =cut # new() takes named parameters: # -accessor=> Reference to an object class that will return a series of # Genbank records. Predefined objects include 'Yank', 'Entrez' and 'File'. # (defaults to 'Entrez'). # -fetch=> Parameters to pass to the subroutine. Can be a list of accession numbers # or an entrez query. # -out=> Output filehandle. Defaults to STDOUT. # # If you don't use named parameters, then will assume method 'yank' on # a list of accession numbers. # e.g. # $gb = new Boulder::Genbank(-accessor=>'Yank',-fetch=>[qw/M57939 M28274 L36028/]); sub new { my($package,@parameters) = @_; # superclass constructor my($self) = $package->SUPER::new(); # figure out whether parameters are named. Look for # an initial '-' my %parameters; if ($parameters[0]=~/^-/) { %parameters = @parameters; $self->{accessor} = $parameters{'-accessor'} || 'Entrez'; $self->{OUT} = $parameters{'-out'} || \*STDOUT; $self->{format} = $parameters{'-format'}; } else { $self->{accessor}='Entrez'; $parameters{-fetch} = \@parameters; } $self->{format} ||= 'stone'; $parameters{-format} = $self->{format}; $self->{accessor} = new {$self->{accessor}}(\%parameters); return bless $self,$package; } sub read_record { my($self,@tags) = @_; my($s); my $query = $self->{query}; if (wantarray) { my(@result); while (!$self->{EOF}) { $s = $self->read_one_record(@tags); next unless $s; next if $query && !(&$query); push(@result,$s); } return @result; } # we get here if in a scalar context while (!$self->{EOF}) { $s = $self->read_one_record(@tags); next unless $s; return $s unless $query; return $s if &$query; } return undef; } sub parse { my $self = shift; my $record = shift; return unless $record; my $tags = shift; my %ok; %ok = map {$_ => 1} @$tags if ref($tags) eq 'ARRAY'; my($s,@lines,$line,$accumulated,$key,$keyword,$value,$feature,@features); $s = Stone::GB_Sequence->new; @lines = split("\n",$record); foreach $line (@lines) { if ($line=~/^ACCESSION\s+(.+)/) { foreach ($1=~/(\S+)/g) { $self->_addToStone('Accession',$_,$s,\%ok); } next; } # special case for the VERSION if ($line=~/^VERSION/) { my @vernid = split(/\s+/,$line); my ($junk,$tmp)=split(/:/,$vernid[2]); $vernid[2]='g'.$tmp; $self->_addToStone('Version',$vernid[1],$s,\%ok); $self->_addToStone('Nid',$vernid[2],$s,\%ok); next; } # special case for the features table if ($line=~/^FEATURES/..$line=~/^ORIGIN/) { undef $keyword; if ($line=~/^FEATURES/) { undef @features; next; } if ($line =~ /^(BASE COUNT|ORIGIN)/) { push(@features,$feature) if $feature; $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok) if @features; undef @features; undef $feature; next if $line =~ /^BASE COUNT/; # special case for the sequence itself if ($line=~/^ORIGIN/) { # next line added by Luca Toldo $self->_addToStone('Blob',$record,$s,\%ok); $self->_addToStone($key,$accumulated,$s,\%ok) if $key; last; } } my($featurelabel) = _trim(substr($line,$FEATURECOL,$FEATUREVALCOL-$FEATURECOL)); my($featurevalue) = _trim(substr($line,$FEATUREVALCOL)); if ($featurelabel) { push(@features,$feature) if $feature; $feature = {'label'=>$featurelabel,'value'=>$featurevalue}; } else { $feature->{'value'} .= $featurevalue; } next; } $keyword = _trim(substr($line,0,$VALUECOL-1)); $value = _trim(substr($line,$VALUECOL)); if ($keyword && $key) { $self->_addToStone($key,_trim($accumulated),$s,\%ok); $accumulated = $value; next; } $accumulated .= " $value"; } continue { $key = $keyword if $keyword; } my($sequence) = $record=~/\nORIGIN[^\n]*\n(.+)\\?\\?/s; # $sequence=~s/[\s0-9-]+//g; # remove white space and numbers # $sequence =~ s/^\s*\d+\s//mg; # remove leading numbers and whitespace # $sequence =~ s/(\S{1,10}) /$1/g; # remove spacer $sequence =~ s/^.{9,10}//mg; # remove leading numbers and whitespace $sequence =~ s/(.{10}) /$1/g; # remove spacers $sequence =~ s/\n//g; $self->_addToStone('Sequence',$sequence,$s,\%ok); return $s; } sub read_one_record { my($self,@tags) = @_; my(%ok); my $accessor = $self->{'accessor'}; my $record = $accessor->fetch_next(); unless ($record) { $self->{EOF}++; return undef; } return $record unless $self->{format} eq 'stone'; return $self->parse($record,\@tags); } sub _trim { my($v) = @_; $v=~s/^\s+//; $v=~s/\s+$//; return $v; } sub _canonicalize { my $h = shift; substr($h,0)=~tr/a-z/A-Z/; substr($h,1,length($h)-1)=~tr/A-Z/a-z/; $h; } sub _addToStone { my($self,$label,$value,$stone,$ok) = @_; return unless !%{$ok} || $ok->{$label}; $stone->insert(_canonicalize($label),$value); } sub _addFeaturesToStone { my($self,$features,$basecount,$stone,$ok) = @_; # first add the basecount if (!%{$ok} || $ok->{'BASECOUNT'}) { my(%counts) = $basecount=~/(\d+)\s+([gatcGATC])/g; %counts = map { uc $_ } reverse %counts; $stone->insert('Basecount',new Stone(%counts)); } if (!%{$ok} || $ok->{'FEATURES'}) { # now add the features my($f) = new Stone; foreach (@$features) { my($p) = new Stone; my($q) = $_->{'value'}; my($label) = _canonicalize($_->{'label'}); my($position) = $q=~m!^([^/\s]+)!; my @qualifiers = $q=~m@/(\w+)=(.+?)(?=\"|/\w+=|$)@g; # slower but ?better my %qualifiers; while (my($key,$value) = splice(@qualifiers,0,2)) { $value =~ s/^\s*\"//; # trim off extra space and quotes $value =~ s/\"\s*$//; $value =~ s/^\s+//; # trim off extra space and quotes $value =~ s/\s+$//; $value =~ s/\s+//g if uc($key) eq 'TRANSLATION'; # get rid of spaces in protein translation $p->insert(_canonicalize($key)=>$value); } $p->insert('Position'=>$position); $f->insert($label=>$p); undef $p; } $stone->insert('Features',$f); } } # ---------------------------------------------------------------------------------------- # -------------------------- DEFINITION OF ACCESSOR OBJECTS ------------------------------ package GenbankAccessor; use Carp; sub new { my($class,@parameters) = @_; croak "GenbankAccessor::new: Abstract class\n"; } sub fetch_next { my($self) = @_; croak "GenbankAccessor::fetch_next: Abstract class\n"; } sub DESTROY { } package Yank; use strict; use Carp; use vars '@ISA'; @ISA=qw(GenbankAccessor); my $YANK = $ENV{YANK} || Boulder::Genbank::YANK(); sub new { my($package,$param) = @_; croak "Yank::new(): need at least one Genbank acccession number" unless $param; croak "Yank::new(): yank executable not found" unless -x $YANK; $param->{-fetch} ||= $param->{-param}; # for backward compatibility $param->{-fetch} || croak "Provide list of accession numbers to yank"; my @accession = @{$param->{-fetch}}; my $tmpfile = "/usr/tmp/yank$$"; open (TMP,">$tmpfile") || croak "Yank::new(): couldn't open tmpfile $tmpfile for write: $!"; print TMP join("\n",@accession),"\n"; close TMP; open(YANK,"$YANK -b < $tmpfile |") || croak "Yank::new(): couldn't open pipe from yank: $!"; return bless {'tmpfile'=>$tmpfile,'fh'=>\*YANK},$package; } sub fetch_next { my($self) = @_; return undef unless $self->{'fh'}; local($/) = "//\n"; my($line); my($fh) = $self->{'fh'}; chomp($line = <$fh>); return $line; } sub DESTROY { my($self) = shift; close $self->{'fh'} if $self->{'fh'}; unlink $self->{'tmpfile'} if $self->{'tmpfile'} } package File; use vars '@ISA'; use Symbol; use Carp; @ISA=qw(GenbankAccessor); sub new { my($package,$param) = @_; my $path = $param->{-fetch} || $param->{-path} || $param->{-param}; my $fh; if (!$path) { $fh = \*ARGV; } elsif (ref $path eq 'GLOB') { $fh = $path; } else { $fh = Symbol::gensym; open ($fh,$path) or croak "File::new(): couldn't open $path: $!"; } return bless {'fh'=>$fh},$package; } sub fetch_next { my $self = shift; return undef unless $self->{'fh'}; local($/)="//\n"; my $line; my $fh = $self->{'fh'}; $line = <$fh>; chomp $line if $line; return $line; } package Entrez; use Carp; use vars '@ISA'; use IO::Socket; use CGI 'escape'; # used by Entrez accessor, may need to change in the future use constant ENTREZ_HOST => 'www.ncbi.nlm.nih.gov'; use constant QUERY_URI => '/entrez/utils/pmqty.fcgi'; use constant BATCH_URI => '/entrez/utils/pmfetch.fcgi'; use constant XBATCH_URI => '/IEB/ToolBox/XML/xbatch.cgi'; use constant PROTO => 'HTTP/1.0'; use constant CRLF => "\r\n"; use constant MAX_ENTRIES => 10_000; @ISA=qw(GenbankAccessor); sub new { my($package,$param) = @_; croak "Entrez::new(): usage [list of accession numbers] or {args => values}" unless $param; my $self = bless {},ref($package) || $package; $self->{query} = $param->{-query}; $self->{accession} = $param->{-fetch} || $param->{-param}; $self->{db} = $param->{-db} || 'n'; $self->{format} = $param->{-format} || 'stone'; $self->{proxy} = $param->{-proxy}; $self->{limit} = $param->{-limit}; if ($self->{query}) { $self->{accession} = $self->get_accessions($self->{query}); } elsif ($self->{accession}) { my @accessions = ref $self->{accession} ? @{$self->{accession}} : ($self->{accession}); $self->{accession} = \@accessions; } else { croak "Must provide a 'query' or 'accession' argument" unless $self->{query} || $self->{accession} ; } $self; } sub fetch_next { my $self = shift; # if any additional records are left, then return them if ($self->{'records'} && @{$self->{'records'}}) { my $data = shift @{$self->{'records'}}; if ($data=~/\S/) { $self->_cleanup(\$data); return $data; } else { $self->{'records'} = []; } } my $format = $self->{format}; local ($/) = $format eq 'fasta' ? "\n>" : "//\n"; # if we have a socket open, then read a record if ($self->{'socket'}) { if (my $data = $self->_getline) { $self->_cleanup(\$data); return $data; } elsif (!$self->{accession} || @{$self->{accession}} == 0) { # nothing more to do return; } } die "Must provide either a list of accession numbers or an Entrez query" unless $self->{accession} || $self->{query}; return unless $self->get_entries; my $data = $self->_getline; $self->_cleanup(\$data); return $data; } sub _cleanup { my ($self,$d) = @_; $$d =~ s/\A\s+//; $$d=~s!//\n$!!; return unless $self->{format} eq 'fasta'; chomp $$d; substr($$d,0,0)='>' unless $$d =~/^>/; } sub get_accessions { my $self = shift; my $query = shift; my $sock = $self->_build_connection(ENTREZ_HOST) or return; # bug here: assume that the server will give us everything when we ask for 1 billion entries my $request = $self->_build_post(ENTREZ_HOST, QUERY_URI, undef, sprintf("db=%s&dispmax=%d&report=gen&mode=text&tool=boulder&term=%s",$self->db,$self->limit, escape($query))); print $sock $request; my $status = $self->_read_header($sock); return unless $status == 200; local $/ = ' '; my $line = $sock->getline; chomp $line; warn "*** ENTREZ: $line ***" unless $line =~ /^\d+$/; my @accessions = $line; while (defined ($line = $sock->getline)) { chomp $line; push @accessions,$line; } return \@accessions; } sub db { my $self = shift; my $db = $self->{db} || 'n'; my $translated = { m => 'medline', p => 'protein', n => 'nucleotide', s => 'popset' }->{$db}; $translated || $db; } # BUG: one billion = infinity sub limit { shift->{limit} || 1_000_000_000; } sub _build_connection { my $self = shift; my $host = shift; my ($hostent,$peer,$peerport); if (my $proxy = $self->{proxy}) { $proxy =~ m!^http://([^/]+)/?! or return; $hostent = $1; } else { $hostent = $host; } ($peer,$peerport) = split(':',$hostent); $peerport ||= 'http(80)'; my $sock = IO::Socket::INET->new( PeerAddr => $peer, PeerPort => $peerport, Proto => 'tcp' ); $sock; } sub _build_post { my $self = shift; my ($host,$uri,$type,$param) = @_; my $path = $self->{proxy} ? "http://$host$uri" : $uri; $type ||= 'application/x-www-form-urlencoded'; my $length = length($param); my $request = join (CRLF, "POST $path ".PROTO, "User-agent: Mozilla/5.0 [en] (PalmOS)", "Content-Type: $type", "Content-Length: $length", CRLF ); $request.$param; } sub _read_header { my $self = shift; my $sock = shift; local $/ = CRLF.CRLF; my $header = $sock->getline; return 500 unless $header; return 500 unless $header =~ /^HTTP\/[\d.]+ (\d+)/; $1; } sub get_entries { my $self = shift; my $format = $self->{format}; my $sock = $self->_build_connection(ENTREZ_HOST) or return; # create the multipart form... my $db = $self->{'db'}; my $boundary = '-' x 30 . int rand(10E14); my $name = 'Content-Disposition: form-data; name='; my %canned = ('db' => $db, 'FORMAT' => $format eq 'fasta' ? 1 : 0, 'REQUEST_TYPE' => $self->{accession} ? 'LIST_OF_GIS' : 'ADVANCED_QUERY', 'ORGNAME' => '', 'LIST_ORG' => '(None)', 'QUERY' => "$self->{query}\r\n", 'SAVETO' => 'YES', 'NOHEADER' => 'YES', ); my @records = map {qq(Content-Disposition: form-data; name="$_"\r\n\r\n$canned{$_}\r\n)} keys %canned; if (my $a = $self->{accession}) { my @accessions = splice(@$a,0,MAX_ENTRIES); return unless @accessions; my $accessions = join "\n",@accessions; push @records, qq{Content-Disposition: form-data; name="UID"; filename="accession.txt"\r\nContent-type: text/plain\r\n\r\n$accessions\r\n}; } my $content = "$boundary\r\n" . join("$boundary\r\n",@records) . "$boundary--\r\n"; my $request = $self->_build_post(ENTREZ_HOST, XBATCH_URI, "multipart/form-data; boundar=$boundary", $content); print $sock $request; local($/) = CRLF . CRLF; my $header = $sock->getline; return unless $header; return unless $header =~ /^HTTP\/[\d.]+ 200/; $/ = "\n"; my $line = $sock->getline; # this handles the case of Batch Entrez complaining that we're trying to # get too many sequences at once! if ($line =~ /exceed limit/) { my @accessions; while ($_ = $sock->getline) { chomp; push @accessions,$_; } delete $self->{query}; $self->{accession} = \@accessions; return $self->get_entries; # horrible recursion here! } else { while ($line =~ /(\*\*\*\* |WARNING: |ERROR: )(.+)/) { warn "**** GENBANK $1: $2\n"; $line = $sock->getline; } } $self->{bufferedline} = $line; if ($format eq 'fasta') { return unless $line =~ /^>/; } else { return unless $line =~ /LOCUS/; } $self->{socket} = $sock; return 1; } sub _getline { my $l = $_[0]->{socket}->getline; if ($_[0]->{bufferedline}) { $l = "$_[0]->{bufferedline}$l"; delete $_[0]->{bufferedline}; } return $l; } 1; __END__ libboulder-perl-1.30.orig/Boulder/Labbase.pm0100644000175000017500000000512606620376334017444 0ustar jojojojo# NOTE: This implementation is obviously incomplete. Don't try to use it. package Boulder::Labbase; # Given access to a boulderio schema for Labbase, return information # about tokens (materials). use Boulder::Stream; require Exporter; @ISA = qw(Exporter Boulder::Stream); @EXPORT = (); @EXPORT_OK = (); use Carp; use LabBase; # To create a new Boulder::Stream use new(). # new() takes named parameters: # -schema=> A Stone() object containing the schema for the token. # -in=> LabBase object to get tokens from. # -out=> LabBase object to store tokens into. # # To fetch the Stone object corresponding to a token # use get() -or- read_record(). # The semantics of tag name lists are the same as in # Boulder::Stream. # To store the Stone object corresponding to a token: # use put() -or- write_record(). # parameters: # -token=> token to write # -step=> the current step # -workflow=> the current workflow name # -state=> name of the current state sub new { my($package) = shift; my($schema,$in,$out) = rearrange([SCHEMA,IN,OUT],@_); $out = $in unless $out; croak "Usage: Boulder::Labbase::new(-schema=>schema,-in=>lb_in,-out=>lb_out)\n" unless ref($schema)=~/Stone/ && ref($in)=~/LabBase/; # superclass constructor my($self) = new Boulder::Stream(); # Add some extra parameters to the object $self->{'schema'} = $schema; $self->{'IN'} = $in; $self->{'OUT'} = $out; $self->{'passthru'} = undef; } # This is a low-level routine for "priming the pump" on a token. # It sends a query to the database which will be used later to # create the token stream. You must pass it all the LabBase materials # that are # sub fetch_token { my($self) = shift; my($ } sub rearrange { my($self,$order,@param) = @_; return () unless @param; return @param unless (defined($param[0]) && substr($param[0],0,1) eq '-') || $self->use_named_parameters; my $i; for ($i=0;$i<@param;$i+=2) { $param[$i]=~s/^\-//; # get rid of initial - if present $param[$i]=~tr/a-z/A-Z/; # parameters are upper case } my(%param) = @param; # convert into associative array my(@return_array); my($key)=''; foreach $key (@$order) { my($value); # this is an awful hack to fix spurious warnings when the # -w switch is set. if (ref($key) eq 'ARRAY') { foreach (@$key) { last if defined($value); $value = $param{$_}; delete $param{$_}; } } else { $value = $param{$key}; delete $param{$key}; } push(@return_array,$value); } push (@return_array,$self->make_attributes(\%param)) if %param; return (@return_array); } libboulder-perl-1.30.orig/Boulder/LocusLink.pm0100644000175000017500000004701307777564415020034 0ustar jojojojopackage Boulder::LocusLink; use Boulder::Stream; require Exporter; @ISA = qw(Exporter Boulder::Stream); @EXPORT = (); @EXPORT_OK = (); use Carp; $VERSION=1.00; use constant DEFAULT_LOCUSLINK_PATH => '/data/LocusLink/LL_tmpl'; =head1 NAME Boulder::LocusLink - Fetch LocusLink data records as parsed Boulder Stones =head1 SYNOPSIS # parse a file of LocusLink records $ll = new Boulder::LocusLink(-accessor=>'File', -param => '/home/data/LocusLink/LL_tmpl'); while (my $s = $ll->get) { print $s->Identifier; print $s->Gene; } # parse flatfile records yourself open (LL,"/home/data/LocusLink/LL_tmpl"); local $/ = "*RECORD*"; while () { my $s = Boulder::LocusLink->parse($_); # etc. } =head1 DESCRIPTION Boulder::LocusLink provides retrieval and parsing services for LocusLink records Boulder::LocusLink provides retrieval and parsing services for NCBI LocusLink records. It returns Unigene entries in L format, allowing easy access to the various fields and values. Boulder::LocusLink is a descendent of Boulder::Stream, and provides a stream-like interface to a series of Stone objects. Access to LocusLink is provided by one I, which give access to local LocusLink database. When you create a new Boulder::LocusLink stream, you provide the accessors, along with accessor-specific parameters that control what entries to fetch. The accessors is: =over 2 =item File This provides access to local LocusLink entries by reading from a flat file (typically Hs.dat file downloadable from NCBI's Ftp site). The stream will return a Stone corresponding to each of the entries in the file, starting from the top of the file and working downward. The parameter is the path to the local file. =back It is also possible to parse a single LocusLink entry from a text string stored in a scalar variable, returning a Stone object. =head2 Boulder::LocusLink methods This section lists the public methods that the I class makes available. =over 4 =item new() # Local fetch via File $ug=new Boulder::LocusLink(-accessor => 'File', -param => '/data/LocusLink/Hs.dat'); The new() method creates a new I stream on the accessor provided. The only possible accessors is B. If successful, the method returns the stream object. Otherwise it returns undef. new() takes the following arguments: -accessor Name of the accessor to use -param Parameters to pass to the accessor Specify the accessor to use with the B<-accessor> argument. If not specified, it defaults to B. B<-param> is an accessor-specific argument. The possibilities is: For B, the B<-param> argument must point to a string-valued scalar, which will be interpreted as the path to the file to read LocusLink entries from. =item get() The get() method is inherited from I, and simply returns the next parsed LocusLink Stone, or undef if there is nothing more to fetch. It has the same semantics as the parent class, including the ability to restrict access to certain top-level tags. =item put() The put() method is inherited from the parent Boulder::Stream class, and will write the passed Stone to standard output in Boulder format. This means that it is currently not possible to write a Boulder::LocusLink object back into LocusLink flatfile form. =back =head1 OUTPUT TAGS The tags returned by the parsing operation are taken from the names shown in the Flat file Hs.dat since no better description of them is provided yet by the database source producer. =head2 Top-Level Tags These are tags that appear at the top level of the parsed LocusLink entry. =over 4 =item Identifier The LocusLink identifier of this entry. Identifier is a single-value tag. Example: my $identifierNo = $s->Identifier; =item Current_locusid If a locus has been merged with another, the Current_locusid contains the previous LOCUSID line (A bit confusing, shall be called "previous_locusid", but this is defined in NCBI README File ... ). Example: my $prevlocusid=$s->Current_locusid; =item Organism Source species ased on NCBI's Taxonomy Example: my $theorganism=$s->Organism; =item Status Type of reference sequence record. If "PROVISIONAL" then means that is generated automatically from existing Genbank record and information stored in the LocusLink database, no curation. If "REVIEWED" than it means that is generated from the most representative complete GenBank sequence or merge of GenBank sequenes and from information stored in the LocusLink database Example: my $thestatus=$s->Status; =item LocAss Here comes a complex record ... made up of LOCUS_STRING, NM The value in the LOCUS field of the RefSeq record , NP The RefSeq accession number for an mRNA record, PRODUCT The name of the produc tof this transcript, TRANSVAR a variant-specific description, ASSEMBLY The Genbank accession used to assemble the refseq record Example: my $theprod=$s->LocAss->Product; =item AccProt Here comes a complex record ... made up of ACCNUM Nucleotide sequence accessio number TYPE e=EST, m=mRNA, g=Genomic PROT set of PID values for the coding region or regions annotated on the nucleotide record. The first value is the PID (an integer or null), then either MMDB or na, separated from the PID by a |. If MMDB is present, it indicates there are structur edata available for a protein related to the protein referenced by the PID Example: my $theprot=$s->AccProt->Prot; =item OFFICIAL_SYMBOL The symbol used for gene reports, validated by the appropriate nomenclature committee =item PREFERRED_SYMBOL Interim symbol used for display =item OFFICIAL_GENE_NAME The gene description used for gene reports validate by the appropriate nomenclatur eommittee. If the symbol is official, the gene name will be official. No records will have both official and interim nomenclature. =item PREFERRED_GENE_NAME Interim used for display =item PREFERRED_PRODUCT The name of the product used in the RefSeq record =item ALIAS_SYMBOL Other symbols associated with this gene =item ALIAS_PROT Other protein names associated with this gene =item PhenoTable A complex record made up of Phenotype Phenotype_ID =item SUmmary =item Unigene =item Omim =item Chr =item Map =item STS =item ECNUM =item ButTable BUTTON LINK =item DBTable DB_DESCR DB_LINK =item PMID a subset of publications associated with this locus with the link being the PubMed unique identifier comma separated =back =head1 SEE ALSO L, L, L =head1 AUTHOR Lincoln Stein . Luca I.G. Toldo Copyright (c) 1997 Lincoln D. Stein Copyright (c) 1999 Luca I.G. Toldo This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty. =cut # # Following did not require any changes compared to Genbank.pm # sub new { my($package,@parameters) = @_; # superclass constructor my($self) = new Boulder::Stream; # figure out whether parameters are named. Look for # an initial '-' if ($parameters[0]=~/^-/) { my(%parameters) = @parameters; $self->{'accessor'}=$parameters{'-accessor'} || 'File'; $self->{'param'}=$parameters{'-param'}; $self->{'OUT'}=$parameters{'-out'} || 'main::STDOUT'; } else { $self->{'accessor'}='File'; $self->{'param'}=[@parameters]; } croak "Require parameters" unless defined($self->{'param'}); $self->{'accessor'} = new {$self->{'accessor'}}($self->{'param'}); return bless $self,$package; } # # Following required no changes compared to Genbank.pm # sub read_record { my($self,@tags) = @_; my($s); if (wantarray) { my(@result); while (!$self->{'done'}) { $s = $self->read_one_record(@tags); next unless $s; next if $query && !(&$query); push(@result,$s); } return @result; } # we get here if in a scalar context while (!$self->{'done'}) { $s = $self->read_one_record(@tags); next unless $s; return $s unless $query; return $s if &$query; } return undef; } # # Here is everything new # sub parse { my $self = shift; my $record = shift; return unless $record; my $tags = shift; my %ok; %ok = map {$_ => 1} @$tags if ref($tags) eq 'ARRAY'; my($s,@lines,$line,$accumulated,$key,$keyword,$value,$feature,@features, $label); $s = new Stone; # following this line the parsing of the record must be done # each key-value pair is stored by the following command: # $self->_addToStone($key,$value,$stone,\%ok); # # Process new record lines # # (@recordlines)=split(/\n/,$record); undef $LocusLinkid, $curlocusid, $organism, $status, $locusstring; undef @locass; foreach $line (@recordlines) { if ($line=~/^LOCUSID:/) { ($key,$LocusLinkid)=split(/\s+/,$line); $self->_addToStone('Identifier',$LocusLinkid,$s,\%ok); } elsif ($line=~/^CURRENT_LOCUSID/) { ($key,$curlocusid)=split(/\s+/,$line); $self->_addToStone('Current_locusid',$curlocusid,$s,\%ok); } elsif ($line=~/^ORGANISM/) { ($key,$organism)=split(/\s+/,$line); $self->_addToStone('Organism',$organism,$s,\%ok); } elsif ($line=~/^STATUS/) { ($key,$status)=split(/\s+/,$line); $self->_addToStone('Status',$status,$s,\%ok); # special case for the LOCUS_STRING .. ASSEMBLY table } elsif ($line=~/^LOCUS_STRING/..$line=~/^ASSEMBLY/) { if ($line=~/^LOCUS_STRING:/) { undef @locass; ($key,$locusstring)=split(/\s+/,$line); $locass= {'label'=>'LOCUS_STRING','value'=>$locusstring}; push(@locass,$locass); } if ($line=~/^NM:/) { ($key,$nm)=split(/\s+/,$line); $locass= {'label'=>'NM','value'=>$nm}; push(@locass,$locass); } if ($line=~/^NP:/) { ($key,$np)=split(/\s+/,$line); $locass= {'label'=>'NP','value'=>$np}; push(@locass,$locass); } if ($line=~/^PRODUCT:/) { ($key,$product)=split(/\s+/,$line); $locass= {'label'=>'PRODUCT','value'=>$product}; push(@locass,$locass); } if ($line=~/^TRANSVAR:/) { ($key,$transvar)=split(/\s+/,$line); $locass= {'label'=>'TRANSVAR','value'=>$transvar}; push(@locass,$locass); } if ($line=~/^ASSEMBLY:/) { ($key,$assembly)=split(/\s+/,$line); $locass= {'label'=>'ASSEMBLY','value'=>$assembly}; push(@locass,$locass); $self->_addLocassToStone(\@locass,_trim($'),$s,\%ok); next; } # special case for the ACCNUM .. SYMBOL table } elsif ($line=~/^ACCNUM/..$line=~/_SYMBOL:/) { if ($line=~/^ACCNUM:/) { undef @accsym; ($key,$accnum)=split(/\s+/,$line); $accsym= {'label'=>'ACCNUM','value'=>$accnum}; push(@accsym,$accsym); } if ($line=~/^TYPE:/) { ($key,$type)=split(/\s+/,$line); if ($type=~/e/) { $type="EST"; } elsif ($type=~/m/) { $type="mRNA"; } elsif ($type=~/g/) { $type="genomic"; } $accsym= {'label'=>'TYPE','value'=>$type}; push(@accsym,$accsym); } if ($line=~/^PROT:/) { ($key,$prot)=split(/\s+/,$line); $accsym= {'label'=>'PROT','value'=>$prot}; push(@accsym,$accsym); } if ($line=~/_SYMBOL:/) { $self->_addAccSymToStone(\@accsym,_trim($'),$s,\%ok); ($key,$value)=split(/:\s+/,$line); $self->_addToStone($key,$value,$s,\%ok); } } elsif ($line=~/_GENE_NAME:/) { ($key,$value)=split(/:\s+/,$line); $self->_addToStone($key,$value,$s,\%ok); } elsif ($line=~/^PREFERRED_PRODUCT:/) { ($key,$value)=split(/:\s+/,$line); $self->_addToStone($key,$value,$s,\%ok); } elsif ($line=~/^ALIAS_SYMBOL:/) { ($key,$value)=split(/:\s+/,$line); $self->_addToStone($key,$value,$s,\%ok); } elsif ($line=~/^ALIAS_PROT:/) { ($key,$value)=split(/:\s+/,$line); $self->_addToStone($key,$value,$s,\%ok); # special case for the PHENOTYPE table } elsif ($line=~/^PHENOTYPE/..$line=~/PHENOTYPE_ID:/) { if ($line=~/^PHENOTYPE:/) { undef @pheno; ($key,$pheno)=split(/\s+/,$line); $phenol= {'label'=>'PHENOTYPE','value'=>$pheno}; push(@phenol,$phenol); } if ($line=~/^PHENOTYPE_ID:/) { ($key,$phenoid)=split(/\s+/,$line); $phenol= {'label'=>'PHENOTYPE_ID','value'=>$phenoid}; push(@phenol,$phenol); $self->_addPhenotypeToStone(\@phenol,_trim($'),$s,\%ok); next; } } elsif ($line=~/^SUMMARY:/) { ($key,$value)=split(/:\s+/,$line); $self->_addToStone($key,$value,$s,\%ok); } elsif ($line=~/^UNIGENE:/) { ($key,$value)=split(/:\s+/,$line); $self->_addToStone($key,$value,$s,\%ok); } elsif ($line=~/^OMIM:/) { ($key,$value)=split(/:\s+/,$line); $self->_addToStone($key,$value,$s,\%ok); } elsif ($line=~/^CHR:/) { ($key,$value)=split(/:\s+/,$line); $self->_addToStone('Chromosome',$value,$s,\%ok); } elsif ($line=~/^MAP:/) { ($key,$value)=split(/:\s+/,$line); $self->_addToStone('Cytogenetic_location',$value,$s,\%ok); } elsif ($line=~/^STS/) { undef @ststab,@stsraw,$stsr,$markname,$chrnum,$sts_id,$d_seg,$a; ($key,$stsr)=split(/\s+/,$line); ($markname,$chrnum,$sts_id,$d_seg)=split(/\|/,$stsr); $a= {'label'=>'Marker_Name','value'=>$markname}; push(@ststab,$a); $a= {'label'=>'Chromosome_Number','value'=>$chrnum}; push(@ststab,$a); $a= {'label'=>'Sts_id','value'=>$sts_id}; push(@ststab,$a); $a= {'label'=>'D_Segment','value'=>$d_seg}; push(@ststab,$a); $self->_addStsToStone(\@ststab,_trim($'),$s,\%ok); } elsif ($line=~/^ECNUM/) { ($key,$value)=split(/:\s+/,$line); $self->_addToStone('EC',$value,$s,\%ok); } elsif ($line=~/^BUTTON:/..$line=~/^LINK:/) { if ($line=~/^BUTTON:/) { undef @pheno; ($key,$pheno)=split(/\s+/,$line); $phenol= {'label'=>'BUTTON','value'=>$pheno}; push(@phenol,$phenol); } if ($line=~/^LINK:/) { ($key,$phenoid)=split(/\s+/,$line); $phenol= {'label'=>'LINK','value'=>$phenoid}; push(@phenol,$phenol); $self->_addButtonToStone(\@phenol,_trim($'),$s,\%ok); next; } } elsif ($line=~/^DB_DESCR:/..$line=~/^DB_LINK:/) { if ($line=~/^DB_DESCR:/) { undef @pheno; ($key,$pheno)=split(/\s+/,$line); $phenol= {'label'=>'DB_DESCR','value'=>$pheno}; push(@phenol,$phenol); } if ($line=~/^DB_LINK:/) { ($key,$phenoid)=split(/\s+/,$line); $phenol= {'label'=>'DB_LINK','value'=>$phenoid}; push(@phenol,$phenol); $self->_addDBToStone(\@phenol,_trim($'),$s,\%ok); next; } } elsif ($line=~/^PMID:/) { ($key,$value)=split(/:\s+/,$line); (@medlinearray)=split(/\,/,$value); foreach $medlineid (@medlinearray) { $self->_addToStone('MedlineID',$medlineid,$s,\%ok); } } } # return $s; } # # Following is unchanged from Genbank.pm # sub read_one_record { my($self,@tags) = @_; my(%ok); my $accessor = $self->{'accessor'}; my $record = $accessor->fetch_next(); unless ($record) { $self->{'done'}++; return undef; } return $self->parse($record,\@tags); } # # Following is unchanged from Genbank.pm # sub _trim { my($v) = @_; $v=~s/^\s+//; $v=~s/\s+$//; return $v; } # # Following is unchanged from Genbank.pm # sub _canonicalize { my $h = shift; substr($h,0)=~tr/a-z/A-Z/; substr($h,1,length($h)-1)=~tr/A-Z/a-z/; $h; } # # Following is unchanged from Genbank.pm # sub _addToStone { my($self,$xlabel,$value,$stone,$ok) = @_; return unless !%{$ok} || $ok->{$xlabel}; $stone->insert(_canonicalize($xlabel),$value); } # # Following is entirely rewritten # sub _addLocassToStone { my($self,$features,$basecount,$stone,$ok) = @_; if (!%{$ok} || $ok->{'LocAss'}) { # now add the features my($f) = new Stone; my %qualifiers; foreach (@$features) { my($q) = $_->{'value'}; my($label) = _canonicalize($_->{'label'}); $f->insert($label,$q); } $stone->insert('LocAss',$f); } } sub _addAccSymToStone { my($self,$features,$basecount,$stone,$ok) = @_; if (!%{$ok} || $ok->{'AccSym'}) { # now add the features my($f) = new Stone; my %qualifiers; foreach (@$features) { my($q) = $_->{'value'}; my($label) = _canonicalize($_->{'label'}); $f->insert($label,$q); } $stone->insert('AccSym',$f); } } sub _addPhenotypeToStone { my($self,$features,$basecount,$stone,$ok) = @_; if (!%{$ok} || $ok->{'PhenoTable'}) { # now add the features my($f) = new Stone; my %qualifiers; foreach (@$features) { my($q) = $_->{'value'}; my($label) = _canonicalize($_->{'label'}); $f->insert($label,$q); } $stone->insert('PhenoTable',$f); } } sub _addStsToStone { my($self,$features,$basecount,$stone,$ok) = @_; if (!%{$ok} || $ok->{'StsTable'}) { # now add the features my($f) = new Stone; my %qualifiers; foreach (@$features) { my($q) = $_->{'value'}; my($label) = _canonicalize($_->{'label'}); $f->insert($label,$q); } $stone->insert('StsTable',$f); } } sub _addButtonToStone { my($self,$features,$basecount,$stone,$ok) = @_; if (!%{$ok} || $ok->{'ButtonTable'}) { # now add the features my($f) = new Stone; my %qualifiers; foreach (@$features) { my($q) = $_->{'value'}; my($label) = _canonicalize($_->{'label'}); $f->insert($label,$q); } $stone->insert('ButtonTable',$f); } } sub _addDBToStone { my($self,$features,$basecount,$stone,$ok) = @_; if (!%{$ok} || $ok->{'DBTable'}) { # now add the features my($f) = new Stone; my %qualifiers; foreach (@$features) { my($q) = $_->{'value'}; my($label) = _canonicalize($_->{'label'}); $f->insert($label,$q); } $stone->insert('DBTable',$f); } } # -------------------------- DEFINITION OF ACCESSOR OBJECTS ------------------------------ # #only name changes for avoid namespace collisions # package LocusLinkAccessor; use Carp; sub new { my($class,@parameters) = @_; croak "LocusLinkAccessor::new: Abstract class\n"; } sub fetch_next { my($self) = @_; croak "LocusLinkAccessor::fetch_next: Abstract class\n"; } sub DESTROY { } # # Following, only the File package since the only one supported. # If other access methods must be supported, then here appropriate # packages and methods must be implemented # package File; use Carp; @ISA=qw(LocusLinkAccessor); $DEFAULT_PATH = Boulder::LocusLink::DEFAULT_LOCUSLINK_PATH(); # # Following, removed the search for the string locus in the file # as validation that the input be compliant with parser # sub new { my($package,$path) = @_; $path = $DEFAULT_PATH unless $path; open (UG,$path) or croak "File::new(): couldn't open $path: $!"; # read the junk at the beginning my $found; $found++; croak "File::new(): $path doesn't look like a LocusLink flat file" unless $found; $_ = ; return bless {'fh'=>UG},$package; } # # Following, changed the record separator # sub fetch_next { my $self = shift; return undef unless $self->{'fh'}; local($/)=">>"; my($line); my($fh) = $self->{'fh'}; chomp($line = <$fh>); return $line; } 1; __END__ libboulder-perl-1.30.orig/Boulder/Medline.pm0100644000175000017500000003706607207005246017471 0ustar jojojojopackage Boulder::Medline; # use Boulder::Stream; require Exporter; @ISA = qw(Exporter Boulder::Stream); @EXPORT = (); @EXPORT_OK = (); use Carp; $VERSION=1.02; use constant DEFAULT_MEDLINE_PATH => '/data/medline/medline.txt'; =head1 NAME Boulder::Medline - Fetch Medline data records as parsed Boulder Stones =head1 SYNOPSIS # parse a file of Medline records $ml = new Boulder::Medline(-accessor=>'File', -param => '/data/medline/medline.txt'); while (my $s = $ml->get) { print $s->Identifier; print $s->Abstract; } # parse flatfile yourself open (ML,"/data/medline/medline.txt"); local $/ = "*RECORD*"; while () { my $s = Boulder::Medline->parse($_); # etc. } =head1 DESCRIPTION Boulder::Medline provides retrieval and parsing services for Medline records Boulder::Medline provides retrieval and parsing services for NCBI Medline records. It returns Medline entries in L format, allowing easy access to the various fields and values. Boulder::Medline is a descendent of Boulder::Stream, and provides a stream-like interface to a series of Stone objects. Access to Medline is provided by one I, which give access to local Medline database. When you create a new Boulder::Medline stream, you provide the accessors, along with accessor-specific parameters that control what entries to fetch. The accessors is: =over 2 =item File This provides access to local Medline entries by reading from a flat file. The stream will return a Stone corresponding to each of the entries in the file, starting from the top of the file and working downward. The parameter is the path to the local file. =back It is also possible to parse a single Medline entry from a text string stored in a scalar variable, returning a Stone object. =head2 Boulder::Medline methods This section lists the public methods that the I class makes available. =over 4 =item new() # Local fetch via File $ml=new Boulder::Medline(-accessor => 'File', -param => '/data/medline/medline.txt'); The new() method creates a new I stream on the accessor provided. The only possible accessors is B. If successful, the method returns the stream object. Otherwise it returns undef. new() takes the following arguments: -accessor Name of the accessor to use -param Parameters to pass to the accessor Specify the accessor to use with the B<-accessor> argument. If not specified, it defaults to B. B<-param> is an accessor-specific argument. The possibilities is: For B, the B<-param> argument must point to a string-valued scalar, which will be interpreted as the path to the file to read Medline entries from. =item get() The get() method is inherited from I, and simply returns the next parsed Medline Stone, or undef if there is nothing more to fetch. It has the same semantics as the parent class, including the ability to restrict access to certain top-level tags. =item put() The put() method is inherited from the parent Boulder::Stream class, and will write the passed Stone to standard output in Boulder format. This means that it is currently not possible to write a Boulder::Medline object back into Medline flatfile form. =back =head1 OUTPUT TAGS The tags returned by the parsing operation are taken from the MEDLARS definition file MEDDOC.DOC =head2 Top-Level Tags These are tags that appear at the top level of the parsed Medline entry. =over 4 ABSTRACT ABSTRACT AUTHOR ADDRESS AUTHOR CALL NUMBER CAS REGISTRY/EC NUMBER CLASS UPDATE DATE COMMENTS COUNTRY DATE OF ENTRY DATE OF PUBLICATION ENGLISH ABSTRACT INDICATOR ENTRY MONTH GENE SYMBOL ID NUMBER INDEXING PRIORITY ISSN ISSUE/PART/SUPPLEMENT JOURNAL SUBSET JOURNAL TITLE CODE LANGUAGE LAST REVISION DATE MACHINE-READABLE IDENTIFIER MeSH HEADING NO-AUTHOR INDICATOR NOT FOR PUBLICATION NUMBER OF REFERENCES PAGINATION PERSONAL NAME AS SUBJECT PUBLICATION TYPE RECORD ORIGINATOR SECONDARY SOURCE ID SPECIAL LIST INDICATOR TITLE TITLE ABBREVIATION TRANSLITERATED/VERNACULAR TITLE UNIQUE IDENTIFIER VOLUME ISSUE =item Identifier The Medline identifier of this entry. Identifier is a single-value tag. Example: my $identifierNo = $s->Identifier; =item Title The Medline title for this entry. Example: my $titledef=$s->Title; =back =head1 SEE ALSO L, L, L =head1 AUTHOR Lincoln Stein . Luca I.G. Toldo Copyright (c) 1997 Lincoln D. Stein Copyright (c) 1999 Luca I.G. Toldo This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty. =cut # # Following did not require any changes compared to Genbank.pm # sub new { my($package,@parameters) = @_; # superclass constructor my($self) = new Boulder::Stream; # figure out whether parameters are named. Look for # an initial '-' if ($parameters[0]=~/^-/) { my(%parameters) = @parameters; $self->{'accessor'}=$parameters{'-accessor'} || 'File'; $self->{'param'}=$parameters{'-param'}; $self->{'OUT'}=$parameters{'-out'} || 'main::STDOUT'; } else { $self->{'accessor'}='File'; $self->{'param'}=[@parameters]; } croak "Require parameters" unless defined($self->{'param'}); $self->{'accessor'} = new {$self->{'accessor'}}($self->{'param'}); return bless $self,$package; } # # Following required no changes compared to Genbank.pm # sub read_record { my($self,@tags) = @_; my($s); if (wantarray) { my(@result); while (!$self->{'done'}) { $s = $self->read_one_record(@tags); next unless $s; next if $query && !(&$query); push(@result,$s); } return @result; } # we get here if in a scalar context while (!$self->{'done'}) { $s = $self->read_one_record(@tags); next unless $s; return $s unless $query; return $s if &$query; } return undef; } # # Here is everything new # sub parse { my $self = shift; my $record = shift; return unless $record; my $tags = shift; my %ok; %ok = map {$_ => 1} @$tags if ref($tags) eq 'ARRAY'; my($s,@lines,$line,$accumulated,$key,$keyword,$value,$feature,@features, $label); $s = new Stone; # following this line the parsing of the record must be done # each key-value pair is stored by the following command: # $self->_addToStone($key,$value,$stone,\%ok); # # Process new record lines # # (@recordlines)=split(/\n/,$record); for ($i=0; $i<=$#recordlines; $i++) { $line=@recordlines[$i]; if ($line=~/^UI/) { ($junk,$ui)=split(/ \- /,$line);$label="ID"; }elsif ($line=~/^DA/) { ($junk,$da)=split(/ \- /,$line);$label="DA"; }elsif ($line=~/^PMID/) { ($junk,$pmid)=split(/\- /,$line);$label="PMID"; }elsif ($line=~/^AD/) { ($junk,$ad)=split(/ \- /,$line);$label="AD"; }elsif ($line=~/^SO/) { ($junk,$so)=split(/ \- /,$line);$label="SO"; }elsif ($line=~/^EM/) { ($junk,$em)=split(/ \- /,$line);$label="EM"; }elsif ($line=~/^AA/) { ($junk,$aa)=split(/ \- /,$line);$label="AA"; }elsif ($line=~/^JC/) { ($junk,$jc)=split(/ \- /,$line);$label="JC"; }elsif ($line=~/^VI/) { ($junk,$vi)=split(/ \- /,$line);$label="VI"; }elsif ($line=~/^IP/) { ($junk,$ip)=split(/ \- /,$line);$label="IP"; }elsif ($line=~/^CY/) { ($junk,$cj)=split(/ \- /,$line);$label="CY"; }elsif ($line=~/^DP/) { ($junk,$dp)=split(/ \- /,$line);$label="DP"; }elsif ($line=~/^IS/) { ($junk,$is)=split(/ \- /,$line);$label="IS"; }elsif ($line=~/^TA/) { ($junk,$ta)=split(/ \- /,$line);$label="TA"; }elsif ($line=~/^PG/) { ($junk,$pg)=split(/ \- /,$line);$label="PG"; }elsif ($line=~/^TI/) { ($junk,$ti)=split(/ \- /,$line);$label="TI"; }elsif ($line=~/^AB/) { ($junk,$ab)=split(/ \- /,$line);$label="AB"; }elsif ($line=~/^CA/) { ($junk,$ca)=split(/ \- /,$line);$label="CA"; }elsif ($line=~/^CU/) { ($junk,$cu)=split(/ \- /,$line);$label="CU"; }elsif ($line=~/^CY/) { ($junk,$cy)=split(/ \- /,$line);$label="CY"; }elsif ($line=~/^DP/) { ($junk,$dp)=split(/ \- /,$line);$label="DP"; }elsif ($line=~/^EA/) { ($junk,$ea)=split(/ \- /,$line);$label="EA"; }elsif ($line=~/^PY/) { ($junk,$py)=split(/ \- /,$line);$label="PY"; }elsif ($line=~/^LR/) { ($junk,$lr)=split(/ \- /,$line);$label="LR"; }elsif ($line=~/^MRI/) { ($junk,$mri)=split(/ \- /,$line);$label="MRI"; }elsif ($line=~/^NI/) { ($junk,$ni)=split(/ \- /,$line);$label="NI"; }elsif ($line=~/^NP/) { ($junk,$np)=split(/ \- /,$line);$label="NP"; }elsif ($line=~/^RF/) { ($junk,$rf)=split(/ \- /,$line);$label="RF"; }elsif ($line=~/^PG/) { ($junk,$pg)=split(/ \- /,$line);$label="PG"; }elsif ($line=~/^LI/) { ($junk,$li)=split(/ \- /,$line);$label="LI"; }elsif ($line=~/^TT/) { ($junk,$tt)=split(/ \- /,$line);$label="TT"; # following are records which may appear multiple times }elsif ($line=~/^RO/) { ($junk,$ro_tmp)=split(/ \- /,$line);$label="RO";$ro.=$ro_tmp."\n"; }elsif ($line=~/^RN/) { ($junk,$rn_tmp)=split(/ \- /,$line);$label="RN";$rn.=$rn_tmp."\n"; }elsif ($line=~/^LA/) { ($junk,$la_tmp)=split(/ \- /,$line);$label="LA";$la.=$la_tmp."\n"; }elsif ($line=~/^SB/) { ($junk,$sb_tmp)=split(/ \- /,$line);$label="SB";$sb.=$sb_tmp."\n"; }elsif ($line=~/^GS/) { ($junk,$gs_tmp)=split(/ \- /,$line);$label="GS";$gs.=$gs_tmp."\n"; }elsif ($line=~/^MH/) { ($junk,$mh_tmp)=split(/ \- /,$line);$label="MH";$mh.=$mh_tmp."\n"; }elsif ($line=~/^PT/) { ($junk,$pt_tmp)=split(/ \- /,$line);$label="PT";$pt.=$pt_tmp."\n"; }elsif ($line=~/^AU/) { ($junk,$au_tmp)=split(/ \- /,$line);$label="AU"; $au.=$au_tmp."\n"; }elsif ($line=~/^PS/) { ($junk,$ps_tmp)=split(/ \- /,$line);$label="PS";$ps.=$ps_tmp."\n"; }elsif ($line=~/^CM/) { ($junk,$cm_tmp)=split(/ \- /,$line);$label="CM";$cm.=$cm_tmp."\n"; }elsif ($line=~/^SI/) { ($junk,$si_tmp)=split(/ \- /,$line);$label="SI";$si.=$si_tmp."\n"; }elsif ($line=~/^ID/) { ($junk,$id_tmp)=split(/ \- /,$line);$label="ID";$id.=$id_tmp."\n"; } else { # handle multiline records with empty header if ($label=~/TI/) { $ti.=$line; } elsif ($label=~/AB/) { $ab.=$line; } else { } } } # First add the single field records $self->_addToStone('Identifier',$ui,$s,\%ok); $self->_addToStone('Title',$ti,$s,\%ok); $self->_addToStone('Abstract',$ab,$s,\%ok); $self->_addToStone('AbstractAuthor',$aa,$s,\%ok); $self->_addToStone('Address',$ab,$s,\%ok); $self->_addToStone('CallNumber',$ca,$s,\%ok); $self->_addToStone('ClassUpdateDate',$cu,$s,\%ok); $self->_addToStone('Country',$cy,$s,\%ok); $self->_addToStone('DateOfEntry',$da,$s,\%ok); $self->_addToStone('DateOfPublication',$dp,$s,\%ok); $self->_addToStone('EnglishAbstractIndicator',$ea,$s,\%ok); $self->_addToStone('EntryMonth',$em,$s,\%ok); $self->_addToStone('IndexingPriority',$py,$s,\%ok); $self->_addToStone('ISSN',$is,$s,\%ok); $self->_addToStone('IssuePartSupplement',$is,$s,\%ok); $self->_addToStone('JournalTitleCode',$jc,$s,\%ok); $self->_addToStone('LastRevisionDate',$lr,$s,\%ok); $self->_addToStone('MachineReadableIdentifier',$mri,$s,\%ok); $self->_addToStone('NoAuthorIndicator',$ni,$s,\%ok); $self->_addToStone('NotForPublication',$np,$s,\%ok); $self->_addToStone('NumberOfReferences',$rf,$s,\%ok); $self->_addToStone('Pagination',$pg,$s,\%ok); $self->_addToStone('SpecialListIndicator',$li,$s,\%ok); $self->_addToStone('TitleAbbreviation',$ta,$s,\%ok); $self->_addToStone('TranslitteratedVernacularTitle',$tt,$s,\%ok); $self->_addToStone('VolumeIssue',$vi,$s,\%ok); #Then handle all other fields which may have multiple values (@TMPs)=split(/\n/,$au); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('Author',$aui,$s,\%ok); } (@TMPs)=split(/\n/,$rn); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('CASRegistryECNumber',$aui,$s,\%ok); } (@TMPs)=split(/\n/,$cm); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('Comments',$aui,$s,\%ok); } (@TMPs)=split(/\n/,$gs); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('GeneSymbol',$aui,$s,\%ok); } (@TMPs)=split(/\n/,$id); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('IDNumber',$aui,$s,\%ok); } (@TMPs)=split(/\n/,$sb); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('JournalSubset',$aui,$s,\%ok); } (@TMPs)=split(/\n/,$la); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('Language',$aui,$s,\%ok); } (@TMPs)=split(/\n/,$mh); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('MeSHHeading',$aui,$s,\%ok); } (@TMPs)=split(/\n/,$ps); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('PersonalNameAsSubject',$aui,$s,\%ok); } (@TMPs)=split(/\n/,$pt); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('PublicationType',$aui,$s,\%ok); } (@TMPs)=split(/\n/,$ro); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('RecordOriginator',$aui,$s,\%ok); } (@TMPs)=split(/\n/,$si); foreach $aui (@TMPs) { $aui=~s/\n//g; $self->_addToStone('SecondarySourceId',$aui,$s,\%ok); } # return $s; } # # Following is unchanged from Genbank.pm # sub read_one_record { my($self,@tags) = @_; my(%ok); my $accessor = $self->{'accessor'}; my $record = $accessor->fetch_next(); unless ($record) { $self->{'done'}++; return undef; } return $self->parse($record,\@tags); } # # Following is unchanged from Genbank.pm # sub _trim { my($v) = @_; $v=~s/^\s+//; $v=~s/\s+$//; return $v; } # # Following is unchanged from Genbank.pm # sub _canonicalize { my $h = shift; substr($h,0)=~tr/a-z/A-Z/; substr($h,1,length($h)-1)=~tr/A-Z/a-z/; $h; } # # Following is unchanged from Genbank.pm # sub _addToStone { my($self,$xlabel,$value,$stone,$ok) = @_; return unless !%{$ok} || $ok->{$xlabel}; $stone->insert(_canonicalize($xlabel),$value); } # # Following is entirely rewritten # sub _addFeaturesToStone { my($self,$features,$basecount,$stone,$ok) = @_; } # -------------------------- DEFINITION OF ACCESSOR OBJECTS ------------------------------ # #only name changes for avoid namespace collisions # package MedlineAccessor; use Carp; sub new { my($class,@parameters) = @_; croak "MedlineAccessor::new: Abstract class\n"; } sub fetch_next { my($self) = @_; croak "MedlineAccessor::fetch_next: Abstract class\n"; } sub DESTROY { } # # Following, only the File package since the only one supported. # If other access methods must be supported, then here appropriate # packages and methods must be implemented # package File; use Carp; @ISA=qw(MedlineAccessor); $DEFAULT_PATH = Boulder::Medline::DEFAULT_MEDLINE_PATH(); # # Following, removed the search for the string locus in the file # as validation that the input be compliant with parser # sub new { my($package,$path) = @_; $path = $DEFAULT_PATH unless $path; open (ML,$path) or croak "File::new(): couldn't open $path: $!"; # read the junk at the beginning my $found; $found++; croak "File::new(): $path doesn't look like a Medline flat file" unless $found; $_ = ; return bless {'fh'=>ML},$package; } # # Following, changed the record separator # sub fetch_next { my $self = shift; return undef unless $self->{'fh'}; local($/)="*RECORD*\n"; my($line); my($fh) = $self->{'fh'}; chomp($line = <$fh>); return $line; } 1; __END__ libboulder-perl-1.30.orig/Boulder/Omim.pm0100644000175000017500000003166007041144720017004 0ustar jojojojopackage Boulder::Omim; # use Boulder::Stream; require Exporter; @ISA = qw(Exporter Boulder::Stream); @EXPORT = (); @EXPORT_OK = (); use Carp; $VERSION=1.01; use constant DEFAULT_OMIM_PATH => '/data/omim/omim.txt'; =head1 NAME Boulder::Omim - Fetch Omim data records as parsed Boulder Stones =head1 SYNOPSIS # parse a file of Omim records $om = new Boulder::Omim(-accessor=>'File', -param => '/data/omim/omim.txt'); while (my $s = $om->get) { print $s->Identifier; print $s->Text; } # parse flatfile records yourself open (OM,"/data/omim/omim.txt"); local $/ = "*RECORD*"; while () { my $s = Boulder::Omim->parse($_); # etc. } =head1 DESCRIPTION Boulder::Omim provides retrieval and parsing services for OMIM records Boulder::Omim provides retrieval and parsing services for NCBI Omim records. It returns Omim entries in L format, allowing easy access to the various fields and values. Boulder::Omim is a descendent of Boulder::Stream, and provides a stream-like interface to a series of Stone objects. Access to Omim is provided by one I, which give access to local Omim database. When you create a new Boulder::Omim stream, you provide the accessors, along with accessor-specific parameters that control what entries to fetch. The accessors is: =over 2 =item File This provides access to local Omim entries by reading from a flat file (typically omim.txt file downloadable from NCBI's Ftp site). The stream will return a Stone corresponding to each of the entries in the file, starting from the top of the file and working downward. The parameter is the path to the local file. =back It is also possible to parse a single Omim entry from a text string stored in a scalar variable, returning a Stone object. =head2 Boulder::Omim methods This section lists the public methods that the I class makes available. =over 4 =item new() # Local fetch via File $om=new Boulder::Omim(-accessor => 'File', -param => '/data/omim/omim.txt'); The new() method creates a new I stream on the accessor provided. The only possible accessors is B. If successful, the method returns the stream object. Otherwise it returns undef. new() takes the following arguments: -accessor Name of the accessor to use -param Parameters to pass to the accessor Specify the accessor to use with the B<-accessor> argument. If not specified, it defaults to B. B<-param> is an accessor-specific argument. The possibilities is: For B, the B<-param> argument must point to a string-valued scalar, which will be interpreted as the path to the file to read Omim entries from. =item get() The get() method is inherited from I, and simply returns the next parsed Omim Stone, or undef if there is nothing more to fetch. It has the same semantics as the parent class, including the ability to restrict access to certain top-level tags. =item put() The put() method is inherited from the parent Boulder::Stream class, and will write the passed Stone to standard output in Boulder format. This means that it is currently not possible to write a Boulder::Omim object back into Omim flatfile form. =back =head1 OUTPUT TAGS The tags returned by the parsing operation are taken from the names shown in the network Entrez interface to Omim. =head2 Top-Level Tags These are tags that appear at the top level of the parsed Omim entry. =over 4 =item Identifier The Omim identifier of this entry. Identifier is a single-value tag. Example: my $identifierNo = $s->Identifier; =item Title The Omim title for this entry. Example: my $titledef=$s->Title; =item Text The Text of this Omim entry Example: my $thetext=$s->Text; =item Mini The text condensed version, also called "Mini" in Entrez interface Example: my $themini=$s->Mini; =item SeeAlso References to other relevant work. Example: my $thereviews=$s->Reviews; =item CreationDate This field contains the name of the person who originated the initial entry in OMIM and the date it appeared in the database. The entry may have been subsequently added to, edited, or totally rewritten by others, and their attribution is listed in the CONTRIBUTORS field. Example: my $theCreation=$s->CreationDate; =item Contributors This field contains a list, in chronological order, of the persons who have contributed significantly to the content of the MIM entry. The name is followed by "updated", "edited" or "re-created". Example: my @theContributors=$s->Contributors; =item History This field contains the edit history of this record, with an identifier and a date in which minor changes had been performed on the record. Example: my @theHistory=$s->History; =item References The references cited in the entry. Example: my @theReferences=$s->References; =item ClinicalSynopsis The content of the Clinical Synopsis data field. Example: my @theClinicalSynopsis=$s->ClinicalSynopsis; =item AllelicVariants The Allelic Variants Example: my @theAllelicVariants=$s->AllelicVariants; =back =head1 SEE ALSO L, L, L =head1 AUTHOR Lincoln Stein . Luca I.G. Toldo Copyright (c) 1997 Lincoln D. Stein Copyright (c) 1999 Luca I.G. Toldo This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty. =cut # # Following did not require any changes compared to Genbank.pm # sub new { my($package,@parameters) = @_; # superclass constructor my($self) = new Boulder::Stream; # figure out whether parameters are named. Look for # an initial '-' if ($parameters[0]=~/^-/) { my(%parameters) = @parameters; $self->{'accessor'}=$parameters{'-accessor'} || 'File'; $self->{'param'}=$parameters{'-param'}; $self->{'OUT'}=$parameters{'-out'} || 'main::STDOUT'; } else { $self->{'accessor'}='File'; $self->{'param'}=[@parameters]; } croak "Require parameters" unless defined($self->{'param'}); $self->{'accessor'} = new {$self->{'accessor'}}($self->{'param'}); return bless $self,$package; } # # Following required no changes compared to Genbank.pm # sub read_record { my($self,@tags) = @_; my($s); if (wantarray) { my(@result); while (!$self->{'done'}) { $s = $self->read_one_record(@tags); next unless $s; next if $query && !(&$query); push(@result,$s); } return @result; } # we get here if in a scalar context while (!$self->{'done'}) { $s = $self->read_one_record(@tags); next unless $s; return $s unless $query; return $s if &$query; } return undef; } # # Here is everything new # sub parse { my $self = shift; my $record = shift; return unless $record; my $tags = shift; my %ok; %ok = map {$_ => 1} @$tags if ref($tags) eq 'ARRAY'; my($s,@lines,$line,$accumulated,$key,$keyword,$value,$feature,@features, $label); $s = new Stone; # following this line the parsing of the record must be done # each key-value pair is stored by the following command: # $self->_addToStone($key,$value,$stone,\%ok); # # Process new record lines # # (@recordlines)=split(/\n/,$record); for ($i=0; $i<=$#recordlines; $i++) { $line=@recordlines[$i]; if ($line=~/\*FIELD\* NO/) { $label="ID"; $i++; $omimid=@recordlines[$i]; } elsif ($line=~/\*FIELD\* TI/) { $label="TI"; } elsif ($line=~/\*FIELD\* TX/) { $label="TX"; } elsif ($line=~/\*FIELD\* RF/) { $label="RF"; } elsif ($line=~/\*FIELD\* CS/) { $label="CS"; } elsif ($line=~/\*FIELD\* CD/) { $label="CD"; } elsif ($line=~/\*FIELD\* ED/) { $label="ED"; } elsif ($line=~/\*FIELD\* AV/) { $label="AV"; } elsif ($line=~/\*FIELD\* SA/) { $label="SA"; } elsif ($line=~/\*FIELD\* CN/) { $label="CN"; } elsif ($line=~/\*FIELD\* MN/) { $label="MN"; } else { if ($label=~/TI/) { $ti{$omimid}.=$line; } elsif ($label=~/TX/) { $tx{$omimid}.=$line; } elsif ($label=~/RF/) { $rf{$omimid}.=$line."\n"; } elsif ($label=~/CS/) { $cs{$omimid}.=$line."\n"; } elsif ($label=~/CD/) { $cd{$omimid}.=$line."\n"; } elsif ($label=~/ED/) { $ed{$omimid}.=$line."\n"; } elsif ($label=~/AV/) { $av{$omimid}.=$line."\n"; } elsif ($label=~/SA/) { $sa{$omimid}.=$line; } elsif ($label=~/CN/) { $cn{$omimid}.=$line."\n"; } elsif ($label=~/MN/) { $mn{$omimid}.=$line; } else { } } } if (defined($omimid)) { # First add the single field records $self->_addToStone('Identifier',$omimid,$s,\%ok); $self->_addToStone('Title',$ti{$omimid},$s,\%ok); $self->_addToStone('Text',$tx{$omimid},$s,\%ok); $self->_addToStone('Mini',$mn{$omimid},$s,\%ok); $self->_addToStone('SeeAlso',$sa{$omimid},$s,\%ok); $self->_addToStone('CreationDate',$cd{$omimid},$s,\%ok); #Then handle all other fields which may have multiple values (@EDs)=split(/\n/,$ed{$omimid}); foreach $edi (@EDs) { $edi=~s/\n//g; $self->_addToStone('History',$edi,$s,\%ok); } # (@CNs)=split(/\n/,$cn{$omimid}); foreach $cni (@CNs) { $cni=~s/\n//g; $self->_addToStone('Contributors',$cni,$s,\%ok); } # (@references)=split(/\n\n/,$rf{$omimid}); foreach $reference (@references) { $reference=~s/\n//g; $self->_addToStone('References',$reference,$s,\%ok); } # (@ClinicalSynopsis)=split(/\n\n/,$cs{$omimid}); foreach $main (@ClinicalSynopsis) { $main=~s/\n//g; ($id,$values)=split(/:/,$main); (@lines)=split(/;/,$values); foreach (@lines) { $_=~s/\s+/ /g; (@toclean)=split(//,$_); $maxlen=$#toclean; for ($i=0; $i<=$maxlen; $i++ ) { $curchar=shift(@toclean); if ($curchar=~/\s/) { } else { unshift(@toclean,$curchar); $i=$maxlen+1000; } } $tmpval=join('',@toclean); $self->_addToStone('ClinicalSynopsis',$tmpval,$s,\%ok); } } # (@AllelicVariants)=split(/\n\n\./,$av{$omimid}); foreach $variant (@AllelicVariants) { undef $variantf; (@details)=split(/\n/,$variant); $idtmp=@details[0]; for ($i=1; $i<=$#details; $i++ ) { $variantf.=@details[$i]." "; } $self->_addToStone('AlelicVariants',$variantf,$s,\%ok); } undef $ti{$omimid}, $tx{$omimid}, $sa{$omimid}, $mn{$omimid}; undef $cd{$omimid}, $ed{$omimid}, $cn{$omimid}; undef $rf{$omimid}, $av{$omimid}, $cs{$omimid}; } # return $s; } # # Following is unchanged from Genbank.pm # sub read_one_record { my($self,@tags) = @_; my(%ok); my $accessor = $self->{'accessor'}; my $record = $accessor->fetch_next(); unless ($record) { $self->{'done'}++; return undef; } return $self->parse($record,\@tags); } # # Following is unchanged from Genbank.pm # sub _trim { my($v) = @_; $v=~s/^\s+//; $v=~s/\s+$//; return $v; } # # Following is unchanged from Genbank.pm # sub _canonicalize { my $h = shift; substr($h,0)=~tr/a-z/A-Z/; substr($h,1,length($h)-1)=~tr/A-Z/a-z/; $h; } # # Following is unchanged from Genbank.pm # sub _addToStone { my($self,$xlabel,$value,$stone,$ok) = @_; return unless !%{$ok} || $ok->{$xlabel}; $stone->insert(_canonicalize($xlabel),$value); } # # Following is entirely rewritten # sub _addFeaturesToStone { my($self,$features,$basecount,$stone,$ok) = @_; } # -------------------------- DEFINITION OF ACCESSOR OBJECTS ------------------------------ # #only name changes for avoid namespace collisions # package OmimAccessor; use Carp; sub new { my($class,@parameters) = @_; croak "OmimAccessor::new: Abstract class\n"; } sub fetch_next { my($self) = @_; croak "OmimAccessor::fetch_next: Abstract class\n"; } sub DESTROY { } # # Following, only the File package since the only one supported. # If other access methods must be supported, then here appropriate # packages and methods must be implemented # package File; use Carp; @ISA=qw(OmimAccessor); $DEFAULT_PATH = Boulder::Omim::DEFAULT_OMIM_PATH(); # # Following, removed the search for the string locus in the file # as validation that the input be compliant with parser # sub new { my($package,$path) = @_; $path = $DEFAULT_PATH unless $path; open (OM,$path) or croak "File::new(): couldn't open $path: $!"; # read the junk at the beginning my $found; $found++; croak "File::new(): $path doesn't look like a Omim flat file" unless $found; $_ = ; return bless {'fh'=>OM},$package; } # # Following, changed the record separator # sub fetch_next { my $self = shift; return undef unless $self->{'fh'}; local($/)="*RECORD*\n"; my($line); my($fh) = $self->{'fh'}; chomp($line = <$fh>); return $line; } 1; __END__ libboulder-perl-1.30.orig/Boulder/Store.pm0100644000175000017500000006375207777564337017240 0ustar jojojojo# $Id: Store.pm,v 1.6 2002/06/28 20:31:59 lstein Exp $ # Prototype support library for storing Boulder streams. # Basic design is as follows: # The "data" file, named .records contains # a recno style data file. Records are delimited by # newlines. Each record has this form: # tag=long string value&tag=long string value... # Subrecords are delimited by {} pairs as per the # usual boulderio format. # The "index" file, named .index, is a DB_Hash # that contains several things: # 1. Indexes. The key is used to translate # from index to the list of record entries. # 2. Other information: # .INDICES -- list of tags that are indexed. package Boulder::Store; =head1 NAME Boulder::Store - Simple persistent storage for Stone tag/value objects =head1 SYNOPSIS Boulder:Store; my $store=new Boulder::Store('test.db',1); my $s = new Stone (Name=>'george', Age=>23, Sex=>M, Address=>{ Street=>'29 Rockland drive', Town=>'Fort Washington', ZIP=>'77777' } ); $store->put($s); $store->put(new Stone(Name=>'fred', Age=>30, Sex=>M, Address=>{ Street=>'19 Gravel Path', Town=>'Bedrock', ZIP=>'12345'}, Phone=>{ Day=>'111-1111', Eve=>'222-2222' } )); $store->put(new Stone(Name=>'andrew', Age=>18, Sex=>M)); $store->add_index('Name'); my $stone = $store->get(0); print "name = ",$stone->Name; =head1 DESCRIPTION Boulder::Store provides persistent storage for Boulder objects using a simple DB_File implementation. To use it, you need to have Berkeley db installed (also known as libdb), and the Perl DB_File module. See the DB_File package for more details on obtaining Berkeley db if you do not already have it. Boulder::Store provides an unsophisticated query mechanism which takes advantage of indexes that you specify. Despite its lack of sophistication, the query system is often very helpful. =head1 CLASS METHODS =over 4 =item $store = Boulder::Store->new("database/path",$writable) The B method creates a new Boulder::Store object and associates it with the database file provided in the first parameter (undef is a valid pathname, in which case all methods work but the data isn't stored). The second parameter should be a B value if you want to open the database for writing. Otherwise it's opened read only. Because the underlying storage implementation is not multi-user, only one process can have the database for writing at a time. A B-based locking mechanism is used to give a process that has the database opened for writing exclusive access to the database. This also prevents the database from being opened for reading while another process is writing to it (this is a B thing). Multiple simultaneous processes can open the database read only. Physically the data is stored in a human-readable file with the extension ".data". =back =head1 OBJECT METHODS =over 4 =item $stone = $store->read_record(@taglist) The semantics of this call are exactly the same as in B. Stones are returned in sequential order, starting with the first record. In addition to their built-in tags, each stone returned from this call has an additional tag called "record_no". This is the zero-based record number of the stone in the database. Use the B method to begin iterating from the beginning of the database. If called in an array context, B returns a list of all stones in the database that contains one or more of the provided tags. =item $stone = $store->write_record($stone [,$index]) This has the same semantics as B. A stone is appended to the end of the database. If successful, this call returns the record number of the new entry. By providing an optional second parameter, you can control where the stone is entered. A positive numeric index will write the stone into the database at that position. A value of -1 will use the Stone's internal record number (if present) to determine where to place it. =item $stone = $store->get($record_no) This is random access to the database. Provide a record number and this call will return the stone stored at that position. =item $record_number = $store->put($stone,$record_no) This is a random write to the database. Provide a record number and this call stores the stone at the indicated position, replacing whatever was there before. If no record number is provided, this call will look for the presence of a 'record_no' tag in the stone itself and put it back in that position. This allows you to pull a stone out of the database, modify it, and then put it back in without worrying about its record number. If no record is found in the stone, then the effect is identical to write_record(). The record number of the inserted stone is returned from this call, or -1 if an error occurred. =item $store->delete($stone),Boulder::Store::delete($record_no) These method calls delete a stone from the database. You can provide either the record number or a stone containing the 'record_no' tag. B: if the database is heavily indexed deletes can be time-consuming as it requires the index to be brought back into synch. =item $record_count = $store->length() This returns the length of the database, in records. =item $store->reset() This resets the database, nullifying any queries in effect, and causing read_record() to begin fetching stones from the first record. =item $store->query(%query_array) This creates a query on the database used for selecting stones in B. The query is an associative array. Three types of keys/value pairs are allowed: =over 4 =item (1) $index=>$value This instructs Boulder::Store to look for stones containing the specified tags in which the tag's value (determined by the Stone B method) exactly matches the provided value. Example: $db->query('STS.left_primer.length'=>30); Only the non-bracketed forms of the index string are allowed (this is probably a bug...) If the tag path was declared to be an index, then this search will be fast. Otherwise Boulder::Store must iterate over every record in the database. =item (2) EVAL=>'expression' This instructs Boulder::Store to look for stones in which the provided expression evaluates to B. When the expression is evaluated, the variable B<$s> will be set to the current record's stone. As a shortcut, you can use "" as shorthand for "$s->index('index.string')". =item (3) EVAL=>['expression1','expression2','expression3'...] This lets you provide a whole bunch of expressions, and is exactly equivalent to EVAL=>'(expression1) && (expression2) && (expression3)'. =back You can mix query types in the parameter provided to B. For example, here's how to look up all stones in which the sex is male and the age is greater than 30: $db->query('sex'=>'M',EVAL=>' > 30'); When a query is in effect, B returns only Stones that satisfy the query. In an array context, B returns a list of all Stones that satisfy the query. When no more satisfactory Stones are found, B returns B until a new query is entered or B is called. =item $store->add_index(@indices) Declare one or more tag paths to be a part of a fast index. B will take advantage of this record when processing queries. For example: $db->add_index('age','sex','person.pets'); You can add indexes any time you like, when the database is first created or later. There is a trade off: B, B, and other data-modifying calls will become slower as more indexes are added. The index is stored in an external file with the extension ".index". An index file is created even if you haven't indexed any tags. =item $store->reindex_all() Call this if the index gets screwed up (or lost). It rebuilds it from scratch. =back =head1 CAVEATS Boulder::Store makes heavy use of the flock() call in order to avoid corruption of DB_File databases when multiple processes try to write simultaneously. flock() may not work correctly across NFS mounts, particularly on Linux machines that are not running the rpc.lockd daemon. Please confirm that your flock() works across NFS before attempting to use Boulder::Store. If the store.t test hangs during testing, this is the likely culprit. =head1 AUTHOR Lincoln D. Stein , Cold Spring Harbor Laboratory, Cold Spring Harbor, NY. This module can be used and distributed on the same terms as Perl itself. =head1 SEE ALSO L, L, L =cut use Boulder::Stream; use Carp; use Fcntl; use DB_File; $VERSION = '1.20'; @ISA = 'Boulder::Stream'; $lockfh='lock00000'; $LOCK_SH = 1; $LOCK_EX = 2; $LOCK_UN = 8; # Override the old new() method. # There is no passthrough behavior in the database version, # because this is usually undesirable. # In this case,$in is the pathname to the database to open. sub new { my($package,$in,$writable) = @_; my $self = bless { 'records'=>undef, # filled in by _open_databases 'dbrecno'=>undef, # filled in by _open_databases 'index'=>undef, # filled in by _open_databases 'writable'=>$writable, 'basename'=>$in, 'passthru'=>undef, 'binary'=>'true', 'nextrecord'=>0, # next record to retrieve during iterations 'query_records'=>undef, # list of records during optimized queries 'query_test'=>undef, # an expression to apply to each record during a query 'IN'=>undef, 'OUT'=>undef, 'delim'=>'=', 'record_stop'=>"\n", 'line_end'=>'&', 'index_delim'=>' ', 'subrec_start'=>"\{", 'subrec_end'=>"\}" },$package; return undef unless _lock($self,'lock'); return _open_databases($self,$in) ? $self : undef; } sub DESTROY { my $self = shift; undef $self->{'dbrecno'}; untie %{$self->{'index'}}; untie @{$self->{'records'}}; _lock($self,'unlock'); } ##################### # private routines #################### # Obtain exclusive privileges if database is # writable. Otherwise obtain shared privileges. # Note that this call does not work across file systems, # at least on non-linux systems. Should use fcntl() # instead (but don't have Stevens at hand). sub _lock { my($self,$lockit) = @_; my $in = $self->{'basename'}; my $lockfilename = "$in.lock"; if ($lockit eq 'lock') { $lockfh++; open($lockfh,"+>$lockfilename") || return undef; $self->{'lockfh'}=$lockfh; return flock($lockfh,$self->{'writable'} ? $LOCK_EX : $LOCK_SH); } else { my $lockfh = $self->{'lockfh'}; unlink $lockfilename; flock($lockfh,$LOCK_UN); close($lockfh); 1; } } sub _open_databases { my $self = shift; # Try to open up and/or create the recno and index files my($in)=$self->{'basename'}; my (@records,%index); my ($permissions) = $self->{'writable'} ? (O_RDWR|O_CREAT) : O_RDONLY; $self->{'dbrecno'} = tie(@records,DB_File,"$in.data", $permissions,0640,$DB_RECNO) || return undef; tie(%index,DB_File,"$in.index",$permissions,0640,$DB_HASH) || return undef; $self->{'records'}=\@records; $self->{'index'}=\%index; 1; } ######################################################################### # DELETE EVERYTHING FROM THE DATABASE ######################################################################### sub empty { my $self = shift; my($base) = $self->{'basename'}; &DESTROY($self); # this closes the database and releases locks # delete the files foreach ('.data','.index') { unlink "$base$_"; } # Now reopen things return _open_databases($self); } ######################################################################## # DATA STORAGE ######################################################################## # This overrides the base object write_record. # It writes the stone into the given position in the file. # You can provide an index to put the record at a particular # position, leave it undef to append the record to the end # of the table, or provide a -1 to use the current record # number of the stone to get the position. Just for fun, # we return the record number of the added object. sub write_record { my($self,$stone,$index) = @_; unless ($self->{'writable'}) { warn "Attempt to write to read-only database $self->{'basename'}"; return undef; } my ($nextrecord); if (defined($index) && $index == -1) { my $stonepos = $stone->get('record_no'); $nextrecord = defined($stonepos) ? $stonepos : $self->length; } else { $nextrecord = (defined($index) && ($index >= 0) && ($index < $self->length)) ? $index : $self->length; } # We figure out here what indices need to be updated my %need_updating; # indexes that need fixing if ($nextrecord != $self->length) { my $old = $self->get($nextrecord); if ($old) { foreach ($self->indexed_keys) { my $oldvalue = join('',$old->index($_)); my $newvalue = join('',$stone->index($_)); $need_updating{$_}++ if $oldvalue ne $newvalue; } } $self->unindex_record($nextrecord,keys %need_updating) if %need_updating; } else { grep($need_updating{$_}++,$self->indexed_keys); } # Write out the Stone record. $stone->replace('record_no',$nextrecord); # keep track of this please my ($key,$value,@value,@lines); foreach $key ($stone->tags) { @value = $stone->get($key); $key = $self->escapekey($key); foreach $value (@value) { if (ref $value && defined $value->{'.name'}) { $value = $self->escapeval($value); push(@lines,"$key$self->{delim}$value"); } else { push(@lines,"$key$self->{delim}$self->{subrec_start}"); push(@lines,_write_nested($self,1,$value)); } } } $self->{'records'}->[$nextrecord]=join("$self->{line_end}",@lines); $self->index_record($nextrecord,keys %need_updating) if %need_updating; $nextrecord; } # put() is an alias for write_record, except that it # requires a record number. sub put { my($self,$stone,$record_no) = @_; croak 'Usage: put($stone [,$record_no])' unless defined $stone; $record_no = $stone->get('record_no') unless defined($record_no); $self->write_record($stone,$record_no); } # Delete the record number from the database. You may # provide either a numeric recno, or the stone itself. # The deleted stone is returned (sans its record no). sub delete { my($self,$s) = @_; my $recno; if ( $s->isa('Stone') ) { $recno = $s->get('record_no'); } else { $recno = $s; } $self->unindex_record($recno); # remove from the index $s = $self->get($recno) unless $s->isa('Stone'); delete $s->{recno}; # record number is gonzo $self->{'dbrecno'}->del($recno); # this does the actual delete $self->renumber_indices($recno); $self->renumber_records($recno); return $s; } ######################################################################## # DATA RETRIEVAL ######################################################################## sub read_one_record { my($self,@keywords) = @_; return undef if $self->done; my(%interested,$key,$value); grep($interested{$_}++,@keywords); $interested{'record_no'}++; # always interested in this one my $delim=$self->{'delim'}; my $subrec_start=$self->{'subrec_start'}; my $subrec_end=$self->{'subrec_end'}; my ($stone,$pebble,$found); while (1) { undef $self->{LEVEL},last unless $_ = $self->next_pair; if (/$subrec_end$/o) { $self->{LEVEL}--,last if $self->{LEVEL}; next; } next unless ($key,$value) = split($self->{delim},$_); $key = $self->unescapekey($key); $stone = new Stone() unless $stone; if ((!@keywords) || $interested{$key}) { $found++; if ($value =~ /$subrec_start/o) { $self->{LEVEL}++; $pebble = read_one_record($self); # call ourselves recursively $stone->insert($key=>$pebble); next; } $stone->insert($key=>$self->unescapeval($value)); } } return undef unless $found; return $stone; } # Read_record has the semantics that if a query is active, # it will only return stones that satisfy the query. sub read_record { my($self,@tags) = @_; my $query = $self->{'query_test'}; my $s; if (wantarray) { my(@result); while (!$self->done) { $s = $self->read_one_record(@tags); next unless $s; next if $query && !($query->($s)); push(@result,$s); } return @result; } else { while (!$self->done) { $s = $self->read_one_record(@tags); next unless $s; return $s unless $query; return $s if $query->($s); } return undef; } } # Random access. This will have the interesting side effect # of causing read_record() to begin iterating from this record # number. sub get { my($self,$record,@tags) = @_; $self->{'nextrecord'} = $record if defined($record); undef $self->{'EOF'}; return $self->read_record(@tags); } # Reset database so we start iterating over the entire # database at record no 0 again. sub reset { my $self = shift; $self->{'EOF'} = undef; $self->{'nextrecord'} = 0; $self->{'query_test'} = undef; $self->{'query_records'} = undef; } # Return the number of records in this file sub length { my $self = shift; return $self->{'dbrecno'}->length; } # Return the number of unread query records sub length_qrecs { my $self = shift; return $#{$self->{'query_records'}} + 1; } # Create a query. read_record() will then # iterate over the query results. A query consists of # an associative array of this form: # index1=>value1, # index2=>value2, # ... # indexN=>valueN, # 'EVAL'=>[expression1,expression2,expression3...] # 'EVAL'=>expression # # The index forms test for equality, and take advantage # of any fast indexed keywords you've declared. For # example, this will identify all white males: # $db->query('Demographics.Sex'=>'M', # 'Demographics.Race'=>'white'); # # The code form allows you to retrieve Stones satisfying # any arbitrary snippets of Perl code. Internally, the # variable "$s" will be set to the current Stone. # For example, find all whites > 30 years of age: # # $db->query('Demographics.Race'=>'white', # 'EVAL'=>'$s->index(Age) > 30'); # # EVAL (and "eval" too) expressions are ANDed together # in the order you declare them. Internally indexed # keywords are evaluated first in order to speed things up. # A cute feature that may go away: # You can use the expression as shorthand # for $s->index('path.to.index') sub query { my($self,%query) = @_; my($type,@expressions,%keylookups); foreach $type (keys %query) { if ($type =~ /^EVAL$/i) { push (@expressions,$query{$type}) unless ref $query{$type}; push (@expressions,@{$query{$type}}) if ref $query{$type}; } else { $keylookups{$type} = $query{$type}; } } # All the eval expressions are turned into a piece # of perl code. my $perlcode; foreach (@expressions) { s/<([\w.]+)>/\$s->index('$1')/g; $_ = "($_)"; } my %fast; grep($fast{$_}++,$self->indexed_keys); my %fastrecs; my $fastset; # this flag keeps track of the first access to %fastrecs foreach (keys %keylookups) { if ($fast{$_}) { my (@records) = $self->lookup($_,$keylookups{$_}); if ($fastset) { my %tmp; grep($fastrecs{$_} && $tmp{$_}++,@records); %fastrecs = %tmp; } else { grep($fastrecs{$_}++,@records); $fastset++; } } else { # slow record-by-record search unshift(@expressions,"(\$s->index('$_') eq '$keylookups{$_}')"); } } $perlcode = 'sub { my $s = shift;' . join(' && ',@expressions) . ';}' if @expressions; $perlcode = 'sub {1;}' unless @expressions; # The next step either looks up a compiled query or # creates one. We use a package global for this # purpose, since the same query may be used for # different databases. my $coderef; unless ($coderef = $QUERIES{$perlcode}) { $coderef = $QUERIES{$perlcode} = eval $perlcode; return undef if $@; } $self->reset; # clear out old information $self->{'query_test'} = $coderef; # set us to check each record against the code $self->{'query_records'} = [keys %fastrecs] if $fastset; return 1; } # fetch() allows you to pass a query to the # database, and get out all the stones that hit. # Internally it is just a call to query() followed # by an array-context call to read_record sub fetch { my($self,%query) = @_; $self->query(%query); my(@result) = $self->read_record(); # call in array context return @result; } #-------------------------------------- # Internal (private) procedures. #-------------------------------------- sub _write_nested { my($self,$level,$stone) = @_; my($key,$value,@value,@lines); foreach $key ($stone->tags) { @value = $stone->get($key); $key = $self->escapekey($key); foreach $value (@value) { if (ref $value && defined $value->{'.name'}) { $value = $self->escapeval($value); push(@lines,"$key$self->{delim}$value"); } else { push(@lines,"$key$self->{delim}$self->{subrec_start}"); push(@lines,_write_nested($self,$level+1,$value)); } } } push(@lines,$self->{'subrec_end'}); return @lines; } # This finds an array of key/value pairs and # stashes it where we can find it. # This is overriden from the basic Boulder::Stream class, # and relies on the state variable 'nextrecord' to tell # it where to start reading from. sub read_next_rec { my($self) = @_; my $data; # two modes of retrieval: # 1. regular iterate through the entire database # 2. iterate through subset of records in 'query_records' unless ($self->{'query_records'}) { return !($self->{EOF}++) if $self->length <= $self->{'nextrecord'}; $data = $self->{'records'}->[$self->{'nextrecord'}]; $self->{'nextrecord'}++; } else { my $nextrecord = shift @{$self->{'query_records'}}; return !($self->{EOF}++) unless $nextrecord ne ''; $data = $self->{'records'}->[$nextrecord]; } # unpack the guy into pairs $self->{PAIRS}=[split($self->{'line_end'},$data)]; } # This fiddles 'nextrecord' or 'query_records', as appropriate, so that # the next call to read_next_rec will skip over $skip records. sub skip_recs { my($self,$skip) = @_; unless ($self->{'query_records'}) { $self->{'nextrecord'} += $skip; } else { splice(@{$self->{'query_records'}}, 0, $skip); } } # Index a stone record sub index_record { my ($self,$recno,@indices) = @_; my $s = $self->get($recno); return undef unless defined($s); my($index,@values,$value); @indices = $self->indexed_keys unless @indices; foreach $index (@indices) { @values = $s->index($index); foreach $value (@values) { my %current; grep($current{$_}++,split(" ",$self->{'index'}->{"$index:$value"})); $current{$recno}++; # add us to the list $self->{'index'}->{"$index:$value"} = join(" ",keys %current); } } 1; } # This is a NOP for now. sub unindex_record { my ($self,$recno,@indices) = @_; my $s = $self->get($recno); return undef unless defined($s); my($index,@values,$value); @indices = $self->indexed_keys unless @indices; foreach $index (@indices) { @values = $s->index($index); foreach $value (@values) { my %current; grep($current{$_}++,split(" ",$self->{'index'}->{"$index:$value"})); delete $current{$recno}; # remove us from the list $self->{'index'}->{"$index:$value"} = join(" ",keys %current); # put index back } } 1; }; # This gets called after a record delete, when all the indexes need to be # shifted downwards -- this is probably WAY slow. sub renumber_indices { my ($self,$deleted_recno) = @_; while (($key,$value) = each %{$self->{'index'}}) { next if $key =~/^\./; @values = split(" ",$value); foreach (@values) { $_-- if $_ > $deleted_recno; } # This will probably put us into an infinite loop! $self->{'index'}->{$key} = join(" ",@values); } } # This also gets called after a record delete, when all the indexes need to be # shifted downwards -- this is probably WAY slow. sub renumber_records { my ($self,$deleted_recno) = @_; $self->reset; $recno = -1; while ($s=$self->read_record) { $recno++; next unless $s->get('record_no') > $deleted_recno; $s->replace('record_no',$recno); $self->put($s); } } # Look up a stone record using its index. Will return a list # of the matching records sub lookup { my ($self,$index,$value) = @_; my %records; grep($records{$_}++,split(" ",$self->{'index'}->{"$index:$value"})); return keys %records; } # Add an index (or list of indices) to the database. # If new, then we do a reindexing. sub add_index { my ($self,@indices) = @_; my (%oldindices); grep($oldindices{$_}++,$self->indexed_keys); my (@newindices) = grep(!$oldindices{$_},@indices); $self->reindex_some_keys(@newindices); $self->{'index'}->{'.INDICES'}=join($self->{'index_delim'},keys %oldindices,@newindices); } # Return the indexed keys as an associative array (convenient) sub indexed_keys { my $self = shift; return split($self->{'index_delim'},$self->{'index'}->{'.INDICES'}); } # Reindex all records that contain records involving the provided indices. sub reindex_some_keys { my($self,@new) = @_; my ($s,$index,$value); $self->reset; # reset to beginning of database while ($s=$self->read_record) { # return all the stones foreach $index (@new) { foreach $value ($s->index($index)){ # pull out all the values at this index (if any) my %current; grep($current{$_}++,split(" ",$self->{'index'}->{"$index:$value"})); $current{$s->get('record_no')}++; $self->{'index'}->{"$index:$value"}=join(" ",keys %current); } } } } # Completely rebuild the index. sub reindex_all { my $self = shift; my ($index,$s,@values,$value); $self->reset; foreach $index ($self->indexed_keys) { undef %records; while ($s=$self->read_record) { # return all the stones foreach $value ($s->index($index)){ # pull out all the values at this index (if any) $records{"$index:$value"}->{$s->get('record_no')}++; } } foreach (keys %records) { $self->{'index'}->{$_}=join(" ",keys %{$records{$_}}); } } } 1; libboulder-perl-1.30.orig/Boulder/Stream.pm0100644000175000017500000004337407311174747017357 0ustar jojojojopackage Boulder::Stream; # CHANGE HISTORY: # version 1.07 # patches from Andy Law to quash warnings under -w switch # changes from 1.04 to 1.05 # - new() will now accept filehandle globs, IO::File, and FileHandle objects # changes from 1.03 to 1.04 # - Fixed regexp bug that broke on tags with embedded spaces -pete # Changes from 1.01 to 1.03 # - Fixed a problem in escaping the {} characters # Changes from 1.00 to 1.01 # - Added the asTable() method to Boulder::Stream =head1 NAME Boulder::Stream - Read and write tag/value data from an input stream =head1 SYNOPSIS #!/bin/perl # Read a series of People records from STDIN. # Add an "Eligible" tag to all those whose # Age >= 35 and Friends list includes "Fred" use Boulder::Stream; # filestream way: my $stream = Boulder::Stream->newFh; while ( my $record = <$stream> ) { next unless $record->Age >= 35; my @friends = $record->Friends; next unless grep {$_ eq 'Fred'} @friends; $record->insert(Eligible => 'yes'); print $stream $record; } # object oriented way: my $stream = Boulder::Stream->new; while (my $record = $stream->get ) { next unless $record->Age >= 35; my @friends = $record->Friends; next unless grep {$_ eq 'Fred'} @friends; $record->insert(Eligible => 'yes'); print $stream $record; } =head1 DESCRIPTION Boulder::Stream provides stream-oriented access to L IO hierarchical tag/value data. It can be used in a magic tied filehandle mode, as shown in the synopsis, or in object-oriented mode. Using tied filehandles, L objects are read from input using the standard <> operator. Stone objects printed to the tied filehandle appear on the output stream in L format. By default, data is read from the magic ARGV filehandle (STDIN or a list of files provided on the command line) and written to STDOUT. This can be changed to the filehandles of your choice. =head2 Pass through behavior When using the object-oriented form of Boulder::Stream, tags which aren't specifically requested by the get() method are passed through to output unchanged. This allows pipes of programs to be constructed easily. Most programs will want to put the tags back into the boulder stream once they're finished, potentially adding their own. Of course some programs will want to behave differently. For example, a database query program will generate but not read a B stream, while a report generator will read but not write the stream. This convention allows the following type of pipe to be set up: query_database | find_vector | find_dups | \ | blast_sequence | pick_primer | mail_report If all the programs in the pipe follow the conventions, then it will be possible to interpose other programs, such as a repetitive element finder, in the middle of the pipe without disturbing other components. =head1 SKELETON BOULDER PROGRAM Here is a skeleton example. #!/bin/perl use Boulder::Stream; my $stream = Boulder::Stream->newFh; while ( my $record = <$stream> ) { next unless $record->Age >= 35; my @friends = $record->Friends; next unless grep {$_ eq 'Fred'} @friends; $record->insert(Eligible => 'yes'); print $stream $record; } The code starts by creating a B object to handle the I/O. It reads from the stream one record at a time, returning a L object. We recover the I and I tags, and continue looping unless the Age is greater or equal to 35, and the list of Friends contains "Fred". If these criteria match, then we insert a new tag named Eligible and print the record to the stream. The output may look like this: Name=Janice Age=36 Eligible=yes Friends=Susan Friends=Fred Friends=Ralph = Name=Ralph Age=42 Eligible=yes Friends=Janice Friends=Fred = Name=Susan Age=35 Eligible=yes Friends=Susan Friends=Fred = Note that in this case only records that meet the criteria are echoed to standard output. The object-oriented version of the program looks like this: #!/bin/perl use Boulder::Stream; my $stream = Boulder::Stream->new; while ( my $record = $stream->get('Age','Friends') ) { next unless $record->Age >= 35; my @friends = $record->Friends; next unless grep {$_ eq 'Fred'} @friends; $record->insert(Eligible => 'yes'); $stream->put($record); } The get() method is used to fetch Stones containing one or more of the indicated tags. The put() method is used to send the result to standard output. The pass-through behavior might produce a set of records like this one: Name=Janice Age=36 Eligible=yes Friends=Susan Friends=Fred Friends=Ralph = Name=Phillip Age=30 = Name=Ralph Age=42 Eligible=yes Friends=Janice Friends=Fred = Name=Barbara Friends=Agatha Friends=Janice = Name=Susan Age=35 Eligible=yes Friends=Susan Friends=Fred = Notice that there are now two records ("Phillip" and "Barbara") that do not contain the Eligible tag. =head1 Boulder::Stream METHODS =head2 $stream = Boulder::Stream->new(*IN,*OUT) =head2 $stream = Boulder::Stream->new(-in=>*IN,-out=>*OUT) The B method creates a new B object. You can provide input and output filehandles. If you leave one or both undefined B will default to standard input or standard output. You are free to use files, pipes, sockets, and other types of file handles. You may provide the filehandle arguments as bare words, globs, or glob refs. You are also free to use the named argument style shown in the second heading. =head2 $fh = Boulder::Stream->newFh(-in=>*IN, -out=>*OUT) Returns a filehandle object tied to a Boulder::Stream object. Reads on the filehandle perform a get(). Writes invoke a put(). To retrieve the underlying Boulder::Stream object, call Perl's built-in tied() function: $stream = tied $fh; =head2 $stone = $stream->get(@taglist) =head2 @stones = $stream->get(@taglist) Every time get() is called, it will return a new Stone object. The Stone will be created from the input stream, using just the tags provided in the argument list. Pass no tags to receive whatever tags are present in the input stream. If none of the tags that you specify are in the current boulder record, you will receive an empty B. At the end of the input stream, you will receive B. If called in an array context, get() returns a list of all stones from the input stream that contain one or more of the specified tags. =head2 $stone = $stream->read_record(@taglist) Identical to get(>, but the name is longer. =head2 $stream->put($stone) Write a B to the output filehandle. =head2 $stream->write_record($stone) Identical to put(), but the name is longer. =head2 Useful State Variables in a B Every Boulder::Stream has several state variables that you can adjust. Fix them in this fashion: $a = new Boulder::Stream; $a->{delim}=':'; $a->{record_start}='['; $a->{record_end}=']'; $a->{passthru}=undef; =over 4 =item * delim This is the delimiter character between tags and values, "=" by default. =item * record_start This is the start of nested record character, "{" by default. =item * record_end This is the end of nested record character, "}" by default. =item * passthru This determines whether unrecognized tags should be passed through from the input stream to the output stream. This is 'true' by default. Set it to undef to override this behavior. =back =head1 BUGS Because the delim, record_start and record_end characters in the B object are used in optimized (once-compiled) pattern matching, you cannot change these values once get() has once been called. To change the defaults, you must create the Boulder::Stream, set the characters, and only then begin reading from the input stream. For the same reason, different Boulder::Stream objects cannot use different delimiters. =head1 AUTHOR Lincoln D. Stein , Cold Spring Harbor Laboratory, Cold Spring Harbor, NY. This module can be used and distributed on the same terms as Perl itself. =head1 SEE ALSO L, L, L, L, L, L, L =cut require 5.004; use strict; use Stone; use Carp; use Symbol(); use vars '$VERSION'; $VERSION=1.07; # Pseudonyms and deprecated methods. *get = \&read_record; *next = \&read_record; *put = \&write_record; # Call this with IN and OUT filehandles of your choice. # If none specified, defaults to <>/STDOUT. sub new { my $package = shift; my ($in,$out) = rearrange(['IN','OUT'],@_); $in = $package->to_fh($in) || \*main::ARGV; $out = $package->to_fh($out,1) || \*main::STDOUT; my $pack = caller; return bless { 'IN'=>$in, 'OUT'=>$out, 'delim'=>'=', 'record_stop'=>"=\n", 'line_end'=>"\n", 'subrec_start'=>"\{", 'subrec_end'=>"\}", 'binary'=>'true', 'passthru'=>'true' },$package; } # You are free to redefine the following magic variables: # $a = new Boulder::Stream; # $a->{delim} separates tag = value ['='] # $a->{line_end} separates tag=value pairs [ newline ] # $a->{record_stop} ends records ["=\n"] # $a->{subrec_start} begins a nested record [ "{" ] # $a->{subrec_end} ends a nested record [ "}" ] # $a->{passthru} if true, passes unread tags -> output [ 'true' ] # $a->{binary} if true, escapes and unescapes records [ 'true' ] # Since escaping/unescaping has some overhead, you might want to undef # 'binary' in order to improve performance. # Read in and return a Rolling Stone record. Will return # undef() when an empty record is hit. You can specify # keys that you are interested in getting, as in the # original boulder package. sub read_one_record { my($self,@keywords) = @_; return if $self->done; my(%interested,$key,$value); grep($interested{$_}++,@keywords); my $out=$self->{OUT}; my $delim=$self->{'delim'}; my $subrec_start=$self->{'subrec_start'}; my $subrec_end=$self->{'subrec_end'}; my ($pebble,$found); # This is a small hack to ensure that we respect the # record delimiters even when we don't make an # intervening record write. if (!$self->{WRITE} && $self->{INVOKED} && !$self->{LEVEL} && $self->{'passthru'} && $self->{PASSED}) { print $out ($self->{'record_stop'}); } else { $self->{INVOKED}++; # keep track of our invocations } undef $self->{WRITE}; undef $self->{PASSED}; my $stone = new Stone(); while (1) { last unless $_ = $self->next_pair; if (/^#/) { print $out ("$_$self->{line_end}") if $self->{'passthru'}; next; } if (/^\s*$delim/o) { undef $self->{LEVEL}; last; } if (/$subrec_end$/o) { $self->{LEVEL}--,last if $self->{LEVEL}; print $out ("$_$self->{line_end}") if $self->{'passthru'}; next; } next unless ($key,$value) = /^\s*(.+?)\s*$delim\s*(.*)/o; if ((!@keywords) || $interested{$key}) { $found++; if ($value=~/^\s*$subrec_start/o) { $self->{LEVEL}++; $pebble = read_one_record($self); # call ourselves recursively $pebble = new Stone() unless defined($pebble); # an empty record is still valid $stone->insert($self->unescapekey($key)=>$pebble); next; } $stone->insert($self->unescapekey($key)=>$self->unescapeval($value)); } elsif ($self->{'passthru'}) { print $out ("$_$self->{line_end}"); $self->{PASSED}++; # flag that we will need to write a record delimiter } } return undef unless $found; return $stone; } # Write out the specified Stone record. sub write_record { my($self,@stone)=@_; for my $stone (@stone) { $self->{'WRITE'}++; my $out=$self->{OUT}; # Write out a Stone record in boulder format. my ($key,$value,@value); foreach $key ($stone->tags) { @value = $stone->get($key); $key = $self->escapekey($key); foreach $value (@value) { next unless ref $value; if (exists $value->{'.name'}) { $value = $self->escapeval($value); print $out ("$key$self->{delim}$value\n"); } else { print $out ("$key$self->{delim}$self->{subrec_start}\n"); _write_nested($self,1,$value); } } } print $out ("$self->{delim}\n"); } 1; } # read_record() returns one stone if called in a scalar # context and all the stones if called in an array # context. sub read_record { my($self,@tags) = @_; if (wantarray) { my(@result,$s); while (!$self->done) { $s = $self->read_one_record(@tags); push(@result,$s) if $s; } return @result; } else { my $s; while (!$self->done) { $s = $self->read_one_record(@tags); return $s if $s; } return undef; } } # ---------------------------------------------------------------- # TIED INTERFACE METHODS # ---------------------------------------------------------------- # newFh() is a class method that returns a tied filehandle # sub newFh { my $class = shift; return unless my $self = $class->new(@_); return $self->fh; } # fh() returns a filehandle that you can read stones from sub fh { my $self = shift; my $class = ref($self) || $self; my $s = Symbol::gensym; tie $$s,$class,$self; return $s; } sub TIEHANDLE { my $class = shift; return bless {stream => shift},$class; } sub READLINE { my $self = shift; return $self->{stream}->read_record(); } sub PRINT { my $self = shift; $self->{stream}->write_record(@_); } #-------------------------------------- # Internal (private) procedures. #-------------------------------------- # This finds an array of key/value pairs and # stashes it where we can find it. sub read_next_rec { my($self) = @_; my($olddelim) = $/; $/="\n".$self->{record_stop}; my($in) = $self->{IN}; my $data = <$in>; chomp($data) if defined $data; if ($in !~ /ARGV/) { $self->{EOF}++ if eof($in); } else { $self->{EOF}++ if eof(); } $/=$olddelim; $self->{PAIRS}=[grep($_,split($self->{'line_end'},$data))] if defined $data; } # This returns TRUE when we've reached the end # of the input stream sub done { my $self = shift; return if defined $self->{PAIRS} && @{$self->{PAIRS}}; return $self->{EOF}; } # This returns the next key/value pair. sub next_pair { my $self = shift; $self->read_next_rec unless $self->{PAIRS}; return unless $self->{PAIRS}; return shift @{$self->{PAIRS}} if @{$self->{PAIRS}}; undef $self->{PAIRS}; return undef; } sub _write_nested { my($self,$level,$stone) = @_; my $indent = ' ' x $level; my($key,$value,@value); my $out = $self->{OUT}; foreach $key ($stone->tags) { @value = $stone->get($key); $key = $self->escapekey($key); foreach $value (@value) { if (exists $value->{'.name'}) { $value = $self->escapeval($value); print $out ($indent,"$key$self->{delim}$value\n"); } else { print $out ($indent,"$key$self->{delim}$self->{subrec_start}\n"); _write_nested($self,$level+1,$value); } } } print $out (' ' x ($level-1),$self->{'subrec_end'},"\n"); } # Escape special characters. sub escapekey { my($s,$toencode)=@_; return $toencode unless $s->{binary}; my $specials=" $s->{delim}$s->{subrec_start}$s->{subrec_end}$s->{line_end}$s->{record_stop}%"; $toencode=~s/([$specials])/uc sprintf("%%%02x",ord($1))/oge; return $toencode; } sub escapeval { my($s,$toencode)=@_; return $toencode unless $s->{binary}; my $specials="$s->{delim}$s->{subrec_start}$s->{subrec_end}$s->{line_end}$s->{record_stop}%"; $toencode=~s/([$specials])/uc sprintf("%%%02x",ord($1))/oge; return $toencode; } # Unescape special characters sub unescapekey { unescape(@_); } sub unescapeval { unescape(@_); } # Unescape special characters sub unescape { my($s,$todecode)=@_; return $todecode unless $s->{binary}; $todecode =~ s/%([0-9a-fA-F]{2})/pack("c",hex($1))/ge; return $todecode; } # utility routine to turn type globs, barewords, IO::File structs, etc into # filehandles. sub to_fh { my ($pack,$thingy,$write) = @_; return unless $thingy; return $thingy if defined fileno($thingy); my $caller; while (my $package = caller(++$caller)) { my $qualified_thingy = Symbol::qualify_to_ref($thingy,$package); return $qualified_thingy if defined fileno($qualified_thingy); } # otherwise try to open it as a file my $fh = Symbol::gensym(); $thingy = ">$thingy" if $write; open ($fh,$thingy) || croak "$pack open of $thingy: $!"; return \*$fh; } sub DESTROY { my $self = shift; my $out=$self->{OUT}; print $out ($self->{'delim'},"\n") if !$self->{WRITE} && $self->{INVOKED} && !$self->{LEVEL} && $self->{'passthru'} && $self->{PASSED}; } ##################################################################### ###################### private routines ############################# sub rearrange { my($order,@param) = @_; return unless @param; my %param; if (ref $param[0] eq 'HASH') { %param = %{$param[0]}; } else { return @param unless (defined($param[0]) && substr($param[0],0,1) eq '-'); my $i; for ($i=0;$i<@param;$i+=2) { $param[$i]=~s/^\-//; # get rid of initial - if present $param[$i]=~tr/a-z/A-Z/; # parameters are upper case } %param = @param; # convert into associative array } my(@return_array); local($^W) = 0; my($key)=''; foreach $key (@$order) { my($value); if (ref($key) eq 'ARRAY') { foreach (@$key) { last if defined($value); $value = $param{$_}; delete $param{$_}; } } else { $value = $param{$key}; delete $param{$key}; } push(@return_array,$value); } push (@return_array,{%param}) if %param; return @return_array; } 1; libboulder-perl-1.30.orig/Boulder/String.pm0100644000175000017500000001072607212727371017362 0ustar jojojojopackage Boulder::String; use Boulder::Stream; @ISA = 'Boulder::Stream'; $DATE="4 Dec 2000"; $VERSION=1.01; =head1 NAME Boulder::String - Read and write tag/value data from a string. =head1 SYNOPSIS #!/bin/perl # Read a B from stdin and create a string that can be # passed to a dumb sub, which doesn't know about Stones. use Boulder::Stream; use Boulder::String; my $stream = Boulder::Stream->newFh; # read a stone from stdin my $record = <$stream> ); print $stream $record; } =head1 DESCRIPTION Boulder::String provides access to L IO hierarchical tag/value data. Stone objects printed to the tied string are appended to the string in L format. =head1 Boulder::Stream METHODS =head2 $stream = Boulder::Stream->new($in_string,\$out_string); The B method creates a new B object. You must provide an input string and a reference to an output string. The input string may be empty. =head2 $stream->write_record($stone) Write the passed stone in L IO format into $out_string. =head1 AUTHOR Lincoln D. Stein , Cold Spring Harbor Laboratory, Cold Spring Harbor, NY. This module can be used and distributed on the same terms as Perl itself. Patches and bug fixes contributed by Bernhard Schmalhofer . =head1 SEE ALSO L, L, L, L, L, L, L, L =cut # Override Stream.pm to allow the input and output to be # strings. If outString is not defined, then we fall back # to Boulder::Stream behavior, otherwise we append to the # indicated string. sub new { my($package,$inString,$outStringRef) = @_; die "Usage: Boulder::String::new(\$inString,\\\$outString)\n" unless defined($inString) && !ref($inString); die "Usage: Boulder::String::new(\$inString,\\\$outString)\n" if defined($outStringRef) && (ref($outStringRef) ne 'SCALAR'); return bless { 'IN'=>undef, 'OUT'=>undef, 'INSTRING'=>$inString, 'OUTSTRING'=>$outStringRef, 'delim'=>'=', 'record_stop'=>"=\n", 'line_end'=>"\n", 'subrec_start'=>"\{", 'subrec_end'=>"\}", 'binary'=>'true', 'passthru'=>undef },$package; } # Write out the specified Stone record. sub write_record { my($self,$stone)=@_; my $out = $self->{OUTSTRING}; return unless $out; $self->{'WRITE'}++; # Write out a Stone record in boulder format. my ($key,$value,@value); foreach $key ($stone->tags) { @value = $stone->get($key); $key = $self->escapekey($key); foreach $value (@value) { next unless ref $value; if (exists $value->{'.name'}) { $value = $self->escapeval($value); $$out .= "$key$self->{delim}$value\n"; } else { $$out .= "$key$self->{delim}$self->{subrec_start}\n"; _write_nested($self,1,$value); } } } ${$self->{OUTSTRING}} .= "$self->{delim}\n"; } #-------------------------------------- # Internal (private) procedures. #-------------------------------------- # This finds an array of key/value pairs and # stashes it where we can find it. sub read_next_rec { my($self) = @_; unless (defined($self->{RECORDS})) { $self->{RECORDS} = [split("\n$self->{record_stop}",$self->{INSTRING})]; } my($nextrec) = shift(@{$self->{RECORDS}}); $self->{PAIRS}=[grep($_,split($self->{'line_end'},$nextrec))]; } # This returns TRUE when we've reached the end # of the input stream sub done { my $self = shift; return undef if @{$self->{PAIRS}}; return undef unless ref($self->{RECORDS}); return !scalar(@{$self->{RECORDS}}); } sub _write_nested { my($self,$level,$stone) = @_; my $indent = ' ' x $level; my($key,$value,@value); my $out = $self->{OUTSTRING}; return unless ref($out); foreach $key ($stone->tags) { @value = $stone->get($key); $key = $self->escapekey($key); foreach $value (@value) { next unless ref $value; if (exists $value->{'.name'}) { $value = $self->escapeval($value); $$out .= "$indent$key$self->{delim}$value\n"; } else { $$out .= "$indent$key$self->{delim}$self->{subrec_start}\n"; _write_nested($self,$level+1,$value); } } } $$out .= (' ' x ($level-1)) . "$self->{'subrec_end'}\n"; } sub DESTROY { my $self = shift; $out=$self->{OUTSTRING}; if (ref($out) && !$self->{WRITE} && $self->{INVOKED} && !$self->{LEVEL} && $self->{'passthru'} && $self->{PASSED}) { $$out .= "$self->{'delim'}\n"; } } 1; libboulder-perl-1.30.orig/Boulder/Swissprot.pm0100644000175000017500000002517107021032676020124 0ustar jojojojopackage Boulder::Swissprot; use Boulder::Stream; =head1 NAME Boulder::SwissProt - Fetch SwissProt data records as parsed Boulder Stones =head1 SYNOPSIS == missing == =head1 DESCRIPTION == missing == =head1 SEE ALSO L, L, L =head1 AUTHOR Lincoln Stein . Luca I.G. Toldo Copyright (c) 1997 Lincoln D. Stein Copyright (c) 1999 Luca I.G. Toldo This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty. =cut require Exporter; @ISA = qw(Exporter Boulder::Stream); @EXPORT = (); @EXPORT_OK = (); use Carp; $VERSION = 1.0; # Hard-coded defaults - must modify for your site use constant YANK => '/usr/local/bin/yank'; use constant DEFAULT_SW_PATH => 'new_seq.dat'; # Genbank entry parsing constants # (may need to adjust!) $KEYCOL=0; $VALUECOL=12; $FEATURECOL=5; $FEATUREVALCOL=21; # new() takes named parameters: # -accessor=> Reference to an object class that will return a series of # Swissprot records. Predefined objects include 'Yank', 'Entrez' and 'File'. # (defaults to 'Entrez'). # -param=> Parameters to pass to the subroutine. Can be a list of accession numbers # or an entrez query. # -out=> Output filehandle. Defaults to STDOUT. # # If you don't use named parameters, then will assume method 'yank' on # a list of accession numbers. # e.g. # $sw = new Boulder::Swissprot(-accessor=>'Yank',-param=>[qw/M57939 M28274 L36028/]); sub new { my($package,@parameters) = @_; # superclass constructor my($self) = new Boulder::Stream; # figure out whether parameters are named. Look for # an initial '-' if ($parameters[0]=~/^-/) { my(%parameters) = @parameters; $self->{'accessor'}=$parameters{'-accessor'} || 'Entrez'; $self->{'param'}=$parameters{'-param'}; $self->{'OUT'}=$parameters{'-out'} || 'main::STDOUT'; } else { $self->{'accessor'}='Yank'; $self->{'param'}=[@parameters]; } croak "Require parameters" unless defined($self->{'param'}); $self->{'accessor'} = new {$self->{'accessor'}}($self->{'param'}); return bless $self,$package; } sub read_record { my($self,@tags) = @_; my($s); if (wantarray) { my(@result); while (!$self->{'done'}) { $s = $self->read_one_record(@tags); next unless $s; next if $query && !(&$query); push(@result,$s); } return @result; } # we get here if in a scalar context while (!$self->{'done'}) { $s = $self->read_one_record(@tags); next unless $s; return $s unless $query; return $s if &$query; } return undef; } sub parse { my $self = shift; my $record = shift; return unless $record; my $tags = shift; my %ok; %ok = map {$_ => 1} @$tags if ref($tags) eq 'ARRAY'; my($s,@lines,$line,$accumulated,$key,$keyword,$value,$feature,@features); $s = new Stone; @lines = split("\n",$record); foreach $line (@lines) { # special case for the sequence itself if ($line=~/^SQ/) { $self->_addToStone($key,$accumulated,$s,\%ok) if $key; last; } if ($line=~/^ID /) { ($key,$id)=split(/\s+/,$line); $self->_addToStone('Identifier',$id,$s,\%ok); next; } elsif ($line =~/^AC /) { ($key,$acc)=split(/\s+/,$line); $acc=~s/\;//g; $self->_addToStone('Accession',$acc,$s,\%ok); next; } elsif ($line =~/^DE /) { ($key,$des)=split(/\s+/,$line); $des=~s/\;//g; $self->_addToStone('Description',$des,$s,\%ok); next; } elsif ($line =~/^OS /) { ($key,$os)=split(/\s+/,$os); $os=~s/\;//g; $self->_addToStone('Organism',$os,$s,\%ok); next; } elsif ($line =~/^GN /) { ($key,$gn)=split(/\s+/,$line); $gn=~s/\.//g; $self->_addToStone('Gene_name',$gn,$s,\%ok); next; } elsif ($line=~/^CC -!- FUNCTION:/) { } elsif ($line=~/^CC -!- SUBCELLULAR LOCATION:/) { } elsif ($line=~/^CC -!- SUBUNIT:/) { } elsif ($line=~/^CC -!- SIMILARITY: :/) { } } ($sequence)=$record=~/\nSQ.*\n([\s\S]+)/; $sequence=~s/[\s0-9-]+//g; # remove white space $self->_addToStone('Sequence',$sequence,$s,\%ok); return $s; } sub read_one_record { my($self,@tags) = @_; my(%ok); my $accessor = $self->{'accessor'}; my $record = $accessor->fetch_next(); unless ($record) { $self->{'done'}++; return undef; } return $self->parse($record,\@tags); } sub _trim { my($v) = @_; $v=~s/^\s+//; $v=~s/\s+$//; return $v; } sub _canonicalize { my $h = shift; substr($h,0)=~tr/a-z/A-Z/; substr($h,1,length($h)-1)=~tr/A-Z/a-z/; $h; } sub _addToStone { my($self,$label,$value,$stone,$ok) = @_; return unless !%{$ok} || $ok->{$label}; $stone->insert(_canonicalize($label),$value); } sub _addFeaturesToStone { my($self,$features,$basecount,$stone,$ok) = @_; # first add the basecount if (!%{$ok} || $ok->{'BASECOUNT'}) { my(%counts) = $basecount=~/(\d+)\s+([gatcGATC])/g; %counts = reverse %counts; $stone->insert('Basecount',new Stone(%counts)); } if (!%{$ok} || $ok->{'FEATURES'}) { # now add the features my($f) = new Stone; foreach (@$features) { my($q) = $_->{'value'}; my($label) = _canonicalize($_->{'label'}); my($position) = $q=~m!^([^/]+)!; my @qualifiers = $q=~m!/(\w+)=([^/]+)!g; my %qualifiers; while (my($key,$value) = splice(@qualifiers,0,2)) { $value =~ s/^\s*\"//; $value =~s/\"\s*$//; $value=~s/\s+//g if uc($key) eq 'TRANSLATION'; # get rid of spaces in protein translation $qualifiers{_canonicalize($key)} = $value; } $f->insert($label=>new Stone('Position'=>$position,%qualifiers)); } $stone->insert('Features',$f); } } # ---------------------------------------------------------------------------------------- # -------------------------- DEFINITION OF ACCESSOR OBJECTS ------------------------------ package SwissprotAccessor; use Carp; sub new { my($class,@parameters) = @_; croak "SwissprotAccessor::new: Abstract class\n"; } sub fetch_next { my($self) = @_; croak "SwissprotAccessor::fetch_next: Abstract class\n"; } sub DESTROY { } package Yank; use Carp; @ISA=qw(SwissprotAccessor); $YANK = Boulder::Swissprot::YANK(); sub new { my($package,$param) = @_; croak "Yank::new(): need at least one Swissprot acccession number" unless $param; croak "Yank::new(): yank executable not found" unless -x $YANK; my (@accession) = ref($param) eq 'ARRAY' ? @$param : $param; my($tmpfile) = "/usr/tmp/yank$$"; open (TMP,">$tmpfile") || croak "Yank::new(): couldn't open tmpfile $tmpfile for write: $!"; print TMP join("\n",@accession),"\n"; close TMP; open(YANK,"$YANK < $tmpfile |") || croak "Yank::new(): couldn't open pipe from yank: $!"; return bless {'tmpfile'=>$tmpfile,'fh'=>YANK},$package; } sub fetch_next { my($self) = @_; return undef unless $self->{'fh'}; local($/) = "//\n"; my($line); my($fh) = $self->{'fh'}; chomp($line = <$fh>); return $line; } sub DESTROY { my($self) = shift; close $self->{'fh'} if $self->{'fh'}; unlink $self->{'tmpfile'} if $self->{'tmpfile'} } package File; use Carp; @ISA=qw(SwissprotAccessor); $DEFAULT_PATH = Boulder::Swissprot::DEFAULT_SW_PATH(); sub new { my($package,$path) = @_; $path = $DEFAULT_PATH unless $path; open (SW,$path) or croak "File::new(): couldn't open $path: $!"; # read the junk at the beginning my $found; $_ = ; return bless {'fh'=>SW},$package; } sub fetch_next { my $self = shift; return undef unless $self->{'fh'}; local($/)="//\n"; my($line); my($fh) = $self->{'fh'}; chomp($line = <$fh>); return $line; } package Entrez; use Carp; use IO::Socket; use constant HOST => 'www.ncbi.nlm.nih.gov'; use constant URI => '/htbin-post/Entrez/query?form=6&Dopt=g&html=no'; use constant PROTO => 'HTTP/1.0'; use constant CRLF => "\r\n"; @ISA=qw(SwissprotAccessor); sub new { my($package,$param) = @_; croak "Entrez::new(): usage [list of accession numbers] or {args => values}" unless $param; my $self = {}; $self->{query} = $param unless ref($param); $self->{accession} = $param if ref($param) eq 'ARRAY'; %$self = map { s/^-//; $_; } %$param if ref($param) eq 'HASH'; $self->{query} || $self->{accession} || croak "Must provide a 'query' or 'accession' argument"; $self->{max} ||= 100; $self->{'db'} ||= 'n'; return bless $self,$package; } sub fetch_next { my $self = shift; # if any additional records are left, then return them if (@{$self->{'records'}}) { my $data = shift @{$self->{'records'}}; if ($data=~/\S/) { $self->_cleanup(\$data); return $data; } else { $self->{'records'} = []; } } # if we have a socket open, then read a record if ($self->{'socket'}) { my $data = $self->{'socket'}->getline; $self->_cleanup(\$data); return $data; } # otherwise if we are reading from a series of accession numbers, # do a one-time fetch if (exists $self->{'accession'}) { my $accession = shift @{$self->{'accession'}}; return unless $accession; my $sock = $self->_request(URI . "&db=$self->{db}&uid=$accession"); return unless $sock; @{$self->{'records'}} = $sock->getlines; my $data = shift @{$self->{'records'}}; $self->_cleanup(\$data); return $data; } # Otherwise we are running a query. Need to set up the socket $self->{query} =~ s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/eg; my $search = URI . "&db=$self->{db}&dispmax=$self->{max}&term=$self->{query}"; $search .= "&relpubdate=$self->{age}" if $self->{age} > 0; $self->{'socket'} = $self->_request($search); return unless $self->{'socket'}; my $data = $self->{'socket'}->getline; $self->_cleanup(\$data); return $data; } sub _cleanup { my ($self,$d) = @_; $$d =~ s/\A\s+//; $$d=~s!//\n$!!; } sub _request { my $self = shift; my $uri = shift; my $sock = IO::Socket::INET->new( PeerAddr => HOST, PeerPort => 'http(80)', Proto => 'tcp' ); return unless $sock; print $sock "GET $uri ",PROTO,CRLF,CRLF; $sock->input_record_separator( CRLF . CRLF); my $header = $sock->getline; return unless $header; return unless $header =~ /^HTTP\/[\d.]+ 200/; # read until we get to the '----' line $sock->input_record_separator("\n"); while ($_ = $sock->getline) { return undef if /^ERROR/; if (/^------/) { $sock->input_record_separator("//\n"); return $sock; } } return; } 1; __END__ libboulder-perl-1.30.orig/Boulder/Unigene.pm0100644000175000017500000003206507041144741017500 0ustar jojojojopackage Boulder::Unigene; # use Boulder::Stream; require Exporter; @ISA = qw(Exporter Boulder::Stream); @EXPORT = (); @EXPORT_OK = (); use Carp; $VERSION=1.0; use constant DEFAULT_UNIGENE_PATH => '/data/unigene/Hs.dat'; =head1 NAME Boulder::Unigene - Fetch Unigene data records as parsed Boulder Stones =head1 SYNOPSIS # parse a file of Unigene records $ug = new Boulder::Unigene(-accessor=>'File', -param => '/data/unigene/Hs.dat'); while (my $s = $ug->get) { print $s->Identifier; print $s->Gene; } # parse flatfile records yourself open (UG,"/data/unigene/Hs.dat"); local $/ = "*RECORD*"; while () { my $s = Boulder::Unigene->parse($_); # etc. } =head1 DESCRIPTION Boulder::Unigene provides retrieval and parsing services for UNIGENE records Boulder::Unigene provides retrieval and parsing services for NCBI Unigene records. It returns Unigene entries in L format, allowing easy access to the various fields and values. Boulder::Unigene is a descendent of Boulder::Stream, and provides a stream-like interface to a series of Stone objects. Access to Unigene is provided by one I, which give access to local Unigene database. When you create a new Boulder::Unigene stream, you provide the accessors, along with accessor-specific parameters that control what entries to fetch. The accessors is: =over 2 =item File This provides access to local Unigene entries by reading from a flat file (typically Hs.dat file downloadable from NCBI's Ftp site). The stream will return a Stone corresponding to each of the entries in the file, starting from the top of the file and working downward. The parameter is the path to the local file. =back It is also possible to parse a single Unigene entry from a text string stored in a scalar variable, returning a Stone object. =head2 Boulder::Unigene methods This section lists the public methods that the I class makes available. =over 4 =item new() # Local fetch via File $ug=new Boulder::Unigene(-accessor => 'File', -param => '/data/unigene/Hs.dat'); The new() method creates a new I stream on the accessor provided. The only possible accessors is B. If successful, the method returns the stream object. Otherwise it returns undef. new() takes the following arguments: -accessor Name of the accessor to use -param Parameters to pass to the accessor Specify the accessor to use with the B<-accessor> argument. If not specified, it defaults to B. B<-param> is an accessor-specific argument. The possibilities is: For B, the B<-param> argument must point to a string-valued scalar, which will be interpreted as the path to the file to read Unigene entries from. =item get() The get() method is inherited from I, and simply returns the next parsed Unigene Stone, or undef if there is nothing more to fetch. It has the same semantics as the parent class, including the ability to restrict access to certain top-level tags. =item put() The put() method is inherited from the parent Boulder::Stream class, and will write the passed Stone to standard output in Boulder format. This means that it is currently not possible to write a Boulder::Unigene object back into Unigene flatfile form. =back =head1 OUTPUT TAGS The tags returned by the parsing operation are taken from the names shown in the Flat file Hs.dat since no better description of them is provided yet by the database source producer. =head2 Top-Level Tags These are tags that appear at the top level of the parsed Unigene entry. =over 4 =item Identifier The Unigene identifier of this entry. Identifier is a single-value tag. Example: my $identifierNo = $s->Identifier; =item Title The Unigene title for this entry. Example: my $titledef=$s->Title; =item Gene The Gene associated with this Unigene entry Example: my $thegene=$s->Gene; =item Cytoband The cytological band position of this entry Example: my $thecytoband=$s->Cytoband; =item Counts The number of EST in this record Example: my $thecounts=$s->Counts; =item LocusLink The id of the LocusLink entry associated with this record Example: my $thelocuslink=$s->LocusLink; =item Chromosome This field contains a list, of the chromosomes numbers in which this entry has been linked Example: my @theChromosome=$s->Chromosome; =back =head2 STS Multiple records in the form ^STS ACC=XXXXXX NAME=YYYYYY =over 4 =item ACC =item NAME =back =head2 TXMAP Multiple records in the form ^TXMAP XXXXXXX; MARKER=YYYYY; RHPANEL=ZZZZ The TXMAP tag points to a Stone record that contains multiple subtags. Each subtag is the name of a feature which points, in turn, to a Stone that describes the feature's location and other attributes. Each feature will contain one or more of the following subtags: =over 4 =item MARKER =item RHPANEL =back =head2 PROTSIM Multiple records in the form ^PROTSIM ORG=XXX; PROTID=DBID:YYY; PCT=ZZZ; ALN=QQQQ Where DBID is PID for indicate presence of GenPept identifier, SP to indicate SWISSPROT identifier, PIR to indicate PIR identifier, PRF to indicate ??? =over 4 =item ORG =item PROTID =item PCT =item ALN =back =head2 SEQUENCE Multiple records in the form ^SEQUENCE ACC=XXX; NID=YYYY; PID = CLONE= END= LID= =over =item ACC =item NID =item PID =item CLONE =item END =item LID =back =head1 SEE ALSO L, L, L =head1 AUTHOR Lincoln Stein . Luca I.G. Toldo Copyright (c) 1997 Lincoln D. Stein Copyright (c) 1999 Luca I.G. Toldo This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty. =cut # # Following did not require any changes compared to Genbank.pm # sub new { my($package,@parameters) = @_; # superclass constructor my($self) = new Boulder::Stream; # figure out whether parameters are named. Look for # an initial '-' if ($parameters[0]=~/^-/) { my(%parameters) = @parameters; $self->{'accessor'}=$parameters{'-accessor'} || 'File'; $self->{'param'}=$parameters{'-param'}; $self->{'OUT'}=$parameters{'-out'} || 'main::STDOUT'; } else { $self->{'accessor'}='File'; $self->{'param'}=[@parameters]; } croak "Require parameters" unless defined($self->{'param'}); $self->{'accessor'} = new {$self->{'accessor'}}($self->{'param'}); return bless $self,$package; } # # Following required no changes compared to Genbank.pm # sub read_record { my($self,@tags) = @_; my($s); if (wantarray) { my(@result); while (!$self->{'done'}) { $s = $self->read_one_record(@tags); next unless $s; next if $query && !(&$query); push(@result,$s); } return @result; } # we get here if in a scalar context while (!$self->{'done'}) { $s = $self->read_one_record(@tags); next unless $s; return $s unless $query; return $s if &$query; } return undef; } # # Here is everything new # sub parse { my $self = shift; my $record = shift; return unless $record; my $tags = shift; my %ok; %ok = map {$_ => 1} @$tags if ref($tags) eq 'ARRAY'; my($s,@lines,$line,$accumulated,$key,$keyword,$value,$feature,@features, $label); $s = new Stone; # following this line the parsing of the record must be done # each key-value pair is stored by the following command: # $self->_addToStone($key,$value,$stone,\%ok); # # Process new record lines # # (@recordlines)=split(/\n/,$record); undef $unigeneid, $title, $gene,$cytoband, $locuslink, $chromosome, $scount; undef $sts, $txmap,$protsim,$sequence; undef @sts,@txmaps,@protsims,@sequences; foreach $line (@recordlines) { if ($line=~/^ID/) { ($key,$unigeneid)=split(/\s+/,$line); $self->_addToStone('Identifier',$unigeneid,$s,\%ok); } elsif ($line=~/^TITLE/) { (@titles)=split(/\s+/,$line); shift @titles; $title=join(' ',@titles); $self->_addToStone('Title',$title,$s,\%ok); } elsif ($line=~/^GENE/) { ($key,$gene)=split(/\s+/,$line); $self->_addToStone('Gene',$gene,$s,\%ok); } elsif ($line=~/^CYTOBAND/) { ($key,$cytoband)=split(/\s+/,$line); $self->_addToStone('Cytoband',$cytoband,$s,\%ok); } elsif ($line=~/^LOCUSLINK/) { ($key,$locuslink)=split(/\s+/,$line); $self->_addToStone('Locuslink',$locuslink,$s,\%ok); } elsif ($line=~/^CHROMOSOME/) { ($key,$chromosome)=split(/\s+/,$line); $self->_addToStone('Chromosome',$chromosome,$s,\%ok); } elsif ($line=~/^SCOUNT/) { ($key,$scount)=split(/\s+/,$line); $self->_addToStone('Scount',$scount,$s,\%ok); } elsif ($line=~/^STS/) { #STS ACC=XXX; NAME=YYY; (@sts)=split(/\s+/,$line); shift @sts; $sts=join(' ',@sts); ($tmpacc,$tmpname)=split(/\s+/,$sts); ($jnk,$acc)=split(/\=/,$tmpacc); ($jnk,$name)=split(/\=/,$tmpname); undef @features; $featurelabel="Accession"; $featurevalue=$name; $feature = {'label'=>$featurelabel,'value'=>$featurevalue}; push(@features,$feature); $featurelabel="Name"; $feature = {'label'=>$featurelabel,'value'=>$featurevalue}; push(@features,$feature); $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok); } elsif ($line=~/^TXMAP/) { #TXMAP XXX; MARKER=YYY; RHPANEL=ZZZ; (@txmaps)=split(/\s+/,$line); shift @txmaps; $txmap=join(' ',@txmaps); # $self->_addToStone('TXMAP',$txmap,$s,\%ok); undef @features; $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok); } elsif ($line=~/^PROTSIM/) { #PROTSIM ORG=QQQ; PROTID=RRR; PCT=SSSS; ALN=TTTT; (@protsims)=split(/\s+/,$line); shift @protsims; $protsim=join(' ',@protsims); # $self->_addToStone('PROTSIM',$protsim,$s,\%ok); undef @features; $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok); } elsif ($line=~/^SEQUENCE/) { #SEQUENCE ACC=XXXX; NID=YYYY; PID=RRRRR; CLONE=QQQ; END=PPPP; LID=ZZZZ; (@sequences)=split(/\s+/,$line); shift @sequences; $sequence=join(' ',@sequences); # $self->_addToStone('SEQUENCE',$sequence,$s,\%ok); undef @features; $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok); } } # return $s; } # # Following is unchanged from Genbank.pm # sub read_one_record { my($self,@tags) = @_; my(%ok); my $accessor = $self->{'accessor'}; my $record = $accessor->fetch_next(); unless ($record) { $self->{'done'}++; return undef; } return $self->parse($record,\@tags); } # # Following is unchanged from Genbank.pm # sub _trim { my($v) = @_; $v=~s/^\s+//; $v=~s/\s+$//; return $v; } # # Following is unchanged from Genbank.pm # sub _canonicalize { my $h = shift; substr($h,0)=~tr/a-z/A-Z/; substr($h,1,length($h)-1)=~tr/A-Z/a-z/; $h; } # # Following is unchanged from Genbank.pm # sub _addToStone { my($self,$xlabel,$value,$stone,$ok) = @_; return unless !%{$ok} || $ok->{$xlabel}; $stone->insert(_canonicalize($xlabel),$value); } # # Following is entirely rewritten # sub _addFeaturesToStone { my($self,$features,$basecount,$stone,$ok) = @_; my($f) = new Stone; foreach (@$features) { my($q) = $_->{'value'}; my($label) = _canonicalize($_->{'label'}); my($position) = $q=~m!^([^/]+)!; my @qualifiers = $q=~m!/(\w+)=([^/]+)!g; my %qualifiers; while (my($key,$value) = splice(@qualifiers,0,2)) { $value =~ s/^\s*\"//; $value =~s/\"\s*$//; $value=~s/\s+//g if uc($key) eq 'TRANSLATION'; $qualifiers{_canonicalize($key)} = $value; } $f->insert($label=>new Stone('Position'=>$position,%qualifiers)); } $stone->insert('Features',$f); } # -------------------------- DEFINITION OF ACCESSOR OBJECTS ------------------------------ # #only name changes for avoid namespace collisions # package UnigeneAccessor; use Carp; sub new { my($class,@parameters) = @_; croak "UnigeneAccessor::new: Abstract class\n"; } sub fetch_next { my($self) = @_; croak "UnigeneAccessor::fetch_next: Abstract class\n"; } sub DESTROY { } # # Following, only the File package since the only one supported. # If other access methods must be supported, then here appropriate # packages and methods must be implemented # package File; use Carp; @ISA=qw(UnigeneAccessor); $DEFAULT_PATH = Boulder::Unigene::DEFAULT_UNIGENE_PATH(); # # Following, removed the search for the string locus in the file # as validation that the input be compliant with parser # sub new { my($package,$path) = @_; $path = $DEFAULT_PATH unless $path; open (UG,$path) or croak "File::new(): couldn't open $path: $!"; # read the junk at the beginning my $found; $found++; croak "File::new(): $path doesn't look like a Unigene flat file" unless $found; $_ = ; return bless {'fh'=>UG},$package; } # # Following, changed the record separator # sub fetch_next { my $self = shift; return undef unless $self->{'fh'}; local($/)="//\n"; my($line); my($fh) = $self->{'fh'}; chomp($line = <$fh>); return $line; } 1; __END__ libboulder-perl-1.30.orig/Boulder/XML.pm0100644000175000017500000001763407021036075016551 0ustar jojojojo# $Id: XML.pm,v 1.2 1999/11/30 21:06:05 lstein Exp $ # Boulder::XML # # XML input/output for Stone objects package Boulder::XML; =head1 NAME Boulder::XML - XML format input/output for Boulder streams =head1 SYNOPSIS use Boulder::XML; $stream = Boulder::XML->newFh; while ($stone = <$stream>) { print $stream $stone; } =head1 DESCRIPTION Boulder::XML generates BoulderIO streams from XML files and/or streams. It is also able to output Boulder Stones in XML format. Its semantics are similar to those of Boulder::Stream, except that there is never any pass-through behavior. Because XML was not designed for streaming, some care must be taken when reading an XML document into a series of Stones. Consider this XML document: Lincoln Stein Jean Siao September 29, 1999 1999 Lincoln Stein This is the abstract. It is not anything very fancy, but it will do. Fitchberg J Journal of Irreproducible Results 23 1998 Clemenson V Ecumenica 10 1968 Ruggles M Journal of Aesthetic Surgery 10 1999 Ordinarily the document will be construed as a single Paper tag containing subtags Author, Date, Copyright, Abstract, and so on. However it might be desirable to fetch out just the citation tags as a series of Stones. In this case, you can declare Citation to be the top level tag by passing the B<-tag> argument to new(). Now calling get() will return each of the three Citation sections in turn. If no tag is explicitly declared to be the top level tag, then Boulder::XML will take the first tag it sees in the document. It is possible to stream XML files. You can either separate them into separate documents and use the automatic ARGV processing features of the BoulderIO library, or separate the XML documents using a B string similar to the delimiters used in MIME multipart documents. By default, BoulderIO uses a delimiter of E!--Boulder::XML--E. B Instead, it is a way to represent BoulderIO tag/value streams in XML format. The module uses XML::Parser to parse the XML streams, and therefore any syntactic error in the stream can cause the XML parser to quit with an error. Another thing to be aware of is that there are certain XML constructions that will not translate into BoulderIO format, specifically free text that contains embedded tags. This is OK: Jean Siao but this is not: The extremely illustrious Jean Siao In BoulderIO format, tags can contain other tags or text, but cannot contain a mixture of tags and text. =head2 CONSTRUCTORS =over 4 =item $stream = Boulder::XML->new(*IN,*OUT); =item $stream = Boulder::XML->new(-in=>*IN,-out=>*OUT,-tag=>$tag,-delim=>$delim,-strip=>$strip) new() creates a new Boulder::XML stream that can be read from or written to. All arguments are optional. -in Filehandle to read from. If a file name is provided, will open the file. Defaults to the magic <> filehandle. -out Filehandle to write to. If a file name is provided, will open the file for writing. Defaults to STDOUT -tag The top-level XML tag to consider as the Stone record. Defaults to the first tag seen when reading from an XML file, or to EStoneE when writing to an output stream without previously having read. -delim Delimiter to use for delimiting multiple Stone objects in an XML stream. -strip If true, automatically strips leading and trailing whitespace from text contained within tags. =item $fh = Boulder::XML->newFh(*IN,*OUT); =item $fh = Boulder::XML->newFh(-in=>*IN,-out=>*OUT,-tag=>$tag,-delim=>$delim,-strip=>$strip) The newFh() constructor creates a tied filehandle that can read and write Boulder::XML streams. Invoking <> on the filehandle will perform a get(), returning a Stone object. Calling print() on the filehandle will perform a put(), writing a Stone object to output in XML format. =back =head2 METHODS =over 4 =item $stone = $stream->get() =item $stream->put($stone) =item $done = $stream->done All these methods have the same semantics as the similar methods in L, except that pass-through behavior doesn't apply. =back =head1 AUTHOR Lincoln D. Stein , Cold Spring Harbor Laboratory, Cold Spring Harbor, NY. This module can be used and distributed on the same terms as Perl itself. =head1 SEE ALSO L, L, L =cut use Boulder::Stream; use Stone; use XML::Parser; use strict; use vars qw(@ISA); @ISA = 'Boulder::Stream'; *rearrange = \&Boulder::Stream::rearrange; *put = \&write_record; sub new { my $package = shift; my($in,$out,$tag,$delim,$strip) = rearrange(['IN','OUT','TAG','DELIM','STRIP'],@_); my $self = bless { 'top_level' => $tag, 'delim' => $delim || '', 'strip' => $strip, 'in' => Boulder::Stream->to_fh($in) || \*ARGV, 'out' => Boulder::Stream->to_fh($out,1) || \*STDOUT, },$package; my $parser = XML::Parser->new( ErrorContext => 2, Stream_Delimiter => $self->{delim}, ); @ARGV = ('-') if $self->{in} == \*ARGV and !@ARGV; $parser->setHandlers( Start => sub { $self->_start(@_) }, Default => sub { $self->_default(@_) }, End => sub { $self->_end(@_) } ); $self->{'parser'} = $parser; return $self; } sub read_one_record { my ($self,@tags) = @_; return shift @{$self->{stones}} if $self->{stones} && @{$self->{stones}}; my $fh = $self->magic_file_open || return; $self->{parser}->parse($fh); return shift @{$self->{stones}}; } sub write_record { my $self = shift; my @stone = @_; my $out = $self->{out}; print $out $self->{delim},"\n" if $self->{printed}++; print $out qq(\n\n); for my $stone (@stone) { next unless ref $stone && $stone->can('asXML'); print $out $stone->asXML($self->{top_level}); } } sub magic_file_open { my $self = shift; my $fh = $self->{in}; return $fh unless $fh == \*main::ARGV; return $fh unless eof $fh; return unless my $a = shift @ARGV; open $fh,$a or die "$a: $!"; return $fh; } sub done { my $self = shift; return if defined $self->{stones} && @{$self->{stones}}; return eof $self->{in} if $self->{in} != \*main::ARGV; return $self->{in} && eof $self->{in} && !@ARGV; } sub _default { my ($self,$p, $string) = @_; return unless $string=~/\S/; if ($self->{'strip'}) { # strip leading whitespace $string =~ s/^\s+//; $string =~ s/\s+$//; } return unless $self->{stack} && @{$self->{stack}}; my $stone = $self->{stack}[-1]; my $current = $stone->name(); $current .= $string; $stone->name($current); } sub _start { my ($self,$p, $element, %attributes) = @_; $self->{top_level} ||= $element; if ($element eq $self->{top_level}) { $self->{stack} = [$self->{stone} = new Stone]; # empty stone $self->{stone}->attributes(\%attributes) if %attributes; return; } return unless $self->{stack}[-1]; my $s = new Stone; $self->{stack}[-1]->insert($element => $s); push(@{$self->{stack}},$s); $s->attributes(\%attributes) if %attributes; } sub _end { my ($self,$p, $element) = @_; pop @{$self->{stack}}; if ( $element eq $self->{top_level} ) { push @{$self->{stones}},$self->{stone}; delete $self->{stone}; delete $self->{stack}; } } # End end 1; libboulder-perl-1.30.orig/Boulder.pod0100644000175000017500000001435307123725441016257 0ustar jojojojo=head1 NAME Boulder - An API for hierarchical tag/value structures =head1 SYNOPSIS # Read a series of People records from STDIN. # Add an "Eligibility" attribute to all those whose # Age >= 35 and Friends list includes "Fred" use Boulder::Stream; my $stream = Boulder::Stream->newFh; while ( my $record = <$stream> ) { next unless $record->Age >= 35; my @friends = $record->Friends; next unless grep {$_ eq 'Fred'} @friends; $record->insert(Eligibility => 'yes'); print $stream $record; } Related manual pages: basics ------ Stone hierarchical tag/value records Stone::Cursor Traverse a hierarchy Boulder::Stream stream-oriented storage for Stones Boulder::Store record-oriented storage for Stones Boulder::XML XML conversion for Stones Boulder::String conversion to strings genome-related --------------- Boulder::Genbank parse Genbank (DNA sequence) records Boulder::Blast parse BLAST (basic local alignment search tool) reports Boulder::Medline parse Medline (pubmed) records Boulder::Omim parse OMIM (online Mendelian inheritance in man) records Boulder::Swissprot parse Swissprot records Boulder::Unigene parse Unigene records =head1 DESCRIPTION =head2 Boulder IO Boulder IO is a simple TAG=VALUE data format designed for sharing data between programs connected via a pipe. It is also simple enough to use as a common data exchange format between databases, Web pages, and other data representations. The basic data format is very simple. It consists of a series of TAG=VALUE pairs separated by newlines. It is record-oriented. The end of a record is indicated by an empty delimiter alone on a line. The delimiter is "=" by default, but can be adjusted by the user. An example boulder stream looks like this: Name=Lincoln Stein Home=/u/bush202/lds32 Organization=Cold Spring Harbor Laboratory Login=lds32 Password_age=20 Password_expires=60 Alias=lstein Alias=steinl = Name=Leigh Deacon Home=/u/bush202/tanager Organization=Cold Spring Harbor Laboratory Login=tanager Password_age=2 Password_expires=60 = Notes: =over 4 =item (1) There is no need for all tags to appear in all records, or indeed for all the records to be homogeneous. =item (2) Multiple values are allowed, as with the Alias tag in the second record. =item (3) Lines can be any length, as in a potential 40 Kbp DNA sequence entry. =item (4) Tags can be any alphanumeric character (upper or lower case) and may contain embedded spaces. Conventionally we use the characters A-Z0-9_, because they can be used without single quoting as keys in Perl associative arrays, but this is merely stylistic. Values can be any character at all except for the reserved characters {}=% and newline. You can incorporate binary data into the data stream by escaping these characters in the URL manner, using a % sign followed by the (capitalized) hexadecimal code for the character. The module makes this automatic. =back =head2 Hierarchical Records The simple boulder format can be extended to accomodate nested relations and other intresting structures. Nested records can be created in this way: Name=Lincoln Stein Home=/u/bush202/lds32 Organization=Cold Spring Harbor Laboratory Login=lds32 Password_age=20 Password_expires=60 Privileges={ ChangePasswd=yes CronJobs=yes Reboot=yes Shutdown=no } = Name=Leigh Deacon Home=/u/bush202/tanager Organization=Cold Spring Harbor Laboratory Login=tanager Password_age=2 Password_expires=60 Privileges={ ChangePasswd=yes CronJobs=no Reboot=no Shutdown=no } = As in the original format, tags may be multivalued. For example, there might be several Privilege record assigned to a login account. Each subrecord may contain further subrecords. Within the program, a hierarchical record is encapsulated within a "Stone", an opaque structure that implements methods for fetching and settings its various tags. =head2 Using Boulder for I/O The Boulder API was designed to make reading and writing of complex hierarchical records almost as easy as reading and writing single lines of text. =over 4 =item Boulder::Stream The main component of the Boulder modules is Boulder::Stream, which provides a stream-oriented view of the data. You can read and write to Boulder::Streams via tied filehandles, or via method calls. Data records are flattened into a simple format called "boulderio" format. =item Boulder::XML Boulder::XML acts like Boulder::Stream, but the serialization format is XML. You need XML::Parser installed to use this module. =item Boulder::Store This is a simple persistent storage class which allows you to store several (thousand) Stone's into a DB_File database. You must have libdb and the Perl DB_File extensions installed in order to take advantage of this class. =item Boulder::Genbank =item Boulder::Unigene =item Boulder::OMIM =item Boulder::Blast =item Boulder::Medline =item Boulder::SwissProt These are parsers and accessors for various biological data sources. They act like Boulder::Stream, but return a set of Stone objects that have certain prescribed tags and values. Many of these modules were written by Luca I.G. Toldo . =back =head2 Stone Objects The Stone object encapsulates a set of tags and values. Any tag can be single- or multivalued, and tags are allowed to contain subtags to any depth. A simple set of methods named tags(), get(), put(), insert(), replace() and so forth, allows you to examine the tags that are available, get and set their values, and search for particular tags. In addition, an autoload mechanism allows you to use method calls to access tags, for example: my @friends = $record->Friends; is equivalent to: my @friends = $record->get('Friends'); A Stone::Cursor class allows you to traverse Stones systematically. A full explanation of the Stone class can be found in its manual page. =head1 AUTHOR Lincoln D. Stein , Cold Spring Harbor Laboratory, Cold Spring Harbor, NY. This module can be used and distributed on the same terms as Perl itself. =head1 SEE ALSO L, L, L, L, L, L =cut libboulder-perl-1.30.orig/ChangeLog0100644000175000017500000000416207777564337015752 0ustar jojojojo * (version 1.28) * Boulder::Store fixes from Aaron Wangberg Mon Feb 4 17:52:03 EST 2002 * (Version 1.27) * Batch entrez queries work again using semi-published, semi-private interface The problem with the published interface is there's no way to pass a list of Genbank ACCESSION numbers, just GI numbers. So I have to use the XML demo. * Patches from Michael Peterson to fix BLAST parsing. Thu Jan 24 14:21:02 EST 2002 * (Version 1.26) * Partial fix for Entrez retrieval, but Entrez queries don't work :-( 2001-12-28 Lincoln Stein * (version 1.25) * fixed GB_Stone bugs that prevented parsing of some of the feature fields * fixed Boulder/Genbank bugs that prevented parsing of some newer genbank entries that use the ORIGIN field. * fixed problems parsing genomic entries that contain gaps. 2001-06-14 Lincoln Stein * (version 1.24) * Fixed bugs introduced into 1.23 version when parsing Boulder::Genbank records from Entrez 2001-06-13 Lincoln Stein * (version 1.23) * Boulder::Genbank now correctly handles sequences that use whitespace to represent gaps 2001-06-11 Lincoln Stein * (version 1.22) * now reports BatchEntry errors to standard error * patch from Will Fitzhugh to correct problems with Yank accessor 2001-03-09 Lincoln Stein * (version 1.21) * fixed loop termination problem in Boulder::Genbank 2000-07-13 Lincoln Stein * (version 1.20) * Fixes to Boulder::Genbank provided by Lester Hui. Now handles repeated multivalued features better. 2000-06-08 Lincoln Stein * Fixed Boulder::Blast::NCBI parsing error that caused it to miss some HSPs * Fixed Boulder::Blast::{NCBI,WU} treatment of % identity -- had been returning identity as a fraction of 1 rather than a percentage (as documented) 1999-12-02 Lincoln Stein * Fixed parameter bug in File accessor for Boulder::Genbank. * Documented problems with flock() across NFS filesystems. * Boulder::Genbank no longer "eats" the list of accession numbers passed to it. libboulder-perl-1.30.orig/MANIFEST0100644000175000017500000000103007777564337015320 0ustar jojojojoBoulder.pod Boulder/Blast.pm Boulder/Genbank.pm Boulder/Labbase.pm Boulder/LocusLink.pm Boulder/Store.pm Boulder/Stream.pm Boulder/String.pm Boulder/Medline.pm Boulder/Unigene.pm Boulder/Omim.pm Boulder/Swissprot.pm Boulder/XML.pm Boulder/Blast/NCBI.pm Boulder/Blast/WU.pm ChangeLog MANIFEST Makefile.PL README Stone.pm Stone/Cursor.pm Stone/GB_Sequence.pm docs/blast_tags.txt docs/genbank_tags.txt docs/javaboulder.txt eg/gb_get eg/gb_search eg/genbank.pl eg/genbank2.pl eg/genbank3.pl eg/quickblast.pl eg/test.pl t/store.t t/stream.t libboulder-perl-1.30.orig/Makefile.PL0100644000175000017500000000072007021033444016112 0ustar jojojojouse ExtUtils::MakeMaker; eval "use XML::Parser; 1" || warn "XML::Parser module not found. This module is required to use Boulder::XML\n"; WriteMakefile( 'NAME' => 'Boulder', 'DISTNAME' => 'Boulder', 'VERSION_FROM' => 'Stone.pm', 'PMLIBDIRS' => ['Boulder','Stone'], 'linkext' => { LINKTYPE=>'' }, # no link needed 'dist' => {'COMPRESS'=>'gzip -9f', 'SUFFIX' => 'gz', 'ZIP'=>'/usr/bin/zip','ZIPFLAGS'=>'-rl'}, ); libboulder-perl-1.30.orig/README0100644000175000017500000000226007060753157015036 0ustar jojojojoThis is the Boulder IO package. Boulder provides a simple stream-oriented format for transmitting data objects between one or more processes. It does not provide for the serialization of Perl objects the way FreezeThaw or Data::Dumper do, but it does provide the advantage of being language independent. In addition to a stream-oriented interface, Boulder comes with a simple record-oriented database-oriented interface, Boulder::Store, which provides query and search capabilities comparable to many flat file DBMS systems. At the MIT Genome Center, Boulder is used for many interprocess communication tasks, as well as an abstract database interface for several large databases, including GenBank. To install, run the following commands: % perl Makefile.PL % make % make test (optional) % make install If any of the tests fail, run "make test TEST_VERBOSE=1" and note which subtests fail. Note that you will need the DB_File module installed in order to take advantage of the Boulder::Store database functions. Please see the documentation, Boulder.pod, and the various .pm files for more information. See the eg/ directory for some practical examples. Lincoln Stein lstein@cshl.org libboulder-perl-1.30.orig/Stone.pm0100644000175000017500000007560507777564337015640 0ustar jojojojo# ----------------- Stone --------------- # This is basic unit of the boulder stream, and defines a # multi-valued hash array type of structure. package Stone; use strict; use vars qw($VERSION $AUTOLOAD $Fetchlast); use overload '""' => 'toString', 'fallback' =>' TRUE'; $VERSION = '1.30'; require 5.004; =head1 NAME Stone - In-memory storage for hierarchical tag/value data structures =head1 SYNOPSIS use Stone; my $stone = Stone->new( Jim => { First_name => 'James', Last_name => 'Hill', Age => 34, Address => { Street => ['The Manse', '19 Chestnut Ln'], City => 'Garden City', State => 'NY', Zip => 11291 } }, Sally => { First_name => 'Sarah', Last_name => 'James', Age => 30, Address => { Street => 'Hickory Street', City => 'Katonah', State => 'NY', Zip => 10578 } } ); @tags = $stone->tags; # yields ('James','Sally'); $address = $stone->Jim->Address; # gets the address subtree @street = $address->Street; # yeilds ('The Manse','19 Chestnut Ln') $address = $stone->get('Jim')->get('Address'); # same as $stone->Jim->Address $address = $stone->get('Jim.Address'); # another way to express same thing # first Street tag in Jim's address $address = $stone->get('Jim.Address.Street[0]'); # second Street tag in Jim's address $address = $stone->get('Jim.Address.Street[1]'); # last Street tag in Jim's address $address = $stone->get('Jim.Address.Street[#]'); # insert a tag/value pair $stone->insert(Martha => { First_name => 'Martha', Last_name => 'Steward'} ); # find the first Address $stone->search('Address'); # change an existing subtree $martha = $stone->Martha; $martha->replace(Last_name => 'Stewart'); # replace a value # iterate over the tree with a cursor $cursor = $stone->cursor; while (my ($key,$value) = $cursor->each) { print "$value: Go Bluejays!\n" if $key eq 'State' and $value eq 'Katonah'; } # various format conversions print $stone->asTable; print $stone->asString; print $stone->asHTML; print $stone->asXML('Person'); =head1 DESCRIPTION A L consists of a series of tag/value pairs. Any given tag may be single-valued or multivalued. A value can be another Stone, allowing nested components. A big Stone can be made up of a lot of little stones (pebbles?). You can obtain a Stone from a L or L persistent database. Alternatively you can build your own Stones bit by bit. Stones can be exported into string, XML and HTML representations. In addition, they are flattened into a linearized representation when reading from or writing to a L or one of its descendents. L was designed for subclassing. You should be able to create subclasses which create or require particular tags and data formats. Currently only L subclasses L. =head1 CONSTRUCTORS Stones are either created by calling the new() method, or by reading them from a L or persistent database. =head2 $stone = Stone->new() This is the main constructor for the Stone class. It can be called without any parameters, in which case it creates an empty Stone object (no tags or values), or it may passed an associative array in order to initialize it with a set of tags. A tag's value may be a scalar, an anonymous array reference (constructed using [] brackets), or a hash references (constructed using {} brackets). In the first case, the tag will be single-valued. In the second, the tag will be multivalued. In the third case, a subsidiary Stone will be generated automatically and placed into the tree at the specified location. Examples: $myStone = new Stone; $myStone = new Stone(Name=>'Fred',Age=>30); $myStone = new Stone(Name=>'Fred', Friend=>['Jill','John','Jerry']); $myStone = new Stone(Name=>'Fred', Friend=>['Jill', 'John', 'Gerald' ], Attributes => { Hair => 'blonde', Eyes => 'blue' } ); In the last example, a Stone with the following structure is created: Name Fred Friend Jill Friend John Friend Gerald Attributes Eyes blue Hair blonde Note that the value corresponding to the tag "Attributes" is itself a Stone with two tags, "Eyes" and "Hair". The XML representation (which could be created with asXML()) looks like this: blue blonde Jill John Gerald Fred More information on Stone initialization is given in the description of the insert() method. =head1 OBJECT METHODS Once a Stone object is created or retrieved, you can manipulate it with the following methods. =head2 $stone->insert(%hash) =head2 $stone->insert(\%hash) This is the main method for adding tags to a Stone. This method expects an associative array as an argument or a reference to one. The contents of the associative array will be inserted into the Stone. If a particular tag is already present in the Stone, the tag's current value will be appended to the list of values for that tag. Several types of values are legal: =over 4 =item * A B value The value will be inserted into the C. $stone->insert(name=>Fred, age=>30, sex=>M); $stone->dump; name[0]=Fred age[0]=30 sex[0]=M =item * An B reference A multi-valued tag will be created: $stone->insert(name=>Fred, children=>[Tom,Mary,Angelique]); $stone->dump; name[0]=Fred children[0]=Tom children[1]=Mary children[2]=Angelique =item * A B reference A subsidiary C object will be created and inserted into the object as a nested structure. $stone->insert(name=>Fred, wife=>{name=>Agnes,age=>40}); $stone->dump; name[0]=Fred wife[0].name[0]=Agnes wife[0].age[0]=40 =item * A C object or subclass The C object will be inserted into the object as a nested structure. $wife = new Stone(name=>agnes, age=>40); $husband = new Stone; $husband->insert(name=>fred, wife=>$wife); $husband->dump; name[0]=fred wife[0].name[0]=agnes wife[0].age[0]=40 =back =head2 $stone->replace(%hash) =head2 $stone->replace(\%hash) The B method behaves exactly like C with the exception that if the indicated key already exists in the B, its value will be replaced. Use B when you want to enforce a single-valued tag/value relationship. =head2 $stone->insert_list($key,@list) =head2 $stone->insert_hash($key,%hash) =head2 $stone->replace_list($key,@list) =head2 $stone->replace_hash($key,%hash) These are primitives used by the C and C methods. Override them if you need to modify the default behavior. =head2 $stone->delete($tag) This removes the indicated tag from the Stone. =head2 @values = $stone->get($tag [,$index]) This returns the value at the indicated tag and optional index. What you get depends on whether it is called in a scalar or list context. In a list context, you will receive all the values for that tag. You may receive a list of scalar values or (for a nested record) or a list of Stone objects. If called in a scalar context, you will either receive the first or the last member of the list of values assigned to the tag. Which one you receive depends on the value of the package variable C<$Stone::Fetchlast>. If undefined, you will receive the first member of the list. If nonzero, you will receive the last member. You may provide an optional index in order to force get() to return a particular member of the list. Provide a 0 to return the first member of the list, or '#' to obtain the last member. If the tag contains a period (.), get() will call index() on your behalf (see below). If the tag begins with an uppercase letter, then you can use the autogenerated method to access it: $stone->Tag_name([$index]) This is exactly equivalent to: $stone->get('Teg_name' [,$index]) =head2 @values = $stone->search($tag) Searches for the first occurrence of the tag, traversing the tree in a breadth-first manner, and returns it. This allows you to retrieve the value of a tag in a deeply nested structure without worrying about all the intermediate nodes. For example: $myStone = new Stone(Name=>'Fred', Friend=>['Jill', 'John', 'Gerald' ], Attributes => { Hair => 'blonde', Eyes => 'blue' } ); $hair_colour = $stone->search('Hair'); The disadvantage of this is that if there is a tag named "Hair" higher in the hierarchy, this tag will be retrieved rather than the lower one. In an array context this method returns the complete list of values from the matching tag. In a scalar context, it returns either the first or the last value of multivalued tags depending as usual on the value of C<$Stone::Fetchlast>. C<$Stone::Fetchlast> is also consulted during the depth-first traversal. If C<$Fetchlast> is set to a true value, multivalued intermediate tags will be searched from the last to the first rather than the first to the last. The Stone object has an AUTOLOAD method that invokes get() when you call a method that is not predefined. This allows a very convenient type of shortcut: $name = $stone->Name; @friends = $stone->Friend; $eye_color = $stone->Attributes->Eyes In the first example, we retrieve the value of the top-level tag Name. In the second example, we retrieve the value of the Friend tag.. In the third example, we retrieve the attributes stone first, then the Eyes value. NOTE: By convention, methods are only autogenerated for tags that begin with capital letters. This is necessary to avoid conflict with hard-coded methods, all of which are lower case. =head2 @values = $stone->index($indexstr) You can access the contents of even deeply-nested B objects with the C method. You provide a B, and receive a value or list of values back. Tag paths look like this: tag1[index1].tag2[index2].tag3[index3] Numbers in square brackets indicate which member of a multivalued tag you're interested in getting. You can leave the square brackets out in order to return just the first or the last tag of that name, in a scalar context (depending on the setting of B<$Stone::Fetchlast>). In an array context, leaving the square brackets out will return B multivalued members for each tag along the path. You will get a scalar value in a scalar context and an array value in an array context following the same rules as B. You can provide an index of '#' in order to get the last member of a list or a [?] to obtain a randomly chosen member of the list (this uses the rand() call, so be sure to call srand() at the beginning of your program in order to get different sequences of pseudorandom numbers. If there is no tag by that name, you will receive undef or an empty list. If the tag points to a subrecord, you will receive a B object. Examples: # Here's what the data structure looks like. $s->insert(person=>{name=>Fred, age=>30, pets=>[Fido,Rex,Lassie], children=>[Tom,Mary]}, person=>{name=>Harry, age=>23, pets=>[Rover,Spot]}); # Return all of Fred's children @children = $s->index('person[0].children'); # Return Harry's last pet $pet = $s->index('person[1].pets[#]'); # Return first person's first child $child = $s->index('person.children'); # Return children of all person's @children = $s->index('person.children'); # Return last person's last pet $Stone::Fetchlast++; $pet = $s->index('person.pets'); # Return any pet from any person $pet = $s->index('person[?].pet[?]'); I that B may return a B object if the tag path points to a subrecord. =head2 $array = $stone->at($tag) This returns an ARRAY REFERENCE for the tag. It is useful to prevent automatic dereferencing. Use with care. It is equivalent to: $stone->{'tag'} at() will always return an array reference. Single-valued tags will return a reference to an array of size 1. =head2 @tags = $stone->tags() Return all the tags in the Stone. You can then use this list with get() to retrieve values or recursively traverse the stone. =head2 $string = $stone->asTable() Return the data structure as a tab-delimited table suitable for printing. =head2 $string = $stone->asXML([$tagname]) Return the data structure in XML format. The entire data structure will be placed inside a top-level tag called . If you wish to change this top-level tag, pass it as an argument to asXML(). An example follows: print $stone->asXML('Address_list'); # yields:
10578 Katonah Hickory Street NY
Smith 30 Sarah
11291 Garden City The Manse 19 Chestnut Ln NY
Hill 34 James
=head2 $hash = $stone->attributes([$att_name, [$att_value]]]) attributes() returns the "attributes" of a tag. Attributes are a series of unique tag/value pairs which are associated with a tag, but are not contained within it. Attributes can only be expressed in the XML representation of a Stone:
10578 Katonah Hickory Street NY
Called with no arguments, attributes() returns the current attributes as a hash ref: my $att = $stone->Address->attributes; my $type = $att->{type}; Called with a single argument, attributes() returns the value of the named attribute, or undef if not defined: my $type = $stone->Address->attributes('type'); Called with two arguments, attributes() sets the named attribute: my $type = $stone->Address->attributes(type => 'Rural Free Delivery'); You may also change all attributes in one fell swoop by passing a hash reference as the single argument: $stone->attributes({id=>'Sally Mae',version=>'2.1'}); =head2 $string = $stone->toString() toString() returns a simple version of the Stone that shows just the topmost tags and the number of each type of tag. For example: print $stone->Jim->Address; #yields => Zip(1),City(1),Street(2),State(1) This method is used internally for string interpolation. If you try to print or otherwise manipulate a Stone object as a string, you will obtain this type of string as a result. =head2 $string = $stone->asHTML([\&callback]) Return the data structure as a nicely-formatted HTML 3.2 table, suitable for display in a Web browser. You may pass this method a callback routine which will be called for every tag/value pair in the object. It will be passed a two-item list containing the current tag and value. It can make any modifications it likes and return the modified tag and value as a return result. You can use this to modify tags or values on the fly, for example to turn them into HTML links. For example, this code fragment will turn all tags named "Sequence" blue: my $callback = sub { my ($tag,$value) = @_; return ($tag,$value) unless $tag eq 'Sequence'; return ( qq($tag),$value ); } print $stone->asHTML($callback); =head2 Stone::dump() This is a debugging tool. It iterates through the B object and prints out all the tags and values. Example: $s->dump; person[0].children[0]=Tom person[0].children[1]=Mary person[0].name[0]=Fred person[0].pets[0]=Fido person[0].pets[1]=Rex person[0].pets[2]=Lassie person[0].age[0]=30 person[1].name[0]=Harry person[1].pets[0]=Rover person[1].pets[1]=Spot person[1].age[0]=23 =head2 $cursor = $stone->cursor() Retrieves an iterator over the object. You can call this several times in order to return independent iterators. The following brief example is described in more detail in L. my $curs = $stone->cursor; while (my($tag,$value) = $curs->next_pair) { print "$tag => $value\n"; } # yields: Sally[0].Address[0].Zip[0] => 10578 Sally[0].Address[0].City[0] => Katonah Sally[0].Address[0].Street[0] => Hickory Street Sally[0].Address[0].State[0] => NY Sally[0].Last_name[0] => James Sally[0].Age[0] => 30 Sally[0].First_name[0] => Sarah Jim[0].Address[0].Zip[0] => 11291 Jim[0].Address[0].City[0] => Garden City Jim[0].Address[0].Street[0] => The Manse Jim[0].Address[0].Street[1] => 19 Chestnut Ln Jim[0].Address[0].State[0] => NY Jim[0].Last_name[0] => Hill Jim[0].Age[0] => 34 Jim[0].First_name[0] => James =head1 AUTHOR Lincoln D. Stein . =head1 COPYRIGHT Copyright 1997-1999, Cold Spring Harbor Laboratory, Cold Spring Harbor NY. This module can be used and distributed on the same terms as Perl itself. =head1 SEE ALSO L, L, L, L, L, L =cut use Stone::Cursor; use Carp; use constant DEFAULT_WIDTH=>25; # column width for pretty-printing # This global controls whether you will get the first or the # last member of a multi-valued attribute when you invoke # get() in a scalar context. $Stone::Fetchlast=0; sub AUTOLOAD { my($pack,$func_name) = $AUTOLOAD=~/(.+)::([^:]+)$/; my $self = shift; croak "Can't locate object method \"$func_name\" via package \"$pack\". ", "Tag names must begin with a capital letter in order to be called this way" unless $func_name =~ /^[A-Z]/; return $self->get($func_name,@_); } # Create a new Stone object, filling it with the # provided tag/value pairs, if any sub new { my($pack,%initial_values) = @_; my($self) = bless {},$pack; $self->insert(%initial_values) if %initial_values; return $self; } # Insert the key->value pairs into the Stone object, # appending to any similarly-named keys that were there before. sub insert { my($self,@arg) = @_; my %hash; if (ref $arg[0] and ref $arg[0] eq 'HASH') { %hash = %{$arg[0]}; } else { %hash = @arg; } foreach (keys %hash) { $self->insert_list($_,$hash{$_}); } } # Add the key->value pairs to the Stone object, # replacing any similarly-named keys that were there before. sub replace { my($self,@arg) = @_; my %hash; if (ref $arg[0] and ref $arg[0] eq 'HASH') { %hash = %{$arg[0]}; } else { %hash = @arg; } foreach (keys %hash) { $self->replace_list($_,$hash{$_}); } } # Fetch the value at the specified key. In an array # context, this will return the entire array. In a scalar # context, this will return either the first or the last member # of the array, depending on the value of the global Fetchlast. # You can specify an optional index to index into the resultant # array. # Codes: # digit (12) returns the 12th item # hash sign (#) returns the last item # question mark (?) returns a random item # zero (0) returns the first item sub get { my($self,$key,$index) = @_; return $self->index($key) if $key=~/[.\[\]]/; if (defined $index) { return $self->get_last($key) if $index eq '#' || $index == -1; if ($index eq '?') { my $size = scalar(@{$self->{$key}}); return $self->{$key}->[rand($size)]; } return $self->{$key}->[$index] if $index ne ''; } if (wantarray) { return @{$self->{$key}} if $self->{$key}; return my(@empty); } return $self->get_first($key) unless $Fetchlast; return $self->get_last($key); } # Returns 1 if the key exists. sub exists { my($self,$key,$index) = @_; return 1 if defined($self->{$key}) && !$index; return 1 if defined($self->{$key}->[$index]); return undef; } # return an array reference at indicated tag. # Equivalent to $stone->{'tag'} sub at { my $self = shift; return $self->{$_[0]}; } # # Delete the indicated key entirely. sub delete { my($self,$key) = @_; delete $self->{$key}; $self->_fix_cursors; } # Return all the tags in the stone. sub tags { my $self = shift; return grep (!/^\./,keys %{$self}); } # Return attributes as a hash reference # (only used by asXML) sub attributes { my $self = shift; my ($tag,$value) = @_; if (defined $tag) { return $self->{'.att'} = $tag if ref $tag eq 'HASH'; return $self->{'.att'}{$tag} = $value if defined $value; return $self->{'.att'}{$tag}; } return $self->{'.att'} ||= {}; } # Fetch an Iterator on the Stone. sub cursor { my $self = shift; return new Stone::Cursor($self); } # Convert a stone into a straight hash sub to_hash { my ($self) = shift; my ($key,%result); foreach $key (keys %$self) { next if substr($key,0,1) eq '.'; my ($value,@values); foreach $value (@{$self->{$key}}) { # NG 00-10-04 changed to convert values with .name into those names # NG 00-10-04 and to convert recursive results to HASH ref push(@values,!ref($value)? $value: defined ($value->{'.name'})? $value->{'.name'}: {$value->to_hash()}); } $result{$key} = @values > 1 ? [@values] : $values[0]; } return %result; } # Search for a particular tag and return it using a breadth-first search sub search { my ($self,$tag) = @_; return $self->get($tag) if $self->{$tag}; foreach ($self->tags()) { my @objects = $self->get($_); @objects = reverse(@objects) if $Fetchlast; foreach my $obj (@objects) { next unless ref($obj) and $obj->isa('Stone'); my @result = $obj->search($tag); return wantarray ? @result : ($Fetchlast ? $result[$#result] : $result[0]); } } return wantarray ? () : undef; } # Extended indexing, using a compound index that # looks like: # key1[index].key2[index].key3[index] # If indices are left out, then you can get # multiple values out: # 1. In a scalar context, you'll get the first or last # value from each position. # 2. In an array context, you'll get all the values! sub index { my($self,$index) = @_; return &_index($self,split(/\./,$index)); } sub _index { my($self,@indices) = @_; my(@value,$key,$position,$i); my(@results); $i = shift @indices; if (($key,$position) = $i=~/(.+)\[([\d\#\?]+)\]/) { # has a position @value = $self->get($key,$position); # always a scalar } elsif (wantarray) { @value = $self->get($i); } else { @value = scalar($self->get($i)); } foreach (@value) { next unless ref $_; if (@indices) { push @results,&_index($_,@indices) if $_->isa('Stone') && !exists($_->{'.name'}); } else{ push @results,$_; } } return wantarray ? @results : $results[0]; } # Return the data structure as a nicely-formatted tab-delimited table sub asTable { my $self = shift; my $string = ''; $self->_asTable(\$string,0,0); return $string; } # Return the data structure as a nice string representation (problematic) sub asString { my $self = shift; my $MAXWIDTH = shift || DEFAULT_WIDTH; my $tabs = $self->asTable; return '' unless $tabs; my(@lines) = split("\n",$tabs); my($result,@max); foreach (@lines) { my(@fields) = split("\t"); for (my $i=0;$i<@fields;$i++) { $max[$i] = length($fields[$i]) if !defined($max[$i]) or $max[$i] < length($fields[$i]); } } foreach (@max) { $_ = $MAXWIDTH if $_ > $MAXWIDTH; } # crunch long lines my $format1 = join(' ',map { "^"."<"x $max[$_] } (0..$#max)) . "\n"; my $format2 = ' ' . join(' ',map { "^"."<"x ($max[$_]-1) } (0..$#max)) . "~~\n"; $^A = ''; foreach (@lines) { my @data = split("\t"); push(@data,('')x(@max-@data)); formline ($format1,@data); formline ($format2,@data); } return ($result = $^A,$^A='')[0]; } # Return the data structure as an HTML table sub asHTML { my $self = shift; my $modify = shift; $modify ||= \&_default_modify_html; my $string = "\n"; $self->_asHTML(\$string,$modify,0,0); $string .= "\n
"; return $string; } # Return data structure using XML syntax # Top-level tag is unless otherwise specified sub asXML { my $self = shift; my $top = shift || "Stone"; my $modify = shift || \&_default_modify_xml; my $att; if (exists($self->{'.att'})) { my $a = $self->attributes; foreach (keys %$a) { $att .= qq( $_="$a->{$_}"); } } my $string = "<${top}${att}>\n"; $self->_asXML(\$string,$modify,0,1); $string .="\n"; return $string; } # This is the method used for string interpolation sub toString { my $self = shift; return $self->{'.name'} if exists $self->{'.name'}; my @tags = map { my @v = $self->get($_); my $cnt = scalar @v; "$_($cnt)" } $self->tags; return '' unless @tags; return join ',',@tags; } sub _asTable { my $self = shift; my ($string,$position,$level) = @_; my $pos = $position; foreach my $tag ($self->tags) { my @values = $self->get($tag); foreach my $value (@values) { $$string .= "\t" x ($level-$pos) . "$tag\t"; $pos = $level+1; if (exists $value->{'.name'}) { $$string .= "\t" x ($level-$pos+1) . "$value\n"; $pos=0; } else { $pos = $value->_asTable($string,$pos,$level+1); } } } return $pos; } sub _asXML { my $self = shift; my ($string,$modify,$pos,$level) = @_; foreach my $tag ($self->tags) { my @values = $self->get($tag); foreach my $value (@values) { my($title,$contents) = $modify ? $modify->($tag,$value) : ($tag,$value); my $att; if (exists $value->{'.att'}) { my $a = $value->{'.att'}; foreach (keys %$a) { $att .= qq( $_="$a->{$_}"); } } $$string .= ' ' x ($level-$pos) . "<${title}${att}>"; $pos = $level+1; if (exists $value->{'.name'}) { $$string .= ' ' x ($level-$pos+1) . "$contents\n"; $pos=0; } else { $$string .= "\n" . ' ' x ($level+1); $pos = $value->_asXML($string,$modify,$pos,$level+1); $$string .= ' ' x ($level-$pos) . "\n"; } } } return $pos; } sub _asHTML { my $self = shift; my ($string,$modify,$position,$level) = @_; my $pos = $position; foreach my $tag ($self->tags) { my @values = $self->get($tag); foreach my $value (@values) { my($title,$contents) = $modify->($tag,$value); $$string .= "" unless $position; $$string .= "" x ($level-$pos) . "$title"; $pos = $level+1; if (exists $value->{'.name'}) { $$string .= "" x ($level-$pos+1) . "$contents\n"; $pos=0; } else { $pos = $value->_asHTML($string,$modify,$pos,$level+1); } } } return $pos; } sub _default_modify_html { my ($tag,$value) = @_; return ("$tag",$value); } sub _default_modify_xml { my ($tag,$value) = @_; $value =~ s/&/&/g; $value =~ s/>/>/g; $value =~ s/cursor; my ($key,$value); while (($key,$value)=$i->each) { print "$key=$value\n"; } # this has to be done explicitly here or it won't happen. $i->DESTROY; } # return the name of the Stone sub name { $_[0]->{'.name'} = $_[1] if defined $_[1]; return $_[0]->{'.name'} } # --------- LOW LEVEL DATA INSERTION ROUTINES --------- # Append a set of values to the key. # One or more values may be other Stones. # You can pass the same value multiple times # to enter multiple values, or alternatively # pass an anonymous array. sub insert_list { my($self,$key,@values) = @_; foreach (@values) { my $ref = ref($_); if (!$ref) { # Inserting a scalar my $s = new Stone; $s->{'.name'} = $_; push(@{$self->{$key}},$s); next; } if ($ref=~/Stone/) { # A simple insertion push(@{$self->{$key}},$_); next; } if ($ref eq 'ARRAY') { # A multivalued insertion $self->insert_list($key,@{$_}); # Recursive insertion next; } if ($ref eq 'HASH') { # Insert a record, potentially recursively $self->insert_hash($key,%{$_}); next; } warn "Attempting to insert a $ref into a Stone. Be alert.\n"; push(@{$self->{$key}},$_); } $self->_fix_cursors; } # Put the values into the key, replacing # whatever was there before. sub replace_list { my($self,$key,@values) = @_; $self->{$key}=[]; # clear it out $self->insert_list($key,@values); # append the values } # Similar to put_record, but doesn't overwrite the # previous value of the key. sub insert_hash { my($self,$key,%values) = @_; my($newrecord) = $self->new_record($key); foreach (keys %values) { $newrecord->insert_list($_,$values{$_}); } } # Put a new associative array at the indicated key, # replacing whatever was there before. Multiple values # can be represented with an anonymous ARRAY reference. sub replace_hash { my($self,$key,%values) = @_; $self->{$key}=[]; # clear it out $self->insert_hash($key,%values); } #------------------- PRIVATE SUBROUTINES----------- # Create a new record at indicated key # and return it. sub new_record { my($self,$key) = @_; my $stone = new Stone(); push(@{$self->{$key}},$stone); return $stone; } sub get_first { my($self,$key) = @_; return $self->{$key}->[0]; } sub get_last { my($self,$key) = @_; return $self->{$key}->[$#{$self->{$key}}]; } # This is a private subroutine used for registering # and unregistering cursors sub _register_cursor { my($self,$cursor,$register) = @_; if ($register) { $self->{'.cursors'}->{$cursor}=$cursor; } else { delete $self->{'.cursors'}->{$cursor}; delete $self->{'.cursors'} unless %{$self->{'.cursors'}}; } } # This is a private subroutine used to alert cursors that # our contents have changed. sub _fix_cursors { my($self) = @_; return unless $self->{'.cursors'}; my($cursor); foreach $cursor (values %{$self->{'.cursors'}}) { $cursor->reset; } } # This is a private subroutine. It indexes # all the way into the structure. #sub _index { # my($self,@indices) = @_; # my $stone = $self; # my($key,$index,@h); # while (($key,$index) = splice(@indices,0,2)) { # unless (defined($index)) { # return scalar($stone->get($key)) unless wantarray; # return @h = $stone->get($key) if wantarray; # } else { # $stone= ($index eq "\#") ? $stone->get_last($key): # $stone->get($key,$index); # last unless ref($stone)=~/Stone/; # } # } # return $stone; #} sub DESTROY { my $self = shift; undef %{$self->{'.cursor'}}; # not really necessary ? } 1; libboulder-perl-1.30.orig/Stone/0040755000175000017500000000000007777761464015270 5ustar jojojojolibboulder-perl-1.30.orig/Stone/Cursor.pm0100644000175000017500000001060307021032676017052 0ustar jojojojo# A simple iterator on a Stone. package Stone::Cursor; =head1 NAME Stone::Cursor - Traverse tags and values of a Stone =head1 SYNOPSIS use Boulder::Store; $store = Boulder::Store->new('./soccer_teams'); my $stone = $store->get(28); $cursor = $stone->cursor; while (my ($key,$value) = $cursor->each) { print "$value: Go Bluejays!\n" if $key eq 'State' and $value eq 'Katonah'; } =head1 DESCRIPTION Boulder::Cursor is a utility class that allows you to create one or more iterators across a L object. This is used for traversing large Stone objects in order to identify or modify portions of the record. =head2 CLASS METHODS =item Boulder::Cursor->new($stone) Return a new Boulder::Cursor over the specified L object. This will return an error if the object is not a L or a descendent. This method is usually not called directly, but rather indirectly via the L cursor() method: my $cursor = $stone->cursor; =head2 OBJECT METHODS =item $cursor->each() Iterate over the attached B. Each iteration will return a two-valued list consisting of a tag path and a value. The tag path is of a form that can be used with B (in fact, a cursor is used internally to implement the B method. When the end of the B is reached, C will return an empty list, after which it will start over again from the beginning. If you attempt to insert or delete from the stone while iterating over it, all attached cursors will reset to the beginnning. For example: $cursor = $s->cursor; while (($key,$value) = $cursor->each) { print "$value: BOW WOW!\n" if $key=~/pet/; } =item $cursor->reset() This resets the cursor back to the beginning of the associated B. =head1 AUTHOR Lincoln D. Stein . =head1 COPYRIGHT Copyright 1997-1999, Cold Spring Harbor Laboratory, Cold Spring Harbor NY. This module can be used and distributed on the same terms as Perl itself. =head1 SEE ALSO L, L =cut #------------------- Boulder::Cursor--------------- *next_pair = \&each; # New expects a Stone object as its single # parameter. sub new { my($package,$stone) = @_; die "Boulder::Cursor: expect a Stone object parameter" unless ref($stone); my $self = bless {'stone'=>$stone},$package; $self->reset; $stone->_register_cursor($self,'true'); return $self; } # This procedure does a breadth-first search # over the entire structure. It returns an array that looks like this # (key1[index1].key2[index2].key3[index3],value) sub each { my $self = shift; my $short_keys = shift; my $stack = $self->{'stack'}; my($found,$key,$value); my $top = $stack->[$#{$stack}]; while ($top && !$found) { $found++ if ($key,$value) = $top->next; if (!$found) { # this iterator is done pop @{$stack}; $top = $stack->[$#{$stack}]; next; } if ( ref $value && !exists $value->{'.name'} ) { # found another record to begin iterating on if (%{$value}) { undef $found; $top = $value->cursor; push @{$stack},$top; next; } else { undef $value; } } } unless ($found) { $self->reset; return (); } return ($key,$value) if $short_keys; my @keylist = map {($_->{'keys'}->[$_->{'hashindex'}]) . "[" . ($_->{'arrayindex'}-1) ."]"; } @{$stack}; return (join(".",@keylist),$value); } sub reset { my $self = shift; $self->{'arrayindex'} = 0; $self->{'hashindex'} = 0; $self->{'keys'}=[$self->{'stone'}->tags]; $self->{'stack'}=[$self]; } sub DESTROY { my $self = shift; if (ref $self->{'stone'}) { $self->{'stone'}->_register_cursor($self,undef); } } # Next will return the next index in its Stone object, # indexing first through the members of the array, and then through # the individual keys. When iteration is finished, it resets itself # and returns an empty array. sub next { my $self = shift; my($arrayi,$hashi,$stone,$keys) = ($self->{'arrayindex'}, $self->{'hashindex'}, $self->{'stone'}, $self->{'keys'}); unless ($stone->exists($keys->[$hashi],$arrayi)) { $self->{hashindex}=++$hashi; $self->{arrayindex}=$arrayi=0; unless (defined($keys->[$hashi]) && defined($stone->get($keys->[$hashi],$arrayi))) { $self->reset; return (); } } $self->{arrayindex}++; return ($keys->[$hashi],$stone->get($keys->[$hashi],$arrayi)); } 1; libboulder-perl-1.30.orig/Stone/GB_Sequence.pm0100644000175000017500000001272307413123705017721 0ustar jojojojopackage Stone::GB_Sequence; use strict; use Carp; use vars '@ISA'; =head1 NAME Stone::GB_Sequence - Specialized Access to GenBank Records =head1 SYNOPSIS use Boulder::Genbank; # No need to use Stone::GB_Sequence directly $gb = Boulder::Genbank->newFh qw(M57939 M28274 L36028); while ($entry = <$gb>) { print "Entry's length is ",$entry->length,"\n"; @cds = $entry->match_features(-type=>'CDS'); @exons = $entry->match_features(-type=>'Exon',-start=>100,-end=>300); } } =head1 DESCRIPTION Stone::GB_Sequence provides several specialized access methods to the various fields in a GenBank flat file record. You can return the sequence as a Bio::Seq object, or query the sequence for features that match positional or descriptional criteria that you provide. =head1 CONSTRUCTORS This class is not intended to be created directly, but via a L stream. =head1 METHODS In addition to the standard L methods and accessors, the following methods are provided. In the synopses, the variable C<$entry> refers to a previously-created Stone::GB_Sequence object. =head2 $length = $entry->length Get the length of the sequence. =head2 $start = $entry->start Get the start position of the sequence, currently always "1". =head2 $end = $entry->end Get the end position of the sequence, currently always the same as the length. =head2 @feature_list = $entry->features(-pos=>[50,450],-type=>['CDS','Exon']) features() will search the entry feature list for those features that meet certain criteria. The criteria are specified using the B<-pos> and/or B<-type> argument names, as shown below. =over 4 =item -pos Provide a position or range of positions which the feature must B. A single position is specified in this way: -pos => 1500; # feature must overlap postion 1500 or a range of positions in this way: -pos => [1000,1500]; # 1000 to 1500 inclusive If no criteria are provided, then features() returns all the features, and is equivalent to calling the Features() accessor. =item -type, -types Filter the list of features by type or a set of types. Matches are case-insensitive, so "exon", "Exon" and "EXON" are all equivalent. You may call with a single type as in: -type => 'Exon' or with a list of types, as in -types => ['Exon','CDS'] The names "-type" and "-types" can be used interchangeably. =head2 $seqObj = $entry->bioSeq; Returns a L object from the Bioperl project. Dies with an error message unless the Bio::Seq module is installed. =back =head1 AUTHOR Lincoln D. Stein . =head1 COPYRIGHT Copyright 1997-1999, Cold Spring Harbor Laboratory, Cold Spring Harbor NY. This module can be used and distributed on the same terms as Perl itself. =head1 SEE ALSO L, L, L =cut @ISA = 'Stone'; # -------------------- h'mmmmm, bogus alert! -------------------- # Return list of all features that overlap a particular range # # Example: # @features = $gb->grab_features(-pos=>[50,450],-types=>['CDS','Exon']) # sub features { my $self = shift; my %param = @_; my %p; @p{map {s/^-//; s/s$//; lc $_} keys %param} = values %param; my $f = $self->Features; # regularize coordinates my $pos = $p{po}; my ($left,$right) = ref($pos) ? @$pos : $pos; $left ||= 0; $right ||= $left; ($left,$right) = ($right,$left) if $left > $right; # regularize types my @types = ref $p{type} ? @{$p{type}} : $p{type} if $p{type}; @types = $f->tags unless @types; # flatten into a list of all features of the specified type(s) my @features; for my $t (@types) { my @f = $f->get("\u\L$t"); foreach (@f) { $_->insert(Type=>$t) unless $_->get('Type'); } push @features,@f; } @features = grep { _overlap_filter($_,$left,$right) } @features if $left > 0 || $right > 0; return @features; } sub length { my $self = shift; return length $self->Sequence; } sub start { 1; } sub end { $_[0]->length } sub bioSeq { my $self = shift; my $id = $self->Accession; my $seq = $self->Sequence; my $desc = $self->Definition; eval { require Bio::Seq } || croak "Bio::Seq module not installed"; return Bio::Seq->new(-id=>$id,-sequence=>$seq,-desc=>$desc); } sub _feature_filter { my $f = shift; my $types = shift; foreach (@$types) { return 1 if $f->get("\u\L$_"); } return; } sub _overlap_filter { # assumes left and right are numerically sorted my $f = shift; my ($left,$right) = @_; return unless my $p = $f->Position; # simplest case -- single position if ($p =~ /^(\d+)$/) { return 1 if $1 >= $left and $1 <= $right; } # another simple case -- either/or if ($p =~ /^(\d+)[.^](\d+)$/) { return 1 if $1 == $left || $2 == $left || $1 == $right || $2 == $right; } # next simplest case -- a range if ($p =~ /^?(\d+)$/) { ($2,$1) = ($1,$2) if $1 > $2; return 1 if ($left >= $1 and $left <= $2) or ($right >= $1 and $left <= $2); } # complex case, a join(), order() or group() # not sure this is handled correctly in all the crazy combos if ($p =~ /^(?:join|order|group|complement)/) { $p =~ s/\((\d)+\.\d+\)\.\./$1../g; # (1.10).. => 1.. $p =~ s/\.\.\(\d+\.(\d+)\)/..$1/g; # ..(1.10) => ..10 my @ranges = $p =~ /[(),](?\d+)/g; foreach (@ranges) { next unless /?(\d+)/; ($2,$1) = ($1,$2) if $1 > $2; return 1 if ($left >= $1 and $left <= $2) or ($right >= $1 and $left <= $2); } } return; } 1; __END__ libboulder-perl-1.30.orig/docs/0040755000175000017500000000000007777761464015130 5ustar jojojojolibboulder-perl-1.30.orig/docs/blast_tags.txt0100644000175000017500000002462206620651053017771 0ustar jojojojo TAGS USED IN BOULDER REPRESENTATION OF BLAST/FAST OUTPUT July 28, 1998 Lincoln D. Stein Last modified: November 6, 1998 INTRODUCTION ------------ The boulder format is used by the quickblast program and other tools to run and parse the output of the BLAST and FAST family of sequence similarity searching tools. This document describes the tags that are expected in the boulder stream from BLAST and BLAST-like searches. DEFINED TAGS ------------ 1. Information about the program Blast_program The name of the algorithm used to run the analysis. Possible values include: blastn blastp blastx tblastn tblastx fasta3 fastx3 fasty3 tfasta3 tfastx3 tfasty3 Blast_version This gives the version of the program in whatever form appears on the banner page, e.g. "2.0a19-WashU". Blast_program_date This gives the date at which the program was compiled, if and only if it appears on the banner page. 2. Information about the run Blast_run_date This gives the date and time at which the similarity analysis was run, in the format "Fri Jul 6 09:32:36 1998" Blast_parms This points to a subrecord containing information about the algorithm's runtime parameters. The following subtags are used. Others may be added in the future: Hspmax the value of the -hspmax argument Expectation the value of E Matrix the matrix in use, e.g. BLOSUM62 Ctxfactor the value of the -ctxfactor argument Gapall The value of the -gapall argument 3. Information about the query sequence and subject database Blast_query The identifier for the search sequence, as defined by the FASTA format. This will be the first set of non-whitespace characters following the ">" character. Blast_db Unix filesystem path to the subject database. Blast_db_title The title of the subject database. 4. The search results Blast_hits This represents a list of BLAST (or FAST) hits on the query sequence. There may be zero, one, or many such tags, each one corresponding to a hit on the subject database. Each of these tags will contain one or more Hsp (high-scoring segment tags). Blast_hits points to a boulder subrecord containing the following subtags: Name The name of the sequence that was hit. Length The total length of the sequence that was hit Signif The significance of the hit. If there are multiple HSPs in the hit, this will be the most significant (smallest) value. Identity The percent identity of the hit. If there are multiple HSPs, this will be the one with the highest percent identity. Expect The expectation value for the hit. If there are multiple HSPs, this will be the lowest expectation value in the set. Hsps One or more sub-sub-tags, pointing to a nested record containing information about each high-scoring segment pair (HSP). See section (5) below. 5. The Hsp records Each Blast_hit tag will have at least one, and possibly several Hsp tags, each corresponding to a high-scoring segment pair (HSP). These records contain detailed information about the hit, including the alignments. Tags are as follows: Signif The significance (P value) of this HSP. Bits The number of bits of significance. Expect Expectation value for this HSP Identity Percent identity Positives Percent positive matches Score The Smith-Waterman alignment score Orientation "plus" or "minus", only present for nucleotide matches Strand Depending on algorithm used, indicates complementarity of match and possibly the reading frame. This is copied out of the blast report. Possibilities include: "Plus / Minus" "Plus / Plus" -- blastn algorithm "+1 / -2" "+2 / -2" -- blastx, tblastx Query_start Position at which the HSP starts in the query sequence (1-based indexing). Query_end Position at which the HSP stops in the query sequence. Subject_start Position at which the HSP starts in the subject (target) sequence. Subject_end Position at which the HSP stops in the subject (target) sequence. Query, Subject, Alignment These three tags contain strings which, together, create the gapped alignment of the query sequence with the subject sequence. EXAMPLES -------- 1. A BLASTN run Here is the output from a typical blastn (nucleotide->nucleotide) run. Long lines have been truncated. Note also that the percent sign (%) is escaped in the usual way. It will be unescaped when reading the boulder stream back in. Blast_run_date=Fri Nov 6 14:40:41 1998 Blast_db_date=2:40 PM EST Nov 6, 1998 Blast_parms={ Hspmax=10 Expectation=10 Matrix=+5,-4 Ctxfactor=2.00 } Blast_program_date=05-Feb-1998 Blast_db= /usr/tmp/quickblast18202aaaa Blast_version=2.0a19-WashU Blast_query=BCD207R Blast_db_title= test.fasta Blast_query_length=332 Blast_program=blastn Blast_hits={ Signif=3.5e-74 Expect=3.5e-74, Name=BCD207R Identity=100%25 Length=332 Hsps={ Subject=GTGCTTTCAAACATTGATGGATTCCTCCCCTTGACATATATATATACTTTGGGTTCCCGCAA... Signif=3.5e-74 Length=332 Bits=249.1 Query_start=1 Subject_end=332 Query=GTGCTTTCAAACATTGATGGATTCCTCCCCTTGACATATATATATACTTTGGGTTCCCGCAA... Positives=100%25 Expect=3.5e-74, Identity=100%25 Query_end=332 Orientation=plus Score=1660 Strand=Plus / Plus Subject_start=1 Alignment=||||||||||||||||||||||||||||||||||||||||||||||||||||||||||... } } = 2. A BLASTP run Here is the output from a typical blastp (protein->protein) run. Long lines have again been truncated. Blast_run_date=Fri Nov 6 14:37:23 1998 Blast_db_date=2:36 PM EST Nov 6, 1998 Blast_parms={ Hspmax=10 Expectation=10 Matrix=BLOSUM62 Ctxfactor=1.00 } Blast_program_date=05-Feb-1998 Blast_db= /usr/tmp/quickblast18141aaaa Blast_version=2.0a19-WashU Blast_query=YAL004W Blast_db_title= elegans.fasta Blast_query_length=216 Blast_program=blastp Blast_hits={ Signif=0.95 Expect=3.0, Name=C28H8.2 Identity=30%25 Length=51 Hsps={ Subject=HMTVEFHVTSQSW---FGFEDHFHMIIR-AVNDENVGWGVRYLSMAF Signif=0.95 Length=46 Bits=15.8 Query_start=100 Subject_end=49 Query=HLTQD-HGGDLFWGKVLGFTLKFNLNLRLTVNIDQLEWEVLHVSLHF Positives=52%25 Expect=3.0, Identity=30%25 Query_end=145 Orientation=plus Score=45 Subject_start=7 Alignment=H+T + H W GF F++ +R VN + + W V ++S+ F } } Blast_hits={ Signif=0.99 Expect=4.7, Name=ZK896.2 Identity=24%25 Length=340 Hsps={ Subject=FSGKFTTFVLNKDQATLRMSSAEKTAEWNTAFDSRRGFF----TSGNYGL... Signif=0.99 Length=101 Bits=22.9 Query_start=110 Subject_end=243 Query=FWGKVLGFTL-KFNLNLRLTVNIDQLEWEVLHVSLHFWVVEVSTDQTLSVE... Positives=41%25 Expect=4.7, Identity=24%25 Query_end=210 Orientation=plus Score=65 Subject_start=146 Alignment=F GK F L K LR++ EW S + T +... } } Blast_hits={ Signif=0.99 Expect=4.9, Name=F15H9.1 Identity=31%25 Length=426 Hsps={ Subject=LTLFIWIIACGFNNTQPLSYEENIQRLESCGKILPEESSNSITKSKYF... Signif=0.99 Length=66 Bits=23.2 Query_start=141 Subject_end=77 Query=VSLHFWVVEVSTDQT--LSVENGIRRIHS-SLILSSITNQSFSVSESDKR... Positives=45%25 Expect=4.9, Identity=31%25 Query_end=206 Orientation=plus Score=66 Subject_start=4 Alignment=++L W++ + T LS E I+R+ S IL ++ S + S+... } } Blast_hits={ Signif=0.99 Expect=5.3, Name=D1007.9 Identity=19%25 Length=160 Hsps={ Subject=DFEIGILEKGFWAALRENTNVQQKFEITDRKMHNFMIFTAEKLKKKN... Signif=0.99 Length=78 Bits=21.1 Query_start=135 Subject_end=129 Query=EWEVLHVSLHFWV-VEVSTDQTLSVENGIRRIHSSLILSS--ITNQSFS... Positives=51%25 Expect=5.3, Identity=19%25 Query_end=212 Orientation=plus Score=60 Subject_start=49 Alignment=++E+ + FW + +T+ E R++H+ +I ++ + +... } } Blast_hits={ Signif=9999 Expect=5.5, Name=T08E11.4 Identity=36%25 Length=299 Hsps={ Subject=LDSPCTETMNQKTFMLNHTVQNVSNIKERGIYTTNVE Length=36 Bits=18.0 Query_start=22 Subject_end=165 Query=IESTTTQVENQDVFFLTLLVQTVSN-GSGGRFVNNTQ Positives=48%25 Expect=5.5, Identity=35%25 Query_end=57 Orientation=plus Score=51 Subject_start=129 Alignment=++S T+ NQ F L VQ VSN G + N + } Hsps={ Subject=LQCEKKHFDSKDWSIEVELVLTLKSSNG-QRL--SFTASSTLNEPA... Length=55 Bits=17.2 Query_start=134 Subject_end=239 Query=LEWEVLHVSLHFWVVEVSTDQTLSVENGIRRIHSSLILSSITNQSFSV... Positives=45%25 Expect=5.5, Identity=36%25 Query_end=188 Orientation=plus Score=49 Subject_start=187 Alignment=L+ E H W +EV TL NG +R+ S SS N+... } } Blast_hits={ Signif=1.00 Expect=6.4, Name=K03H1.7 Identity=22%25 Length=88 Hsps={ Subject=MGQSFSSSFNAKQTSSSTPQKSTKTPTCSDPRSPSQDIERTPIQVK... Signif=1.00 Length=84 Bits=18.7 Query_start=1 Subject_end=88 Query=MGVT-SGGLNFKDTVFNEQQRDIESTTT---QVENQDVFFLTLLVQTVS... Positives=44%25 Expect=6.4, Identity=22%25 Query_end=84 Orientation=plus Score=53 Subject_start=1 Alignment=MG + S N K T + Q+ ++ T + +QD+ + V... } } Blast_hits={ Signif=1.00 Expect=8.3, Name=Y67D8A_380.c Identity=41%25 Length=47 Hsps={ Subject=TLLDAVSSDGSGADVEASGEDVQT Signif=1.00 Length=24 Bits=14.4 Query_start=38 Subject_end=44 Query=TLLVQTVSNGSGGRFVNNTQDIQT Positives=58%25 Expect=8.3, Identity=41%25 Query_end=61 Orientation=plus Score=41 Subject_start=21 Alignment=TLL S+GSG + +D+QT } } Blast_hits={ Signif=1.00 Expect=8.9, Name=C06A5.2 Identity=26%25 Length=214 Hsps={ Subject=TSGKKNKYKFKYGQIDQQELSINNDKTEYLENAQ---LTKLMSHADVR... Signif=1.00 Length=100 Bits=21.1 Query_start=4 Subject_end=207 Query=TSGGLN---FKDTVFNEQQRDIESTTTQ-VENQDVFFLTLLVQ--TVSNG... Positives=50%25 Expect=8.9, Identity=26%25 Query_end=103 Orientation=plus Score=60 Subject_start=106 Alignment=TSG N FK ++Q+ I + T+ +EN LT L+ V... } } Blast_hits={ Signif=1.00 Expect=9.5, Name=Y71F9B_275.b Identity=26%25 Length=264 Hsps={ Subject=IKYGVISYENVKQDLLDWRWIYISGRLHKPVLEVIKPRQDMCDLVTENRR... Signif=1.00 Length=85 Bits=21.5 Query_start=119 Subject_end=190 Query=LKFNLNLRLTVNIDQLEWEVLHVS--LHFWVVEVSTDQT----LSVENGIRRI... Positives=46%25 Expect=9.5, Identity=26%25 Query_end=203 Orientation=plus Score=61 Subject_start=98 Alignment=+K+ + V D L+W +++S LH V+EV + L EN ... } } = libboulder-perl-1.30.orig/docs/genbank_tags.txt0100644000175000017500000002323206622654527020300 0ustar jojojojo TAGS USED IN BOULDER REPRESENTATION OF GENBANK NUCLEOTIDE RECORDS August 3, 1998 Lincoln D. Stein Last modified: November 12, 1998 INTRODUCTION ------------ The boulder format is used by the Boulder::Genbank module, as well as by the gb_search and gb_fetch programs, to retrieve and parse Genbank entries from NCBI as well as from local files. This document describes the tags that are returned in the boulder stream from Boulder::Genbank. DEFINED TAGS ------------ The tags returned by the parsing operation are taken from the NCBI ASN.1 schema. For consistency, they are normalized so that the initial letter is capitalized, and all subsequent letters are lowercase. This section contains an abbreviated list of the most useful/common tags. See "The NCBI Data Model", by James Ostell and Jonathan Kans in "Bioinformatics: A Practical Guide to the Analysis of Genes and Proteins" (Eds. A. Baxevanis and F. Ouellette), pp 121-144 for the full listing. Top-Level Tags -------------- These are tags that appear at the top level of the parsed Genbank entry. Accession The accession number of this entry. Because of the vagaries of the Genbank data model, an entry may have multiple accession numbers (e.g. after a merging operation). Accession may therefore be a multi-valued tag. Example: my $accessionNo = $s->Accession; Authors The list of authors, as they appear on the AUTHORS line of the Genbank record. No attempt is made to parse them into individual authors. Basecount The nucleotide basecount for the entry. It is presented as a Boulder Stone with keys "a", "c", "t" and "g". Example: my $A = $s->Basecount->a; my $C = $s->Basecount->c; my $G = $s->Basecount->g; my $T = $s->Basecount->t; print "GC content is ",($G+$C)/($A+$C+$G+$T),"\n"; Comment The COMMENT line from the Genbank record. Definition The DEFINITION line from the Genbank record, unmodified. Features The FEATURES table. This is a complex stone object with multiple subtags. See the the section on "The Features Tag" for details. Journal The JOURNAL line from the Genbank record, unmodified. Keywords The KEYWORDS line from the Genbank record, unmodified. No attempt is made to parse the keywords into separate values. Example: my $keywords = $s->Keywords Locus The LOCUS line from the Genbank record. It is not further parsed. Medline, Nid References to other database accession numbers. Organism The taxonomic name of the organism from which this entry was derived. This line is taken from the Genbank entry unmodified. See the NCBI data model documentation for an explanation of their taxonomic syntax. Reference The REFERENCE line from the Genbank entry. There are often multiple Reference lines. Example: my @references = $s->Reference; Sequence The DNA or RNA sequence of the entry. This is presented as a single lower-case string, with all base numbers and formatting characters removed. Source The entry's SOURCE field; often giving clues on how the sequencing was performed. Title The TITLE field from the paper describing this entry, if any. The Features Tag ---------------- The Features tag points to a Stone record that contains multiple subtags. Each subtag is the name of a feature which points, in turn, to a Stone that describes the feature's location and other attributes. The full list of feature is beyond this document, but the following are the features that are most often seen: Cds a CDS Intron an intron Exon an exon Gene a gene Mrna an mRNA Polya_site a putative polyadenylation signal Repeat_unit a repetitive region Source More information about the organism and cell type the sequence was derived from Satellite a microsatellite (dinucleotide repeat) Each feature will contain one or more of the following subtags: DB_xref A cross-reference to another database in the form DB_NAME:accession_number. See the NCBI Web site for a description of these cross references. Evidence The evidence for this feature, either "experimental" or "predicted". Gene If the feature involves a gene, this will be the gene's name (or one of its names). This subtag is often seen in "Gene" and Cds features. Example: foreach ($s->Features->Cds) { my $gene = $_->Gene; my $position = $_->Position; Print "Gene $gene ($position)\n"; } Map If the feature is mapped, this provides a map position, usually as a cytogenetic band. Note A grab-back for various text notes. Number When multiple features of this type occur, this field is used to number them. Ordinarily this field is not needed because Boulder::Genbank preserves the order of features. Organism If the feature is Source, this provides the source organism. Position The position of this feature, usually expresed as a range (1970..1975). Product The protein product of the feature, if applicable, as a text string. Translation The protein translation of the feature, if applicable. EXAMPLE GENBANK OBJECT ---------------------- The following is an excerpt from a moderately complex Genbank Stone. The Sequence line and several other long lines have been truncated for readability. Authors=Spritz,R.A., Strunk,K., Surowy,C.S.O., Hoch,S., Barton,D.E. and Francke,U. Authors=Spritz,R.A., Strunk,K., Surowy,C.S. and Mohrenweiser,H.W. Locus=HUMRNP7011 2155 bp DNA PRI 03-JUL-1991 Accession=M57939 Accession=J04772 Accession=M57733 Keywords=ribonucleoprotein antigen. Sequence=aagcttttccaggcagtgcgagatagaggagcgcttgagaaggcaggttttgcagcagacggcagtgacagcccag... Definition=Human small nuclear ribonucleoprotein (U1-70K) gene, exon 10 and 11. Journal=Nucleic Acids Res. 15, 10373-10391 (1987) Journal=Genomics 8, 371-379 (1990) Nid=g337441 Medline=88096573 Medline=91065657 Features={ Polya_site={ Evidence=experimental Position=1989 Gene=U1-70K } Polya_site={ Position=1990 Gene=U1-70K } Polya_site={ Evidence=experimental Position=1992 Gene=U1-70K } Polya_site={ Evidence=experimental Position=1998 Gene=U1-70K } Source={ Organism=Homo sapiens Db_xref=taxon:9606 Position=1..2155 Map=19q13.3 } Cds={ Codon_start=1 Product=ribonucleoprotein antigen Db_xref=PID:g337445 Position=join(M57929:329..475,M57930:183..245,M57930:358..412, ... Gene=U1-70K Translation=MTQFLPPNLLALFAPRDPIPYLPPLEKLPHEKHHNQPYCGIAPYIREFEDPRDAPPPTR... } Cds={ Codon_start=1 Product=ribonucleoprotein antigen Db_xref=PID:g337444 Evidence=experimental Position=join(M57929:329..475,M57930:183..245,M57930:358..412, ... Gene=U1-70K Translation=MTQFLPPNLLALFAPRDPIPYLPPLEKLPHEKHHNQPYCGIAPYIREFEDPR... } Polya_signal={ Position=1970..1975 Note=putative Gene=U1-70K } Intron={ Evidence=experimental Position=1100..1208 Gene=U1-70K } Intron={ Number=10 Evidence=experimental Position=1100..1181 Gene=U1-70K } Intron={ Number=9 Evidence=experimental Position=order(M57937:702..921,1..1011) Note=2.1 kb gap Gene=U1-70K } Intron={ Position=order(M57935:272..406,M57936:1..284,M57937:1..599, <1..>1208) Gene=U1-70K } Intron={ Evidence=experimental Position=order(M57935:284..406,M57936:1..284,M57937:1..599, <1..>1208) Note=first gap-0.14 kb, second gap-0.62 kb Gene=U1-70K } Intron={ Number=8 Evidence=experimental Position=order(M57935:272..406,M57936:1..284,M57937:1..599, <1..>1181) Note=first gap-0.14 kb, second gap-0.62 kb Gene=U1-70K } Exon={ Number=10 Evidence=experimental Position=1012..1099 Gene=U1-70K } Exon={ Number=11 Evidence=experimental Position=1182..(1989.1998) Gene=U1-70K } Exon={ Evidence=experimental Position=1209..(1989.1998) Gene=U1-70K } Mrna={ Product=ribonucleoprotein antigen Position=join(M57928:358..668,M57929:319..475,M57930:183..245, ... Gene=U1-70K } Mrna={ Product=ribonucleoprotein antigen Citation=[2] Evidence=experimental Position=join(M57928:358..668,M57929:319..475,M57930:183..245, ... Gene=U1-70K } Gene={ Position=join(M57928:207..719,M57929:1..562,M57930:1..577, ... Gene=U1-70K } } Reference=1 (sites) Reference=2 (bases 1 to 2155) = libboulder-perl-1.30.orig/docs/javaboulder.txt0100644000175000017500000004063706620604427020153 0ustar jojojojo PARTIAL SPECIFICATION OF THE JAVA BOULDER API class Boulder.Stone public class Stone extends Object; In order to work around Java's strong typing and to emulate the behavior of Perl Boulder, the Stone object becomes a wrapper around simple types (strings and numerics) as well as a more complex object that contains tag/value pairs. This allows the retrieval and storage methods to work on Stones rather than the more generic Objects, and should reduce the amount of upcasting necessary. Constructors: ------------- * Stone() Create new Stone object containing no initial tags. * Stone(Dictionary d) Create new Stone object initialized from a Dictionary object. Each key in the dictionary becomes a tag in the Stone. Certain values are treated specially: -Dictionary values are recursively turned into sub-Stone objects. -Array and Vector values are turned into multivalued entries that share the same tag. *Stone(String s) Create a Stone wrapped around a string. *Stone(Number n) Create a Stone wrapped around any of the numeric objects (Integer, Long, Float, Double). Static (class) Methods: ----------------------- * static public Stone fromString(String s) Create a Stone from its string representation. Object Methods: --------------- *public String toString() Return the string representation of the Stone in Boulder format. Simple Stones are cast to the string representation of their scalar value. Complex Stones are represented (recursively) in boulderio form. This serializes a Stone in a form suitable for reconstitution with fromString(). *public Integer toInt() throws NumberFormatException Converts a simple Stone to an Integer object. Complex Stones and simple Stones that do not contain parseable contents throw a NumberFormatException. *public Float toFloat() throws NumberFormatException Converts a simple Stone to a Float object. Complex Stones and simple Stones that do not contain parseable contents throw a NumberFormatException. *public Double toDouble() throws NumberFormatException Converts a simple Stone to a Double object. Complex Stones and simple Stones that do not contain parseable contents throw a NumberFormatException. *public void insert(Stone s) Insert Stone s into the current Stone, merging their keys and values. *public void insert(Dictionary d) Insert the key/value pairs contained in dictionary into the Stone. Similarly-named keys are appended to, making them multi-valued. On simple Stones this, and the other insertion methods, silently converts the simple Stone into a complex Stone, deleting its simple value. [This behavior is open for discussion; I think it's better than making the method a no-op] *public void insert(String tag,Object value) Insert an Object into the Stone at the indicated tag. Similarly-named keys are appended to, making them multi-valued. *public void replace(Dictionary d) Insert the key/value pairs contained in dictionary into the Stone. Similarly-named keys are replaced, overwriting their previous contents. *public void replace(Stone s) Merge Stone s into the current Stone, overwriting any keys in common. *public void replace(String tag, Stone value) Insert a sub-Stone (complex or scalar) into the Stone at the indicated tag. Similarly-named keys are replaced, overwriting their previous contents. *public void subtract(Stone s) Remove from this Stone all the tags and associated values present in Stone s. Only the top level of tags are affected (no need to do recursive subtraction). *public void intersect(Stone s) Remove from this Stone any tags and associated values not in common with Stone s. *public Stone[] get(String tag) Return an array of the Stone at the indicated tag. Returns null if the tag is not found. Also returns null if called on a scalar Stone. *public Stone get(String tag, integer n) Return the nth Stone at the given tag. Uses zero-based indexing. Returns null if the tag is not found or if the index is out of bounds (? should it raise an array exception). Negative numbers count in from the right end of the array. The last item is index -1. *public Stone getFirst(String tag) Return the first Stone at the indicated tag. Returns null if the tag is not found. This is the same as get(tag,0). *public Stone getLast(String tag) Return the first Stone at the indicated tag. Returns null if the tag is not found. This is the same as get(tag,-1). *public Stone getAny(String tag) Return a random value from the indicated tag. This has never been used, to my knowledge, but it is a feature of the Perl implementation. *public String[] tags() Return all the tags available in this Stone. If it is a scalar Stone, an empty (not null) array is returned. *public Boolean exists(String tag) Returns true if the indicated tag exists. *public void delete(String tag) Delete the tag and its associated subtree from the Stone. *public Stone[] search(String tag) Recursively searches through the Stone and its subtrees for the first tag that matches the argument and returns its contents. The search method is depth-first (top-level tags returned preferentially). *public Stone[] index(String index_string) Follows a path through the Stone, returning the value. The path is of the form: tag1[index].tag2[index].tag3[index] Indexes can be omitted, in which case the path follows the first value of the tag. Indexes match any of the following expressions: [0-9]+ index leftward from first value -[0-9]+ index rightward from last value # last item \? random item (question mark) *public Stone[] path(String index_string) This is a better name for index(), but unfortunately not part of the Perl Boulder API. Maybe index() should be phased out. *public Enumeration cursor() Return an Enumeration over the Stone object. Each call to nextElement() takes a step in a breadth-first traversal of the Stone. The elements of the Enumeration are Stones with three tags: tag name interpretation -------- -------------- "tag" String representing the name of the current element's tag "path" String representing full path to current element "value" The value pointed to by the tag. =========================================================================== public interface Boulder.Filter; Boulder.Filter can prefilter a BoulderIO stream so that only certain Stones are passed to higher layers. Its filter() method is presented with each candidate Stone in turn. It returns a boolean True to accept the Stone, or False to filter it. *public abstract Boolean filter(Stone s) Return True if this Stone should be passed up to higher layers. =========================================================================== public interface Boulder.IO; This interface defines everything that a generic Boulder IO class should be able to do. Both Boulder.Stream (serial input/output) and Boulder.Store (record-oriented input/output) implement this interface. Note that Boulder.IO has an intrinsic cursor behavior, in that it returns Stones in some defined order. * public abstract Stone read_record() throws IOException Reads a new Stone from input and returns it. If no further stones can be read returns NULL. If an I/O error occurs returns IOException. Returns EOFException if the caller makes additional calls to read_record() after it has returned NULL. * public abstract Stone read_record(String[] f) throws IOException Reads a new Stone from input and returns it, filtering tag(s) based on an array of tag filter patterns. The argument is an array of strings of the form "tag1.tag2.tag3....", corresponding to a set of tag paths. For example, if the current Stone has the structure: NAME=Fred DEMOGRAPHICS={ AGE=62 GENDER=Male PHYSICAL_ATTRIBUTES={ BALDING=Y OVERWEIGHT=N } } ADDRESS={ STREET=1313 Mockingbird Lane TOWN=Port Washington STATE=NY } then the stone returned by read_record(["NAME","DEMOGRAPHICS.AGE","ADDRESS"]) will return the Stone: NAME=Fred DEMOGRAPHICS={ AGE=62 } ADDRESS={ STREET=1313 Mockingbird Lane TOWN=Port Washington STATE=NY } If no tags match the filter specification, returns an empty (but not NULL) Stone. If no further stones can be read returns NULL. If an I/O error occurs, returns IOException. Returns EOFException if the caller makes additional calls to read_record() after it has returned NULL. * public abstract Stone get() throws IOException * public abstract Stone get(String[] f) throws IOException The Perl version of Boulder uses get() as a synonum for read_record(), because some people requested it. Now I use get() in preference to the longer form and am open to entirely replacing read_record() with get(). * public abstract void write_record(Stone s) throws IOException * public abstract void write_record(Stone s, String[] f) throws IOException These two methods write a Stone to an output device or file. In the first form, the Stone is written out with all its fields intact. In the second form, the Stone's tags are first filtered on the specified array of filtering rules. The format of filtering rules is identical to the read_record() method. If the Stone cannot be successfully written, this method throws an IOException. * public abstract void put(Stone s) throws IOException * public abstract void put(Stone s, String[] f) throws IOException These methods are synonyms for write_record() in the Perl implementation. They are shorter, and if you like them better maybe they should be the canonical names. * public void filter(Boulder.Filter s) This method adds a filter to the Boulder.IO object. The filter is presented with each Stone in turn and selects whether to accept the Stone or reject it. See the Boulder.Filter interface for details. =========================================================================== public class Boulder.Stream implements Boulder.IO; This class defines a Boulder class that reads and writes to a type of I/O that behaves in a serial fashion. Constructors: ------------- * public Stream() Create a new Boulder.Stream object attached to standard in and standard output. The get_record() method will read Stones from standard input one at a time until standard input is exhausted. The write_record() method will emit Stones to standard output. * public Stream(InputStream in, OutputStream out) Create a Boulder.Stream, tieing it to the specified InputStream and OutputStream objects. * public void passthru(Boolean pass) A Boulder.Stream object can behave in either of two ways. It can gobble up the Stone objects that are read via get_record() completely, in which case it emits nothing unless write_record() is called, or it can pass the unwanted components of the object through to its output stream. In pass through mode, a program that repeatedly calls read_record(["NAME","DEMOGRAPHICS.AGE","ADDRESS"]) on an input stream containing the Stone given in the example above, would emit the following Stone automatically even if it doesn't make a call to write_record(): DEMOGRAPHICS={ GENDER=Male PHYSICAL_ATTRIBUTES={ BALDING=Y OVERWEIGHT=N } } In pass through mode, any calls to write_record() performed before the next read_record() will merge the contents of the Stone specified by write_record() with the passed through portion of the Stone. If, after calling read_record(), the program were to call write_record() with a Stone with this structure: NAME=Andrew DEMOGRAPHICS={ FAVORITE_COLOR=blue } Then the resulting stone would be: NAME=Andrew DEMOGRAPHICS={ GENDER=Male FAVORITE_COLOR=blue PHYSICAL_ATTRIBUTES={ BALDING=Y OVERWEIGHT=N } } Note that this behavior is slightly different than the Perl implementation, in which only top-level tags are merged. That behavior has always seemed a bit bogus, and this is more logical (but perhaps not more useful in practice). The passthru() method accepts a boolean indicating whether passthru behavior should be activated or not. The default is "false", for no passthru. * public Boolean passthru() This method returns the state of the passthru flag. =========================================================================== public interface Boulder.Store extends Boulder.IO; Boulder.Store adds unique record IDs to the basic Boulder.IO scheme, turning it into a database of sorts. The record IDs can be used to fetch and store Stones in a non-linear fashion, and provides simple indexing and querying services. The record ID is a declared tag in the Stone that must be present and unique. The record ID can be generated automatically if desired. The serial access methods behave as they do in the parent interface. Stones are returned one at a time (in some implementation-specific order). The set of Stones returned and their order can be affected by the query() method, however. The behavior of the write_record() method is dependent on flags that control whether missing record IDs are automatically generated, and whether Stones can overwrite objects with identical record IDs that are already in the database. * public abstract void setRecordID (String tag) Declare that this tag will be used as the unique record ID in the Stone. It is assumed that classes that implement the Boulder.Store interface will store this tag name somewhere in the associated database. The default is "ID". * public abstract String getRecordID() Return the special tag. * public abstract Boolean setIndex (String tag) Declare that a tag (or tag path) is an index. Returns a true value if the database supports indexing on this tag. * public abstract String[] getIndex() Returns an array of all the tags that are indexes. * public abstract void setAutoID(Boolean doAuto) Sets a flag that allows write_record() to automatically add a record ID tag to Stones that do not already contain the designated tag. The default is false. * public abstract Boolean getAutoID() Returns the state of the auto record generation flag. * public abstract void setClobber(Boolean clobber) Sets a flag that allows write_record() to clobber (overwrite) any Stone in the database that already has the ID of the Stone being written. The default is false. * public abstract Boolean getClobber() Returns the state of the clobber flag. * public abstract Stone read_record(String ID) Read the Stone with the indicated record ID from the database. Returns NULL if not present. * public abstract Stone read_record(String[] f,String ID) Reads the Stone with the indicated record ID from the database, filtering its tags on the provided array of tag patterns. * public abstract void write_record(Stone s) throws IOException; Writes the Stone to the database, using the value of its record ID tag to put the Stone in the right place. If no record ID tag is present, and autoID is true, then write_record() generates a new unique record ID tag and adds it to the Stone. If the record ID is not unique, and clobber is false, then throws an InvalidObjectException. * public abstract void write_record(Stone s,String ID) throws IOException; First replaces the Stone's record ID tag with the indicated ID and then calls write_record(Stone s) to insert the Stone into the database. * public abstract Boolean query(String s) Add a query to the BoulderIO object. The query is a string whose syntax and semantics are implementation-specific. Once the query is in force, repeated calls to read_record() will return the set of Stone objects that match the query. The order of Stones returned may be affected by the query as well. The query remains in force until all selected Stones are exhausted (read_record() returns null). Calling query() before exhausting the Stones resets the state of the BoulderIO object. The boolean result from query() indicates whether the query string is syntactically acceptable, not whether the query will return a non-empty set. Possibly it would be better to raise an exception for queries that fail the syntax check. What do you think? ============================================================ public class Boulder.Recno extends Boulder.Store; Boulder.Store implements a record-oriented/random-access type of retrieval. Each Stone is associated with a unique record number that can be used to fetch and retrieve Stones in a non-linear fashion. The record number is an integer greater than or equal to zero. In the Perl implementation, the record numbers are continuous. When a record is deleted, the remaining Stones are renumbered to fill in the gap. (This is a decision that we might want to reconsidered.) = unfinished = libboulder-perl-1.30.orig/eg/0040755000175000017500000000000007777761464014573 5ustar jojojojolibboulder-perl-1.30.orig/eg/gb_search0100755000175000017500000000476107777564337016443 0ustar jojojojo#!/usr/bin/perl use strict 'vars'; use LWP::UserAgent; use URI::Escape; use Getopt::Long; use strict; use constant GENBANK_URL=>"http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?form=4&html=no"; use constant CHUNKSIZE=>10000; my ($TERM,$AGE,$MAX,$DB,$VERBOSE,$BUFFER,$COUNT,$GOT,$GOT_TOTAL); GetOptions('max=i' => \$MAX, 'db=s' =>\$DB, 'age=i' => \$AGE, 'count' => \$COUNT, 'verbose!'=>\$VERBOSE) || die < Query GenBank for a list of accession numbers. The query string should be in the form recognized by NCBI\'s term parser. See http://www.ncbi.nlm.nih.gov/Entrez/linking.html for examples. Options: -db Database to search (n) -max Max entries to return (100) -age Only fetch accessions entered ago -verbose Show brief description line -count Just retrieve the count that would be retrieved Database specifiers: m MEDLINE p Protein n Nucleotide t 3-D structure c Genome Example search: gb_search -verbose -db n 'Oryza sativa[Organism] AND EST[Keyword]' Some common field modifiers: [All Fields] [Accession] [Author Name] [Feature Key] [Gene Name] [Keyword] [Organism] USAGE ; $MAX ||= 100; $DB ||= 'n'; $DB = lc(substr($DB,0,1)); # first character, whatever it is $BUFFER = ''; my $max = $MAX > CHUNKSIZE ? CHUNKSIZE : $MAX; $TERM = GENBANK_URL . "&db=$DB&dispmax=$max&term=" . uri_escape("@ARGV"); $TERM .= "&relpubdate=$AGE" if $AGE > 0; $TERM .= "&Dopt=" . ($COUNT ? 'q' : 'd'); $GOT_TOTAL=0; my $agent = LWP::UserAgent->new(); while ($GOT_TOTAL < $MAX) { $GOT = 0; my $req = HTTP::Request->new('GET' => "$TERM&dispstart=$GOT_TOTAL"); my $response = $agent->request($req,\&process_text); die "Request failure: ",$response->status_line unless $response->is_success; last if $GOT < CHUNKSIZE; $GOT_TOTAL += $GOT; } exit 0; sub process_text ($$$) { my ($chunk,$response,$protocol) = @_; my ($position,$remainder); $BUFFER .= $chunk; if ($COUNT) { print $1,"\n" if $BUFFER=~/^(\d+)\n/; $GOT = $MAX; return; } elsif ($DB =~ /^[mt]$/) { while ($BUFFER=~/^\s+(\d+) (.+)$/mg) { print $VERBOSE ? "$1\t$2" : $1,"\n"; $position = pos($BUFFER); $GOT++; } } else { while ($BUFFER=~/^\s+\d+ (\w+)\n\n(.*)\ngi/mg) { print $VERBOSE ? "$1\t$2" : $1,"\n"; $position = pos($BUFFER); $GOT++; } } substr($BUFFER,0,$position) = ''; pos($BUFFER)=0; } libboulder-perl-1.30.orig/eg/gb_get0100755000175000017500000000412707777564337015751 0ustar jojojojo#!/usr/bin/perl use strict 'vars'; use lib '..','../blib/lib','.','./blib/lib'; use Boulder::Stream; use Boulder::Genbank; use Getopt::Long; use Text::Abbrev; my ($ACCESSOR,$DELAY,$FIRST,$BOULDER,$DB); GetOptions('accessor=s' => \$ACCESSOR, 'boulder' =>\$BOULDER, 'db=s' => \$DB, 'delay=i' => \$DELAY) || die < Use the Entrez|Yank methods for retrieving records. -boulder Return boulder format file -db Database -delay Seconds to sleep between retrievals (10) Options may be abbreviated, i.e. -acc E Database specifiers: m MEDLINE p Protein n Nucleotide t 3-D structure c Genome USAGE ; $ACCESSOR ||= 'Entrez'; unless (defined $DELAY) { warn "Warning: No -delay parameter. Introducing a 1 second delay between fetches\n"; $DELAY = 1; } my @accession_numbers; my $from_stdin = !@ARGV; my $abbrev = abbrev qw(Entrez Yank entrez yank); my $accessor = $abbrev->{$ACCESSOR} || die "Unknown access method $ACCESSOR"; substr($accessor,0,1)=~tr/a-z/A-Z/; #canonicalize while (defined (my $a = shift)) { $from_stdin++,last if $a eq '-'; push(@accession_numbers,$a); } if ($from_stdin) { while (<>) { chomp; next if /^\#/; my ($a) = /^(\w+)/; next unless $a; push(@accession_numbers,$a); } } $DB ||= 'n'; my $stream = new Boulder::Stream; my $gb = new Boulder::Genbank(-accessor=>$accessor,-fetch=>\@accession_numbers,-db=>$DB) || die "Couldn't open new Boulder::Genbank object"; my $count; while (1) { last if $count++ > @accession_numbers; next unless my $s = $gb->get; sleep $DELAY if $FIRST++ && $DELAY > 0; unless ($BOULDER) { print $s->Blob,"//\n"; } else { $stream->put($s); } } libboulder-perl-1.30.orig/eg/genbank.pl0100755000175000017500000000244406620376335016522 0ustar jojojojo#!/usr/local/bin/perl # This requires LWP to be installed. use lib '.','..'; use Boulder::Genbank; $gb = new Boulder::Genbank(-accessor=>'Entrez',-param=>[qw/M57939 M28274 L36028/]); while (my $s = $gb->get) { @introns = $s->Features->Intron; print "There are ",scalar(@introns)," introns.\n"; if (@introns) { foreach (sort {$a->Number <=> $b->Number} @introns) { print "Intron number ",$_->Number,":\n", "\tPosition = ",$_->Position,"\n", "\tEvidence = ",$_->Evidence,"\n"; } } @exons = $s->Features->Exon; print "There are ",scalar(@exons)," exons.\n"; if (@exons) { foreach (sort {$a->Number <=> $b->Number} @exons) { print "Exon number ",$_->Number,":\n", "\tPosition = ",$_->Position,"\n", "\tEvidence = ",$_->Evidence,"\n", "\tGene = ",$_->Gene,"\n"; } } if ($s->Features->Polya_signal || $s->Features->Polya_site) { print "The first PolyA site is at ",$s->Features->Polya_signal ? $s->Features->Polya_signal->Position : $s->Features->Polya_site->Position ,"\n"; } print "This sequence has the following top level tags: ",join(',',$s->tags),"\n"; print "\n","Here's the whole thing as a table:\n"; print $s->asTable; print "------------------------------------\n"; } libboulder-perl-1.30.orig/eg/genbank2.pl0100755000175000017500000000110006620376335016570 0ustar jojojojo#!/usr/local/bin/perl # This requires LWP to be installed. use lib '.','..'; use Boulder::Genbank; $gb = new Boulder::Genbank(-accessor=>'Entrez',-param=>[qw/M57939/]); while (my $s = $gb->get) { print "Test document

\n"; print $s->asHTML(\&wrap_long_lines); print "\n"; } sub wrap_long_lines { my ($tag,$value) = @_; if ($tag =~ /Sequence|Translation/) { $value=~s/(.{10})/$1 /g; $value = "$value"; } return ("$tag",qq{$value}); } libboulder-perl-1.30.orig/eg/genbank3.pl0100755000175000017500000000043706620376335016605 0ustar jojojojo#!/usr/local/bin/perl # This requires LWP to be installed. use lib '.','..'; use Boulder::Genbank; use Boulder::Stream; $gb = new Boulder::Genbank(-accessor=>'Entrez',-param=>[qw/M57939 M28274 L36028/]); $stream = new Boulder::Stream; while (my $s = $gb->get) { $stream->put($s); } libboulder-perl-1.30.orig/eg/quickblast.pl0100755000175000017500000001350106722055733017252 0ustar jojojojo#!/usr/local/bin/perl # do a quick BLAST search a la fasta use Getopt::Long; use IO::File; use File::Basename 'basename','dirname'; use File::Path 'mkpath','rmtree'; use Boulder::Stream; use Boulder::Blast; use strict 'vars'; use sigtrap qw(die normal-signals); use constant BLAST_DEFAULT => 'blastn'; use constant ARGS => '-gapall -hspmax 10'; use constant WUBLAST => '/usr/local/wublast/bin'; use vars qw/$DB $PROGRAM $PARAMS $DIR $STREAM $TMPDIR $DELETE_TMPDIR $BOULDER $TABULAR $CUTOFF $MINLEN $target $blast $hit $hsp/; GetOptions( 'tabular!' => \$TABULAR, 'boulder!' => \$BOULDER, 'db=s' => \$DB, 'tmp=s' => \$TMPDIR, 'dir=s' => \$DIR, 'program=s' => \$PROGRAM, 'params=s' => \$PARAMS, 'cutoff=f' => \$CUTOFF, 'minlen=f' => \$MINLEN, ) || die < [options] Run BLAST on one or more sequences and summarize the results. The source database is an ordinary fasta file. The program runs pressdb to create a temporary blast database in /usr/tmp (or other location of TMPDIR). Options: -source source fasta database (no default) -dir Where to save intermediate results in directory (don\'t save) -tmp -program Variant of BLAST to run (blastn) -params Parameters to pass to program -minlen Minimum HSP length, as fraction of total search length (0.0) -cutoff Minimum significance cutoff -tabular Produce output in tabular format -boulder Produce output in boulder format (default) USAGE ; my $WUBLAST = -d WUBLAST ? WUBLAST : dirname(`which blastn`); die "Can't find blast" unless -x "$WUBLAST/blastn"; my $PRESSDB = "$WUBLAST/pressdb"; my $SETDB = "$WUBLAST/setdb"; my $DO_UNLINK = 0; # parameter consistency checking $TMPDIR ||= $ENV{TMPDIR} || '/usr/tmp'; $DB || die "Specify database to search. Try -h for help.\n"; $PROGRAM ||= BLAST_DEFAULT; $BOULDER = !$TABULAR if defined $TABULAR; $BOULDER++ if !defined($BOULDER) && !defined($TABULAR); $PARAMS = ARGS unless defined $PARAMS; if ($DIR) { die "Specify a valid directory for output files. Try -h for help.\n" unless -d $DIR || mkpath($DIR); } else { $DIR = make_tmpdir(); $DELETE_TMPDIR++; } $CUTOFF = 0.01 unless defined($CUTOFF); $MINLEN ||= 0.0; die "minimum length must be between 0.0 and 1.0" unless $MINLEN >= 0.0 && $MINLEN <= 1.0; $target = pressdb($DB,$PROGRAM); $STREAM = new Boulder::Stream if $BOULDER; $|=1; { # localize input record local($/) = ">"; while (<>) { chomp; next unless my($description,@dna) = split("\n"); my ($identifier) = $description=~/^(\S+)/; my $output_file = "$DIR/$identifier.blast"; my $blast = IO::File->new("| $PROGRAM $target - $PARAMS > $output_file 2>/dev/null ") || die "Couldn't open BLAST program: $!"; print $blast ">$description\n"; foreach (@dna) { print $blast $_,"\n"; } $blast->close; die "Error during execution of blast program, status code $? ($!)\n" if $?; summarize_results($output_file); } } sub summarize_results { my $file = shift; $blast = Boulder::Blast->parse($file); return unless $blast->Blast_hits; # this code is called to write out the boulder stream, if requested if ($BOULDER) { $STREAM->put($blast); return; } # if we get here, we're producing the tabular summary # find the longest hit foreach $hit ($blast->Blast_hits) { next unless $hit->Signif < $CUTOFF; foreach $hsp ($hit->Hsps) { next unless $hsp->Signif < $CUTOFF; next unless abs($hsp->Length/$hit->Length) >= $MINLEN; write; } } } sub pressdb { my $db = shift; my $program = shift; # see if there's already a suitable database around if ($ENV{BLASTDB}) { my @paths = split(':',$ENV{BLASTDB}); foreach (@paths) { next unless -r "$_/$db.csq"; undef $DO_UNLINK; return "$_/$db"; } } # find a suitable temporary name $target = "$TMPDIR/quickblast${$}aaaa"; my $tries = 100; while (--$tries) { last if IO::File->new($target,O_WRONLY|O_EXCL|O_CREAT,0600); $target++; } return unless $tries; my $base = basename($db); # Convert source file into temporary file for processing if ($program =~ /^(blastp|blastx)$/) { # setdb is a pain in the a**. We have to play stupid little # tricks to get it to work the way we want it to work unless ($db =~ m!^/!) { # resolve relative paths require Cwd; my $cwd = Cwd::cwd(); $db = "$cwd/$db"; } unlink $target; symlink($db,$target) || die "Couldn't symlink $db=>$target"; system($SETDB,'-t',$base,$target); } else { system($PRESSDB,'-o',$target,'-t',$base,$db); } die "Couldn't make temporary BLAST database: $!" if $?; $DO_UNLINK++; return $target; } sub make_tmpdir { my $dir = "$TMPDIR/quickblastout${$}aaaa"; my $tries = 100; while (--$tries) { my ($success) = mkpath ($dir++,0,0700); return $success if $success; } } format STDOUT_TOP= Search Target Score P Len % Idnty Search Range Target Range . format STDOUT= @<<<<<<<<<<<<<<<<<<<<<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @##### @>>>>>>>> @##### @>>>> @>>>>>> @>>>>>> @>>>>>>> @>>>>>> { $blast->Blast_query ." (".$blast->Blast_query_length.")", $hit->Name ." (".$hit->Length.")", $hsp->Score,$hsp->Signif,$hsp->Length, $hsp->Identity, $hsp->Query_start,$hsp->Query_end, ($hsp->Orientation eq 'plus' ? ( $hsp->Subject_start,$hsp->Subject_end ) : ( $hsp->Subject_end,$hsp->Subject_start ) ) } . # tidy up by removing temporary database END { if ($target && $DO_UNLINK) { unlink $target; unlink <$target.*>; } rmtree ($DIR) if $DIR && $DELETE_TMPDIR; } libboulder-perl-1.30.orig/eg/test.pl0100755000175000017500000000131506620376335016070 0ustar jojojojo#!/usr/local/bin/perl use lib '.','..'; use Stone; my $stone = Stone->new(name=>'fred', age=>30, address=>{ Mail=>{ street=>'19 Gravel Path', town=>'Bedrock', ZIP=>'12345' }, Electronic=>{ fax=>'111,1111', email=>'foo@bar.com' } }, phone=>{ day=>[[qw/xxxx-xxxx yyy-yyyy/], [qw/111-1111 333-3333/] ], eve=>'222-2222' }, friends=>[qw/amy agnes wendy joe/], preferences=>{ candy=>[qw/sweet chocolate caramel/], sports=>[qw/basketball baseball/], dining=>[qw/ethnic/] } ); print $stone->asTable; libboulder-perl-1.30.orig/t/0040755000175000017500000000000007777761464014443 5ustar jojojojolibboulder-perl-1.30.orig/t/stream.t0100755000175000017500000000330107021033100016051 0ustar jojojojo#!/usr/local/bin/perl -w use lib '..','../blib/lib','../blib/arch'; use Boulder::Stream; BEGIN { unlink "test.stream"; $^W = 0; } END { unlink "test.stream"; } sub test ($$;$) { my($num, $true,$msg) = @_; print($true ? "ok $num\n" : "not ok $num $msg\n"); } my($s,@s); open (OUT,">test.stream"); print "1..20\n"; test 1,$stream = new Boulder::Stream(\*STDIN,\*OUT); test 2,$s = new Stone; test 3,$s = new Stone(name=>'fred',age=>30); test 4,join(' ',sort $s->tags) eq 'age name'; $s->insert(sex=>M); test 5,join(' ',sort $s->tags) eq 'age name sex'; $s->insert(address=>{ street=>'19 Gravel Path', town=>'Bedrock', ZIP=>'12345'}, phone=>{ day=>'111-1111', eve=>'222-2222' } ); test 6,join(' ',sort $s->tags) eq 'address age name phone sex'; $s->delete('sex'); test 7,join(' ',sort $s->tags) eq 'address age name phone'; test 8,$s->get('age') == 30; test 9,ref($s->get('address')) eq 'Stone'; test 10,$s->get('address')->get('town') eq 'Bedrock'; test 11,$s->index('phone.day') eq '111-1111'; $s->get('phone')->insert('day'=>'999-9999'); test 12,scalar(@s = $s->index('phone.day')) == 2; test 13,join(' ',sort(@s)) eq '111-1111 999-9999'; test 14,$stream->put($s); $s = new Stone (name=>'george', age=>23, address=>{ street=>'29 Rockland drive', town=>'Fort Washington', ZIP=>'77777' } ); test 15,$stream->put($s); close OUT; test 16,open(IN,"test.stream"); test 17,$stream = new Boulder::Stream(\*IN,\*STDOUT); $s = $stream->get; test 18,join(' ',sort($s->index('phone.day'))) eq '111-1111 999-9999'; $s = $stream->get; test 19,$s->index('address.town') eq 'Fort Washington'; test 20,!$stream->get; libboulder-perl-1.30.orig/t/store.t0100755000175000017500000000434307021032676015742 0ustar jojojojo#!/usr/local/bin/perl use lib '..','../blib/lib','../blib/arch'; use Boulder::Store; BEGIN { unlink ; $^W = 0; } END { unlink ; } sub test ($$;$) { my($num, $true,$msg) = @_; print($true ? "ok $num\n" : "not ok $num $msg\n"); } my($s,@s,$store); print "1..29\n"; test 1,$store=new Boulder::Store('test.db',1); test 2,$s = new Stone (name=>'george', age=>23, sex=>M, address=>{ street=>'29 Rockland drive', town=>'Fort Washington', ZIP=>'77777' } ); test 3,$store->write_record($s) ne ''; test 4,$store->write_record(new Stone(name=>'fred', age=>30, sex=>M, address=>{ street=>'19 Gravel Path', town=>'Bedrock', ZIP=>'12345'}, phone=>{ day=>'111-1111', eve=>'222-2222' } )); test 5,$store->write_record(new Stone(name=>'andrew', age=>18, sex=>M)); test 6,$store->write_record(new Stone(name=>'gertrude', age=>46, sex=>F)); test 7,$store->write_record(new Stone(name=>'abigail', age=>29, sex=>F)); test 8,$store->write_record(new Stone(name=>'james', age=>34, sex=>M)); test 9,4 == $store->put(new Stone(name=>'angelique', age=>16, sex=>F), 4); test 10,5 == $store->put(new Stone(name=>'mabel', age=>16, record_no=>5, sex=>F) ); test 11,$store->add_index('age'); undef $store; test 12,$store=new Boulder::Store('test.db'=>1); test 13,$s=$store->get(0); test 14,$s->index('address.ZIP') eq '77777'; test 15,$s=$store->get(2); test 16,$s->index('age') == 18; test 17,$s=$store->get(5); test 18,$s->index('age') == 16; test 19,$store->query('sex'=>F) and @s = $store->read_record(); test 20,$s[0]->get('sex') eq 'F'; test 21,@s==3; test 22,$store->query(eval=>' > 25') and @s = $store->read_record(); $ok = 1; foreach (@s) { undef $ok unless $_->get('age') > 25; } test 23,$ok; test 24,@s==2; test 25,$store->add_index('name'); test 26,$store->query('name'=>'mabel') and @s = $store->read_record();; test 27,@s==1; test 28,$s[0]->get('age') == 16 and $s[0]->get('name') eq 'mabel'; test 29,!($store->query('name'=>'foobar') and @s = $store->read_record());