perlindex-1.606/0000755000175000017500000000000012072367011011665 5ustar upfupfperlindex-1.606/perlindex.PL0000644000175000017500000005341312072366004014123 0ustar upfupf#!/usr/local/bin/perl use Config; use File::Basename qw(&basename &dirname); # List explicitly here the variables you want Configure to # generate. Metaconfig only looks for shell variables, so you # have to mention them as if they were shell variables, not # %Config entries. Thus you write # $startperl # to ensure Configure will look for $Config{startperl}. # This forces PL files to create target in same directory as PL file. # This is so that make depend always knows where to find PL derivatives. chdir(dirname($0)); ($file = basename($0)) =~ s/\.PL$//; $file =~ s/\.pl$// if ($Config{'osname'} eq 'VMS' or $Config{'osname'} eq 'OS2'); # "case-forgiving" open OUT,">$file" or die "Can't create $file: $!"; print "Extracting $file (with variable substitutions)\n"; chmod 0775, $file; # In this section, perl variables will be expanded during extraction. # You can use $Config{...} to use Configure variables. print OUT <<"!GROK!THIS!"; $Config{'startperl'} eval 'exec perl -S \$0 "\$@"' if 0; !GROK!THIS! print OUT <<'!NO!SUBS!'; # -*- Mode: Perl -*- # Author : Ulrich Pfeifer # Created On : Mon Jan 22 13:00:41 1996 # Last Modified On: Sun Jan 6 22:26:28 2013 # Language : Perl # Update Count : 399 # Status : Unknown, Use with caution! # # (C) Copyright 1996-2005, Ulrich Pfeifer, all rights reserved. # This file is free software; you can redistribute it and/or modify it # under the same terms as Perl itself. # # # %SEEN is used to store the mtime and absolute pathes to # files which have been indexed. # # %FN $FN{'last'} greatest documentid # $FN{$did} a pair of $mtf and $filename where $mtf is the # number of occurances of the most frequent word in # the document with number $did. # # %IDF $IDF{'*all*'} number of documents (essentially the same as # $FN{'last'}) # $IDF{$word} number of documents containing $word # # %IF $IF{$word} list of pairs ($docid,$tf) where $docid is # the number of a document containing $word $tf # use Fcntl; use less 'time'; use Getopt::Long; use File::Basename; use Text::English; use Config; # NDBM_File as LAST resort BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File SDBM_File ODBM_File NDBM_File) } use AnyDBM_File; !NO!SUBS! eval "\$x = pack 'w', 1;"; if ($x eq "\001") { print OUT "\$p = 'w'; # compressed int patch available\n"; } else { print OUT "\$p = 'S'; # change to 'I' for large collections\n"; } $index_dir = $Config{'man1direxp'}; $index_dir =~ s:/[^/]*$::; $index_dir = $ENV{'INDEXDIR'} if $ENV{'INDEXDIR'}; print OUT <<"EOC"; \$nroff = \'$Config{'nroff'}\' || \'nroff\'; \$man1direxp = \'$Config{'man1direxp'}\'; \$man3direxp = \'$Config{'man3direxp'}\'; \$IDIR = \'$index_dir\'; \$prefix = \'$Config{'prefix'}\'; EOC ; # Use Term::ReadKey for character-at-a-time input if available. # Else use "stty cbreak" if BSD, "stty icanon" if non-BSD. if (eval 'require Term::ReadKey') { print OUT "use Term::ReadKey;\n"; $BSD_STYLE= -1; } elsif ($Config{'d_bsd'}) { $BSD_STYLE= 1; } else { $BSD_STYLE= 0; } # # Let's look for a stemmer # eval "use Text::English;"; # unless ($@) { # $stemmer = \&Text::English::stem; # } else { # eval "require 'Stem.pl';"; # unless ($@) { # $stemmer = \&stem; # } # } # In the following, perl variables are not expanded during extraction. print OUT <<'!NO!SUBS!'; $pager= $Config{'pager'}; if ($Config{'osname'} eq 'hpux') { $pager= "col | $pager"; } if (exists $ENV{PAGER}) { $pager=$ENV{PAGER} } $stemmer = \&Text::English::stem; $debug = 0; $opt_index = ''; # make perl -w happy $opt_menu = 1; $opt_maxhits = 20; $opt_cbreak = 1; &GetOptions( 'index', 'cbreak!', 'maxhits=i', 'menu!', 'verbose', 'dict:i', 'idir=s', ) || die "Usage: $0 [-index] [words ...]\n"; if (defined $opt_idir) { $IDIR = $opt_idir; # avoid to many changes below. } if (defined $opt_dict) { $opt_dict ||= 100; } my $GC_REQUIRED = 0; # garbage collect required? Global variable. if ($opt_index) { # check whether we can use Pod::Text to extract POD $PodText = 0; eval { require Pod::Text; print "Using Pod::Text\n"; $PodText = 1; }; $IoScalar = 0; eval { require IO::Scalar; print "Using IO::Scalar\n"; $IoScalar = 1; }; &initstop; tie (%IF, AnyDBM_File, "$IDIR/index_if", O_CREAT|O_RDWR, 0644) or die "Could not tie $IDIR/index_if: $!\n"; tie (%IDF, AnyDBM_File, "$IDIR/index_idf", O_CREAT|O_RDWR, 0644) or die "Could not tie $IDIR/index_idf: $!\n"; tie (%SEEN, AnyDBM_File, "$IDIR/index_seen", O_CREAT|O_RDWR, 0644) or die "Could not tie $IDIR/index_seen: $!\n"; tie (%FN, AnyDBM_File, "$IDIR/index_fn", O_CREAT|O_RDWR, 0644) or die "Could not tie $IDIR/index_fn: $!\n"; require File::Find; unless (@ARGV) { # breaks compatibility :-( my %seen; my @perllib = grep(length && -d && !$seen{$_}++, @Config{qw(installprivlib installarchlib installsitelib installvendorlib installscript installsitearch)}); for $dir (@perllib) { print "Scanning $dir ... \n"; if (-l $dir) { # debian symlinks installarchlib but we do not want to follow links in general my $target = readlink $dir; $dir =~ s:[^/]+$:$target:; } File::Find::find( \&wanted, $dir ); } } for $name (@ARGV) { add_to_index($name); } # Check if all (previuosly) indexed files are still available # This may take some time. warn "Validating index ...\n"; while (my ($fns, $value) = each %SEEN) { my $path = $fns; $path = $prefix.'/'.$path unless $path =~ m:^/:; unless (-f $path) { my ($mtime, $did) = unpack "$p$p", $value; # mark document as deleted warn "Marking document $did ($fns) as deleted\n"; delete $FN{$did}; delete $SEEN{$fns}; $GC_REQUIRED++; } } if ($GC_REQUIRED) { print STDERR "Garbage collecting\r"; # garbage collection, this is awfully slow my $progress = 0; my $words = keys %IF; my %if_new; tie (%if_new, AnyDBM_File, "$IDIR/index_if.new", O_CREAT|O_RDWR, 0644) or die "Could not tie $IDIR/index_if: $!\n"; while (my ($word,$list) = each %IF) { print STDERR "Garbage collecting ".(++$progress)."/".$words."\r"; my %post = unpack($p.'*',$list); #delete $IF{$word}; $IDF{$word} = 0; while (my ($did,$tf) = each %post) { if (exists $FN{$did}) { $if_new{$word} = '' unless defined $if{$word}; # perl -w $if_new{$word} .= pack($p.$p, $did, $tf); $IDF{$word}++; } } } untie %if_new; untie %IF; opendir(IDX, $IDIR) or die "Could not read dir '$IDIR': $!"; for $file (readdir DIR) { my $old = $file; if ($file =~ s/^index_if\.new/index_if/) { rename "$IDIR/$old", "$IDIR/$file"; } } print STDERR "\rGarbage collecting ... done"; } untie %IF unless $GC_REQUIRED; untie %IDF; untie %FN; untie %SEEN; } elsif ($opt_dict) { tie (%IDF, AnyDBM_File, "$IDIR/index_idf", O_RDONLY, 0644) or die "Could not tie $IDIR/index_idf: $!\n". "Did you run '$0 -index'?\n"; while (($key,$val) = each %IDF) { printf "%-20s %d\n", $key, $val if $val >= $opt_dict; } untie %IDF; } else { tie (%IF, AnyDBM_File, "$IDIR/index_if", O_RDONLY, 0644) or die "Could not tie $IDIR/index_if: $!\n". "Did you run '$0 -index'?\n"; tie (%IDF, AnyDBM_File, "$IDIR/index_idf", O_RDONLY, 0644) or die "Could not tie $IDIR/index_idf: $!\n"; tie (%FN, AnyDBM_File, "$IDIR/index_fn", O_RDONLY, 0644) or die "Could not tie $IDIR/index_fn: $!\n"; &search(@ARGV); untie %IF; untie %IDF; untie %FN; untie %SEEN; } sub wanted { my $fns = $File::Find::name; if ($File::Find::name eq $man3direxp) { $File::Find::prune = 1; } if (-f $_ and $File::Find::name =~ /man|bin|\.(pod|pm|txt)$/) { add_to_index($File::Find::name); } } sub index { my $fn = shift; my $fns = shift; my $did = shift; my %tf; my $maxtf = 0; if ($fn =~ /\.txt$/) { open (IN, "<$fn") || warn "Could not open $fn: $!\n", return (0); while ($line = ) { warn "=> $line\n" if $debug; for $word (&normalize($line)) { next if $stop{$word}; $tf{$word}++; } } close IN; } elsif($PodText and -T $fn) { my $result; my $parser = Pod::Text->new(sentence => 0, width => 78); if ($IoScalar) { my $tmpfile = new IO::Scalar; open(IN,"<$fn"); $parser->parse_from_filehandle(*IN, $tmpfile); warn "===> $fn, $tmpfile" if $debug; #for my $line (split /\n/, $tmpfile) { while ($tmpfile =~ s/^(.*\n)//) { warn "=> $1\n" if $debug; for $word (&normalize($1)) { next if $stop{$word}; $tf{$word}++; } } close IN; } else { my $tmpfile = "$IDIR/tmppod.txt"; $parser->parse_from_file($fn, $tmpfile); open (IN, "<$tmpfile") || warn "Could not open $fn: $!\n", return (0); while ($line = ) { warn "=> $line\n" if $debug; for $word (&normalize($line)) { next if $stop{$word}; $tf{$word}++; } } close IN; unlink $tmpfile; } } else { # no Pod::Text found open(IN, "<$fn") || warn "Could not open $fn: $!\n", return (0); while ($line = ) { warn "=> $line\n" if $debug; if ($line =~ /^=head/) { $pod = 1; } elsif ($line =~ /^=cut/){ $pod = 0; } else { next unless $pod; } for $word (&normalize($line)) { next if $stop{$word}; $tf{$word}++; } } close(IN); } for $tf (values %tf) { $maxtf = $tf if $tf > $maxtf; } for $word (keys %tf) { $IDF{$word}++; $IF{$word} = '' unless defined $IF{$word}; # perl -w $IF{$word} .= pack($p.$p, $did, $tf{$word}); } $FN{$did} = pack($p, $maxtf).$fns; print STDERR "$fns\n"; 1; } sub add_to_index { my ($name) = @_; my $fns = $name; $fns =~ s:\Q$prefix/::; if (exists $SEEN{$fns}) { my ($mtime, $did) = unpack "$p$p", $SEEN{$fns}; if ((stat $name)[9] > $mtime) { # mark document as deleted delete $FN{$did}; warn "Marking document $did ($name) as deleted\n"; $GC_REQUIRED++; } else { # index up to date next; } } next unless -f $name; if ($name !~ /(~|,v)$/) { $did = $FN{'last'}++; if (&index($name, $fns, $did)) { my ($mtime) = (stat $name)[9]; $SEEN{$fns} = pack "$p$p", (stat $name)[9], $did; } } } sub normalize { my $line = join ' ', @_; my @result; $line =~ tr/A-Z/a-z/; $line =~ tr/a-z0-9_/ /cs; for $word (split / /, $line ) { $word =~ s/^\d+//; next unless length($word) > 2; if ($stemmer) { push @result, &$stemmer($word); } else { push @result, $word; } } @result; } sub search { my %score; my $maxhits = $opt_maxhits; my (@unknown, @stop); &initstop if $opt_verbose; for $word (normalize(@_)) { unless ($IF{$word}) { if ($stop{$word}) { push @stop, $word; } else { push @unknown, $word; } next; } my %post = unpack($p.'*',$IF{$word}); my $idf = log($FN{'last'}/$IDF{$word}); for $did (keys %post) { # skip deleted documents next unless exists $FN{$did}; my ($maxtf) = unpack($p, $FN{$did}); $score{$did} = 0 unless defined $score{$did}; # perl -w $score{$did} += $post{$did} / $maxtf * $idf; } } if ($opt_verbose) { print "Unkown: @unknown\n" if @unknown; print "Ingnore: @stop\n" if @stop; } if ($opt_menu) { my @menu; my $answer = ''; my $no = 0; my @s = ('1' .. '9', 'a' .. 'z'); my %s; for $did (sort {$score{$b} <=> $score{$a}} keys %score) { my ($mtf, $path) = unpack($p.'a*', $FN{$did}); my $s = $s[$no]; push @menu, sprintf "%s %6.3f %s\n", $s, $score{$did}, $path; $s{$s} = ++$no; last unless --$maxhits; } &cbreak('on') if $opt_cbreak; while (1) { print @menu; print "\nEnter Number or 'q'> "; if ($opt_cbreak) { read(TTYIN,$answer,1); print "\n"; } else { $answer = ; } last if $answer =~ /^q/i; $answer = ($s{substr($answer,0,1)})-1; if ($answer >= 0 and $answer <= $#menu) { my $selection = $menu[$answer]; chomp($selection); my ($no, $score, $path) = split ' ', $selection, 3; $path = $prefix.'/'.$path unless $path =~ m:^/:; if ($path =~ /\.txt$/) { my $pdf = $path; $pdf =~ s:pages/(\S+)_(\d+)\.txt$:$1.pdf:; my $page = $2+0; my $endp = $page+2; my $tmp = "/tmp/perlinde$$.pdf"; if (-f $pdf) { print "pdftk A=$pdf cat $page-$endp output $tmp\n"; system "pdftk", "A=".$pdf, 'cat', "$page-$endp", 'output', $tmp and system "pdftk", "A=".$pdf, 'cat', "$page-end", 'output', $tmp; system "acroread", $tmp; unlink $tmp; } else { print STDERR "$pager '$path'\n"; system $pager, $path; } } elsif ($selection =~ m:/man:) { my ($page, $sect) = ($selection =~ m:([^/]*)\.(.{1,3})$:); print STDERR "Running man $sect $page\n"; system 'man', $sect, $page; } else { print STDERR "Running pod2man $path\n"; system "pod2man --official $path | $nroff -man | $pager"; } } else { my $path = $prefix."/bin/perlindex"; system "pod2man --official $path | $nroff -man | $pager"; } } &cbreak('off') if $opt_cbreak; } else { for $did (sort {$score{$b} <=> $score{$a}} keys %score) { printf("%6.3f %s\n", $score{$did}, (unpack($p.'a*', $FN{$did}))[1]); last unless --$maxhits; } } } sub cbreak { my $mode = shift; if ($mode eq 'on') { open(TTYIN, "/dev/tty") || die "can't write /dev/tty: $!"; select(TTYOUT); $| = 1; select(STDOUT); $SIG{'QUIT'} = $SIG{'INT'} = 'cbreak'; !NO!SUBS! ; if ($BSD_STYLE == -1) { print OUT "\tReadMode 3; # Set cbreak mode\n"; } elsif ($BSD_STYLE) { print OUT "\tsystem \"stty cbreak /dev/tty 2>&1\";\n"; #print OUT "\tsystem \"stty cbreak echo /dev/tty 2>&1\";\n"; } else { print OUT "\tsystem \"stty\", '-icanon', 'eol', \"\\001\";\n"; } print OUT " } else {\n"; if ($BSD_STYLE == -1) { print OUT "\tReadMode 0; # Restore non-cbreak mode\n"; } elsif ($BSD_STYLE) { print OUT "\tsystem \"stty -cbreak /dev/tty 2>&1\";\n"; #print OUT "\tsystem \"stty -cbreak echo /dev/tty 2>&1\";\n"; } else { print OUT "\tsystem \"stty\", 'icanon', 'eol', '^\@'; # ascii null\n"; } print OUT <<'!NO!SUBS!'; } } $stopinited = 0; # perl -w sub initstop { return if $stopinited++; while () { next if /^\#/; for (normalize($_)) { $stop{$_}++; } } } =head1 NAME perlindex - index and query perl manual pages =head1 SYNOPSIS perlindex -index perlindex tell me where the flowers are =head1 DESCRIPTION "C" generates an AnyDBM_File index which can be searched with free text queries "C I". Each word of the query is searched in the index and a score is generated for each document containing it. Scores for all words are added and the documents with the highest score are printed. All words are stemed with Porters algorithm (see L) before indexing and searching happens. The score is computed as: $score{$document} += $tf{$word,$document}/$maxtf{$document} * log ($N/$n{$word}); where =over 10 =item C<$N> is the number of documents in the index, =item C<$n{$word}> is the number of documents containing the I, =item C<$tf{$word,$document}> is the number of occurances of I in the I, and =item C<$maxtf{$document}> is the maximum freqency of any word in I. =back =head1 OPTIONS All options may be abreviated. =over 10 =item B<-maxhits> maxhits Maximum numer of hits to display. Default is 15. =item B<-menu> =item B<-nomenu> Use the matches as menu for calling C. Default is B<-menu>.q =item B<-cbreak> =item B<-nocbreak> Switch to cbreak in menu mode or dont. B<-cbreak> is the default. =item B<-verbose> Generates additional information which query words have been not found in the database and which words of the query are stopwords. =back =head1 EXAMPLE perlindex foo bar 1 3.735 lib/pod/perlbot.pod 2 2.640 lib/pod/perlsec.pod 3 2.153 lib/pod/perldata.pod 4 1.920 lib/Symbol.pm 5 1.802 lib/pod/perlsub.pod 6 1.586 lib/Getopt/Long.pm 7 1.190 lib/File/Path.pm 8 1.042 lib/pod/perlop.pod 9 0.857 lib/pod/perlre.pod a 0.830 lib/Shell.pm b 0.691 lib/strict.pm c 0.691 lib/Carp.pm d 0.680 lib/pod/perlpod.pod e 0.680 lib/File/Find.pm f 0.626 lib/pod/perlsyn.pod Enter Number or 'q'> Hitting the keys C<1> to C will display the corresponding manual page. Hitting C quits. All other keys display this manual page. =head1 FILES The index will be generated in your man directory. Strictly speaking in C<$Config{man1direxp}/..> The following files will be generated: index_fn # docid -> (max frequency, filename) index_idf # term -> number of documents containing term index_if # term -> (docid, frequency)* index_seen # fn -> indexed? =head1 AUTHOR Ulrich Pfeifer EFE =cut __END__ # freeWAIS-sf stopwords a about above according across actually adj after afterwards again against all almost alone along already also although always among amongst an and another any anyhow anyone anything anywhere are aren't around as at b be became because become becomes becoming been before beforehand begin beginning behind being below beside besides between beyond billion both but by c can can't cannot caption co co. could couldn't d did didn't do does doesn't don't down during e each eg eight eighty either else elsewhere end ending enough etc even ever every everyone everything everywhere except f few fifty first five vfor former formerly forty found " four from further g h had has hasn't have haven't he he'd he'll he's hence her here here's hereafter hereby herein hereupon hers herself him himself his how however hundred i i'd i'll i'm i've ie if in inc. indeed instead into is isn't it it's its itself j k l last later latter latterly least less let let's like likely ltd m made make makes many maybe me meantime meanwhile might million miss more moreover most mostly mr mrs much must my myself n namely neither never nevertheless next nine ninety no nobody none nonetheless noone nor not nothing now nowhere o of off often on once one one's only onto or other others otherwise our ours ourselves out over overall own p per perhaps q r rather recent recently s same seem seemed seeming seems seven seventy several she she'd she'll she's should shouldn't since six sixty so some somehow someone something sometime sometimes somewhere still stop such t taking ten than that that'll that's that've the their them themselves then thence there there'd there'll there're there's there've thereafter thereby therefore therein thereupon these they they'd they'll they're they've thirty this those though thousand three through throughout thru thus to together too toward towards trillion twenty two u under unless unlike unlikely until up upon us used using v very via w was wasn't we we'd we'll we're we've well were weren't what what'll what's what've whatever when whence whenever where where's whereafter whereas whereby wherein whereupon wherever whether which while whither who who'd who'll who's whoever whole whom whomever whose why will with within without won't would wouldn't x y yes yet you you'd you'll you're you've your yours yourself yourselves z # occuring in more than 100 files acc accent accents and are bell can character corrections crt daisy dash date defined definitions description devices diablo dummy factors following font for from fudge give have header holds log logo low lpr mark name nroff out output perl pitch put rcsfile reference resolution revision see set simple smi some string synopsis system that the this translation troff typewriter ucb unbreakable use used user vroff wheel will with you !NO!SUBS! perlindex-1.606/MANIFEST0000644000175000017500000000024012072365021013011 0ustar upfupfChangeLog MANIFEST README Makefile.PL perlindex.PL t/basic.t lib/Text/English.pm META.yml Module meta-data (added by MakeMaker) perlindex-1.606/README0000644000175000017500000000113012072365021012537 0ustar upfupfThis is the perlindex distribution Perlindex is a program to index and search the perl documentation. The newest version is always available from CPAN/authors/id/ULPFR. Perlindex should work with any Perl >= 5.002 at least. With Perl >= 5.003_07 it will use compress integers which results in smaller indices. To install do: perl Makefile.PL make install perlindex -index You may give additional files to index on the command line. I'd recommend to specify the perl utils (perldoc, h2xs, ...): perlindex -index /usr/local/ls6/perl-5.002/bin/* Ulrich Pfeifer perlindex-1.606/Makefile.PL0000644000175000017500000000206512072365021013641 0ustar upfupf#!/usr/bin/perl # -*- Mode: Perl -*- # Author : Ulrich Pfeifer # Created On : Tue May 27 17:27:28 1997 # Last Modified On: Mon Nov 17 19:47:45 2008 # Language : CPerl # Update Count : 36 # # (C) Copyright 1997-2005, Ulrich Pfeifer, all rights reserved. This # file is free software; you can redistribute it and/or modify it # under the same terms as Perl itself. $VERSION = "1.606"; use ExtUtils::MakeMaker; # See lib/ExtUtils/MakeMaker.pm for details of how to influence # the contents of the Makefile that is written. WriteMakefile( 'NAME' => 'perlindex', 'VERSION' => $VERSION, 'LIBS' => [''], # e.g., '-lm' 'DEFINE' => '', # e.g., '-DHAVE_SOMETHING' 'INC' => '', # e.g., '-I/usr/include/other' 'dist' => { SUFFIX => "gz", COMPRESS => "gzip -f"}, # we do bundle that module in the distribution # 'PREREQ_PM' => { 'Text::English' => 0 }, 'EXE_FILES' => [ 'perlindex' ], 'clean' => { 'FILES' => 'perlindex' }, ); perlindex-1.606/ChangeLog0000644000175000017500000001152112072365021013436 0ustar upfupf2013-01-06 Ulrich Pfeifer * Do not decrement IDF in GC code. I am not sure why the previous code could produce negative document frequencies. 2011-01-30 Damyan Ivanov * Description: Replace usage of 'find.pl' by File::Find find.pl is scheduled for removal from the perl core 2008-11-17 Ulrich Pfeifer * perlindex.PL: applied typo patch from Slaven Rezic (http://rt.cpan.org/Ticket/Display.html?id=40890) 2008-10-19 Ulrich Pfeifer * perlindex.PL: Garbage collect can not change the index while scanning it. Doing a copy now. * perlindex.PL: Fixed $gc_required scoping error. Removed code duplication. Added progress indication for GC collect. * perlindex.PL: Fixed the indexing of the default directories (code duplication needs to be removed) and added checking for removed files (http://rt.cpan.org/Ticket/Display.html?id=39863). 2008-10-19 Ulrich Pfeifer * perlindex.PL: Added support for updating documents as requested by SREZIC in http://rt.cpan.org/Ticket/Display.html?id=39862. Update is transparent - except for the time used in the garbage collect phase. 2006-07-02 Ulrich Pfeifer * perlindex.PL (index): moving the check for Pod::Text in the index function. The result went out of scope before (thanks Florian for the bug report). Fixing the loop variable in the main loop (thanks Florian for the bug report). Adding support for IO::Scalar. 2006-03-19 Ulrich Pfeifer * Fixed a bug reported by Florian Ragwitz: Absolute filenames were access incorrectly from hit list. 2005-09-18 Ulrich Pfeifer * Integrated a patch from Marek Rouchal to use Pod::Text for parsing if available. I did rework the patch a little to remove the need for IO::Scalar which does no seem to be part of debian sarge. Now underscore is also a valid letter. 2005-04-10 Ulrich Pfeifer * Re-Added Text::English as it does not seem to be available separately. I'd rather not package a "foreign" separately. 2005-04-03 Ulrich Pfeifer * Removed Text::English from Distribution * Determine pager and search path at run time (Patch from Marek, Ticket #4506) 2004-05-02 Florian Ragwitz * please parameterize the $IDIR in the make process * clarified license * clarified authorship of Text::English 2003-06-19 Slaven Rezic * t/basic.t: made test more safe and portable 2003-06-18 Ulrich Pfeifer Added some regression tests before the CPAN testers beat me up. 2003-06-18 Slaven Rezic [cpan #2820] Fix indexed directories In some perl installations, installsitelib is not part of the privlib directory. That is, privlib is something like /usr/local/lib/perl5/5.8.0 and installsitelib something like /usr/local/lib/perl5/site_perl/5.8.0. This causes the perlindex indexer not to dive into installsitelib. Sun Mar 10 13:05:46 MET 1996 "Chuck D. Phillips (NON-HP Employee)" By default, the program perlindex isn't deleted when you do a "make clean" the result is that the Config.pm constants don't get updated next time you do a "make all". To fix this, you can add the following line to the WriteMakefile() parameters in Makefile.PL: 'clean' => { 'FILES' => 'perlindex' }, Nit: Term::ReadKey is more reliable for cbreak than using $d_bsd. On HPUX, setting BSD_Style to either 0 or 1 doesn't quite work right. I've hacked my own copy of perlindex.PL to prefer Term::ReadKey if available. Otherwise, it defaults to old behavior execept that it resolves during "make all" instead of at run time. (I also insert "col" between the nroff and the pager to avoid some garbage on HPUX.) I've included the diffs at the bottom. patch7 Description: Fixed test for compressed int patch. Fri Mar 8 20:26:27 MET 1996 Ulrich Pfeifer patch6 Description: man3direxp will not be indexed any more. Even if inside of privlibexp. Fixed bug with -nomenu. Did eat characters ;-) cbreak for non bsd systems (hopefully). Wed Feb 28 13:45:28 MET 1996 Ulrich Pfeifer patch5 Description: Nroff, man1direxp, privlibexp, prefix and pager are now determined at extraction time. Nroff and pager were hardcoded before. Fri Feb 23 11:22:12 MET 1996 Ulrich Pfeifer . Description: Fixed version computation. Fixed menu numbering. First hit could not be selected before. Added chmod 0755 to perlindex.PL Thu Feb 22 19:17:54 MET 1996 Ulrich Pfeifer . Description: New version numbering. Added cbreak mode. Thu Feb 22 16:43:57 MET 1996 Ulrich Pfeifer . Description: perlindex-1.606/META.yml0000664000175000017500000000072712072367011013146 0ustar upfupf--- #YAML:1.0 name: perlindex version: 1.606 abstract: ~ author: [] license: unknown distribution_type: module configure_requires: ExtUtils::MakeMaker: 0 build_requires: ExtUtils::MakeMaker: 0 requires: {} no_index: directory: - t - inc generated_by: ExtUtils::MakeMaker version 6.57_05 meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: 1.4 perlindex-1.606/t/0000755000175000017500000000000012072367011012130 5ustar upfupfperlindex-1.606/t/basic.t0000644000175000017500000000227012072365021013376 0ustar upfupf#!/usr/bin/perl -w # -*- Mode: Perl -*- # $Basename$ # $Revision: 1.3 $ # Author : Ulrich Pfeifer # Created On : Wed Jun 18 19:44:37 2003 # Last Modified By: Ulrich Pfeifer # Last Modified On: Tue Oct 21 10:29:42 2008 # Language : CPerl # # (C) Copyright 2003, UUNET Deutschland GmbH, Germany # use strict; use Test; BEGIN { if (!eval { require File::Temp; require File::Spec; require Cwd; 1; }) { print "1..0 # SKIP: File::Temp and/or File::Spec not available, skipping tests\n"; exit(0); } File::Temp->import(qw(tempdir)); } BEGIN { plan tests => 2, todo => [] } sub run { my ($cmd, $test) = @_; local $/; open(SUB, "$^X $cmd < " . File::Spec->devnull . " 2>&1 |") or die $!; my $result = ; close SUB or return; return &$test($result); } my $tmp = tempdir(CLEANUP => 1); my $cwd = Cwd::getcwd(); ok( run( "-Mblib ./perlindex -idir $tmp --index $cwd/README $cwd/MANIFEST $cwd/perlindex.PL", sub { print "[[$_[0]]]\n"; $_[0] =~ /MANIFEST/ } ) ); ok( run( "-Mblib ./perlindex -idir $tmp --nomenu index", sub { print "[[$_[0]]]\n"; $_[0] =~ /perlindex.PL/ } ) ); perlindex-1.606/lib/0000755000175000017500000000000012072367011012433 5ustar upfupfperlindex-1.606/lib/Text/0000755000175000017500000000000012072367011013357 5ustar upfupfperlindex-1.606/lib/Text/English.pm0000644000175000017500000000776312072365021015322 0ustar upfupf#!/usr/bin/perl # -*- Mode: Perl -*- # Author : Ian Phillipps # Last Modified On: Sun May 2 15:35:33 2004 # Language : CPerl package Text::English; $VERSION = $VERSION = '0.01'; sub stem { my @parms = @_; foreach( @parms ) { $_ = lc $_; # Step 0 - remove punctuation s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//; next unless /^[a-z]+$/; # step1a_rules if( /[^s]s$/ ) { s/sses$/ss/ || s/ies$/i/ || s/s$// } # step1b_rules. The business with rule==106 is embedded in the # boolean expressions here. (/[aeiouy][^aeiouy].*eed$/ && s/eed$/ee/ ) || ( s/([aeiou].*)ed$/$1/ || s/([aeiouy].*)ing$/$1/ ) && ( # step1b1_rules s/at$/ate/ || s/bl$/ble/ || s/iz$/ize/ || s/bb$/b/ || s/dd$/d/ || s/ff$/f/ || s/gg$/g/ || s/mm$/m/ || s/nn$/n/ || s/pp$/p/ || s/rr$/r/ || s/tt$/t/ || s/ww$/w/ || s/xx$/x/ || # This is wordsize==1 && CVC...addanE... s/^[^aeiouy]+[aeiouy][^aeiouy]$/$&e/ ) #DEBUG && warn "step1b1: $_\n" ; # step1c_rules #DEBUG warn "step1c: $_\n" if s/([aeiouy].*)y$/$1i/; # step2_rules if ( s/ational$/ate/ || s/tional$/tion/ || s/enci$/ence/ || s/anci$/ance/ || s/izer$/ize/ || s/iser$/ise/ || s/abli$/able/ || s/alli$/al/ || s/entli$/ent/ || s/eli$/e/ || s/ousli$/ous/ || s/ization$/ize/ || s/isation$/ise/ || s/ation$/ate/ || s/ator$/ate/ || s/alism$/al/ || s/iveness$/ive/ || s/fulnes$/ful/ || s/ousness$/ous/ || s/aliti$/al/ || s/iviti$/ive/ || s/biliti$/ble/ ) { my ($l,$m) = ($`,$&); #DEBUG warn "step 2: l=$l m=$m\n"; $_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/; } # step3_rules if ( s/icate$/ic/ || s/ative$// || s/alize$/al/ || s/iciti$/ic/ || s/ical$/ic/ || s/ful$// || s/ness$// ) { my ($l,$m) = ($`,$&); #DEBUG warn "step 3: l=$l m=$m\n"; $_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/; } # step4_rules if ( s/al$// || s/ance$// || s/ence$// || s/er$// || s/ic$// || s/able$// || s/ible$// || s/ant$// || s/ement$// || s/ment$// || s/ent$// || s/sion$/s/ || s/tion$/t/ || s/ou$// || s/ism$// || s/ate$// || s/iti$// || s/ous$// || s/ive$// || s/ize$// || s/ise$// ) { my ($l,$m) = ($`,$&); # Look for two consonant/vowel transitions # NB simplified... #DEBUG warn "step 4: l=$l m=$m\n"; $_ = $l.$m unless $l =~ /[^aeiou][aeiouy].*[^aeiou][aeiouy]/; } # step5a_rules #DEBUG warn("step 5a: $_\n") && s/e$// if ( /[^aeiou][aeiouy].*[^aeiou][aeiouy].*e$/ || ( /[aeiou][^aeiouy].*e/ && ! /[^aeiou][aeiouy][^aeiouwxy]e$/) ); # step5b_rules #DEBUG warn("step 5b: $_\n") && s/ll$/l/ if /[^aeiou][aeiouy].*[^aeiou][aeiouy].*ll$/; # Cosmetic step s/(.)i$/$1y/; } @parms; } 1; __END__ =head1 NAME Text::English - Porter's stemming algorithm =head1 SYNOPSIS use Text::English; @stems = Text::English::stem( @words ); =head1 DESCRIPTION This routine applies the Porter Stemming Algorithm to its parameters, returning the stemmed words. It is derived from the C program "stemmer.c" as found in freewais and elsewhere, which contains these notes: Purpose: Implementation of the Porter stemming algorithm documented in: Porter, M.F., "An Algorithm For Suffix Stripping," Program 14 (3), July 1980, pp. 130-137. Provenance: Written by B. Frakes and C. Cox, 1986. I have re-interpreted areas that use Frakes and Cox's "WordSize" function. My version may misbehave on short words starting with "y", but I can't think of any examples. The step numbers correspond to Frakes and Cox, and are probably in Porter's article (which I've not seen). Porter's algorithm still has rough spots (e.g current/currency, -ings words), which I've not attempted to cure, although I have added support for the British -ise suffix. =head1 NOTES This is version 0.1. I would welcome feedback, especially improvements to the punctuation-stripping step. =head1 AUTHOR Ian Phillipps =head1 COPYRIGHT Copyright Public IP Exchange Ltd (PIPEX). Available for use under the same terms as perl. =cut