Biblio-LCC-0.09/0040755000076500007650000000000011104645144013205 5ustar nkuitsenkuitseBiblio-LCC-0.09/CHANGES0100644000076500007650000000020211104642167014171 0ustar nkuitsenkuitse2008-11-06 * version 0.09 Cleaned up for release to CPAN 2007-08-28 * version 0.08 Reimplemented as a module (again). Biblio-LCC-0.09/lib/0040755000076500007650000000000011104645144013753 5ustar nkuitsenkuitseBiblio-LCC-0.09/lib/Biblio/0040755000076500007650000000000011104645144015153 5ustar nkuitsenkuitseBiblio-LCC-0.09/lib/Biblio/LCC.pm0100644000076500007650000001740511104642222016110 0ustar nkuitsenkuitsepackage Biblio::LCC; use strict; use warnings; use vars qw($VERSION); $VERSION = '0.09'; # Normalize a class (e.g., "PN1997.5") or an actual call number # (e.g., "PN1997.5 .B3 B5 1977") sub normalize { my ($cls, $lcc) = @_; my ($alpha, $int, $frac, $rmdr) = $cls->parse($lcc); my $norm; if ($frac eq '') { if ($int eq '') { $norm = sprintf "%-3s", $alpha; } else { $norm = sprintf "%-3s%4d %s", $alpha, $int, $rmdr; } } else { $norm = sprintf "%-3s%4d.%d %s", $alpha, $int, $frac, $rmdr; } $norm =~ s/ +$//; return $norm; } # Normalize a classification range # NOTE: A range can be specified using a hyphen (e.g., 'Z105-106'), which # is the usual way, or a less-than sign (e.g., 'Z105<107'), which # lets you do specify that a call number must be strictly less than # the end of the range. If you just specify one classification (e.g., # "AS137"), the second classification is the same as the first and a # hyphen is assumed. sub normalize_range { my ($cls, $str) = @_; my ($begin, $end, $rel); $str =~ s/ +A\s*-\s*Z$//; # Strip " A-Z" at end -- common (but meaningless) idiom $str =~ s/(.+)\.([^-<.]+)\s*(<|--?)\s*\.(.+)/$1.$2$3$1.$4/; # foo.bar-.baz ==> foo.bar-foo.baz if ($str =~ /^([^-<]+)\s*(<|--?)\s*(.+)/) { # Examples: # PN1997-1999 # PN 1997-PN 1999 # PN1997<2000 ($begin, $rel, $end) = ($1, substr($2, 0, 1), $3); } elsif ($str =~ /^([^-<]+)$/) { # Examples: # KNW # RA 401.3 # HD1308.A5 ($begin, $rel, $end) = ($1, '-', $1); } else { return; # XXX } if ($end =~ /^\d+/) { $begin =~ /^([A-Z]+)/ and $end = "$1$end"; } ($begin, $end) = map { $cls->normalize($_) } ($begin, $end); if ($rel eq '-') { $end .= '~'; } return ($begin, $end); } # Parse a classification or call number sub parse { my ($cls, $lcc) = @_; my ($alpha, $int, $frac, $rmdr) = ('', '', '', ''); $lcc =~ s/^([A-Z]{1,3}) *// or die "Invalid LCC: $lcc"; $alpha = $1; if ($lcc =~ s/^(\d+)//) { $int = $1; if ($lcc =~ s/^\.(\d+)//) { $frac = $1; } $rmdr = $lcc; $rmdr =~ s/^ *\.?(?=[A-Z])/./; } $rmdr =~ s/\.(?=[A-Z])//g; $rmdr =~ s/(?<=\d)(?=[A-Z])/ /g; return ($alpha, $int, $frac, $rmdr); } # Add an offset (e.g, '116.5') to a classification (e.g., 'HD6290') sub add { my ($cls, $lcc, $offset) = @_; my ($alpha, $int, $frac, $rmdr) = $cls->parse($lcc); die "Can't add to a classification with a fractional part" if $frac ne ''; die "Can't add to a classification with a remainder" if $rmdr ne ''; $int += int($offset); if ($offset =~ /\./) { $frac = $offset - $int; return "$alpha$int.$frac" } else { return "$alpha$int"; } } =head1 NAME Biblio::LCC - parse and normalize LC-style call numbers =head1 SYNOPSIS use Biblio::LCC; $normalized = Biblio::LCC->normalize('PS3573.A472242 A88 1998'); ($begin, $limit) = Biblio::LCC->normalize_range('E184.5-E185'); @parts = Biblio::LCC->parse($call_number); $call_number = Biblio::LCC->add($class, $offset); =head1 DESCRIPTION B parses Library of Congress classification ranges and call numbers and normalizes them into a form suitable for a straight ASCII sort. =head1 PUBLIC METHODS =over 4 =item B(I<$call_number>) $normalized = Biblio::LCC->normalize('PS3573.A472242 A88 1998'); Convert an LC-style class (e.g., 'PS' or 'E184.5') or call number (e.g., 'PS3573.A472242 A88 1998') into a string that may be compared to other normalized call numbers (see B below). =item B(I<$call_number_range>) ($begin, $limit) = Biblio::LCC->normalize_range('E184.5-E185'); Parse a call number range, producing a pair of strings I and I such that a call number falls within the range if and only if its normalized form, in a straight lexicographic ASCII comparison, is greater than or equal to I and strictly less than I. The range may be specified in one of three ways: =over 4 =item I B<-> I A pair of call numbers; the range includes the beginning call number, the ending call number, and any call numbers that have the ending call number as a prefix. For example, the (unnormalized) range C encompasses any class or call number from C up to B C In this form, the alphabetic string that begins the second call number may be omitted, so (for example) C is equivalent to C. Space is optional around the hyphen. =item I A single class or call number, in unnormalized form. This is equivalent to a pair in which each call number is the same. For example, the unnormalized range C encompasses call numbers from C up to but not including C. =item I E I A pair of call numbers; the range includes the first call number and any call number up and not including the ending call number. For example, the unnormalized range C<< DT6.7EDT7 >> includes everything greater than C and less than C. In this form, the alphabetic string that begins the second call number may be omitted, as in the form of range that uses a hyphen to separate the parts. Space is optional around the less-than sign (E). =back =item B(I<$call_number>) ($alpha, $int, $frac, $rmdr) = Biblio::LCC->parse($call_number); Split an LC call number into alphabetic, integer, decimal fraction, and remainder (i.e., everything else). =item B(I<$class>, I<$offset>) $call_number = Biblio::LCC->add($class, $offset); Add an offset (e.g., '180.3') to a base LC class (e.g., 'GN1600') to produce another LC class (e.g., 'GN1780.3'). The base class may have only alphabetic and integer parts; an exception will be thrown if it has a fractional part (e.g., as in 'GN1600.1') or a remainder (e.g., as in 'GN1600 R5'). =back =head1 HOW IT WORKS Call numbers are first analyzed into four parts. For example, take the call number B. =over 4 =item B GB The one to three alphabetic characters that begin the call number. =item B 1001 An integer from 1 to 9999 that follows. =item B 72 Digits that follow a decimal point after the integer part. =item B M32 E73 1988 Everything that follows. =back The LC Classification allows for a wide range of possible call numbers that do not fall into the simple (alpha, integer, fraction, remainder) model that this module implements. For example, the following are all valid call numbers: =over 4 =item B =item B =item B =back It may be that in some cases further analysis, and fully correct sorting, are not possible without hardcoded knowledge of the LC classification. In many cases, however, a more sophisticated parsing model, while more complex, would result in better normalization. =head1 BUGS There are no known bugs. Please report bugs on this module's RT page: L. =head1 TO DO Implement a C method and rewrite other methods so they may be used as class B instance methods. Special handling of "special" call numbers (e.g., in the Gs). Allow caller to specify prefixes to strip (e.g., "Folio"). Parse straight from the 050 or 090 field of a MARC record. Better error reporting. =head1 AUTHOR Paul Hoffman (nkuitse AT cpan DOT org) =head1 COPYRIGHT Copyright 2007-2008 Paul M. Hoffman. This is free software, and is made available under the same terms as Perl itself. Biblio-LCC-0.09/Makefile.PL0100644000076500007650000000071311104642305015151 0ustar nkuitsenkuitseuse ExtUtils::MakeMaker; WriteMakefile( 'NAME' => 'Biblio-LCC', 'AUTHOR' => 'Paul Hoffman ', 'VERSION_FROM' => 'lib/Biblio/LCC.pm', 'LICENSE' => 'perl', 'PREREQ_PM' => { 'Test::More' => 0, 'Getopt::Long' => 0, 'File::Basename' => 0, }, 'EXE_FILES' => [ 'script/lccnorm' ], ); Biblio-LCC-0.09/MANIFEST0100644000076500007650000000035411104645145014336 0ustar nkuitsenkuitseCHANGES lib/Biblio/LCC.pm Makefile.PL MANIFEST This list of files MANIFEST.SKIP script/lccnorm t/99-pod.t t/data-files/books.dat t/data-files/profile.sift META.yml Module meta-data (added by MakeMaker) Biblio-LCC-0.09/MANIFEST.SKIP0100644000076500007650000000020711104644656015106 0ustar nkuitsenkuitse\.DS_Store$ \.cvsignore$ ^Makefile(\.bak)?$ (^|/)CVS/ ^MANIFEST.bak$ ^blibdirs$ ^pm_to_blib$ ^blib\b ^\.nkpr (^|/)_darcs/ ^Biblio-LCC- Biblio-LCC-0.09/META.yml0100644000076500007650000000063711104645145014462 0ustar nkuitsenkuitse# http://module-build.sourceforge.net/META-spec.html #XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX# name: Biblio-LCC version: 0.09 version_from: lib/Biblio/LCC.pm installdirs: site requires: File::Basename: 0 Getopt::Long: 0 Test::More: 0 distribution_type: module generated_by: ExtUtils::MakeMaker version 6.30 Biblio-LCC-0.09/script/0040755000076500007650000000000011104645144014511 5ustar nkuitsenkuitseBiblio-LCC-0.09/script/lccnorm0100644000076500007650000007771011104641026016075 0ustar nkuitsenkuitse#!/usr/bin/perl -w # lccnorm by Paul Hoffman (nkuitse AT nkuitse DOT com). # Copyright 2007 Paul M. Hoffman. # This software is made available under the same terms as Perl; # see below for details. use strict; # -------------------------------- Required modules use Biblio::LCC; use File::Basename qw(basename); use Getopt::Long qw(:config posix_default gnu_compat require_order bundling no_ignore_case); # -------------------------------- Variables use vars qw($PROGRAM $VERSION $AUTHOR $COPYRIGHT $PAGER); $PROGRAM = basename($0); $VERSION = '0.08'; $AUTHOR = 'Paul Hoffman (nkuitse AT nkuitse DOT com)'; $COPYRIGHT = "Copyright 2007 Paul M. Hoffman"; my $action = \&run; my $verbose; my $in_delim = "\t"; my $out_join; my ($i, $j) = (0, 0); # Field(s) that contain the call no. (or range) my ($output_pos, $copy, $at_beginning, $at_end, $input_is_ranges, $errors_are_fatal); # -------------------------------- Setup $| = 1; # -------------------------------- Read command-line options and arguments GetOptions( 'f|fields=s' => sub { my $field_spec = $_[1]; if ($field_spec =~ /^(\d+)-(\d+)$/) { ($i, $j) = ($1 - 1, $2 - 1); exit usage("Invalid field spec: end must be greater than beginning") unless $i < $j; exit usage("Invalid field spec: must be 1, 2, 3, or 4 fields") unless $j - $i < 4; } elsif ($field_spec =~ /^(\d+)$/) { ($i, $j) = ($1 - 1, $1 - 1); } }, 'r|ranges' => \$input_is_ranges, 'd|delimiter=s' => \$in_delim, 'j|join=s' => \$out_join, 'c|copy' => \$copy, 'b|beginning' => \$at_beginning, 'e|end' => \$at_end, 'D|die-on-error' => \$errors_are_fatal, 'v|verbose' => \$verbose, 'h|help' => sub { $action = \&help }, 'V|version' => sub { $action = \&version }, 'M|manual' => sub { $action = \&manual }, 'L|license' => sub { $action = \&license }, ) or exit usage(); $out_join = $in_delim unless defined $out_join; $in_delim = qr/\Q$in_delim\E/ if defined $in_delim; if ($copy) { exit usage("Must specify -b|--beginning or -e|--end when using option -c|--copy") unless $at_beginning or $at_end; } # -------------------------------- Perform the desired action $action->(); # -------------------------------- Action functions sub run { # --- Figure out how to convert fields to a range string # Number of fields that hold the call number or range my $n = $j - $i + 1; # Function to construct an (unnormalized) call number or range from the input fields my $field2str; if ($n == 1) { # Ranges: ("E184-E184.5") --> "E184-E184.5" # Call #: ("Z699.22 .H54 1982") $field2str = sub { $_[0] }; } elsif ($n == 2) { # Ranges: ("E184", "E184.5") --> "E184-E184.5" # Call #: ("Z", "699.22 .H54 1982") or some such $field2str = $input_is_ranges ? sub { join('-', @_) } : sub { join(' ', @_) }; } elsif ($n == 3) { # ("E", "184", "184.5") --> "E184-E184.5" # Call #: ("Z", "699.22", ".H54 1982") or some such $field2str = $input_is_ranges ? sub { "$_[0] $_[1]-$_[0] $_[2]" } : sub { join(' ', @_) }; } elsif ($n == 4) { # ("E", "184", "E", "184.5") --> "E184-E184.5" # Call #: ("Z", "699.22", ".H54", "1982") or some such $field2str = $input_is_ranges ? sub { "$_[0] $_[1]-$_[2] $_[3]" } : sub { join(' ', @_) }; } else { die "This can't happen, because we eliminated this possibility earlier"; } # --- Process the input (finally!) my $line_num = 1; while (<>) { chomp; my $line = $_; # --- Split the line up into fields my @fields = split /$in_delim/; # --- Isolate the fields that hold the call number (or range) from # those (if any) that precede and follow them my (@before, @callnum, @after); @before = splice @fields, 0, $i if $i > 0; @callnum = splice @fields, 0, $n; @after = @fields; my @result; # --- Normalize the call number or range if ($input_is_ranges) { my $range = $field2str->(@callnum); eval { @result = Biblio::LCC->normalize_range($range); }; } else { my $callnum = $field2str->(@callnum); eval { @result = (Biblio::LCC->normalize($callnum)); }; } if ($@) { print STDERR "Error in input line $line_num: $line\n"; exit 2 if $errors_are_fatal; next; } my @output; if ($copy) { @output = $at_beginning ? (@result, @before, @callnum, @after) : ( @before, @callnum, @after, @result) ; } else { @output = $at_beginning ? (@result, @before, @after) : $at_end ? ( @before, @after, @result ) : ( @before, @result, @after ) ; } print join($out_join, @output), "\n"; } continue { $line_num++; } } sub help { usage(); options(); exit 0; } sub version { print "This is $PROGRAM " if $verbose; print $VERSION; print <<"EOS" if $verbose; by $AUTHOR. $COPYRIGHT. This is free software, made available under the same terms as Perl itself. EOS print "\n" unless $verbose; exit 0; } sub manual { system(podder()) == 0 or print STDERR "I can't find a way to print the manual page for $PROGRAM.\n"; } sub license { my $pager = pager(); if ($pager and which('sed')) { system(qq{sed '/^This software is made available/,/^The End/!d' "$0" | "$pager"}); } else { print STDERR "I can't find a way to print the license for $PROGRAM\n"; } } # -------------------------------- Other functions sub options { print <<"EOS"; Options: -f|--fields RANGE Field(s) containing the call number (or range) -r|--ranges Input is ranges -d|--delimiter STR Input delimiter (default: tab) -j|--join STR Output delimiter (default: same as for input) -c|--copy Don't delete source fields -b|--beginning Put normalized string at beginning of line -e|--end Put normalized string at end of line -D|--die-on-error Exit with an error as soon as an error is encountered -v|--verbose Be verbose -h|--help Print help and exit -V|--version Print version and exit -M|--manual Print manual (using pager) and exit -L|--license Print license (using pager) and exit EOS } sub usage { print STDERR "$_\n" foreach @_; print <<"EOS"; Usage: $PROGRAM [OPTION...] [FILE...] EOS return 1; } sub pager { return $PAGER if defined $PAGER; $PAGER = $ENV{'PAGER'} || which(qw(less more)) || which('cat') || 0; return $PAGER; } sub podder { return "cat $0 | pod2man -n $PROGRAM -s 1 -r $VERSION -c '' | nroff -man | " . pager() if pager() and which('pod2man') and which('nroff'); return "man 1 $PROGRAM" if manpage($PROGRAM); return "perldoc -F $0" if which('perldoc'); return undef; } sub which { my $out; no warnings; foreach my $prog (@_) { $out = `which $prog 2>/dev/null`; if (defined $out) { chomp $out; return $out if -x $out; } } return undef; } sub manpage { return unless which('man'); my $out = `man -w 1 $PROGRAM 2>/dev/null`; return unless defined $out; chomp $out; return $out; } =head1 NAME lccnorm - normalize Library of Congress Classification call numbers =head1 SYNOPSIS B [I