#!/usr/bin/perl -w
# gnuhtml2latex html to latex converter
# Copyright (c) 1999 Tomasz Węgrzanowski
# Maintenance taken over by Gunnar Wolf, 2005
# Copyright (c) 2005-2010 Gunnar Wolf
#
# gnuhtml2latex is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# On Debian GNU/Linux systems, the complete text of the GNU General
# Public License can be found in `/usr/share/common-licenses/GPL'.
#
# THIS IS VERY ALPHA
use strict;
use Getopt::Std;
# Parse the command line into %main::opts; a letter followed by ':' takes
# a value, the others are plain switches.
getopts('a:bcf:gh:i:no:pst:HPS:', \%main::opts);

# Options that are printed unconditionally later on get a default so they
# are never undefined: document class, header text and footer text.
for my $default ([o => '{article}'], [h => ''], [f => '']) {
    my ($name, $value) = @$default;
    $main::opts{$name} = $value unless defined $main::opts{$name};
}

# Suffix appended to sectioning commands: empty when -n asks for numbered
# sections, '*' (the unnumbered variant) otherwise.
if ($main::opts{n}) {
    $main::num = '';
}
else {
    $main::num = '*';
}
{
# LaTeX text printed when an HTML start tag is seen inside <body>.
# Keys are lower-case tag names; tags that are not listed print nothing.
my %tagstable_start = (
    # Paragraphs and line breaks
    p  => '\\par ',
    br => '\\\\',

    # Headings; $main::num is '*' for unnumbered sections, '' otherwise
    h1 => '\\section'       . $main::num . '{',
    h2 => '\\subsection'    . $main::num . '{',
    h3 => '\\subsubsection' . $main::num . '{',
    h4 => '\\paragraph'     . $main::num . '{',
    h5 => '\\subparagraph'  . $main::num . '{',
    h6 => '\\subparagraph'  . $main::num . '{',

    # Inline markup
    b      => '\\textbf{',
    strong => '\\textbf{',
    i      => '\\textit{',
    var    => '\\textit{',
    em     => '\\emph{',
    u      => '\\underline{',
    tt     => '\\texttt{',
    samp   => '\\texttt{',
    code   => '\\texttt{',
    kbd    => '{\\tt\\bf ',
    dfn    => '{\\bf\\it ',
    cite   => '{\\sc ',

    # Lists; <dt> opens the optional \item[...] label and <dd> closes it
    ul => '\\begin{itemize}',
    ol => '\\begin{enumerate}',
    dl => '\\begin{description}',
    li => '\\item ',
    dt => '\\item[',
    dd => ']',

    # Block environments
    pre        => '\\begin{verbatim}',
    listing    => '\\begin{verbatim}',
    blockquote => '\\begin{quotation}',
);
# LaTeX text printed when an HTML end tag is seen inside <body>.
# Tags whose opening text ends in an open brace are closed with '}';
# environment-style tags get the matching \end{...}.
my %tagstable_end = (
    ( map { $_ => '}' }
        qw(b i u em h1 h2 h3 h4 h5 h6 tt kbd var dfn cite samp strong code) ),
    ul         => '\\end{itemize}',
    ol         => '\\end{enumerate}',
    dl         => '\\end{description}',
    listing    => '\\end{verbatim}',
    pre        => '\\end{verbatim}',
    blockquote => '\\end{quotation}',
);
# State shared by every handler in this block:
#   $mode         - position in the document: 0 outside <html>, 1 inside
#                   <html>, 2 inside <head>, 3 inside <body>
#   $firstfile    - true while the preamble/title material may be printed
#   $lastfile     - true when the closing material may be printed
#   $substitution - false while inside <pre>/<listing>
my ($mode, $firstfile, $lastfile, $substitution) = (0, 1, 1, 1);
# Everything up to the end of the enclosing block lives in this package,
# an HTML::Parser subclass whose start/end/text methods emit LaTeX.
package HTML::LatexMaker;
use HTML::Entities;
use HTML::Parser;
@HTML::LatexMaker::ISA = qw(HTML::Parser);
1;
# Setter: records whether the next document parsed is the first one, which
# controls whether start_mode prints the preamble and title material.
sub firstfile {
    my ($self, $flag) = @_;
    $firstfile = $flag;
}
# Setter: records whether the next document parsed is the last one, which
# controls whether end_mode prints the footer and \end{document}.
sub lastfile {
    my ($self, $flag) = @_;
    $lastfile = $flag;
}
# Emit the opening part of an anchor when -H (hyperref) is in effect.
#
#   $attr   - hash ref of the tag's attributes
#   $attseq - array ref with the attribute order (unused)
#
# Prints '\href{URL}{' for anchors with an href.  The handler for </a>
# prints '}' for every anchor when -H is set, so an anchor without href
# (e.g. <a name="...">) opens a plain group here; otherwise its closing
# brace would have no partner in the generated LaTeX.
sub anchor_convert {
    my ($attr, $attseq) = @_;
    return unless defined($main::opts{H});
    unless (defined($attr->{href})) {
        print '{';
        return;
    }
    printf "\\href{%s}{", $attr->{href};
}
# Emit a figure for an <img> tag when -g is in effect, downloading the
# image with wget if it is not already present in the current directory.
#
#   $attr   - hash ref of the tag's attributes (src, title, alt are read)
#   $attseq - array ref with the attribute order (unused)
#
# The local file name is derived from the URL: scheme and query string are
# dropped, a known extension is split off (png is assumed otherwise), and
# '/', '?', '&' and '.' in the remainder become '_'.
sub image_convert {
    my ($attr, $attseq) = @_;
    return unless defined($main::opts{g});
    return unless defined($attr->{src});

    my $wget = find_wget();
    my $url  = $attr->{src};

    my $localimg = $url;
    $localimg =~ s!(?:http|ftp)://!!;
    $localimg =~ s!\?.*!!;

    my $imgtype;
    if ($localimg =~ s/\.(png|jpg|eps|gif|tif)$//) {
        $imgtype = $1;
    }
    else {
        warn "Cannot determine a valid image type for $url - Trying with .png";
        $imgtype = 'png';
    }
    $localimg =~ s![/?&.]!_!g;
    $localimg .= ".$imgtype";

    if (-f $localimg) {
        warn "$localimg: Already here, skipping download\n";
    }
    elsif ($wget) {
        # -O must be followed directly by the output file name (it used to
        # be followed by '-nc', which wget then took as the file name).
        # '--' keeps a URL starting with '-' from being read as an option.
        # List-form system() bypasses the shell.
        my @command = (
            $wget, '-nv', '--load-cookies', '/tmp/wget.cookies',
            '-nc', '-O', $localimg, '--', $url,
        );
        system(@command) == 0
            or warn "wget failed for $url (status $?)\n";
    }
    else {
        warn "wget not found, you will need to create `$localimg'\n" .
            "(Original URL: $url)\n";
    }

    # Prefer the author's own description; fall back to a link to the URL.
    my $caption = $attr->{title} || $attr->{alt} ||
        sprintf('\href{%s}{%s}', $url, $url);

    printf "\n"
        . "\\begin{figure}\n"
        . "\\centering\n"
        . "\\includegraphics[width=0.4\\textwidth]{%s}\n"
        . "\\caption{%s}\n"
        . "\\end{figure}", $localimg, $caption;
}
# Locate the wget executable by walking the directories listed in
# $ENV{PATH}.
#
# Returns the full path of the first regular, executable file named
# `wget'.  If none is found, a warning is printed and nothing is returned
# (undef in scalar context).
sub find_wget {
    # An unset PATH is treated as empty instead of triggering a warning.
    my $search_path = defined $ENV{PATH} ? $ENV{PATH} : '';

    for my $dir (split /:/, $search_path) {
        # An empty entry would otherwise make us probe "/wget".
        next unless length $dir;
        my $candidate = "$dir/wget";
        # -x alone is also true for a directory called wget.
        return $candidate if -f $candidate && -x _;
    }

    warn "wget not found in path - No images will be downloaded\n";
    return;
}
# HTML::Parser callback for a start tag.
#
#   $self    - the parser object
#   $tag     - tag name
#   $attr    - hash ref of attributes
#   $attrseq - array ref with the attribute order
#
# First runs the tag's special action, if it has one (document mode
# changes, turning substitution off for <pre>/<listing>, anchors, images),
# then prints the tag's opening LaTeX text when inside <body>.
sub start {
    my ($self, $tag, $attr, $attrseq) = @_;

    my %action_for = (
        html    => sub { start_mode(1); return },
        head    => sub { start_mode(2); return },
        body    => sub { start_mode(3); return },
        pre     => sub { $substitution = 0; },
        listing => sub { $substitution = 0; },
        a       => sub { anchor_convert($attr, $attrseq); return },
        img     => sub { image_convert($attr, $attrseq); return },
    );
    my $action = $action_for{$tag};
    $action->() if $action;

    # The mode is looked at after the action: <body> itself switches to 3.
    my $opening = $mode == 3 ? $tagstable_start{$tag} : undef;
    return unless defined $opening;
    print $opening;
}
# HTML::Parser callback for an end tag.
#
#   $self - the parser object
#   $tag  - tag name
#
# First runs the tag's special action, if it has one (document mode
# changes, turning substitution back on after <pre>/<listing>, closing the
# brace of an anchor when -H is set), then prints the tag's closing LaTeX
# text when still inside <body>.
sub end {
    my ($self, $tag) = @_;

    my %action_for = (
        html    => sub { end_mode(0); return },
        head    => sub { end_mode(1); return },
        body    => sub { end_mode(1); return },
        pre     => sub { $substitution = 1 },
        listing => sub { $substitution = 1 },
        a       => sub { print "}" if $main::opts{H}; return },
    );
    my $action = $action_for{$tag};
    $action->() if $action;

    my $closing = $mode == 3 ? $tagstable_end{$tag} : undef;
    return unless defined $closing;
    print $closing;
}
# HTML::Parser callback for a run of text: escapes it for LaTeX and prints
# it.  Text outside <body> is dropped.
#
#   $self - the parser object
#   $text - the raw text, entities not yet decoded
#
# Entities that need a LaTeX-specific rendering are temporarily rewritten
# from `&FOO;' to `!FOO;' so that they survive decode_entities() and the
# escaping of special characters; they are expanded at the end.
sub text {
    my ( $self, $text ) = @_;
    return unless( $mode == 3 );

    # Handle some things that decode_entities doesn't.
    # (This needs to be done *before* calling decode_entities: otherwise
    # there'd be no way of distinguishing `&amp;FOO;' from `&FOO;'.)
    # We use `!' for internal purposes during entity translation, so real
    # exclamation marks (literal or as a numeric reference) are hidden.
    $text =~ s/!|&\#(?:0*33|[xX]0*21);/!bang;/g;

    # Handle `&lsquo;&ldquo;', `&ndash;&mdash;' and so on by inserting a
    # thin space between the translations in such cases.  Numeric
    # references to the hyphen are normalised first.
    $text =~ s/&\#(?:[xX]0*2[dD]|0*45);/-/g;
    $text =~ s/(&mdash;|&ndash;|-)(?=(?:&mdash;|&ndash;|-))/$1!thinsp;/g;
    $text =~ s/(&[lr][sd]quo;)(?=(?:&[lr][sd]quo;))/$1!thinsp;/g;

    # There are many things that decode_entities doesn't handle.
    # A few of those things we handle ourselves.  The final replacement
    # happens later (so that we correctly handle the various quotes
    # whether they're literal, numeric character ref, or symbolic ref).
    # In the meantime we change from `&FOO;' to `!FOO;'.
    $text =~ s/&([mn]dash|[lr][sd]quo|hellip);/!$1;/g;
    $text = decode_entities($text);

    # Backslashes are hidden as well, so that the ones added by the
    # escaping below are not escaped again.
    $text =~ s/\\/!backslash;/g;

    # Does not work properly.
    # $text =~ s/([~\`\'\"]+)/!verb|$1|/g;
    if ($substitution) {
        $text =~ s/([_&%\{\}\#])/\\$1/g;
    }
    # NOTE(review): `$', `^' and backslash are rewritten even when
    # $substitution is off (inside <pre>/<listing>), so they show up
    # escaped inside the verbatim environment.  Left as it was.
    $text =~ s/\$/\\\$/g;
    $text =~ s/\^/\\^{}/g;

    # Expand the placeholders.
    $text =~ s/!backslash;/\$\\backslash\$/g;
    $text =~ s/!mdash;/---/g;
    $text =~ s/!ndash;/--/g;
    $text =~ s/!lsquo;/`/g; #`;
    $text =~ s/!rsquo;/'/g; #';
    $text =~ s/!ldquo;/``/g;
    $text =~ s/!rdquo;/''/g;
    # The backslash must be doubled: in a replacement, `\l' lower-cases
    # the next character, so `\ldots' used to come out as `dots'.
    $text =~ s/!hellip;/\\ldots{}/g;
    $text =~ s/!thinsp;/\$\\,\$/g;
    # $text =~ s/!verb|/\\verb|/g;
    $text =~ s/!bang;/!/g;

    # Non-breaking space.
    $text =~ s/\xa0/~/g;
    #$text =~ s/>/\$>\$/g;

    # Whatever looks like an URL should be made into one
    $text =~ s![[{]?((?:http|ftp)://\S+)[\]}]?!\\url{$1}!g;
    print $text;
}
# Switch the document mode on an opening <html> (1), <head> (2) or
# <body> (3) tag.
#
#   $mode_new - the mode being entered
#
# While $firstfile is true, entering mode 1 prints the banner comment and,
# unless -P is set, the preamble; entering mode 3 prints \begin{document}
# (unless -P), the -h header text, the title block and the optional table
# of contents and page break.
sub start_mode {
    my ($mode_new) = @_;
    my $skip_pre = $main::opts{P};

    if ($mode_new == 1 and $firstfile) {
        print "% This file was converted from HTML to LaTeX with\n",
              "% gnuhtml2latex program\n",
              "% (c) Tomasz Wegrzanowski 1999\n",
              "% (c) Gunnar Wolf 2005-2010\n",
              "% Version : $main::version.\n";
        unless ($skip_pre) {
            print '\documentclass', $main::opts{o}, "\n";
            print "\\usepackage{hyperref}\n" if $main::opts{H};
            print "\\usepackage{graphicx}\n",
                  "\\DeclareGraphicsExtensions{.png,.jpg,.eps,.gif,.tif}\n"
                if $main::opts{g};
        }
    }
    elsif ($mode_new == 3 and $firstfile) {
        print "\\begin{document}\n" unless $skip_pre;
        print $main::opts{h};

        # The title block is printed as soon as either -a or -t is given.
        if (defined $main::opts{a} or defined $main::opts{t}) {
            print '\\title{', $main::opts{t}, '}' if $main::opts{t};
            my $author = $main::opts{a} || '';
            print '\\author{', $author, "}\n\\maketitle";
        }
        print "\n\\tableofcontents\n" if $main::opts{c};
        print "\n\\newpage" if $main::opts{p};
    }

    $mode = $mode_new;
}
# Switch the document mode on a closing </html>, </head> or </body> tag.
#
#   $mode_new - the mode to fall back to
#
# When the body is being left (current mode 3) and $lastfile is true, the
# -f footer text is printed, followed by \end{document} unless -P is set.
sub end_mode {
    my ($mode_new) = @_;

    my $leaving_body = ($mode == 3 and $lastfile);
    if ($leaving_body) {
        print $main::opts{f};
        print "\\end{document}\n" if !$main::opts{P};
    }

    $mode = $mode_new;
}
}
$main::version = '0.4';

# Open $filename for reading.  `-' stands for standard input, which the
# two-argument open used here before accepted as well.
# Returns a filehandle, or nothing on failure (with $! set).
sub _open_input {
    my ($filename) = @_;
    return \*STDIN if $filename eq '-';
    open my $fh, '<', $filename or return;
    return $fh;
}

# Unless -s is set, send STDOUT to the .tex file named after $filename
# (a trailing .htm/.html is replaced by .tex).  Dies if that file cannot
# be created rather than silently writing to the previous STDOUT.
sub _redirect_output {
    my ($filename) = @_;
    return if $main::opts{s};
    (my $outfile = $filename) =~ s/\.html?$//;
    $outfile .= '.tex';
    open STDOUT, '>', $outfile or die "$outfile $!";
    return;
}

# -i: take the list of input files, one per line, from the given file.
if ( $main::opts{i} ) {
    open my $list_fh, '<', $main::opts{i}
        or die "$main::opts{i} $!";
    my @listed = <$list_fh>;
    close $list_fh;
    chomp @listed;                      # file names must not keep their "\n"
    @ARGV = grep { length } @listed;    # blank lines name no file
}

# -S: comma-separated list of elements to drop together with their content.
my @ignored_tags = defined $main::opts{S} && length $main::opts{S}
    ? split( /\s*,\s*/, $main::opts{S} )
    : ();

if ( $main::opts{b} ) {
    # Batch mode: all inputs are concatenated into one document.  Unless
    # -s is set it is written to the file named after the first input;
    # only the first input gets the preamble and only the last one the
    # closing material.  A single input is both first and last.
    if (@ARGV) {
        my $doc = HTML::LatexMaker->new;
        $doc->ignore_elements(@ignored_tags) if @ignored_tags;

        for my $index (0 .. $#ARGV) {
            my $filename = $ARGV[$index];
            my $is_first = $index == 0;
            my $is_last  = $index == $#ARGV;

            my $input = _open_input($filename);
            unless ($input) {
                # The first and last files carry the document's opening
                # and closing, so they are mandatory; others are skipped.
                die "$filename $!" if $is_first or $is_last;
                warn "$filename $!\n";
                next;
            }
            _redirect_output($filename) if $is_first;

            $doc->firstfile( $is_first ? 1 : 0 );
            $doc->lastfile( $is_last ? 1 : 0 );
            $doc->parse_file($input);
            close $input;
        }
    }
}
else {
    # Default mode: every input becomes a document of its own.
    for my $filename (@ARGV) {
        my $input = _open_input($filename);
        unless ($input) {
            warn "$filename $!\n";
            next;
        }
        _redirect_output($filename);

        my $doc = HTML::LatexMaker->new;
        $doc->ignore_elements(@ignored_tags) if @ignored_tags;
        $doc->parse_file($input);
        close $input;
    }
}
=head1 NAME
gnuhtml2latex - html to latex converter
=head1 SYNOPSIS
B<gnuhtml2latex> F<[options]> F<filename>
=head1 OPTIONS
=over
=item -a [author]
Specify document author
=item -b
Process more than one input HTML file (they all get concatenated and
written to a single output file, or to STDOUT if F<-s> is set)
=item -c
Use table of contents
=item -f [string]
Specify footer (text printed at the end of the document body)
=item -h [string]
Specify header
=item -i filename
Get the list of files to be converted from the specified filename
=item -n
Use numbered sections
=item -H
use hyperref package to process anchors
=item -g
Include images. If wget is installed, it will be used in order to
download the images; otherwise, their position will just be marked in
the resulting TeX document.
=item -o [string]
Specify document style
=item -p
Break page after title / table of contents
=item -P
Partial / plain: Omit preamble and postamble. Note that F<-P> makes
F<-H> and F<-o> meaningless (as they act in the preamble)
=item -S [tags]
Skip (ignore) the specified comma-separated tags, along with all of
their content.
=item -s
Write to STDOUT instead of to inputfilename.tex
=item -t [title]
Specify title of document
=back
=head1 DESCRIPTION
This aims to be a replacement for html2latex.
The program takes an HTML file foo.html or foo.htm
and makes a LaTeX file foo.tex from it.
=head1 NOT VERY AMBITIOUS TODO
For people who want only functionality of original html2latex
bugfixes - I'm sure there are plenty of bugs inside
clueful backslash escaping
more entities from outside of iso-8859-1
tables
performance boost
and a lot more
=head1 MORE AMBITIOUS TODO
For people who want a real tool
make it part of some html processor
=head1 FUTURE OF THIS PACKAGE
It is quite possible that the functions of this package will be included
in some more general project. This package was made mainly to make the world
a bit more free.
=cut