#!/usr/bin/perl -w
# gnuhtml2latex html to latex converter
# Copyright (c) 1999 Tomasz Węgrzanowski
# Maintenance taken over by Gunnar Wolf, 2005
# Copyright (c) 2005-2010 Gunnar Wolf
#
# gnuhtml2latex is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# On Debian GNU/Linux systems, the complete text of the GNU General
# Public License can be found in `/usr/share/common-licenses/GPL'.
#
# THIS IS VERY ALPHA

use strict;
use Getopt::Std;

# Reported in the comment header written at the top of the generated LaTeX.
$main::version = '0.4';

# Command line options; see the POD at the end of this file for their meaning.
getopts('a:bcf:gh:i:no:pst:HPS:', \%main::opts);
$main::opts{o} = '{article}' unless defined $main::opts{o};
$main::opts{h} = ''          unless defined $main::opts{h};
$main::opts{f} = ''          unless defined $main::opts{f};

# Sectioning commands are starred (unnumbered) unless -n was given.
$main::num = $main::opts{n} ? '' : '*';

{
    # LaTeX emitted when an HTML element opens (only while inside <body>).
    my %tagstable_start = (
        'p'          => '\\par ',
        'b'          => '\\textbf{',
        'i'          => '\\textit{',
        'u'          => '\\underline{',
        'dt'         => '\\item[',
        'dd'         => ']',
        'br'         => '\\\\',
        'em'         => '\\emph{',
        'h1'         => '\\section' . $main::num . '{',
        'h2'         => '\\subsection' . $main::num . '{',
        'h3'         => '\\subsubsection' . $main::num . '{',
        'h4'         => '\\paragraph' . $main::num . '{',
        'h5'         => '\\subparagraph' . $main::num . '{',
        'h6'         => '\\subparagraph' . $main::num . '{',
        'li'         => '\\item ',
        'ul'         => '\\begin{itemize}',
        'ol'         => '\\begin{enumerate}',
        'dl'         => '\\begin{description}',
        'tt'         => '\\texttt{',
        'kbd'        => '{\\tt\\bf ',
        'var'        => '\\textit{',
        'dfn'        => '{\\bf\\it ',
        'cite'       => '{\\sc ',
        'samp'       => '\\texttt{',
        'strong'     => '\\textbf{',
        'listing'    => '\\begin{verbatim}',
        'code'       => '\\texttt{',
        'pre'        => '\\begin{verbatim}',
        'blockquote' => '\\begin{quotation}',
    );

    # LaTeX emitted when an HTML element closes (only while inside <body>).
    my %tagstable_end = (
        'b'          => '}',
        'i'          => '}',
        'u'          => '}',
        'em'         => '}',
        'h1'         => '}',
        'h2'         => '}',
        'h3'         => '}',
        'h4'         => '}',
        'h5'         => '}',
        'h6'         => '}',
        'tt'         => '}',
        'kbd'        => '}',
        'var'        => '}',
        'dfn'        => '}',
        'cite'       => '}',
        'samp'       => '}',
        'strong'     => '}',
        'ul'         => '\\end{itemize}',
        'ol'         => '\\end{enumerate}',
        'dl'         => '\\end{description}',
        'listing'    => '\\end{verbatim}',
        'code'       => '}',
        'pre'        => '\\end{verbatim}',
        'blockquote' => '\\end{quotation}',
    );

    # Converter state, shared by every parser instance:
    #   $mode         - 0 outside <html>, 1 inside <html>, 2 inside <head>,
    #                   3 inside <body> (the only mode that produces output)
    #   $firstfile    - true if the preamble may be emitted for this file
    #   $lastfile     - true if the postamble may be emitted for this file
    #   $substitution - false inside <pre>/<listing> (no LaTeX-special escaping)
    #   @open_anchors - one flag per currently open <a>: true if it emitted
    #                   `\href{' and therefore needs a closing brace
    my $mode         = 0;
    my $firstfile    = 1;
    my $lastfile     = 1;
    my $substitution = 1;
    my @open_anchors;

    package HTML::LatexMaker;
    use HTML::Parser;
    use HTML::Entities;
    @HTML::LatexMaker::ISA = ( "HTML::Parser" );

    # Setter: tells the converter whether the next file is the first one.
    sub firstfile {
        my $self = shift;
        $firstfile = shift;
    }

    # Setter: tells the converter whether the next file is the last one.
    sub lastfile {
        my $self = shift;
        $lastfile = shift;
    }

    # Emits `\href{URL}{' for an <a href="..."> when -H is active.
    # Returns 1 if something was printed (so the matching </a> must close
    # the brace) and 0 otherwise.
    sub anchor_convert {
        my ($attr, $attseq) = @_;
        return 0 unless defined($main::opts{H});
        return 0 unless defined($attr->{href});
        printf "\\href{%s}{", $attr->{href};
        return 1;
    }

    # Handles <img src="..."> when -g is active: derives a local file name
    # from the URL, downloads the image with wget if it is not already
    # present, and prints a figure environment that includes it.
    sub image_convert {
        my ($attr, $attseq) = @_;
        return unless defined($main::opts{g});
        return unless defined($attr->{src});

        my $url = $attr->{src};

        # Local name: URL without scheme and query string, with the
        # separators flattened to `_' and the image extension re-appended.
        my $imgtype;
        (my $localimg = $url) =~ s!(?:https?|ftp)://!!;
        $localimg =~ s!\?.*!!;
        if ($localimg =~ s/\.(png|jpg|eps|gif|tif)$//) {
            $imgtype = $1;
        }
        else {
            warn "Cannot determine a valid image type for $url - Trying with .png";
            $imgtype = 'png';
        }
        $localimg =~ s![/?&.]!_!g;
        $localimg .= ".$imgtype";

        if (-f $localimg) {
            warn "$localimg: Already here, skipping download\n";
        }
        elsif (my $wget = find_wget()) {
            # List form avoids the shell. `-O' must be directly followed by
            # the output file name, and `--' keeps a URL starting with a
            # dash from being read as an option.
            my $status = system($wget, '-nv',
                                '--load-cookies', '/tmp/wget.cookies',
                                '-nc', '-O', $localimg, '--', $url);
            if ($status != 0) {
                warn "wget failed for $url (exit status " . ($status >> 8) . ")\n";
                # wget -O leaves an empty file behind on failure; remove it
                # so a later run retries instead of skipping the download.
                unlink $localimg if -z $localimg;
            }
        }
        else {
            warn "wget not found, you will need to create `$localimg'\n" .
                 "(Original URL: $url)\n";
        }

        # Fall back to the URL itself as caption. \href needs hyperref,
        # which is only loaded under -H; otherwise use \url.
        my $caption = $attr->{title} || $attr->{alt};
        unless ($caption) {
            $caption = $main::opts{H}
                ? sprintf('\href{%s}{%s}', $url, $url)
                : sprintf('\url{%s}', $url);
        }

        printf "\n" .
               "\\begin{figure}\n" .
               "  \\centering\n" .
               "  \\includegraphics[width=0.4\\textwidth]{%s}\n" .
               "  \\caption{%s}\n" .
               "\\end{figure}\n",
               $localimg, $caption;
        return;
    }

    # Searches $PATH for an executable wget. Returns its full path, or
    # nothing (after a warning) if none is found.
    sub find_wget {
        my $path_list = defined $ENV{PATH} ? $ENV{PATH} : '';
        for my $dir (split /:/, $path_list) {
            next unless length $dir;
            my $wget = "$dir/wget";
            return $wget if -f $wget && -x _;
        }
        warn "wget not found in path - No images will be downloaded\n";
        return;
    }

    # HTML::Parser callback for an opening tag: runs the tag-specific action
    # (if any), then prints the LaTeX opening text when inside <body>.
    sub start {
        my ( $self, $tag, $attr, $attrseq ) = @_;

        # Built per call because the actions close over $attr / $attrseq.
        my %action_for = (
            html    => sub { start_mode(1) },
            head    => sub { start_mode(2) },
            body    => sub { start_mode(3) },
            pre     => sub { $substitution = 0 },
            listing => sub { $substitution = 0 },
            a       => sub { push @open_anchors, anchor_convert($attr, $attrseq) },
            img     => sub { image_convert($attr, $attrseq) },
        );
        my $action = $action_for{$tag};
        $action->() if $action;

        return unless $mode == 3 and defined $tagstable_start{$tag};
        print $tagstable_start{$tag};
        return;
    }

    # HTML::Parser callback for a closing tag: runs the tag-specific action
    # (if any), then prints the LaTeX closing text when inside <body>.
    sub end {
        my ( $self, $tag ) = @_;

        my %action_for = (
            html    => sub { end_mode(0) },
            head    => sub { end_mode(1) },
            body    => sub { end_mode(1) },
            pre     => sub { $substitution = 1 },
            listing => sub { $substitution = 1 },
            a       => sub {
                # Only close the brace if the matching <a> opened one; a
                # stray </a> pops nothing and prints nothing.
                my $opened = pop @open_anchors;
                print "}" if $opened;
            },
        );
        my $action = $action_for{$tag};
        $action->() if $action;

        return unless $mode == 3 and defined $tagstable_end{$tag};
        print $tagstable_end{$tag};
        return;
    }

    # HTML::Parser callback for character data: translates entities and
    # LaTeX-special characters and prints the result (inside <body> only).
    #
    # NOTE: only the `_&%{}#' escaping is suppressed inside <pre>/<listing>;
    # all the other translations below are applied there as well.
    sub text {
        my ( $self, $text ) = @_;
        return unless $mode == 3;

        # Handle some things that decode_entities doesn't.
        # (This needs to be done *before* calling decode_entities: otherwise
        # there'd be no way of distinguishing `&amp;FOO;' from `&FOO;'.)
        # We use `!' for internal purposes during entity translation.
        $text =~ s/!|&\#(?:0*33|x0*21);/!bang;/g;

        # Handle adjacent dashes and adjacent quotes by inserting a thin
        # space between the translations in such cases. The text has not
        # been entity-decoded yet, so the symbolic forms are matched along
        # with the literal characters.
        $text =~ s/&\#(?:x0*2d|0*45);/-/g;
        $text =~ s/(&mdash;|&ndash;|—|–|-)(?=(?:&mdash;|&ndash;|—|–|-))/$1!thinsp;/g;
        $text =~ s/(&[lr][sd]quo;)(?=(?:&[lr][sd]quo;))/$1!thinsp;/g;

        # There are many things that decode_entities doesn't handle.
        # A few of those things we handle ourselves. The final replacement
        # happens later (so that we correctly handle the various quotes
        # whether they're literal, numeric character ref, or symbolic ref).
        # In the meantime we change from `&FOO;' to `!FOO;'.
        $text =~ s/&([mn]dash|[lr][sd]quo|hellip);/!$1;/g;

        $text = decode_entities($text);

        $text =~ s/\\/!backslash;/g;
        # Does not work properly.
        # $text =~ s/([~\`\'\"]+)/!verb|$1|/g;
        if ($substitution) {
            $text =~ s/([_&%\{\}\#])/\\$1/g;
        }
        $text =~ s/\$/\\\$/g;
        $text =~ s/\^/\\^{}/g;
        $text =~ s/!backslash;/\$\\backslash\$/g;
        $text =~ s/!mdash;/---/g;
        $text =~ s/!ndash;/--/g;
        $text =~ s/!lsquo;/`/g; #`;
        $text =~ s/!rsquo;/'/g; #';
        $text =~ s/!ldquo;/``/g;
        $text =~ s/!rdquo;/''/g;
        # The backslash must be doubled: a single `\l' in a replacement is
        # the "lowercase next character" escape and would yield `dots{}'.
        $text =~ s/!hellip;/\\ldots{}/g;
        $text =~ s/!thinsp;/\$\\,\$/g;
        # $text =~ s/!verb|/\\verb|/g;
        $text =~ s/!bang;/!/g;
        $text =~ s/\xa0/~/g;
        #$text =~ s/>/\$>\$/g;

        # Whatever looks like an URL should be made into one
        $text =~ s![[{]?((?:https?|ftp)://\S+)[\]}]?!\\url{$1}!g;

        print $text;
        return;
    }

    # Enters a new parser mode (1 = <html>, 2 = <head>, 3 = <body>). For the
    # first file this also prints the generator banner and preamble (mode 1)
    # and the document start, header, title block and table of contents
    # (mode 3).
    sub start_mode {
        my ($mode_new) = @_;
        my $skip_pre = $main::opts{P};

        if ( $mode_new == 1 && $firstfile ) {
            print "% This file was converted from HTML to LaTeX with\n" .
                  "% gnuhtml2latex program\n" .
                  "% (c) Tomasz Wegrzanowski 1999\n" .
                  "% (c) Gunnar Wolf 2005-2010\n" .
                  "% Version : $main::version.\n";
            if (!$skip_pre) {
                print '\documentclass' . $main::opts{o} . "\n";
                if ($main::opts{H}) {
                    print "\\usepackage{hyperref}\n";
                }
                else {
                    # text() always emits \url{...}; without hyperref the
                    # url package has to provide that command.
                    print "\\usepackage{url}\n";
                }
                if ($main::opts{g}) {
                    print "\\usepackage{graphicx}\n";
                    print "\\DeclareGraphicsExtensions{.png,.jpg,.eps,.gif,.tif}\n";
                }
            }
        }

        if ( $mode_new == 3 && $firstfile ) {
            print "\\begin{document}\n" unless $skip_pre;
            print $main::opts{h};
            if ( defined $main::opts{a} or defined $main::opts{t} ) {
                # \maketitle requires a \title, so always emit one (possibly
                # empty) together with the author.
                my $title  = defined $main::opts{t} ? $main::opts{t} : '';
                my $author = defined $main::opts{a} ? $main::opts{a} : '';
                print '\\title{' . $title . '}';
                print '\\author{' . $author . "}\n\\maketitle";
            }
            print "\n\\tableofcontents\n" if $main::opts{c};
            print "\n\\newpage"           if $main::opts{p};
        }

        $mode = $mode_new;
        return;
    }

    # Leaves the current parser mode. When leaving <body> of the last file
    # this prints the footer and (unless -P) the end of the document.
    sub end_mode {
        my ($mode_new) = @_;
        my $skip_post = $main::opts{P};

        if ( $mode == 3 && $lastfile ) {
            print $main::opts{f};
            print "\\end{document}\n" unless $skip_post;
        }

        $mode = $mode_new;
        return;
    }
}

# Opens an input file for reading. `-' means standard input, as it did with
# the two-argument open used previously. Returns a file handle, or nothing
# on failure (with $! set).
sub _open_input {
    my ($filename) = @_;
    return \*STDIN if $filename eq '-';
    open my $fh, '<', $filename or return;
    return $fh;
}

# Redirects STDOUT to the .tex file matching $filename (foo.html and foo.htm
# both map to foo.tex) unless -s asked for standard output.
sub _redirect_stdout {
    my ($filename) = @_;
    return if $main::opts{s};
    (my $outfile = $filename) =~ s/\.html?$//;
    $outfile .= '.tex';
    open STDOUT, '>', $outfile or die "$outfile: $!\n";
    return;
}

# Creates a converter object, applying the -S list of tags to ignore.
sub _new_converter {
    my $doc = HTML::LatexMaker->new;
    # -S is a comma-separated list; ignore_elements wants one name per
    # argument.
    $doc->ignore_elements( split /\s*,\s*/, $main::opts{S} ) if $main::opts{S};
    return $doc;
}

# -i: the list of input files (one per line) comes from the named file.
if ( $main::opts{i} ) {
    open my $list_fh, '<', $main::opts{i} or die "$main::opts{i}: $!\n";
    chomp( @ARGV = <$list_fh> );
    close $list_fh;
    @ARGV = grep { length } @ARGV;
}

if ( $main::opts{b} ) {
    # Batch mode: every input is concatenated into ONE document, written to
    # the .tex file named after the first input (or to STDOUT with -s).
    # Preamble and postamble are tied to the first and last file, so those
    # two must be readable; unreadable files in between are skipped.
    if (@ARGV) {
        my $doc = _new_converter();
        for my $idx ( 0 .. $#ARGV ) {
            my $filename = $ARGV[$idx];
            my $is_first = ( $idx == 0 );
            my $is_last  = ( $idx == $#ARGV );

            my $fh = _open_input($filename);
            unless ($fh) {
                die "$filename: $!\n" if $is_first or $is_last;
                warn "$filename: $! - skipping\n";
                next;
            }
            _redirect_stdout($filename) if $is_first;

            $doc->firstfile( $is_first ? 1 : 0 );
            $doc->lastfile( $is_last ? 1 : 0 );
            $doc->parse_file($fh);
            close $fh unless $filename eq '-';
        }
    }
}
else {
    # Default mode: each input file becomes its own LaTeX document.
    for my $filename (@ARGV) {
        my $fh = _open_input($filename);
        unless ($fh) {
            warn "$filename: $! - skipping\n";
            next;
        }
        _redirect_stdout($filename);
        my $doc = _new_converter();
        $doc->parse_file($fh);
        close $fh unless $filename eq '-';
    }
}

=head1 NAME

gnuhtml2latex - html to latex converter

=head1 SYNOPSIS

B<gnuhtml2latex> F<[options]> F<filename>

=head1 OPTIONS

=over

=item -a [author]

Specify document author

=item -b

Process more than one input HTML file (they all get concatenated and
written to a single output file, named after the first input file, or to
STDOUT if F<-s> is set)

=item -c

Use table of contents

=item -f [string]

Specify footer (text inserted just before the end of the document)

=item -h [string]

Specify header (text inserted right after the beginning of the document)

=item -i filename

Get the list of files to be converted from the specified filename
(one file name per line)

=item -n

Use numbered sections

=item -H

Use hyperref package to process anchors

=item -g

Include images. If wget is installed, it will be used in order to
download the images; otherwise, their position will just be marked in
the resulting TeX document.

=item -o [string]

Specify document style

=item -p

Break page after title / table of contents

=item -P

Partial / plain: Omit preamble and postamble. Note that F<-P> makes
F<-H> and F<-o> meaningless (as they act in the preamble)

=item -S [tags]

Skip (ignore) the specified comma-separated tags, along with all of
their content.

=item -s

Write to STDOUT instead of to inputfilename.tex

=item -t [title]

Specify title of document

=back

=head1 DESCRIPTION

This aims to be a replacement for html2latex.
The program takes an HTML file foo.html or foo.htm and makes the LaTeX
file foo.tex from it.

=head1 NOT VERY AMBITIOUS TODO

For people who want only the functionality of the original html2latex:

=over

=item * bugfixes - I'm sure there are plenty of bugs inside

=item * clueful backslash escaping

=item * more entities from outside of iso-8859-1

=item * tables

=item * performance boost

=item * and a lot more

=back

=head1 MORE AMBITIOUS TODO

For people who want a real tool:

=over

=item * make it part of some html processor

=back

=head1 FUTURE OF THIS PACKAGE

It is quite possible that the functions of this package will be
included in some more general project. This package was made mainly to
make the world a bit more free.

=cut