html-xml-utils-6.9/ 0000755 0001750 0000144 00000000000 12421511013 011253 5 0000000 0000000 html-xml-utils-6.9/textwrap.e 0000644 0001750 0000144 00000000557 12374701365 013250 0000000 0000000 extern void set_indent(int n);
extern void set_linelen(int n);
extern void flush();
extern void outc(char c, _Bool preformatted);
extern void out(string s, _Bool preformatted);
extern void outn(string s, size_t n, _Bool preformatted);
extern void outln(char *s, _Bool preformatted);
extern void outbreak();
extern void inc_indent(void);
extern void dec_indent(void);
html-xml-utils-6.9/asc2xml.c 0000644 0001750 0000144 00000006407 12174313455 012736 0000000 0000000 /*
*
* Program to convert files from ASCII or ISO-8859-1 to UTF8.
*
* Usage:
*
* asc2xml
*
* Reads from stdin and writes to stdout. Converts from ASCII (in fact:
* Latin-1) (with or without &#-escapes) to UTF8, removing all
* &#-escapes, except those representing ASCII characters.
*
* Part of HTML-XML-utils, see:
* http://www.w3.org/Tools/HTML-XML-utils/
*
* Author: Bert Bos attribute attributes
%%
/* start -- a whole document: call the start handler (if one is set) to
 * obtain the client data, parse the document, then report the end. */
start
: {data = h.start ? h.start() : NULL;}
document {call(h.end, (data));}
;
/* document -- a possibly empty, left-recursive sequence of comments, text
 * chunks, tags, declarations, processing instructions, end-of-include
 * markers and parse errors. Each item is reported through the matching
 * handler in h, via the call() macro, as soon as it is reduced. */
document
: document COMMENT {call(h.comment, (data, $2));}
| document TEXT {call(h.text, (data, $2));}
| document starttag
| document endtag
| document decl
| document PROCINS {call(h.pi, (data, $2));}
| document ENDINCL {call(h.endincl, (data));}
| document error
| /* empty */
;
/* starttag -- a start tag, either normal (closed by '>') or empty (closed
 * by EMPTYEND); reported with the element name and its attribute list. */
starttag
: START attributes '>' {call(h.starttag, (data, $1, $2));}
| START attributes EMPTYEND {call(h.emptytag, (data, $1, $2));}
;
/* attributes -- builds a linked list of attributes in source order: the
 * node for the first attribute gets the rest of the list as its next. */
attributes
: attribute attributes {$$ = $1; $$->next = $2;}
| /* empty */ {$$ = NULL;}
;
/* attribute -- allocates one pairlist node holding the name and, when
 * there is an '=', the value (NULL otherwise). The next field is not set
 * here; the attributes rule above fills it in.
 * NOTE(review): the malloc result is only checked with assert(), so the
 * check disappears when compiled with NDEBUG. */
attribute
: NAME {pairlist h = malloc(sizeof(*h));
assert(h != NULL); h->name = $1;
h->value=NULL; $$ = h;}
| NAME '=' NAME {pairlist h = malloc(sizeof(*h));
assert(h != NULL); h->name = $1;
h->value = $3; $$ = h;}
| NAME '=' STRING {pairlist h = malloc(sizeof(*h));
assert(h != NULL); h->name = $1;
h->value = $3; $$ = h;}
;
/* endtag -- an end tag, reported with the element name. */
endtag
: END '>' {call(h.endtag, (data, $1));}
;
/* decl -- a DOCTYPE declaration, reported as (root name, public id, URL).
 * With two strings they are passed as public id and URL. With a single
 * string, the keyword in $3 decides: "public" (compared case-insensitively)
 * makes it the public id, anything else makes it the URL. */
decl
: DOCTYPE NAME NAME STRING STRING '>' {call(h.decl, (data, $2, $4, $5));}
| DOCTYPE NAME NAME STRING '>' {if (strcasecmp($3, "public") == 0)
call(h.decl, (data, $2, $4, NULL));
else /* "system" */
call(h.decl, (data, $2, NULL, $4));}
| DOCTYPE NAME '>' {call(h.decl, (data, $2, NULL, NULL));}
;
html-xml-utils-6.9/cexport.1 0000644 0001750 0000144 00000000625 11111474036 012754 0000000 0000000 .de d \" begin display
.sp
.in +4
.nf
..
.de e \" end display
.in -4
.fi
.sp
..
.TH CEXPORT 1 "31 Mar 2000"
.SH NAME
cexport \- create header file with exported declarations from a C file
.SH SYNOPSIS
.B cexport
.RB "[\| " \-c
.IR cpp\-command " \|]"
.RB "[\| " \-e
.IR extension " \|]"
.RB "[\| " \-h " \|]"
.RI "[\| " cc\-options " \|]"
.RI "[\| " file " [\| " file... " \|] \|]"
.SH DESCRIPTION
[ToDo]
html-xml-utils-6.9/hxtoc.1 0000644 0001750 0000144 00000006506 12123434146 012423 0000000 0000000 .de d \" begin display
.sp
.in +4
.nf
..
.de e \" end display
.in -4
.fi
.sp
..
.TH "HXTOC" "1" "10 Jul 2011" "6.x" "HTML-XML-utils"
.SH NAME
hxtoc \- insert a table of contents in an HTML file
.SH SYNOPSIS
.B hxtoc
.RB "[\| " \-x " \|]"
.RB "[\| " \-l
.IR low " \|]"
.RB "[\| " \-h
.IR high " \|]"
.RI "[\| " file " \|]"
.RB "[\| " \-t " \|]"
.RB "[\| " \-d " \|]"
.RB "[\| " \-c
.IR class " \|]"
.SH DESCRIPTION
.LP
The
.B hxtoc
command reads an HTML file, inserts missing ID attributes in all H1 to
H6 elements between the levels
.B \-l
and
.B \-h
(unless the option
.B \-d
is in effect, see below) and also inserts A elements with NAME
attributes, so old browsers will recognize the H1 to H6 headers as
target anchors as well (unless the option
.B \-t
is in effect). The output is written to stdout.
.LP
If there is a comment of the form
.d
<!\-\-toc\-\->
.e
or a pair of comments
.d
<!\-\-begin\-toc\-\->
\&...
<!\-\-end\-toc\-\->
.e
then the comment, or the pair with everything in between, will be
replaced by a table of contents, consisting of a list (UL) of links to
all headers in the document.
.LP
The text of headers is copied to this table of contents, including any
inline markup, except that DFN tags and SPAN tags with a CLASS of
"index" are omitted (but the element's content is copied).
.LP
If a header has a CLASS attribute with as value (or one of its values)
the keyword "no-toc", then that header will not appear in the table
of contents.
.SH OPTIONS
The following options are supported:
.TP 10
.B \-x
Use XML conventions: empty elements are written with a slash at the
end:
*
*
*
* To do: if the template adds something like "(eds)", allow it to be
* changed to "(ed)" if there is only one editor.
*
* Copyright © 1994-2004 World Wide Web Consortium
* See http://www.w3.org/Consortium/Legal/copyright-software
*
* Author: Bert Bos .
.TP
.BI \-b " base"
Sets the base URL for resolving relative URLs. By default the file
given as argument is the base URL.
.TP
.B \-f
Removes the comments after including the files. This means
.B hxincl
cannot be run on the resulting file later to update the inclusions.
(Mnemonic:
.BR f inal
or
.BR f rozen.)
.TP
.BI \-s " name=substitution"
Include a different file than the one mentioned in the directive. If
the comment is
.d
.e
the file
.I substitution
is included instead. And if the file name in the comment includes a
variable called
.I name
delimited by %, e.g.,
.d
.e
then
.RI % name %
is replaced by
.I substitution
and thus the file
.RI xxx\- substitution
is included. The option
.B \-s
may occur multiple times. %-delimited variables are expanded
recursively, i.e., if the substitution text contains a variable, that
variable is expanded, too. E.g., if the two options
.B \-s name=%p1%.rrr
and
.B \-s p1=subst
are given, then the "xxx-%name%" will expand to "xxx-subst.rrr".
.TP
.BI \-M " target"
Instead of outputting the input file with all inclusions expanded,
output just the list of all files that the input includes,
recursively, in the form of a rule that is suitable for a
Makefile. The
.I target
is printed as the target of that rule.
.TP
.B \-G
Suppress error messages if a file to include cannot be found. (Only
with
.BR \-M .)
.SH OPERANDS
The following operand is supported:
.TP 10
.I file\-or\-URL
The name of an HTML or XML file or the URL of one. If absent, standard
input is read instead.
.SH "EXIT STATUS"
The following exit values are returned:
.TP 10
.B 0
Successful completion.
.TP
.B > 0
An error occurred in the parsing of one of the HTML or XML files.
.SH ENVIRONMENT
To use a proxy to retrieve remote files, set the environment variables
.B http_proxy
or
.BR ftp_proxy "."
E.g.,
.B http_proxy="http://localhost:8080/"
.SH BUGS
.LP
Assumes UTF-8 as input. Doesn't expand character entities. Instead
pipe the input through
.BR hxunent (1)
and
.BR asc2xml (1)
to convert it to UTF-8.
.LP
Remote files (specified with a URL) are currently only supported for
HTTP. Password-protected files or files that depend on HTTP "cookies"
are not handled. (You can use tools such as
.BR curl (1)
or
.BR wget (1)
to retrieve such files.)
.SH "SEE ALSO"
.BR asc2xml (1),
.BR hxnormalize (1),
.BR hxnum (1),
.BR hxprune (1),
.BR hxtoc (1),
.BR hxunent (1),
.BR xml2asc (1),
.BR UTF-8 " (RFC 2279)"
html-xml-utils-6.9/Makefile.in 0000644 0001750 0000144 00000161574 12421510515 013264 0000000 0000000 # Makefile.in generated by automake 1.11.6 from Makefile.am.
# @configure_input@
# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
# Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
@SET_MAKE@
# Copyright © 1994-2004 World Wide Web Consortium
# See http://www.w3.org/Consortium/Legal/copyright-software
#
# Author: Bert Bos
" >$@
for f in $(HTML_MANS); do\
echo "
" >>$@
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:
html-xml-utils-6.9/hxremove.1 0000644 0001750 0000144 00000003114 12041463321 013117 0000000 0000000 .de d \" begin display
.sp
.in +4
.nf
..
.de e \" end display
.in -4
.fi
.sp
..
.TH "HXREMOVE" "1" "10 Jul 2011" "6.x" "HTML-XML-utils"
.SH NAME
hxremove \- remove elements from an XML file by means of a CSS selector
.SH SYNOPSIS
.B hxremove
.RB "[\| " \-i " \|]"
.RB "[\| " \-l
.IR language " \|]"
.I selector
.SH DESCRIPTION
.B hxremove
reads a well-formed XML document from standard input and writes it to
standard output without any elements that match the CSS selector that
is given as argument. For example
.d
.B hxremove ol li:first-child
.e
removes the first li (list item in XHTML) from every ol (ordered
list).
.PP
Assumes that class selectors (".foo") refer to an attribute called
"class". And assumes that ID selectors ("#foo") refer to an attribute
called "id".
.PP
To handle HTML files, make them well-formed XML first, e.g., with
.BR "hxnormalize -x" .
.PP
Compare with
.BR hxselect ,
which removes everything
.I but
the selected elements.
.SH OPTIONS
The following options are supported:
.TP 10
.B \-i
Match case-insensitively. Useful for HTML and some other
SGML-based languages.
.TP
.BI \-l " language"
Sets the default language, in case the root element doesn't
have an xml:lang attribute (default: none). Example:
.B \-l en
.TP
.B \-?
Show command usage.
.SH OPERANDS
The following operand is supported:
.TP
.I selector
A selector. Most selectors from CSS level 3 are supported, with the
exception of pseudo-classes, pseudo-elements and selectors with
\fBlast\-\fR in their name.
.SH "SEE ALSO"
.BR asc2xml (1),
.BR xml2asc (1),
.BR hxnormalize (1),
.BR hxselect (1),
.BR UTF-8 " (RFC 2279)"
html-xml-utils-6.9/hxprintlinks.1 0000644 0001750 0000144 00000001144 11606170750 014027 0000000 0000000 .de d \" begin display
.sp
.in +4
.nf
..
.de e \" end display
.in -4
.fi
.sp
..
.TH "HXPRINTLINKS" "1" "10 Jul 2011" "6.x" "HTML-XML-utils"
.SH NAME
hxprintlinks \- number links and add a table of URLs at the end of an HTML file
.SH SYNOPSIS
.B hxprintlinks
.RB "[\| " \-b
.IR base " \|]"
.I file
.SH DESCRIPTION
.B hxprintlinks
adds a numbered table of all URLs (links) found in the file to the end of
the HTML file.
.SH OPTIONS
The following options are supported:
.TP
.BI \-b " base"
Prefix all URLs with the given \fIbase\fR.
.SH OPERANDS
The following operand is supported:
.TP
.I file
The file to work on.
html-xml-utils-6.9/connectsock.c 0000644 0001750 0000144 00000007753 11516104145 013674 0000000 0000000 /* connectsock.c
*
* Part of HTML-XML-utils, see:
* http://www.w3.org/Tools/HTML-XML-utils/
*
* Copyright © 1994-2011 World Wide Web Consortium
* See http://www.w3.org/Consortium/Legal/copyright-software
*
* Author: Bert Bos \n");
}
return NULL;
}
/* end -- called after the last event is reported
 *
 * clientdata is not used. Nothing is printed unless the global output
 * format is HTML, in which case three strings are written to stdout.
 * NOTE(review): the first string literal below is split across two lines
 * and the literals look as if their markup text was lost -- confirm
 * against the upstream source before compiling.
 */
void end(void *clientdata)
{
if (format == HTML) {
printf("
\n");
printf("\n");
printf("\n");
}
}
/* handle_comment -- called after a comment is parsed
 *
 * The comment text is not used: it is only freed here.
 * clientdata is not used.
 */
void handle_comment(void *clientdata, string commenttext)
{
free(commenttext);
}
/* handle_text -- called after a text chunk is parsed
 *
 * The text is not used: the chunk is only freed here.
 * clientdata is not used.
 */
void handle_text(void *clientdata, string text)
{
/* There may be several consecutive calls to this routine. */
/* escape(text); */
free(text);
}
/* handle_decl -- called after a declaration is parsed
 *
 * The declaration is ignored; its three strings are only released.
 * Each of gi, fpi and url may be NULL: free(NULL) is a no-op, so no
 * explicit guard is needed. clientdata is not used.
 */
void handle_decl(void *clientdata, string gi, string fpi, string url)
{
  (void) clientdata;
  free(gi);
  free(fpi);
  free(url);
}
/* handle_pi -- called after a PI is parsed
 *
 * Processing instructions are ignored; the text is only released.
 * pi_text may be NULL: free(NULL) is a no-op, so no guard is needed.
 * clientdata is not used.
 */
void handle_pi(void *clientdata, string pi_text)
{
  (void) clientdata;
  free(pi_text);
}
/* handle_starttag -- called after a start tag is parsed
 *
 * For each element type that can carry a link, passes a short type label,
 * an optional relation/method attribute and the link attribute to
 * output(). A BASE element with an href additionally replaces the global
 * base from then on. Elements not listed are ignored.
 *
 * Takes ownership of name and attribs: both are released before return.
 * clientdata is not used.
 */
void handle_starttag(void *clientdata, string name, pairlist attribs)
{
  /* ToDo: print text of anchor, if available */
  conststring h;

  (void) clientdata;

  if (strcasecmp(name, "base") == 0) {
    h = pairlist_get(attribs, "href");
    if (h) {
      /* base is heap-allocated (main() strdup()s and frees it), so the
         previous value must be released or it leaks on every <base>. */
      free(base);
      base = strdup(h);			/* Use as base from now on */
    }
    output("base", NULL, h);
  } else if (strcasecmp(name, "link") == 0) {
    output("link", pairlist_get(attribs, "rel"), pairlist_get(attribs, "href"));
  } else if (strcasecmp(name, "a") == 0) {
    output("a", pairlist_get(attribs, "rel"), pairlist_get(attribs, "href"));
  } else if (strcasecmp(name, "img") == 0) {
    /* An image can yield two links: the image itself and its longdesc */
    output("img", NULL, pairlist_get(attribs, "src"));
    output("longdesc", NULL, pairlist_get(attribs, "longdesc"));
  } else if (strcasecmp(name, "input") == 0) {
    /* NOTE(review): reads "href" on INPUT; "src" may have been intended
       -- confirm before changing, since it alters the program's output. */
    output("input", NULL, pairlist_get(attribs, "href"));
  } else if (strcasecmp(name, "object") == 0) {
    output("object", NULL, pairlist_get(attribs, "data"));
  } else if (strcasecmp(name, "area") == 0) {
    output("area", pairlist_get(attribs, "rel"), pairlist_get(attribs, "href"));
  } else if (strcasecmp(name, "ins") == 0) {
    output("ins", NULL, pairlist_get(attribs, "cite"));
  } else if (strcasecmp(name, "del") == 0) {
    output("del", NULL, pairlist_get(attribs, "cite"));
  } else if (strcasecmp(name, "q") == 0) {
    output("q", NULL, pairlist_get(attribs, "cite"));
  } else if (strcasecmp(name, "blockquote") == 0) {
    output("bq", NULL, pairlist_get(attribs, "cite"));
  } else if (strcasecmp(name, "form") == 0) {
    output("form", pairlist_get(attribs, "method"), pairlist_get(attribs, "action"));
  } else if (strcasecmp(name, "frame") == 0) {
    output("frame", NULL, pairlist_get(attribs, "src"));
  } else if (strcasecmp(name, "iframe") == 0) {
    output("iframe", NULL, pairlist_get(attribs, "src"));
  } else if (strcasecmp(name, "head") == 0) {
    output("head", NULL, pairlist_get(attribs, "profile"));
  } else if (strcasecmp(name, "script") == 0) {
    output("script", NULL, pairlist_get(attribs, "src"));
  } else if (strcasecmp(name, "body") == 0) {
    output("body", NULL, pairlist_get(attribs, "background"));
  }

  /* Free memory */
  pairlist_delete(attribs);
  free(name);
}
/* handle_emptytag -- called after an empty tag is parsed
 *
 * An empty tag is treated exactly like a start tag: all arguments are
 * forwarded unchanged to handle_starttag().
 */
void handle_emptytag(void *clientdata, string name, pairlist attribs)
{
handle_starttag(clientdata, name, attribs);
}
/* handle_endtag -- called after an endtag is parsed (name may be "")
 *
 * End tags are ignored; the name is only freed here.
 * clientdata is not used.
 */
void handle_endtag(void *clientdata, string name)
{
free(name);
}
/* --------------------------------------------------------------------- */
/* usage -- print usage message and exit
 *
 * Writes the version (VERSION) and a one-line synopsis, with progname as
 * the command name, to stderr and terminates with exit status 1.
 */
static void usage(string progname)
{
fprintf(stderr,
"Version %s\nUsage: %s [-l] [-r] [-h] [-b base] [-t] [HTML-file]\n",
VERSION, progname);
exit(1);
}
/* main -- parse the options, open the input and run the parser
 *
 * Exit status: 0 on success, 1 if the input could not be opened or a
 * parse error was recorded in has_error, 3 if yyparse() itself failed.
 * A non-200 status from fopenurl() ends the program through errexit().
 */
int main(int argc, char *argv[])
{
  int opt;
  int status = 200;
  char *operand;

  /* Install our handlers as the parser's callbacks */
  set_error_handler(handle_error);
  set_start_handler(start);
  set_end_handler(end);
  set_comment_handler(handle_comment);
  set_text_handler(handle_text);
  set_decl_handler(handle_decl);
  set_pi_handler(handle_pi);
  set_starttag_handler(handle_starttag);
  set_emptytag_handler(handle_emptytag);
  set_endtag_handler(handle_endtag);

  /* Options */
  while ((opt = getopt(argc, argv, "lb:rht")) != -1) {
    if (opt == 'l') format = Long;		/* Long listing */
    else if (opt == 'b') base = strdup(optarg);	/* Set base of URL */
    else if (opt == 'r') relative = true;	/* Do not make URLs absolute */
    else if (opt == 'h') format = HTML;		/* Output in HTML format */
    else if (opt == 't') format = Tuple;	/* Output as 4-tuples */
    else usage(argv[0]);
  }

  /* At most one operand: none means standard input */
  switch (argc - optind) {
  case 0:
    yyin = stdin;
    self = "-";
    break;
  case 1:
    operand = argv[optind];
    /* Without -b, the operand itself serves as the base */
    if (!base) base = strdup(operand);
    if (eq(operand, "-")) {
      yyin = stdin;
    } else {
      yyin = fopenurl(operand, "r", &status);
    }
    self = operand;
    break;
  default:
    usage(argv[0]);
  }

  if (yyin == NULL) {
    perror(argv[optind]);
    exit(1);
  }
  if (status != 200) errexit("%s : %s\n", argv[optind], http_strerror(status));

  if (yyparse() != 0) exit(3);

  if (base) free(base);
  return has_error ? 1 : 0;
}
html-xml-utils-6.9/hxincl.c 0000644 0001750 0000144 00000025367 12174313455 012660 0000000 0000000 /*
* incl - expand included files
*
* Searches for and expands the referenced file
* in place. File may be a URL. Works recursively. Other accepted
* syntaxes:
*
*
*
*
* ...
* ...
* ...
*
* If there are no quotes, the file name may not include whitespace.
*
* Copyright 1994-2012 World Wide Web Consortium
* See http://www.w3.org/Consortium/Legal/copyright-software
*
* Author: Bert Bos
* Created: 2 Dec 1998
* Version: $Id: hxincl.c,v 1.13 2013-07-25 21:02:59 bbos Exp $
*
**/
#include "config.h"
#include */
static bool use_div = false; /* Option -d */
/* handle_error -- called when a parse error occurred
 *
 * Prints the line number and the message s to stderr as "lineno: s".
 * clientdata is not used.
 */
static void handle_error(void *clientdata, const string s, int lineno)
{
fprintf(stderr, "%d: %s\n", lineno, s);
}
/* start -- called before the first event is reported
 *
 * Initializes the global tree with create(). Always returns NULL, so no
 * client data is produced by this call.
 */
static void* start(void)
{
tree = create();
return NULL;
}
/* end -- called after the last event is reported
 *
 * Intentionally does nothing; clientdata is not used.
 */
static void end(void *clientdata)
{
/* skip */
}
/* handle_comment -- called after a comment is parsed
 *
 * Appends the comment to the global tree; tree is replaced by the value
 * append_comment() returns. clientdata is not used.
 */
static void handle_comment(void *clientdata, string commenttext)
{
tree = append_comment(tree, commenttext);
}
/* handle_text -- called after a text chunk is parsed
 *
 * Appends the text to the global tree; tree is replaced by the value
 * append_text() returns. clientdata is not used.
 */
static void handle_text(void *clientdata, string text)
{
tree = append_text(tree, text);
}
/* handle_decl -- called after a declaration is parsed
 *
 * Appends the declaration (gi, fpi, url) to the global tree; tree is
 * replaced by the value append_declaration() returns.
 * clientdata is not used.
 */
static void handle_decl(void *clientdata, string gi,
string fpi, string url)
{
tree = append_declaration(tree, gi, fpi, url);
}
/* handle_pi -- called after a PI is parsed
 *
 * Appends the processing instruction to the global tree; tree is
 * replaced by the value append_procins() returns.
 * clientdata is not used.
 */
static void handle_pi(void *clientdata, string pi_text)
{
tree = append_procins(tree, pi_text);
}
/* handle_starttag -- called after a start tag is parsed
 *
 * Pushes the element onto the global tree with html_push() and, when the
 * attribute list contains an "id", registers that value with storeID().
 * clientdata is not used.
 */
static void handle_starttag(void *clientdata, string name, pairlist attribs)
{
conststring id;
tree = html_push(tree, name, attribs);
/* If it has an ID, store it (so we don't accidentally generate it) */
if ((id = pairlist_get(attribs, "id"))) storeID(id);
}
/* handle_emptytag -- called after an empty tag is parsed
 *
 * Handled exactly like a start tag: all arguments are forwarded
 * unchanged to handle_starttag().
 */
static void handle_emptytag(void *clientdata, string name, pairlist attribs)
{
handle_starttag(clientdata, name, attribs);
}
/* handle_endtag -- called after an endtag is parsed (name may be "")
 *
 * Closes the element in the global tree; tree is replaced by the value
 * html_pop() returns. clientdata is not used.
 */
static void handle_endtag(void *clientdata, string name)
{
tree = html_pop(tree, name);
}
/* indent -- write the INDENT string to stdout, level times
 *
 * Prints nothing when level is zero or negative.
 */
static void indent(int level)
{
  int done;

  for (done = 0; done < level; done++) {
    printf(INDENT);
  }
}
/* is_div -- true if t is a division (DIV, SECTION, ARTICLE, ASIDE or NAV)
 *
 * t must be an Element node (asserted). All names except "div" are
 * HTML5 sectioning elements.
 */
static bool is_div(Tree t)
{
  static const char * const division_names[] = {
    "div",
    "section",				/* HTML5 */
    "article",				/* HTML5 */
    "aside",				/* HTML5 */
    "nav"				/* HTML5 */
  };
  size_t k;

  assert(t->tp == Element);
  /* Names are tried in the listed order, stopping at the first match */
  for (k = 0; k < sizeof(division_names) / sizeof(division_names[0]); k++) {
    if (eq(t->name, division_names[k])) return true;
  }
  return false;
}
/* heading_level -- return 1..6 if t is H1..H6, else 0
 *
 * t must be an Element node (asserted). An element that carries the
 * NO_TOC class never counts as a heading, so 0 is returned for it too.
 */
static int heading_level(Tree t)
{
  static const char * const heading_names[] = {
    "h1", "h2", "h3", "h4", "h5", "h6"
  };
  int k;

  assert(t->tp == Element);
  if (has_class(t->attribs, NO_TOC)) return 0;

  /* The level is the position in the table, counted from 1 */
  for (k = 0; k < 6; k++) {
    if (eq(t->name, heading_names[k])) return k + 1;
  }
  return 0;
}
/* div_parent -- if t is the first child of a section elt, return that elt
 *
 * Returns the enclosing division when t is the first real content of its
 * parent: only comments and the like, or whitespace-only text, may come
 * before it. The parent may be the division itself or a HEADER element
 * that is, recursively, the first content of a division. Returns NULL
 * otherwise, and also when the parent carries the NO_TOC class.
 * t must be an Element with a parent (both asserted).
 */
static Tree div_parent(Tree t)
{
  Tree parent, sib, section;

  assert(t->tp == Element);
  assert(t->parent);

  parent = t->parent;
  if (parent->tp != Element) return NULL;
  if (has_class(parent->attribs, NO_TOC)) return NULL;

  /* Find the candidate division: the parent, or via a HEADER parent */
  if (is_div(parent)) {
    section = parent;
  } else {
    if (!eq(parent->name, "header")) return NULL;
    section = div_parent(parent);
    if (!section) return NULL;
  }

  /* Nothing substantial may precede t among its siblings */
  sib = parent->children;
  while (sib != t) {
    if (sib->tp == Element) return NULL;
    if (sib->tp == Text && !only_space(sib->text)) return NULL;
    sib = sib->sister;
  }
  return section;
}
/* first_child_is_heading -- true if first child is a Hn or HEADER
 *
 * Whitespace-only text and non-element, non-text nodes are skipped when
 * looking for the first child. Returns false if any other text comes
 * first or if there is no element child at all.
 * t must be an Element node (asserted).
 */
static bool first_child_is_heading(Tree t)
{
  Tree child;

  assert(t->tp == Element);

  for (child = t->children; child; child = child->sister) {
    if (child->tp == Element) {
      /* The first element decides the answer */
      return eq(child->name, "header") || heading_level(child) > 0;
    }
    if (child->tp == Text && !only_space(child->text)) {
      return false;
    }
  }
  return false;
}
static void expand(Tree t, bool *write, bool exp, bool keep_anchors,
int div_depth);
/* toc -- create a table of contents */
static void toc(Tree t, int *curlevel, bool *item_is_open, int div_depth)
{
conststring val, id;
int level;
Tree h, div = NULL;
bool write = true;
switch (t->tp) {
case Text: break;
case Comment: break;
case Declaration: break;
case Procins: break;
case Element:
if (use_div && is_div(t) && first_child_is_heading(t)) {
/* It's a section element with a heading as first child */
div_depth++;
level = 0;
} else {
/* Check if the element is a heading and what its level is */
level = heading_level(t);
if (level && use_div && (div = div_parent(t))) level = div_depth;
}
/* If it's a header for the ToC, create a list item for it */
if (level >= toc_low && level <= toc_high) {
/* Ensure there is an ID to point to */
h = use_div && div ? div : t;
if (! (id = get_attrib(h, "id"))) {
id = gen_id(h);
set_attrib(h, "id", id);
}
assert(*curlevel <= level || *item_is_open);
while (*curlevel > level) {
printf(xml ? "\n" : "\n");
indent(*curlevel - toc_low);
printf("
");
(*curlevel)--;
}
if (*curlevel == level && *item_is_open) {
printf(xml ? "\n" : "\n");
} else if (*item_is_open) {
printf("\n");
(*curlevel)++;
indent(*curlevel - toc_low);
printf("\n", toc_class);
}
while (*curlevel < level) {
indent(*curlevel - toc_low);
printf("
\n");
printf("", END_TOC);
if (!strncmp(s, BEGIN_TOC, sizeof(BEGIN_TOC) - 1))
*write = false; /* Suppress old ToC */
} else if (exp && !strncmp(s, END_TOC, sizeof(END_TOC) - 1)) {
*write = true;
} else {
printf("", h->text);
}
break;
case Declaration:
printf("name);
if (h->text) printf(" PUBLIC \"%s\"", h->text);
if (h->url) printf(" %s\"%s\"", h->text ? "" : "SYSTEM ", h->url);
printf(">");
break;
case Procins:
if (*write) printf("%s>", h->text);
break;
case Element:
if (use_div && is_div(h) && first_child_is_heading(h)) {
/* It's a section element with a heading as first child */
div_depth++;
level = div_depth;
} else {
/* Check if the element is a heading and what its level is */
level = heading_level(h);
if (level && use_div && div_parent(h)) level = 0;
}
/* Give DIVs and headers an ID, if they need one */
if (level >= toc_low && level <= toc_high) {
if (!get_attrib(h, "id")) set_attrib(h, "id", gen_id(h));
}
if (*write) {
if (! keep_anchors && eq(h->name, "a")) {
/* Don't write the and tags */
expand(h, write, exp, false, div_depth);
} else if (! keep_anchors && eq(h->name, "span")
&& has_class(h->attribs, INDEX)) {
/* Don't write \n", toc_class);
}
indent(*curlevel - toc_low);
if ((val = get_attrib(t, "class"))) {
printf("
");
level--;
}
if (item_is_open && xml) printf("\n", toc_class);
level = toc_low;
toc(get_root(t), &level, &item_is_open, 1);
while (level > toc_low) {
printf(xml ? "
" >$@
for f in $(HTML_MANS); do\
echo "
" >>$@
# This is inconvenient. In automake version 1.11, $(wildcard) worked,
# but not in version 1.14. :-(
# TESTS = $(wildcard $(top_srcdir)/tests/*.sh)
TESTS = tests/addid1.sh tests/addid1.sh tests/ascxml.sh\
tests/cdata1.sh tests/cite1.sh tests/cite2.sh tests/cite3.sh\
tests/cite4.sh\
tests/clean1.sh tests/copy1.sh tests/copy2.sh tests/copy3.sh\
tests/copy4.sh tests/copy5.sh tests/copy6.sh\
tests/copy7.sh tests/extract1.sh\
tests/hxnsxml1.sh tests/hxnsxml2.sh tests/hxnsxml3.sh\
tests/hxnsxml4.sh tests/incl1.sh tests/incl10.sh\
tests/incl11.sh tests/incl2.sh tests/incl3.sh tests/incl4.sh\
tests/incl5.sh tests/incl6.sh tests/incl7.sh tests/incl8.sh\
tests/incl9.sh tests/index.sh tests/index2.sh tests/index3.sh\
tests/index4.sh tests/index5.sh\
tests/mkbib1.sh tests/normalize1.sh\
tests/pipe1.sh tests/pipe2.sh tests/pipe3.sh tests/ref1.sh\
tests/ref2.sh tests/ref3.sh tests/relurl1.sh tests/relurl2.sh\
tests/relurl3.sh tests/remove1.sh tests/remove2.sh\
tests/tabletrans1.sh tests/tabletrans2.sh tests/tabletrans3.sh\
tests/toc1.sh tests/toc2.sh tests/uncdata1.sh tests/unpipe1.sh\
tests/unpipe2.sh tests/unpipe3.sh tests/wls1.sh tests/wls2.sh\
tests/xmlasc1.sh tests/xmlasc2.sh tests/xmlasc3.sh\
tests/xmlasc4.sh tests/xmlasc5.sh tests/xmlasc6.sh\
tests/xmlasc7.sh tests/xmlns1.sh tests/xref1.sh tests/xref2.sh\
tests/xref3.sh tests/xref4.sh tests/xref5.sh tests/xref6.sh\
tests/xref7.sh
html-xml-utils-6.9/hxindex.c 0000644 0001750 0000144 00000070216 12307602472 013031 0000000 0000000 /*
* Insert an index between "<!--begin-index-->" and "<!--end-index-->",
* or replacing the comment "<!--index-->"
*
* The index links to elements with ID attributes as well as with
* empty elements.
*
* Any tags with a class of "bctarget" are not copied, but
* regenerated. They are assumed to be backwards-compatible versions
* of ID attributes on their parent elements. But if the option -t or
* -x are given, those elements are removed.
*
* There's a limit of 100000 index terms (10^(MAXIDLEN-1)).
*
* Index terms are elements with a class of "index", "index-inst" or
* "index-def", as well as all elements. The contents of the
* element is the index term, unless the element has a title
* attribute. The title attribute can contain "|" and "!!":
*
* "term"
* "term1|term2|term3|..."
* "term!!subterm!!subsubterm!!..."
* "term1!!subterm1|term2!!subterm2|..."
* etc.
*
* For backward compatibility with an earlier Perl program, "::" is
* accepted as an alternative for "!!", but it is better not to use
* both separators in the same project, since the sorting may be
* adversely affected.
*
* Class "index-def" results in a bold entry in the index, "index" in
* a normal one. "index-inst" is an alias for "index", provided for
* backward compatibility.
*
* To do: get rid of MAXSTR.
* To do: an option to split the index at each new first letter.
*
* Copyright © 1994-2005 World Wide Web Consortium
* See http://www.w3.org/Consortium/Legal/copyright-software
*
* Author: Bert Bos ");
}
/* Print new subterms, if any */
for (j = i; j < term->nrkeys; j++) {
indent(j);
printf("
");
}
}
#if 0
int listmode = 0;
if (folding_cmp(globalprevious->sortkeys, globalprevious->nrkeys,
term->sortkeys, term->nrkeys) == 0)
if (globalurlprevious) {
string d = strchr(globalurlprevious,'#');
if (d)
listmode = strncmp(globalurlprevious, term->url, d - globalurlprevious);
else
listmode = strcmp(globalurlprevious, term->url);
}
#endif
/* Print a link */
switch (term->importance) {
case 1:
#if 0
printf("%s url);
print_full_term(term);
printf("\">%s", use_secno ? term->secno : "#");
#else
printf(", ");
printf("url);
printf("\"");
if (use_secno) print_title(term);
printf(">");
if (!use_secno) putchar('#');
else if (term->secno) print_escaped(term->secno);
else print_escaped(unknown_name);
printf("");
#endif
break;
case 2:
#if 0
printf("%s url);
print_full_term(term);
printf("\">%s", use_secno ? term->secno : "#");
#else
printf(", ");
printf("url);
printf("\"");
if (use_secno) print_title(term);
printf(">");
if (!use_secno) putchar('#');
else if (term->secno) print_escaped(term->secno);
else print_escaped(unknown_name);
printf("");
#endif
break;
default:
assert(! "Cannot happen\n");
}
/* Remember this term */
globalprevious = term;
globalurlprevious = term->url;
}
/* mkindex -- write out an index
 *
 * Walks the term tree with twalk(), calling write_index_item() for each
 * node; that routine compares every term with globalprevious, which is
 * therefore seeded here with a dummy one-key term. Afterwards one closing
 * string is printed per key of whatever globalprevious then refers to.
 * NOTE(review): two string literals below are split across lines and
 * appear to have lost their markup text -- restore them from the
 * upstream source before compiling.
 */
static void mkindex(Indexterm terms)
{
int i;
printf("
");
/* Initialize globalprevious to a term with an unlikely sortkey */
new(globalprevious);
globalprevious->nrkeys = 1;
newarray(globalprevious->sortkeys, globalprevious->nrkeys);
/* Sized to hold the sentinel key copied in on the next line */
newarray(globalprevious->sortkeys[0], 15);
wcscpy(globalprevious->sortkeys[0], L"zzzzzzzzzzzzzz");
twalk(terms, write_index_item);
/* Close all open lists */
for (i = 0; i < globalprevious->nrkeys; i++) printf("\n
");
}
/* expand -- write the tree, add if needed and replace
 *
 * Recursively prints the children of t to stdout. *write is a flag shared
 * by the whole recursion: while it is false, text, processing
 * instructions and elements are not printed. A comment equal to INDEX or
 * BEGIN_INDEX (after trimming) triggers mkindex(terms); BEGIN_INDEX also
 * clears *write so an old index is skipped, and END_INDEX sets it again.
 * NOTE(review): several format strings in this block appear to have lost
 * their markup text in extraction (one literal in the Declaration case is
 * even unterminated, and some calls pass arguments without a conversion)
 * -- restore them from the upstream source before compiling.
 */
static void expand(Tree t, bool *write, Indexterm terms)
{
conststring val;
Tree h;
pairlist a;
string s;
bool do_tag;
for (h = t->children; h != NULL; h = h->sister) {
switch (h->tp) {
case Text:
if (*write) printf("%s", h->text);
break;
case Comment:
/* Compare a trimmed copy, so the stored comment text stays untouched */
s = newstring(h->text);
trim(s);
if (eq(s, INDEX) || eq(s, BEGIN_INDEX)) {
/* Unless option "final" is set, the markers are written around the index */
if (!final) printf("\n", BEGIN_INDEX);
mkindex(terms);
if (!final) printf("", END_INDEX);
if (eq(s, BEGIN_INDEX)) *write = false; /* Skip old index */
} else if (eq(s, END_INDEX)) {
*write = true;
} else {
printf("", h->text);
}
dispose(s);
break;
case Declaration:
printf("name);
if (h->text) printf(" PUBLIC \"%s\"", h->text);
if (h->url) printf(" %s\"%s\"", h->text ? "" : "SYSTEM ", h->url);
printf(">");
break;
case Procins:
if (*write) printf("%s>", h->text);
break;
case Element:
if (*write) {
/* If an was inserted by index itself, remove it */
do_tag = !eq(h->name, "a") || !has_class(h->attribs, TARGET);
if (do_tag) {
printf("<%s", h->name);
for (a = h->attribs; a != NULL; a = a->next) {
printf(" %s", a->name);
if (a->value != NULL) printf("=\"%s\"", a->value);
}
assert(! is_empty(h->name) || h->children == NULL);
printf(xml && is_empty(h->name) ? " />" : ">");
/* Insert an if element has an ID and is not */
if (bctarget && is_mixed(h->name) && (val = get_attrib(h, "id"))
&& !eq(h->name, "a") && ! xml)
printf("", TARGET, val);
}
/* The contents are written even when the tags themselves are dropped */
expand(h, write, terms);
if (do_tag && ! is_empty(h->name)) printf("%s>", h->name);
}
break;
case Root:
assert(! "Cannot happen");
break;
default:
assert(! "Cannot happen");
}
}
}
/* termcmp -- comparison routine for Indexterms */
static int termcmp(const void *a1, const void *b1)
{
Indexterm a = (Indexterm)a1, b = (Indexterm)b1;
int r;
assert(a);
assert(b);
assert(a->sortkeys);
assert(b->sortkeys);
assert(a->nrkeys > 0);
assert(b->nrkeys > 0);
r = folding_cmp(a->sortkeys, a->nrkeys, b->sortkeys, b->nrkeys);
if (r != 0) return r;
return strcmp(a->url, b->url); /* Terms are equal, compare URL instead */
}
/* copy_contents -- recursively expand contents of element t into a string
 *
 * Appends the contents of t to *s, which may be NULL on entry and is
 * (re)allocated as needed. Text is copied with every whitespace character
 * replaced by a space. Comments, declarations and processing instructions
 * are dropped. A fixed set of inline elements is copied with tags and
 * attributes; for any other element only its contents are copied.
 * If t has no text and no retained elements, *s is left as it was.
 */
static void copy_contents(Tree t, string *s)
{
  Tree h;
  size_t i;
  pairlist a;
  string p;

  for (h = t->children; h != NULL; h = h->sister) {
    switch (h->tp) {
      case Text:
	i = *s ? strlen(*s) : 0;
	renewarray(*s, i + strlen(h->text) + 1);
	/* Copy, but transform all whitespace to spaces. The cast matters:
	   the text is UTF-8, and passing a negative char to isspace() is
	   undefined behavior. */
	for (p = h->text; *p; p++, i++)
	  (*s)[i] = isspace((unsigned char)*p) ? ' ' : *p;
	(*s)[i] = '\0';
	break;
      case Comment: break;
      case Declaration: break;
      case Procins: break;
      case Element:
	/* Only certain tags are retained */
	if (eq(h->name, "span") || eq(h->name, "code") || eq(h->name, "tt")
	    || eq(h->name, "acronym") || eq(h->name, "abbr")
	    || eq(h->name, "bdo") || eq(h->name, "kbd") || eq(h->name, "samp")
	    || eq(h->name, "sub") || eq(h->name, "sup")
	    || eq(h->name, "var")) {
	  strapp(s, "<", h->name, NULL);
	  for (a = h->attribs; a != NULL; a = a->next) {
	    if (! a->value) strapp(s, " ", a->name, NULL);
	    else strapp(s, " ", a->name, "=\"", a->value, "\"", NULL);
	  }
	  assert(! is_empty(h->name) || h->children == NULL);
	  if (is_empty(h->name)) {
	    strapp(s, xml ? " />" : ">", NULL);
	  } else {
	    strapp(s, ">", NULL);
	    copy_contents(h, s);
	    /* The end tag must mirror the "<name" written above */
	    strapp(s, "</", h->name, ">", NULL);
	  }
	} else {				/* Ignore tag, copy contents */
	  copy_contents(h, s);
	}
	break;
      case Root: assert(! "Cannot happen"); break;
      default: assert(! "Cannot happen");
    }
  }
}
/* copy_to_index -- copy the contents of element h to the index db
 *
 * Creates one or more index terms for element t and inserts them into
 * the search tree *terms (ordered by termcmp). Each term records the
 * given importance, copies of secno (if not NULL) and doctitle, and a URL
 * made of the global base, "#" and the element's id attribute.
 *
 * The term text comes from the title attribute (or, when
 * USE_DATA_ATTRIBUTE is defined, preferably from data-index), split at
 * each "|" into separate terms; without such an attribute, the element's
 * contents are used as a single term, if there are any.
 * Terminates the program through errx() when tsearch() fails.
 */
static void copy_to_index(Tree t, Indexterm *terms, int importance,
conststring secno, conststring doctitle)
{
conststring id, title;
string h;
Indexterm term;
int i, n;
id = get_attrib(t, "id");
/* With USE_DATA_ATTRIBUTE, title is only consulted if data-index is absent */
#ifdef USE_DATA_ATTRIBUTE
if (! (title = get_attrib(t, "data-index")))
#endif
title = get_attrib(t, "title");
/* Get term either from title attribute or contents */
if (title) {
i = 0;
while (title[i]) {
n = strcspn(title + i, "|"); /* Find | or \0 */
new(term);
term->importance = importance;
term->secno = secno ? newstring(secno) : NULL;
term->doctitle = newstring(doctitle);
term->url = NULL;
strapp(&term->url, base, "#", id, NULL);
h = newnstring(title + i, n);
parse_subterms(term, h);
if (! tsearch(term, (void**)terms, termcmp))
errx(1, "Out of memory while parsing term %s\n", h);
i += n;
if (title[i]) i++; /* Skip '|' */
}
/* With USE_DATA_ATTRIBUTE, title is only deleted if data-index was not */
if (final) /* Remove used attribute */
#ifdef USE_DATA_ATTRIBUTE
if (!delete_attrib(t, "data-index"))
#endif
delete_attrib(t, "title");
} else { /* Recursively copy contents */
h = NULL;
copy_contents(t, &h);
if (h) { /* Non-empty contents */
new(term);
term->importance = importance;
term->secno = secno ? newstring(secno) : NULL;
term->doctitle = newstring(doctitle);
term->url = NULL;
strapp(&term->url, base, "#", id, NULL);
parse_subterms(term, h);
if (! tsearch(term, (void**)terms, termcmp))
errx(1, "Out of memory while parsing term %s", h);
}
}
}
/* collect -- collect index terms, add IDs where needed
 *
 * t        subtree to scan; only its children are visited here
 * terms    root of the tsearch() tree that receives the terms
 * secno    in/out: most recent section number seen, may point to NULL
 * doctitle in/out: most recent document title seen, never left NULL
 *
 * Walks the children of t. A "title" element replaces *doctitle, an
 * element with class SECNO replaces *secno, and one with class NO_NUM
 * sets *secno to unknown_name. Elements that qualify as index terms are
 * given an ID if they lack one and are passed to copy_to_index(); all
 * other elements are scanned recursively.
 */
static void collect(Tree t, Indexterm *terms, string *secno, string *doctitle)
{
  int importance;
  Tree h;

  for (h = t->children; h != NULL; h = h->sister) {
    switch (h->tp) {
    case Text: case Comment: case Declaration: case Procins: break;
    case Element:
      if (eq(h->name, "title")) {
	dispose(*doctitle);
	copy_contents(h, doctitle);
	/* An empty title leaves *doctitle NULL; the title is later printed
	   unconditionally, so fall back to the empty string. */
	if (! *doctitle) *doctitle = newstring("");
      }
      if (has_class(h->attribs, SECNO)) {
	dispose(*secno);
	copy_contents(h, secno);
	if (*secno) trim(*secno);	/* Empty element: nothing to trim */
      } else if (has_class(h->attribs, NO_NUM)) {
	dispose(*secno);
	*secno = newstring(unknown_name);
      }
      /* Determine whether this element is an index term, and how important */
      if (eq(h->name, "dfn")) importance = 2;
      else if (has_class(h->attribs,INDEX)||has_class(h->attribs,INDEX_INST))
	importance = 1;
      else if (userclassnames && has_class_in_list(h->attribs, userclassnames))
	importance = 1;
      else if (has_class(h->attribs, INDEX_DEF)) importance = 2;
      else importance = 0;
      if (importance != 0) {
	/* Give it an ID, if it doesn't have one */
	if (! get_attrib(h, "id")) set_attrib(h, "id", gen_id(h));
	copy_to_index(h, terms, importance, *secno, *doctitle);
      } else {
	collect(h, terms, secno, doctitle);
      }
      break;
    case Root: assert(! "Cannot happen"); break;
    default: assert(! "Cannot happen");
    }
  }
}
/* load_index -- read persistent term db from file
 *
 * indexdb  name of the file to read
 * terms    root of the tsearch() tree that receives the loaded terms
 *
 * Every line must consist of five tab-separated columns and end in a
 * newline: term text, importance ('1' or '2'), URL, section number and
 * document title. A file that cannot be opened is silently treated as an
 * empty db. Any malformed line ends the program via errx(); that includes
 * a line that does not fit in the MAXSTR buffer, because fgets() then
 * returns it without its newline.
 */
static void load_index(const string indexdb, Indexterm *terms)
{
  FILE *f;
  size_t n1, n2, n3, n4, n5;	/* Offsets of the five column terminators */
  char line[MAXSTR];
  Indexterm term;
  string h;

  if (! (f = fopen(indexdb, "r"))) return;	/* Assume file not found... */

  while (fgets(line, sizeof(line), f)) {
    n1 = strcspn(line, "\t");
    if (line[n1] != '\t') errx(1, "Illegal syntax in %s", indexdb);
    n2 = n1 + 1 + strcspn(line + n1 + 1, "\t");
    if (line[n2] != '\t') errx(1, "Illegal syntax in %s", indexdb);
    n3 = n2 + 1 + strcspn(line + n2 + 1, "\t");
    if (line[n3] != '\t') errx(1, "Illegal syntax in %s", indexdb);
    n4 = n3 + 1 + strcspn(line + n3 + 1, "\t");
    if (line[n4] != '\t') errx(1, "Illegal syntax in %s", indexdb);
    n5 = n4 + 1 + strcspn(line + n4 + 1, "\t\n");
    if (line[n5] != '\n') errx(1, "Illegal syntax in %s", indexdb);

    /* The importance column must be exactly one character wide */
    if (n2 != n1 + 2)
      errx(1, "Error in %s (column 2 must be '1' or '2')", indexdb);

    new(term);
    h = newnstring(line, n1);
    switch (line[n1 + 1]) {
    case '1': term->importance = 1; break;
    case '2': term->importance = 2; break;
    default: errx(1, "Error in %s (column 2 must be '1' or '2')", indexdb);
    }
    term->url = newnstring(line + n2 + 1, n3 - n2 - 1);
    term->secno = newnstring(line + n3 + 1, n4 - n3 - 1);
    term->doctitle = newnstring(line + n4 + 1, n5 - n4 - 1);
    parse_subterms(term, h);
    if (! tsearch(term, (void**)terms, termcmp))
      errx(1, "Out of memory while loading %s", indexdb);
  }

  fclose(f);
}
/* save_a_term -- write one term to globalfile
 *
 * Callback for twalk(). A node is written when it is visited as a leaf
 * or in endorder, so every node is written exactly once. The output line
 * holds the term's keys joined by "!!", then importance, URL, section
 * number and document title, separated by tabs. A term without a section
 * number gets unknown_name if use_secno is set, otherwise "#".
 */
static void save_a_term(const void *term1, const VISIT which, const int dp)
{
  Indexterm entry;
  const char *section;
  int k;

  if (which != endorder && which != leaf) return;

  entry = *(Indexterm*)term1;

  /* Column 1: the keys, separated by "!!" */
  for (k = 0; k < entry->nrkeys; k++) {
    if (k != 0) fprintf(globalfile, "!!");
    fprintf(globalfile, "%s", entry->terms[k]);
  }

  /* Pick the text for the section number column */
  if (entry->secno)
    section = entry->secno;
  else if (use_secno)
    section = unknown_name;
  else
    section = "#";

  /* Columns 2 to 5 */
  fprintf(globalfile, "\t%d\t%s\t%s\t%s\n", entry->importance, entry->url,
	  section, entry->doctitle);
}
/* save_index -- write terms to file
 *
 * indexdb  name of the file to (over)write
 * terms    root of the tsearch() tree holding the terms
 *
 * Opens indexdb for writing, stores the stream in globalfile and lets
 * twalk() call save_a_term() for every node. Exits via errx() if the
 * file cannot be opened, if a write failed, or if closing (and thus
 * flushing) the file fails, so that a truncated db does not go unnoticed.
 */
static void save_index(const string indexdb, Indexterm terms)
{
  if (! (globalfile = fopen(indexdb, "w")))
    errx(1, "%s: %s", indexdb, strerror(errno));

  twalk(terms, save_a_term);

  if (ferror(globalfile)) {
    fclose(globalfile);
    errx(1, "%s: write error", indexdb);
  }
  if (fclose(globalfile) != 0)		/* fclose flushes buffered output */
    errx(1, "%s: %s", indexdb, strerror(errno));
}
/* usage -- print usage message and exit
 *
 * name  the program name to show in the message
 *
 * Prints the version and the list of options via errx(), which ends the
 * program with exit status 1. The list includes -f and -r, which the
 * option parser in main() accepts.
 */
static void usage(string name)
{
  errx(1, "Version %s\nUsage: %s [-i indexdb] [-b base] [-x] [-t] [-n] [-f] [-r] [-c userclass] [-s template] [-u phrase] [html-file]",
       VERSION, name);
}
/* tokenize -- split string s into tokens at each comma, return an array
 *
 * s  a non-empty, NUL-terminated string
 *
 * Returns a newly allocated array of newly allocated strings, with a
 * NULL pointer after the last token. Two adjacent commas yield an empty
 * token; a single trailing comma does not add a token.
 */
static string * tokenize(string s)
{
  string * t = NULL;
  size_t len;
  int n = 0;

  assert(s && s[0]);

  while (*s) {
    len = strcspn(s, ",");
    renewarray(t, n + 1);
    t[n] = newnstring(s, len);
    n++;
    s += len;
    /* Only step over a comma; never move past the terminating NUL */
    if (*s == ',') s++;
  }

  renewarray(t, n + 1);		/* Make final item NULL */
  t[n] = NULL;
  return t;
}
/* main -- parse options, read the document, update and write the index
 *
 * Exits with status 1 on usage errors or when the input cannot be
 * opened, and with status 3 when the parser reports failure.
 */
int main(int argc, char *argv[])
{
  bool write = true;
  Indexterm termtree = NULL;	/* Sorted tree of terms */
  string secno, doctitle;
  int c, status = 200;

  /* Bind the parser callback routines to our handlers */
  set_error_handler(handle_error);
  set_start_handler(start);
  set_end_handler(end);
  set_comment_handler(handle_comment);
  set_text_handler(handle_text);
  set_decl_handler(handle_decl);
  set_pi_handler(handle_pi);
  set_starttag_handler(handle_starttag);
  set_emptytag_handler(handle_emptytag);
  set_endtag_handler(handle_endtag);

  yyin = NULL;

  /* Note "c:": option -c takes an argument (a comma-separated list) */
  while ((c = getopt(argc, argv, "txb:i:c:nfrs:u:")) != -1)
    switch (c) {
    case 't': bctarget = false; break;	/* Clear the bctarget flag */
    case 'x': xml = true; break;	/* Output as XML */
    case 'b': base = newstring(optarg); break; /* Set base of URL */
    case 'i': indexdb = newstring(optarg); break; /* Set name of index db */
    case 'c':				/* Set class names */
      if (! optarg[0]) usage(argv[0]);	/* tokenize() needs non-empty input */
      userclassnames = tokenize(optarg);
      break;
    case 'n': use_secno = true; break;	/* Print section numbers instead of "#" */
    case 'f': final = true; break;	/* "Final": remove used attributes */
    case 'r': trim_punct = false; break; /* Do not remove trailing punctuation */
    case 's': section_name = newstring(optarg); break; /* Section name template */
    case 'u': unknown_name = newstring(optarg); break; /* Unknown section text */
    default: usage(argv[0]);
    }

  /* Open the input: stdin if there is no operand or the operand is "-" */
  if (optind == argc) yyin = stdin;
  else if (argc > optind + 1) usage(argv[0]);
  else if (eq(argv[optind], "-")) yyin = stdin;
  else yyin = fopenurl(argv[optind], "r", &status);
  if (yyin == NULL) {perror(argv[optind]); exit(1);}
  if (status != 200) errx(1, "%s : %s", argv[optind], http_strerror(status));

  /* Fill in defaults for options that were not given */
  if (!base) base = newstring("");
  if (!section_name) section_name = newstring("section %s");
  if (!unknown_name) unknown_name = newstring("??");

  /* Apply user's locale */
  setlocale(LC_ALL, "");

  /* Read the index DB into memory */
  if (indexdb) load_index(indexdb, &termtree);

  /* Parse, build tree, collect existing IDs */
  if (yyparse() != 0) exit(3);

  /* Scan for index terms, add them to the tree, add IDs where needed */
  secno = NULL;
  doctitle = newstring("");
  collect(get_root(tree), &termtree, &secno, &doctitle);

  /* Write out the document, adding and replacing <!--index--> */
  expand(get_root(tree), &write, termtree);

  /* Store terms to file */
  if (indexdb) save_index(indexdb, termtree);

  fclose(yyin);
#if 0
  tree_delete(tree);				/* Just to test memory mgmt */
#endif
  return 0;
}
html-xml-utils-6.9/hxextract.1 0000644 0001750 0000144 00000003462 11606170750 013311 0000000 0000000 .de d \" begin display
.sp
.in +4
.nf
..
.de e \" end display
.in -4
.fi
.sp
..
.TH "HXEXTRACT" "1" "10 Jul 2011" "6.x" "HTML-XML-utils"
.SH NAME
hxextract \- extract selected elements from an HTML or XML file
.SH SYNOPSIS
.B hxextract
.RB "[\| " \-h
.RB "| " \-? " \|]"
.RB "[\| " \-x " \|]"
.RB "[\| " \-s
.IR text " \|]"
.RB "[\| " \-e
.IR text " \|]"
.RB "[\| " \-b
.IR base " \|]"
.I element-or-class
.RB "[\| " \-c
.IR "configfile" " | "
.IR file\-or\-URL " \|]"
.SH DESCRIPTION
.B hxextract
outputs all elements with a certain name and/or class.
.PP
Input must be well-formed, since no HTML heuristics are applied.
.SH OPTIONS
The following options are supported:
.TP 10
.B \-x
Use XML format conventions.
.TP 10
.BI \-s " text"
Insert
.I text
at the start of the output.
.TP 10
.BI \-e " text"
Insert
.I text
at the end of the output.
.TP 10
.BI \-b " base"
URL base
.TP 10
.BI \-c " configfile"
Read @chapter lines from
.I configfile
(lines must be of the form "@chapter filename") and extract elements from each of those files.
.TP 10
.BR \-h ", " \-?
Print command usage.
.SH OPERANDS
The following operands are supported:
.TP 10
.I element-or-class
The name of an element to extract (e.g., "H2"), or the name of a class
preceded by "." (e.g., ".example") or a combination of both (e.g.,
"H2.example").
.TP
.I file-or-URL
A file name or a URL. To read from standard input, use "-".
.SH ENVIRONMENT
To use a proxy to retrieve remote files, set the environment variables
.B http_proxy
and
.BR ftp_proxy "."
E.g.,
.B http_proxy="http://localhost:8080/"
.SH BUGS
.LP
Remote files (specified with a URL) are currently only supported for
HTTP. Password-protected files or files that depend on HTTP "cookies"
are not handled. (You can use tools such as
.BR curl (1)
or
.BR wget (1)
to retrieve such files.)
.SH "SEE ALSO"
.BR hxselect (1)
html-xml-utils-6.9/cexport.c 0000645 0001750 0000144 00000031707 12374701340 013047 0000000 0000000 /* cexport.c -- create header file of EXPORT'ed declarations from c files */
/*
* Author: Bert Bos
%{L:
.e
This template starts with four lines of preamble, including the sort
string %A%D on line 3. The sort string itself will not be output, but
the rest of the comment will.
.PP
From the line
.B %{L:
to the line
.B %}
is the template. E.g., the line that
starts with
.B TEXT COMMENT START END NAME STRING PROCINS
%token EMPTYEND DOCTYPE ENDINCL
%type
.TP
.BI \-l " low"
Sets the lowest numbered header to appear in the table of
contents. Default is 1 (i.e., H1).
.TP
.BI \-h " high"
Sets the highest numbered header to appear in the table of
contents. Default is 6 (i.e., H6).
.TP
.B \-t
Normally,
.B hxtoc
adds both ID attributes and empty A elements with a NAME attribute and
CLASS="bctarget", so that older browsers that do not understand ID will
still find the target. With this option, the A elements will not be
generated.
.TP
.BI \-c " class"
The generated UL elements in the table of contents will have a CLASS attribute with the value
.IR class .
The default is "toc".
.TP
.B \-d
Tries to use sectioning elements as targets in the table of contents
instead of H1 to H6. A sectioning element is a DIV, SECTION, ARTICLE,
ASIDE or NAV element whose first child is a heading element (H1 to H6)
or an HGROUP. The sectioning element will be given an ID if it doesn't
have one yet. With this option, the level of any H1 to H6 that is the
first child of a sectioning element (or of an HGROUP that is itself
the first child of a sectioning element) is not determined by its
name, but by the nesting depth of the sectioning elements. (Any H1 to
H6 that are not the first child of a sectioning element still have
their level implied by their name.)
.SH OPERANDS
The following operand is supported:
.TP 10
.I file
The name of an HTML file. If absent, standard input is read instead.
.SH "DIAGNOSTICS"
The following exit values are returned:
.TP 10
.B 0
Successful completion.
.TP
.B > 0
An error occurred in the parsing of the HTML file.
.B hxtoc
will try to correct the error and produce output anyway.
.SH "SEE ALSO"
.BR asc2xml (1),
.BR hxnormalize (1),
.BR hxnum (1),
.BR xml2asc (1)
.SH BUGS
.LP
The error recovery for incorrect HTML is primitive.
html-xml-utils-6.9/heap.e 0000644 0001750 0000144 00000001534 12374701364 012302 0000000 0000000 #define fatal(msg) fatal3(msg, __FILE__, __LINE__)
/* new -- set p to malloc'ed storage for one *p; calls fatal3() on failure.
   Expands to an if/else statement, so use it as a statement: new(p); */
#define new(p) if (((p)=malloc(sizeof(*(p))))); else fatal3("out of memory", __FILE__, __LINE__)
/* dispose -- free p and set it to a null pointer; does nothing if p is null */
#define dispose(p) if (!(p)) ; else (free((void*)p), (p) = (void*)0)
/* heapmax -- expands to the constant 9999999; the argument is ignored.
   NOTE(review): the intended meaning of this limit is not visible here. */
#define heapmax(p) 9999999
/* newstring -- call heap_newstring(), passing the caller's file and line */
#define newstring(s) heap_newstring(s, __FILE__, __LINE__)
/* newnstring -- call heap_newnstring(), passing the caller's file and line */
#define newnstring(s,n) heap_newnstring(s, n, __FILE__, __LINE__)
/* newarray -- set p to malloc'ed storage for n items of type *p; calls
   fatal3() on failure */
#define newarray(p,n) if (((p)=malloc((n)*sizeof(*(p))))); else fatal3("out of memory", __FILE__, __LINE__)
/* renewarray -- realloc p to hold n items of type *p; calls fatal3() on
   failure. The result of realloc() is assigned to p directly. */
#define renewarray(p,n) if (((p)=realloc(p,(n)*sizeof(*(p))))); else fatal3("out of memory", __FILE__, __LINE__)
/* fatal3 -- called by the macros above with a message and the source
   location; presumably reports the error and ends the program (its
   definition is not in this file -- confirm) */
extern void fatal3(const char *s, const char *file, const unsigned int line);
/* heap_newstring -- target of newstring(); presumably returns a newly
   allocated copy of s (definition not in this file -- confirm) */
extern char * heap_newstring(const char *s, const char *file, const int line);
/* heap_newnstring -- target of newnstring(); presumably returns a newly
   allocated copy of the first n characters of s (definition not in this
   file -- confirm) */
extern char * heap_newnstring(const char *s, const size_t n,
const char *file, const int line);
html-xml-utils-6.9/hxwls.1 0000644 0001750 0000144 00000003146 11606170750 012443 0000000 0000000 .de d \" begin display
.sp
.in +4
.nf
..
.de e \" end display
.in -4
.fi
.sp
..
.TH "HXWLS" "1" "10 Jul 2011" "6.x" "HTML-XML-utils"
.SH NAME
hxwls \- list links in an HTML file
.SH SYNOPSIS
.B hxwls
.RB "[\| " \-l " \|]"
.RB "[\| " \-t " \|]"
.RB "[\| " \-r " \|]"
.RB "[\| " \-h " \|]"
.RB "[\| " \-b
.IR " base" " \|]"
.RI "[\| " file " \|]"
.SH DESCRIPTION
.LP
The
.B hxwls
command reads an HTML file (standard input by default) and prints out
all links it finds. The output is written to stdout.
.SH OPTIONS
The following options are supported:
.TP 10
.B \-l
Produce a long listing. Instead of just the URI,
.B hxwls
prints three columns: the element name, the value of the REL
attribute, and the target URI.
.TP
.B \-t
Produce a tuple listing.
.B hxwls
prints four columns: the URI of the document itself, the element name,
the value of the REL attribute, and the target URI.
.TP
.BI \-r
Print relative URLs as they are, without converting them to absolute
URLs.
.TP
.BI \-b " base"
Use
.I base
as the initial base URL. If there is a
. Implies
.BR \-e .
.TP
.B \-e
Always insert endtags, even if HTML does not require them (for
example:
text
text
term 1 text... other
EOF cmp -s $TMP2 $TMP3 html-xml-utils-6.9/tests/incl7.sh 0000755 0001750 0000144 00000001004 12040262563 013715 0000000 0000000 : trap 'rm -rf $TMP1 $TMP2 $TMP3' 0 TMP1=`mktemp /tmp/tmp.XXXXXXXXXX` || exit 1 TMP2=`mktemp /tmp/tmp.XXXXXXXXXX` || exit 1 TMP3=`mktemp -d /tmp/tmp.XXXXXXXXXX` || exit 1 mkdir -p $TMP3/dir1/dir2 echo 'Test' >$TMP3/dir1/test1 echo '' >$TMP3/dir1/dir2/test2 echo '' | ./hxincl -b $TMP3/. >$TMP1 (echo 'Test' echo '' echo '' ) >$TMP2 cmp -s $TMP1 $TMP2 html-xml-utils-6.9/tests/index2.sh 0000755 0001750 0000144 00000002425 12051517030 014074 0000000 0000000 : trap 'rm $TMP1 $TMP2 $TMP3' 0 TMP1=`mktemp /tmp/tmp.XXXXXXXXXX` || exit 1 TMP2=`mktemp /tmp/tmp.XXXXXXXXXX` || exit 1 TMP3=`mktemp /tmp/tmp.XXXXXXXXXX` || exit 1 cat >$TMP1 <<-EOFA-term
<M-term>
Z-term
Remove this.
EOF
# The echo adds a newline at the end of the file
#
(./hxnum $TMP1 | LC_ALL=C ./hxindex -t -n -f; echo) >$TMP2
cat >$TMP3 < A-term
<M-term>
Z-term
1. Heading 0
Heading 1
1.1. Heading 2
1.1.1. Heading 3
2. Index