xmlformat-1.04/0000755000175000017500000000000010470215403012265 5ustar pauldevelxmlformat-1.04/xmlformat.pl0000755000175000017500000014542410355364247014665 0ustar pauldevel#! /usr/bin/perl -w # vim:set ts=2 sw=2 expandtab: # xmlformat - configurable XML file formatter/pretty-printer # Copyright (c) 2004, 2005 Kitebird, LLC. All rights reserved. # Some portions are based on the REX shallow XML parser, which # is Copyright (c) 1998, Robert D. Cameron. These include the # regular expression parsing variables and the shallow_parse() # method. # This software is licensed as described in the file LICENSE, # which you should have received as part of this distribution. # Syntax: xmlformat [config-file] xml-file # Default config file is $ENV{XMLFORMAT_CONF} or ./xmlformat.conf, in that # order. # Paul DuBois # paul@kitebird.com # 2003-12-14 # The input document first is parsed into a list of strings. Each string # represents one of the following: # - text node # - processing instruction (the XML declaration is treated as a PI) # - comment # - CDATA section # - DOCTYPE declaration # - element tag (either , , or ), *including attributes* # Entities are left untouched. They appear in their original form as part # of the text node in which they occur. # The list of strings then is converted to a hierarchical structure. # The document top level is represented by a reference to a list. # Each list element is a reference to a node -- a hash that has "type" # and "content" key/value pairs. The "type" key indicates the node # type and has one of the following values: # "text" - text node # "pi" - processing instruction node # "comment" - comment node # "CDATA" - CDATA section node # "DOCTYPE" - DOCTYPE node # "elt" - element node # (For purposes of this program, it's really only necessary to have "text", # "elt", and "other". The types other than "text" and "elt" currently are # all treated the same way.) # For all but element nodes, the "content" value is the text of the node. 
# For element nodes, the "content" hash is a reference to a list of # nodes for the element's children. In addition, an element node has # three additional key/value pairs: # - The "name" value is the tag name within the opening tag, minus angle # brackets or attributes. # - The "open_tag" value is the full opening tag, which may also be the # closing tag. # - The "close_tag" value depends on the opening tag. If the open tag is # "", the close tag is "". If the open tag is "", the # close tag is the empty string. # If the tree structure is converted back into a string with # tree_stringify(), the result can be compared to the input file # as a regression test. The string should be identical to the original # input document. use strict; use Getopt::Long; $Getopt::Long::ignorecase = 0; # options are case sensitive $Getopt::Long::bundling = 1; # allow short options to be bundled my $PROG_NAME = "xmlformat"; my $PROG_VERSION = "1.04"; my $PROG_LANG = "Perl"; # ---------------------------------------------------------------------- package XMLFormat; use strict; # ---------------------------------------------------------------------- # Regular expressions for parsing document components. Based on REX. 
# SPE = shallow parsing expression
# SE = scanning expression
# CE = completion expression
# RSB = right square brackets
# QM = question mark

# The patterns below are built up as plain strings and interpolated into
# one another; the final one, $XML_SPE, is the pattern applied to a whole
# document.  Every group is non-capturing, so a global match in list
# context yields whole tokens.

# A run of one or more characters that does not contain "<" (text).
my $TextSE = "[^<]+";
# Everything up to and including the next hyphen.
my $UntilHyphen = "[^-]*-";
# Everything up to and including the next pair of adjacent hyphens.
my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
# Remainder of a comment after its opening "--": through the closing
# "--", with the final ">" optional.
my $CommentCE = "$Until2Hyphens>?";
# Everything up to and including the next run of two or more "]".
my $UntilRSBs = "[^\\]]*\\](?:[^\\]]+\\])*\\]+";
# Remainder of a CDATA section: through the first "]]" run that is
# immediately followed by ">".
my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
# One or more whitespace characters (space, newline, tab, carriage return).
my $S = "[ \\n\\t\\r]+";
# Name start character: ASCII letter, "_", ":", or any non-ASCII character.
my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
# Name character: as above, plus ASCII digits, "." and "-".
my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
# A name: one start character followed by any number of name characters.
my $Name = "(?:$NameStrt)(?:$NameChar)*";
# A double-quoted or single-quoted string.
my $QuoteSE = "\"[^\"]*\"|'[^']*'";
# DOCTYPE identification: whitespace, a name, then any number of
# whitespace-separated names or quoted strings.
my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
# Remainder of a markup declaration, through its closing ">"; quoted
# strings are skipped as units.
my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
# Exactly one whitespace character.
my $S1 = "[\\n\\r\\t ]";
# Everything up to and including the next run of one or more "?".
my $UntilQMs = "[^?]*\\?+";
# Tail of a processing instruction after its target name: either an
# immediate "?>", or whitespace followed by content through "?>".
my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
# One item inside a DOCTYPE internal subset: a comment, some other "<!"
# declaration, a processing instruction, a "%name;" reference, or whitespace.
my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
# Remainder of a DOCTYPE declaration after the "DOCTYPE" keyword, with an
# optional bracketed internal subset; the final ">" is optional.
my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*\\](?:$S)?)?>?";
# What may follow "<!": a comment, a CDATA section, or a DOCTYPE.
my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
# What may follow "<?": the PI target name and, optionally, its tail.
my $PI_CE = "$Name(?:$PI_Tail)?";
# What may follow "</": the tag name, optional whitespace, optional ">".
my $EndTagCE = "$Name(?:$S)?>?";
# A quoted attribute value; "<" is not permitted inside the quotes.
my $AttValSE = "\"[^<\"]*\"|'[^<']*'";
# What may follow "<" for an element tag: the tag name, any number of
# name=value attributes, optional whitespace, optional "/", optional ">".
my $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
# Any markup token: "<" followed by one of the completions above.  Each
# completion is optional, so a lone "<" still matches.
my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
# Any token: a text run or a markup token.
my $XML_SPE = "$TextSE|$MarkupSPE";

# ----------------------------------------------------------------------

# Allowable options and their possible values:
# - The keys of this hash are the allowable option names
# - The value for each key is list of allowable option values
# - If the value is undef, the option value must be numeric
# If any new formatting option is added to this program, it
# must be specified here, *and* a default value for it should
# be listed in the *DOCUMENT and *DEFAULT pseudo-element
# option hashes.
my %opt_list =
(
  "format"        => [ "block", "inline", "verbatim" ],
  "normalize"     => [ "yes", "no" ],
  "subindent"     => undef,
  "wrap-length"   => undef,
  "entry-break"   => undef,
  "exit-break"    => undef,
  "element-break" => undef
);

# Constructor.  Builds the formatter object with the built-in option sets
# for the two pseudo-elements, validates those option sets, and dies if
# either of them is illegal or incomplete.
#
# The *DOCUMENT and *DEFAULT options may be overridden by the configuration
# file, but they are also built in so that they always exist.  Each of the
# two structures must supply a value for every option in %opt_list.

sub new
{
  my $class = shift;

  # Options for the children of the document top level.
  # - entry-break must stay 0: no extra newlines before the first element
  #   of the output.
  # - exit-break must stay 1: a newline after the final element of the
  #   output document.
  # - The others are best left alone too, except perhaps element-break.
  my %document_opts =
  (
    "format"        => "block",
    "normalize"     => "no",
    "subindent"     => 0,
    "wrap-length"   => 0,
    "entry-break"   => 0,   # do not change
    "exit-break"    => 1,   # do not change
    "element-break" => 1
  );

  # Options used for any document element that the configuration file
  # does not mention explicitly.
  my %default_opts =
  (
    "format"        => "block",
    "normalize"     => "no",
    "subindent"     => 1,
    "wrap-length"   => 0,
    "entry-break"   => 1,
    "exit-break"    => 1,
    "element-break" => 1
  );

  # elt_opts maps an element name to its hash of formatting options.
  my $self =
  {
    elt_opts =>
    {
      "*DOCUMENT" => \%document_opts,
      "*DEFAULT"  => \%default_opts
    }
  };

  my $problems = 0;

  # Pass 1: run every built-in value through the option checker and
  # store the value it hands back.
  foreach my $elt_name (keys (%{$self->{elt_opts}}))
  {
    my $elt_opts = $self->{elt_opts}->{$elt_name};
    foreach my $opt_name (keys (%{$elt_opts}))
    {
      my ($checked_val, $err_msg) =
        check_option ($opt_name, $elt_opts->{$opt_name});
      if (defined ($err_msg))
      {
        warn "LOGIC ERROR: $elt_name default option is invalid\n";
        warn "$err_msg\n";
        ++$problems;
        next;
      }
      $elt_opts->{$opt_name} = $checked_val;
    }
  }

  # Pass 2: every known option must be present in both structures.
  foreach my $opt_name (keys (%opt_list))
  {
    foreach my $elt_name (keys (%{$self->{elt_opts}}))
    {
      next if exists ($self->{elt_opts}->{$elt_name}->{$opt_name});
      warn "LOGIC ERROR: $elt_name has no default '$opt_name' option\n";
      ++$problems;
    }
  }

  if ($problems > 0)
  {
    die "Cannot continue; internal default formatting options must be fixed\n";
  }

  return bless ($self, $class);
}

# Reset all state that belongs to a single document.

sub init_doc_vars
{
  my ($self) = @_;

  # Names of elements seen in the document that have no explicit entry
  # in the configuration file.
  $self->{unconf_elts} = { };

  # Token list for the current document, and a parallel list giving the
  # input line number of each token.
  $self->{tokens} = [ ];
  $self->{line_num} = [ ];

  # Node tree built from the token list.
  $self->{tree} = [ ];

  # Formatting output:
  # - out_doc is the output document produced so far
  # - pending holds strings that are kept back until they are flushed
  $self->{out_doc} = "";
  $self->{pending} = [ ];

  # Inline elements are processed with the normalize/wrap-length values
  # of the block that encloses them.  Blocks and inlines nest, so the
  # options of each open block are kept on a stack (the name stack is
  # for debugging only).
  $self->{block_name_stack} = [ ];
  $self->{block_opts_stack} = [ ];

  # Parallel stack holding the current break type of each open block.
  return $self->{block_break_type_stack} = [ ];
}

# Accessors: token list and output document

sub tokens
{
  return $_[0]->{tokens};
}

sub out_doc
{
  return $_[0]->{out_doc};
}

# Append a string to the output document.

sub add_to_doc
{
  my $self = shift;
  my $text = shift;

  return $self->{out_doc} .= $text;
}

# Append a string to the list of pending (not yet flushed) output.

sub add_to_pending
{
  my $self = shift;
  my $text = shift;

  return push (@{$self->{pending}}, $text);
}

# Block stack maintenance.

# Enter a block: record its name and options, and start it off with the
# "entry-break" break type.

sub begin_block
{
  my $self = shift;
  my ($block_name, $block_opts) = @_;

  push (@{$self->{block_name_stack}}, $block_name);
  push (@{$self->{block_opts_stack}}, $block_opts);
  return push (@{$self->{block_break_type_stack}}, "entry-break");
}

# Leave a block: discard the top entry of each of the three stacks.

sub end_block
{
  my ($self) = @_;

  pop (@{$self->{block_name_stack}});
  pop (@{$self->{block_opts_stack}});
  return pop (@{$self->{block_break_type_stack}});
}

# True if the innermost open block has normalize = yes.

sub block_normalize
{
  my ($self) = @_;

  my $top_opts = $self->{block_opts_stack}->[-1];
  return $top_opts->{normalize} eq "yes";
}

# Wrap length of the innermost open block.

sub block_wrap_length
{
  my ($self) = @_;

  my $top_opts = $self->{block_opts_stack}->[-1];
  return $top_opts->{"wrap-length"};
}

# Replace the break type of the innermost open block.

sub set_block_break_type
{
  my $self = shift;
  my $break_type = shift;

  return $self->{block_break_type_stack}->[-1] = $break_type;
}

# Number of newlines for the innermost open block's current break type:
# the break type names the option whose value is returned.

sub block_break_value
{
  my ($self) = @_;

  my $top_opts = $self->{block_opts_stack}->[-1];
  my $break_type = $self->{block_break_type_stack}->[-1];
  return $top_opts->{$break_type};
}

# ----------------------------------------------------------------------

# Read configuration information.
# For each element, construct a hash
# containing a hash key and value for each option name and value.
# After reading the file, fill in missing option values for
# incomplete option structures using the *DEFAULT options.
#
# Arguments: the formatter object and the name of the configuration file.
# Dies if the file cannot be opened, if an option line appears before any
# element names, if an option line is malformed, or if check_option
# rejects an option name or value.
#
# File format, as implemented below:
# - Blank lines and lines whose first non-blank character is "#" are skipped.
# - A line that does not begin with whitespace lists element names,
#   separated by whitespace and/or commas.  A trailing backslash continues
#   the list on the next line.
# - A line that begins with whitespace sets one option ("name value" or
#   "name = value") for every element in the most recent name list.

sub read_config
{
  my $self = shift;
  my $conf_file = shift;
  my @elt_names = ();
  my $err_msg;
  my $in_continuation = 0;
  my $saved_line = "";

  # Three-argument open with a lexical handle: the file name is used
  # exactly as given (no mode characters or surrounding whitespace are
  # interpreted), and no global bareword handle is left behind.
  open (my $fh, "<", $conf_file)
    or die "Cannot read config file $conf_file: $!\n";

  # Read into a lexical rather than the global $_, so that the caller's
  # $_ is not clobbered.  $. still refers to this handle, because it is
  # the handle most recently read from.
  while (my $line = <$fh>)
  {
    chomp ($line);
    next if $line =~ /^\s*($|#)/;   # skip blank lines, comments

    # Join this line to the saved first part of a continued line.
    if ($in_continuation)
    {
      $line = $saved_line . " " . $line;
      $saved_line = "";
      $in_continuation = 0;
    }

    if ($line !~ /^\s/)
    {
      # Line doesn't begin with whitespace, so it lists element names.
      # Names are separated by whitespace or commas, possibly followed
      # by a continuation character or a comment.
      if ($line =~ /\\$/)
      {
        $line =~ s/\\$//;           # remove continuation character
        $saved_line = $line;
        $in_continuation = 1;
        next;
      }
      $line =~ s/\s*#.*$//;         # remove any trailing comment
      @elt_names = split (/[\s,]+/, $line);

      # Make sure each name has an entry in the elt_opts structure.
      foreach my $elt_name (@elt_names)
      {
        $self->{elt_opts}->{$elt_name} = { }
          unless exists ($self->{elt_opts}->{$elt_name});
      }
    }
    else
    {
      # Line begins with whitespace, so it contains an option
      # to apply to the current element list, possibly followed by
      # a comment.  First check that there is a current list.
      # Then parse the option name/value.
      die "$conf_file:$.: Option setting found before any "
          . "elements were named.\n"
        if !@elt_names;
      $line =~ s/\s*#.*$//;
      my ($opt_name, $opt_val) =
        $line =~ /^\s+(\S+)(?:\s+|\s*=\s*)(\S+)$/;
      die "$conf_file:$.: Malformed line: $line\n"
        unless defined ($opt_val);

      # Check option.  If illegal, die with message.  Otherwise,
      # add option to each element in current element list.
      ($opt_val, $err_msg) = check_option ($opt_name, $opt_val);
      die "$conf_file:$.: $err_msg\n" if defined ($err_msg);
      foreach my $elt_name (@elt_names)
      {
        $self->{elt_opts}->{$elt_name}->{$opt_name} = $opt_val;
      }
    }
  }
  close ($fh);

  # For any element that has missing option values, fill in the values
  # using the options for the *DEFAULT pseudo-element.  This speeds up
  # element option lookups later.  It also makes it unnecessary to test
  # each option to see if it's defined: all element option structures
  # will have every option defined.
  my $def_opts = $self->{elt_opts}->{"*DEFAULT"};
  foreach my $elt_name (keys (%{$self->{elt_opts}}))
  {
    next if $elt_name eq "*DEFAULT";
    foreach my $opt_name (keys (%{$def_opts}))
    {
      next if exists ($self->{elt_opts}->{$elt_name}->{$opt_name});   # already set
      $self->{elt_opts}->{$elt_name}->{$opt_name} = $def_opts->{$opt_name};
    }
  }
}

# Check option name to make sure it's legal.  Check the value to make sure
# that it's legal for the name.  Return a two-element array:
# (value, undef) if the option name and value are legal.
# (undef, message) if an error was found; message contains error message.
# For legal values, the returned value should be assigned to the option.
# (The value is currently returned unchanged.)

sub check_option
{
  my ($opt_name, $opt_val) = @_;

  # - Check option name to make sure it's a legal option
  # - Then check the value.  If there is a list of values
  #   the value must be one of them.  Otherwise, the value
  #   must be an integer.
  return (undef, "Unknown option name: $opt_name")
    unless exists ($opt_list{$opt_name});

  my $allowable_val = $opt_list{$opt_name};
  if (defined ($allowable_val))
  {
    # Compare as plain strings.  The value comes from the configuration
    # file, so it must not be interpolated into a pattern: doing so would
    # let regex metacharacters match a legal value ("y.s" for "yes") or
    # abort on an unbalanced "(".
    return (undef, "Unknown '$opt_name' value: $opt_val")
      unless grep { $_ eq $opt_val } @{$allowable_val};
  }
  else    # other options should be numeric
  {
    # "$opt_val" converts $opt_val to string for pattern match.
    # \A and \z anchor the whole string; "$" would also accept a value
    # followed by a newline.
    return (undef, "'$opt_name' value ($opt_val) should be an integer")
      unless "$opt_val" =~ /\A\d+\z/;
  }
  return ($opt_val, undef);
}

# Return hash of option values for a given element.  If no options are found:
# - Add the element name to the list of unconfigured elements.
# - Assign the default options to the element.  (This way the test for the
#   option fails only once.)  The element then shares the *DEFAULT hash
#   itself, not a copy of it.

sub get_opts
{
  my $self = shift;
  my $elt_name = shift;

  my $opts = $self->{elt_opts}->{$elt_name};
  if (!defined ($opts))
  {
    $self->{unconf_elts}->{$elt_name} = 1;
    $opts = $self->{elt_opts}->{$elt_name} = $self->{elt_opts}->{"*DEFAULT"};
  }
  return $opts;
}

# Display contents of configuration options to be used to process document.
# For each element named in the elt_opts structure, display its format
# type, and those options that apply to the type.  Output goes to the
# currently selected output handle.

sub display_config
{
  my $self = shift;

  # Format types and the additional options that apply to each type
  my $format_opts = {
    "block" => [
                  "entry-break",
                  "element-break",
                  "exit-break",
                  "subindent",
                  "normalize",
                  "wrap-length"
                ],
    "inline" => [ ],
    "verbatim" => [ ]
  };

  foreach my $elt_name (sort (keys (%{$self->{elt_opts}})))
  {
    print "$elt_name\n";
    my %opts = %{$self->{elt_opts}->{$elt_name}};
    my $format = $opts{format};
    # Write out format type, then options that apply to the format type
    print " format = $format\n";
    foreach my $opt_name (@{$format_opts->{$format}})
    {
      print " $opt_name = $opts{$opt_name}\n";
    }
    print "\n";
  }
}

# Display the list of elements that are used in the document but not
# configured in the configuration file.
# Then re-unconfigure the elements so that they won't be considered
# as configured for the next document, if there is one.
#
# The report is printed to the currently selected output handle; the
# list of names is sorted and wrapped by line_wrap.

sub display_unconfigured_elements
{
  my ($self) = @_;

  my @unconfigured = keys (%{$self->{unconf_elts}});

  # Nothing to report, and nothing to remove from elt_opts either.
  unless (@unconfigured)
  {
    print "The document contains no unconfigured elements.\n";
    return;
  }

  print "The following document elements were assigned no formatting options:\n";
  my $name_list = join (" ", sort (@unconfigured));
  my @wrapped = $self->line_wrap ([ $name_list ], 0, 0, 65);
  foreach my $line (@wrapped)
  {
    print "$line\n";
  }

  # Drop the option entries that get_opts created for these elements.
  delete (@{$self->{elt_opts}}{@unconfigured});
  return;
}

# ----------------------------------------------------------------------

# Main document processing routine.
# - Argument is a string representing an input document
# - Return value is the reformatted document, or undef.  An undef return
#   signifies either that an error occurred, or that some option was
#   given that suppresses document output.  In either case, don't write
#   any output for the document.  Any error messages will already have
#   been printed when this returns.
sub process_doc
{
  my ($self, $doc, $verbose, $check_parser, $canonize_only,
      $show_unconf_elts) = @_;

  # Progress messages are written to stderr, and only in verbose mode.
  my $progress = sub { warn $_[0] if $verbose; };

  $self->init_doc_vars ();

  # Lexical parse: split the document into a list of tokens.
  $progress->("Parsing document...\n");
  $self->shallow_parse ($doc);

  # Parser self-test: the tokens, joined back together, must reproduce
  # the input exactly.  Report the result and produce no document.
  if ($check_parser)
  {
    $progress->("Checking parser...\n");
    my $rejoined = join ("", @{$self->tokens ()});
    if ($rejoined eq $doc)
    {
      print "Parser is okay\n";
    }
    else
    {
      print "PARSER ERROR: document token concatenation differs from document\n";
    }
    return undef;
  }

  # Record the input line on which each token begins.
  $self->assign_line_numbers ();

  # Give up if the parser produced any error tokens.
  $progress->("Checking document for errors...\n");
  if ($self->report_errors () > 0)
  {
    warn "Cannot continue processing document.\n";
    return undef;
  }

  # Build the node tree from the token list.
  $progress->("Converting document tokens to tree...\n");
  if ($self->tokens_to_tree () > 0)
  {
    warn "Cannot continue processing document.\n";
    return undef;
  }

  # Integrity check that can be enabled when debugging the to-tree and
  # stringify operations: the stringified tree should be identical to
  # the original document string.
  #my $check = $self->tree_stringify ();
  #print $check;
  #warn "ERROR: mismatch between document and resulting string\n" if $doc ne $check;

  # Remove extraneous whitespace from the tree.
  $progress->("Canonizing document tree...\n");
  $self->tree_canonize ();

  if ($canonize_only)
  {
    print $self->tree_stringify () . "\n";
    return undef;
  }

  # Canonizing looks up the formatting options of every element in the
  # document, which as a side effect builds the list of elements that
  # have no explicit configuration.  Show that list if it was asked for.
  if ($show_unconf_elts)
  {
    $self->display_unconfigured_elements ();
    return undef;
  }

  # Produce the formatted document as a single string.
  $progress->("Formatting document tree...\n");
  $self->tree_format ();

  # A non-empty result should already end with a newline when the
  # *DOCUMENT options have exit-break = 1.  If it does not, add the
  # newline, but say so rather than doing it silently.
  my $formatted = $self->out_doc ();
  if ($formatted ne "" && $formatted !~ /\n$/)
  {
    warn "LOGIC ERROR: trailing newline had to be added\n";
    $formatted .= "\n";
  }
  return $formatted;
}

# ----------------------------------------------------------------------

# Split an XML document string into tokens and store the token list in
# the object.

sub shallow_parse
{
  my $self = shift;
  my $xml_document = shift;

  my @tokens = ($xml_document =~ /$XML_SPE/g);
  return $self->{tokens} = \@tokens;
}

# ----------------------------------------------------------------------

# Return the tag name found at the start of an opening or closing tag.
# Dies if there is none, because this is supposed to be called only with
# a legal tag.

sub extract_tag_name
{
  my ($tag) = @_;

  if ($tag =~ /^<\/?($Name)/)
  {
    return $1;
  }
  die "Cannot find tag name in tag: $tag\n";
}

# ----------------------------------------------------------------------

# Store, for each token, the number of the input line on which the token
# begins.  Lines are numbered from 1.

sub assign_line_numbers
{
  my ($self) = @_;

  my $cur_line = 1;
  my @start_lines = ();

  foreach my $token (@{$self->{tokens}})
  {
    push (@start_lines, $cur_line);
    # tr with an empty replacement list only counts the newlines
    $cur_line += ($token =~ tr/\n//);
  }
  $self->{line_num} = \@start_lines;
  return;
}

# ----------------------------------------------------------------------

# Check token list for errors and report any that are found.  Error
# tokens are those that begin with "<" but do not end with ">".
# Returns the error count.
# Does not modify the original token list. sub report_errors { my $self = shift; my $err_count = 0; for (my $i = 0; $i < @{$self->{tokens}}; $i++) { my $token = $self->{tokens}->[$i]; if ($token =~ /^$/) { my $line_num = $self->{line_num}->[$i]; warn "Malformed token at line $line_num, token " . ($i+1) . ": $token\n"; ++$err_count; } } warn "Number of errors found: $err_count\n" if $err_count > 0; return $err_count; } # ---------------------------------------------------------------------- # Helper routine to print tag stack for tokens_to_tree sub print_tag_stack { my ($label, @stack) = @_; if (@stack < 1) { warn " $label: none\n"; } else { warn " $label:\n"; for (my $i = 0; $i < @stack; $i++) { warn " ", ($i+1), ": ", $stack[$i], "\n"; } } } # Convert the list of XML document tokens to a tree representation. # The implementation uses a loop and a stack rather than recursion. # Does not modify the original token list. # Returns an error count. sub tokens_to_tree { my $self = shift; my @tag_stack = (); # stack for element tags my @children_stack = (); # stack for lists of children my $children = [ ]; # current list of children my $err_count = 0; for (my $i = 0; $i < @{$self->{tokens}}; $i++) { my $token = $self->{tokens}->[$i]; my $line_num = $self->{line_num}->[$i]; my $tok_err = "Error near line $line_num, token " . ($i+1) . " ($token)"; if ($token !~ /^), close the # element immediately, giving it an empty child list. # - Otherwise, push tag and child list on stacks, begin new child # list for element body. if ($token =~ /\/>$/) # tag is of form { push (@{$children}, element_node ($token, "", [ ])); } else # tag is of form { push (@tag_stack, $token); push (@children_stack, $children); $children = [ ]; } } } # At this point, the stacks should be empty if the document is # well-formed. 
if (@tag_stack) { warn "Error at EOF: Unclosed tags; malformed document?\n"; print_tag_stack ("unclosed tags", @tag_stack); ++$err_count; } if (@children_stack) { warn "Error at EOF: Unprocessed child elements; malformed document?\n"; # TODO: print out info about them ++$err_count; } $self->{tree} = $children; return $err_count; } # Node-generating helper methods for tokens_to_tree # Generic node generator sub node { return { "type" => $_[0], "content" => $_[1] }; } # Generators for specific non-element nodes sub text_node { return node ("text", $_[0]); } sub comment_node { return node ("comment", $_[0]); } sub pi_node { return node ("pi", $_[0]); } sub doctype_node { return node ("DOCTYPE", $_[0]); } sub cdata_node { return node ("CDATA", $_[0]); } # For an element node, create a standard node with the type and content # key/value pairs. Then add pairs for the "name", "open_tag", and # "close_tag" hash keys. sub element_node { my ($open_tag, $close_tag, $children) = @_; my $elt = node ("elt", $children); # name is the open tag with angle brackets and attibutes stripped $elt->{name} = extract_tag_name ($open_tag); $elt->{open_tag} = $open_tag; $elt->{close_tag} = $close_tag; return $elt; } # ---------------------------------------------------------------------- # Convert the given XML document tree (or subtree) to string form by # concatentating all of its components. Argument is a reference # to a list of nodes at a given level of the tree. # Does not modify the node list. sub tree_stringify { my $self = shift; my $children = shift || $self->{tree}; # use entire tree if no arg; my $str = ""; for (my $i = 0; $i < @{$children}; $i++) { my $child = $children->[$i]; # - Elements have list of child nodes as content (process recursively) # - All other node types have text content if ($child->{type} eq "elt") { $str .= $child->{open_tag} . $self->tree_stringify ($child->{content}) . 
$child->{close_tag}; } else { $str .= $child->{content}; } } return $str; } # ---------------------------------------------------------------------- # Put tree in "canonical" form by eliminating extraneous whitespace # from element text content. # $children is a list of child nodes # This function modifies the node list. # Canonizing occurs as follows: # - Comment, PI, DOCTYPE, and CDATA nodes remain untouched # - Verbatim elements and their descendants remain untouched # - Within non-normalized block elements: # - Delete all-whitespace text node children # - Leave other text node children untouched # - Within normalized block elements: # - Convert runs of whitespace (including line-endings) to single spaces # - Trim leading whitespace of first text node # - Trim trailing whitespace of last text node # - Trim whitespace that is adjacent to a verbatim or non-normalized # sub-element. (For example, if a is followed by # more text, delete any whitespace at beginning of that text.) # - Within inline elements: # - Normalize the same way as the enclosing block element, with the # exception that a space at the beginning or end is not removed. # (Otherwise, three blind mice # would become threeblindmice.) sub tree_canonize { my $self = shift; $self->{tree} = $self->tree_canonize2 ($self->{tree}, "*DOCUMENT"); } sub tree_canonize2 { my $self = shift; my $children = shift; my $par_name = shift; # Formatting options for parent my $par_opts = $self->get_opts ($par_name); # If parent is a block element, remember its formatting options on # the block stack so they can be used to control canonization of # inline child elements. $self->begin_block ($par_name, $par_opts) if $par_opts->{format} eq "block"; # Iterate through list of child nodes to preserve, modify, or # discard whitespace. Return resulting list of children. # Canonize element and text nodes. Leave everything else (comments, # processing instructions, etc.) untouched. 
my @new_children = (); while (@{$children}) { my $child = shift (@{$children}); if ($child->{type} eq "elt") { # Leave verbatim elements untouched. For other element nodes, # canonize child list using options appropriate to element. if ($self->get_opts ($child->{name})->{format} ne "verbatim") { $child->{content} = $self->tree_canonize2 ($child->{content}, $child->{name}); } } elsif ($child->{type} eq "text") { # Delete all-whitespace node or strip whitespace as appropriate. # Paranoia check: We should never get here for verbatim elements, # because normalization is irrelevant for them. die "LOGIC ERROR: trying to canonize verbatim element $par_name!\n" if $par_opts->{format} eq "verbatim"; if (!$self->block_normalize ()) { # Enclosing block is not normalized: # - Delete child all-whitespace text nodes. # - Leave other text nodes untouched. next if $child->{content} =~ /^\s*$/; } else { # Enclosing block is normalized, so normalize this text node: # - Convert runs of whitespace characters (including # line-endings characters) to single spaces. # - Trim leading whitespace if this node is the first child # of a block element or it follows a non-normalized node. # - Trim leading whitespace if this node is the last child # of a block element or it precedes a non-normalized node. # These are nil if there is no prev or next child my $prev_child = $new_children[$#new_children]; my $next_child = $children->[0]; $child->{content} =~ s/\s+/ /g; $child->{content} =~ s/^ // if (!defined ($prev_child) && $par_opts->{format} eq "block") || $self->non_normalized_node ($prev_child); $child->{content} =~ s/ $// if (!defined ($next_child) && $par_opts->{format} eq "block") || $self->non_normalized_node ($next_child); # If resulting text is empty, discard the node. 
next if $child->{content} =~ /^$/; } } push (@new_children, $child); } # Pop block stack if parent was a block element $self->end_block () if $par_opts->{format} eq "block"; return \@new_children; } # Helper function for tree_canonize(). # Determine whether a node is normalized. This is used to check # the node that is adjacent to a given text node (either previous # or following). # - No is node is nil # - No if the node is a verbatim element # - If the node is a block element, yes or no according to its # normalize option # - No if the node is an inline element. Inlines are normalized # if the parent block is normalized, but this method is not called # except while examinine normalized blocks. So its inline children # are also normalized. # - No if node is a comment, PI, DOCTYPE, or CDATA section. These are # treated like verbatim elements. sub non_normalized_node { my $self = shift; my $node = shift; return 0 if !$node; my $type = $node->{type}; if ($type eq "elt") { my $node_opts = $self->get_opts ($node->{name}); if ($node_opts->{format} eq "verbatim") { return 1; } if ($node_opts->{format} eq "block") { return $node_opts->{normalize} eq "no"; } if ($node_opts->{format} eq "inline") { return 0; } die "LOGIC ERROR: non_normalized_node: unhandled node format.\n"; } if ($type eq "comment" || $type eq "pi" || $type eq "DOCTYPE" || $type eq "CDATA") { return 1; } if ($type eq "text") { die "LOGIC ERROR: non_normalized_node: got called for text node.\n"; } die "LOGIC ERROR: non_normalized_node: unhandled node type.\n"; } # ---------------------------------------------------------------------- # Format (pretty-print) the document tree # Does not modify the node list. # The class maintains two variables for storing output: # - out_doc stores content that has been seen and "flushed". # - pending stores an array of strings (content of text nodes and inline # element tags). 
#   These are held until they need to be flushed, at
#   which point they are concatenated and possibly wrapped/indented.
#   Flushing occurs when a break needs to be written, which happens
#   when something other than a text node or inline element is seen.

# If parent name and children are not given, format the entire document.
# Assume prevailing indent = 0 if not given.

# Arguments (all optional; a false value selects the default):
#   $par_name - name of the element whose children are formatted
#               (default: the pseudo-element "*DOCUMENT")
#   $children - reference to the list of child nodes
#               (default: $self->{tree})
#   $indent   - prevailing indent, in spaces (default: 0)
# All output goes through add_to_doc() and add_to_pending(); the
# recursive calls below do not use a return value.

sub tree_format
{
my $self = shift;
my $par_name = shift || "*DOCUMENT";    # format entire document if no arg
my $children = shift || $self->{tree};  # use entire tree if no arg
my $indent = shift || 0;

  # Formatting options for parent element
  my $par_opts = $self->get_opts ($par_name);

  # If parent is a block element:
  # - Remember its formatting options on the block stack so they can
  #   be used to control formatting of inline child elements.
  # - Set initial break type to entry-break.
  # - Shift prevailing indent right before generating child content.

  if ($par_opts->{format} eq "block")
  {
    $self->begin_block ($par_name, $par_opts);
    $self->set_block_break_type ("entry-break");
    $indent += $par_opts->{"subindent"};
  }

  # Variables for keeping track of whether the previous child
  # was a text node. Used for controlling break behavior in
  # non-normalized block elements: No line breaks are added around
  # text in such elements, nor is indenting added.

  my $prev_child_is_text = 0;
  my $cur_child_is_text = 0;

  foreach my $child (@{$children})
  {
    $prev_child_is_text = $cur_child_is_text;

    # Text nodes: just add text to pending output

    if ($child->{type} eq "text")
    {
      $cur_child_is_text = 1;
      $self->add_to_pending ($child->{content});
      next;
    }

    $cur_child_is_text = 0;

    # Element nodes: handle depending on format type

    if ($child->{type} eq "elt")
    {
      my $child_opts = $self->get_opts ($child->{name});

      # Verbatim elements:
      # - Print literally without change (use _stringify).
      # - Do not line-wrap or add any indent.

      if ($child_opts->{format} eq "verbatim")
      {
        $self->flush_pending ($indent);
        # no break right after text inside a non-normalized block
        $self->emit_break (0)
          unless $prev_child_is_text && !$self->block_normalize ();
        $self->set_block_break_type ("element-break");
        $self->add_to_doc ($child->{open_tag} .
                  $self->tree_stringify ($child->{content}) .
                  $child->{close_tag});
        next;
      }

      # Inline elements:
      # - Do not break or indent.
      # - Do not line-wrap content; just add content to pending output
      #   and let it be wrapped as part of parent's content.

      if ($child_opts->{format} eq "inline")
      {
        $self->add_to_pending ($child->{open_tag});
        $self->tree_format ($child->{name}, $child->{content}, $indent);
        $self->add_to_pending ($child->{close_tag});
        next;
      }

      # If we get here, node is a block element.
      # - Break and flush any pending output
      # - Break and indent (no indent if break count is zero)
      # - Process element itself:
      #   - Put out opening tag
      #   - Put out element content
      #   - Put out any indent needed before closing tag. None needed if:
      #     - Element's exit-break is 0 (closing tag is not on new line,
      #       so don't indent it)
      #     - There is no separate closing tag (it was in <abc/> format)
      #     - Element has no children (tags will be written as
      #       <abc></abc>, so don't indent closing tag)
      #     - Element has children, but the block is not normalized and
      #       the last child is a text node
      #   - Put out closing tag

      $self->flush_pending ($indent);
      $self->emit_break ($indent)
        unless $prev_child_is_text && !$self->block_normalize ();
      $self->set_block_break_type ("element-break");
      $self->add_to_doc ($child->{open_tag});
      # the recursive call adds the child's own subindent to $indent
      $self->tree_format ($child->{name}, $child->{content}, $indent);
      $self->add_to_doc (" " x $indent)
        unless $child_opts->{"exit-break"} <= 0
            || $child->{close_tag} eq ""
            || !@{$child->{content}}
            || (@{$child->{content}}
                && $child->{content}->[$#{$child->{content}}]->{type} eq "text"
                && $child_opts->{normalize} eq "no");
      $self->add_to_doc ($child->{close_tag});
      next;
    }

    # Comments, PIs, etc. (everything other than text and elements),
    # treat similarly to verbatim block:
    # - Flush any pending output
    # - Put out a break
    # - Add node content to collected output

    $self->flush_pending ($indent);
    $self->emit_break (0)
      unless $prev_child_is_text && !$self->block_normalize ();
    $self->set_block_break_type ("element-break");
    $self->add_to_doc ($child->{content});
  }

  # after the loop, "previous child" means the last child processed
  $prev_child_is_text = $cur_child_is_text;

  # Done processing current element's children now.

  # If current element is a block element:
  # - If there were any children, flush any pending output and put
  #   out the exit break.
  # - Pop the block stack

  if ($par_opts->{format} eq "block")
  {
    if (@{$children})
    {
      $self->flush_pending ($indent);
      $self->set_block_break_type ("exit-break");
      $self->emit_break (0)
        unless $prev_child_is_text && !$self->block_normalize ();
    }
    $self->end_block ();
  }
}


# Emit a break - the appropriate number of newlines according to the
# enclosing block's current break type.

# In addition, emit the number of spaces indicated by indent.  (indent
# > 0 when breaking just before emitting an element tag that should
# be indented within its parent element.)

# Exception: Emit no indent if break count is zero. That indicates
# any following output will be written on the same output line, not
# indented on a new line.

# Initially, when processing a node's child list, the break type is
# set to entry-break. Each subsequent break is an element-break.
# (After child list has been processed, an exit-break is produced as well.)

# Argument:
#   $indent - number of spaces to write after the newlines; ignored
#             when it is not positive or when no newline is written

sub emit_break
{
my ($self, $indent) = @_;

  # number of newlines to emit
  my $break_value = $self->block_break_value ();

  $self->add_to_doc ("\n" x $break_value);
  # add indent if there *was* a break
  $self->add_to_doc (" " x $indent) if $indent > 0 && $break_value > 0;
}


# Flush pending output to output document collected thus far:
# - Wrap pending contents as necessary, with indent before *each* line.
# - Add pending text to output document (thus "flushing" it)
# - Clear pending array.

# Argument:
#   $indent - prevailing indent (in spaces) for the flushed text

sub flush_pending
{
my ($self, $indent) = @_;

  my $queued = $self->{pending};

  # Do nothing if nothing to flush
  return unless @{$queued};

  # Normalized block:
  # - A break is emitted first.
  # - With line wrap enabled, the first line is indented only if a
  #   break precedes it (otherwise the text directly follows the
  #   preceding tag); every following line gets the prevailing indent.
  # - With line wrap disabled, the text is written unwrapped, and is
  #   indented only if a break precedes it.
  # Non-normalized block:
  # - Text nodes must not be modified, so the text is written as is,
  #   with no break, wrapping or indent.

  my $text;

  if ($self->block_normalize ())
  {
    $self->emit_break (0);
    my $max_len = $self->block_wrap_length ();
    my $newlines = $self->block_break_value ();
    my $lead = ($newlines > 0 ? $indent : 0);

    if ($max_len > 0)
    {
      # Wrap lines, then join by newlines (don't add one at end)
      $text = join ("\n",
                    $self->line_wrap ($queued, $lead, $indent, $max_len));
    }
    else
    {
      $text = (" " x $lead) . join ("", @{$queued});
    }
  }
  else
  {
    $text = join ("", @{$queued});
  }

  $self->add_to_doc ($text);
  $self->{pending} = [ ];

  # After flushing text, the next break is an element-break.
  $self->set_block_break_type ("element-break");
}


# Perform line-wrapping of string array to lines no longer than given
# length (including indent).
# Any word longer than line length appears by itself on line.
# Return array of lines (not newline-terminated).

# $strs - reference to array of text items to be joined and line-wrapped.
# Each item may be:
# - A tag (such as <emphasis>). This should be treated as
#   an atomic unit, which is important for preserving inline tags intact.
# - A possibly multi-word string (such as "This is a string"). In this
#   latter case, line-wrapping preserves internal whitespace in the
#   string, with the exception that if whitespace would be placed at
#   the end of a line, it is discarded.
# $first_indent - indent for first line
# $rest_indent - indent for any remaining lines
# $max_len - maximum length of output lines (including indent)

sub line_wrap
{
my ($self, $strs, $first_indent, $rest_indent, $max_len) = @_;

  # First, tokenize the strings

  my @words = ();
  foreach my $str (@{$strs})
  {
    if ($str =~ /^</)
    {
      # String is a tag; treat as atomic unit and don't split
      push (@words, $str);
    }
    else
    {
      # String of text; split into runs of non-whitespace and
      # runs of whitespace, keeping both kinds of token
      push (@words, ($str =~ /\S+|\s+/g));
    }
  }

  # Now merge tokens that are not separated by whitespace tokens. For
  # example, "<i>", "word", "</i>" gets merged to "<i>word</i>". But
  # "<i>", " ", "word", " ", "</i>" gets left as separate tokens.

  my @words2 = ();
  foreach my $word (@words)
  {
    # If there is a previous word that does not end with whitespace,
    # and the current word does not begin with whitespace, concatenate
    # current word to previous word. Otherwise append current word to
    # end of list of words.
    if (@words2 && $words2[$#words2] !~ /\s$/ && $word !~ /^\s/)
    {
      $words2[$#words2] .= $word;
    }
    else
    {
      push (@words2, $word);
    }
  }

  my @lines = ();
  my $line = "";
  my $llen = 0;
  # set the indent for the first line
  my $indent = $first_indent;
  # saved-up whitespace to put before next non-white word
  my $white = "";

  foreach my $word (@words2)  # ... while words remain to wrap
  {
    # If word is whitespace, save it. It gets added before next
    # word if no line-break occurs.
    if ($word =~ /^\s/)
    {
      $white .= $word;
      next;
    }
    my $wlen = length ($word);
    if ($llen == 0)
    {
      # New output line; it gets at least one word (discard any
      # saved whitespace)
      $line = " " x $indent . $word;
      $llen = $indent + $wlen;
      $indent = $rest_indent;
      $white = "";
      next;
    }
    if ($llen + length ($white) + $wlen > $max_len)
    {
      # Word (plus saved whitespace) won't fit on current line.
      # Begin new line (discard any saved whitespace).
      push (@lines, $line);
      $line = " " x $indent . $word;
      $llen = $indent + $wlen;
      $indent = $rest_indent;
      $white = "";
      next;
    }
    # add word to current line with saved whitespace between
    $line .= $white . $word;
    $llen += length ($white) + $wlen;
    $white = "";
  }

  # push remaining line, if any
  push (@lines, $line) if $line ne "";

  return @lines;
}

1;

# ----------------------------------------------------------------------

# Begin main program

package main;


my $usage = <<EOF;
Usage: $PROG_NAME [options] xml-file

Options:
--help, -h
    Print this message
--backup suffix -b suffix
    Back up the input document, adding suffix to the input
    filename to create the backup filename.
--canonized-output
    Proceed only as far as the document canonization stage,
    printing the result.
--check-parser
    Parse the document into tokens and verify that their
    concatenation is identical to the original input document.
    This option suppresses further document processing.
--config-file file_name, -f file_name
    Specify the configuration filename. If no file is named,
    xmlformat uses the file named by the environment variable
    XMLFORMAT_CONF, if it exists, or ./xmlformat.conf, if it
    exists. Otherwise, xmlformat uses built-in formatting
    options.
--in-place, -i
    Format the document in place, replacing the contents of
    the input file with the reformatted document. (It's a
    good idea to use --backup along with this option.)
--show-config
    Show configuration options after reading configuration
    file. This option suppresses document processing.
--show-unconfigured-elements
    Show elements that are used in the document but for
    which no options were specified in the configuration
    file. This option suppresses document output.
--verbose, -v
    Be verbose about processing stages.
--version, -V
    Show version information and exit.
EOF

# Variables for command line options; all are undefined unless the
# corresponding option is given.
my $help;
my $backup_suffix;
my $canonize_only;
my $check_parser;
my $conf_file;
my $in_place;
my $show_conf;
my $show_unconf_elts;
my $verbose;
my $show_version;

GetOptions (
  # =s means a string argument is required after the option
  "help|h"          => \$help,          # print help message
  "backup|b=s"      => \$backup_suffix, # make backup using suffix
  "canonized-output" => \$canonize_only, # print canonized document
  "check-parser"    => \$check_parser,  # verify parser integrity
  "config-file|f=s" => \$conf_file,     # config file
  "in-place|i"      => \$in_place,      # format in place
  "show-config"     => \$show_conf,     # show configuration file
  # need better name
  "show-unconfigured-elements"
                    => \$show_unconf_elts, # show unconfigured elements
  "verbose|v"       => \$verbose,       # be verbose
  "version|V"       => \$show_version,  # show version info
) or do { print "$usage\n"; exit (1); };

if (defined ($help))
{
  print "$usage\n";
  exit (0);
}

if (defined ($show_version))
{
  print "$PROG_NAME $PROG_VERSION ($PROG_LANG version)\n";
  exit (0);
}

# --in-place option requires a named file

warn "WARNING: --in-place/-i option ignored (requires named input files)\n"
  if defined ($in_place) && @ARGV == 0;

# --backup/-b is meaningless without --in-place

if (defined ($backup_suffix))
{
  if (!defined ($in_place))
  {
    die "--backup/-b option meaningless without --in-place/-i option\n";
  }
}

# Save input filenames
my @in_file = @ARGV;

my $xf = XMLFormat->new ();

# If a configuration file was named explicitly, use it. An error occurs
# if the file does not exist.

# If no configuration file was named, fall back to:
# - The file named by the environment variable XMLFORMAT_CONF, if it exists
# - ./xmlformat.conf, if it exists

# If no configuration file can be found at all, the built-in default options
# are used. (These are set up in new().)
my $env_conf_file = $ENV{XMLFORMAT_CONF};
my $def_conf_file = "./xmlformat.conf";

# If no config file was named, but XMLFORMAT_CONF is set, use its value
# as the config file name.
if (!defined ($conf_file))
{
  $conf_file = $env_conf_file if defined ($env_conf_file);
}
# If config file still isn't defined, use the default file if it exists.
if (!defined ($conf_file))
{
  if (-r $def_conf_file && ! -d $def_conf_file)
  {
    $conf_file = $def_conf_file;
  }
}
if (defined ($conf_file))
{
  warn "Reading configuration file...\n" if $verbose;
  die "Configuration file '$conf_file' is not readable.\n"
    if ! -r $conf_file;
  die "Configuration file '$conf_file' is a directory.\n"
    if -d $conf_file;
  $xf->read_config ($conf_file);
}

if ($show_conf)   # show configuration and exit
{
  $xf->display_config ();
  exit(0);
}

my ($in_doc, $out_doc);

# Process arguments.
# - If no files named, read string, write to stdout.
# - If files named, read and process each one. Write output to stdout
#   unless --in-place option was given.  Make backup of original file
#   if --backup option was given.

if (@ARGV == 0)
{
  warn "Reading document...\n" if $verbose;
  {
    local $/ = undef;
    $in_doc = <>;   # slurp input document as single string
  }

  $out_doc = $xf->process_doc ($in_doc,
              $verbose, $check_parser, $canonize_only, $show_unconf_elts);
  if (defined ($out_doc))
  {
    warn "Writing output document...\n" if $verbose;
    print $out_doc;
  }
}
else
{
  foreach my $file (@ARGV)
  {
    warn "Reading document $file...\n" if $verbose;
    # Three-argument open: the name is always taken as a plain file
    # name, never as a mode prefix or a command pipe.
    open (my $in_fh, "<", $file) or die "Cannot read $file: $!\n";
    {
      local $/ = undef;
      $in_doc = <$in_fh>;   # slurp input document as single string
    }
    close ($in_fh);
    $out_doc = $xf->process_doc ($in_doc,
                $verbose, $check_parser, $canonize_only, $show_unconf_elts);
    next unless defined ($out_doc);
    if (defined ($in_place))
    {
      if (defined ($backup_suffix))
      {
        warn "Making backup of $file to $file$backup_suffix...\n" if $verbose;
        rename ($file, $file . $backup_suffix)
          or die "Could not rename $file to $file$backup_suffix: $!\n";
      }
      warn "Writing output document to $file...\n" if $verbose;
      open (my $out_fh, ">", $file) or die "Cannot write to $file: $!\n";
      # The original file is being replaced, so a failed write must not
      # go unnoticed; buffered write errors may only show up at close.
      print {$out_fh} $out_doc or die "Cannot write to $file: $!\n";
      close ($out_fh) or die "Cannot write to $file: $!\n";
    }
    else
    {
      warn "Writing output document...\n" if $verbose;
      print $out_doc;
    }
  }
}

warn "Done!\n" if $verbose;

exit (0);
xmlformat-1.04/LICENSE0000644000175000017500000000741310320326274013302 0ustar pauldevelxmlformat is distributed under a BSD-style license. This license applies to the entire xmlformat distribution, with the exception of the REX parser (described below). Copyright (c) 2004, 2005, Kitebird, LLC. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of Kitebird nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ---------------------------------------------------------------------- The REX parser xmlformat contains code based on the REX parser, which is Copyright (c) 1998, Robert D. Cameron. REX is described in this document: http://www.cs.sfu.ca/~cameron/REX.html The document contains a Perl implementation of REX: --- begin REX code --- # REX/Perl 1.0 # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions", # Technical Report TR 1998-17, School of Computing Science, Simon Fraser # University, November, 1998. # Copyright (c) 1998, Robert D. Cameron. # The following code may be freely used and distributed provided that # this copyright and citation notice remains intact and that modifications # or additions are clearly identified. 
$TextSE = "[^<]+"; $UntilHyphen = "[^-]*-"; $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-"; $CommentCE = "$Until2Hyphens>?"; $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+"; $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>"; $S = "[ \\n\\t\\r]+"; $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]"; $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"; $Name = "(?:$NameStrt)(?:$NameChar)*"; $QuoteSE = "\"[^\"]*\"|'[^']*'"; $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*"; $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>"; $S1 = "[\\n\\r\\t ]"; $UntilQMs = "[^?]*\\?+"; $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>"; $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S"; $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?"; $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"; $PI_CE = "$Name(?:$PI_Tail)?"; $EndTagCE = "$Name(?:$S)?>?"; $AttValSE = "\"[^<\"]*\"|'[^<']*'"; $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?"; $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)"; $XML_SPE = "$TextSE|$MarkupSPE"; sub ShallowParse { my($XML_document) = @_; return $XML_document =~ /$XML_SPE/g; } --- end REX code --- The Perl and Ruby implementations of xmlformat contain parsers that are based on the preceding code and are essentially the same, with the exception of changes to variable and function names. xmlformat-1.04/docs/0000755000175000017500000000000010470216511013216 5ustar pauldevelxmlformat-1.04/docs/xmlformat.xml0000644000175000017500000014164310073104726015765 0ustar pauldevel
The <command>xmlformat</command> XML Document Formatter DuBois Paul paul@kitebird.com Introduction xmlformat is a formatter (or "pretty-printer") for XML documents. It is useful when you want XML documents to have a standard format. This includes situations such as the following: XML documents that are maintained in a version control system, where people who use different XML editors work on the documents. XML editors typically impose their own style conventions on files. The application of different style conventions to successive document revisions can result in large version diffs where most of the bulk is related only to changes in format rather than content. This can be a problem if, for example, the version control system automatically sends the diffs to a committer's mailing list that people read. If documents are rewritten to a common format before they are committed, these diffs become smaller. They better reflect content changes and are easier for people to scan and understand. Similarly, if you send an XML document to someone who edits it and sends it back, it's easier to see what was changed by putting the before and after versions in a common format. This is a simple alternative to using a more sophisticated semantic XML diff utility. Of course, these benefits can be obtained by using any XML pretty printer. So why does xmlformat exist? Because most XML formatters reformat documents using a set of built-in rules that determine the output style. Some allow you to select from one of several predefined output styles. That's fine if the style you want is the style produced by one of these tools. If not, you're stuck. That's where xmlformat comes in, because it's configurable. You can specify formatting options in a file and xmlformat will apply them to your documents. If you have different applications for which you want different styles, you can select the style by using the appropriate configuration file. 
xmlformat has a default overall output style, but you can redefine the default style, and you can override the default on a per-element basis. For example, you can indicate whether the element should be treated as a block element, an inline element, or a verbatim element. For any block element, you can control several formatting properties: Spacing (line breaks) between nested sub-elements. You can also control spacing between the element's opening and closing tags and its content. (The general assumption is that if a block element has non-inline sub-elements, you'll want to space those sub-elements evenly within the enclosing block, though possibly with different spacing between the opening tag and the first child, or between the last child and the closing tag.) Indentation of nested sub-elements. Whitespace normalization and line-wrapping of text within the element. xmlformat is free software. You can redistribute it or modify it under the terms specified in the LICENSE file. For installation instructions, see the INSTALL file. How to Use <command>xmlformat</command> To format the file mydoc.xml using the default formatting options, use the following command. (% represents your shell prompt here; do not type it as part of the command.) % xmlformat mydoc.xml (xmlformat might be installed as xmlformat.pl or xmlformat.rb, depending on implementation language. In that case, you should invoke it under the appropriate name.) The built-in formatting options cause each element to begin a new line, with sub-element indentation of one space, and no text normalization. Suppose mydoc.xml looks like this: A B C D ]]> xmlformat will produce this result by default: A B C D ]]> The default style is perhaps suitable for data-oriented XML documents that contain no mixed-content elements. For more control over output, specify a configuration file. 
If the formatting options are stored in a file named xf-opts.conf, you can apply them to the document by specifying a option: % xmlformat --config-file=xf-opts.conf mydoc.xml If you do not specify a configuration file using a (or ) option, xmlformat uses the following rules to determine what formatting options to use: If the environment variable XMLFORMAT_CONF is defined, xmlformat uses its value as the name of the configuration file. Otherwise, if a file named xmlformat.conf exists in the current directory, xmlformat uses it as the configuration file. Otherwise, xmlformat uses a set of built-in formatting options. Configuration options and configuration file syntax are described in . To see the command-line options that xmlformat supports, invoke it with the or option: % xmlformat --help Usage: xmlformat [options] xml-file Options: --help, -h Print this message --backup suffix -b suffix Back up the input document, adding suffix to the input filename to create the backup filename. --canonized-output Proceed only as far as the document canonization stage, printing the result. --check-parser Parse the document into tokens and verify that their concatenation is identical to the original input document. This option suppresses further document processing. --config-file file_name, -f file_name Specify the configuration filename. If no file is named, xmlformat uses the file named by the environment variable XMLFORMAT_CONF, if it exists, or ./xmlformat.conf, if it exists. Otherwise, xmlformat uses built-in formatting options. --in-place, -i Format the document in place, replacing the contents of the input file with the reformatted document. (It's a good idea to use --backup along with this option.) --show-config Show configuration options after reading configuration file. This option suppresses document processing. --show-unconfigured-elements Show elements that are used in the document but for which no options were specified in the configuration file. 
This option suppresses document output. --verbose, -v Be verbose about processing stages. --version, -V Show version information and exit. Do not use the or reformatting option until you are certain your configuration options are set up the way you want. Unpleasant consequences may occur otherwise. For example, if you have verbatim elements that you have forgotten to declare as verbatim, they will be reformatted and you will have to restore them to their original state later. Use of the or option can help you recover from this kind of problem. xmlformat writes the result to the standard output by default. To perform an "in-place" conversion that writes the reformatted document back to the original file, use the or option. This is useful when you want to format multiple documents with a single command; streaming multiple output documents to the standard output concatenates them, which is likely not what you want. Because in-place formatting replaces the original document, it's prudent to make a backup of the original using the (or ) option. This option takes a suffix value to be added to each input filename to produce the backup filename. To inspect the default (built-in) configuration options, use this command: % xmlformat --config-file=/dev/null --show-config The Document Processing Model XML documents consist primarily of elements arranged in nested fashion. Elements may also contain text. xmlformat acts to rearrange elements by removing or adding line breaks and indentation, and to reformat text. Document Components XML elements within input documents may be of three types: block elements This is the default element type. The DocBook <chapter>, <sect1>, and <para> elements are examples of block elements. Typically a block element will begin a new line. (That is the default formatting behavior, although xmlformat allows you to override it.) Spacing between sub-elements can be controlled, and sub-elements can be indented. 
Whitespace in block element text may be normalized. If normalization is in effect, line-wrapping may be applied as well. Normalization and line-wrapping may be appropriate for a block element with mixed content (such as <para>). inline elements These are elements that are contained within a block or within other inlines. The DocBook <emphasis> and <literal> elements are examples of inline elements. Normalization and line-wrapping of inline element tags and content is handled the same way as for the enclosing block element. In essence, an inline element is treated as part of its parent's "text" content. verbatim elements No formatting is done for verbatim elements. The DocBook <programlisting> and <screen> elements are examples of verbatim elements. Verbatim element content is written out exactly as it appears in the input document. This also applies to child elements. Any formatting that would otherwise be performed on them is suppressed when they occur within a verbatim element. xmlformat never reformats element tags. In particular, it does not change whitespace between attributes or within attribute values. This is true even for inline tags within line-wrapped block elements. xmlformat handles empty elements as follows: If an element appears as <abc/> in the input document, it is written as <abc/>. If an element appears as <abc></abc>, it is written as <abc></abc>. No line break is placed between the two tags. XML documents may contain other constructs besides elements and text: Processing instructions Comments DOCTYPE declaration CDATA sections xmlformat handles these constructs much the same way as verbatim elements. It does not reformat them. Line Breaks and Indentation Line breaks within block elements are controlled by the entry-break, element-break, and exit-break formatting options. A break value of n means n newlines. (This produces n-1 blank lines.) Example.
Suppose input text looks like this: ]]> Here, an <elt> element contains three nested <subelt> elements, which for simplicity are empty. This input can be formatted several ways, depending on the configuration options. The following examples show how to do this. To produce output with all sub-elements on the same line as the <elt> element, add a section to the configuration file that defines <elt> as a block element and sets all its break values to 0: elt format block entry-break 0 exit-break 0 element-break 0 Result: ]]> To leave the sub-elements together on the same line, but on a separate line between the <elt> tags, leave the element-break value set to 0, but set the entry-break and exit-break values to 1. To suppress sub-element indentation, set subindent to 0. elt format block entry-break 1 exit-break 1 element-break 0 subindent 0 Result: ]]> To indent the sub-elements, make the subindent value greater than zero. elt format block entry-break 1 exit-break 1 element-break 0 subindent 2 Result: ]]> To cause each sub-element to begin a new line, change the element-break to 1. elt format block entry-break 1 exit-break 1 element-break 1 subindent 2 Result: ]]> To add a blank line between sub-elements, increase the element-break from 1 to 2. elt format block entry-break 1 exit-break 1 element-break 2 subindent 2 Result: ]]> To also produce a blank line after the <elt> opening tag and before the closing tag, increase the entry-break and exit-break values from 1 to 2. elt format block entry-break 2 exit-break 2 element-break 2 subindent 2 Result: ]]> To have blank lines only after the opening tag and before the closing tag, but not have blank lines between the sub-elements, decrease the element-break from 2 to 1. elt format block entry-break 2 exit-break 2 element-break 1 subindent 2 Result: ]]> Breaks within block elements are suppressed in certain cases: Breaks apply to nested block or verbatim elements, but not to inline elements, which are, after all, inline.
(If you really want an inline to begin a new line, define it as a block element.) Breaks are not applied to text within non-normalized blocks. Non-normalized text should not be changed, and adding line breaks changes the text. For example if <x> elements are normalized, you might elect to format this: This is a sentence.]]> Like this: This is a sentence. ]]> Here, breaks are added before and after the text to place it on a separate line. But if <x> is not normalized, the text content will be written as it appears in the input, to avoid changing it. Text Handling The XML standard considers whitespace nodes insignificant in elements that contain only other elements. In other words, for elements that have element content, sub-elements may optionally be separated by whitespace, but that whitespace is insignificant and may be ignored. An element that has mixed content may have text (#PCDATA) content, optionally interspersed with sub-elements. In this case, whitespace-only nodes may be significant. xmlformat treats only literal whitespace as whitespace. This includes the space, tab, newline (linefeed), and carriage return characters. xmlformat does not resolve entity references, so entities such as &#32; or &#x20; that represent whitespace characters are seen as non-whitespace text, not as whitespace. xmlformat doesn't know whether a block element has element content or mixed content. It handles text content as follows: If an element has element content, it will have only sub-elements and possibly all-whitespace text nodes. In this case, it is assumed that you'll want to control line-break behavior between sub-elements, so that the (all-whitespace) text nodes can be discarded and replaced with the proper number of newlines, and possibly indentation. If an element has mixed content, you may want to leave text nodes alone, or you may want to normalize (and possibly line-wrap) them. 
In xmlformat, normalization converts runs of whitespace characters to single spaces, and discards leading and trailing whitespace. To achieve this kind of formatting, xmlformat recognizes normalize and wrap-length configuration options for block elements. They affect text formatting as follows: You can enable or disable text normalization by setting the normalize option to yes or no. Within a normalized block, runs of whitespace are converted to single spaces. Leading and trailing whitespace is discarded. Line-wrapping and indenting may be applied. In a non-normalized block, text nodes are not changed as long as they contain any non-whitespace characters. No line-wrapping or indenting is applied. However, if a text node contains only whitespace (for example, a space or newline between sub-elements), it is assumed to be insignificant and is discarded. It may be replaced by line breaks and indentation when output formatting occurs. Consider the following input: A B ]]> Suppose that the <row> and <cell> elements both are to be treated as non-normalized. The contents of the <cell> elements are text nodes that contain non-whitespace characters, so they would not be reformatted. On the other hand, the spaces between tags are all-whitespace text nodes and are not significant. This means that you could reformat the input like this: A B ]]> Or like this: A B ]]> Or like this: A B ]]> In each of those cases, the whitespace between tags was subject to reformatting, but the text content of the <cell> elements was not. The input would not be formatted like this: AB]]> Or like this: A B ]]> In both of those cases, the text content of the <cell> elements has been modified, which is not allowed within non-normalized blocks. You would have to declare <cell> to have a normalize value of yes to achieve either of those output styles. Now consider the following input: This is a sentence. ]]> Suppose that <para> is to be treated as a normalized element.
It could be reformatted like this: This is a sentence.]]> Or like this: This is a sentence. ]]> Or like this: This is a sentence. ]]> Or even (with line-wrapping) like this: This is a sentence. ]]> The preceding description of normalization is a bit oversimplified. Normalization is complicated by the possibility that non-normalized elements may occur as sub-elements of a normalized block. In the following example, a verbatim block occurs in the middle of a normalized block: This is a paragraph that contains a code listing in the middle. ]]> In general, when this occurs, any whitespace in text nodes adjacent to non-reformatted nodes is discarded. There is no "preserve all whitespace as is" mode for block elements. Even if normalization is disabled for a block, any all-whitespace text nodes are considered dispensable. If you really want all text within an element to be preserved intact, you should declare it as a verbatim element. (Within verbatim elements, nothing is ever reformatted, so whitespace is significant as a result.) If you want to see how xmlformat handles whitespace nodes and text normalization, invoke it with the --canonized-output option. This option causes xmlformat to display the document after it has been canonized by removing whitespace nodes and performing text normalization, but before it has been reformatted in final form. By examining the canonized document, you can see what effect your configuration options have on treatment of the document before line-wrapping and indentation is performed and line breaks are added. Using Configuration Files An xmlformat configuration file specifies formatting options to be associated with particular elements in XML documents. For example, you can format <itemizedlist> elements differently than <orderedlist> elements. (However, you cannot format <listitem> elements differently depending on the type of list in which they occur.) You can also specify options for a "pseudo-element" named *DEFAULT.
These options are applied to any element for which the options are not specified explicitly. The following sections describe the general syntax of configuration files, then discuss the allowable formatting options that can be assigned to elements. Configuration File Syntax A configuration file consists of sections. Each section begins with a line that names one or more elements. (Element names do not include the "<" and ">" angle brackets.) The element line is followed by option lines that each name a formatting option and its value. Each option is applied to every element named on its preceding element line. Element lines and option lines are distinguished based on leading whitespace (space or tab characters): Element lines have no leading whitespace. Option lines begin with at least one whitespace character. On element lines that name multiple elements, the names should be separated by spaces or commas. These are legal element lines: para title para,title para, title On option lines, the option name and value should be separated by whitespace and/or an equal sign. These are legal option lines: normalize yes normalize=yes normalize = yes Blank lines are ignored. Lines that begin "#" as the first non-white character are taken as comments and ignored. Comments beginning with "#" may also follow the last element name on an element line or the option value on an option line. Example configuration file: para format block entry-break 1 exit-break 1 normalize yes wrap-length 72 literal replaceable userinput command option emphasis format inline programlisting format verbatim It is not necessary to specify all of an element's options at the same time. 
Thus, this configuration file: para, title format block normalize yes title wrap-length 50 para wrap-length 72 Is equivalent to this configuration file: para format block normalize yes wrap-length 72 title format block normalize yes wrap-length 50 If an option is specified multiple times for an element, the last value is used. For the following configuration file, para ends up with a wrap-length value of 68: para format block wrap-length 60 wrap-length 72 para wrap-length 68 To continue an element line onto the next line, end it with a backslash character. xmlformat will interpret the next line as containing more element names for the current section: chapter appendix article \ section simplesection \ sect1 sect2 sect3 \ sect4 sect5 format block entry-break 1 element-break 2 exit-break 1 normalize no subindent 0 Continuation can be useful when you want to apply a set of formatting options to a large number of elements. Continuation lines are allowed to begin with whitespace (though it's possible they may appear to the casual observer to be option lines if they do). Continuation is not allowed for option lines. A configuration file may contain options for two special "pseudo-element" names: *DOCUMENT and *DEFAULT. (The names begin with a "*" character so as not to conflict with valid element names.) *DEFAULT options apply to any element that appears in the input document but that was not configured explicitly in the configuration file. *DOCUMENT options are used primarily to control line breaking between top-level nodes of the document, such as the XML declaration, the DOCTYPE declaration, the root element, and any comments or processing instructions that occur outside the root element. It's common to supply *DEFAULT options in a configuration file to override the built-in values. However, it's normally best to leave the *DOCUMENT options alone, except possibly to change the element-break value. 
Before reading the input document, xmlformat sets up formatting options as follows: It initializes the built-in *DOCUMENT and *DEFAULT options. It reads the contents of the configuration file, assigning formatting options to elements as listed in the file. Note that although *DOCUMENT and *DEFAULT have built-in default values, the defaults may be overridden in the configuration file. After reading the configuration file, any missing formatting options for each element are filled in using the options from the *DEFAULT pseudo-element. For example, if para is defined as a block element but no subindent value is defined, para "inherits" the subindent value from the *DEFAULT settings. Missing options are filled in from the *DEFAULT options only after reading the entire configuration file. For the settings below, *DEFAULT has a subindent value of 2 (not 0) after the file has been read. Thus, para also is assigned a subindent value of 2. *DEFAULT subindent 0 para format block normalize yes *DEFAULT subindent 2 Formatting Options The allowable formatting options are as follows: format {block | inline | verbatim} entry-break n element-break n exit-break n subindent n normalize {no | yes} wrap-length n A value list shown as { value1 | value2 | ... } indicates that the option must take one of the values in the list. A value shown as n indicates that the option must have a numeric value. Details for each of the formatting options follow. format {block | inline | verbatim} This option is the most important, because it determines the general way in which the element is formatted, and it determines whether the other formatting options are used or ignored: For block elements, all other formatting options are significant. For inline elements, all other formatting options are ignored. Inline elements are normalized, wrapped, and indented according to the formatting options of the enclosing block element. For verbatim elements, all other formatting options are ignored. 
The element content is written out verbatim (literally), without change, even if it contains other sub-elements. This means no normalization of the contents, no indenting, and no line-wrapping. Nor are any breaks added within the element. A configuration file may specify any option for elements of any type, but xmlformat will ignore inapplicable options. One reason for this is to allow you to experiment with changing an element's format type without having to disable other options. If you use the --show-config command-line option to see the configuration that xmlformat will use for processing a document, it displays only the applicable options for each element. entry-break n element-break n exit-break n These options indicate the number of newlines (line breaks) to write after the element opening tag, between child sub-elements, and before the element closing tag. They apply only to block elements. A value of 0 means "no break". A value of 1 means one newline, which causes the next thing to appear on the next line with no intervening blank line. A value n greater than 1 produces n-1 intervening blank lines. Some examples: An entry-break value of 0 means the next token will appear on the same line immediately after the opening tag. An exit-break value of 0 means the closing tag will appear on the same line immediately after the preceding token. subindent n This option indicates the number of spaces by which to indent child sub-elements, relative to the indent of the enclosing parent. It applies only to block elements. The value may be 0 to suppress indenting, or a number n greater than 0 to produce indenting. This option does not affect the indenting of the element itself. That is determined by the subindent value of the element's own parent. Note: subindent does not apply to text nodes in non-normalized blocks, which are written as is without reformatting. 
subindent also does not apply to verbatim elements or to the following non-element constructs, all of which are written with no indent: Processing instructions Comments DOCTYPE declarations CDATA sections normalize {no | yes} This option indicates whether or not to perform whitespace normalization in text. This option is used for block elements, but it also affects inline elements because their content is normalized the same way as their enclosing block element. If the value is no, whitespace-only text nodes are not considered significant and are discarded, possibly to be replaced with line breaks and indentation. If the value is yes, normalization causes removal of leading and trailing whitespace within the element, and conversion of runs of whitespace characters (including line-ending characters) to single spaces. Text normalization is discussed in more detail in . wrap-length n Line-wrapping length. This option is used only for block elements and line-wrapping occurs only if normalization is enabled. The option affects inline elements because they are line-wrapped the same way as their enclosing block element. Setting the wrap-length option to 0 disables wrapping. Setting it to a value n greater than 0 enables wrapping to lines at most n characters long. (Exception: If a word longer than n characters occurs in text to be wrapped, it is placed on a line by itself. A word will never be broken into pieces.) The line length is adjusted by the current indent when wrapping is performed to keep the right margin of wrapped text constant. For example if the wrap-length value is 60 and the current indent is 10, lines are wrapped to a maximum of 50 characters. Any prevailing indent is added to the beginning of each line, unless the text will be written immediately following a tag on the same line. 
This can occur if the text occurs after the opening tag of the block and the entry-break is 0, or the text occurs after the closing tag of a sub-element and the element-break is 0. How <command>xmlformat</command> Works Briefly, xmlformat processes an XML document using the following steps: Read the document into memory as a single string. Parse the document into a list of tokens. Convert the list of tokens into nodes in a tree structure, tagging each node according to the token type. Discard extraneous whitespace nodes and normalize text nodes. (The meaning of "normalize" is described in .) Process the tree to produce a single string representing the reformatted document. Print the string. xmlformat is not an XSLT processor. In essence, all it does is add or delete whitespace to control line breaking, indentation, and text normalization. xmlformat uses the REX parser developed by Robert D. Cameron (see ). REX performs a parse based on a regular expression that operates on a string representing the XML document. The parse produces a list of tokens. REX does a pure lexical scan that performs no alteration of the text except to tokenize it. In particular: REX doesn't normalize any whitespace, including line endings. This is true for text elements, and for whitespace within tags (including between attributes and within attribute values). Any normalization or reformatting to be done is performed in later stages of xmlformat operation. REX leaves entity references untouched. It doesn't try to resolve them. This means it doesn't complain about undefined entities, which to my mind is an advantage. (A pretty printer shouldn't have to read a DTD or a schema.) If the XML is malformed, errors can be detected easily: REX produces error tokens that begin with "<" but do not end with ">". xmlformat expects its input documents to be legal XML. 
It does not consider fixing broken documents to be its job, so if xmlformat finds error tokens in the result produced by REX, it lists them and exits. Assuming the document contains no error tokens, xmlformat uses the token list to construct a tree structure. It categorizes each token based on its initial characters: Initial Characters Token Type <!-- comment <? processing instruction (this includes the <?xml?> instruction) <!DOCTYPE DOCTYPE declaration <![ CDATA section </ element closing tag < element opening tag Any token not beginning with one of the sequences shown in the preceding table is a text token. The token categorization determines the node types of nodes in the document tree. Each node has a label that identifies the node type: Label Node Type comment comment node pi processing instruction node DOCTYPE DOCTYPE declaration node CDATA CDATA section node elt element node text text node If the document is not well-formed, tree construction will fail. In this case, xmlformat displays one or more error messages and exits. For example, this document is not well-formed: This is a malformed document.

]]>
Running that document through xmlformat produces the following result: MISMATCH open (strong), close (p); malformed document? Non-empty tag stack; malformed document? Non-empty children stack; malformed document? Cannot continue. That is admittedly cryptic, but remember that it's not xmlformat's job to repair (or even diagnose) bad XML. If a document is not well-formed, you may find Tidy a useful tool for fixing it up. Tokens of each type except element tokens correspond to single distinct nodes in the document. Elements are more complex. They may consist of multiple tokens, and may contain children: An element with a combined opening/closing tag (such as <abc/>) consists of a single token. An element with separate opening and closing tags (such as <abc>...</abc>) consists of at least the two tags, plus any children that appear between the tags. Element opening tag tokens include any attributes that are present, because xmlformat performs no tag reformatting. Tags are preserved intact in the output, including any whitespace between attributes or within attribute values. In addition to the type value that labels a node as a given node type, each node has content: For all node types except elements, the content is the text of the token from which the node was created. For element nodes, the content is the list of child nodes that appear within the element. An empty element has an empty child node list. In addition to the content, element nodes contain other information: The literal text of the opening and closing tags. If an element is written in single-tag form (<abc/>), the closing tag is empty. The element name that is present in the opening tag. (This is maintained separately from the opening tag so that a pattern match need not be done on the opening tag each time it's necessary to determine the element name.) 
After constructing the node tree, xmlformat performs two operations on it: The tree is "canonized" to normalize text nodes and to discard extraneous whitespace nodes. A whitespace node is a text node consisting of nothing but whitespace characters (space, tab, carriage return, linefeed (newline)). Decisions about which whitespace nodes are extraneous are based on the configuration options supplied to xmlformat. The canonized tree is used to produce formatted output. xmlformat performs line-wrapping of element content, and adds indentation and line breaks. Decisions about how to apply these operations are based on the configuration options. Here's an example input document, representing a single-row table: 12 3 ]]> After reading this in and constructing the tree, the canonized output looks like this: 123]]> The output after applying the default formatting options looks like this: 1 2 3 ]]>
Prerequisites xmlformat has very few prerequisites. It requires no extra modules other than an option-processing module. In particular: xmlformat requires no XML processing modules. XML parsing is done with a single (rather complex) regular expression developed by Robert D. Cameron. A paper that discusses development of this parsing expression is available; see . xmlformat requires no text-processing modules such as Text::Wrap. I tested Text::Wrap to see if it was suitable for xmlformat. It was not, for the following reasons: If Text::Wrap encounters an individual word that is longer than the line length, older versions of Text::Wrap invoke die(). In newer versions, you can have long words left intact. Text::Wrap converts runs of spaces in the leading indent to tabs. (Though this can be suppressed.) Text::Wrap reformats inline tags (and may change attribute values). xmlformat preserves tags intact. In addition, the simple algorithm used by xmlformat appears to be about twice as fast as Text::Wrap (at least on Mac OS X). References Original REX paper by Robert D. Cameron: http://www.cs.sfu.ca/~cameron/REX.html ftp://fas.sfu.ca/pub/cs/TR/1998/CMPT1998-17.html This paper contains REX implementations in Perl, JavaScript, and LEX. The Perl implementation was used as the basis of XML parsing in xmlformat.pl and xmlformat.rb. A Python implementation of REX: http://mail.python.org/pipermail/xml-sig/1999-November/001628.html A PHP implementation of REX: http://traumwind.de/computer/php/REX/
xmlformat-1.04/docs/tutorial.xml0000644000175000017500000004573010073107122015610 0ustar pauldevel
<command>xmlformat</command> Tutorial DuBois Paul paul@kitebird.com Introduction This document is a user guide that provides a tutorial introduction to the xmlformat program. Another document, The xmlformat Document Formatter, describes the capabilities of xmlformat in more detail. Formatting a Document Suppose you have an XML document named doc1.xml that looks like this: I bought a new coffee cup! 200421 ]]> Suppose further that you want it to look like this: I bought a new coffee cup! 2004 2 1 ]]> By happy coincidence, that happens to be exactly the default output style produced by xmlformat. To reformat your document, all you have to do is run xmlformat with the document filename as the argument, saving the output in another file: % xmlformat doc1.xml > output Note: % represents your shell prompt; do not type it as part of the command. If you are confident that the output style produced by xmlformat will be as you desire, you can be reckless and perform an in-place conversion: % xmlformat -i doc1.xml In this case, xmlformat reads the document from the input file, reformats it, and writes it back out to the same file, replacing the file's original contents. If you are not quite so reckless, use in conjunction with a option to make a backup file that contains the original document. takes an argument that specifies the suffix to add to the original filename to create the backup filename. For example, to back up the original doc1.xml file in a file named doc1.xml.bak, use this command: % xmlformat -i -b .bak doc1.xml Using a Configuration File In the preceding example, the desired output style for doc1.xml was the same as what xmlformat produces by default. But what if the default style is not what you want? In that case, you must tell xmlformat how to handle your document. This is at once both the weakness and strength of xmlformat. The weakness is that it is extra work to instruct xmlformat how you want it to format a document. 
The strength is that it's possible to do so. Other XML formatters do not require any extra work, but that's because they are not configurable. Suppose doc2.xml looks like this: Compiling and Running a Program To compile and run the program, use the following commands, where source-file is the name of the source file: cc source-file ./a.out ]]> That's ugly, and you want it to rewrite it like this: Compiling and Running a Program To compile and run the program, use the following commands, where source-file is the name of the source file: cc source-file ./a.out ]]> The key characteristics of this rewrite are as follows: Child elements of the <example> element are separated by blank lines, but not indented within it. The text inside the <para> element is reformatted, adjusted to 60 characters per line and indented. The contents of the <screen> element are left alone. Unfortunately, if you run doc2.xml through xmlformat, it comes out like this: Compiling and Running a Program To compile and run the program, use the following commands, where source-file is the name of the source file: cc source-file ./a.out ]]> This output is unsuitable. Among the offenses committed by xmlformat, two are most notable: The text of the <para> element has been left alone, not reformatted. The <screen> element content has been reformatted, not left intact. In these respects, it appears that xmlformat has done exactly the opposite of what was wanted! Furthermore, had you used the option to reformat the file in place without using to make a backup, at this point you would have a file containing a <screen> element that you'd have to fix up by hand to restore it to its original condition. What a worthless, worthless program! The rewriting of the <screen> element points to an important lesson: Before trusting xmlformat with your documents, it's best to run some tests and tune your configuration as necessary to make sure it will produce the results you want. 
Otherwise, you may produce changes that affect the integrity of your documents. This is particularly true when they contain elements such as <screen> or <programlisting> that should be copied verbatim, without change. Configuring xmlformat amounts to writing a configuration file that instructs it what to do. For doc2.xml, that means telling xmlformat to leave the <screen> element alone, to normalize the text of the paragraph to fill lines and wrap them to a given length, and to put blank lines around sub-elements of the <example> element. Let's begin by creating a very basic configuration file. What should we call it? xmlformat can read configuration settings from a file named on the command line with a -f or --config-file option. This means you can name the file whatever you want. However, if you put the settings in a file named xmlformat.conf in the current directory, xmlformat will read the file automatically. That's an easier approach, because you won't need to use a command-line option to specify the configuration file. So create a file named xmlformat.conf that contains the following two lines: screen format = verbatim These lines specify that <screen> elements should be formatted as verbatim elements. That is, xmlformat should reproduce their content in the output exactly as it appears in the input, without modification. The first line must begin in column 1 (no preceding spaces or tabs). The second line must begin with at least one space or tab. Presence or absence of whitespace is how xmlformat distinguishes the names of elements to be formatted from the instructions that indicate how to format them. After creating xmlformat.conf, run xmlformat again to process doc2.xml. 
It reads the newly created configuration file and produces this result: Compiling and Running a Program To compile and run the program, use the following commands, where source-file is the name of the source file: cc source-file ./a.out ]]> That's a little better: xmlformat has not destroyed the <screen> element by reformatting it. But problems remain: The paragraph content has not been reformatted, and there are no blank lines between sub-elements. Let's take care of the paragraph next. To set up its formatting, add a section to xmlformat.conf for <para> elements: para format = block normalize = yes wrap-length = 60 subindent = 1 screen format = verbatim The order of sections in the configuration file doesn't matter. Put them in the order that makes most sense to you. The order of option lines under the initial section line doesn't matter, either. The first two options in the para section specify that the <para> element is a block element, and that text within it should be normalized. Turning on the normalize option tells xmlformat that it's okay to reformat the text within the element. This means that runs of whitespace within the text are collapsed to single spaces, and that whitespace at the beginning and end of the text can be adjusted (typically to put the text on different lines than the element's opening and closing tags). Enabling normalization also allows you to perform text line-wrapping and indenting. The wrap-length option specifies the maximum number of characters per line, and subindent specifies the indenting of text and sub-elements, relative to the element's own tags. Note that when xmlformat performs line-wrapping, it includes the currently prevailing indent as part of the line length. (For example, if the prevailing indent is 20 spaces and wrap-length value is 60, lines will contain at most 40 characters following the indentation.) 
After adding the para section to xmlformat.conf, xmlformat produces this result: Compiling and Running a Program To compile and run the program, use the following commands, where source-file is the name of the source file: cc source-file ./a.out ]]> The paragraph now is wrapped and indented. However, it doesn't seem to be wrapped quite correctly, because the <replaceable> element actually would fit on the previous line. This happens because no formatting options were specified for <replaceable> in the configuration file. As a result, it is treated as having the default element type of block, using the default behavior that block elements are written out beginning on a new line. To fix this problem, we should configure <replaceable> as an inline element. That will cause it to be formatted inline with the other text (and thus line-wrapped along with it). Modify the configuration file to include a replaceable section: para format = block normalize = yes wrap-length = 60 subindent = 1 replaceable format = inline screen format = verbatim The resulting output after making this change is as follows: Compiling and Running a Program To compile and run the program, use the following commands, where source-file is the name of the source file: cc source-file ./a.out ]]> We're getting close now. All we need to do is space out the <example> child elements with a blank line in between. Sub-element spacing is controlled by three formatting properties: entry-break controls spacing after the opening tag of an element (that is, the spacing upon entry into the element's content). element-break controls the spacing between sub-elements. exit-break controls spacing before the closing tag of an element (that is, the spacing upon exit from the element's content). The value for each of these formatting options should be an integer indicating the number of newlines to write. A value of 1 causes one newline, which acts simply to break to the next line. 
To get a blank line, the break value needs to be 2. Modify the configuration file by adding a section for <example> elements: example format = block entry-break = 2 element-break = 2 exit-break = 2 subindent = 0 para format = block normalize = yes wrap-length = 60 subindent = 1 replaceable format = inline screen format = verbatim The resulting output is: Compiling and Running a Program To compile and run the program, use the following commands, where source-file is the name of the source file: cc source-file ./a.out ]]> We're done! You may be thinking, "Wow, that's a lot of messing around just to format that tiny little document." That's true. However, the effort of setting up configuration files tends to be "reusable," in the sense that you can use the same file to format multiple documents that all should be written using the same style. Also, if you have different projects requiring different styles, it tends to be easiest to begin setting up the configuration file for one project by beginning with a copy of the file from another project. Discovering "Inherited" Formatting Options In the final formatting of doc2.xml, note that the paragraph tags appear on separate lines preceding and following the paragraph content. This occurs despite the fact that the configuration file specifies no break values in the para section, because if you omit formatting options for an element, it "inherits" the default properties. In the case of the <para> element, the relevant unspecified properties are the entry-break and exit-break values. For block elements, both have a value of 1 by default (that is, one newline), which causes a line break after the opening tag and before the closing tag. If you want to see all the formatting options xmlformat will use, run it with the option. 
For example: % xmlformat --show-config *DEFAULT format = block entry-break = 1 element-break = 1 exit-break = 1 subindent = 1 normalize = no wrap-length = 0 *DOCUMENT format = block entry-break = 0 element-break = 1 exit-break = 1 subindent = 0 normalize = no wrap-length = 0 example format = block entry-break = 2 element-break = 2 exit-break = 2 subindent = 0 normalize = no wrap-length = 0 para format = block entry-break = 1 element-break = 1 exit-break = 1 subindent = 1 normalize = yes wrap-length = 60 replaceable format = inline screen format = verbatim No configuration file is specified on the command line, so xmlformat reads the default configuration file, xmlformat.conf. Then it displays the resulting configuration options. You can see that the para section has inherited break values from the *DEFAULT section. Checking for Unconfigured Elements Any elements appearing in the input document that are not named in the configuration file are formatted using the values of the *DEFAULT section. If the file contains no *DEFAULT section, xmlformat uses built-in default values. If you want to see whether there are any elements in the document for which you haven't specified any formatting options, run xmlformat with the option. For example: % xmlformat --show-unconfigured-elements doc2.xml The following document elements were assigned no formatting options: title As it happens, the title already formats in the desired fashion, so there's no necessity of adding anything more to the configuration file.
xmlformat-1.04/docs/doc-cmd0000755000175000017500000000202410011017657014451 0ustar pauldevel#! /usr/bin/perl -w # vim:set ts=2 sw=2 expandtab: # doc-cmd - filter to process embedded shell commands in XML document # Suppose a document contains this: # # # ... a bunch of stuff here # # Then you mark the lines in the file from the comment to the closing # tag and shove it through doc-cmd. In vi, with the cursor # on the comment line, you could do this with 4!!./doc-cmd. doc-cmd # reads the first line, extracts the command, runs it, and writes as # its output the original comment line plus the output from the command. # This provides a way to embed within an XML document a command that # produces part of a document, plus an easy way to re-run the command # should the document fragment need to be regenerated. use strict; $_ = <>; die "Did not find an XML comment containing a command\n" unless /\ xmlformat-1.04/tests/wrap3.conf0000644000175000017500000000026010002341015015316 0ustar pauldevel*DEFAULT format block entry-break 1 element-break 1 exit-break 1 subindent 0 para normalize yes wrap-length 15 xmlformat-1.04/tests/inline1.res0000644000175000017500000000047507777165540015535 0ustar pauldevel This is a normalized paragraph with an inline element. This is a non-normalized paragraph with an inline element. xmlformat-1.04/tests/inline2.res0000644000175000017500000000043507777165540015532 0ustar pauldevel This is a paragraph with an inline element in the middle. xmlformat-1.04/tests/empty-doc.res0000644000175000017500000000000007775563753016065 0ustar pauldevelxmlformat-1.04/tests/inline3.res0000644000175000017500000000116507777267630015537 0ustar pauldevel This is a normalized paragraph with a long longlong long inline element. xmlformat-1.04/tests/data.conf0000644000175000017500000000025407774643440015233 0ustar pauldevel# This configuration file is empty. 
Because it contains no options # to override the built-in default formatting values, the effect is # to cause the defaults to be used. xmlformat-1.04/tests/inline1.xml0000644000175000017500000000050107777165540015532 0ustar pauldevel This is a normalized paragraph with an inline element. This is a non-normalized paragraph with an inline element. xmlformat-1.04/tests/inline2.xml0000644000175000017500000000043607777111061015527 0ustar pauldevel This is a paragraph with an inline element in the middle. xmlformat-1.04/tests/empty-doc.xml0000644000175000017500000000000007775563753016074 0ustar pauldevelxmlformat-1.04/tests/inline3.xml0000644000175000017500000000117407777267630015546 0ustar pauldevel This is a normalized paragraph with a long long long long inline element. xmlformat-1.04/tests/xsltext.res0000644000175000017500000000050507774643440015700 0ustar pauldevel x x x xmlformat-1.04/tests/subelt2.conf0000644000175000017500000000037310002341015015647 0ustar pauldevel# entry/exit break of 1, no element break, no subindent # subelts will all appear on single line between elt opening/closing tags. elt format block entry-break 1 exit-break 1 element-break 0 subindent 0 xmlformat-1.04/tests/nobreak.conf0000644000175000017500000000041110002341015015701 0ustar pauldevel# Default options *DEFAULT format block subindent 2 entry-break 2 exit-break 2 element-break 0 normalize no para format block entry-break 1 exit-break 1 normalize yes xmlformat-1.04/tests/subelt6.conf0000644000175000017500000000061210002341015015647 0ustar pauldevel# entry/exit break of 2 element break of 1, no subindent # Each subelt will appear on its own line between elt opening/closing tags, # with no blank line between them. # There will also be a blank line between opening tag and subelts, # and between subelts and closing tag. 
elt format block entry-break 2 exit-break 2 element-break 1 subindent 0 xmlformat-1.04/tests/xsltext.xml0000644000175000017500000000047507774643440015715 0ustar pauldevel x x x xmlformat-1.04/tests/Makefile0000644000175000017500000000003307775641504015106 0ustar pauldevelall: clean:: rm -f *.out xmlformat-1.04/tests/inline1.conf0000644000175000017500000000105710002341015015626 0ustar pauldevel# Options for top-level document children *DOCUMENT format block subindent 0 entry-break 0 exit-break 1 element-break 2 normalize no # Default options *DEFAULT format block subindent 0 entry-break 0 exit-break 0 element-break 0 normalize yes wrap-length 0 para-norm format block normalize yes wrap-length 60 para-no-norm format block normalize no inline format inline xmlformat-1.04/tests/test1.conf0000644000175000017500000000100510002341015015320 0ustar pauldevel# Options for top-level document children *DOCUMENT format block subindent 0 entry-break 0 exit-break 1 element-break 2 normalize no # Default options *DEFAULT format block subindent 2 entry-break 2 exit-break 2 element-break 2 normalize no itemizedlist format block para format block entry-break 1 exit-break 1 normalize yes wrap-length 72 xmlformat-1.04/tests/break1.conf0000644000175000017500000000120310002341015015425 0ustar pauldevel# Default options *DEFAULT format block subindent 2 entry-break 2 exit-break 2 element-break 2 normalize no # No entry/exit break, subindent = 0 item1 format block entry-break 0 exit-break 0 element-break 2 subindent 0 # No entry/exit break, subindent = 2 item2 format block entry-break 0 exit-break 0 element-break 2 subindent 2 title para todo remark format block entry-break 1 exit-break 1 normalize yes subindent 2 wrap-length 72 xmlformat-1.04/tests/table1.conf0000644000175000017500000000024710002341015015437 0ustar pauldevel# Format everything on the same line table row cell format block normalize no entry-break 0 element-break 0 exit-break 0 
xmlformat-1.04/tests/continuation.res0000644000175000017500000000007510001261545016654 0ustar pauldevel xmlformat-1.04/tests/xsltext.conf0000644000175000017500000000061110002341015015775 0ustar pauldevel# Options for top-level document children *DOCUMENT format block subindent 0 entry-break 0 exit-break 1 element-break 2 normalize no # Default options *DEFAULT format block subindent 2 entry-break 2 exit-break 2 element-break 0 normalize no xsl:text format verbatim xmlformat-1.04/tests/data.res0000644000175000017500000000131010000142105015031 0ustar pauldevel 1-1 1-2 2-1 2-2 3-1 3-2
This is a paragraph with literal text and emphasized text.
xmlformat-1.04/tests/empty-elt.conf0000644000175000017500000000140610002341015016205 0ustar pauldevel# Options for top-level document children *DOCUMENT format block subindent 0 entry-break 0 exit-break 1 element-break 2 normalize no # Default options *DEFAULT format block subindent 2 entry-break 2 exit-break 2 element-break 2 normalize no wrap-length 0 programlisting format verbatim itemizedlist orderedlist format block listitem format block # this causes following to be on same line entry-break 0 element-break 2 exit-break 0 subindent 0 para format block entry-break 1 exit-break 1 normalize yes wrap-length 72 xmlformat-1.04/tests/subelt3.conf0000644000175000017500000000036210002341015015646 0ustar pauldevel# entry/exit/element break of 1, no subindent # Each subelt will appear on its own line between elt opening/closing tags. elt format block entry-break 1 exit-break 1 element-break 1 subindent 0 xmlformat-1.04/tests/data.xml0000644000175000017500000000123410000011540015043 0ustar pauldevel 1-11-22-12-23-13-2
This is a paragraph with literal text and emphasized text.
xmlformat-1.04/tests/subelt7.conf0000644000175000017500000000064210002341015015653 0ustar pauldevel# entry/exit break of 2, element break of 1, subindent of 4 spaces # Each subelt will appear on its own line between elt opening/closing tags, # with no blank line between them. # There will also be a blank line between opening tag and subelts, # and between subelts and closing tag. # subelts will be indented elt format block entry-break 2 exit-break 2 element-break 1 subindent 4 xmlformat-1.04/tests/nobreak.res0000644000175000017500000000015007774643440015602 0ustar pauldevel para1 para2 para3 xmlformat-1.04/tests/continuation.xml0000644000175000017500000000007210002341015016650 0ustar pauldevel xmlformat-1.04/tests/wrap1.conf0000644000175000017500000000105210002341015015314 0ustar pauldevel#Two types of paragraphs. # Both have line-wrapping with subindent, but one has an entry/exit # break and the other does not. This is to test the handling of the # initial indent when the text is written immediately after the open # tag vs. when it is written on the next line. *DEFAULT format block normalize yes wrap-length 40 subindent 2 para-with-break entry-break 1 element-break 1 exit-break 1 para-without-break entry-break 0 element-break 0 exit-break 0 xmlformat-1.04/tests/nobreak.xml0000644000175000017500000000011607774643440015613 0ustar pauldevel para1 para2 para3 xmlformat-1.04/tests/wrap1.res0000644000175000017500000000064007774707271015222 0ustar pauldevel A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. xmlformat-1.04/tests/wrap2.res0000644000175000017500000000061410000011540015159 0ustar pauldevel A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. 
A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. xmlformat-1.04/tests/inline2.conf0000644000175000017500000000075310002341015015631 0ustar pauldevel# Options for top-level document children *DOCUMENT format block subindent 0 entry-break 0 exit-break 1 element-break 2 normalize no # Default options *DEFAULT format block subindent 0 entry-break 0 exit-break 0 element-break 0 normalize yes wrap-length 0 para format block normalize yes wrap-length 60 inline format inline xmlformat-1.04/tests/wrap3.res0000644000175000017500000000063210000621516015172 0ustar pauldevel this-is-a-very-long-word-at-the-beginning short short short short this-is-a-very-long-word-in-the-middle short short short short short short short this-is-a-very-long-word-at-the-end xmlformat-1.04/tests/Notes0000644000175000017500000000073510002420647014446 0ustar pauldevelFor each test x, there are several files: x.conf - the configuration file containing the formatting options x.xml - the input XML document x.res - the expected result from xmlformat To run the test, type "./runtest x" in the parent directory. This will generate output into the file x.out, compare it to x.res, and display a diff if they are different. To run all the tests, type "./runtest all". runtest uses xmlformat.pl by default. To use xmlformat.rb, use runtest -r. xmlformat-1.04/tests/wrap1.xml0000644000175000017500000000062107774647567015243 0ustar pauldevel A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. 
xmlformat-1.04/tests/break2.conf0000644000175000017500000000041110002341015015426 0ustar pauldevel*DEFAULT subindent 0 norm-elt format block normalize yes entry-break 1 element-break 1 exit-break 1 non-norm-elt format block normalize no programlisting format verbatim xmlformat-1.04/tests/wrap2.xml0000644000175000017500000000061507777711117015231 0ustar pauldevel A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. A paragraph with several lines. xmlformat-1.04/tests/wrap3.xml0000644000175000017500000000062410000621516015202 0ustar pauldevel this-is-a-very-long-word-at-the-beginning short short short short this-is-a-very-long-word-in-the-middle short short short short short short short this-is-a-very-long-word-at-the-end xmlformat-1.04/tests/table2.conf0000644000175000017500000000052610002341015015440 0ustar pauldevel# Begin each row on new line, format cells within row on same line. # Indent rows with table by 2 spaces. table row cell format block normalize no table entry-break 1 element-break 1 exit-break 1 subindent 2 row cell entry-break 0 element-break 0 exit-break 0 xmlformat-1.04/tests/break1.res0000644000175000017500000000033310000013236015273 0ustar pauldevel This is a paragraph. This is a paragraph. xmlformat-1.04/tests/break2.res0000644000175000017500000000130210000013236015271 0ustar pauldevel This is text with a in the middle. This is text with a verbatim element in the middle. This is text with a in the middle. This is text with a verbatim element in the middle. 
xmlformat-1.04/tests/norm.res0000644000175000017500000000137407777362470015151 0ustar pauldevel This text should have all surrounding whitespace removed This text should have leading/trailing whitespace removed, but leave a space around the literal element> This text should have leading/trailing whitespace removed and also spaces adjacent to the programlisting. The whitespace around should not be removed entirely. xmlformat-1.04/tests/break1.xml0000644000175000017500000000030010000013236015274 0ustar pauldevel This is a paragraph. This is a paragraph. xmlformat-1.04/tests/break2.xml0000644000175000017500000000130010000013236015276 0ustar pauldevel This is text with a in the middle. This is text with a verbatim element in the middle. This is text with a in the middle. This is text with a verbatim element in the middle. xmlformat-1.04/tests/subelt4.conf0000644000175000017500000000044010002341015015644 0ustar pauldevel# entry/exit break of 1, element break of 2, no subindent # Each subelt will appear on its own line between elt opening/closing tags, # with a blank line between them. elt format block entry-break 1 exit-break 1 element-break 2 subindent 0 xmlformat-1.04/tests/subelt1.res0000644000175000017500000000004707775575712015554 0ustar pauldevel xmlformat-1.04/tests/subelt2.res0000644000175000017500000000005107775575712015550 0ustar pauldevel xmlformat-1.04/tests/subelt3.res0000644000175000017500000000005307775575712015553 0ustar pauldevel xmlformat-1.04/tests/subelt4.res0000644000175000017500000000005507775575712015556 0ustar pauldevel xmlformat-1.04/tests/norm.xml0000644000175000017500000000142507777362470015155 0ustar pauldevel This text should have all surrounding whitespace removed This text should have leading/trailing whitespace removed, but leave a space around the literal element> This text should have leading/trailing whitespace removed and also spaces adjacent to the programlisting. The whitespace around should not be removed entirely. 
xmlformat-1.04/tests/subelt5.res0000644000175000017500000000005707775575712015561 0ustar pauldevel xmlformat-1.04/tests/subelt6.res0000644000175000017500000000005507775641561015554 0ustar pauldevel xmlformat-1.04/tests/subelt7.res0000644000175000017500000000007107775577475015567 0ustar pauldevel xmlformat-1.04/tests/subelt1.xml0000644000175000017500000000004707775575712015563 0ustar pauldevel xmlformat-1.04/tests/subelt2.xml0000644000175000017500000000004707775575712015564 0ustar pauldevel xmlformat-1.04/tests/wrap2.conf0000644000175000017500000000104110002341015015313 0ustar pauldevel#Two types of paragraphs. # Both are listed as line-wrapped, but one is normalized and the other # is not. # - This tests the behavior that line-wrapping should be ignored # in non-normalized paragraphs. # - It also tests that no extra break should be added around # non-normalized text. *DEFAULT format block entry-break 1 element-break 1 exit-break 1 subindent 0 para-with-wrap normalize yes wrap-length 60 para-without-wrap normalize no wrap-length 60 xmlformat-1.04/tests/subelt3.xml0000644000175000017500000000004707775575712015565 0ustar pauldevel xmlformat-1.04/tests/subelt4.xml0000644000175000017500000000004707775575712015566 0ustar pauldevel xmlformat-1.04/tests/subelt5.xml0000644000175000017500000000004707775575712015567 0ustar pauldevel xmlformat-1.04/tests/subelt6.xml0000644000175000017500000000004707775641561015564 0ustar pauldevel xmlformat-1.04/tests/subelt7.xml0000644000175000017500000000004707775577475015601 0ustar pauldevel xmlformat-1.04/tests/inline3.conf0000644000175000017500000000075310002341015015632 0ustar pauldevel# Options for top-level document children *DOCUMENT format block subindent 0 entry-break 0 exit-break 1 element-break 2 normalize no # Default options *DEFAULT format block subindent 0 entry-break 0 exit-break 0 element-break 0 normalize yes wrap-length 0 para format block normalize yes wrap-length 60 inline format inline 
xmlformat-1.04/tests/indent1.conf0000644000175000017500000000041010002341015015621 0ustar pauldevel*DEFAULT format block entry-break 1 element-break 1 exit-break 1 subindent 0 para entry-break 0 element-break 0 exit-break 0 normalize yes wrap-length 40 subindent 2 xmlformat-1.04/tests/test1.res0000644000175000017500000000015307774643440015224 0ustar pauldevel para with para with xmlformat-1.04/tests/indent1.res0000644000175000017500000000145010000623676015510 0ustar pauldevel This is a line of text in the outer para. This is a line of text in the outer para. This is a line of text in the outer para. This is a line of text in the outer para.This is a line of text in the inner para. This is a line of text in the inner para. This is a line of text in the inner para. This is a line of text in the inner para.This is a line of text in the inner para. This is a line of text in the inner para. This is a line of text in the inner para. This is a line of text in the inner para. xmlformat-1.04/tests/table3.conf0000644000175000017500000000056610002341015015445 0ustar pauldevel# Begin each element on new line, indenting sub elements two spaces. 
# Leave cell contents on same line as cell tags table row cell format block normalize no entry-break 1 element-break 1 exit-break 1 subindent 2 # Override break values for cell cell entry-break 0 element-break 0 exit-break 0 xmlformat-1.04/tests/norm.conf0000644000175000017500000000106010002341015015234 0ustar pauldevel*DOCUMENT format block normalize no entry-break 0 exit-break 1 element-break 1 subindent 0 wrap-length 0 # Default options *DEFAULT format block subindent 0 entry-break 0 exit-break 1 element-break 1 normalize yes wrap-length 0 programlisting format verbatim para format block normalize yes entry-break 0 element-break 1 exit-break 0 literal format inline xmlformat-1.04/tests/test1.xml0000644000175000017500000000013407774643440015232 0ustar pauldevelpara with para with xmlformat-1.04/tests/table1.res0000644000175000017500000000017207774643440015335 0ustar pauldevel123456
xmlformat-1.04/tests/table2.res0000644000175000017500000000020107774643440015327 0ustar pauldevel123456
xmlformat-1.04/tests/indent1.xml0000644000175000017500000000141610000623676015521 0ustar pauldevel This is a line of text in the outer para. This is a line of text in the outer para. This is a line of text in the outer para. This is a line of text in the outer para. This is a line of text in the inner para. This is a line of text in the inner para. This is a line of text in the inner para. This is a line of text in the inner para. This is a line of text in the inner para. This is a line of text in the inner para. This is a line of text in the inner para. This is a line of text in the inner para. xmlformat-1.04/tests/table3.res0000644000175000017500000000024507774643440015340 0ustar pauldevel 1 2 3 4 5 6
xmlformat-1.04/tests/subelt1.conf0000644000175000017500000000032010002341015015636 0ustar pauldevel# No entry/element/exit break # (subindent doesn't matter) # elt and subelts all will appear on a single line elt format block entry-break 0 exit-break 0 element-break 0 xmlformat-1.04/tests/table1.xml0000644000175000017500000000017507774643440015347 0ustar pauldevel123456
xmlformat-1.04/tests/subelt5.conf0000644000175000017500000000057610002341015015657 0ustar pauldevel# entry/exit/element break of 2, no subindent # Each subelt will appear on its own line between elt opening/closing tags, # with a blank line between them. # There will also be a blank line between opening tag and subelts, # and between subelts and closing tag. elt format block entry-break 2 exit-break 2 element-break 2 subindent 0 xmlformat-1.04/tests/table2.xml0000644000175000017500000000017507774643440015350 0ustar pauldevel123456
xmlformat-1.04/tests/empty-elt.res0000644000175000017500000000040007775563531016102 0ustar pauldevel xmlformat-1.04/tests/table3.xml0000644000175000017500000000017507774643440015351 0ustar pauldevel123456
xmlformat-1.04/ChangeLog0000644000175000017500000000247210470215222014043 0ustar pauldevelVersion 1.04 (released 2006-08-14) - Assign each token an input line number and display the line number in error messages. This provides better information to the user about the location of problems in input files. - Print the token stack when an error occurs. This provides some idea of the context of the element that is malformed or has malformed content. Version 1.03 (released 2004-03-26) - In xmlformat.rb, made some changes needed for Ruby 1.8: - Convert @@xml_spe parsing expression to Regexp with Regexp.new(). scan() method doesn't work with string argument now, apparently. - In parsing patterns, change literal ] to \\] to suppress warnings - In xmlformat.pl: - In parsing patterns, change literal ] to \\]. This isn't actually necessary, but better preserves parallelism with Ruby version. Version 1.02 (released 2004-02-06) - Added --in-place/-i option for in-place reformatting. (Requires named input file or files.) - Added --backup/-b option for making backup of each input file (used with --in-place). - If multiple input files are named on the command line, they are processed as separate documents, not as one combined input. (This was necessary to make --in-place and --backup work correctly.) - Added a tutorial document. Version 1.01 (released 2004-01-18) - Initial public release. xmlformat-1.04/README0000644000175000017500000000205510334242051013145 0ustar pauldevelxmlformat - an XML document formatter Paul DuBois paul@kitebird.com This is the distribution for xmlformat 1.04. If you find bugs, please let me know. The current version of xmlformat is always available at: http://www.kitebird.com/software/ xmlformat is free software, distributed under a BSD-style license. For specific licensing information, see the LICENSE file. For installation instructions, see the INSTALL file. xmlformat has two implementations, one in Ruby and one in Perl. 
They should produce identical output in all cases. Documentation is in the docs subdirectory. Tests are in the tests directory, though you run them in the main xmlformat directory: - To run all tests: make test - To run all tests for the Ruby version: ./runtest all - To run all tests for the Perl version: ./runtest -p all - To run an individual test for the Ruby version: ./runtest testname - To run an individual test for the Perl version: ./runtest -p testname A test name is the name of its .xml file, minus the .xml suffix. For more information, see tests/Notes. xmlformat-1.04/bad1.res0000644000175000017500000000023510334242107013607 0ustar pauldevelMalformed token at line 3, token 5: ): Tag mismatch; malformed document? open tag: close tag:
enclosing tags: 1: Error at EOF: Unclosed tags; malformed document? unclosed tags: 1: Error at EOF: Unprocessed child elements; malformed document? Cannot continue processing document. xmlformat-1.04/BUGS0000644000175000017500000000426010021371043012745 0ustar pauldevelxmlformat bugs Ruby version is slower than the Perl version, a difference that shows up particularly for larger documents. I haven't done any profiling to determine which parts of the program account for the differences. ---------- Within normalized elements, the line-wrapping algorithm preserves inline tags, but it doesn't take any internal line-breaks within those tags into account in its line-length calculations. Consider this element: This is text with an inline element in the middle. The opening tag is considered to have a length equal to its total number of characters. The line-wrapping algorithm could be made more complex to take the line-breaks into account. I haven't bothered, and may never bother. If such a change is made, no indenting should be applied that would change the attribute value. ---------- Line-wrapping length calculations don't take into account the possibility that text from a different element may occur on the same line if break values are set to zero. For example, with a wrap-length of 15, you could end up with output like this: This is a line of text.This is a line of text. The middle line has more than 15 characters of text. This also shows that wrap-length doesn't take into account the length of tags of enclosing blocks on the same line, which can also be considered a bug. Fix: Set all your break values > 0. ---------- Normalization converts runs of spaces to single spaces. This means that if you write two spaces after periods in text, normalization will convert them to single spaces. Even if normalization didn't do that, the two spaces would be lost if line-wrapping is enabled and they occur at a line break. 
I don't know if this is really a bug, but it's something to be aware of. ---------- Doesn't recognize multi-byte files. In some cases, you can work around this. For example, an editor might save a file as Unicode even when the document contains only ASCII characters. Re-save the file as an ASCII file (or some single-byte encoding such as ISO-8859-1) and it should work. xmlformat-1.04/bad3.res0000644000175000017500000000020110334242107013602 0ustar pauldevelError near line 1, token 1 (): Close tag w/o preceding open tag; malformed document? Cannot continue processing document. xmlformat-1.04/bad4.res0000644000175000017500000000074410355364361013631 0ustar pauldevelError near line 2, token 8 (): Tag mismatch; malformed document? open tag: close tag: enclosing tags: 1: 2: Error near line 3, token 10 (): Tag mismatch; malformed document? open tag: close tag: enclosing tags: 1: Error at EOF: Unclosed tags; malformed document? unclosed tags: 1: Error at EOF: Unprocessed child elements; malformed document? Cannot continue processing document. xmlformat-1.04/test.conf0000644000175000017500000000103307775152333014130 0ustar pauldevel# Test processing of comments at end of element-name and option lines. # Test , separators in element-name lines. # Test optional = sign processing in option lines. # To see if programs parse it properly, invoke like this: # xmlformat.pl --config-file=test.conf --show-config | more # xmlformat.rb --config-file=test.conf --show-config | more elt1 elt2 elt3# comment elt1 elt2 elt3 # comment elt1,elt2,elt3# comment elt1,elt2,elt3 # comment format inline format=inline format = inline format inline# comment format inline # comment xmlformat-1.04/bad5.res0000644000175000017500000000030010334242107013604 0ustar pauldevelError at EOF: Unclosed tags; malformed document? unclosed tags: 1: 2: Error at EOF: Unprocessed child elements; malformed document? Cannot continue processing document. 
xmlformat-1.04/xmlformat.rb0000755000175000017500000013672310355364247014657 0ustar pauldevel#!/usr/bin/ruby -w # vim:set ts=2 sw=2 expandtab: # xmlformat.rb - XML document reformatter # Copyright (c) 2004, 2005, Kitebird, LLC. All rights reserved. # Some portions are based on the REX shallow XML parser, which # is Copyright (c) 1998, Robert D. Cameron. These include the # regular expression parsing variables and the shallow_parse() # method. # This software is licensed as described in the file LICENSE, # which you should have received as part of this distribution. # Differences from Perl version: # - Pattern for classifying token as text node is different. # (cannot use !~ op for case) # - It's important to use \A and \z|\Z rather than ^ and $ in pattern # matches on tokens, because ^ and $ might match after/before a # newline for a token that spans multiple lines! require "getoptlong" PROG_NAME = "xmlformat" PROG_VERSION = "1.04" PROG_LANG = "Ruby" # ---------------------------------------------------------------------- # XMLFormat module # Contains: # - Methods for parsing XML document # - Methods for reading configuration file and operating on configuration # information. module XMLFormat # ---------------------------------------------------------------------- # Module methods # warn - print message to stderr # die - print message to stderr and exit def warn(*args) $stderr.print args end def die(*args) $stderr.print args exit(1) end # ---------------------------------------------------------------------- # Module variables - these do not vary per class invocation # Regular expressions for parsing document components. Based on REX. # Compared to Perl version, these variable names use more Ruby-like # lettercase. (Ruby likes to interpret variables that begin with # uppercase as constants.) 
# spe = shallow parsing expression # se = scanning expression # ce = completion expression # rsb = right square brackets # qm = question mark @@text_se = "[^<]+" @@until_hyphen = "[^-]*-" @@until_2_hyphens = "#{@@until_hyphen}(?:[^-]#{@@until_hyphen})*-" @@comment_ce = "#{@@until_2_hyphens}>?" @@until_rsbs = "[^\\]]*\\](?:[^\\]]+\\])*\\]+" @@cdata_ce = "#{@@until_rsbs}(?:[^\\]>]#{@@until_rsbs})*>" @@s = "[ \\n\\t\\r]+" @@name_strt = "[A-Za-z_:]|[^\\x00-\\x7F]" @@name_char = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]" @@name = "(?:#{@@name_strt})(?:#{@@name_char})*" @@quote_se = "\"[^\"]*\"|'[^']*'" @@dt_ident_se = "#{@@s}#{@@name}(?:#{@@s}(?:#{@@name}|#{@@quote_se}))*" @@markup_decl_ce = "(?:[^\\]\"'><]+|#{@@quote_se})*>" @@s1 = "[\\n\\r\\t ]" @@until_qms = "[^?]*\\?+" @@pi_tail = "\\?>|#{@@s1}#{@@until_qms}(?:[^>?]#{@@until_qms})*>" @@dt_item_se = "<(?:!(?:--#{@@until_2_hyphens}>|[^-]#{@@markup_decl_ce})|\\?#{@@name}(?:#{@@pi_tail}))|%#{@@name};|#{@@s}" @@doctype_ce = "#{@@dt_ident_se}(?:#{@@s})?(?:\\[(?:#{@@dt_item_se})*\\](?:#{@@s})?)?>?" @@decl_ce = "--(?:#{@@comment_ce})?|\\[CDATA\\[(?:#{@@cdata_ce})?|DOCTYPE(?:#{@@doctype_ce})?" @@pi_ce = "#{@@name}(?:#{@@pi_tail})?" @@end_tag_ce = "#{@@name}(?:#{@@s})?>?" @@att_val_se = "\"[^<\"]*\"|'[^<']*'" @@elem_tag_se = "#{@@name}(?:#{@@s}#{@@name}(?:#{@@s})?=(?:#{@@s})?(?:#{@@att_val_se}))*(?:#{@@s})?/?>?" 
@@markup_spe = "<(?:!(?:#{@@decl_ce})?|\\?(?:#{@@pi_ce})?|/(?:#{@@end_tag_ce})?|(?:#{@@elem_tag_se})?)" @@xml_spe = Regexp.new("#{@@text_se}|#{@@markup_spe}") # ---------------------------------------------------------------------- # Allowable formatting options and their possible values: # - The keys of this hash are the allowable option names # - The value for each key is list of allowable option values # - If the value is nil, the option value must be numeric # If any new formatting option is added to this program, it # must be specified here, *and* a default value for it should # be listed in the *DOCUMENT and *DEFAULT pseudo-element # option hashes. @@opt_list = { "format" => [ "block", "inline", "verbatim" ], "normalize" => [ "yes", "no" ], "subindent" => nil, "wrap-length" => nil, "entry-break" => nil, "exit-break" => nil, "element-break" => nil } class XMLFormatter # Object creation: set up the default formatting configuration # and variables for maintaining input and output document. def initialize # Formatting options for each element. @elt_opts = { } # The formatting options for the *DOCUMENT and *DEFAULT pseudo-elements can # be overridden in the configuration file, but the options must also be # built in to make sure they exist if not specified in the configuration # file. Each of the structures must have a value for every option. # Options for top-level document children. # - Do not change entry-break: 0 ensures no extra newlines before # first element of output. # - Do not change exit-break: 1 ensures a newline after final element # of output document. # - It's probably best not to change any of the others, except perhaps # if you want to increase the element-break. @elt_opts["*DOCUMENT"] = { "format" => "block", "normalize" => "no", "subindent" => 0, "wrap-length" => 0, "entry-break" => 0, # do not change "exit-break" => 1, # do not change "element-break" => 1 } # Default options. 
These are used for any elements in the document # that are not specified explicitly in the configuration file. @elt_opts["*DEFAULT"] = { "format" => "block", "normalize" => "no", "subindent" => 1, "wrap-length" => 0, "entry-break" => 1, "exit-break" => 1, "element-break" => 1 } # Run the *DOCUMENT and *DEFAULT options through the option-checker # to verify that the built-in values are legal. err_count = 0 @elt_opts.keys.each do |elt_name| # ... for each element @elt_opts[elt_name].each do |opt_name, opt_val| # ... for each option opt_val, err_msg = check_option(opt_name, opt_val) if err_msg.nil? @elt_opts[elt_name][opt_name] = opt_val else warn "LOGIC ERROR: #{elt_name} default option is invalid\n" warn "#{err_msg}\n" err_count += 1 end end end # Make sure that the every option is represented in the # *DOCUMENT and *DEFAULT structures. @@opt_list.keys.each do |opt_name| @elt_opts.keys.each do |elt_name| if !@elt_opts[elt_name].has_key?(opt_name) warn "LOGIC ERROR: #{elt_name} has no default '#{opt_name}' option\n" err_count += 1 end end end if err_count > 0 raise "Cannot continue; internal default formatting options must be fixed" end end # Initialize the variables that are used per-document def init_doc_vars # Elements that are used in the document but not named explicitly # in the configuration file. @unconf_elts = { } # List of tokens for current document. @tokens = [ ] # List of line numbers for each token @line_num = [ ] # Document node tree (constructed from the token list) @tree = [ ] # Variables for formatting operations: # @out_doc = resulting output document (constructed from document tree) # @pending = array of pending tokens being held until flushed @out_doc = "" @pending = [ ] # Inline elements within block elements are processed using the # text normalization (and possible line-wrapping) values of their # enclosing block. 
Blocks and inlines may be nested, so we maintain # a stack that allows the normalize/wrap-length values of the current # block to be determined. @block_name_stack = [ ] # for debugging @block_opts_stack = [ ] # A similar stack for maintaining each block's current break type. @block_break_type_stack = [ ] end # Accessors for token list and resulting output document def tokens return @tokens end def out_doc return @out_doc end # Methods for adding strings to output document or # to the pending output array def add_to_doc(str) @out_doc << str end def add_to_pending(str) @pending << str end # Block stack maintenance methods # Push options onto or pop options off from the stack. When doing # this, also push or pop an element onto the break-level stack. def begin_block(name, opts) @block_name_stack << name @block_opts_stack << opts @block_break_type_stack << "entry-break" end def end_block @block_name_stack.pop @block_opts_stack.pop @block_break_type_stack.pop end # Return the current block's normalization status or wrap length def block_normalize return @block_opts_stack.last["normalize"] == "yes" end def block_wrap_length return @block_opts_stack.last["wrap-length"] end # Set the current block's break type, or return the number of newlines # for the block's break type def set_block_break_type(type) @block_break_type_stack[@block_break_type_stack.size-1] = type end def block_break_value return @block_opts_stack.last[@block_break_type_stack.last] end # Read configuration information. For each element, construct a hash # containing a hash key and value for each option name and value. # After reading the file, fill in missing option values for # incomplete option structures using the *DEFAULT options. def read_config(conf_file) elt_names = nil in_continuation = false saved_line = "" File.open(conf_file) do |fh| fh.each_line do |line| line.chomp! 
next if line =~ /^\s*($|#)/ # skip blank lines, comments if in_continuation line = saved_line + " " + line saved_line = "" in_continuation = false end if line !~ /^\s/ # Line doesn't begin with whitespace, so it lists element names. # Names are separated by whitespace or commas, possibly followed # by a continuation character or comment. if line =~ /\\$/ in_continuation = true saved_line = line.sub(/\\$/, "") # remove continuation character next end line.sub!(/\s*#.*$/, "") # remove any trailing comment elt_names = line.split(/[\s,]+/) # make sure each name has an entry in the elt_opts structure elt_names.each do |elt_name| @elt_opts[elt_name] = { } unless @elt_opts.has_key?(elt_name) end else # Line begins with whitespace, so it contains an option # to apply to the current element list, possibly followed by # a comment. First check that there is a current list. # Then parse the option name/value. if elt_names.nil? raise "#{conf_file}:#{$.}: Option setting found before any " + "elements were named.\n" end line.sub!(/\s*#.*$/, "") line =~ /^\s*(\S+)(?:\s+|\s*=\s*)(\S+)$/ opt_name, opt_val = $1, $2 raise "#{conf_file}:#{$.}: Malformed line: #{$_}" if opt_val.nil? # Check option. If illegal, die with message. Otherwise, # add option to each element in current element list opt_val, err_msg = check_option(opt_name, opt_val) raise "#{conf_file}:#{$.}: #{err_msg}\n" unless err_msg.nil? elt_names.each do |elt_name| @elt_opts[elt_name][opt_name] = opt_val end end end end # For any element that has missing option values, fill in the values # using the options for the *DEFAULT pseudo-element. This speeds up # element option lookups later. It also makes it unnecessary to test # each option to see if it's defined: All element option structures # will have every option defined. 
def_opts = @elt_opts["*DEFAULT"] @elt_opts.keys.each do |elt_name| next if elt_name == "*DEFAULT" def_opts.keys.each do |opt_name| next if @elt_opts[elt_name].has_key?(opt_name) # already set @elt_opts[elt_name][opt_name] = def_opts[opt_name] end end end # Check option name to make sure it's legal. Check the value to make sure # that it's legal for the name. Return a two-element array: # (value, nil) if the option name and value are legal. # (nil, message) if an error was found; message contains error message. # For legal values, the returned value should be assigned to the option, # because it may get type-converted here. def check_option(opt_name, opt_val) # - Check option name to make sure it's a legal option # - Then check the value. If there is a list of values # the value must be one of them. Otherwise, the value # must be an integer. if !@@opt_list.has_key?(opt_name) return [ nil, "Unknown option name: #{opt_name}" ] end allowable_val = @@opt_list[opt_name] if !allowable_val.nil? if !allowable_val.find { |val| val == opt_val } return [ nil, "Unknown '#{opt_name}' value: #{opt_val}" ] end elsif !opt_val.is_a?(Integer) if opt_val =~ /^\d+$/ opt_val = opt_val.to_i else return [ nil, "'#{opt_name}' value (#{opt_val}) should be an integer" ] end end return [ opt_val, nil ] end private :check_option # Return hash of option values for a given element. If no options are found: # - Add the element name to the list of unconfigured options. # - Assign the default options to the element. (This way the test for the # option fails only once.) def get_opts(elt_name) opts = @elt_opts[elt_name] if opts.nil? @unconf_elts[elt_name] = 1 opts = @elt_opts[elt_name] = @elt_opts["*DEFAULT"] end return opts end private :get_opts # Display contents of configuration options to be used to process document. # For each element named in the elt_opts structure, display its format # type, and those options that apply to the type. 
def display_config
  # Print every configured element (sorted by name) with its format type
  # and the options that are meaningful for that format type.

  # Only block elements carry additional options; inline and verbatim
  # elements are described by their format type alone.
  applicable = {
    "block" => %w[entry-break element-break exit-break subindent normalize wrap-length],
    "inline" => [ ],
    "verbatim" => [ ]
  }

  @elt_opts.keys.sort.each do |name|
    puts name
    settings = @elt_opts[name]
    kind = settings["format"]
    # Format type first, then the options that apply to it
    puts " format = #{kind}"
    applicable[kind].each { |opt| puts " #{opt} = #{settings[opt]}" }
    puts
  end
end

# Display the list of elements that are used in the document but not
# configured in the configuration file.

# Then re-unconfigure the elements so that they won't be considered
# as configured for the next document, if there is one.

def display_unconfigured_elements
  names = @unconf_elts.keys
  unless names.empty?
    puts "The following document elements were assigned no formatting options:"
    puts line_wrap(names.sort.join(" "), 0, 0, 65).join("\n")
  else
    puts "The document contains no unconfigured elements."
  end

  # Drop the entries that option lookup added for these elements.
  names.each { |name| @elt_opts.delete(name) }
end

# ----------------------------------------------------------------------

# Main document processing routine.
# - Argument is a string representing an input document
# - Return value is the reformatted document, or nil. A nil return
#   signifies either that an error occurred, or that some option was
#   given that suppresses document output. In either case, don't write
#   any output for the document.  Any error messages will already have
#   been printed when this returns.
def process_doc(doc, verbose, check_parser, canonize_only, show_unconf_elts)
  # Progress messages are written with warn, and only in verbose mode.
  progress = lambda { |msg| warn msg if verbose }

  init_doc_vars

  # Perform lexical parse to split document into list of tokens

  progress.call("Parsing document...\n")
  shallow_parse(doc)

  if check_parser
    progress.call("Checking parser...\n")
    # concatenation of tokens should be identical to original document
    if doc != tokens.join("")
      puts "PARSER ERROR: document token concatenation differs from document"
    else
      puts "Parser is okay"
    end
    return nil
  end

  # Assign input line number to each token

  assign_line_numbers

  # Look for and report any error tokens returned by parser

  progress.call("Checking document for errors...\n")
  unless report_errors == 0
    warn "Cannot continue processing document.\n"
    return nil
  end

  # Convert the token list to a tree structure

  progress.call("Convert document tokens to tree...\n")
  unless tokens_to_tree == 0
    warn "Cannot continue processing document.\n"
    return nil
  end

  # Check: Stringify the tree to convert it back to a single string,
  # then compare to original document string (should be identical)
  # (This is an integrity check on the validity of the to-tree and stringify
  # operations; if one or both do not work properly, a mismatch should occur.)

  #str = tree_stringify
  #print str
  #warn "ERROR: mismatch between document and resulting string\n" if doc != str

  # Canonize tree to remove extraneous whitespace

  progress.call("Canonizing document tree...\n")
  tree_canonize

  if canonize_only
    puts tree_stringify
    return nil
  end

  # One side-effect of canonizing the tree is that the formatting
  # options are looked up for each element in the document.  That
  # causes the list of elements that have no explicit configuration
  # to be built.  Display the list and return if user requested it.

  if show_unconf_elts
    display_unconfigured_elements
    return nil
  end

  # Format the tree to produce formatted XML as a single string

  progress.call("Formatting document tree...\n")
  tree_format

  # If the document is not empty, add a newline and emit a warning if
  # reformatting failed to add a trailing newline.  This shouldn't
  # happen if the *DOCUMENT options are set up with exit-break = 1,
  # which is the reason for the warning rather than just silently
  # adding the newline.

  result = out_doc
  unless result.empty? || result =~ /\n\z/
    warn "LOGIC ERROR: trailing newline had to be added\n"
    result << "\n"
  end

  return result
end

# ----------------------------------------------------------------------

# Parse XML document into array of tokens and store array

def shallow_parse(xml_document)
  @tokens = xml_document.scan(@@xml_spe)
end

# ----------------------------------------------------------------------

# Extract a tag name from a tag and return it.  This uses a subset
# of the document-parsing pattern elements.

# Dies if the tag cannot be found, because this is supposed to be
# called only with a legal tag.

def extract_tag_name(tag)
  found = /\A<\/?(#{@@name})/.match(tag)
  raise "Cannot find tag name in tag: #{tag}" if found.nil?
  return found[1]
end
private :extract_tag_name

# ----------------------------------------------------------------------

# Assign an input line number to each token.  The number indicates
# the line number on which the token begins.

def assign_line_numbers
  @line_num = [ ]
  next_line = 1
  for tok in @tokens
    @line_num.push(next_line)
    # Each newline inside this token moves the start of the next one down.
    next_line += tok.count("\n")
  end
end
private :assign_line_numbers

# ----------------------------------------------------------------------

# Check token list for errors and report any that are found.  Error
# tokens are those that begin with "<" but do not end with ">".

# Returns the error count.
# Does not modify the original token list.
def report_errors
  # Scan @tokens for malformed markup and warn about each occurrence.
  # A token is malformed when it begins with "<" but does not end with ">".
  # Returns the number of malformed tokens; @tokens is left untouched.
  # (The condition had been reduced to /\A\Z/, which matches only an empty
  # token and so could never detect the malformed tokens described above.)
  err_count = 0
  @tokens.each_with_index do |token, i|
    next unless token =~ /\A</ && token !~ />\Z/
    warn "Malformed token at line #{@line_num[i]}, token #{i+1}: #{token}\n"
    err_count += 1
  end
  warn "Number of errors found: #{err_count}\n" if err_count > 0
  return err_count
end

# ----------------------------------------------------------------------

# Helper routine to print tag stack for tokens_to_tree
# label - heading printed before the stack contents
# stack - array of tags; entries are listed numbered from 1

def print_tag_stack(label, stack)
  if stack.size < 1
    warn " #{label}: none\n"
  else
    warn " #{label}:\n"
    stack.each_with_index do |tag, i|
      warn " #{i+1}: #{tag}\n"
    end
  end
end

# Convert the list of XML document tokens to a tree representation. # The implementation uses a loop and a stack rather than recursion. # Does not modify the original token list. # Returns an error count. def tokens_to_tree tag_stack = [ ] # stack for element tags children_stack = [ ] # stack for lists of children children = [ ] # current list of children err_count = 0 # Note: the text token pattern test assumes that all text tokens # are non-empty. This should be true, because REX doesn't create # empty tokens. @tokens.each_index do |i| token = @tokens[i] line_num = @line_num[i] tok_err = "Error near line #{line_num}, token #{i+1} (#{token})" case token when /\A[^<]/ # text children << text_node(token) when /\A 1 root 4 1 xmlformat-1.04/INSTALL0000644000175000017500000000164410002613505013317 0ustar pauldevelThere are two versions of xmlformat: - xmlformat.rb, written in Ruby - xmlformat.pl, written in Perl Both should produce identical results. To install xmlformat, copy the version you want to use to some public directory that is listed in your PATH variable. You may wish to rename the script to xmlformat so that you don't have to type the .rb or .pl extension each time you invoke it.
Example: cp xmlformat.rb /usr/local/bin/xmlformat (or) cp xmlformat.pl /usr/local/bin/xmlformat If you want to install both versions, do not rename them: cp xmlformat.rb /usr/local/bin/xmlformat.rb cp xmlformat.pl /usr/local/bin/xmlformat.pl In this case you will need to specify the extension when invoking the program to indicate which version you want. If your Ruby or Perl programs are not at the location listed in the first line of the installed script, you'll need to edit that line to have the correct location. xmlformat-1.04/bad2.xml0000644000175000017500000000011210002401004013572 0ustar pauldevel Hello world. xmlformat-1.04/bad3.xml0000644000175000017500000000004410000166646013621 0ustar pauldeveldocument begins with end tag xmlformat-1.04/bad4.xml0000644000175000017500000000014310334242107013617 0ustar pauldevel Hello big bad world. xmlformat-1.04/bad5.xml0000644000175000017500000000012010334242107013613 0ustar pauldevel Hello xmlformat-1.04/Makefile0000644000175000017500000000047410007043530013726 0ustar pauldevelall: # Execute this target after "svn export" into a new directory and before # packaging the directory for distribution. dist-prep:: (cd docs;make dist-prep) clean:: (cd tests;make clean) (cd docs;make clean) test:: @echo Test Ruby version... ./runtest -r all @echo Test Perl version... ./runtest -p all