HTML-TokeParser-Simple-3.16000755000765000024 012162510100 16000 5ustar00curtispoestaff000000000000HTML-TokeParser-Simple-3.16/Build.PL000444000765000024 144312162510100 17433 0ustar00curtispoestaff000000000000use strict; use warnings; use Module::Build; use 5.006; my $builder = Module::Build->new( module_name => 'HTML::TokeParser::Simple', license => 'perl', dist_author => 'Curtis "Ovid" Poe ', dist_version_from => 'lib/HTML/TokeParser/Simple.pm', requires => { 'HTML::Parser' => 3.25, 'HTML::TokeParser' => 2.24, 'Test::More' => 0, 'Sub::Override' => 0, 'perl' => 5.006, # for 'our' declarations }, add_to_cleanup => ['HTML-TokeParser-Simple-*'], create_makefile_pl => 'traditional', meta_merge => { resources => { repository => 'https://github.com/Ovid/html-tokeparser-simple', }, }, ); $builder->create_build_script(); HTML-TokeParser-Simple-3.16/Changes000444000765000024 657612162510100 17446 0ustar00curtispoestaff000000000000Revision history for Perl extension HTML::TokeParser::Simple. 3.16 2013-06-26 Fix POD errors and move POD tests to XT directory. Fix spelling errors. 3.15 2005-11-29 Fixed bug where rewrite_tag would unescape entities. Thanks to "Paul Bijnens" for the bug report and fix. 3.14 2005-10-08 Added POD tests Converted to Module::Build All classes now state which methods they override Carp is now only loaded on demand peek() now allows you to peek at the next tokens 3.13 Sat Set 25, 2004 Corrected @INC paths in the tests. Fixed still more embarrassing POD issues. Sigh. 3.12 Mon Sep 20, 2004 Corrected minor, but embarrassing, POD issue. 3.11 Sun Sep 19, 2004 Broke start and end tags out into their own classes. D'oh! 3.1 Sun Sep 19, 2004 Added subclasses for each token type. Massive POD update. Test class for internals. 3.0 Sat Sep 18, 2004 'get_foo' methods replace the older 'return_foo' methods. 'return_foo' methods now call the new 'get_foo' methods and are deprecated (though they do not yet issue warnings.) The constructor has been overloaded to allow users to explicitly state the source type of the HTML, if necessary. Also, if LWP::Simple is installed, the HTML may now be fetched from a URL. $token->set_attr($token->return_attr) now works. Added is_pi(). A shorthand for is_process_instruction(). Broke out HTML::TokeParser::Simple::Token into its own file. I may eventually refactor this into separate classes. 2.2 Sat Jan 31, 2004 'return_attr' now takes and optional argument. If supplied, will return just that attribute value. Updated POD. 2.1 Mon Jun 9, 2003 Made 'rewrite_tag' a public method. Fixed some formatting. Fixed bug where get_attrseq returned a hashref instead of an arrayref upon failure. Updated POD. Broke all HTML munging tests out into their own test and added some tests 2.0 Sun Jun 1, 2003 is_tag, is_start_tag, and is_end_tag now can take regexes. Added set_attr and delete_attr methods to rewrite tags on the fly Converted to Test::More and doubled the number of tests Updated POD Updated Makefile.PL to list Test::More as a requirement Deprecated 'return_text' now carps 1.4 Wed Nov 13, 2002 Tokens are now blessed into a separate class to prevent people from trying to call parser methods on them. return_text() has been deprecated in favor of as_is(). Tests and POD updated. Updated Makefile.PL to list HTML::TokeParser as a requirement (whoops!) 1.3 Fri July 5 13:11:00 2002 Added is_tag() method and made the leading slash on an end tag optional. Updated the POD and and tests. 1.2 Fri Jun 21 08:05:00 2002 Turns out the the POD had more errors than I though. There are no changes in the code or the tests, but the POD has been updated. 1.1 Fri Mar 1 15:52:00 2002 Finally got off my duff and added support for the get_tag() method. Also overloaded the is_/(start|end)/_tag methods to allow testing for the tag and tag type at the same time. 0.01 Mon Feb 4 09:31:52 2002 - original version; created by h2xs 1.21 with options -AX -n HTML::TokeParser::Simple HTML-TokeParser-Simple-3.16/Makefile.PL000444000765000024 77412162510100 20077 0ustar00curtispoestaff000000000000# Note: this file was auto-generated by Module::Build::Compat version 0.4003 require 5.006; use ExtUtils::MakeMaker; WriteMakefile ( 'NAME' => 'HTML::TokeParser::Simple', 'VERSION_FROM' => 'lib/HTML/TokeParser/Simple.pm', 'PREREQ_PM' => { 'HTML::Parser' => '3.25', 'HTML::TokeParser' => '2.24', 'Sub::Override' => 0, 'Test::More' => 0 }, 'INSTALLDIRS' => 'site', 'EXE_FILES' => [], 'PL_FILES' => {} ) ; HTML-TokeParser-Simple-3.16/MANIFEST000444000765000024 113312162510100 17264 0ustar00curtispoestaff000000000000Build.PL Changes lib/HTML/TokeParser/Simple.pm lib/HTML/TokeParser/Simple/Token.pm lib/HTML/TokeParser/Simple/Token/Comment.pm lib/HTML/TokeParser/Simple/Token/Declaration.pm lib/HTML/TokeParser/Simple/Token/ProcessInstruction.pm lib/HTML/TokeParser/Simple/Token/Tag.pm lib/HTML/TokeParser/Simple/Token/Tag/End.pm lib/HTML/TokeParser/Simple/Token/Tag/Start.pm lib/HTML/TokeParser/Simple/Token/Text.pm Makefile.PL MANIFEST META.yml Module meta-data (added by MakeMaker) README t/get_tag.t t/get_token.t t/munge_html.t t/constructor.t t/internals.t t/data/test.html xt/pod-coverage.t xt/pod.t META.json HTML-TokeParser-Simple-3.16/META.json000444000765000024 460612162510100 17564 0ustar00curtispoestaff000000000000{ "abstract" : "Easy to use C interface", "author" : [ "Curtis \"Ovid\" Poe " ], "dynamic_config" : 1, "generated_by" : "Module::Build version 0.4003, CPAN::Meta::Converter version 2.120630", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : "2" }, "name" : "HTML-TokeParser-Simple", "prereqs" : { "configure" : { "requires" : { "Module::Build" : "0.40" } }, "runtime" : { "requires" : { "HTML::Parser" : "3.25", "HTML::TokeParser" : "2.24", "Sub::Override" : "0", "Test::More" : "0", "perl" : "5.006" } } }, "provides" : { "HTML::TokeParser::Simple" : { "file" : "lib/HTML/TokeParser/Simple.pm", "version" : "3.16" }, "HTML::TokeParser::Simple::Token" : { "file" : "lib/HTML/TokeParser/Simple/Token.pm", "version" : "3.16" }, "HTML::TokeParser::Simple::Token::Comment" : { "file" : "lib/HTML/TokeParser/Simple/Token/Comment.pm", "version" : "3.16" }, "HTML::TokeParser::Simple::Token::Declaration" : { "file" : "lib/HTML/TokeParser/Simple/Token/Declaration.pm", "version" : "3.15" }, "HTML::TokeParser::Simple::Token::ProcessInstruction" : { "file" : "lib/HTML/TokeParser/Simple/Token/ProcessInstruction.pm", "version" : "3.16" }, "HTML::TokeParser::Simple::Token::Tag" : { "file" : "lib/HTML/TokeParser/Simple/Token/Tag.pm", "version" : "3.16" }, "HTML::TokeParser::Simple::Token::Tag::End" : { "file" : "lib/HTML/TokeParser/Simple/Token/Tag/End.pm", "version" : "3.16" }, "HTML::TokeParser::Simple::Token::Tag::Start" : { "file" : "lib/HTML/TokeParser/Simple/Token/Tag/Start.pm", "version" : "3.16" }, "HTML::TokeParser::Simple::Token::Text" : { "file" : "lib/HTML/TokeParser/Simple/Token/Text.pm", "version" : "3.16" } }, "release_status" : "stable", "resources" : { "license" : [ "http://dev.perl.org/licenses/" ], "repository" : { "url" : "https://github.com/Ovid/html-tokeparser-simple" } }, "version" : "3.16" } HTML-TokeParser-Simple-3.16/META.yml000444000765000024 320112162510100 17402 0ustar00curtispoestaff000000000000--- abstract: 'Easy to use C interface' author: - "Curtis \"Ovid\" Poe " build_requires: {} configure_requires: Module::Build: 0.40 dynamic_config: 1 generated_by: 'Module::Build version 0.4003, CPAN::Meta::Converter version 2.120630' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: 1.4 name: HTML-TokeParser-Simple provides: HTML::TokeParser::Simple: file: lib/HTML/TokeParser/Simple.pm version: 3.16 HTML::TokeParser::Simple::Token: file: lib/HTML/TokeParser/Simple/Token.pm version: 3.16 HTML::TokeParser::Simple::Token::Comment: file: lib/HTML/TokeParser/Simple/Token/Comment.pm version: 3.16 HTML::TokeParser::Simple::Token::Declaration: file: lib/HTML/TokeParser/Simple/Token/Declaration.pm version: 3.15 HTML::TokeParser::Simple::Token::ProcessInstruction: file: lib/HTML/TokeParser/Simple/Token/ProcessInstruction.pm version: 3.16 HTML::TokeParser::Simple::Token::Tag: file: lib/HTML/TokeParser/Simple/Token/Tag.pm version: 3.16 HTML::TokeParser::Simple::Token::Tag::End: file: lib/HTML/TokeParser/Simple/Token/Tag/End.pm version: 3.16 HTML::TokeParser::Simple::Token::Tag::Start: file: lib/HTML/TokeParser/Simple/Token/Tag/Start.pm version: 3.16 HTML::TokeParser::Simple::Token::Text: file: lib/HTML/TokeParser/Simple/Token/Text.pm version: 3.16 requires: HTML::Parser: 3.25 HTML::TokeParser: 2.24 Sub::Override: 0 Test::More: 0 perl: 5.006 resources: license: http://dev.perl.org/licenses/ repository: https://github.com/Ovid/html-tokeparser-simple version: 3.16 HTML-TokeParser-Simple-3.16/README000444000765000024 147512162510100 17024 0ustar00curtispoestaff000000000000HTML/TokeParser/Simple version 1.2 ================================= HTML::TokeParser::Simple is a subclass of HTML::TokeParser that uses easy-to-remember method calls to work with the tokens. Rather than try to remember a bunch of array indices or try to write a bunch of constants for them, you can now do something like: $token->is_start_tag( 'form' ) Instead of $token->[0] eq 'S' and $token->[1] eq 'form' INSTALLATION To install this module type the following: perl Makefile.PL make make test make install DEPENDENCIES This module requires these other modules and libraries: HTML::TokeParser 3.25 COPYRIGHT AND LICENCE Copyright (c) 2002 by Curtis "Ovid" Poe. All rights reserved. This program is free software; you may redistribute it and/or modify it under the same terms as Perl itself. HTML-TokeParser-Simple-3.16/lib000755000765000024 012162510100 16546 5ustar00curtispoestaff000000000000HTML-TokeParser-Simple-3.16/lib/HTML000755000765000024 012162510100 17312 5ustar00curtispoestaff000000000000HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser000755000765000024 012162510100 21371 5ustar00curtispoestaff000000000000HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple.pm000444000765000024 4611012162510100 23337 0ustar00curtispoestaff000000000000package HTML::TokeParser::Simple; use strict; use HTML::TokeParser; use HTML::TokeParser::Simple::Token; use HTML::TokeParser::Simple::Token::Tag; use HTML::TokeParser::Simple::Token::Tag::Start; use HTML::TokeParser::Simple::Token::Tag::End; use HTML::TokeParser::Simple::Token::Text; use HTML::TokeParser::Simple::Token::Comment; use HTML::TokeParser::Simple::Token::Declaration; use HTML::TokeParser::Simple::Token::ProcessInstruction; our $VERSION = '3.16'; use base 'HTML::TokeParser'; # constructors my %FACTORY_CLASSES = ( S => 'HTML::TokeParser::Simple::Token::Tag::Start', E => 'HTML::TokeParser::Simple::Token::Tag::End', T => 'HTML::TokeParser::Simple::Token::Text', C => 'HTML::TokeParser::Simple::Token::Comment', D => 'HTML::TokeParser::Simple::Token::Declaration', PI => 'HTML::TokeParser::Simple::Token::ProcessInstruction', ); sub _croak { my ($proto, $message) = @_; require Carp; Carp::croak($message); } sub new { my ($class, @args) = @_; return 1 == @args ? $class->SUPER::new(@args) : $class->_init(@args); } sub _init { my ($class, $source_type, $source) = @_; my %sources = ( file => sub { $source }, handle => sub { $source }, string => sub { \$source }, url => sub { eval "require LWP::Simple"; $class->_croak("Cannot load LWP::Simple: $@") if $@; my $content = LWP::Simple::get($source); $class->_croak("Could not fetch content from ($source)") unless defined $content; return \$content; }, ); unless (exists $sources{$source_type}) { $class->_croak("Unknown source type ($source_type)"); } return $class->new($sources{$source_type}->()); } sub get_token { my $self = shift; my @args = @_; my $token = $self->SUPER::get_token( @args ); return unless defined $token; if (my $factory_class = $FACTORY_CLASSES{$token->[0]}) { return $factory_class->new($token); } else { # this should never happen $self->_croak("Cannot determine token class for token (@$token)"); } } sub get_tag { my $self = shift; my @args = @_; my $token = $self->SUPER::get_tag( @args ); return unless defined $token; return $token->[0] =~ /^\// ? HTML::TokeParser::Simple::Token::Tag::End->new($token) : HTML::TokeParser::Simple::Token::Tag::Start->new($token); } sub peek { my ($self, $count) = @_; $count ||= 1; unless ($count =~ /^\d+$/) { $self->_croak("Argument to peek() must be a positive integer, not ($count)"); } my $items = 0; my $html = ''; my @tokens; while ( $items++ < $count && defined ( my $token = $self->get_token ) ) { $html .= $token->as_is; push @tokens, $token; } $self->unget_token(@tokens); return $html; } 1; __END__ =head1 NAME HTML::TokeParser::Simple - Easy to use C interface =head1 SYNOPSIS use HTML::TokeParser::Simple; my $p = HTML::TokeParser::Simple->new( $somefile ); while ( my $token = $p->get_token ) { # This prints all text in an HTML doc (i.e., it strips the HTML) next unless $token->is_text; print $token->as_is; } =head1 DESCRIPTION C is an excellent module that's often used for parsing HTML. However, the tokens returned are not exactly intuitive to parse: ["S", $tag, $attr, $attrseq, $text] ["E", $tag, $text] ["T", $text, $is_data] ["C", $text] ["D", $text] ["PI", $token0, $text] To simplify this, C allows the user ask more intuitive (read: more self-documenting) questions about the tokens returned. You can also rebuild some tags on the fly. Frequently, the attributes associated with start tags need to be altered, added to, or deleted. This functionality is built in. Since this is a subclass of C, all C methods are available. To truly appreciate the power of this module, please read the documentation for C and C. =head1 CONTRUCTORS =head2 C The constructor for C can be used just like C's constructor: my $parser = HTML::TokeParser::Simple->new($filename); # or my $parser = HTML::TokeParser::Simple->new($filehandle); # or my $parser = HTML::TokeParser::Simple->new(\$html_string); =head2 C If you wish to be more explicit, there is a new style of constructor available. my $parser = HTML::TokeParser::Simple->new(file => $filename); # or my $parser = HTML::TokeParser::Simple->new(handle => $filehandle); # or my $parser = HTML::TokeParser::Simple->new(string => $html_string); Note that you do not have to provide a reference for the string if using the string constructor. As a convenience, you can also attempt to fetch the HTML directly from a URL. my $parser = HTML::TokeParser::Simple->new(url => 'http://some.url'); This method relies on C. If this module is not found or the page cannot be fetched, the constructor will C. =head1 PARSER METHODS =head2 get_token This method will return the next token that C method would return. However, it will be blessed into a class appropriate which represents the token type. =head2 get_tag This method will return the next token that C method would return. However, it will be blessed into either the L or L class. =head2 peek As of version C<3.14>, you can now C at the upcomings tokens without affecting the state of the parser. By default, C will return the text of the next token, but specifying an integer C<$count> will return the text of the next C<$count> tokens. This is useful when you're trying to debug where you are in a document. warn $parser->peek(3); # show the next 3 tokens =head1 ACCESSORS The following methods may be called on the token object which is returned, not on the parser object. =head2 Boolean Accessors These accessors return true or false. =over 4 =item * C Use this to determine if you have any tag. An optional "tag type" may be passed. This will allow you to match if it's a I tag. The supplied tag is case-insensitive. if ( $token->is_tag ) { ... } Optionally, you may pass a regular expression as an argument. =item * C Use this to determine if you have a start tag. An optional "tag type" may be passed. This will allow you to match if it's a I start tag. The supplied tag is case-insensitive. if ( $token->is_start_tag ) { ... } if ( $token->is_start_tag( 'font' ) ) { ... } Optionally, you may pass a regular expression as an argument. To match all header (h1, h2, ... h6) tags: if ( $token->is_start_tag( qr/^h[123456]$/ ) ) { ... } =item * C Use this to determine if you have an end tag. An optional "tag type" may be passed. This will allow you to match if it's a I end tag. The supplied tag is case-insensitive. When testing for an end tag, the forward slash on the tag is optional. while ( $token = $p->get_token ) { if ( $token->is_end_tag( 'form' ) ) { ... } } Or: while ( $token = $p->get_token ) { if ( $token->is_end_tag( '/form' ) ) { ... } } Optionally, you may pass a regular expression as an argument. =item * C Use this to determine if you have text. Note that this is I to be confused with the C (I) method described below! C will identify text that the user typically sees display in the Web browser. =item * C Are you still reading this? Nobody reads POD. Don't you know you're supposed to go to CLPM, ask a question that's answered in the POD and get flamed? It's a rite of passage. Really. C is used to identify comments. See the HTML::Parser documentation for more information about comments. There's more than you might think. =item * C This will match the DTD at the top of your HTML. (You I use DTD's, don't you?) =item * C Process Instructions are from XML. This is very handy if you need to parse out PHP and similar things with a parser. Currently, there appear to be some problems with process instructions. You can override C if you need to. =item * C This is a shorthand for C. =back =head2 Data Accessors Some of these were originally C methods, but that name was not only unwieldy, but also went against reasonable conventions. The C methods listed below still have C methods available for backwards compatibility reasons, but they merely call their C counterpart. For example, calling C actually calls C internally. =over 4 =item * C Do you have a start tag or end tag? This will return the type (lower case). Note that this is I the same as the C method on the actual parser object. =item * C If you have a start tag, this will return a hash ref with the attribute names as keys and the values as the values. If you pass in an attribute name, it will return the value for just that attribute. Returns false if the token is not a start tag. =item * C For a start tag, this is an array reference with the sequence of the attributes, if any. Returns false if the token is not a start tag. =item * C This method has been heavily deprecated (for a couple of years) in favor of C. Programmers were getting confused over the difference between C, C, and some parser methods such as C and friends. Using this method still succeeds, but will now carp and B in the next major release of this module. =item * C This is the exact text of whatever the token is representing. =item * C For processing instructions, this will return the token found immediately after the opening tag. Example: For . Returns false if the token is not a process instruction. =back =head1 MUTATORS The C and C methods allow the programmer to rewrite start tag attributes on the fly. It should be noted that bad HTML will be "corrected" by this. Specifically, the new tag will have all attributes lower-cased with the values properly quoted. Self-closing tags (e.g. Ehr /E) are also handled correctly. Some older browsers require a space prior to the final slash in a self-closed tag. If such a space is detected in the original HTML, it will be preserved. Calling a mutator on an token type that does not support that property is a no-op. For example: if ($token->is_comment) { $token->set_attr(foo => 'bar'); # does nothing } =over 4 =item * C This method attempts to delete the attribute specified. It will silently fail if called on anything other than a start tag. The argument is case-insensitive, but must otherwise be an exact match of the attribute you are attempting to delete. If the attribute is not found, the method will return without changing the tag. # $token->delete_attr('bgcolor'); print $token->as_is; # After this method is called, if successful, the C, C and C methods will all return updated results. =item * C This method will set the value of an attribute. If the attribute is not found, then C will have the new attribute listed at the end. #

$token->set_attr(class => 'some_class'); print $token->as_is; #

# $token->set_attr('bgcolor','red'); print $token->as_is; # After this method is called, if successful, the C, C and C methods will all return updated results. =item * C Under the premise that C methods should accept what their corresponding C methods emit, the following works: $tag->set_attr($tag->get_attr); Theoretically that's a no-op and for purposes of rendering HTML, it should be. However, internally this calls C<$tag-Erewrite_tag>, so see that method to understand how this may affect you. Of course, this is useless if you want to actually change the attributes, so you can do this: my $attrs = { class => 'headline', valign => 'top' }; $token->set_attr($attrs) if $token->is_start_tag('td') && $token->get_attr('class') eq 'stories'; =item * C This method rewrites the tag. The tag name and the name of all attributes will be lower-cased. Values that are not quoted with double quotes will be. This may be called on both start or end tags. Note that both C and C call this method prior to returning. If called on a token that is not a tag, it simply returns. Regardless of how it is called, it returns the token. # $token->rewrite_tag; print $token->as_is; # A quick cleanup of sloppy HTML is now the following: my $parser = HTML::TokeParser::Simple->new( string => $ugly_html ); while (my $token = $parser->get_token) { $token->rewrite_tag; print $token->as_is; } =back =head1 PARSER VERSUS TOKENS The parser returns tokens that are blessed into appropriate classes. Some people get confused and try to call parser methods on tokens and token methods on the parser. To prevent this, C versions 1.4 and above now bless all tokens into appropriate token classes. Please keep this in mind while using this module (and many thanks to PodMaster L for pointing out this issue to me.) =head1 EXAMPLES =head2 Finding comments For some strange reason, your Pointy-Haired Boss (PHB) is convinced that the graphics department is making fun of him by embedding rude things about him in HTML comments. You need to get all HTML comments from the HTML. use strict; use HTML::TokeParser::Simple; my @html_docs = glob( "*.html" ); open PHB, "> phbreport.txt" or die "Cannot open phbreport for writing: $!"; foreach my $doc ( @html_docs ) { print "Processing $doc\n"; my $p = HTML::TokeParser::Simple->new( file => $doc ); while ( my $token = $p->get_token ) { next unless $token->is_comment; print PHB $token->as_is, "\n"; } } close PHB; =head2 Stripping Comments Uh oh. Turns out that your PHB was right for a change. Many of the comments in the HTML weren't very polite. Since your entire graphics department was just fired, it falls on you need to strip those comments from the HTML. use strict; use HTML::TokeParser::Simple; my $new_folder = 'no_comment/'; my @html_docs = glob( "*.html" ); foreach my $doc ( @html_docs ) { print "Processing $doc\n"; my $new_file = "$new_folder$doc"; open PHB, "> $new_file" or die "Cannot open $new_file for writing: $!"; my $p = HTML::TokeParser::Simple->new( $file => doc ); while ( my $token = $p->get_token ) { next if $token->is_comment; print PHB $token->as_is; } close PHB; } =head2 Changing form tags Your company was foo.com and now is bar.com. Unfortunately, whoever wrote your HTML decided to hardcode "http://www.foo.com/" into the C attribute of the form tags. You need to change it to "http://www.bar.com/". use strict; use HTML::TokeParser::Simple; my $new_folder = 'new_html/'; my @html_docs = glob( "*.html" ); foreach my $doc ( @html_docs ) { print "Processing $doc\n"; my $new_file = "$new_folder$doc"; open FILE, "> $new_file" or die "Cannot open $new_file for writing: $!"; my $p = HTML::TokeParser::Simple->new( file => $doc ); while ( my $token = $p->get_token ) { if ( $token->is_start_tag('form') ) { my $action = $token->get_attr(action); $action =~ s/www\.foo\.com/www.bar.com/; $token->set_attr('action', $action); } print FILE $token->as_is; } close FILE; } =head1 CAVEATS For compatibility reasons with C, methods that return references are violating encapsulation and altering the references directly B alter the state of the object. Subsequent calls to C can thus have unexpected results. Do not alter these references directly unless you are following behavior described in these docs. In the future, certain methods such as C, C and others may return a copy of the reference rather than the original reference. This behavior has not yet been changed in order to maintain compatibility with previous versions of this module. At the present time, your author is not aware of anyone taking advantage of this "feature," but it's better to be safe than sorry. Use of C<$HTML::Parser::VERSION> which is less than 3.25 may result in incorrect behavior as older versions do not always handle XHTML correctly. It is the programmer's responsibility to verify that the behavior of this code matches the programmer's needs. Note that C processes text in 512 byte chunks. This sometimes will cause strange behavior and cause text to be broken into more than one token. You can suppress this behavior with the following command: $p->unbroken_text( [$bool] ); See the C documentation and http://www.perlmonks.org/index.pl?node_id=230667 for more information. =head1 BUGS There are no known bugs, but that's no guarantee. Address bug reports and comments to: Eeop_divo_sitruc@yahoo.comE. When sending bug reports, please provide the version of C, C, C, the version of Perl, and the version of the operating system you are using. Reverse the name to email the author. =head1 SUBCLASSING You may wish to change the behavior of this module. You probably do not want to subclass C. Instead, you'll want to subclass one of the token classes. C is the base class for all tokens. Global behavioral changes should go there. Otherwise, see the appropriate token class for the behavior you wish to alter. =head1 SEE ALSO L L L L L L =head1 COPYRIGHT Copyright (c) 2004 by Curtis "Ovid" Poe. All rights reserved. This program is free software; you may redistribute it and/or modify it under the same terms as Perl itself =head1 AUTHOR Curtis "Ovid" Poe Eeop_divo_sitruc@yahoo.comE Reverse the name to email the author. =cut HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple000755000765000024 012162510100 22622 5ustar00curtispoestaff000000000000HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple/Token.pm000444000765000024 436712162510100 24407 0ustar00curtispoestaff000000000000package HTML::TokeParser::Simple::Token; use strict; our $VERSION = '3.16'; sub new { my ($class, $token) = @_; $class->_croak("This class should not be instantiated") if __PACKAGE__ eq $class; return bless $token, $class; } sub _croak { my ($proto, $message) = @_; require Carp; Carp::croak($message); } sub _carp { my ($proto, $message) = @_; require Carp; Carp::carp($message); } sub is_tag {} sub is_start_tag {} sub is_end_tag {} sub is_text {} sub is_comment {} sub is_declaration {} sub is_pi {} sub is_process_instruction {} sub rewrite_tag { shift } sub delete_attr {} sub set_attr {} sub get_tag {} sub return_tag {} # deprecated sub get_attr {} sub return_attr {} # deprecated sub get_attrseq {} sub return_attrseq {} # deprecated sub get_token0 {} sub return_token0 {} # deprecated # get_foo methods sub return_text { my ($self) = @_; $self->_carp('return_text() is deprecated. Use as_is() instead'); goto &as_is; } sub as_is { return shift->[-1] } 1; __END__ =head1 NAME HTML::TokeParser::Simple::Token - Base class for C tokens. =head1 SYNOPSIS use HTML::TokeParser::Simple; my $p = HTML::TokeParser::Simple->new( $somefile ); while ( my $token = $p->get_token ) { # This prints all text in an HTML doc (i.e., it strips the HTML) next unless $token->is_text; print $token->as_is; } =head1 DESCRIPTION This is the base class for all returned tokens. It should never be instantiated directly. In fact, it will C if it is. =head1 METHODS The following list of methods are provided by this class. Most of these are stub methods which must be overridden in a subclass. See L for descriptions of these methods. =over 4 =item * as_is =item * delete_attr =item * get_attr =item * get_attrseq =item * get_tag =item * get_token0 =item * is_comment =item * is_declaration =item * is_end_tag =item * is_pi =item * is_process_instruction =item * is_start_tag =item * is_tag =item * is_text =item * return_attr =item * return_attrseq =item * return_tag =item * return_text =item * return_token0 =item * rewrite_tag =item * set_attr =back HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple/Token000755000765000024 012162510100 23702 5ustar00curtispoestaff000000000000HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple/Token/Comment.pm000444000765000024 141212162510100 25775 0ustar00curtispoestaff000000000000package HTML::TokeParser::Simple::Token::Comment; use strict; our $VERSION = '3.16'; use base 'HTML::TokeParser::Simple::Token'; sub is_comment { 1 } 1; __END__ =head1 NAME HTML::TokeParser::Simple::Token::Comment - Token.pm comment class. =head1 SYNOPSIS use HTML::TokeParser::Simple; my $p = HTML::TokeParser::Simple->new( $somefile ); while ( my $token = $p->get_token ) { # This prints all text in an HTML doc (i.e., it strips the HTML) next unless $token->is_text; print $token->as_is; } =head1 DESCRIPTION This is the class for comment tokens. See L for detailed information about comments. =head1 OVERRIDDEN METHODS =head2 is_comment C will return true if the token is the DTD at the top of the HTML. =cut HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple/Token/Declaration.pm000444000765000024 135112162510100 26622 0ustar00curtispoestaff000000000000package HTML::TokeParser::Simple::Token::Declaration; use strict; our $VERSION = '3.15'; use base 'HTML::TokeParser::Simple::Token'; sub is_declaration { 1 } 1; __END__ =head1 NAME HTML::TokeParser::Simple::Token::Declaration - Token.pm declaration class. =head1 SYNOPSIS use HTML::TokeParser::Simple; my $p = HTML::TokeParser::Simple->new( $somefile ); while ( my $token = $p->get_token ) { # This prints all text in an HTML doc (i.e., it strips the HTML) next unless $token->is_text; print $token->as_is; } =head1 DESCRIPTION This is the declaration class for tokens. =head1 OVERRIDDEN METHODS =head2 is_declaration C will return true if the token is the DTD at the top of the HTML. =cut HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple/Token/ProcessInstruction.pm000444000765000024 223712162510100 30261 0ustar00curtispoestaff000000000000package HTML::TokeParser::Simple::Token::ProcessInstruction; use strict; our $VERSION = '3.16'; use base 'HTML::TokeParser::Simple::Token'; sub return_token0 { goto &get_token0 } # deprecated sub get_token0 { return shift->[1]; } sub is_pi { 1 } sub is_process_instruction { 1 } 1; __END__ =head1 NAME HTML::TokeParser::Simple::Token::ProcessInstruction - Token.pm process instruction class. =head1 SYNOPSIS use HTML::TokeParser::Simple; my $p = HTML::TokeParser::Simple->new( $somefile ); while ( my $token = $p->get_token ) { # This prints all text in an HTML doc (i.e., it strips the HTML) next unless $token->is_text; print $token->as_is; } =head1 DESCRIPTION Process Instructions are from XML. This is very handy if you need to parse out PHP and similar things with a parser. Currently, there appear to be some problems with process instructions. You can override this class if you need finer grained handling of process instructions. C and C both return true. =head1 OVERRIDDEN METHODS =over 4 =item * get_token0 =item * is_pi =item * is_process_instruction =item * return_token0 =back =cut HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple/Token/Tag.pm000444000765000024 300312162510100 25104 0ustar00curtispoestaff000000000000package HTML::TokeParser::Simple::Token::Tag; use strict; our $VERSION = '3.16'; use base 'HTML::TokeParser::Simple::Token'; my %INSTANCE; sub new { my ($class, $object) = @_; $class->_croak("This is a base class that should not be instantiated") if __PACKAGE__ eq $class; my $self = bless $object, $class; $self->_init; } sub _get_attrseq { return [] } sub _get_attr { return {} } sub _set_text { my $self = shift; $self->[-1] = shift; return $self; } # attribute munging methods # get_foo methods sub return_text { carp('return_text() is deprecated. Use as_is() instead'); goto &as_is; } sub as_is { return shift->_get_text; } sub get_tag { return shift->_get_tag; } 1; __END__ =head1 NAME HTML::TokeParser::Simple::Token::Tag - Token.pm tag class. =head1 SYNOPSIS use HTML::TokeParser::Simple; my $p = HTML::TokeParser::Simple->new( $somefile ); while ( my $token = $p->get_token ) { # This prints all text in an HTML doc (i.e., it strips the HTML) next unless $token->is_text; print $token->as_is; } =head1 DESCRIPTION This is the base class for start and end tokens. It should not be instantiated. See C and C for details. =head1 OVERRIDDEN METHODS The following list of methods are provided by this class. See L for descriptions of these methods. =over 4 =item * as_is =item * get_tag =item * return_text =back HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple/Token/Text.pm000444000765000024 136012162510100 25321 0ustar00curtispoestaff000000000000package HTML::TokeParser::Simple::Token::Text; use strict; our $VERSION = '3.16'; use base 'HTML::TokeParser::Simple::Token'; sub as_is { return shift->[1]; } sub is_text { 1 } 1; __END__ =head1 NAME HTML::TokeParser::Simple::Token::Text - Token.pm text class. =head1 SYNOPSIS use HTML::TokeParser::Simple; my $p = HTML::TokeParser::Simple->new( $somefile ); while ( my $token = $p->get_token ) { # This prints all text in an HTML doc (i.e., it strips the HTML) next unless $token->is_text; print $token->as_is; } =head1 DESCRIPTION This class represents "text" tokens. See the C documentation for details. =head1 OVERRIDDEN METHODS =over 4 =item * as_is =item * is_text =back =cut HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple/Token/Tag000755000765000024 012162510100 24415 5ustar00curtispoestaff000000000000HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple/Token/Tag/End.pm000444000765000024 501612162510100 25620 0ustar00curtispoestaff000000000000package HTML::TokeParser::Simple::Token::Tag::End; use strict; our $VERSION = '3.16'; use base 'HTML::TokeParser::Simple::Token::Tag'; my %TOKEN = ( tag => 1, text => 2 ); # in order to maintain the 'drop-in replacement' ability with HTML::TokeParser, # we cannot alter the array refs. Thus we must store instance data here. Ugh. my %INSTANCE; sub _init { my $self = shift; if ('E' eq $self->[0]) { $INSTANCE{$self}{offset} = 0; $INSTANCE{$self}{tag} = $self->[1]; } else { $INSTANCE{$self}{offset} = -1; my $tag = $self->[0]; $tag =~ s/^\///; $INSTANCE{$self}{tag} = $tag; } return $self; } sub _get_offset { return $INSTANCE{+shift}{offset} } sub _get_text { return shift->[-1] } sub _get_tag { my $self = shift; return $INSTANCE{$self}{tag}; } sub DESTROY { delete $INSTANCE{+shift} } sub rewrite_tag { my $self = shift; # capture the final slash if the tag is self-closing my ($self_closing) = $self->_get_text =~ m{(\s?/)>$}; $self_closing ||= ''; my $first = $self->is_end_tag ? '/' : ''; my $tag = sprintf '<%s%s%s>', $first, $self->get_tag, $self_closing; $self->_set_text($tag); return $self; } sub return_text { require Carp; Carp::carp('return_text() is deprecated. Use as_is() instead'); goto &as_is; } sub as_is { return shift->_get_text; } sub get_tag { return shift->_get_tag; } # is_foo methods sub is_tag { my $self = shift; return $self->is_end_tag( @_ ); } sub is_end_tag { my ($self, $tag) = @_; return $tag ? $self->_match_tag($tag) : 1; } sub _match_tag { my ($self, $tag) = @_; if ('Regexp' eq ref $tag) { return $self->_get_tag =~ $tag; } else { $tag = lc $tag; $tag =~ s/^\///; return $self->_get_tag eq $tag; } } 1; __END__ =head1 NAME HTML::TokeParser::Simple::Token::Tag::End - Token.pm "end tag" class. =head1 SYNOPSIS use HTML::TokeParser::Simple; my $p = HTML::TokeParser::Simple->new( $somefile ); while ( my $token = $p->get_token ) { # This prints all text in an HTML doc (i.e., it strips the HTML) next unless $token->is_text; print $token->as_is; } =head1 DESCRIPTION This class does most of the heavy lifting for C. See the C docs for details. =head1 OVERRIDDEN METHODS =over 4 =item * as_is =item * get_tag =item * is_end_tag =item * is_tag =item * return_text =item * rewrite_tag =back =cut HTML-TokeParser-Simple-3.16/lib/HTML/TokeParser/Simple/Token/Tag/Start.pm000444000765000024 1054012162510100 26225 0ustar00curtispoestaff000000000000package HTML::TokeParser::Simple::Token::Tag::Start; use strict; our $VERSION = '3.16'; use base 'HTML::TokeParser::Simple::Token::Tag'; use HTML::Entities qw/encode_entities/; my %TOKEN = ( tag => 1, attr => 2, attrseq => 3, text => 4 ); my %INSTANCE; sub _init { my $self = shift; if ('S' eq $self->[0]) { $INSTANCE{$self}{offset} = 0; $INSTANCE{$self}{tag} = $self->[1]; } else { $INSTANCE{$self}{offset} = -1; my $tag = $self->[0]; $tag =~ s/^\///; $INSTANCE{$self}{tag} = $tag; } return $self; } sub _get_offset { return $INSTANCE{+shift}{offset} } sub _get_text { return shift->[-1] } sub _get_tag { my $self = shift; return $INSTANCE{$self}{tag}; } sub _get_attrseq { my $self = shift; my $index = $TOKEN{attrseq} + $self->_get_offset; return $self->[$index]; } sub _get_attr { my $self = shift; my $index = $TOKEN{attr} + $self->_get_offset; return $self->[$index]; } sub DESTROY { delete $INSTANCE{+shift} } sub return_attr { goto &get_attr } sub return_attrseq { goto &get_attrseq } sub return_tag { goto &get_tag } # attribute munging methods sub set_attr { my ($self, $name, $value) = @_; return 'HASH' eq ref $name ? $self->_set_attr_from_hashref($name) : $self->_set_attr_from_string($name, $value); } sub _set_attr_from_string { my ($self, $name, $value) = @_; $name = lc $name; my $attr = $self->get_attr; my $attrseq = $self->get_attrseq; unless (exists $attr->{$name}) { push @$attrseq => $name; } $attr->{$name} = $value; $self->rewrite_tag; } sub _set_attr_from_hashref { my ($self, $attr_hash) = @_; while (my ($attr, $value) = each %$attr_hash) { $self->set_attr($attr, $value); } return $self; } sub rewrite_tag { my $self = shift; my $attr = $self->get_attr; my $attrseq = $self->get_attrseq; # capture the final slash if the tag is self-closing my ($self_closing) = $self->_get_text =~ m{(\s?/)>$}; $self_closing ||= ''; my $tag = ''; foreach ( @$attrseq ) { next if $_ eq '/'; # is this a bug in HTML::TokeParser? $tag .= sprintf qq{ %s="%s"} => $_, encode_entities($attr->{$_}); } my $first = $self->is_end_tag ? '/' : ''; $tag = sprintf '<%s%s%s%s>', $first, $self->get_tag, $tag, $self_closing; $self->_set_text($tag); return $self; } sub delete_attr { my ($self,$name) = @_; $name = lc $name; my $attr = $self->get_attr; return unless exists $attr->{$name}; delete $attr->{$name}; my $attrseq = $self->get_attrseq; @$attrseq = grep { $_ ne $name } @$attrseq; $self->rewrite_tag; } # get_foo methods sub return_text { require Carp; Carp::carp('return_text() is deprecated. Use as_is() instead'); goto &as_is; } sub as_is { return shift->_get_text; } sub get_tag { return shift->_get_tag; } sub get_token0 { return ''; } sub get_attr { my $self = shift; my $attributes = $self->_get_attr; return @_ ? $attributes->{lc shift} : $attributes; } sub get_attrseq { my $self = shift; $self->_get_attrseq; } # is_foo methods sub is_tag { my $self = shift; return $self->is_start_tag( @_ ); } sub is_start_tag { my ($self, $tag) = @_; return $tag ? $self->_match_tag($tag) : 1; } sub _match_tag { my ($self, $tag) = @_; return 'Regexp' eq ref $tag ? $self->_get_tag =~ $tag : $self->_get_tag eq lc $tag; } 1; __END__ =head1 NAME HTML::TokeParser::Simple::Token::Tag::Start - Token.pm "start tag" class. =head1 SYNOPSIS use HTML::TokeParser::Simple; my $p = HTML::TokeParser::Simple->new( $somefile ); while ( my $token = $p->get_token ) { # This prints all text in an HTML doc (i.e., it strips the HTML) next unless $token->is_text; print $token->as_is; } =head1 DESCRIPTION This class does most of the heavy lifting for C. See the C docs for details. =head1 OVERRIDDEN METHODS =over 4 =item * as_is =item * delete_attr =item * get_attr =item * get_attrseq =item * get_tag =item * get_token0 =item * is_start_tag =item * is_tag =item * return_attr =item * return_attrseq =item * return_tag =item * return_text =item * rewrite_tag =item * set_attr =back =cut HTML-TokeParser-Simple-3.16/t000755000765000024 012162510100 16243 5ustar00curtispoestaff000000000000HTML-TokeParser-Simple-3.16/t/constructor.t000555000765000024 456112162510100 21163 0ustar00curtispoestaff000000000000#!/usr/bin/perl -w use strict; use warnings; use Sub::Override; use Test::More tests => 15; my $CLASS; BEGIN { chdir 't' if -d 't'; unshift @INC => '../lib'; $CLASS = 'HTML::TokeParser::Simple'; use_ok($CLASS) || die; } can_ok($CLASS, 'new'); eval { $CLASS->new(unknown_source_type => 'asdf') }; like( $@, qr/^Unknown source type \(unknown_source_type\)/, '... and calling it with an unknown source type should croak()'); my $test_html_file = 'data/test.html'; ok(my $parser = $CLASS->new(file => $test_html_file), '... we should be able to specify a filename with the constructor'); isa_ok($parser, $CLASS); my $token = $parser->get_tag('body'); is_deeply( $token->get_attr, { alink => "#0000ff", bgcolor => "#ffffff" }, '... and it should be able to parse the file'); undef $parser; open FILE, '<', $test_html_file or die "Cannot open ($test_html_file) for reading: $!"; ok($parser = $CLASS->new(handle => \*FILE), '... we should be able to specify a filehandle with the constructor'); isa_ok($parser, $CLASS); $token = $parser->get_tag('body'); is_deeply( $token->get_attr, { alink => "#0000ff", bgcolor => "#ffffff" }, '... and it should be able to parse the file'); my $html = '

'; ok($parser = $CLASS->new(string => $html), '... we should be able to specify a string with the constructor'); isa_ok($parser, $CLASS); $token = $parser->get_tag('a'); is_deeply( $token->get_attr, { href => "foo.html" }, '... and it should be able to parse the file'); eval "require LWP::Simple"; SKIP: { skip "Cannot load LWP::Simple", 3 if $@; my $override = Sub::Override->new( 'LWP::Simple::get' => sub($) { return '

' } ); ok($parser = $CLASS->new(url => 'http://bogus.url'), '... we should be able to specify a URL with the constructor'); $token = $parser->get_tag('a'); is_deeply( $token->get_attr, { href => "bar.html" }, '... and it should be able to parse the file'); $override->restore; $override = Sub::Override->new( 'LWP::Simple::get' => sub($) { undef } ); eval { $CLASS->new(url => 'http://bogus.url') }; like( $@, qr{\QCould not fetch content from (http://bogus.url)\E}, '... but the URL constructor should croak if we cannot fetch the content'); } HTML-TokeParser-Simple-3.16/t/get_tag.t000555000765000024 1253712162510100 20232 0ustar00curtispoestaff000000000000#!/usr/bin/perl use strict; use warnings; use Test::More tests => 59; my $CLASS; BEGIN { chdir 't' if -d 't'; unshift @INC => '../lib/'; $CLASS = 'HTML::TokeParser::Simple'; use_ok($CLASS) || die; } my $TOKEN_CLASS = "${CLASS}::Token"; my $TAG_CLASS = "${TOKEN_CLASS}::Tag"; can_ok($CLASS, 'new'); my $p = $CLASS->new(\*DATA); isa_ok( $p, $CLASS => '... and the object it returns' ); can_ok($p, 'get_tag'); my $token = $p->get_tag; isa_ok( $token, $TAG_CLASS => '... and the object it returns' ); my $old_token = $token; can_ok($token, 'is_declaration'); ok(! $token->is_declaration, '... and it should return false' ); is_deeply($token,$old_token, '... and the token should not be changed' ); can_ok($token, 'is_start_tag'); ok( $token->is_start_tag('html'), '... and it should correctly identify a given start tag' ); ok(!$token->is_start_tag('fake'), "... bug it shouldn't give false positives" ); ok( $token->is_start_tag, '... and it should correctly identify a start tag' ); can_ok($token, 'is_tag'); ok($token->is_tag('html'), '... and it should identify a token as a given tag' ); ok(!$token->is_tag('fake'), "... and it shouldn't give false positives"); ok($token->is_tag, '... and it should identify that the token is a tag'); can_ok($token, 'get_tag'); ok(my $tag = $token->get_tag, '... and calling it should succeed' ); is($tag, 'html', '... by returning the correct tag'); can_ok($token, 'return_tag'); ok($tag = $token->return_tag, '... and calling this deprecated method should succeed' ); is($tag, 'html', '... by returning the correct tag'); # important to remember that whitespace counts as a token. $token = $p->get_tag for ( 1 .. 2 ); can_ok($token, 'is_comment'); ok(!$token->is_comment, "... but it shouldn't have false positives"); can_ok($token, 'return_text'); { my $warning; local $SIG{__WARN__} = sub { $warning = shift }; is($token->return_text, '', '... and it should return the correct text' ); ok( $warning, '... while issuing a warning'); like($warning, qr/\Qreturn_text() is deprecated. Use as_is() instead\E/, '... with an appropriate error message'); } can_ok($token, 'as_is'); is( $token->as_is, '<title>', '... and it should return the correct text'); $token = $p->get_tag; can_ok($token, 'is_end_tag'); ok( $token->is_end_tag('/title'), '... and it should identify a particular end tag' ); ok( $token->is_end_tag('title'), '... even without a slash' ); ok( $token->is_end_tag('TITLE'), '... regardless of case' ); ok( $token->is_end_tag, '... and should identify the token as just being an end tag' ); $token = $p->get_tag for 1..2; can_ok($token, 'get_attr'); my $attr = $token->get_attr; is( ref $attr , 'HASH', '... and it should return a hashref' ); is( $attr->{'bgcolor'}, '#ffffff','... correctly identifying the bgcolor' ); is( $attr->{'alink'}, '#0000ff', '... and the alink color' ); is($token->get_attr('bgcolor'), '#ffffff', '... and fetching a specific attribute should succeed'); is($token->get_attr('BGCOLOR'), '#ffffff', '... and fetching a specific attribute should succeed'); is($token->get_attr('alink'), '#0000ff', '... and fetching a specific attribute should succeed'); can_ok($token, 'return_attr'); $attr = $token->return_attr; is( ref $attr , 'HASH', '... and calling this deprecated method should return a hashref' ); is( $attr->{'bgcolor'}, '#ffffff','... correctly identifying the bgcolor' ); is( $attr->{'alink'}, '#0000ff', '... and the alink color' ); is($token->return_attr('bgcolor'), '#ffffff', '... and fetching a specific attribute should succeed'); is($token->return_attr('BGCOLOR'), '#ffffff', '... and fetching a specific attribute should succeed'); is($token->return_attr('alink'), '#0000ff', '... and fetching a specific attribute should succeed'); can_ok($token, 'set_attr'); $attr = $token->get_attr; $attr->{bgcolor} = "whatever"; $token->set_attr($attr); is($token->as_is, '<body alink="#0000ff" bgcolor="whatever">', 'set_attr() should accept what get_attr() returns'); can_ok($token, 'get_attrseq'); my $arrayref = $token->get_attrseq; is( ref $arrayref, 'ARRAY', '... and it should return an array reference' ); is( scalar @{$arrayref}, 2, '... with the correct number of elements' ); is( "@$arrayref", 'alink bgcolor', '... in the correct order' ); can_ok($token, 'return_attrseq'); $arrayref = $token->return_attrseq; is( ref $arrayref, 'ARRAY', '... and calling this deprecated method should return an array reference' ); is( scalar @{$arrayref}, 2, '... with the correct number of elements' ); is( "@$arrayref", 'alink bgcolor', '... in the correct order' ); __DATA__ <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN"> <html> <head> <!-- This is a comment --> <title>This is a title "; ?>

Do not edit this HTML lest the tests fail!!!

HTML-TokeParser-Simple-3.16/t/get_token.t000555000765000024 1222412162510100 20570 0ustar00curtispoestaff000000000000#!/usr/bin/perl -w use strict; use warnings; use Test::More tests => 56; #use Test::More 'no_plan'; my $CLASS; BEGIN { chdir 't' if -d 't'; unshift @INC => '../lib'; $CLASS = 'HTML::TokeParser::Simple'; use_ok($CLASS) || die; } my $TOKEN_CLASS = "${CLASS}::Token"; can_ok($CLASS, 'new'); my $p = $CLASS->new(\*DATA); isa_ok( $p, $CLASS => '... and the return value' ); can_ok($p, 'get_token'); my $token = $p->get_token; isa_ok( $token, $TOKEN_CLASS => '... and the return value' ); can_ok($token, 'is_declaration'); ok( $token->is_declaration, '... and it should correctly identify one' ); $token = $p->get_token for 1 .. 2; can_ok($token, 'is_start_tag'); ok( $token->is_start_tag('html'), '... and it should identify the token as a particular start tag' ); ok( $token->is_start_tag, '... or as a start tag in general'); ok(!$token->is_start_tag('fake'), '... but it should not return false positives'); can_ok($token, 'get_tag'); is( $token->get_tag, 'html', '... and it should return the correct tag' ); can_ok($token, 'return_tag'); is( $token->return_tag, 'html', '... and calling this deprecated method should return the correct tag' ); can_ok($p, 'peek'); is $p->peek, $p->peek, '... and calling it should not change the state of the parser'; is $p->peek(1000), $p->peek(1000),'... even if we try to peek beyond the end of the document'; like $p->peek, qr/^\s+$/, 'Calling peek without arguments should return the next token'; like $p->peek(4), qr/^\s+\s+/, '... and passing an integer value should return the next X tokens'; eval { $p->peek('html') }; like $@, qr/^\QArgument to peek() must be a positive integer, not (html)/, '... but passing it a non-integer value should croak'; # important to remember that whitespace counts as a token. $token = $p->get_token for 1 .. 4; can_ok($token, 'is_comment'); ok( $token->is_comment, '... and it should correctly identify a comment' ); can_ok($token, 'return_text'); { my $warning; local $SIG{__WARN__} = sub { $warning = shift }; is($token->return_text, '', '... and it should return the correct text' ); ok( $warning, '... while issuing a warning'); like($warning, qr/\Qreturn_text() is deprecated. Use as_is() instead\E/, '... with an appropriate error message'); } can_ok($token, 'as_is'); is( $token->as_is, '', '... and it should return the correct text' ); $token = $p->get_token for ( 1..3 ); can_ok($token, 'is_text'); ok( $token->is_text, '... and it should correctly identify text'); $token = $p->get_token; can_ok($token, 'is_end_tag'); ok( $token->is_end_tag('/title'), '... and it should identify a particular end tag' ); ok( $token->is_end_tag('title'), '... even without a slash' ); ok( $token->is_end_tag('TITLE'), '... regardless of case' ); ok( $token->is_end_tag, '... and should identify the token as just being an end tag' ); $token = $p->get_token for ( 1..2 ); can_ok($token, 'is_pi'); ok( $token->is_pi, '... and it should correctly identify them' ); my $non_start_tag = $token; # squirrel this away for the set_attr test can_ok($token, 'is_process_instruction'); ok( $token->is_process_instruction, '... and it should correctly identify them' ); can_ok($token, 'get_token0'); # diag($token->get_token0); # more research needed. This doesn't seem to return everything correctly ok( $token->get_token0, '... and it should return something'); can_ok($token, 'return_token0'); # diag($token->return_token0); # more research needed. This doesn't seem to return everything correctly ok( $token->return_token0, '... and calling this deprecated method should return something'); do { $token = $p->get_token } until $token->is_start_tag( 'body' ); can_ok($token, 'get_attr'); my $attr = $token->get_attr; is( ref $attr , 'HASH', '... and it should return a hashref' ); is( $attr->{'bgcolor'}, '#ffffff','... correctly identifying the bgcolor' ); is( $attr->{'alink'}, '#0000ff', '... and the alink color' ); can_ok($token, 'get_attrseq'); my $arrayref = $token->get_attrseq; is( ref $arrayref, 'ARRAY', '... and it should return an array reference' ); is( scalar @{$arrayref}, 2, '... with the correct number of elements' ); is( "@$arrayref", 'alink bgcolor','... in the correct order' ); can_ok($token, 'return_attrseq'); $arrayref = $token->return_attrseq; is( ref $arrayref, 'ARRAY', '... and calling this deprecated method should return an array reference' ); is( scalar @{$arrayref}, 2, '... with the correct number of elements' ); is( "@$arrayref", 'alink bgcolor','... in the correct order' ); __DATA__ This is a title "; ?>

Do not edit this HTML lest the tests fail!!!



HTML-TokeParser-Simple-3.16/t/internals.t000555000765000024 320112162510100 20563 0ustar00curtispoestaff000000000000 #!/usr/bin/perl -w use strict; use warnings; use Sub::Override; use Test::More tests => 16; my $CLASS; BEGIN { chdir 't' if -d 't'; unshift @INC => '../lib'; $CLASS = 'HTML::TokeParser::Simple'; use_ok($CLASS) || die; } my $test_html_file = 'data/test.html'; my $parser1 = $CLASS->new(file => $test_html_file); my $token1; for (0 .. 16) { $token1 = $parser1->get_token; } my $parser2 = $CLASS->new(file => $test_html_file); my $token2 = $parser2->get_tag('body'); can_ok($token1, '_get_text'); can_ok($token2, '_get_text'); is($token1->_get_text, $token2->_get_text, '... and _get_text should return the same value regardless of source'); # can_ok($token1, '_get_attrseq'); can_ok($token2, '_get_attrseq'); is_deeply($token1->_get_attrseq, $token2->_get_attrseq, '... and _get_attrseq should return the same value regardless of source'); my @attrseq = qw/alink bgcolor/; is_deeply($token1->_get_attrseq, \@attrseq, '... and it should match the correct attribute sequence'); can_ok($token1, '_get_attr'); can_ok($token2, '_get_attr'); is_deeply($token1->_get_attr, $token2->_get_attr, '... and _get_attr should return the same value regardless of source'); my %attr = ( alink => '#0000ff', bgcolor => '#ffffff', ); is_deeply($token1->_get_attr, \%attr, '... and it should match the correct attributes'); can_ok($token1, '_get_tag'); can_ok($token2, '_get_tag'); is_deeply($token1->_get_tag, $token2->_get_tag, '... and _get_tag should return the same value regardless of source'); is($token1->_get_tag, 'body', '... and it should match the correct tag'); HTML-TokeParser-Simple-3.16/t/munge_html.t000555000765000024 1133712162510100 20754 0ustar00curtispoestaff000000000000#!/usr/bin/perl -w use strict; use warnings; use Test::More tests => 24; #use Test::More 'no_plan'; my $CLASS; BEGIN { chdir 't' if -d 't'; unshift @INC => '../lib'; $CLASS = 'HTML::TokeParser::Simple'; use_ok($CLASS) || die; } my $p = $CLASS->new(\*DATA); my $token; do { $token = $p->get_token } until $token->is_end_tag('head'); can_ok($token, 'set_attr'); do { $token = $p->get_token } until $token->is_start_tag('body'); can_ok($token, 'set_attr'); $token->set_attr(foo => 'bar'); is($token->as_is, '', '... but a good token should set the new attribute'); $token->set_attr(bgcolor => 'white'); is($token->as_is, '', '... or overwrite an existing one'); is_deeply($token->get_attrseq, [qw{alink bgcolor foo}], '... and the attribute sequence should be updated'); my $attr = { alink => "#0000ff", bgcolor => "white", foo => "bar" }; is_deeply($token->get_attr, $attr, '... as should the attributes themselves'); can_ok($token, 'delete_attr'); $token->delete_attr('asdf'); is($token->as_is, '', '... and deleting a non-existent attribute should be a no-op'); $token->delete_attr('foo'); is($token->as_is, '', '... and deleting an existing one should succeed'); $token->set_attr('foo', 'bar'); $token->delete_attr('FOO'); is($token->as_is, '', '... and deleting should be case-insensitive'); do { $token = $p->get_token } until $token->is_start_tag('h1'); my $regex = qr/^h\d$/; ok($token->is_tag($regex), 'Calling is_tag() with a regex should succeed'); ok(!$token->is_tag(qr/x/), '... and not return false positives'); ok($token->is_start_tag($regex), 'Calling is_start_tag() with a regex should succeed'); ok(!$token->is_start_tag(qr/x/), '... and not return false positives'); do { $token = $p->get_token } until $token->is_start_tag('hr'); $token->set_attr('class','fribble'); is($token->as_is, '
', 'Setting attributes on self-closing tags should succeed'); $token->delete_attr('class'); is($token->as_is, '
', '... as should deleting them'); do { $token = $p->get_token } until $token->is_start_tag('hr'); $token->set_attr('class','fribble'); is($token->as_is, '
', 'Setting attributes on self-closing tags should succeed'); $token->delete_attr('class'); is($token->as_is, '
', '... as should deleting them'); can_ok( $token, 'rewrite_tag'); my ($html,$fixed_html) = fetch_html(); my $parser = $CLASS->new(\$html); my $new_html = ''; while (my $token = $parser->get_token) { $token->rewrite_tag; $new_html .= $token->as_is; } is($new_html, $fixed_html, '... and it should correctly rewrite all tags'); $html = ''; $parser = HTML::TokeParser::Simple->new(\$html); my $span = $parser->get_tag('span'); is $span->as_is, $html, 'We should be able to fetch tags with escaped attributes'; ok $span->rewrite_tag, '... and rewriting said tags should succeed'; is $span->as_is, $html, '... and the attributes should be properly escaped'; sub fetch_html { my $html = <<' END_HTML'; This is a title "; ?>

Do not edit this HTML lest the tests fail!!!



END_HTML my $fixed_html = <<' END_HTML'; This is a title "; ?>

Do not edit this HTML lest the tests fail!!!



END_HTML return $html,$fixed_html; } __DATA__ This is a title "; ?>

Do not edit this HTML lest the tests fail!!!



HTML-TokeParser-Simple-3.16/t/data000755000765000024 012162510100 17154 5ustar00curtispoestaff000000000000HTML-TokeParser-Simple-3.16/t/data/test.html000444000765000024 46412162510100 21142 0ustar00curtispoestaff000000000000 This is a title "; ?>

Do not edit this HTML lest the tests fail!!!

HTML-TokeParser-Simple-3.16/xt000755000765000024 012162510100 16433 5ustar00curtispoestaff000000000000HTML-TokeParser-Simple-3.16/xt/pod-coverage.t000555000765000024 102512162510100 21331 0ustar00curtispoestaff000000000000#!perl -T use strict; use Test::More; eval "use Test::Pod::Coverage 0.08"; plan skip_all => "Test::Pod::Coverage required for testing POD coverage" if $@; my %exception_for = ( 'HTML::TokeParser::Simple::Token' => qr/new/, 'HTML::TokeParser::Simple::Token::Tag' => qr/new/, ); my @modules = Test::Pod::Coverage::all_modules(); plan tests => scalar @modules; foreach my $module (@modules) { my $exception = $exception_for{$module}; pod_coverage_ok( $module, $exception ? { trustme => [$exception] } : () ); } HTML-TokeParser-Simple-3.16/xt/pod.t000555000765000024 21512162510100 17520 0ustar00curtispoestaff000000000000#!perl -T use Test::More; eval "use Test::Pod 1.14"; plan skip_all => "Test::Pod 1.14 required for testing POD" if $@; all_pod_files_ok();