HTML-Parser-3.81/000755 000765 000024 00000000000 14366103441 014724 5ustar00olafaldersstaff000000 000000 HTML-Parser-3.81/Parser.xs000644 000765 000024 00000033454 14366103441 016545 0ustar00olafaldersstaff000000 000000 /* * Copyright 1999-2016, Gisle Aas. * Copyright 1999-2000, Michael A. Chase. * * This library is free software; you can redistribute it and/or * modify it under the same terms as Perl itself. */ #define PERL_NO_GET_CONTEXT /* we want efficiency */ #include "EXTERN.h" #include "perl.h" #include "XSUB.h" #include "ppport.h" #define DOWARN (PL_dowarn & G_WARN_ON) #define RETHROW croak(Nullch) /* * Include stuff. We include .c files instead of linking them, * so that they don't have to pollute the external dll name space. */ #ifdef EXTERN #undef EXTERN #endif #define EXTERN static /* Don't pollute */ #include "hparser.h" #include "util.c" #include "hparser.c" /* * Support functions for the XS glue */ static SV* check_handler(pTHX_ SV* h) { SvGETMAGIC(h); if (SvROK(h)) { SV* myref = SvRV(h); if (SvTYPE(myref) == SVt_PVCV) return newSVsv(h); if (SvTYPE(myref) == SVt_PVAV) return SvREFCNT_inc(myref); croak("Only code or array references allowed as handler"); } return SvOK(h) ? newSVsv(h) : 0; } static PSTATE* get_pstate_iv(pTHX_ SV* sv) { PSTATE *p; MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, '~') : NULL; if (!mg) croak("Lost parser state magic"); p = (PSTATE *)mg->mg_ptr; if (!p) croak("Lost parser state magic"); if (p->signature != P_SIGNATURE) croak("Bad signature in parser state object at %p", p); return p; } static PSTATE* get_pstate_hv(pTHX_ SV* sv) /* used by XS typemap */ { HV* hv; SV** svp; sv = SvRV(sv); if (!sv || SvTYPE(sv) != SVt_PVHV) croak("Not a reference to a hash"); hv = (HV*)sv; svp = hv_fetchs(hv, "_hparser_xs_state", 0); if (svp) { if (SvROK(*svp)) return get_pstate_iv(aTHX_ SvRV(*svp)); else croak("_hparser_xs_state element is not a reference"); } croak("Can't find '_hparser_xs_state' element in HTML::Parser hash"); return 0; } static void free_pstate(pTHX_ PSTATE* pstate) { int i; SvREFCNT_dec(pstate->buf); SvREFCNT_dec(pstate->pend_text); SvREFCNT_dec(pstate->skipped_text); #ifdef MARKED_SECTION SvREFCNT_dec(pstate->ms_stack); #endif SvREFCNT_dec(pstate->bool_attr_val); for (i = 0; i < EVENT_COUNT; i++) { SvREFCNT_dec(pstate->handlers[i].cb); SvREFCNT_dec(pstate->handlers[i].argspec); } SvREFCNT_dec(pstate->report_tags); SvREFCNT_dec(pstate->ignore_tags); SvREFCNT_dec(pstate->ignore_elements); SvREFCNT_dec(pstate->ignoring_element); SvREFCNT_dec(pstate->tmp); pstate->signature = 0; Safefree(pstate); } static int magic_free_pstate(pTHX_ SV *sv, MAGIC *mg) { free_pstate(aTHX_ (PSTATE *)mg->mg_ptr); return 0; } #if defined(USE_ITHREADS) static PSTATE * dup_pstate(pTHX_ PSTATE *pstate, CLONE_PARAMS *params) { PSTATE *pstate2; int i; Newz(56, pstate2, 1, PSTATE); pstate2->signature = pstate->signature; pstate2->buf = SvREFCNT_inc(sv_dup(pstate->buf, params)); pstate2->offset = pstate->offset; pstate2->line = pstate->line; pstate2->column = pstate->column; pstate2->start_document = pstate->start_document; pstate2->parsing = pstate->parsing; pstate2->eof = pstate->eof; pstate2->literal_mode = pstate->literal_mode; pstate2->is_cdata = pstate->is_cdata; pstate2->no_dash_dash_comment_end = pstate->no_dash_dash_comment_end; pstate2->pending_end_tag = pstate->pending_end_tag; pstate2->pend_text = SvREFCNT_inc(sv_dup(pstate->pend_text, params)); pstate2->pend_text_is_cdata = pstate->pend_text_is_cdata; pstate2->pend_text_offset = pstate->pend_text_offset; pstate2->pend_text_line = pstate->pend_text_offset; pstate2->pend_text_column = pstate->pend_text_column; pstate2->skipped_text = SvREFCNT_inc(sv_dup(pstate->skipped_text, params)); #ifdef MARKED_SECTION pstate2->ms = pstate->ms; pstate2->ms_stack = (AV *)SvREFCNT_inc(sv_dup((SV *)pstate->ms_stack, params)); pstate2->marked_sections = pstate->marked_sections; #endif pstate2->strict_comment = pstate->strict_comment; pstate2->strict_names = pstate->strict_names; pstate2->strict_end = pstate->strict_end; pstate2->xml_mode = pstate->xml_mode; pstate2->unbroken_text = pstate->unbroken_text; pstate2->attr_encoded = pstate->attr_encoded; pstate2->case_sensitive = pstate->case_sensitive; pstate2->closing_plaintext = pstate->closing_plaintext; pstate2->utf8_mode = pstate->utf8_mode; pstate2->empty_element_tags = pstate->empty_element_tags; pstate2->xml_pic = pstate->xml_pic; pstate2->backquote = pstate->backquote; pstate2->bool_attr_val = SvREFCNT_inc(sv_dup(pstate->bool_attr_val, params)); for (i = 0; i < EVENT_COUNT; i++) { pstate2->handlers[i].cb = SvREFCNT_inc(sv_dup(pstate->handlers[i].cb, params)); pstate2->handlers[i].argspec = SvREFCNT_inc(sv_dup(pstate->handlers[i].argspec, params)); } pstate2->argspec_entity_decode = pstate->argspec_entity_decode; pstate2->report_tags = (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->report_tags, params)); pstate2->ignore_tags = (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_tags, params)); pstate2->ignore_elements = (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_elements, params)); pstate2->ignoring_element = SvREFCNT_inc(sv_dup(pstate->ignoring_element, params)); pstate2->ignore_depth = pstate->ignore_depth; if (params->flags & CLONEf_JOIN_IN) { pstate2->entity2char = get_hv("HTML::Entities::entity2char", GV_ADD); } else { pstate2->entity2char = (HV *)sv_dup((SV *)pstate->entity2char, params); } pstate2->tmp = SvREFCNT_inc(sv_dup(pstate->tmp, params)); return pstate2; } static int magic_dup_pstate(pTHX_ MAGIC *mg, CLONE_PARAMS *params) { mg->mg_ptr = (char *)dup_pstate(aTHX_ (PSTATE *)mg->mg_ptr, params); return 0; } #endif const MGVTBL vtbl_pstate = { 0, 0, 0, 0, MEMBER_TO_FPTR(magic_free_pstate), #if defined(USE_ITHREADS) 0, MEMBER_TO_FPTR(magic_dup_pstate), #endif }; /* * XS interface definition. */ MODULE = HTML::Parser PACKAGE = HTML::Parser PROTOTYPES: DISABLE void _alloc_pstate(self) SV* self; PREINIT: PSTATE* pstate; SV* sv; HV* hv; MAGIC* mg; CODE: sv = SvRV(self); if (!sv || SvTYPE(sv) != SVt_PVHV) croak("Not a reference to a hash"); hv = (HV*)sv; Newz(56, pstate, 1, PSTATE); pstate->signature = P_SIGNATURE; pstate->entity2char = get_hv("HTML::Entities::entity2char", GV_ADD); pstate->tmp = NEWSV(0, 20); sv = newSViv(PTR2IV(pstate)); sv_magic(sv, 0, '~', (char *)pstate, 0); mg = mg_find(sv, '~'); assert(mg); mg->mg_virtual = (MGVTBL*)&vtbl_pstate; #if defined(USE_ITHREADS) mg->mg_flags |= MGf_DUP; #endif SvREADONLY_on(sv); hv_stores(hv, "_hparser_xs_state", newRV_noinc(sv)); void parse(self, chunk) SV* self; SV* chunk PREINIT: PSTATE* p_state = get_pstate_hv(aTHX_ self); PPCODE: (void)sv_2mortal(SvREFCNT_inc(SvRV(self))); if (p_state->parsing) croak("Parse loop not allowed"); p_state->parsing = 1; if (SvROK(chunk) && SvTYPE(SvRV(chunk)) == SVt_PVCV) { SV* generator = chunk; STRLEN len; do { int count; PUSHMARK(SP); count = call_sv(generator, G_SCALAR|G_EVAL); SPAGAIN; chunk = count ? POPs : 0; PUTBACK; if (SvTRUE(ERRSV)) { p_state->parsing = 0; p_state->eof = 0; RETHROW; } if (chunk && SvOK(chunk)) { (void)SvPV(chunk, len); /* get length */ } else { len = 0; } parse(aTHX_ p_state, len ? chunk : 0, self); SPAGAIN; } while (len && !p_state->eof); } else { parse(aTHX_ p_state, chunk, self); SPAGAIN; } p_state->parsing = 0; if (p_state->eof) { p_state->eof = 0; PUSHs(sv_newmortal()); } else { PUSHs(self); } void eof(self) SV* self; PREINIT: PSTATE* p_state = get_pstate_hv(aTHX_ self); PPCODE: if (p_state->parsing) p_state->eof = 1; else { p_state->parsing = 1; parse(aTHX_ p_state, 0, self); /* flush */ SPAGAIN; p_state->parsing = 0; } PUSHs(self); SV* strict_comment(pstate,...) PSTATE* pstate ALIAS: HTML::Parser::strict_comment = 1 HTML::Parser::strict_names = 2 HTML::Parser::xml_mode = 3 HTML::Parser::unbroken_text = 4 HTML::Parser::marked_sections = 5 HTML::Parser::attr_encoded = 6 HTML::Parser::case_sensitive = 7 HTML::Parser::strict_end = 8 HTML::Parser::closing_plaintext = 9 HTML::Parser::utf8_mode = 10 HTML::Parser::empty_element_tags = 11 HTML::Parser::xml_pic = 12 HTML::Parser::backquote = 13 PREINIT: bool *attr; CODE: switch (ix) { case 1: attr = &pstate->strict_comment; break; case 2: attr = &pstate->strict_names; break; case 3: attr = &pstate->xml_mode; break; case 4: attr = &pstate->unbroken_text; break; case 5: #ifdef MARKED_SECTION attr = &pstate->marked_sections; break; #else croak("marked sections not supported"); break; #endif case 6: attr = &pstate->attr_encoded; break; case 7: attr = &pstate->case_sensitive; break; case 8: attr = &pstate->strict_end; break; case 9: attr = &pstate->closing_plaintext; break; case 10: attr = &pstate->utf8_mode; break; case 11: attr = &pstate->empty_element_tags; break; case 12: attr = &pstate->xml_pic; break; case 13: attr = &pstate->backquote; break; default: croak("Unknown boolean attribute (%d)", (int)ix); } RETVAL = boolSV(*attr); if (items > 1) *attr = SvTRUE(ST(1)); OUTPUT: RETVAL SV* boolean_attribute_value(pstate,...) PSTATE* pstate CODE: RETVAL = pstate->bool_attr_val ? newSVsv(pstate->bool_attr_val) : &PL_sv_undef; if (items > 1) { SvREFCNT_dec(pstate->bool_attr_val); pstate->bool_attr_val = newSVsv(ST(1)); } OUTPUT: RETVAL void ignore_tags(pstate,...) PSTATE* pstate ALIAS: HTML::Parser::report_tags = 1 HTML::Parser::ignore_tags = 2 HTML::Parser::ignore_elements = 3 PREINIT: HV** attr; int i; CODE: switch (ix) { case 1: attr = &pstate->report_tags; break; case 2: attr = &pstate->ignore_tags; break; case 3: attr = &pstate->ignore_elements; break; default: croak("Unknown tag-list attribute (%d)", (int)ix); } if (GIMME_V != G_VOID) croak("Can't report tag lists yet"); items--; /* pstate */ if (items) { if (*attr) hv_clear(*attr); else *attr = newHV(); for (i = 0; i < items; i++) { SV* sv = ST(i+1); if (SvROK(sv)) { sv = SvRV(sv); if (SvTYPE(sv) == SVt_PVAV) { AV* av = (AV*)sv; STRLEN j; STRLEN top = av_top_index(av); for (j = 0; j <= top; j++) { SV**svp = av_fetch(av, j, 0); if (svp) { hv_store_ent(*attr, *svp, newSViv(0), 0); } } } else croak("Tag list must be plain scalars and arrays"); } else { hv_store_ent(*attr, sv, newSViv(0), 0); } } } else if (*attr) { SvREFCNT_dec(*attr); *attr = 0; } void handler(pstate, eventname,...) PSTATE* pstate SV* eventname PREINIT: STRLEN name_len; char *name = SvPV(eventname, name_len); int event = -1; int i; struct p_handler *h; PPCODE: /* map event name string to event_id */ for (i = 0; i < EVENT_COUNT; i++) { if (strEQ(name, event_id_str[i])) { event = i; break; } } if (event < 0) croak("No handler for %s events", name); h = &pstate->handlers[event]; /* set up return value */ if (h->cb) { PUSHs((SvTYPE(h->cb) == SVt_PVAV) ? sv_2mortal(newRV_inc(h->cb)) : sv_2mortal(newSVsv(h->cb))); } else { PUSHs(&PL_sv_undef); } /* update */ if (items > 3) { SvREFCNT_dec(h->argspec); h->argspec = 0; h->argspec = argspec_compile(ST(3), pstate); } if (items > 2) { SvREFCNT_dec(h->cb); h->cb = 0; h->cb = check_handler(aTHX_ ST(2)); } MODULE = HTML::Parser PACKAGE = HTML::Entities void decode_entities(...) PREINIT: int i; HV *entity2char = get_hv("HTML::Entities::entity2char", 0); PPCODE: if (GIMME_V == G_SCALAR && items > 1) items = 1; for (i = 0; i < items; i++) { if (GIMME_V != G_VOID) ST(i) = sv_2mortal(newSVsv(ST(i))); else { #ifdef SV_CHECK_THINKFIRST SV_CHECK_THINKFIRST(ST(i)); #endif if (SvREADONLY(ST(i))) croak("Can't inline decode readonly string in decode_entities()"); } decode_entities(aTHX_ ST(i), entity2char, 0); } SP += items; void _decode_entities(string, entities, ...) SV* string SV* entities PREINIT: HV* entities_hv; bool expand_prefix = (items > 2) ? SvTRUE(ST(2)) : 0; CODE: if (SvOK(entities)) { if (SvROK(entities) && SvTYPE(SvRV(entities)) == SVt_PVHV) { entities_hv = (HV*)SvRV(entities); } else { croak("2nd argument must be hash reference"); } } else { entities_hv = 0; } #ifdef SV_CHECK_THINKFIRST SV_CHECK_THINKFIRST(string); #endif if (SvREADONLY(string)) croak("Can't inline decode readonly string in _decode_entities()"); decode_entities(aTHX_ string, entities_hv, expand_prefix); bool _probably_utf8_chunk(string) SV* string PREINIT: STRLEN len; char *s; CODE: sv_utf8_downgrade(string, 0); s = SvPV(string, len); RETVAL = probably_utf8_chunk(aTHX_ s, len); OUTPUT: RETVAL int UNICODE_SUPPORT() PROTOTYPE: CODE: RETVAL = 1; OUTPUT: RETVAL MODULE = HTML::Parser PACKAGE = HTML::Parser HTML-Parser-3.81/README000644 000765 000024 00000117164 14366103441 015616 0ustar00olafaldersstaff000000 000000 NAME HTML::Parser - HTML parser class SYNOPSIS use strict; use warnings; use HTML::Parser (); # Create parser object my $p = HTML::Parser->new( api_version => 3, start_h => [\&start, "tagname, attr"], end_h => [\&end, "tagname"], marked_sections => 1, ); # Parse document text chunk by chunk $p->parse($chunk1); $p->parse($chunk2); # ... # signal end of document $p->eof; # Parse directly from file $p->parse_file("foo.html"); # or open(my $fh, "<:utf8", "foo.html") || die; $p->parse_file($fh); DESCRIPTION Objects of the "HTML::Parser" class will recognize markup and separate it from plain text (alias data content) in HTML documents. As different kinds of markup and text are recognized, the corresponding event handlers are invoked. "HTML::Parser" is not a generic SGML parser. We have tried to make it able to deal with the HTML that is actually "out there", and it normally parses as closely as possible to the way the popular web browsers do it instead of strictly following one of the many HTML specifications from W3C. Where there is disagreement, there is often an option that you can enable to get the official behaviour. The document to be parsed may be supplied in arbitrary chunks. This makes on-the-fly parsing as documents are received from the network possible. If event driven parsing does not feel right for your application, you might want to use "HTML::PullParser". This is an "HTML::Parser" subclass that allows a more conventional program structure. METHODS The following method is used to construct a new "HTML::Parser" object: $p = HTML::Parser->new( %options_and_handlers ) This class method creates a new "HTML::Parser" object and returns it. Key/value argument pairs may be provided to assign event handlers or initialize parser options. The handlers and parser options can also be set or modified later by the method calls described below. If a top level key is in the form "_h" (e.g., "text_h") then it assigns a handler to that event, otherwise it initializes a parser option. The event handler specification value must be an array reference. Multiple handlers may also be assigned with the 'handlers => [%handlers]' option. See examples below. If new() is called without any arguments, it will create a parser that uses callback methods compatible with version 2 of "HTML::Parser". See the section on "version 2 compatibility" below for details. The special constructor option 'api_version => 2' can be used to initialize version 2 callbacks while still setting other options and handlers. The 'api_version => 3' option can be used if you don't want to set any options and don't want to fall back to v2 compatible mode. Examples: $p = HTML::Parser->new( api_version => 3, text_h => [ sub {...}, "dtext" ] ); This creates a new parser object with a text event handler subroutine that receives the original text with general entities decoded. $p = HTML::Parser->new( api_version => 3, start_h => [ 'my_start', "self,tokens" ] ); This creates a new parser object with a start event handler method that receives the $p and the tokens array. $p = HTML::Parser->new( api_version => 3, handlers => { text => [\@array, "event,text"], comment => [\@array, "event,text"], } ); This creates a new parser object that stores the event type and the original text in @array for text and comment events. The following methods feed the HTML document to the "HTML::Parser" object: $p->parse( $string ) Parse $string as the next chunk of the HTML document. Handlers invoked should not attempt to modify the $string in-place until $p->parse returns. If an invoked event handler aborts parsing by calling $p->eof, then $p->parse() will return a FALSE value. Otherwise the return value is a reference to the parser object ($p). $p->parse( $code_ref ) If a code reference is passed as the argument to be parsed, then the chunks to be parsed are obtained by invoking this function repeatedly. Parsing continues until the function returns an empty (or undefined) result. When this happens $p->eof is automatically signaled. Parsing will also abort if one of the event handlers calls $p->eof. The effect of this is the same as: while (1) { my $chunk = &$code_ref(); if (!defined($chunk) || !length($chunk)) { $p->eof; return $p; } $p->parse($chunk) || return undef; } But it is more efficient as this loop runs internally in XS code. $p->parse_file( $file ) Parse text directly from a file. The $file argument can be a filename, an open file handle, or a reference to an open file handle. If $file contains a filename and the file can't be opened, then the method returns an undefined value and $! tells why it failed. Otherwise the return value is a reference to the parser object. If a file handle is passed as the $file argument, then the file will normally be read until EOF, but not closed. If an invoked event handler aborts parsing by calling $p->eof, then $p->parse_file() may not have read the entire file. On systems with multi-byte line terminators, the values passed for the offset and length argspecs may be too low if parse_file() is called on a file handle that is not in binary mode. If a filename is passed in, then parse_file() will open the file in binary mode. $p->eof Signals the end of the HTML document. Calling the $p->eof method outside a handler callback will flush any remaining buffered text (which triggers the "text" event if there is any remaining text). Calling $p->eof inside a handler will terminate parsing at that point and cause $p->parse to return a FALSE value. This also terminates parsing by $p->parse_file(). After $p->eof has been called, the parse() and parse_file() methods can be invoked to feed new documents with the parser object. The return value from eof() is a reference to the parser object. Most parser options are controlled by boolean attributes. Each boolean attribute is enabled by calling the corresponding method with a TRUE argument and disabled with a FALSE argument. The attribute value is left unchanged if no argument is given. The return value from each method is the old attribute value. Methods that can be used to get and/or set parser options are: $p->attr_encoded $p->attr_encoded( $bool ) By default, the "attr" and @attr argspecs will have general entities for attribute values decoded. Enabling this attribute leaves entities alone. $p->backquote $p->backquote( $bool ) By default, only ' and " are recognized as quote characters around attribute values. MSIE also recognizes backquotes for some reason. Enabling this attribute provides compatibility with this behaviour. $p->boolean_attribute_value( $val ) This method sets the value reported for boolean attributes inside HTML start tags. By default, the name of the attribute is also used as its value. This affects the values reported for "tokens" and "attr" argspecs. $p->case_sensitive $p->case_sensitive( $bool ) By default, tag names and attribute names are down-cased. Enabling this attribute leaves them as found in the HTML source document. $p->closing_plaintext $p->closing_plaintext( $bool ) By default, "plaintext" element can never be closed. Everything up to the end of the document is parsed in CDATA mode. This historical behaviour is what at least MSIE does. Enabling this attribute makes closing " tag effective and the parsing process will resume after seeing this tag. This emulates early gecko-based browsers. $p->empty_element_tags $p->empty_element_tags( $bool ) By default, empty element tags are not recognized as such and the "/" before ">" is just treated like a normal name character (unless "strict_names" is enabled). Enabling this attribute make "HTML::Parser" recognize these tags. Empty element tags look like start tags, but end with the character sequence "/>" instead of ">". When recognized by "HTML::Parser" they cause an artificial end event in addition to the start event. The "text" for the artificial end event will be empty and the "tokenpos" array will be undefined even though the token array will have one element containing the tag name. $p->marked_sections $p->marked_sections( $bool ) By default, section markings like are treated like ordinary text. When this attribute is enabled section markings are honoured. There are currently no events associated with the marked section markup, but the text can be returned as "skipped_text". $p->strict_comment $p->strict_comment( $bool ) By default, comments are terminated by the first occurrence of "-->". This is the behaviour of most popular browsers (like Mozilla, Opera and MSIE), but it is not correct according to the official HTML standard. Officially, you need an even number of "--" tokens before the closing ">" is recognized and there may not be anything but whitespace between an even and an odd "--". The official behaviour is enabled by enabling this attribute. Enabling of 'strict_comment' also disables recognizing these forms as comments: $p->strict_end $p->strict_end( $bool ) By default, attributes and other junk are allowed to be present on end tags in a manner that emulates MSIE's behaviour. The official behaviour is enabled with this attribute. If enabled, only whitespace is allowed between the tagname and the final ">". $p->strict_names $p->strict_names( $bool ) By default, almost anything is allowed in tag and attribute names. This is the behaviour of most popular browsers and allows us to parse some broken tags with invalid attribute values like: [PREV By default, "LIST]" is parsed as a boolean attribute, not as part of the ALT value as was clearly intended. This is also what Mozilla sees. The official behaviour is enabled by enabling this attribute. If enabled, it will cause the tag above to be reported as text since "LIST]" is not a legal attribute name. $p->unbroken_text $p->unbroken_text( $bool ) By default, blocks of text are given to the text handler as soon as possible (but the parser takes care always to break text at a boundary between whitespace and non-whitespace so single words and entities can always be decoded safely). This might create breaks that make it hard to do transformations on the text. When this attribute is enabled, blocks of text are always reported in one piece. This will delay the text event until the following (non-text) event has been recognized by the parser. Note that the "offset" argspec will give you the offset of the first segment of text and "length" is the combined length of the segments. Since there might be ignored tags in between, these numbers can't be used to directly index in the original document file. $p->utf8_mode $p->utf8_mode( $bool ) Enable this option when parsing raw undecoded UTF-8. This tells the parser that the entities expanded for strings reported by "attr", @attr and "dtext" should be expanded as decoded UTF-8 so they end up compatible with the surrounding text. If "utf8_mode" is enabled then it is an error to pass strings containing characters with code above 255 to the parse() method, and the parse() method will croak if you try. Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8 encoded. The character can also be represented by the entity "♥" or "♥". If we feed the parser: $p->parse("\xE2\x99\xA5♥"); then "dtext" will be reported as "\xE2\x99\xA5\x{2665}" without "utf8_mode" enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled. The later string is what you want. This option is only available with perl-5.8 or better. $p->xml_mode $p->xml_mode( $bool ) Enabling this attribute changes the parser to allow some XML constructs. This enables the behaviour controlled by individually by the "case_sensitive", "empty_element_tags", "strict_names" and "xml_pic" attributes and also suppresses special treatment of elements that are parsed as CDATA for HTML. $p->xml_pic $p->xml_pic( $bool ) By default, *processing instructions* are terminated by ">". When this attribute is enabled, processing instructions are terminated by "?>" instead. As markup and text is recognized, handlers are invoked. The following method is used to set up handlers for different events: $p->handler( event => \&subroutine, $argspec ) $p->handler( event => $method_name, $argspec ) $p->handler( event => \@accum, $argspec ) $p->handler( event => "" ); $p->handler( event => undef ); $p->handler( event ); This method assigns a subroutine, method, or array to handle an event. Event is one of "text", "start", "end", "declaration", "comment", "process", "start_document", "end_document" or "default". The "\&subroutine" is a reference to a subroutine which is called to handle the event. The $method_name is the name of a method of $p which is called to handle the event. The @accum is an array that will hold the event information as sub-arrays. If the second argument is "", the event is ignored. If it is undef, the default handler is invoked for the event. The $argspec is a string that describes the information to be reported for the event. Any requested information that does not apply to a specific event is passed as "undef". If argspec is omitted, then it is left unchanged. The return value from $p->handler is the old callback routine or a reference to the accumulator array. Any return values from handler callback routines/methods are always ignored. A handler callback can request parsing to be aborted by invoking the $p->eof method. A handler callback is not allowed to invoke the $p->parse() or $p->parse_file() method. An exception will be raised if it tries. Examples: $p->handler(start => "start", 'self, attr, attrseq, text' ); This causes the "start" method of object $p to be called for 'start' events. The callback signature is "$p->start(\%attr, \@attr_seq, $text)". $p->handler(start => \&start, 'attr, attrseq, text' ); This causes subroutine start() to be called for 'start' events. The callback signature is start(\%attr, \@attr_seq, $text). $p->handler(start => \@accum, '"S", attr, attrseq, text' ); This causes 'start' event information to be saved in @accum. The array elements will be ['S', \%attr, \@attr_seq, $text]. $p->handler(start => ""); This causes 'start' events to be ignored. It also suppresses invocations of any default handler for start events. It is in most cases equivalent to $p->handler(start => sub {}), but is more efficient. It is different from the empty-sub-handler in that "skipped_text" is not reset by it. $p->handler(start => undef); This causes no handler to be associated with start events. If there is a default handler it will be invoked. Filters based on tags can be set up to limit the number of events reported. The main bottleneck during parsing is often the huge number of callbacks made from the parser. Applying filters can improve performance significantly. The following methods control filters: $p->ignore_elements( @tags ) Both the "start" event and the "end" event as well as any events that would be reported in between are suppressed. The ignored elements can contain nested occurrences of itself. Example: $p->ignore_elements(qw(script style)); The "script" and "style" tags will always nest properly since their content is parsed in CDATA mode. For most other tags "ignore_elements" must be used with caution since HTML is often not *well formed*. $p->ignore_tags( @tags ) Any "start" and "end" events involving any of the tags given are suppressed. To reset the filter (i.e. don't suppress any "start" and "end" events), call "ignore_tags" without an argument. $p->report_tags( @tags ) Any "start" and "end" events involving any of the tags *not* given are suppressed. To reset the filter (i.e. report all "start" and "end" events), call "report_tags" without an argument. Internally, the system has two filter lists, one for "report_tags" and one for "ignore_tags", and both filters are applied. This effectively gives "ignore_tags" precedence over "report_tags". Examples: $p->ignore_tags(qw(style)); $p->report_tags(qw(script style)); results in only "script" events being reported. Argspec Argspec is a string containing a comma-separated list that describes the information reported by the event. The following argspec identifier names can be used: "attr" Attr causes a reference to a hash of attribute name/value pairs to be passed. Boolean attributes' values are either the value set by $p->boolean_attribute_value, or the attribute name if no value has been set by $p->boolean_attribute_value. This passes undef except for "start" events. Unless "xml_mode" or "case_sensitive" is enabled, the attribute names are forced to lower case. General entities are decoded in the attribute values and one layer of matching quotes enclosing the attribute values is removed. The Unicode character set is assumed for entity decoding. @attr Basically the same as "attr", but keys and values are passed as individual arguments and the original sequence of the attributes is kept. The parameters passed will be the same as the @attr calculated here: @attr = map { $_ => $attr->{$_} } @$attrseq; assuming $attr and $attrseq here are the hash and array passed as the result of "attr" and "attrseq" argspecs. This passes no values for events besides "start". "attrseq" Attrseq causes a reference to an array of attribute names to be passed. This can be useful if you want to walk the "attr" hash in the original sequence. This passes undef except for "start" events. Unless "xml_mode" or "case_sensitive" is enabled, the attribute names are forced to lower case. "column" Column causes the column number of the start of the event to be passed. The first column on a line is 0. "dtext" Dtext causes the decoded text to be passed. General entities are automatically decoded unless the event was inside a CDATA section or was between literal start and end tags ("script", "style", "xmp", "iframe", "title", "textarea" and "plaintext"). The Unicode character set is assumed for entity decoding. With Perl version 5.6 or earlier only the Latin-1 range is supported, and entities for characters outside the range 0..255 are left unchanged. This passes undef except for "text" events. "event" Event causes the event name to be passed. The event name is one of "text", "start", "end", "declaration", "comment", "process", "start_document" or "end_document". "is_cdata" Is_cdata causes a TRUE value to be passed if the event is inside a CDATA section or between literal start and end tags ("script", "style", "xmp", "iframe", "title", "textarea" and "plaintext"). if the flag is FALSE for a text event, then you should normally either use "dtext" or decode the entities yourself before the text is processed further. "length" Length causes the number of bytes of the source text of the event to be passed. "line" Line causes the line number of the start of the event to be passed. The first line in the document is 1. Line counting doesn't start until at least one handler requests this value to be reported. "offset" Offset causes the byte position in the HTML document of the start of the event to be passed. The first byte in the document has offset 0. "offset_end" Offset_end causes the byte position in the HTML document of the end of the event to be passed. This is the same as "offset" + "length". "self" Self causes the current object to be passed to the handler. If the handler is a method, this must be the first element in the argspec. An alternative to passing self as an argspec is to register closures that capture $self by themselves as handlers. Unfortunately this creates circular references which prevent the HTML::Parser object from being garbage collected. Using the "self" argspec avoids this problem. "skipped_text" Skipped_text returns the concatenated text of all the events that have been skipped since the last time an event was reported. Events might be skipped because no handler is registered for them or because some filter applies. Skipped text also includes marked section markup, since there are no events that can catch it. If an ""-handler is registered for an event, then the text for this event is not included in "skipped_text". Skipped text both before and after the ""-event is included in the next reported "skipped_text". "tag" Same as "tagname", but prefixed with "/" if it belongs to an "end" event and "!" for a declaration. The "tag" does not have any prefix for "start" events, and is in this case identical to "tagname". "tagname" This is the element name (or *generic identifier* in SGML jargon) for start and end tags. Since HTML is case insensitive, this name is forced to lower case to ease string matching. Since XML is case sensitive, the tagname case is not changed when "xml_mode" is enabled. The same happens if the "case_sensitive" attribute is set. The declaration type of declaration elements is also passed as a tagname, even if that is a bit strange. In fact, in the current implementation tagname is identical to "token0" except that the name may be forced to lower case. "token0" Token0 causes the original text of the first token string to be passed. This should always be the same as $tokens->[0]. For "declaration" events, this is the declaration type. For "start" and "end" events, this is the tag name. For "process" and non-strict "comment" events, this is everything inside the tag. This passes undef if there are no tokens in the event. "tokenpos" Tokenpos causes a reference to an array of token positions to be passed. For each string that appears in "tokens", this array contains two numbers. The first number is the offset of the start of the token in the original "text" and the second number is the length of the token. Boolean attributes in a "start" event will have (0,0) for the attribute value offset and length. This passes undef if there are no tokens in the event (e.g., "text") and for artificial "end" events triggered by empty element tags. If you are using these offsets and lengths to modify "text", you should either work from right to left, or be very careful to calculate the changes to the offsets. "tokens" Tokens causes a reference to an array of token strings to be passed. The strings are exactly as they were found in the original text, no decoding or case changes are applied. For "declaration" events, the array contains each word, comment, and delimited string starting with the declaration type. For "comment" events, this contains each sub-comment. If $p->strict_comments is disabled, there will be only one sub-comment. For "start" events, this contains the original tag name followed by the attribute name/value pairs. The values of boolean attributes will be either the value set by $p->boolean_attribute_value, or the attribute name if no value has been set by $p->boolean_attribute_value. For "end" events, this contains the original tag name (always one token). For "process" events, this contains the process instructions (always one token). This passes "undef" for "text" events. "text" Text causes the source text (including markup element delimiters) to be passed. "undef" Pass an undefined value. Useful as padding where the same handler routine is registered for multiple events. '...' A literal string of 0 to 255 characters enclosed in single (') or double (") quotes is passed as entered. The whole argspec string can be wrapped up in '@{...}' to signal that the resulting event array should be flattened. This only makes a difference if an array reference is used as the handler target. Consider this example: $p->handler(text => [], 'text'); $p->handler(text => [], '@{text}']); With two text events; "foo", "bar"; then the first example will end up with [["foo"], ["bar"]] and the second with ["foo", "bar"] in the handler target array. Events Handlers for the following events can be registered: "comment" This event is triggered when a markup comment is recognized. Example: "declaration" This event is triggered when a *markup declaration* is recognized. For typical HTML documents, the only declaration you are likely to find is . Example: DTDs inside will confuse HTML::Parser. "default" This event is triggered for events that do not have a specific handler. You can set up a handler for this event to catch stuff you did not want to catch explicitly. "end" This event is triggered when an end tag is recognized. Example: "end_document" This event is triggered when $p->eof is called and after any remaining text is flushed. There is no document text associated with this event. "process" This event is triggered when a processing instructions markup is recognized. The format and content of processing instructions are system and application dependent. Examples: "start" This event is triggered when a start tag is recognized. Example: "start_document" This event is triggered before any other events for a new document. A handler for it can be used to initialize stuff. There is no document text associated with this event. "text" This event is triggered when plain text (characters) is recognized. The text may contain multiple lines. A sequence of text may be broken between several text events unless $p->unbroken_text is enabled. The parser will make sure that it does not break a word or a sequence of whitespace between two text events. Unicode "HTML::Parser" can parse Unicode strings when running under perl-5.8 or better. If Unicode is passed to $p->parse() then chunks of Unicode will be reported to the handlers. The offset and length argspecs will also report their position in terms of characters. It is safe to parse raw undecoded UTF-8 if you either avoid decoding entities and make sure to not use *argspecs* that do, or enable the "utf8_mode" for the parser. Parsing of undecoded UTF-8 might be useful when parsing from a file where you need the reported offsets and lengths to match the byte offsets in the file. If a filename is passed to $p->parse_file() then the file will be read in binary mode. This will be fine if the file contains only ASCII or Latin-1 characters. If the file contains UTF-8 encoded text then care must be taken when decoding entities as described in the previous paragraph, but better is to open the file with the UTF-8 layer so that it is decoded properly: open(my $fh, "<:utf8", "index.html") || die "...: $!"; $p->parse_file($fh); If the file contains text encoded in a charset besides ASCII, Latin-1 or UTF-8 then decoding will always be needed. VERSION 2 COMPATIBILITY When an "HTML::Parser" object is constructed with no arguments, a set of handlers is automatically provided that is compatible with the old HTML::Parser version 2 callback methods. This is equivalent to the following method calls: $p->handler(start => "start", "self, tagname, attr, attrseq, text"); $p->handler(end => "end", "self, tagname, text"); $p->handler(text => "text", "self, text, is_cdata"); $p->handler(process => "process", "self, token0, text"); $p->handler( comment => sub { my ($self, $tokens) = @_; for (@$tokens) { $self->comment($_); } }, "self, tokens" ); $p->handler( declaration => sub { my $self = shift; $self->declaration(substr($_[0], 2, -1)); }, "self, text" ); Setting up these handlers can also be requested with the "api_version => 2" constructor option. SUBCLASSING The "HTML::Parser" class is able to be subclassed. Parser objects are plain hashes and "HTML::Parser" reserves only hash keys that start with "_hparser". The parser state can be set up by invoking the init() method, which takes the same arguments as new(). EXAMPLES The first simple example shows how you might strip out comments from an HTML document. We achieve this by setting up a comment handler that does nothing and a default handler that will print out anything else: use HTML::Parser (); HTML::Parser->new( default_h => [sub { print shift }, 'text'], comment_h => [""], )->parse_file(shift || die) || die $!; An alternative implementation is: use HTML::Parser (); HTML::Parser->new( end_document_h => [sub { print shift }, 'skipped_text'], comment_h => [""], )->parse_file(shift || die) || die $!; This will in most cases be much more efficient since only a single callback will be made. The next example prints out the text that is inside the element of an HTML document. Here we start by setting up a start handler. When it sees the title start tag it enables a text handler that prints any text found and an end handler that will terminate parsing as soon as the title end tag is seen: use HTML::Parser (); sub start_handler { return if shift ne "title"; my $self = shift; $self->handler(text => sub { print shift }, "dtext"); $self->handler( end => sub { shift->eof if shift eq "title"; }, "tagname,self" ); } my $p = HTML::Parser->new(api_version => 3); $p->handler(start => \&start_handler, "tagname,self"); $p->parse_file(shift || die) || die $!; print "\n"; More examples are found in the eg/ directory of the "HTML-Parser" distribution: the program "hrefsub" shows how you can edit all links found in a document; the program "htextsub" shows how to edit the text only; the program "hstrip" shows how you can strip out certain tags/elements and/or attributes; and the program "htext" show how to obtain the plain text, but not any script/style content. You can browse the eg/ directory online from the *[Browse]* link on the http://search.cpan.org/~gaas/HTML-Parser/ page. BUGS The <style> and <script> sections do not end with the first "</", but need the complete corresponding end tag. The standard behaviour is not really practical. When the *strict_comment* option is enabled, we still recognize comments where there is something other than whitespace between even and odd "--" markers. Once $p->boolean_attribute_value has been set, there is no way to restore the default behaviour. There is currently no way to get both quote characters into the same literal argspec. Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them to repeat the previous start tag or close the previous start tag respectively. NET tags, e.g. "code/.../" are not recognized. This is SGML shorthand for "<code>...</code>". Incomplete start or end tags, e.g. "<tt<b>...</b</tt>" are not recognized. DIAGNOSTICS The following messages may be produced by HTML::Parser. The notation in this listing is the same as used in perldiag: Not a reference to a hash (F) The object blessed into or subclassed from HTML::Parser is not a hash as required by the HTML::Parser methods. Bad signature in parser state object at %p (F) The _hparser_xs_state element does not refer to a valid state structure. Something must have changed the internal value stored in this hash element, or the memory has been overwritten. _hparser_xs_state element is not a reference (F) The _hparser_xs_state element has been destroyed. Can't find '_hparser_xs_state' element in HTML::Parser hash (F) The _hparser_xs_state element is missing from the parser hash. It was either deleted, or not created when the object was created. API version %s not supported by HTML::Parser %s (F) The constructor option 'api_version' with an argument greater than or equal to 4 is reserved for future extensions. Bad constructor option '%s' (F) An unknown constructor option key was passed to the new() or init() methods. Parse loop not allowed (F) A handler invoked the parse() or parse_file() method. This is not permitted. marked sections not supported (F) The $p->marked_sections() method was invoked in a HTML::Parser module that was compiled without support for marked sections. Unknown boolean attribute (%d) (F) Something is wrong with the internal logic that set up aliases for boolean attributes. Only code or array references allowed as handler (F) The second argument for $p->handler must be either a subroutine reference, then name of a subroutine or method, or a reference to an array. No handler for %s events (F) The first argument to $p->handler must be a valid event name; i.e. one of "start", "end", "text", "process", "declaration" or "comment". Unrecognized identifier %s in argspec (F) The identifier is not a known argspec name. Use one of the names mentioned in the argspec section above. Literal string is longer than 255 chars in argspec (F) The current implementation limits the length of literals in an argspec to 255 characters. Make the literal shorter. Backslash reserved for literal string in argspec (F) The backslash character "\" is not allowed in argspec literals. It is reserved to permit quoting inside a literal in a later version. Unterminated literal string in argspec (F) The terminating quote character for a literal was not found. Bad argspec (%s) (F) Only identifier names, literals, spaces and commas are allowed in argspecs. Missing comma separator in argspec (F) Identifiers in an argspec must be separated with ",". Parsing of undecoded UTF-8 will give garbage when decoding entities (W) The first chunk parsed appears to contain undecoded UTF-8 and one or more argspecs that decode entities are used for the callback handlers. The result of decoding will be a mix of encoded and decoded characters for any entities that expand to characters with code above 127. This is not a good thing. The recommended solution is to apply Encode::decode_utf8() on the data before feeding it to the $p->parse(). For $p->parse_file() pass a file that has been opened in ":utf8" mode. The alternative solution is to enable the "utf8_mode" and not decode before passing strings to $p->parse(). The parser can process raw undecoded UTF-8 sanely if the "utf8_mode" is enabled, or if the "attr", @attr or "dtext" argspecs are avoided. Parsing string decoded with wrong endian selection (W) The first character in the document is U+FFFE. This is not a legal Unicode character but a byte swapped "BOM". The result of parsing will likely be garbage. Parsing of undecoded UTF-32 (W) The parser found the Unicode UTF-32 "BOM" signature at the start of the document. The result of parsing will likely be garbage. Parsing of undecoded UTF-16 (W) The parser found the Unicode UTF-16 "BOM" signature at the start of the document. The result of parsing will likely be garbage. SEE ALSO HTML::Entities, HTML::PullParser, HTML::TokeParser, HTML::HeadParser, HTML::LinkExtor, HTML::Form HTML::TreeBuilder (part of the *HTML-Tree* distribution) <http://www.w3.org/TR/html4/> More information about marked sections and processing instructions may be found at <http://www.is-thought.co.uk/book/sgml-8.htm>. COPYRIGHT Copyright 1996-2016 Gisle Aas. All rights reserved. Copyright 1999-2000 Michael A. Chase. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/util.c�����������������������������������������������������������������������������000644 �000765 �000024 �00000013260 14366103441 016047� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * Copyright 1999-2009, Gisle Aas. * * This library is free software; you can redistribute it and/or * modify it under the same terms as Perl itself. */ #ifndef EXTERN #define EXTERN extern #endif EXTERN SV* sv_lower(pTHX_ SV* sv) { STRLEN len; char *s = SvPV_force(sv, len); for (; len--; s++) *s = toLOWER(*s); return sv; } EXTERN int strnEQx(const char* s1, const char* s2, STRLEN n, int ignore_case) { while (n--) { if (ignore_case) { if (toLOWER(*s1) != toLOWER(*s2)) return 0; } else { if (*s1 != *s2) return 0; } s1++; s2++; } return 1; } static void grow_gap(pTHX_ SV* sv, STRLEN grow, char** t, char** s, char** e) { /* SvPVX ---> AAAAAA...BBBBBB ^ ^ ^ t s e */ STRLEN t_offset = *t - SvPVX(sv); STRLEN s_offset = *s - SvPVX(sv); STRLEN e_offset = *e - SvPVX(sv); SvGROW(sv, e_offset + grow + 1); *t = SvPVX(sv) + t_offset; *s = SvPVX(sv) + s_offset; *e = SvPVX(sv) + e_offset; Move(*s, *s+grow, *e - *s, char); *s += grow; *e += grow; } EXTERN SV* decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix) { STRLEN len; char *s = SvPV_force(sv, len); char *t = s; char *end = s + len; char *ent_start; char *repl; STRLEN repl_len; char buf[UTF8_MAXLEN]; int repl_utf8; int high_surrogate = 0; #if defined(__GNUC__) /* gcc -Wall reports this variable as possibly used uninitialized */ repl_utf8 = 0; #endif while (s < end) { assert(t <= s); if ((*t++ = *s++) != '&') continue; ent_start = s; repl = 0; if (s < end && *s == '#') { UV num = 0; int ok = 0; s++; if (s < end && (*s == 'x' || *s == 'X')) { s++; while (s < end) { char *tmp = strchr(PL_hexdigit, *s); if (!tmp) break; num = num << 4 | ((tmp - PL_hexdigit) & 15); if (num > 0x10FFFF) { /* overflow */ ok = 0; break; } s++; ok = 1; } } else { while (s < end && isDIGIT(*s)) { num = num * 10 + (*s - '0'); if (num > 0x10FFFF) { /* overflow */ ok = 0; break; } s++; ok = 1; } } if (num && ok) { if (!SvUTF8(sv) && num <= 255) { buf[0] = (char) num; repl = buf; repl_len = 1; repl_utf8 = 0; } else if (num == 0xFFFE || num == 0xFFFF) { /* illegal */ } else { char *tmp; if ((num & 0xFFFFFC00) == 0xDC00) { /* low-surrogate */ if (high_surrogate != 0) { t -= 3; /* Back up past 0xFFFD */ num = ((high_surrogate - 0xD800) << 10) + (num - 0xDC00) + 0x10000; high_surrogate = 0; } else { num = 0xFFFD; } } else if ((num & 0xFFFFFC00) == 0xD800) { /* high-surrogate */ high_surrogate = num; num = 0xFFFD; } else { high_surrogate = 0; /* otherwise invalid? */ if ((num >= 0xFDD0 && num <= 0xFDEF) || ((num & 0xFFFE) == 0xFFFE) || num > 0x10FFFF) { num = 0xFFFD; } } tmp = (char*)uvuni_to_utf8((U8*)buf, num); repl = buf; repl_len = tmp - buf; repl_utf8 = 1; } } } else { char *ent_name = s; while (s < end && isALNUM(*s)) s++; if (ent_name != s && entity2char) { SV** svp; if ( (svp = hv_fetch(entity2char, ent_name, s - ent_name, 0)) || (*s == ';' && (svp = hv_fetch(entity2char, ent_name, s - ent_name + 1, 0))) ) { repl = SvPV(*svp, repl_len); repl_utf8 = SvUTF8(*svp); } else if (expand_prefix) { char *ss = s - 1; while (ss > ent_name) { svp = hv_fetch(entity2char, ent_name, ss - ent_name, 0); if (svp) { repl = SvPV(*svp, repl_len); repl_utf8 = SvUTF8(*svp); s = ss; break; } ss--; } } } high_surrogate = 0; } if (repl) { char *repl_allocated = 0; if (s < end && *s == ';') s++; t--; /* '&' already copied, undo it */ if (*s != '&') { high_surrogate = 0; } if (!SvUTF8(sv) && repl_utf8) { /* need to upgrade sv before we continue */ STRLEN before_gap_len = t - SvPVX(sv); char *before_gap = (char*)bytes_to_utf8((U8*)SvPVX(sv), &before_gap_len); STRLEN after_gap_len = end - s; char *after_gap = (char*)bytes_to_utf8((U8*)s, &after_gap_len); sv_setpvn(sv, before_gap, before_gap_len); sv_catpvn(sv, after_gap, after_gap_len); SvUTF8_on(sv); Safefree(before_gap); Safefree(after_gap); s = t = SvPVX(sv) + before_gap_len; end = SvPVX(sv) + before_gap_len + after_gap_len; } else if (SvUTF8(sv) && !repl_utf8) { repl = (char*)bytes_to_utf8((U8*)repl, &repl_len); repl_allocated = repl; } if (t + repl_len > s) { /* need to grow the string */ grow_gap(aTHX_ sv, repl_len - (s - t), &t, &s, &end); } /* copy replacement string into string */ while (repl_len--) *t++ = *repl++; if (repl_allocated) Safefree(repl_allocated); } else { while (ent_start < s) *t++ = *ent_start++; } } *t = '\0'; SvCUR_set(sv, t - SvPVX(sv)); return sv; } static bool has_hibit(char *s, char *e) { while (s < e) { U8 ch = *s++; if (!UTF8_IS_INVARIANT(ch)) { return 1; } } return 0; } EXTERN bool probably_utf8_chunk(pTHX_ char *s, STRLEN len) { char *e = s + len; STRLEN clen; /* ignore partial utf8 char at end of buffer */ while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1))) e--; if (s < e && UTF8_IS_START((U8)*(e - 1))) e--; clen = len - (e - s); if (clen && UTF8SKIP(e) == clen) { /* all promised continuation bytes are present */ e = s + len; } if (!has_hibit(s, e)) return 0; return is_utf8_string((U8*)s, e - s); } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/hparser.c��������������������������������������������������������������������������000644 �000765 �000024 �00000120655 14366103441 016545� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * Copyright 1999-2016, Gisle Aas * Copyright 1999-2000, Michael A. Chase * * This library is free software; you can redistribute it and/or * modify it under the same terms as Perl itself. */ #ifndef EXTERN #define EXTERN extern #endif #include "hctype.h" /* isH...() macros */ #include "tokenpos.h" /* dTOKEN; PUSH_TOKEN() */ const static struct literal_tag { int len; char* str; int is_cdata; } literal_mode_elem[] = { {6, "script", 1}, {5, "style", 1}, {3, "xmp", 1}, {6, "iframe", 1}, {9, "plaintext", 1}, {5, "title", 0}, {8, "textarea", 0}, {0, 0, 0} }; enum argcode { ARG_SELF = 1, /* need to avoid '\0' in argspec string */ ARG_TOKENS, ARG_TOKENPOS, ARG_TOKEN0, ARG_TAGNAME, ARG_TAG, ARG_ATTR, ARG_ATTRARR, ARG_ATTRSEQ, ARG_TEXT, ARG_DTEXT, ARG_IS_CDATA, ARG_SKIPPED_TEXT, ARG_OFFSET, ARG_OFFSET_END, ARG_LENGTH, ARG_LINE, ARG_COLUMN, ARG_EVENT, ARG_UNDEF, ARG_LITERAL, /* Always keep last */ /* extra flags always encoded first */ ARG_FLAG_FLAT_ARRAY }; static const char * const argname[] = { /* Must be in the same order as enum argcode */ "self", /* ARG_SELF */ "tokens", /* ARG_TOKENS */ "tokenpos", /* ARG_TOKENPOS */ "token0", /* ARG_TOKEN0 */ "tagname", /* ARG_TAGNAME */ "tag", /* ARG_TAG */ "attr", /* ARG_ATTR */ "@attr", /* ARG_ATTRARR */ "attrseq", /* ARG_ATTRSEQ */ "text", /* ARG_TEXT */ "dtext", /* ARG_DTEXT */ "is_cdata", /* ARG_IS_CDATA */ "skipped_text", /* ARG_SKIPPED_TEXT */ "offset", /* ARG_OFFSET */ "offset_end", /* ARG_OFFSET_END */ "length", /* ARG_LENGTH */ "line", /* ARG_LINE */ "column", /* ARG_COLUMN */ "event", /* ARG_EVENT */ "undef", /* ARG_UNDEF */ /* ARG_LITERAL (not compared) */ /* ARG_FLAG_FLAT_ARRAY */ }; #define CASE_SENSITIVE(p_state) \ ((p_state)->xml_mode || (p_state)->case_sensitive) #define STRICT_NAMES(p_state) \ ((p_state)->xml_mode || (p_state)->strict_names) #define ALLOW_EMPTY_TAG(p_state) \ ((p_state)->xml_mode || (p_state)->empty_element_tags) static void flush_pending_text(PSTATE* p_state, SV* self); /* * Parser functions. * * parse() - top level entry point. * deals with text and calls one of its * subordinate parse_*() routines after * looking at the first char after "<" * parse_decl() - deals with declarations <!...> * parse_comment() - deals with <!-- ... --> * parse_marked_section - deals with <![ ... [ ... ]]> * parse_end() - deals with end tags </...> * parse_start() - deals with start tags <A...> * parse_process() - deals with process instructions <?...> * parse_null() - deals with anything else <....> * * report_event() - called whenever any of the parse*() routines * has recongnized something. */ static void report_event(PSTATE* p_state, event_id_t event, char *beg, char *end, U32 utf8, token_pos_t *tokens, int num_tokens, SV* self ) { struct p_handler *h; dTHX; dSP; AV *array; STRLEN my_na; char *argspec; char *s; STRLEN offset; STRLEN line; STRLEN column; #define CHR_DIST(a,b) (utf8 ? utf8_distance((U8*)(a),(U8*)(b)) : (a) - (b)) /* some events might still fire after a handler has signaled eof * so suppress them here. */ if (p_state->eof) return; /* capture offsets */ offset = p_state->offset; line = p_state->line; column = p_state->column; #if 0 { /* used for debugging at some point */ char *s = beg; int i; /* print debug output */ switch(event) { case E_DECLARATION: printf("DECLARATION"); break; case E_COMMENT: printf("COMMENT"); break; case E_START: printf("START"); break; case E_END: printf("END"); break; case E_TEXT: printf("TEXT"); break; case E_PROCESS: printf("PROCESS"); break; case E_NONE: printf("NONE"); break; default: printf("EVENT #%d", event); break; } printf(" ["); while (s < end) { if (*s == '\n') { putchar('\\'); putchar('n'); } else putchar(*s); s++; } printf("] %d\n", end - beg); for (i = 0; i < num_tokens; i++) { printf(" token %d: %d %d\n", i, tokens[i].beg - beg, tokens[i].end - tokens[i].beg); } } #endif if (p_state->pending_end_tag && event != E_TEXT && event != E_COMMENT) { token_pos_t t; char dummy; t.beg = p_state->pending_end_tag; t.end = p_state->pending_end_tag + strlen(p_state->pending_end_tag); p_state->pending_end_tag = 0; report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self); SPAGAIN; } /* update offsets */ p_state->offset += CHR_DIST(end, beg); if (line) { char *s = beg; char *nl = NULL; while (s < end) { if (*s == '\n') { p_state->line++; nl = s; } s++; } if (nl) p_state->column = CHR_DIST(end, nl) - 1; else p_state->column += CHR_DIST(end, beg); } if (event == E_NONE) goto IGNORE_EVENT; #ifdef MARKED_SECTION if (p_state->ms == MS_IGNORE) goto IGNORE_EVENT; #endif /* tag filters */ if (p_state->ignore_tags || p_state->report_tags || p_state->ignore_elements) { if (event == E_START || event == E_END) { SV* tagname = p_state->tmp; assert(num_tokens >= 1); sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg); if (utf8) SvUTF8_on(tagname); else SvUTF8_off(tagname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ tagname); if (p_state->ignoring_element) { if (sv_eq(p_state->ignoring_element, tagname)) { if (event == E_START) p_state->ignore_depth++; else if (--p_state->ignore_depth == 0) { SvREFCNT_dec(p_state->ignoring_element); p_state->ignoring_element = 0; } } goto IGNORE_EVENT; } if (p_state->ignore_elements && hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0)) { if (event == E_START) { p_state->ignoring_element = newSVsv(tagname); p_state->ignore_depth = 1; } goto IGNORE_EVENT; } if (p_state->ignore_tags && hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } if (p_state->report_tags && !hv_fetch_ent(p_state->report_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } } else if (p_state->ignoring_element) { goto IGNORE_EVENT; } } h = &p_state->handlers[event]; if (!h->cb) { /* event = E_DEFAULT; */ h = &p_state->handlers[E_DEFAULT]; if (!h->cb) goto IGNORE_EVENT; } if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) { /* FALSE scalar ('' or 0) means IGNORE this event */ return; } if (p_state->unbroken_text && event == E_TEXT) { /* should buffer text */ if (!p_state->pend_text) p_state->pend_text = newSV(256); if (SvOK(p_state->pend_text)) { if (p_state->is_cdata != p_state->pend_text_is_cdata) { flush_pending_text(p_state, self); SPAGAIN; goto INIT_PEND_TEXT; } } else { INIT_PEND_TEXT: p_state->pend_text_offset = offset; p_state->pend_text_line = line; p_state->pend_text_column = column; p_state->pend_text_is_cdata = p_state->is_cdata; sv_setpvs(p_state->pend_text, ""); if (!utf8) SvUTF8_off(p_state->pend_text); } if (utf8 && !SvUTF8(p_state->pend_text)) sv_utf8_upgrade(p_state->pend_text); if (utf8 || !SvUTF8(p_state->pend_text)) { sv_catpvn(p_state->pend_text, beg, end - beg); } else { SV *tmp = newSVpvn(beg, end - beg); sv_utf8_upgrade(tmp); sv_catsv(p_state->pend_text, tmp); SvREFCNT_dec(tmp); } return; } else if (p_state->pend_text && SvOK(p_state->pend_text)) { flush_pending_text(p_state, self); SPAGAIN; } /* At this point we have decided to generate an event callback */ argspec = h->argspec ? SvPV(h->argspec, my_na) : ""; if (SvTYPE(h->cb) == SVt_PVAV) { if (*argspec == ARG_FLAG_FLAT_ARRAY) { argspec++; array = (AV*)h->cb; } else { /* start sub-array for accumulator array */ array = newAV(); } } else { array = 0; if (*argspec == ARG_FLAG_FLAT_ARRAY) argspec++; /* start argument stack for callback */ ENTER; SAVETMPS; PUSHMARK(SP); } for (s = argspec; *s; s++) { SV* arg = 0; int push_arg = 1; enum argcode argcode = (enum argcode)*s; switch( argcode ) { case ARG_SELF: arg = sv_mortalcopy(self); break; case ARG_TOKENS: if (num_tokens >= 1) { AV* av = newAV(); SV* prev_token = &PL_sv_undef; int i; av_extend(av, num_tokens); for (i = 0; i < num_tokens; i++) { if (tokens[i].beg) { prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); if (utf8) SvUTF8_on(prev_token); av_push(av, prev_token); } else { /* boolean */ av_push(av, p_state->bool_attr_val ? newSVsv(p_state->bool_attr_val) : newSVsv(prev_token)); } } arg = sv_2mortal(newRV_noinc((SV*)av)); } break; case ARG_TOKENPOS: if (num_tokens >= 1 && tokens[0].beg >= beg) { AV* av = newAV(); int i; av_extend(av, num_tokens*2); for (i = 0; i < num_tokens; i++) { if (tokens[i].beg) { av_push(av, newSViv(CHR_DIST(tokens[i].beg, beg))); av_push(av, newSViv(CHR_DIST(tokens[i].end, tokens[i].beg))); } else { /* boolean tag value */ av_push(av, newSViv(0)); av_push(av, newSViv(0)); } } arg = sv_2mortal(newRV_noinc((SV*)av)); } break; case ARG_TOKEN0: case ARG_TAGNAME: /* fall through */ case ARG_TAG: if (num_tokens >= 1) { arg = sv_2mortal(newSVpvn(tokens[0].beg, tokens[0].end - tokens[0].beg)); if (utf8) SvUTF8_on(arg); if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0) sv_lower(aTHX_ arg); if (argcode == ARG_TAG && event != E_START) { char *e_type = "!##/#?#"; sv_insert(arg, 0, 0, &e_type[event], 1); } } break; case ARG_ATTR: case ARG_ATTRARR: if (event == E_START) { HV* hv; int i; if (argcode == ARG_ATTR) { hv = newHV(); arg = sv_2mortal(newRV_noinc((SV*)hv)); } else { #ifdef __GNUC__ /* gcc -Wall reports this variable as possibly used uninitialized */ hv = 0; #endif push_arg = 0; /* deal with argument pushing here */ } for (i = 1; i < num_tokens; i += 2) { SV* attrname = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); SV* attrval; if (utf8) SvUTF8_on(attrname); if (tokens[i+1].beg) { char *beg = tokens[i+1].beg; STRLEN len = tokens[i+1].end - beg; if (*beg == '"' || *beg == '\'' || (*beg == '`' && p_state->backquote)) { assert(len >= 2 && *beg == beg[len-1]); beg++; len -= 2; } attrval = newSVpvn(beg, len); if (utf8) SvUTF8_on(attrval); if (!p_state->attr_encoded) { if (p_state->utf8_mode) { sv_utf8_decode(attrval); sv_utf8_upgrade(attrval); } decode_entities(aTHX_ attrval, p_state->entity2char, 0); if (p_state->utf8_mode) SvUTF8_off(attrval); } } else { /* boolean */ if (p_state->bool_attr_val) attrval = newSVsv(p_state->bool_attr_val); else attrval = newSVsv(attrname); } if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ attrname); if (argcode == ARG_ATTR) { if (hv_exists_ent(hv, attrname, 0) || !hv_store_ent(hv, attrname, attrval, 0)) { SvREFCNT_dec(attrval); } SvREFCNT_dec(attrname); } else { /* ARG_ATTRARR */ if (array) { av_push(array, attrname); av_push(array, attrval); } else { mXPUSHs(attrname); mXPUSHs(attrval); } } } } else if (argcode == ARG_ATTRARR) { push_arg = 0; } break; case ARG_ATTRSEQ: /* (v2 compatibility stuff) */ if (event == E_START) { AV* av = newAV(); int i; for (i = 1; i < num_tokens; i += 2) { SV* attrname = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); if (utf8) SvUTF8_on(attrname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ attrname); av_push(av, attrname); } arg = sv_2mortal(newRV_noinc((SV*)av)); } break; case ARG_TEXT: arg = sv_2mortal(newSVpvn(beg, end - beg)); if (utf8) SvUTF8_on(arg); break; case ARG_DTEXT: if (event == E_TEXT) { arg = sv_2mortal(newSVpvn(beg, end - beg)); if (utf8) SvUTF8_on(arg); if (!p_state->is_cdata) { if (p_state->utf8_mode) { sv_utf8_decode(arg); sv_utf8_upgrade(arg); } decode_entities(aTHX_ arg, p_state->entity2char, 1); if (p_state->utf8_mode) SvUTF8_off(arg); } } break; case ARG_IS_CDATA: if (event == E_TEXT) { arg = boolSV(p_state->is_cdata); } break; case ARG_SKIPPED_TEXT: arg = sv_2mortal(p_state->skipped_text); p_state->skipped_text = newSVpvs(""); break; case ARG_OFFSET: arg = sv_2mortal(newSViv(offset)); break; case ARG_OFFSET_END: arg = sv_2mortal(newSViv(offset + CHR_DIST(end, beg))); break; case ARG_LENGTH: arg = sv_2mortal(newSViv(CHR_DIST(end, beg))); break; case ARG_LINE: arg = sv_2mortal(newSViv(line)); break; case ARG_COLUMN: arg = sv_2mortal(newSViv(column)); break; case ARG_EVENT: assert(event >= 0 && event < EVENT_COUNT); arg = sv_2mortal(newSVpv(event_id_str[event], 0)); break; case ARG_LITERAL: { int len = (unsigned char)s[1]; arg = sv_2mortal(newSVpvn(s+2, len)); if (SvUTF8(h->argspec)) SvUTF8_on(arg); s += len + 1; } break; case ARG_UNDEF: arg = sv_mortalcopy(&PL_sv_undef); break; default: arg = sv_2mortal(newSVpvf("Bad argspec %d", *s)); break; } if (push_arg) { if (!arg) arg = sv_mortalcopy(&PL_sv_undef); if (array) { /* have to fix mortality here or add mortality to * XPUSHs after removing it from the switch cases. */ av_push(array, SvREFCNT_inc(arg)); } else { XPUSHs(arg); } } } if (array) { if (array != (AV*)h->cb) av_push((AV*)h->cb, newRV_noinc((SV*)array)); } else { PUTBACK; if ((enum argcode)*argspec == ARG_SELF && !SvROK(h->cb)) { char *method = SvPV(h->cb, my_na); call_method(method, G_DISCARD | G_EVAL | G_VOID); } else { call_sv(h->cb, G_DISCARD | G_EVAL | G_VOID); } if (SvTRUE(ERRSV)) { RETHROW; } FREETMPS; LEAVE; } if (p_state->skipped_text) SvCUR_set(p_state->skipped_text, 0); return; IGNORE_EVENT: if (p_state->skipped_text) { if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text)) flush_pending_text(p_state, self); if (utf8 && !SvUTF8(p_state->skipped_text)) sv_utf8_upgrade(p_state->skipped_text); if (utf8 || !SvUTF8(p_state->skipped_text)) { sv_catpvn(p_state->skipped_text, beg, end - beg); } else { SV *tmp = newSVpvn(beg, end - beg); sv_utf8_upgrade(tmp); sv_catsv(p_state->skipped_text, tmp); SvREFCNT_dec(tmp); } } #undef CHR_DIST return; } EXTERN SV* argspec_compile(SV* src, PSTATE* p_state) { dTHX; SV* argspec = newSVpvs(""); STRLEN len; char *s = SvPV(src, len); char *end = s + len; if (SvUTF8(src)) SvUTF8_on(argspec); while (isHSPACE(*s)) s++; if (*s == '@') { /* try to deal with '@{ ... }' wrapping */ char *tmp = s + 1; while (isHSPACE(*tmp)) tmp++; if (*tmp == '{') { char c = ARG_FLAG_FLAT_ARRAY; sv_catpvn(argspec, &c, 1); tmp++; while (isHSPACE(*tmp)) tmp++; s = tmp; } } while (s < end) { if (isHNAME_FIRST(*s) || *s == '@') { char *name = s; int a = ARG_SELF; const char * const *arg_name; s++; while (isHNAME_CHAR(*s)) s++; /* check identifier */ for ( arg_name = argname; a < ARG_LITERAL ; ++a, ++arg_name ) { if (strnEQ(*arg_name, name, s - name) && (*arg_name)[s - name] == '\0') break; } if (a < ARG_LITERAL) { char c = (unsigned char) a; sv_catpvn(argspec, &c, 1); if (a == ARG_LINE || a == ARG_COLUMN) { if (!p_state->line) p_state->line = 1; /* enable tracing of line/column */ } if (a == ARG_SKIPPED_TEXT) { if (!p_state->skipped_text) { p_state->skipped_text = newSVpvs(""); } } if (a == ARG_ATTR || a == ARG_ATTRARR) { if (p_state->argspec_entity_decode != ARG_DTEXT) p_state->argspec_entity_decode = ARG_ATTR; } else if (a == ARG_DTEXT) { p_state->argspec_entity_decode = ARG_DTEXT; } } else { croak("Unrecognized identifier %.*s in argspec", (int) (s - name), name); } } else if (*s == '"' || *s == '\'') { char *string_beg = s; s++; while (s < end && *s != *string_beg && *s != '\\') s++; if (*s == *string_beg) { /* literal */ int len = s - string_beg - 1; unsigned char buf[2]; if (len > 255) croak("Literal string is longer than 255 chars in argspec"); buf[0] = ARG_LITERAL; buf[1] = len; sv_catpvn(argspec, (char*)buf, 2); sv_catpvn(argspec, string_beg+1, len); s++; } else if (*s == '\\') { croak("Backslash reserved for literal string in argspec"); } else { croak("Unterminated literal string in argspec"); } } else { croak("Bad argspec (%s)", s); } while (isHSPACE(*s)) s++; if (*s == '}' && SvPVX(argspec)[0] == ARG_FLAG_FLAT_ARRAY) { /* end of '@{ ... }' */ s++; while (isHSPACE(*s)) s++; if (s < end) croak("Bad argspec: stuff after @{...} (%s)", s); } if (s == end) break; if (*s != ',') { croak("Missing comma separator in argspec"); } s++; while (isHSPACE(*s)) s++; } return argspec; } static void flush_pending_text(PSTATE* p_state, SV* self) { dTHX; bool old_unbroken_text = p_state->unbroken_text; SV* old_pend_text = p_state->pend_text; bool old_is_cdata = p_state->is_cdata; STRLEN old_offset = p_state->offset; STRLEN old_line = p_state->line; STRLEN old_column = p_state->column; assert(p_state->pend_text && SvOK(p_state->pend_text)); p_state->unbroken_text = 0; p_state->pend_text = 0; p_state->is_cdata = p_state->pend_text_is_cdata; p_state->offset = p_state->pend_text_offset; p_state->line = p_state->pend_text_line; p_state->column = p_state->pend_text_column; report_event(p_state, E_TEXT, SvPVX(old_pend_text), SvEND(old_pend_text), SvUTF8(old_pend_text), 0, 0, self); SvOK_off(old_pend_text); p_state->unbroken_text = old_unbroken_text; p_state->pend_text = old_pend_text; p_state->is_cdata = old_is_cdata; p_state->offset = old_offset; p_state->line = old_line; p_state->column = old_column; } static char* skip_until_gt(char *beg, char *end) { /* tries to emulate quote skipping behaviour observed in MSIE */ char *s = beg; char quote = '\0'; char prev = ' '; while (s < end) { if (!quote && *s == '>') return s; if (*s == '"' || *s == '\'') { if (*s == quote) { quote = '\0'; /* end of quoted string */ } else if (!quote && (prev == ' ' || prev == '=')) { quote = *s; } } prev = *s++; } return end; } static char* parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; if (p_state->strict_comment) { dTOKENS(4); char *start_com = s; /* also used to signal inside/outside */ while (1) { /* try to locate "--" */ FIND_DASH_DASH: /* printf("find_dash_dash: [%s]\n", s); */ while (s < end && *s != '-' && *s != '>') s++; if (s == end) { FREE_TOKENS; return beg; } if (*s == '>') { s++; if (start_com) goto FIND_DASH_DASH; /* we are done recognizing all comments, make callbacks */ report_event(p_state, E_COMMENT, beg - 4, s, utf8, tokens, num_tokens, self); FREE_TOKENS; return s; } s++; if (s == end) { FREE_TOKENS; return beg; } if (*s == '-') { /* two dashes in a row seen */ s++; /* do something */ if (start_com) { PUSH_TOKEN(start_com, s-2); start_com = 0; } else { start_com = s; } } } } else if (p_state->no_dash_dash_comment_end) { token_pos_t token; token.beg = beg; /* a lone '>' signals end-of-comment */ while (s < end && *s != '>') s++; token.end = s; if (s < end) { s++; report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self); return s; } else { return beg; } } else { /* non-strict comment */ token_pos_t token; token.beg = beg; /* try to locate /--\s*>/ which signals end-of-comment */ LOCATE_END: while (s < end && *s != '-') s++; token.end = s; if (s < end) { s++; if (*s == '-') { s++; while (isHSPACE(*s)) s++; if (*s == '>') { s++; /* yup */ report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self); return s; } } if (s < end) { s = token.end + 1; goto LOCATE_END; } } if (s == end) return beg; } return 0; } #ifdef MARKED_SECTION static void marked_section_update(PSTATE* p_state) { dTHX; /* we look at p_state->ms_stack to determine p_state->ms */ AV* ms_stack = p_state->ms_stack; p_state->ms = MS_NONE; if (ms_stack) { int stack_len = av_len(ms_stack); int stack_idx; for (stack_idx = 0; stack_idx <= stack_len; stack_idx++) { SV** svp = av_fetch(ms_stack, stack_idx, 0); if (svp) { AV* tokens = (AV*)SvRV(*svp); int tokens_len = av_len(tokens); int i; assert(SvTYPE(tokens) == SVt_PVAV); for (i = 0; i <= tokens_len; i++) { SV** svp = av_fetch(tokens, i, 0); if (svp) { STRLEN len; char *token_str = SvPV(*svp, len); enum marked_section_t token; if (strEQ(token_str, "include")) token = MS_INCLUDE; else if (strEQ(token_str, "rcdata")) token = MS_RCDATA; else if (strEQ(token_str, "cdata")) token = MS_CDATA; else if (strEQ(token_str, "ignore")) token = MS_IGNORE; else token = MS_NONE; if (p_state->ms < token) p_state->ms = token; } } } } } /* printf("MS %d\n", p_state->ms); */ p_state->is_cdata = (p_state->ms == MS_CDATA); return; } static char* parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { dTHX; char *s; AV* tokens = 0; if (!p_state->marked_sections) return 0; assert(beg[0] == '<'); assert(beg[1] == '!'); assert(beg[2] == '['); s = beg + 3; FIND_NAMES: while (isHSPACE(*s)) s++; while (isHNAME_FIRST(*s)) { char *name_start = s; char *name_end; SV *name; s++; while (isHNAME_CHAR(*s)) s++; name_end = s; while (isHSPACE(*s)) s++; if (s == end) goto PREMATURE; if (!tokens) tokens = newAV(); name = newSVpvn(name_start, name_end - name_start); if (utf8) SvUTF8_on(name); av_push(tokens, sv_lower(aTHX_ name)); } if (*s == '-') { s++; if (*s == '-') { /* comment */ s++; while (1) { while (s < end && *s != '-') s++; if (s == end) goto PREMATURE; s++; /* skip first '-' */ if (*s == '-') { s++; /* comment finished */ goto FIND_NAMES; } } } else goto FAIL; } if (*s == '[') { s++; /* yup */ if (!tokens) { tokens = newAV(); av_push(tokens, newSVpvs("include")); } if (!p_state->ms_stack) p_state->ms_stack = newAV(); av_push(p_state->ms_stack, newRV_noinc((SV*)tokens)); marked_section_update(p_state); report_event(p_state, E_NONE, beg, s, utf8, 0, 0, self); return s; } FAIL: SvREFCNT_dec(tokens); return 0; /* not yet implemented */ PREMATURE: SvREFCNT_dec(tokens); return beg; } #endif static char* parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg + 2; if (*s == '-') { /* comment? */ char *tmp; s++; if (s == end) return beg; if (*s != '-') goto DECL_FAIL; /* nope, illegal */ /* yes, two dashes seen */ s++; tmp = parse_comment(p_state, s, end, utf8, self); return (tmp == s) ? beg : tmp; } #ifdef MARKED_SECTION if (*s == '[') { /* marked section */ char *tmp; tmp = parse_marked_section(p_state, beg, end, utf8, self); if (!tmp) goto DECL_FAIL; return tmp; } #endif if (*s == '>') { /* make <!> into empty comment <SGML Handbook 36:32> */ token_pos_t token; token.beg = s; token.end = s; s++; report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } if (isALPHA(*s)) { dTOKENS(8); char *decl_id = s; STRLEN decl_id_len; s++; /* declaration */ while (s < end && isHNAME_CHAR(*s)) s++; decl_id_len = s - decl_id; if (s == end) goto PREMATURE; /* just hardcode a few names as the recognized declarations */ if (!((decl_id_len == 7 && strnEQx(decl_id, "DOCTYPE", 7, !CASE_SENSITIVE(p_state))) || (decl_id_len == 6 && strnEQx(decl_id, "ENTITY", 6, !CASE_SENSITIVE(p_state))) ) ) { goto FAIL; } /* first word available */ PUSH_TOKEN(decl_id, s); while (1) { while (s < end && isHSPACE(*s)) s++; if (s == end) goto PREMATURE; if (*s == '"' || *s == '\'' || (*s == '`' && p_state->backquote)) { char *str_beg = s; s++; while (s < end && *s != *str_beg) s++; if (s == end) goto PREMATURE; s++; PUSH_TOKEN(str_beg, s); } else if (*s == '-') { /* comment */ char *com_beg = s; s++; if (s == end) goto PREMATURE; if (*s != '-') goto FAIL; s++; while (1) { while (s < end && *s != '-') s++; if (s == end) goto PREMATURE; s++; if (s == end) goto PREMATURE; if (*s == '-') { s++; PUSH_TOKEN(com_beg, s); break; } } } else if (*s != '>') { /* plain word */ char *word_beg = s; s++; while (s < end && isHNOT_SPACE_GT(*s)) s++; if (s == end) goto PREMATURE; PUSH_TOKEN(word_beg, s); } else { break; } } if (s == end) goto PREMATURE; if (*s == '>') { s++; report_event(p_state, E_DECLARATION, beg, s, utf8, tokens, num_tokens, self); FREE_TOKENS; return s; } FAIL: FREE_TOKENS; goto DECL_FAIL; PREMATURE: FREE_TOKENS; return beg; } DECL_FAIL: if (p_state->strict_comment) return 0; /* consider everything up to the first '>' a comment */ while (s < end && *s != '>') s++; if (s < end) { token_pos_t token; token.beg = beg + 2; token.end = s; s++; report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } else { return beg; } } static char* parse_start(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; int empty_tag = 0; dTOKENS(16); hctype_t tag_name_char; hctype_t attr_name_first, attr_name_char; if (STRICT_NAMES(p_state)) { attr_name_first = HCTYPE_NAME_FIRST; tag_name_char = attr_name_char = HCTYPE_NAME_CHAR; } else { tag_name_char = HCTYPE_NOT_SPACE_GT; attr_name_first = HCTYPE_NOT_SPACE_GT; attr_name_char = HCTYPE_NOT_SPACE_EQ_GT; } s += 2; while (s < end && isHCTYPE(*s, tag_name_char)) { if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { if ((s + 1) == end) goto PREMATURE; if (*(s + 1) == '>') break; } s++; } PUSH_TOKEN(beg+1, s); /* tagname */ while (isHSPACE(*s)) s++; if (s == end) goto PREMATURE; while (isHCTYPE(*s, attr_name_first)) { /* attribute */ char *attr_name_beg = s; char *attr_name_end; if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { if ((s + 1) == end) goto PREMATURE; if (*(s + 1) == '>') break; } s++; while (s < end && isHCTYPE(*s, attr_name_char)) { if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { if ((s + 1) == end) goto PREMATURE; if (*(s + 1) == '>') break; } s++; } if (s == end) goto PREMATURE; attr_name_end = s; PUSH_TOKEN(attr_name_beg, attr_name_end); /* attr name */ while (isHSPACE(*s)) s++; if (s == end) goto PREMATURE; if (*s == '=') { /* with a value */ s++; while (isHSPACE(*s)) s++; if (s == end) goto PREMATURE; if (*s == '>') { /* parse it similar to ="" */ PUSH_TOKEN(s, s); break; } if (*s == '"' || *s == '\'' || (*s == '`' && p_state->backquote)) { char *str_beg = s; s++; while (s < end && *s != *str_beg) s++; if (s == end) goto PREMATURE; s++; PUSH_TOKEN(str_beg, s); } else { char *word_start = s; while (s < end && isHNOT_SPACE_GT(*s)) { if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { if ((s + 1) == end) goto PREMATURE; if (*(s + 1) == '>') break; } s++; } if (s == end) goto PREMATURE; PUSH_TOKEN(word_start, s); } while (isHSPACE(*s)) s++; if (s == end) goto PREMATURE; } else { PUSH_TOKEN(0, 0); /* boolean attr value */ } } if (ALLOW_EMPTY_TAG(p_state) && *s == '/') { s++; if (s == end) goto PREMATURE; empty_tag = 1; } if (*s == '>') { s++; /* done */ report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self); if (empty_tag) { report_event(p_state, E_END, s, s, utf8, tokens, 1, self); } else if (!p_state->xml_mode) { /* find out if this start tag should put us into literal_mode */ int i; int tag_len = tokens[0].end - tokens[0].beg; for (i = 0; literal_mode_elem[i].len; i++) { if (tag_len == literal_mode_elem[i].len) { /* try to match it */ char *s = beg + 1; char *t = literal_mode_elem[i].str; int len = tag_len; while (len) { if (toLOWER(*s) != *t) break; s++; t++; if (!--len) { /* found it */ p_state->literal_mode = literal_mode_elem[i].str; p_state->is_cdata = literal_mode_elem[i].is_cdata; /* printf("Found %s\n", p_state->literal_mode); */ goto END_OF_LITERAL_SEARCH; } } } } END_OF_LITERAL_SEARCH: ; } FREE_TOKENS; return s; } FREE_TOKENS; return 0; PREMATURE: FREE_TOKENS; return beg; } static char* parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg+2; hctype_t name_first, name_char; if (STRICT_NAMES(p_state)) { name_first = HCTYPE_NAME_FIRST; name_char = HCTYPE_NAME_CHAR; } else { name_first = name_char = HCTYPE_NOT_SPACE_GT; } if (isHCTYPE(*s, name_first)) { token_pos_t tagname; tagname.beg = s; s++; while (s < end && isHCTYPE(*s, name_char)) s++; tagname.end = s; if (p_state->strict_end) { while (isHSPACE(*s)) s++; } else { s = skip_until_gt(s, end); } if (s < end) { if (*s == '>') { s++; /* a complete end tag has been recognized */ report_event(p_state, E_END, beg, s, utf8, &tagname, 1, self); return s; } } else { return beg; } } else if (!p_state->strict_comment) { s = skip_until_gt(s, end); if (s < end) { token_pos_t token; token.beg = beg + 2; token.end = s; s++; report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); return s; } else { return beg; } } return 0; } static char* parse_process(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg + 2; /* skip '<?' */ /* processing instruction */ token_pos_t token_pos; token_pos.beg = s; while (s < end) { if (*s == '>') { token_pos.end = s; s++; if (p_state->xml_mode || p_state->xml_pic) { /* XML processing instructions are ended by "?>" */ if (s - beg < 4 || s[-2] != '?') continue; token_pos.end = s - 2; } /* a complete processing instruction seen */ report_event(p_state, E_PROCESS, beg, s, utf8, &token_pos, 1, self); return s; } s++; } return beg; /* could not find end */ } #ifdef USE_PFUNC static char* parse_null(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { return 0; } #include "pfunc.h" /* declares the parsefunc[] */ #endif /* USE_PFUNC */ static char* parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) { char *s = beg; char *t = beg; char *new_pos; while (!p_state->eof) { /* * At the start of this loop we will always be ready for eating text * or a new tag. We will never be inside some tag. The 't' points * to where we started and the 's' is advanced as we go. */ while (p_state->literal_mode) { char *l = p_state->literal_mode; char *end_text; while (s < end && *s != '<') { s++; } if (s == end) { s = t; goto DONE; } end_text = s; s++; /* here we rely on '\0' termination of perl svpv buffers */ if (*s == '/') { s++; while (*l && toLOWER(*s) == *l) { s++; l++; } if (!*l && (strNE(p_state->literal_mode, "plaintext") || p_state->closing_plaintext)) { /* matched it all */ token_pos_t end_token; end_token.beg = end_text + 2; end_token.end = s; while (isHSPACE(*s)) s++; if (*s == '>') { s++; if (t != end_text) report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); report_event(p_state, E_END, end_text, s, utf8, &end_token, 1, self); p_state->literal_mode = 0; p_state->is_cdata = 0; t = s; } } } } #ifdef MARKED_SECTION while (p_state->ms == MS_CDATA || p_state->ms == MS_RCDATA) { while (s < end && *s != ']') s++; if (*s == ']') { char *end_text = s; s++; if (*s == ']' && *(s + 1) == '>') { s += 2; /* marked section end */ if (t != end_text) report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self); t = s; SvREFCNT_dec(av_pop(p_state->ms_stack)); marked_section_update(p_state); continue; } } if (s == end) { s = t; goto DONE; } } #endif /* first we try to match as much text as possible */ while (s < end && *s != '<') { #ifdef MARKED_SECTION if (p_state->ms && *s == ']') { char *end_text = s; s++; if (*s == ']') { s++; if (*s == '>') { s++; report_event(p_state, E_TEXT, t, end_text, utf8, 0, 0, self); report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self); t = s; SvREFCNT_dec(av_pop(p_state->ms_stack)); marked_section_update(p_state); continue; } } } #endif s++; } if (s != t) { if (*s == '<') { report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self); t = s; } else { s--; if (isHSPACE(*s)) { /* wait with white space at end */ while (s >= t && isHSPACE(*s)) s--; } else { /* might be a chopped up entities/words */ while (s >= t && !isHSPACE(*s)) s--; while (s >= t && isHSPACE(*s)) s--; } s++; if (s != t) report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self); break; } } if (end - s < 3) break; /* next char is known to be '<' and pointed to by 't' as well as 's' */ s++; #ifdef USE_PFUNC new_pos = parsefunc[(unsigned char)*s](p_state, t, end, utf8, self); #else if (isHNAME_FIRST(*s)) new_pos = parse_start(p_state, t, end, utf8, self); else if (*s == '/') new_pos = parse_end(p_state, t, end, utf8, self); else if (*s == '!') new_pos = parse_decl(p_state, t, end, utf8, self); else if (*s == '?') new_pos = parse_process(p_state, t, end, utf8, self); else new_pos = 0; #endif /* USE_PFUNC */ if (new_pos) { if (new_pos == t) { /* no progress, need more data to know what it is */ s = t; break; } t = s = new_pos; } /* if we get out here then this was not a conforming tag, so * treat it is plain text at the top of the loop again (we * have already skipped past the "<"). */ } DONE: return s; } EXTERN void parse(pTHX_ PSTATE* p_state, SV* chunk, SV* self) { char *s, *beg, *end; U32 utf8 = 0; STRLEN len; if (!p_state->start_document) { char dummy[1]; report_event(p_state, E_START_DOCUMENT, dummy, dummy, 0, 0, 0, self); p_state->start_document = 1; } if (!chunk) { /* eof */ char empty[1]; if (p_state->buf && SvOK(p_state->buf)) { /* flush it */ s = SvPV(p_state->buf, len); end = s + len; utf8 = SvUTF8(p_state->buf); assert(len); while (s < end) { if (p_state->literal_mode) { if (strEQ(p_state->literal_mode, "plaintext") || strEQ(p_state->literal_mode, "xmp") || strEQ(p_state->literal_mode, "iframe") || strEQ(p_state->literal_mode, "textarea")) { /* rest is considered text */ break; } if (strEQ(p_state->literal_mode, "script") || strEQ(p_state->literal_mode, "style")) { /* effectively make it an empty element */ token_pos_t t; char dummy; t.beg = p_state->literal_mode; t.end = p_state->literal_mode + strlen(p_state->literal_mode); report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self); } else { p_state->pending_end_tag = p_state->literal_mode; } p_state->literal_mode = 0; s = parse_buf(aTHX_ p_state, s, end, utf8, self); continue; } if (!p_state->strict_comment && !p_state->no_dash_dash_comment_end && *s == '<') { p_state->no_dash_dash_comment_end = 1; s = parse_buf(aTHX_ p_state, s, end, utf8, self); continue; } if (!p_state->strict_comment && *s == '<') { char *s1 = s + 1; if (s1 == end || isHNAME_FIRST(*s1) || *s1 == '/' || *s1 == '!' || *s1 == '?') { /* some kind of unterminated markup. Report rest as as comment */ token_pos_t token; token.beg = s + 1; token.end = end; report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self); s = end; } } break; } if (s < end) { /* report rest as text */ report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self); } SvREFCNT_dec(p_state->buf); p_state->buf = 0; } if (p_state->pend_text && SvOK(p_state->pend_text)) flush_pending_text(p_state, self); if (p_state->ignoring_element) { /* document not balanced */ SvREFCNT_dec(p_state->ignoring_element); p_state->ignoring_element = 0; } report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self); /* reset state */ p_state->offset = 0; if (p_state->line) p_state->line = 1; p_state->column = 0; p_state->start_document = 0; p_state->literal_mode = 0; p_state->is_cdata = 0; return; } if (p_state->utf8_mode) sv_utf8_downgrade(chunk, 0); if (p_state->buf && SvOK(p_state->buf)) { sv_catsv(p_state->buf, chunk); beg = SvPV(p_state->buf, len); utf8 = SvUTF8(p_state->buf); } else { beg = SvPV(chunk, len); utf8 = SvUTF8(chunk); if (p_state->offset == 0 && DOWARN) { /* Print warnings if we find unexpected Unicode BOM forms */ if (p_state->argspec_entity_decode && !(p_state->attr_encoded && p_state->argspec_entity_decode == ARG_ATTR) && !p_state->utf8_mode && ( (!utf8 && len >= 3 && strnEQ(beg, "\xEF\xBB\xBF", 3)) || (utf8 && len >= 6 && strnEQ(beg, "\xC3\xAF\xC2\xBB\xC2\xBF", 6)) || (!utf8 && probably_utf8_chunk(aTHX_ beg, len)) ) ) { warn("Parsing of undecoded UTF-8 will give garbage when decoding entities"); } if (utf8 && len >= 2 && strnEQ(beg, "\xFF\xFE", 2)) { warn("Parsing string decoded with wrong endianness"); } if (!utf8 && len >= 4 && (strnEQ(beg, "\x00\x00\xFE\xFF", 4) || strnEQ(beg, "\xFE\xFF\x00\x00", 4)) ) { warn("Parsing of undecoded UTF-32"); } else if (!utf8 && len >= 2 && (strnEQ(beg, "\xFE\xFF", 2) || strnEQ(beg, "\xFF\xFE", 2)) ) { warn("Parsing of undecoded UTF-16"); } } } if (!len) return; /* nothing to do */ end = beg + len; s = parse_buf(aTHX_ p_state, beg, end, utf8, self); if (s == end || p_state->eof) { if (p_state->buf) { SvOK_off(p_state->buf); } } else { /* need to keep rest in buffer */ if (p_state->buf) { /* chop off some chars at the beginning */ if (SvOK(p_state->buf)) { sv_chop(p_state->buf, s); } else { sv_setpvn(p_state->buf, s, end - s); if (utf8) SvUTF8_on(p_state->buf); else SvUTF8_off(p_state->buf); } } else { p_state->buf = newSVpv(s, end - s); if (utf8) SvUTF8_on(p_state->buf); } } return; } �����������������������������������������������������������������������������������HTML-Parser-3.81/cpanfile���������������������������������������������������������������������������000644 �000765 �000024 �00000001774 14366103441 016441� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������on 'runtime' => sub { requires 'strict'; requires 'Carp'; requires 'Exporter'; requires 'HTML::Tagset'; requires 'HTTP::Headers'; requires 'IO::File'; requires 'URI'; requires 'URI::URL'; requires 'XSLoader'; }; on 'configure' => sub { requires 'ExtUtils::MakeMaker' => '6.52'; }; on 'test' => sub { requires 'strict'; requires 'Config'; requires 'FileHandle'; requires 'File::Spec'; requires 'IO::File'; requires 'SelectSaver'; requires 'Test::More'; requires 'URI'; }; on 'develop' => sub { requires 'Dist::Zilla'; requires 'Dist::Zilla::PluginBundle::Starter' => 'v4.0.0'; requires 'Dist::Zilla::Plugin::MinimumPerl'; requires 'Pod::Coverage::TrustPod'; requires 'Test::CheckManifest' => '1.29'; requires 'Test::CPAN::Changes' => '0.4'; requires 'Test::CPAN::Meta'; requires 'Test::Kwalitee' => '1.22'; requires 'Test::Pod::Coverage'; requires 'Test::Pod::Spelling::CommonMistakes' => '1.000'; }; ����HTML-Parser-3.81/tokenpos.h�������������������������������������������������������������������������000644 �000765 �000024 �00000002166 14366103441 016744� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������struct token_pos { char *beg; char *end; }; typedef struct token_pos token_pos_t; #define dTOKENS(init_lim) \ token_pos_t token_buf[init_lim]; \ int token_lim = init_lim; \ token_pos_t *tokens = token_buf; \ int num_tokens = 0 #define PUSH_TOKEN(p_beg, p_end) \ STMT_START { \ ++num_tokens; \ if (num_tokens == token_lim) \ tokens_grow(&tokens, &token_lim, (bool)(tokens != token_buf)); \ tokens[num_tokens-1].beg = p_beg; \ tokens[num_tokens-1].end = p_end; \ } STMT_END #define FREE_TOKENS \ STMT_START { \ if (tokens != token_buf) \ Safefree(tokens); \ } STMT_END static void tokens_grow(token_pos_t **token_ptr, int *token_lim_ptr, bool tokens_on_heap) { int new_lim = *token_lim_ptr; if (new_lim < 4) new_lim = 4; new_lim *= 2; if (tokens_on_heap) { Renew(*token_ptr, new_lim, token_pos_t); } else { token_pos_t *new_tokens; int i; New(57, new_tokens, new_lim, token_pos_t); for (i = 0; i < *token_lim_ptr; i++) new_tokens[i] = (*token_ptr)[i]; *token_ptr = new_tokens; } *token_lim_ptr = new_lim; } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/Changes����������������������������������������������������������������������������000644 �000765 �000024 �00000114332 14366103441 016223� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������Change history for HTML-Parser 3.81 2023-01-30 - Stop depending on "Test" (GH#34) (James Raspass) - fix test scripts after conversion to Test::More (GH#35) (Graham Knop) 3.80 2022-11-01 * Fix compatibility with ancient perl by avoiding index in test (GH#33) (Graham Knop) 3.79 2022-10-12 * Modernise XS (GH#32) (James Raspass) * Skip threads on older perl versions, as they often segfault (GH#31) (Graham * Knop) 3.78 2022-03-28 * Remove unused variable (GH#26) (Michal Josef Špaček) 3.77 2022-03-14 * Update tests to remove HTML4 specific tags (GH#25) (Jess) 3.76 2021-03-04 * Add a fix for a stack confusion error on `eof`. (GH#21) (Matthew Horsfall and Chase Whitener) 3.75 2020-08-30 * Clean up the prereqs a bit * Mark HTML::Filter as deprecated as the docs point out * Move Parser.pm into the lib directory with the others. This will help with everything from auto version bumps after releases, to scanning for prerequisites and spelling errors. * Fix a few spelling errors in the POD for HTML::Parser * Clean up the spacing on many examples in HTML::Parser 3.74 2020-08-30 * Fix the order of date and version in this change log. (Thanks, haarg) * Convert to Dist::Zilla * Build all prereqs from our cpanfile * Go through all test files and: * perltidy * Use strict/warnings * Get rid of two-arg open * Get rid of BAREWORD filehandles * Fix the eval pattern used * Only use -w where we catch $SIG{__WARN__} * Fix encoding problems * use utf8 where we have unicode in the source * Fix a typo here and there * perltidy all of the example apps in eg/ * Add comments explaining the apps in eg/ (GH#13 Thanks, Salvatore Bonaccorso) * Print out UTF-8 encoded data where sensible in eg/ 3.73 2020-08-24 * Cleaned up this changes log. * Added a .mailmap file to organize contributions accurately. * Ensure all versions are equal and on the current version * Add the .mailmap to the MANIFEST * Change the META information to point to the new GH repository * Add a .perltidyrc to use going forward * Add hctype.h and pfunc.h to the dist as static files and stop asking for them to be built on the user's end. * Remove t/pod.t from userland testing * Remove t/pod-coverage.t from userland testing * Clean up the MANIFEST * Start testing via GitHub Actions/Workflows * Protect active parser from being freed (PR 13, RT #115034) 3.72 2016-01-19 * Avoid more clang casting warnings * Remove trailing whitespace * Ensure entities expand to utf8 sequences under 'utf8_mode' [RT#99755] * typo fixes (David Steinbrunner) * Silence clang warning (Jacques Germishuys) * const+static-ing (bulk88) 3.71 2013-05-09 * Transform ':' in headers to '-' [RT#80524] 3.70 2013-03-28 * Fix for cross-compiling with Buildroot (François Perrad) * Comment typo fix * Fix Issue #3 / RT #84144: HTML::Entities::decode_entities() needs to call SV_CHECK_THINKFIRST() before checking READONLY flag (Yves Orton) 3.69 2011-10-15 * Documentation fix; encode_utf8 mixup [RT#71151] * Make it clearer that there are 2 (actually 3) options for handing "UTF-8 garbage" * Github is the official repo * Can't be bothered to try to fix the failures that occur on perl-5.6 * fix to TokeParser to correctly handle option configuration (Barbie) * Aesthetic change: remove extra ; (Jon Jensen) * Trim surrounding whitespace from extracted URLs. (Ville Skyttä) 3.68 2010-09-01 * Declare the encoding of the POD to be utf8 3.67 2010-08-17 * bleadperl 2154eca7 breaks HTML::Parser 3.66 [RT#60368] (Nicholas Clark) 3.66 2010-07-09 * Fix entity decoding in utf8_mode for the title header 3.65 2010-04-04 * Eliminate buggy entities_decode_old * Fixed endianness typo [RT#50811] (Salvatore Bonaccorso) * Documentation Fixes. (Ville Skyttä) 3.64 2009-10-25 * Convert files to UTF-8 * Don't allow decode_entities() to generate illegal Unicode chars * Copyright 2009 * Remove rendundant (repeated) test * Make parse_file() method use 3-arg open [RT#49434] 3.63 2009-10-22 * Take more care to prepare the char range for encode_entities [RT#50170] * decode_entities confused by trailing incomplete entity 3.62 2009-08-13 * Doc patch: Make it clearer what the return value from ->parse is * HTTP::Header doc typo fix. (Ville Skyttä) * Do not bother tracking style or script, they're ignored. (Ville Skyttä) * Bring HTML 5 head elements up to date with WD-html5-20090423. (Ville Skyttä) * Improve HeadParser performance. (Ville Skyttä) 3.61 2009-06-20 * Test that triggers the crash that Chip fixed * Complete documented list of literal tags * Avoid crash (referenced pend_text instead of skipped_text) (Chip Salzenberg) * Reference HTML::LinkExttor [RT#43164] (Antonio Radici) 3.60 2009-02-09 * Spelling fixes. (Ville Skyttä) * Test multi-value headers. (Ville Skyttä) * Documentation improvements. (Ville Skyttä) * Do not terminate head parsing on the <object> element (added in HTML 4.0). (Ville Skyttä) * Add support for HTML 5 <meta charset> and new HEAD elements. (Ville Skyttä) * Short description of the htextsub example (Damyan Ivanov) * Suppress warning when encode_entities is called with undef [RT#27567] (Mike South) * HTML::Parser doesn't compile with perl 5.8.0. (Zefram) 3.59 2008-11-24 * Restore perl-5.6 compatibility for HTML::HeadParser. * Improved META.yml 3.58 2008-11-17 * Suppress "Parsing of undecoded UTF-8 will give garbage" warning with attr_encoded [RT#29089] * HTML::HeadParser: - Recognize the Unicode BOM in utf8_mode as well [RT#27522] - Avoid ending up with '/' keys attribute in Link headers. 3.57 2008-11-16 * The <iframe> element content is now parsed in literal mode. * Parsing of <script> and <style> content ends on the first end tag even when that tag was in a quoted string. That seems to be the behaviour of all modern browsers. * Implement backquote() attribute as requested by Alex Kapranoff. * Test and documentation tweaks from Alex Kapranoff. 3.56 2007-01-12 * Cloning of parser state for compatibility with threads. Fixed by Bo Lindbergh <blgl@hagernas.com>. * Don't require whitespace between declaration tokens. <http://rt.cpan.org/Ticket/Display.html?id=20864> 3.55 2006-07-10 * Treat <> at the end of document as text. Used to be reported as a comment. * Improved Firefox compatibility for bad HTML: - Unclosed <script>, <style> are now treated as empty tags. - Unclosed <textarea>, <xmp> and <plaintext> treat rest as text. - Unclosed <title> closes at next tag. * Make <!a'b> a comment by itself. 3.54 2006-04-28 * Yaakov Belch discovered yet another issue with <script> parsing. Enabling of 'empty_element_tags' got the parser confused if it found such a tag for elements that are normally parsed in literal mode. Of these <script src="..."/> is the only one likely to be found in documents. <http://rt.cpan.org//Ticket/Display.html?id=18965> 3.53 2006-04-27 * When ignore_element was enabled it got confused if the corresponding tags did not nest properly; the end tag was treated it as if it was a start tag. Found and fixed by Yaakov Belch <code@yaakovnet.net>. <http://rt.cpan.org/Ticket/Display.html?id=18936> 3.52 2006-04-26 * Make sure the 'start_document' fires exactly once for each document parsed. For earlier releases it did not fire at all for empty documents and could fire multiple times if parse was called with empty chunks. * Documentation tweaks and typo fixes. 3.51 2006-03-22 * Named entities outside the Latin-1 range are now only expanded when properly terminated with ";". This makes HTML::Parser compatible with Firefox/Konqueror/MSIE when it comes to how these entities are expanded in attribute values. Firefox does expand unterminated non-Latin-1 entities in plain text, so here HTML::Parser only stays compatible with Konqueror/MSIE. Fixes <http://rt.cpan.org/Ticket/Display.html?id=17962>. * Fixed some documentation typos spotted by <william@knowmad.com>. <http://rt.cpan.org/Ticket/Display.html?id=18062> 3.50 2006-02-14 * The 3.49 release didn't compile with VC++ because it mixed code and declarations. Fixed by Steve Hay <steve.hay@uk.radan.com>. 3.49 2006-02-08 * Events could sometimes still fire after a handler has signaled eof. * Marked_sections with text ending in square bracket parsed wrong. Fix provided by <paul.bijnens@xplanation.com>. <http://rt.cpan.org/Ticket/Display.html?id=16749> 3.48 2005-12-02 * Enabling empty_element_tags by default for HTML::TokeParser was a mistake. Reverted that change. <http://rt.cpan.org/Ticket/Display.html?id=16164> * When processing a document with "marked_sections => 1", the skipped text missed the first 3 bytes "<![". <http://rt.cpan.org/Ticket/Display.html?id=16207> 3.47 2005-11-22 * Added empty_element_tags and xml_pic configuration options. These make it possible to enable these XML features without enabling the full XML-mode. * The empty_element_tags is enabled by default for HTML::TokeParser. 3.46 2005-10-24 * Don't try to treat an literal   as space. This breaks Unicode parsing. <http://rt.cpan.org/Ticket/Display.html?id=15068> * The unbroken_text option is now on by default for HTML::TokeParser. * HTML::Entities::encode will now encode "'" by default. * Improved report/ignore_tags documentation by Norbert Kiesel <nkiesel@tbdnetworks.com>. * Test suite now use Test::More, by Norbert Kiesel <nkiesel@tbdnetworks.com>. * Fix HTML::Entities typo spotted by Stefan Funke <bundy@adm.arcor.net>. * Faster load time with XSLoader (perl-5.6 or better now required). * Fixed POD markup errors in some of the modules. 3.45 2005-01-06 * Fix stack memory leak caused by missing PUTBACK. Only code that used $p->parse(\&cb) form was affected. Fix provided by Gurusamy Sarathy <gsar@sophos.com>. 3.44 2004-12-28 * Fix confusion about nested quotes in <script> and <style> text. 3.43 2004-12-06 * The SvUTF8 flag was not propagated correctly when replacing unterminated entities. * Fixed test failure because of missing binmode on Windows. 3.42 2004-12-04 * Avoid sv_catpvn_utf8_upgrade() as that macro was not available in perl-5.8.0. Patch by Reed Russell <Russell.Reed@acxiom.com>. * Add casts to suppress compilation warnings for char/U8 mismatches. * HTML::HeadParser will always push new header values. This make sure we never loose old header values. 3.41 2004-11-30 * Fix unresolved symbol error with perl-5.005. 3.40 2004-11-29 * Make utf8_mode only available on perl-5.8 or better. It produced garbage with older versions of perl. * Emit warning if entities are decoded and something in the first chunk looks like hi-bit UTF-8. Previously this warning was only triggered for documents with BOM. 3.39_92 2004-11-23 * More documentation of the Unicode issues. Moved around HTML::Parser documentation a bit. * New boolean option; $p->utf8_mode to allow parsing of raw UTF-8. * Documented that HTML::Entities::decode_entities() can take multiple arguments. * Unterminated entities are now decoded in text (compatibility with MSIE misfeature). * Document HTML::Entities::_decode_entities(); this variation of the decode_entities() function has been available for a long time, but have not been documented until now. * HTML::Entities::_decode_entities() can now be told to try to expand unterminated entities. * Simplified Makefile.PL 3.39_91 2004-11-23 * The HTML::HeadParser will skip Unicode BOM. Previously it would consider the <head> section done when it saw the BOM. * The parser will look for Unicode BOM and give appropriate warnings if the form found indicate trouble. * If no matching end tag is found for <script>, <style>, <xmp> <title>, <textarea> then generate one where the next tag starts. * For <script> and <style> recognize quoted strings and don't consider end element if the corresponding end tag is found inside such a string. 3.39_90 2004-11-17 * The <title> element is now parsed in literal mode, which means that other tags are not recognized until has been seen. * Unicode support for perl-5.8 and better. * Decoding Unicode entities always enabled; no longer a compile time option. * Propagation of UTF8 state on strings. Patch contributed by John Gardiner Myers . * Calculate offsets and lengths in chars for Unicode strings. * Fixed link typo in the HTML::TokeParser documentation. 3.38 2004-11-11 * New boolean option; $p->closing_plaintext Contributed by Alex Kapranoff 3.37 2004-11-10 * Improved handling of HTML encoded surrogate pairs and illegally encoded Unicode; . Patch by John Gardiner Myers . * Avoid generating bad UTF8 strings when decoding entities representing chars beyond #255 in 8-bit strings. Such bad UTF8 sometimes made perl-5.8.5 and older segfault. * Undocument v2 style subclassing in synopsis section. * Internal cleanup: Make 'gcc -Wall' happier. * Avoid modification of PVs during parsing of attrspec. Another patch by John Gardiner Myers. 3.36 2004-04-01 * Improved MSIE/Mozilla compatibility. If the same attribute name repeats for a start tag, use the first value instead of the last. Patch by Nick Duffek . 3.35 2003-12-12 * Documentation fixes by Paul Croome . * Removed redundant dSP. 3.34 2003-10-27 * Fix segfault that happened when the parse callback caused the stack to get reallocated. The original bug report was 3.33 2003-10-14 * Perl 5.005 or better is now required. For some reason we get a test failure with perl-5.004 and I don't really feel like debugging that perl any more. Details about this failure can be found at . * New HTML::TokeParser method called 'get_phrase'. It returns all current text while ignoring any phrase-level markup. * The HTML::TokeParser method 'get_text' now expands skipped non-phrase-level tags as a single space. 3.32 2003-10-10 * If the document parsed ended with some kind of unterminated markup, then the parser state was not reset properly and this piece of markup would show up in the beginning of the next document parsed. * The get_text and get_trimmed_text methods of HTML::TokeParser can now take multiple end tags as argument. Patch by at . * Various documentation tweaks. * Included another example program: hdump 3.31 2003-08-19 * The -DDEBUGGING fix in 3.30 was not really there :-( 3.30 2003-08-17 * The previous release failed to compile on a -DDEBUGGING perl like the one provided by Redhat 9. * Got rid of references to perl-5.7. * Further fixes to avoid warnings from Visual C. Patch by Steve Hay . 3.29 2003-08-14 * Setting xml_mode now implies strict_names also for end tags. * Avoid warning from Visual C. Patch by . * 64-bit fix from Doug Larrick http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=195500 * Try to parse similar to Mozilla/MSIE in certain edge cases. All these are outside of the official definition of HTML but HTML spam often tries to take advantage of these. - New configuration attribute 'strict_end'. Unless enabled we will allow end tags to contain extra words or stuff that look like attributes before the '>'. This means that tags like these: " ignored> are now all parsed as a 'foo' end tag instead of text. Even if the extra stuff looks like attributes they will not be reported if requested via the 'attr' or 'tokens' argspecs for the 'end' handler. - Parse '' and '' as comments unless strict_comment is enabled. Previous versions of the parser would report these as text. If these comments contain quoted words prefixed by space or '=' these words can contain '>' without terminating the comment. - Parse '" foo>' as comment containing ' "<>" foo'. Previous versions of the parser would terminate the comment at the first '>' and report the rest as text. - Legacy comment mode: Parse with comments terminated with a lone '>' if no '-->' is found before eof. - Incomplete tag at eof is reported as a 'comment' instead of 'text' unless strict_comment is enabled. 3.28 2003-04-16 * When 'strict_comment' is off (which it is by default) treat anything that matches a comment. * Should now be more efficient on threaded perls. 3.27 2003-01-18 * Typo fixes to the documentation. * HTML::Entities::escape_entities_numeric contributed by Sean M. Burke . * Included one more example program 'hlc' that show how to downcase all tags in an HTML file. 3.26 2002-03-17 * Avoid core dump in some cases where the callback croaks. The perl_call_method and perl_call_sv needs G_EVAL flag to be safe. * New parser attributes; 'attr_encoded' and 'case_sensitive'. Contributed by Guy Albertelli II . * HTML::Entities - don't encode \r by default as suggested by Sean M. Burke. * HTML::HeadParser - ignore empty http-equiv - allow multiple elements. Patch by Timur I. Bakeyev * Avoid warnings from bleadperl on the uentities test. 3.25 2001-05-11 * Minor tweaks for build failures on perl5.004_04, perl-5.6.0, and for macro clash under Windows. * Improved parsing of ... :-) 3.24 2001-05-09 * $p->parse(CODE) * New events: start_document, end_document * New argspecs: skipped_text, offset_end * The offset/line/column counters was not properly reset after eof. 3.23 2001-05-01 * If the $p->ignore_elements filter did not work as it should if handlers for start/end events was not registered. 3.22 2001-04-17 * The <textarea> element is now parsed in literal mode, i.e. no other tags recognized until the </textarea> tag is seen. Unlike other literal elements, the text content is not 'cdata'. * The XML &apos; entity is decoded. It apos-char itself is still encoded as &#39; as &apos; is not really an HTML tag, and not recognized by many HTML browsers. 3.21 2001-04-10 * Fix a memory leak which occurred when using filter methods. * Avoid a few compiler warnings (DEC C): - Trailing comma found in enumerator list - "unsigned char" is not compatible with "const char". * Doc update. 3.20 2001-04-02 * Some minor documentation updates. 3.19_94 2001-03-30 * Implemented 'tag', 'line', 'column' argspecs. * HTML::PullParser doc update. eg/hform is an example of HTML::PullParser usage. 3.19_93 2001-03-27 * Shorten 'report_only_tags' to 'report_tags'. I think it reads better. * Bleadperl portability fixes. 3.19_92 2001-03-25 * HTML::HeadParser made more efficient by using 'ignore_elements'. * HTML::LinkExtor made more efficient by using 'report_only_tags'. * HTML::TokeParser generalized into HTML::PullParser. HTML::PullParser only support the get_token/unget_token interface of HTML::TokeParser, but is more flexible because the information that make up an token is customisable. HTML::TokeParser is made into an HTML::PullParser subclass. 3.19_91 2001-03-19 * Array references can be passed to the filter methods. Makes it easier to use them as constructor options. * Example programs updated to use filters. * Reset ignored_element state on EOF. * Documentation updates. * The netscape_buggy_comment() method now generates mandatory warning about its deprecation. 3.19_90 2001-03-13 * This is an developer only release. It contains some new experimental features. The interface to these might still change. * Implemented filters to reduce the numbers of callbacks generated: - $p->ignore_tags() - $p->report_only_tags() - $p->ignore_elements() * New @attr argspec. Less overhead than 'attr' and allow compatibility with XML::Parser style start events. * The whole argspec can be wrapped up in @{...} to signal flattening. Only makes a difference when the target is an array. 3.19 2001-03-09 * Avoid the entity2char global. That should make the module more thread safe. Patch by Gurusamy Sarathy <gsar@ActiveState.com>. 3.18 2001-02-24 * There was a C++ style comment left in util.c. Strict C compilers do not like that kind of stuff. 3.17 2001-02-23 * The 3.16 release broke MULTIPLICITY builds. Fixed. 3.16 2001-02-22 * The unbroken_text option now works across ignored tags. * Fix casting of pointers on some 64 bit platforms. * Fix decoding of Unicode entities. Only optionally available for perl-5.7.0 or better. * Expose internal decode_entities() function at the Perl level. * Reindented some code. 3.15 2000-12-26 * HTML::TokeParser's get_tag() method now takes multiple tags to match. Hopefully the documentation is also a bit clearer. * #define PERL_NO_GET_CONTEXT: Should speed up things for thread enabled versions of perl. * Quote some more entities that also happens to be perl keywords. This avoids warnings on perl-5.004. * Unicode entities only triggered for perl-5.7.0 or higher. 3.14 2000-12-03 * If a handler triggered by flushing text at eof called the eof method then infinite recursion occurred. Fixed. Bug discovered by Jonathan Stowe <gellyfish@gellyfish.com>. * Allow <!doctype ...> to be parsed as declaration. 3.13 2000-09-17 * Experimental support for decoding of Unicode entities. 3.12 2000-09-14 * Some tweaks to get it to compile with "Optimierender Microsoft (R) 32-Bit C/C++-Compiler, Version 12.00.8168, fuer x86." Patch by Matthias Waldorf <matthias.waldorf@zoom.de>. * HTML::Entities documentation spelling patch by David Dyck <dcd@tc.fluke.com>. 3.11 2000-08-22 * HTML::LinkExtor and eg/hrefsub now obtain %linkElements from the HTML::Tagset module. 3.10 2000-06-29 * Avoid core dump when stack gets relocated as the result of text handler invocation while $p->unbroken_text is enabled. Needed to refresh the stack pointer. 3.09 2000-06-28 * Avoid core dump if somebody clobbers the aliased $self argument of a handler. * HTML::TokeParser documentation update suggested by Paul Makepeace <Paul.Makepeace@realprogrammers.com>. 3.08 2000-05-23 * Fix core dump for large start tags. Bug spotted by Alexander Fraser <green795@hotmail.com> * Added yet another example program: eg/hanchors * Typo fix by Jamie McCarthy <jamie@mccarthy.org> 3.07 2000-03-20 * Fix perl5.004 builds (was broken in 3.06) * Declaration parsing mode now only triggers for <!DOCTYPE ...> and <!ENTITY ...>. Based on patch by la mouton <kero@3sheep.com>. 3.06 2000-03-06 * Multi-threading/MULTIPLICITY compilation fix. Both Doug MacEachern <dougm@pobox.com> and Matthias Urlichs <smurf@noris.net> provided a patch. * Avoid some "statement not reached" warnings from picky compilers. * Remove final commas in enums as ANSI C does not allow them and some compilers actually care. Patch by James Walden <jamesw@ichips.intel.com> * Added eg/htextsub example program. 3.05 2000-01-22 * Implemented $p->unbroken_text option * Don't parse content of certain HTML elements as CDATA when xml_mode is enabled. * Offset was reported with wrong sign for text at end of chunk. 3.04 2000-01-15 * Backed out 3.03-patch that checked for legal handler and attribute names in the HTML::Parser constructor. * Documentation typo fixed by Michael. 3.03 2000-01-14 * We did not get out of comment mode for comments ending with an odd number of "-" before ">". Patch by la mouton <kero@3sheep.com> * Documentation patch by Michael. 3.02 1999-12-21 * Hide ~-magic IV-pointer to 'struct p_state' behind a reference. This allow copying of the internal _hparser_xs_state element, and will make HTML-Tree-0.61 work again. * Introduced $p->init() which might be useful for subclasses that only want the initialization part of the constructor. * Filled out DIAGNOSTICS section of the HTML::Parser POD. 3.01 1999-12-19 * Rely on ~-magic instead of a DESTROY method to deallocate the internal 'struct p_state'. This avoid memory leaks when people simply wipe of the content of the object hash. * One of the assertion in hparser.c had opposite logic. This made the parser fail when compiled with a -DDEBUGGING perl. * Don't assume any specific order of hash keys in the t/cases.t. This test failed with some newer development releases of perl. 3.00 1999-12-14 * Documentation update (most of it from Michael) * Minor patch to eg/hstrip so that it use a "" handler instead of &ignore. * Test suite patches from Michael 2.99_96 1999-12-13 * Patches from Michael: - A handler of "" means that the event will be ignored. More efficient than using 'sub {}' as handler. - Don't use a perl hash for looking up argspec keywords. - Documentation tweaks. 2.99_95 1999-12-09 * (this is a 3.00 candidate) * Fixed core dump when "<" was followed by an 8-bit character. Spotted and test case provided by Doug MacEachern. Doug had been running HTML-Parser-XS through more that 1 million urls that had been downloaded via LWP. * Handlers can now invoke $p->eof to request the parsing to terminate. HTML::HeadParser has been simplified by taking advantage of this. Also added a title-extraction example that uses this. * Michael once again fixed my bad English in the HTML::Parser documentation. * netscape_buggy_comment will carp instead of warn * updated TODO/README * Documented that HTML::Filter is depreciated. * Made backslash reserved in literal argspec strings. * Added several new test scripts. 2.99_94 1999-12-08 * (should almost be a 3.00 candidate) * Renamed 'cdata_flag' as 'is_cdata'. * Dropped support for wrapping callback handler and argspec in an array and passing a reference to $p->handler. It created ambiguities when you want to pass a array as handler destination and not update argspec. The wrapping for constructor arguments are unchanged. * Reworked the documentation after updates from Michael. * Simplified internal check_handler(). It should probably simply be inlined in handler() again. * Added argspec 'length' and 'undef' * Fix statement-less label. Fix suggested by Matthew Langford <langfml@Eng.Auburn.EDU>. * Added two more example programs: eg/hstrip and eg/htext. * Various minor patches from Michael. 2.99_93 1999-12-07 * Documentation update * $p->bool_attr_value renamed as $p->boolean_attribute_value * Internal renaming: attrspec --> argspec * Introduced internal 'enum argcode' in hparser.c * Added eg/hrefsub 2.99_92 1999-12-05 * More documentation patches from Michael * Renamed 'token1' as 'token0' as suggested by Michael * For artificial end tags we now report 'tokens', but not 'tokenpos'. * Boolean attribute values show up as (0, 0) in 'tokenpos' now. * If $p->bool_attr_value is set it will influence 'tokens' * Fix for core dump when parsing <a "> when $p->strict_names(0). Based on fix by Michael. * Will av_extend() the tokens/tokenspos arrays. * New test suite script by Michael: t/attrspec.t 2.99_91 1999-12-04 * Implemented attrspec 'offset' * Documentation patch from Michael * Some more cleanup/updated TODO 2.99_90 1999-12-03 * (first beta for 3.00) * Using "realloc" as a parameter name in grow_tokens created problems for some people. Fix by Paul Schinder <schinder@pobox.com> * Patch by Michael that makes array handler destinations really work. * Patch by Michael that make HTML::TokeParser use this. This gave a a speedup of about 80%. * Patch by Michael that makes t/cases into a real test. * Small HTML::Parser documentation patch by Michael. * Renamed attrspec 'origtext' to 'text' and 'decoded_text' to 'dtext' * Split up Parser.xs. Moved stuff into hparser.c and util.c * Dropped html_ prefix from internal parser functions. * Renamed internal function html_handle() as report_event(). 2.99_17 1999-12-02 * HTML::Parser documentation patch from Michael. * Fix memory leaks in html_handler() * Patch that makes an array legal as handler destination. Also from Michael. * The end of marked sections does not eat successive newline any more. * The artificial end event for empty tag in xml_mode did not report an empty origtext. * New constructor option: 'api_version' 2.99_16 1999-12-01 * Support "event" in argspec. It expands to the name of the handler (minus "default"). * Fix core dump for large start tags. The tokens_grow() routine needed an adjustment. Added test for this; t/largstags.t. 2.99_15 1999-11-30 * Major restructuring/simplification of callback interface based on initial work by Michael. The main news is that you now need to tell what arguments you want to be provided to your callbacks. * The following parser options has been eliminated: $p->decode_text_entities $p->keep_case $p->v2_compat $p->pass_self $p->attr_pos 2.99_14 1999-11-26 * Documentation update by Michael A. Chase. * Fix for declaration parsing by Michael A. Chase. * Workaround for perl5.004_05 bug. Can't return &PL_sv_undef. 2.99_13 1999-11-22 * New Parser.pm POD based on initial work by Michael A. Chase. All new features should now be described. * $p->callback(start => undef) will not reset the callback. * $p->xml_mode() did not parse attributes correct because HCTYPE_NOT_SPACE_EQ_SLASH_GT flag was never set. * A few more tests. 2.99_12 1999-11-18 * Implemented $p->attr_pos attribute. This causes attr positions within $origtext of the start tag to be reported instead of the attribute values. The positions are reported as 4 numbers; end of previous attr, start of this attr, start of attr value, and end of attr. This should make substr() manipulations of $origtext easy. * Implemented $p->unbroken_text attribute. This makes sure that text segments are never broken and given back as separate text callbacks. It delays text callbacks until some other markup has been recognized. * More English corrections by Michael A. Chase. * HTML::LinkExtor now recognizes even more URI attributes as suggested by Sean M. Burke <sburke@netadventure.net> * Completed marked sections support. It is also now a compile time decision if you want this supported or not. The only drawback of enabling it should be a possible parsing speed reduction. I have not measured this yet. * The keys for callbacks initialized in the constructor are now suffixed with "_cb". * Renamed $p->pass_cbdata to $p->pass_self. * Added magic number to the p_state struct. 2.99_11 1999-11-17 * Don't leak $@ modifications from HTML::Parser constructor. * Included HTML::Parser POD. * Marked sections almost work. CDATA and RCDATA should work. * For tags that take us into literal_mode; <script>, <style>, <xmp>, we did not recognize the end tag unless it was written in all lower case. 2.99_10 1999-11-16 * The mkhctype and mkpfunc scripts were using \z inside RE. This did not work for perl5.004. Replaced them with plain old dollar signs. 2.99_09 1999-11-15 * Grammar fixes by Michael A. Chase <mchase@ix.netcom.com> * Some more test suite patches for Win32 by Michael A. Chase <mchase@ix.netcom.com> * Implemented $p->strict_names attribute. By default we now allow almost anything in tag and attribute names. This is much closer to the behaviour of some popular browsers. This allows us to parse broken tags like this example from the LWP mailing list: <IMG ALIGN=MIDDLE SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> * Introduced some tables in "hctype.h" and "pfunc.h". These are built by the corresponding "mk..." script. 2.99_08 1999-11-10 * Make Parser.xs compile on perl5.004_05 too. * New callback called 'default'. This will be called for any document text no other callback shows an interest in. * Patch by Michael A. Chase <mchase@ix.netcom.com> that should help clean up files for the test suite on Win32. * Can now set up various attributes with key/value pairs passed to the constructor. * $p->parse_file() will open the file in binmode() * Pass complete processing instruction tag as second argument to process callback. * New boolean attribute v2_compat. This influences how attributes are reported for start tags. * HTML::Filter now filters process instructions too. * Faster HTML::LinkExtor by taking advantage of the new callback interface. The module now also uses URI.pm (instead of the old URI::URL) to absolutize URIs. * Faster HTML::TokeParser by taking advantage of new accum interface. 2.99_07 1999-11-09 * Entities in attribute values are now always expanded. * If you set the $p->decode_text_entities to a true value, then you don't have to decode the text yourself. * In xml_mode we don't report empty element tags as a start tag with an extra parameter any more. Instead we generate an artificial end tag. * 'xml_mode' now implies 'keep_case'. * The parser now keeps its own copy of the bool_attr_value value. * Avoid memory leak for text callbacks * Avoid using ERROR as a goto label. * Introduced common internal accessor function for all boolean parser attributes. * Tweaks to make Parser.xs compile under perl5.004. 2.99_06 1999-11-08 * Internal fast decode_entities(). By using it we are able to make the HTML::Entities::decode function 6 times faster than the old one implemented in pure Perl. * $p->bool_attr_value() can be set to influence the value that boolean attributes will be assigned. The default is to assign a value identical to the attribute name. * Process instructions are reported as "PI" in @accum * $p->xml_mode(1) modifies how processing instructions are terminated and allows "/>" at the end of start tags. * Turn off optimizations when compiling with gcc on Solaris. Avoids what we believe to be a compiler bug. Should probably figure out which versions of gcc have this bug. 2.99_05 1999-11-05 * The previous release did not even compile. I forgot to try 'make test' before uploading. 2.99_04 1999-11-05 * Generalized <XMP>-support to cover all literal parsing. Currently activated for <script>, <style>, <xmp> and <plaintext>. 2.99_03 1999-11-05 * <XMP>-support. * Allow ":" in tag and attribute names * Include rest of the HTML::* files from the old HTML::Parser package. This should make testing easier. 2.99_02 1999-11-04 * Implemented keep_case() option. If this attribute is true, then we don't lowercase tag and attribute names. * Implemented accum() that takes an array reference. Tokens are pushed onto this array instead of sent to callbacks. * Implemented strict_comment(). 2.99_01 1999-11-03 * Baseline of XS implementation 2.25 1999-11-05 * Allow ":" in attribute names as a workaround for Microsoft Excel 2000 which generates such files. * Make deprecate warning if netscape_buggy_comment() method is used. The method is used in strict_comment(). * Avoid duplication of parse_file() method in HTML::HeadParser. 2.24 1999-10-29 * $p->parse_file() will not close a handle passed to it any more. If passed a filename that can't be opened it will return undef instead of raising an exception, and strings like "*STDIN" are not treated as globs any more. * HTML::LinkExtor knows about background attribute of <tables>. Patch by Clinton Wong <clintdw@netcom.com> * HTML::TokeParser will parse large inline strings much faster now. The string holding the document must not be changed during parsing. 2.23 1999-06-09 * Documentation updates. 2.22 1998-12-18 * Protect HTML::HeadParser from evil $SIG{__DIE__} hooks. 2.21 1998-11-13 * HTML::TokeParser can now parse strings directly and does the right thing if you pass it a GLOB. Based on patch by Sami Itkonen <si@iki.fi>. * HTML::Parser now allows space before and after "--" in Netscape comments. Patch by Peter Orbaek <poe@daimi.au.dk>. 2.20 1998-07-08 * Added HTML::TokeParser. Check it out! 2.19 1998-07-07 * Don't end a text chunk with space when we try to avoid breaking up words. 2.18 1998-06-22 * HTML::HeadParser->parse_file will now stop parsing when the <body> starts as it should. * HTML::LinkExtor more easily subclassable by introducing the $self->_found_link method. 2.17 1998-04-28 * Never split words (a sequence of non-space) between two invocations of $self->text. This is just a simplification of the code that tried not to break entities. * HTML::Parser->parse_file now use smaller chunks as already suggested by the HTML::Parser documentation. 2.16 1998-04-02 * The HTML::Parser could some times break hex entities (like &#xFFFF;) in the middle. * Removed remaining forced dependencies on libwww-perl modules. It means that all tests should now pass, even if libwww-perl was not installed previously. * More tests. 2.14 1998-04-01 * HTML::* modules unbundled from libwww-perl-5.22 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/TODO�������������������������������������������������������������������������������000644 �000765 �000024 �00000001777 14366103441 015430� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������TODO - Check how we compare to the HTML5 parsing rules - limit the length of markup elements that never end. Perhaps by configurable limits on the length that markup can have and still be recognized. Report stuff as 'text' when this happens? - remove 255 char limit on literal argspec strings - implement backslash escapes in literal argspec string - <![%app1;[...]]> (parameter entities) - make literal tags configurable. The current list is hardcoded to be "script", "style", "title", "iframe", "textarea", "xmp", and "plaintext". SGML FEATURES WE WILL PROBABLY IGNORE FOREVER - Empty tags: <> </> (repeat previous start tag) - <foo<bar> (same as <foo><bar>) - NET tags <name/.../ MINOR "BUGS" (alias FEATURES) - no way to clear "boolean_attribute_value". - <style> and <script> do not end with the first "</". MSIE bug compatibility - recognize server side includes as comments; <% ... %> if no matching %> found tread "<% ..." as text - skip quoted strings when looking for PIC �HTML-Parser-3.81/MANIFEST���������������������������������������������������������������������������000644 �000765 �000024 �00000002675 14366103441 016067� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������# This file was automatically generated by Dist::Zilla::Plugin::Manifest v6.025. Changes LICENSE MANIFEST META.json META.yml Makefile.PL Parser.xs README TODO cpanfile dist.ini eg/hanchors eg/hbody eg/hdisable eg/hdump eg/hform eg/hlc eg/hrefsub eg/hstrip eg/htext eg/htextsub eg/htitle entities.html hctype.h hints/solaris.pl hparser.c hparser.h lib/HTML/Entities.pm lib/HTML/Filter.pm lib/HTML/HeadParser.pm lib/HTML/LinkExtor.pm lib/HTML/Parser.pm lib/HTML/PullParser.pm lib/HTML/TokeParser.pm mkhctype mkpfunc pfunc.h ppport.h t/00-report-prereqs.dd t/00-report-prereqs.t t/api_version.t t/argspec-bad.t t/argspec.t t/argspec2.t t/attr-encoded.t t/callback.t t/case-sensitive.t t/cases.t t/comment.t t/crashme.t t/declaration.t t/default.t t/document.t t/dtext.t t/entities.t t/entities2.t t/filter-methods.t t/filter.t t/free.t t/handler-eof.t t/handler.t t/headparser-http.t t/headparser.t t/ignore.t t/largetags.t t/linkextor-base.t t/linkextor-rel.t t/magic.t t/marked-sect.t t/msie-compat.t t/offset.t t/options.t t/parsefile.t t/parser.t t/plaintext.t t/process.t t/pullparser.t t/script.t t/skipped-text.t t/stack-realloc-eof.t t/stack-realloc.t t/textarea.t t/threads.t t/tokeparser.t t/uentities.t t/unbroken-text.t t/unicode-bom.t t/unicode.t t/xml-mode.t tokenpos.h typemap util.c xt/author/00-compile.t xt/author/pod-coverage.t xt/author/pod-spell.t xt/author/pod-syntax.t xt/author/test-version.t xt/release/changes_has_content.t xt/release/kwalitee.t �������������������������������������������������������������������HTML-Parser-3.81/LICENSE����������������������������������������������������������������������������000644 �000765 �000024 �00000044123 14366103441 015735� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������This software is copyright (c) 1996 by Gisle Aas <gaas@cpan.org> and Michael A. Chase <mchase@ix.netcom.com>. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. Terms of the Perl programming language system itself a) the GNU General Public License as published by the Free Software Foundation; either version 1, or (at your option) any later version, or b) the "Artistic License" --- The GNU General Public License, Version 1, February 1989 --- This software is Copyright (c) 1996 by Gisle Aas <gaas@cpan.org> and Michael A. Chase <mchase@ix.netcom.com>. This is free software, licensed under: The GNU General Public License, Version 1, February 1989 GNU GENERAL PUBLIC LICENSE Version 1, February 1989 Copyright (C) 1989 Free Software Foundation, Inc. 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The license agreements of most software companies try to keep users at the mercy of those companies. By contrast, our General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. The General Public License applies to the Free Software Foundation's software and to any other program whose authors commit to using it. You can use it for your programs, too. When we speak of free software, we are referring to freedom, not price. Specifically, the General Public License is designed to make sure that you have the freedom to give away or sell copies of free software, that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of a such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must tell them their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any work containing the Program or a portion of it, either verbatim or with modifications. Each licensee is addressed as "you". 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this General Public License and to the absence of any warranty; and give any other recipients of the Program a copy of this General Public License along with the Program. You may charge a fee for the physical act of transferring a copy. 2. You may modify your copy or copies of the Program or any portion of it, and copy and distribute such modifications under the terms of Paragraph 1 above, provided that you also do the following: a) cause the modified files to carry prominent notices stating that you changed the files and the date of any change; and b) cause the whole of any work that you distribute or publish, that in whole or in part contains the Program or any part thereof, either with or without modifications, to be licensed at no charge to all third parties under the terms of this General Public License (except that you may choose to grant warranty protection to some or all third parties, at your option). c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the simplest and most usual way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this General Public License. d) You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. Mere aggregation of another independent work with the Program (or its derivative) on a volume of a storage or distribution medium does not bring the other work under the scope of these terms. 3. You may copy and distribute the Program (or a portion or derivative of it, under Paragraph 2) in object code or executable form under the terms of Paragraphs 1 and 2 above provided that you also do one of the following: a) accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Paragraphs 1 and 2 above; or, b) accompany it with a written offer, valid for at least three years, to give any third party free (except for a nominal charge for the cost of distribution) a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Paragraphs 1 and 2 above; or, c) accompany it with the information you received as to where the corresponding source code may be obtained. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form alone.) Source code for a work means the preferred form of the work for making modifications to it. For an executable file, complete source code means all the source code for all modules it contains; but, as a special exception, it need not include source code for modules which are standard libraries that accompany the operating system on which the executable file runs, or for standard header files or definitions files that accompany that operating system. 4. You may not copy, modify, sublicense, distribute or transfer the Program except as expressly provided under this General Public License. Any attempt otherwise to copy, modify, sublicense, distribute or transfer the Program is void, and will automatically terminate your rights to use the Program under this License. However, parties who have received copies, or rights to use copies, from you under this General Public License will not have their licenses terminated so long as such parties remain in full compliance. 5. By copying, distributing or modifying the Program (or any work based on the Program) you indicate your acceptance of this license to do so, and all its terms and conditions. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. 7. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of the license which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the license, you may choose any version ever published by the Free Software Foundation. 8. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 9. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 10. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS Appendix: How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to humanity, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) 19yy <name of author> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 1, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) 19xx name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (a program to direct compilers to make passes at assemblers) written by James Hacker. <signature of Ty Coon>, 1 April 1989 Ty Coon, President of Vice That's all there is to it! --- The Artistic License 1.0 --- This software is Copyright (c) 1996 by Gisle Aas <gaas@cpan.org> and Michael A. Chase <mchase@ix.netcom.com>. This is free software, licensed under: The Artistic License 1.0 The Artistic License Preamble The intent of this document is to state the conditions under which a Package may be copied, such that the Copyright Holder maintains some semblance of artistic control over the development of the package, while giving the users of the package the right to use and distribute the Package in a more-or-less customary fashion, plus the right to make reasonable modifications. Definitions: - "Package" refers to the collection of files distributed by the Copyright Holder, and derivatives of that collection of files created through textual modification. - "Standard Version" refers to such a Package if it has not been modified, or has been modified in accordance with the wishes of the Copyright Holder. - "Copyright Holder" is whoever is named in the copyright or copyrights for the package. - "You" is you, if you're thinking about copying or distributing this Package. - "Reasonable copying fee" is whatever you can justify on the basis of media cost, duplication charges, time of people involved, and so on. (You will not be required to justify it to the Copyright Holder, but only to the computing community at large as a market that must bear the fee.) - "Freely Available" means that no fee is charged for the item itself, though there may be fees involved in handling the item. It also means that recipients of the item may redistribute it under the same conditions they received it. 1. You may make and give away verbatim copies of the source form of the Standard Version of this Package without restriction, provided that you duplicate all of the original copyright notices and associated disclaimers. 2. You may apply bug fixes, portability fixes and other modifications derived from the Public Domain or from the Copyright Holder. A Package modified in such a way shall still be considered the Standard Version. 3. You may otherwise modify your copy of this Package in any way, provided that you insert a prominent notice in each changed file stating how and when you changed that file, and provided that you do at least ONE of the following: a) place your modifications in the Public Domain or otherwise make them Freely Available, such as by posting said modifications to Usenet or an equivalent medium, or placing the modifications on a major archive site such as ftp.uu.net, or by allowing the Copyright Holder to include your modifications in the Standard Version of the Package. b) use the modified Package only within your corporation or organization. c) rename any non-standard executables so the names do not conflict with standard executables, which must also be provided, and provide a separate manual page for each non-standard executable that clearly documents how it differs from the Standard Version. d) make other distribution arrangements with the Copyright Holder. 4. You may distribute the programs of this Package in object code or executable form, provided that you do at least ONE of the following: a) distribute a Standard Version of the executables and library files, together with instructions (in the manual page or equivalent) on where to get the Standard Version. b) accompany the distribution with the machine-readable source of the Package with your modifications. c) accompany any non-standard executables with their corresponding Standard Version executables, giving the non-standard executables non-standard names, and clearly documenting the differences in manual pages (or equivalent), together with instructions on where to get the Standard Version. d) make other distribution arrangements with the Copyright Holder. 5. You may charge a reasonable copying fee for any distribution of this Package. You may charge any fee you choose for support of this Package. You may not charge a fee for this Package itself. However, you may distribute this Package in aggregate with other (possibly commercial) programs as part of a larger (possibly commercial) software distribution provided that you do not advertise this Package as a product of your own. 6. The scripts and library files supplied as input to or produced as output from the programs of this Package do not automatically fall under the copyright of this Package, but belong to whomever generated them, and may be sold commercially, and may be aggregated with this Package. 7. C or perl subroutines supplied by you and linked into this Package shall not be considered part of this Package. 8. The name of the Copyright Holder may not be used to endorse or promote products derived from this software without specific prior written permission. 9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. The End ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/ppport.h���������������������������������������������������������������������������000755 �000765 �000024 �00000421674 14366103441 016442� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#if 0 my $void = <<'SKIP'; #endif /* ---------------------------------------------------------------------- ppport.h -- Perl/Pollution/Portability Version 3.68 Automatically created by Devel::PPPort running under perl 5.036000. Version 3.x, Copyright (c) 2004-2013, Marcus Holland-Moritz. Version 2.x, Copyright (C) 2001, Paul Marquess. Version 1.x, Copyright (C) 1999, Kenneth Albanowski. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. ---------------------------------------------------------------------- SKIP if (@ARGV && $ARGV[0] eq '--unstrip') { eval { require Devel::PPPort }; $@ and die "Cannot require Devel::PPPort, please install.\n"; if (eval $Devel::PPPort::VERSION < 3.68) { die "ppport.h was originally generated with Devel::PPPort 3.68.\n" . "Your Devel::PPPort is only version $Devel::PPPort::VERSION.\n" . "Please install a newer version, or --unstrip will not work.\n"; } Devel::PPPort::WriteFile($0); exit 0; } print <<END; Sorry, but this is a stripped version of $0. To be able to use its original script and doc functionality, please try to regenerate this file using: $^X $0 --unstrip END __DATA__*/ #ifndef _P_P_PORTABILITY_H_ #define _P_P_PORTABILITY_H_ #ifndef DPPP_NAMESPACE #define DPPP_NAMESPACE DPPP_ #endif #define DPPP_CAT2(x,y) CAT2(x,y) #define DPPP_(name) DPPP_CAT2(DPPP_NAMESPACE, name) #define D_PPP_RELEASE_DATE 1647561600 #if ! defined(PERL_REVISION) && ! defined(PERL_VERSION_MAJOR) #if ! defined(__PATCHLEVEL_H_INCLUDED__) \ && ! ( defined(PATCHLEVEL) && defined(SUBVERSION)) #define PERL_PATCHLEVEL_H_IMPLICIT #include <patchlevel.h> #endif #if ! defined(PERL_VERSION) \ && ! defined(PERL_VERSION_MAJOR) \ && ( ! defined(SUBVERSION) || ! defined(PATCHLEVEL) ) #include <could_not_find_Perl_patchlevel.h> #endif #endif #ifdef PERL_VERSION_MAJOR #define D_PPP_MAJOR PERL_VERSION_MAJOR #elif defined(PERL_REVISION) #define D_PPP_MAJOR PERL_REVISION #else #define D_PPP_MAJOR 5 #endif #ifdef PERL_VERSION_MINOR #define D_PPP_MINOR PERL_VERSION_MINOR #elif defined(PERL_VERSION) #define D_PPP_MINOR PERL_VERSION #elif defined(PATCHLEVEL) #define D_PPP_MINOR PATCHLEVEL #define PERL_VERSION PATCHLEVEL #else #error Could not find a source for PERL_VERSION_MINOR #endif #ifdef PERL_VERSION_PATCH #define D_PPP_PATCH PERL_VERSION_PATCH #elif defined(PERL_SUBVERSION) #define D_PPP_PATCH PERL_SUBVERSION #elif defined(SUBVERSION) #define D_PPP_PATCH SUBVERSION #define PERL_SUBVERSION SUBVERSION #else #error Could not find a source for PERL_VERSION_PATCH #endif #if D_PPP_MAJOR < 5 || D_PPP_MAJOR == 6 #error Devel::PPPort works only on Perl 5, Perl 7, ... #elif D_PPP_MAJOR != 5 #undef PERL_REVISION #undef PERL_VERSION #undef PERL_SUBVERSION #define D_PPP_REVISION 5 #define D_PPP_VERSION 201 #define D_PPP_SUBVERSION 201 #if (defined(__clang__) \ && ( (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) \ || defined(_STDC_C99) \ || defined(__c99))) #define D_PPP_STRINGIFY(x) #x #define D_PPP_deprecate(xyz) _Pragma(D_PPP_STRINGIFY(GCC warning(D_PPP_STRINGIFY(xyz) " is deprecated"))) #define PERL_REVISION (D_PPP_REVISION D_PPP_deprecate(PERL_REVISION)) #define PERL_VERSION (D_PPP_REVISION D_PPP_deprecate(PERL_VERSION)) #define PERL_SUBVERSION (D_PPP_SUBVERSION D_PPP_deprecate(PERL_SUBVERSION)) #else #define PERL_REVISION D_PPP_REVISION #define PERL_VERSION D_PPP_REVISION #define PERL_SUBVERSION D_PPP_SUBVERSION #endif #endif #define D_PPP_DEC2BCD(dec) ((((dec)/100)<<8)|((((dec)%100)/10)<<4)|((dec)%10)) #define D_PPP_JNP_TO_BCD(j,n,p) ((D_PPP_DEC2BCD(j)<<24)|(D_PPP_DEC2BCD(n)<<12)|D_PPP_DEC2BCD(p)) #define PERL_BCDVERSION D_PPP_JNP_TO_BCD(D_PPP_MAJOR, \ D_PPP_MINOR, \ D_PPP_PATCH) #undef PERL_VERSION_EQ #undef PERL_VERSION_NE #undef PERL_VERSION_LT #undef PERL_VERSION_GE #undef PERL_VERSION_LE #undef PERL_VERSION_GT #ifndef PERL_VERSION_EQ #define PERL_VERSION_EQ(j,n,p) \ (((p) == '*') ? ( (j) == D_PPP_VERSION_MAJOR \ && (n) == D_PPP_VERSION_MINOR) \ : (PERL_BCDVERSION == D_PPP_JNP_TO_BCD(j,n,p))) #endif #ifndef PERL_VERSION_NE #define PERL_VERSION_NE(j,n,p) (! PERL_VERSION_EQ(j,n,p)) #endif #ifndef PERL_VERSION_LT #define PERL_VERSION_LT(j,n,p) \ (PERL_BCDVERSION < D_PPP_JNP_TO_BCD( (j), \ (n), \ (((p) == '*') ? 0 : (p)))) #endif #ifndef PERL_VERSION_GE #define PERL_VERSION_GE(j,n,p) (! PERL_VERSION_LT(j,n,p)) #endif #ifndef PERL_VERSION_LE #define PERL_VERSION_LE(j,n,p) \ (PERL_BCDVERSION < D_PPP_JNP_TO_BCD( (j), \ (((p) == '*') ? ((n)+1) : (n)), \ (((p) == '*') ? 0 : (p)))) #endif #ifndef PERL_VERSION_GT #define PERL_VERSION_GT(j,n,p) (! PERL_VERSION_LE(j,n,p)) #endif #ifndef dTHR #define dTHR dNOOP #endif #ifndef dTHX #define dTHX dNOOP #endif #ifndef dTHXa #define dTHXa(x) dNOOP #endif #ifndef pTHX #define pTHX void #endif #ifndef pTHX_ #define pTHX_ #endif #ifndef aTHX #define aTHX #endif #ifndef aTHX_ #define aTHX_ #endif #if (PERL_BCDVERSION < 0x5006000) #ifdef USE_THREADS #define aTHXR thr #define aTHXR_ thr, #else #define aTHXR #define aTHXR_ #endif #define dTHXR dTHR #else #define aTHXR aTHX #define aTHXR_ aTHX_ #define dTHXR dTHX #endif #ifndef dTHXoa #define dTHXoa(x) dTHXa(x) #endif #ifdef I_LIMITS #include <limits.h> #endif #ifndef PERL_UCHAR_MIN #define PERL_UCHAR_MIN ((unsigned char)0) #endif #ifndef PERL_UCHAR_MAX #ifdef UCHAR_MAX #define PERL_UCHAR_MAX ((unsigned char)UCHAR_MAX) #else #ifdef MAXUCHAR #define PERL_UCHAR_MAX ((unsigned char)MAXUCHAR) #else #define PERL_UCHAR_MAX ((unsigned char)~(unsigned)0) #endif #endif #endif #ifndef PERL_USHORT_MIN #define PERL_USHORT_MIN ((unsigned short)0) #endif #ifndef PERL_USHORT_MAX #ifdef USHORT_MAX #define PERL_USHORT_MAX ((unsigned short)USHORT_MAX) #else #ifdef MAXUSHORT #define PERL_USHORT_MAX ((unsigned short)MAXUSHORT) #else #ifdef USHRT_MAX #define PERL_USHORT_MAX ((unsigned short)USHRT_MAX) #else #define PERL_USHORT_MAX ((unsigned short)~(unsigned)0) #endif #endif #endif #endif #ifndef PERL_SHORT_MAX #ifdef SHORT_MAX #define PERL_SHORT_MAX ((short)SHORT_MAX) #else #ifdef MAXSHORT #define PERL_SHORT_MAX ((short)MAXSHORT) #else #ifdef SHRT_MAX #define PERL_SHORT_MAX ((short)SHRT_MAX) #else #define PERL_SHORT_MAX ((short) (PERL_USHORT_MAX >> 1)) #endif #endif #endif #endif #ifndef PERL_SHORT_MIN #ifdef SHORT_MIN #define PERL_SHORT_MIN ((short)SHORT_MIN) #else #ifdef MINSHORT #define PERL_SHORT_MIN ((short)MINSHORT) #else #ifdef SHRT_MIN #define PERL_SHORT_MIN ((short)SHRT_MIN) #else #define PERL_SHORT_MIN (-PERL_SHORT_MAX - ((3 & -1) == 3)) #endif #endif #endif #endif #ifndef PERL_UINT_MAX #ifdef UINT_MAX #define PERL_UINT_MAX ((unsigned int)UINT_MAX) #else #ifdef MAXUINT #define PERL_UINT_MAX ((unsigned int)MAXUINT) #else #define PERL_UINT_MAX (~(unsigned int)0) #endif #endif #endif #ifndef PERL_UINT_MIN #define PERL_UINT_MIN ((unsigned int)0) #endif #ifndef PERL_INT_MAX #ifdef INT_MAX #define PERL_INT_MAX ((int)INT_MAX) #else #ifdef MAXINT #define PERL_INT_MAX ((int)MAXINT) #else #define PERL_INT_MAX ((int)(PERL_UINT_MAX >> 1)) #endif #endif #endif #ifndef PERL_INT_MIN #ifdef INT_MIN #define PERL_INT_MIN ((int)INT_MIN) #else #ifdef MININT #define PERL_INT_MIN ((int)MININT) #else #define PERL_INT_MIN (-PERL_INT_MAX - ((3 & -1) == 3)) #endif #endif #endif #ifndef PERL_ULONG_MAX #ifdef ULONG_MAX #define PERL_ULONG_MAX ((unsigned long)ULONG_MAX) #else #ifdef MAXULONG #define PERL_ULONG_MAX ((unsigned long)MAXULONG) #else #define PERL_ULONG_MAX (~(unsigned long)0) #endif #endif #endif #ifndef PERL_ULONG_MIN #define PERL_ULONG_MIN ((unsigned long)0L) #endif #ifndef PERL_LONG_MAX #ifdef LONG_MAX #define PERL_LONG_MAX ((long)LONG_MAX) #else #ifdef MAXLONG #define PERL_LONG_MAX ((long)MAXLONG) #else #define PERL_LONG_MAX ((long) (PERL_ULONG_MAX >> 1)) #endif #endif #endif #ifndef PERL_LONG_MIN #ifdef LONG_MIN #define PERL_LONG_MIN ((long)LONG_MIN) #else #ifdef MINLONG #define PERL_LONG_MIN ((long)MINLONG) #else #define PERL_LONG_MIN (-PERL_LONG_MAX - ((3 & -1) == 3)) #endif #endif #endif #if defined(HAS_QUAD) && (defined(convex) || defined(uts)) #ifndef PERL_UQUAD_MAX #ifdef ULONGLONG_MAX #define PERL_UQUAD_MAX ((unsigned long long)ULONGLONG_MAX) #else #ifdef MAXULONGLONG #define PERL_UQUAD_MAX ((unsigned long long)MAXULONGLONG) #else #define PERL_UQUAD_MAX (~(unsigned long long)0) #endif #endif #endif #ifndef PERL_UQUAD_MIN #define PERL_UQUAD_MIN ((unsigned long long)0L) #endif #ifndef PERL_QUAD_MAX #ifdef LONGLONG_MAX #define PERL_QUAD_MAX ((long long)LONGLONG_MAX) #else #ifdef MAXLONGLONG #define PERL_QUAD_MAX ((long long)MAXLONGLONG) #else #define PERL_QUAD_MAX ((long long) (PERL_UQUAD_MAX >> 1)) #endif #endif #endif #ifndef PERL_QUAD_MIN #ifdef LONGLONG_MIN #define PERL_QUAD_MIN ((long long)LONGLONG_MIN) #else #ifdef MINLONGLONG #define PERL_QUAD_MIN ((long long)MINLONGLONG) #else #define PERL_QUAD_MIN (-PERL_QUAD_MAX - ((3 & -1) == 3)) #endif #endif #endif #endif #ifdef HAS_QUAD #ifdef cray #ifndef IVTYPE #define IVTYPE int #endif #ifndef IV_MIN #define IV_MIN PERL_INT_MIN #endif #ifndef IV_MAX #define IV_MAX PERL_INT_MAX #endif #ifndef UV_MIN #define UV_MIN PERL_UINT_MIN #endif #ifndef UV_MAX #define UV_MAX PERL_UINT_MAX #endif #ifdef INTSIZE #ifndef IVSIZE #define IVSIZE INTSIZE #endif #endif #else #if defined(convex) || defined(uts) #ifndef IVTYPE #define IVTYPE long long #endif #ifndef IV_MIN #define IV_MIN PERL_QUAD_MIN #endif #ifndef IV_MAX #define IV_MAX PERL_QUAD_MAX #endif #ifndef UV_MIN #define UV_MIN PERL_UQUAD_MIN #endif #ifndef UV_MAX #define UV_MAX PERL_UQUAD_MAX #endif #ifdef LONGLONGSIZE #ifndef IVSIZE #define IVSIZE LONGLONGSIZE #endif #endif #else #ifndef IVTYPE #define IVTYPE long #endif #ifndef IV_MIN #define IV_MIN PERL_LONG_MIN #endif #ifndef IV_MAX #define IV_MAX PERL_LONG_MAX #endif #ifndef UV_MIN #define UV_MIN PERL_ULONG_MIN #endif #ifndef UV_MAX #define UV_MAX PERL_ULONG_MAX #endif #ifdef LONGSIZE #ifndef IVSIZE #define IVSIZE LONGSIZE #endif #endif #endif #endif #ifndef IVSIZE #define IVSIZE 8 #endif #ifndef LONGSIZE #define LONGSIZE 8 #endif #ifndef PERL_QUAD_MIN #define PERL_QUAD_MIN IV_MIN #endif #ifndef PERL_QUAD_MAX #define PERL_QUAD_MAX IV_MAX #endif #ifndef PERL_UQUAD_MIN #define PERL_UQUAD_MIN UV_MIN #endif #ifndef PERL_UQUAD_MAX #define PERL_UQUAD_MAX UV_MAX #endif #else #ifndef IVTYPE #define IVTYPE long #endif #ifndef LONGSIZE #define LONGSIZE 4 #endif #ifndef IV_MIN #define IV_MIN PERL_LONG_MIN #endif #ifndef IV_MAX #define IV_MAX PERL_LONG_MAX #endif #ifndef UV_MIN #define UV_MIN PERL_ULONG_MIN #endif #ifndef UV_MAX #define UV_MAX PERL_ULONG_MAX #endif #endif #ifndef IVSIZE #ifdef LONGSIZE #define IVSIZE LONGSIZE #else #define IVSIZE 4 #endif #endif #ifndef UVTYPE #define UVTYPE unsigned IVTYPE #endif #ifndef UVSIZE #define UVSIZE IVSIZE #endif #ifndef PERL_SIGNALS_UNSAFE_FLAG #define PERL_SIGNALS_UNSAFE_FLAG 0x0001 #if (PERL_BCDVERSION < 0x5008000) #define D_PPP_PERL_SIGNALS_INIT PERL_SIGNALS_UNSAFE_FLAG #else #define D_PPP_PERL_SIGNALS_INIT 0 #endif #if defined(NEED_PL_signals) static U32 DPPP_(my_PL_signals) = D_PPP_PERL_SIGNALS_INIT; #elif defined(NEED_PL_signals_GLOBAL) U32 DPPP_(my_PL_signals) = D_PPP_PERL_SIGNALS_INIT; #else extern U32 DPPP_(my_PL_signals); #endif #define PL_signals DPPP_(my_PL_signals) #endif #if (PERL_BCDVERSION <= 0x5005005) #define PL_ppaddr ppaddr #define PL_no_modify no_modify #endif #if (PERL_BCDVERSION <= 0x5004005) #define PL_DBsignal DBsignal #define PL_DBsingle DBsingle #define PL_DBsub DBsub #define PL_DBtrace DBtrace #define PL_Sv Sv #define PL_Xpv Xpv #define PL_bufend bufend #define PL_bufptr bufptr #define PL_compiling compiling #define PL_copline copline #define PL_curcop curcop #define PL_curstash curstash #define PL_debstash debstash #define PL_defgv defgv #define PL_diehook diehook #define PL_dirty dirty #define PL_dowarn dowarn #define PL_errgv errgv #define PL_error_count error_count #define PL_expect expect #define PL_hexdigit hexdigit #define PL_hints hints #define PL_in_my in_my #define PL_laststatval laststatval #define PL_lex_state lex_state #define PL_lex_stuff lex_stuff #define PL_linestr linestr #define PL_na na #define PL_perl_destruct_level perl_destruct_level #define PL_perldb perldb #define PL_rsfp_filters rsfp_filters #define PL_rsfp rsfp #define PL_stack_base stack_base #define PL_stack_sp stack_sp #define PL_statcache statcache #define PL_stdingv stdingv #define PL_sv_arenaroot sv_arenaroot #define PL_sv_no sv_no #define PL_sv_undef sv_undef #define PL_sv_yes sv_yes #define PL_tainted tainted #define PL_tainting tainting #define PL_tokenbuf tokenbuf #define PL_mess_sv mess_sv #endif #if (PERL_BCDVERSION >= 0x5009005) #ifdef DPPP_PL_parser_NO_DUMMY #define D_PPP_my_PL_parser_var(var) ((PL_parser ? PL_parser : \ (croak("panic: PL_parser == NULL in %s:%d", \ __FILE__, __LINE__), (yy_parser *) NULL))->var) #else #ifdef DPPP_PL_parser_NO_DUMMY_WARNING #define D_PPP_parser_dummy_warning(var) #else #define D_PPP_parser_dummy_warning(var) \ warn("warning: dummy PL_" #var " used in %s:%d", __FILE__, __LINE__), #endif #define D_PPP_my_PL_parser_var(var) ((PL_parser ? PL_parser : \ (D_PPP_parser_dummy_warning(var) &DPPP_(dummy_PL_parser)))->var) #if defined(NEED_PL_parser) static yy_parser DPPP_(dummy_PL_parser); #elif defined(NEED_PL_parser_GLOBAL) yy_parser DPPP_(dummy_PL_parser); #else extern yy_parser DPPP_(dummy_PL_parser); #endif #endif #define PL_expect D_PPP_my_PL_parser_var(expect) #define PL_copline D_PPP_my_PL_parser_var(copline) #define PL_rsfp D_PPP_my_PL_parser_var(rsfp) #define PL_rsfp_filters D_PPP_my_PL_parser_var(rsfp_filters) #define PL_linestr D_PPP_my_PL_parser_var(linestr) #define PL_bufptr D_PPP_my_PL_parser_var(bufptr) #define PL_bufend D_PPP_my_PL_parser_var(bufend) #define PL_lex_state D_PPP_my_PL_parser_var(lex_state) #define PL_lex_stuff D_PPP_my_PL_parser_var(lex_stuff) #define PL_tokenbuf D_PPP_my_PL_parser_var(tokenbuf) #define PL_in_my D_PPP_my_PL_parser_var(in_my) #define PL_in_my_stash D_PPP_my_PL_parser_var(in_my_stash) #define PL_error_count D_PPP_my_PL_parser_var(error_count) #else #define PL_parser ((void *) 1) #endif #if (PERL_BCDVERSION <= 0x5003022) #undef start_subparse #if (PERL_BCDVERSION < 0x5003022) #ifndef start_subparse #define start_subparse(a, b) Perl_start_subparse() #endif #else #ifndef start_subparse #define start_subparse(a, b) Perl_start_subparse(b) #endif #endif #if (PERL_BCDVERSION < 0x5003007) foo #endif #endif #if (PERL_BCDVERSION < 0x5004063) && (PERL_BCDVERSION != 0x5004005) #define NEED_newCONSTSUB #if defined(NEED_newCONSTSUB) static CV * DPPP_(my_newCONSTSUB)(HV * stash, const char * name, SV * sv); static #else extern CV * DPPP_(my_newCONSTSUB)(HV * stash, const char * name, SV * sv); #endif #if defined(NEED_newCONSTSUB) || defined(NEED_newCONSTSUB_GLOBAL) #ifdef newCONSTSUB #undef newCONSTSUB #endif #define newCONSTSUB(a,b,c) DPPP_(my_newCONSTSUB)(aTHX_ a,b,c) #define Perl_newCONSTSUB DPPP_(my_newCONSTSUB) #define D_PPP_PL_copline PL_copline CV * DPPP_(my_newCONSTSUB)(HV *stash, const char *name, SV *sv) { CV *cv; U32 oldhints = PL_hints; HV *old_cop_stash = PL_curcop->cop_stash; HV *old_curstash = PL_curstash; line_t oldline = PL_curcop->cop_line; PL_curcop->cop_line = D_PPP_PL_copline; PL_hints &= ~HINT_BLOCK_SCOPE; if (stash) PL_curstash = PL_curcop->cop_stash = stash; cv = newSUB( start_subparse(FALSE, 0), newSVOP(OP_CONST, 0, newSVpv((char *) name, 0)), newSVOP(OP_CONST, 0, &PL_sv_no), newSTATEOP(0, Nullch, newSVOP(OP_CONST, 0, sv)) ); PL_hints = oldhints; PL_curcop->cop_stash = old_cop_stash; PL_curstash = old_curstash; PL_curcop->cop_line = oldline; return cv; } #endif #endif #ifndef PERL_MAGIC_sv #define PERL_MAGIC_sv '\0' #endif #ifndef PERL_MAGIC_overload #define PERL_MAGIC_overload 'A' #endif #ifndef PERL_MAGIC_overload_elem #define PERL_MAGIC_overload_elem 'a' #endif #ifndef PERL_MAGIC_overload_table #define PERL_MAGIC_overload_table 'c' #endif #ifndef PERL_MAGIC_bm #define PERL_MAGIC_bm 'B' #endif #ifndef PERL_MAGIC_regdata #define PERL_MAGIC_regdata 'D' #endif #ifndef PERL_MAGIC_regdatum #define PERL_MAGIC_regdatum 'd' #endif #ifndef PERL_MAGIC_env #define PERL_MAGIC_env 'E' #endif #ifndef PERL_MAGIC_envelem #define PERL_MAGIC_envelem 'e' #endif #ifndef PERL_MAGIC_fm #define PERL_MAGIC_fm 'f' #endif #ifndef PERL_MAGIC_regex_global #define PERL_MAGIC_regex_global 'g' #endif #ifndef PERL_MAGIC_isa #define PERL_MAGIC_isa 'I' #endif #ifndef PERL_MAGIC_isaelem #define PERL_MAGIC_isaelem 'i' #endif #ifndef PERL_MAGIC_nkeys #define PERL_MAGIC_nkeys 'k' #endif #ifndef PERL_MAGIC_dbfile #define PERL_MAGIC_dbfile 'L' #endif #ifndef PERL_MAGIC_dbline #define PERL_MAGIC_dbline 'l' #endif #ifndef PERL_MAGIC_mutex #define PERL_MAGIC_mutex 'm' #endif #ifndef PERL_MAGIC_shared #define PERL_MAGIC_shared 'N' #endif #ifndef PERL_MAGIC_shared_scalar #define PERL_MAGIC_shared_scalar 'n' #endif #ifndef PERL_MAGIC_collxfrm #define PERL_MAGIC_collxfrm 'o' #endif #ifndef PERL_MAGIC_tied #define PERL_MAGIC_tied 'P' #endif #ifndef PERL_MAGIC_tiedelem #define PERL_MAGIC_tiedelem 'p' #endif #ifndef PERL_MAGIC_tiedscalar #define PERL_MAGIC_tiedscalar 'q' #endif #ifndef PERL_MAGIC_qr #define PERL_MAGIC_qr 'r' #endif #ifndef PERL_MAGIC_sig #define PERL_MAGIC_sig 'S' #endif #ifndef PERL_MAGIC_sigelem #define PERL_MAGIC_sigelem 's' #endif #ifndef PERL_MAGIC_taint #define PERL_MAGIC_taint 't' #endif #ifndef PERL_MAGIC_uvar #define PERL_MAGIC_uvar 'U' #endif #ifndef PERL_MAGIC_uvar_elem #define PERL_MAGIC_uvar_elem 'u' #endif #ifndef PERL_MAGIC_vstring #define PERL_MAGIC_vstring 'V' #endif #ifndef PERL_MAGIC_vec #define PERL_MAGIC_vec 'v' #endif #ifndef PERL_MAGIC_utf8 #define PERL_MAGIC_utf8 'w' #endif #ifndef PERL_MAGIC_substr #define PERL_MAGIC_substr 'x' #endif #ifndef PERL_MAGIC_defelem #define PERL_MAGIC_defelem 'y' #endif #ifndef PERL_MAGIC_glob #define PERL_MAGIC_glob '*' #endif #ifndef PERL_MAGIC_arylen #define PERL_MAGIC_arylen '#' #endif #ifndef PERL_MAGIC_pos #define PERL_MAGIC_pos '.' #endif #ifndef PERL_MAGIC_backref #define PERL_MAGIC_backref '<' #endif #ifndef PERL_MAGIC_ext #define PERL_MAGIC_ext '~' #endif #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L #ifndef PERL_STATIC_INLINE #define PERL_STATIC_INLINE static inline #endif #else #ifndef PERL_STATIC_INLINE #define PERL_STATIC_INLINE static #endif #endif #ifndef cBOOL #define cBOOL(cbool) ((cbool) ? (bool)1 : (bool)0) #endif #ifndef OpHAS_SIBLING #define OpHAS_SIBLING(o) (cBOOL((o)->op_sibling)) #endif #ifndef OpSIBLING #define OpSIBLING(o) (0 + (o)->op_sibling) #endif #ifndef OpMORESIB_set #define OpMORESIB_set(o, sib) ((o)->op_sibling = (sib)) #endif #ifndef OpLASTSIB_set #define OpLASTSIB_set(o, parent) ((o)->op_sibling = NULL) #endif #ifndef OpMAYBESIB_set #define OpMAYBESIB_set(o, sib, parent) ((o)->op_sibling = (sib)) #endif #ifndef HEf_SVKEY #define HEf_SVKEY -2 #endif #if defined(DEBUGGING) && !defined(__COVERITY__) #ifndef __ASSERT_ #define __ASSERT_(statement) assert(statement), #endif #else #ifndef __ASSERT_ #define __ASSERT_(statement) #endif #endif #ifndef __has_builtin #define __has_builtin(x) 0 #endif #if __has_builtin(__builtin_unreachable) #define D_PPP_HAS_BUILTIN_UNREACHABLE #elif (defined(__GNUC__) && ( __GNUC__ > 4 \ || __GNUC__ == 4 && __GNUC_MINOR__ >= 5)) #define D_PPP_HAS_BUILTIN_UNREACHABLE #endif #ifndef ASSUME #ifdef DEBUGGING #define ASSUME(x) assert(x) #elif defined(_MSC_VER) #define ASSUME(x) __assume(x) #elif defined(__ARMCC_VERSION) #define ASSUME(x) __promise(x) #elif defined(D_PPP_HAS_BUILTIN_UNREACHABLE) #define ASSUME(x) ((x) ? (void) 0 : __builtin_unreachable()) #else #define ASSUME(x) assert(x) #endif #endif #ifndef NOT_REACHED #ifdef D_PPP_HAS_BUILTIN_UNREACHABLE #define NOT_REACHED \ STMT_START { \ ASSUME(!"UNREACHABLE"); __builtin_unreachable(); \ } STMT_END #elif ! defined(__GNUC__) && (defined(__sun) || defined(__hpux)) #define NOT_REACHED #else #define NOT_REACHED ASSUME(!"UNREACHABLE") #endif #endif #ifndef WIDEST_UTYPE #ifdef QUADKIND #ifdef U64TYPE #define WIDEST_UTYPE U64TYPE #else #define WIDEST_UTYPE unsigned Quad_t #endif #else #define WIDEST_UTYPE U32 #endif #endif #ifndef withinCOUNT #define withinCOUNT(c, l, n) \ (((WIDEST_UTYPE) (((c)) - ((l) | 0))) <= (((WIDEST_UTYPE) ((n) | 0)))) #endif #ifndef inRANGE #define inRANGE(c, l, u) \ ( (sizeof(c) == sizeof(U8)) ? withinCOUNT(((U8) (c)), (l), ((u) - (l))) \ : (sizeof(c) == sizeof(U32)) ? withinCOUNT(((U32) (c)), (l), ((u) - (l))) \ : (withinCOUNT(((WIDEST_UTYPE) (c)), (l), ((u) - (l))))) #endif #undef FITS_IN_8_BITS #ifndef FITS_IN_8_BITS #define FITS_IN_8_BITS(c) ( (sizeof(c) == 1) \ || !(((WIDEST_UTYPE)((c) | 0)) & ~0xFF)) #endif #define D_PPP_IS_GENERIC_UTF8_SAFE(s, e, macro) \ (((e) - (s)) <= 0 \ ? 0 \ : UTF8_IS_INVARIANT((s)[0]) \ ? is ## macro ## _L1((s)[0]) \ : (((e) - (s)) < UTF8SKIP(s)) \ ? 0 \ : UTF8_IS_DOWNGRADEABLE_START((s)[0]) \ \ ? is ## macro ## _L1((WIDEST_UTYPE) LATIN1_TO_NATIVE( \ UTF8_ACCUMULATE(NATIVE_UTF8_TO_I8((s)[0]) \ & UTF_START_MASK(2), \ (s)[1]))) \ : is ## macro ## _utf8(s)) #define D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, macro) \ (((e) - (s)) <= 0 \ ? 0 \ : UTF8_IS_INVARIANT((s)[0]) \ ? is ## macro ## _LC((s)[0]) \ : (((e) - (s)) < UTF8SKIP(s)) \ ? 0 \ : UTF8_IS_DOWNGRADEABLE_START((s)[0]) \ \ ? is ## macro ## _LC((WIDEST_UTYPE) LATIN1_TO_NATIVE( \ UTF8_ACCUMULATE(NATIVE_UTF8_TO_I8((s)[0]) \ & UTF_START_MASK(2), \ (s)[1]))) \ : is ## macro ## _utf8(s)) #define D_PPP_IS_GENERIC_LC_UTF8_SAFE_BROKEN(s, e, macro) \ (((e) - (s)) <= 0 \ ? 0 \ : UTF8_IS_INVARIANT((s)[0]) \ ? is ## macro ## _LC((s)[0]) \ : (((e) - (s)) < UTF8SKIP(s)) \ ? 0 \ : UTF8_IS_DOWNGRADEABLE_START((s)[0]) \ \ ? is ## macro ## _LC((WIDEST_UTYPE) LATIN1_TO_NATIVE( \ UTF8_ACCUMULATE(NATIVE_UTF8_TO_I8((s)[0]) \ & UTF_START_MASK(2), \ (s)[1]))) \ : is ## macro ## _utf8_safe(s, e)) #ifndef SvRX #define SvRX(rv) (SvROK((rv)) ? (SvMAGICAL(SvRV((rv))) ? (mg_find(SvRV((rv)), PERL_MAGIC_qr) ? mg_find(SvRV((rv)), PERL_MAGIC_qr)->mg_obj : NULL) : NULL) : NULL) #endif #ifndef SvRXOK #define SvRXOK(sv) (!!SvRX(sv)) #endif #ifndef PERL_UNUSED_DECL #ifdef HASATTRIBUTE #if (defined(__GNUC__) && defined(__cplusplus)) || defined(__INTEL_COMPILER) #define PERL_UNUSED_DECL #else #define PERL_UNUSED_DECL __attribute__((unused)) #endif #else #define PERL_UNUSED_DECL #endif #endif #ifndef PERL_UNUSED_ARG #if defined(lint) && defined(S_SPLINT_S) #include <note.h> #define PERL_UNUSED_ARG(x) NOTE(ARGUNUSED(x)) #else #define PERL_UNUSED_ARG(x) ((void)x) #endif #endif #ifndef PERL_UNUSED_VAR #define PERL_UNUSED_VAR(x) ((void)x) #endif #ifndef PERL_UNUSED_CONTEXT #ifdef USE_ITHREADS #define PERL_UNUSED_CONTEXT PERL_UNUSED_ARG(my_perl) #else #define PERL_UNUSED_CONTEXT #endif #endif #ifndef PERL_UNUSED_RESULT #if defined(__GNUC__) && defined(HASATTRIBUTE_WARN_UNUSED_RESULT) #define PERL_UNUSED_RESULT(v) STMT_START { __typeof__(v) z = (v); (void)sizeof(z); } STMT_END #else #define PERL_UNUSED_RESULT(v) ((void)(v)) #endif #endif #ifndef NOOP #define NOOP (void)0 #endif #if (PERL_BCDVERSION < 0x5006001) && (PERL_BCDVERSION < 0x5027007) #undef dNOOP #ifndef dNOOP #define dNOOP struct Perl___notused_struct #endif #endif #ifndef NVTYPE #if defined(USE_LONG_DOUBLE) && defined(HAS_LONG_DOUBLE) #define NVTYPE long double #else #define NVTYPE double #endif typedef NVTYPE NV; #endif #ifndef INT2PTR #if (IVSIZE == PTRSIZE) && (UVSIZE == PTRSIZE) #define PTRV UV #define INT2PTR(any,d) (any)(d) #else #if PTRSIZE == LONGSIZE #define PTRV unsigned long #else #define PTRV unsigned #endif #define INT2PTR(any,d) (any)(PTRV)(d) #endif #endif #ifndef PTR2ul #if PTRSIZE == LONGSIZE #define PTR2ul(p) (unsigned long)(p) #else #define PTR2ul(p) INT2PTR(unsigned long,p) #endif #endif #ifndef PTR2nat #define PTR2nat(p) (PTRV)(p) #endif #ifndef NUM2PTR #define NUM2PTR(any,d) (any)PTR2nat(d) #endif #ifndef PTR2IV #define PTR2IV(p) INT2PTR(IV,p) #endif #ifndef PTR2UV #define PTR2UV(p) INT2PTR(UV,p) #endif #ifndef PTR2NV #define PTR2NV(p) NUM2PTR(NV,p) #endif #undef START_EXTERN_C #undef END_EXTERN_C #undef EXTERN_C #ifdef __cplusplus #define START_EXTERN_C extern "C" { #define END_EXTERN_C } #define EXTERN_C extern "C" #else #define START_EXTERN_C #define END_EXTERN_C #define EXTERN_C extern #endif #if (PERL_BCDVERSION < 0x5004000) || defined(PERL_GCC_PEDANTIC) #ifndef PERL_GCC_BRACE_GROUPS_FORBIDDEN #ifndef PERL_GCC_BRACE_GROUPS_FORBIDDEN #define PERL_GCC_BRACE_GROUPS_FORBIDDEN #endif #endif #endif #if ! defined(__GNUC__) || defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN) || defined(__cplusplus) #undef PERL_USE_GCC_BRACE_GROUPS #else #ifndef PERL_USE_GCC_BRACE_GROUPS #define PERL_USE_GCC_BRACE_GROUPS #endif #endif #undef STMT_START #undef STMT_END #if defined(VOIDFLAGS) && defined(PERL_USE_GCC_BRACE_GROUPS) #define STMT_START (void)( #define STMT_END ) #else #if defined(VOIDFLAGS) && (VOIDFLAGS) && (defined(sun) || defined(__sun__)) && !defined(__GNUC__) #define STMT_START if (1) #define STMT_END else (void)0 #else #define STMT_START do #define STMT_END while (0) #endif #endif #ifndef boolSV #define boolSV(b) ((b) ? &PL_sv_yes : &PL_sv_no) #endif #ifndef DEFSV #define DEFSV GvSV(PL_defgv) #endif #ifndef SAVE_DEFSV #define SAVE_DEFSV SAVESPTR(GvSV(PL_defgv)) #endif #ifndef DEFSV_set #define DEFSV_set(sv) (DEFSV = (sv)) #endif #ifndef AvFILLp #define AvFILLp AvFILL #endif #ifndef av_tindex #define av_tindex AvFILL #endif #ifndef av_top_index #define av_top_index AvFILL #endif #ifndef av_count #define av_count(av) (AvFILL(av)+1) #endif #ifndef ERRSV #define ERRSV get_sv("@",FALSE) #endif #ifndef gv_stashpvn #define gv_stashpvn(str,len,create) gv_stashpv(str,create) #endif #ifndef get_cv #define get_cv perl_get_cv #endif #ifndef get_sv #define get_sv perl_get_sv #endif #ifndef get_av #define get_av perl_get_av #endif #ifndef get_hv #define get_hv perl_get_hv #endif #ifndef dUNDERBAR #define dUNDERBAR dNOOP #endif #ifndef UNDERBAR #define UNDERBAR DEFSV #endif #ifndef dAX #define dAX I32 ax = MARK - PL_stack_base + 1 #endif #ifndef dITEMS #define dITEMS I32 items = SP - MARK #endif #ifndef dXSTARG #define dXSTARG SV * targ = sv_newmortal() #endif #ifndef dAXMARK #define dAXMARK I32 ax = POPMARK; \ SV ** const mark = PL_stack_base + ax++ #endif #ifndef XSprePUSH #define XSprePUSH (sp = PL_stack_base + ax - 1) #endif #if (PERL_BCDVERSION < 0x5005000) #undef XSRETURN #define XSRETURN(off) \ STMT_START { \ PL_stack_sp = PL_stack_base + ax + ((off) - 1); \ return; \ } STMT_END #endif #ifndef XSPROTO #define XSPROTO(name) void name(pTHX_ CV* cv) #endif #ifndef SVfARG #define SVfARG(p) ((void*)(p)) #endif #ifndef PERL_ABS #define PERL_ABS(x) ((x) < 0 ? -(x) : (x)) #endif #ifndef dVAR #define dVAR dNOOP #endif #ifndef SVf #define SVf "_" #endif #ifndef CPERLscope #define CPERLscope(x) x #endif #ifndef PERL_HASH #define PERL_HASH(hash,str,len) \ STMT_START { \ const char *s_PeRlHaSh = str; \ I32 i_PeRlHaSh = len; \ U32 hash_PeRlHaSh = 0; \ while (i_PeRlHaSh--) \ hash_PeRlHaSh = hash_PeRlHaSh * 33 + *s_PeRlHaSh++; \ (hash) = hash_PeRlHaSh; \ } STMT_END #endif #ifndef PERLIO_FUNCS_DECL #ifdef PERLIO_FUNCS_CONST #define PERLIO_FUNCS_DECL(funcs) const PerlIO_funcs funcs #define PERLIO_FUNCS_CAST(funcs) (PerlIO_funcs*)(funcs) #else #define PERLIO_FUNCS_DECL(funcs) PerlIO_funcs funcs #define PERLIO_FUNCS_CAST(funcs) (funcs) #endif #endif #if (PERL_BCDVERSION < 0x5009003) #ifdef ARGSproto typedef OP* (CPERLscope(*Perl_ppaddr_t))(ARGSproto); #else typedef OP* (CPERLscope(*Perl_ppaddr_t))(pTHX); #endif typedef OP* (CPERLscope(*Perl_check_t)) (pTHX_ OP*); #endif #if defined(EBCDIC) && defined(NATIVE_TO_ASCI) #ifndef NATIVE_TO_LATIN1 #define NATIVE_TO_LATIN1(c) NATIVE_TO_ASCII(c) #endif #ifndef LATIN1_TO_NATIVE #define LATIN1_TO_NATIVE(c) ASCII_TO_NATIVE(c) #endif #ifndef NATIVE_TO_UNI #define NATIVE_TO_UNI(c) ((c) > 255 ? (c) : NATIVE_TO_LATIN1(c)) #endif #ifndef UNI_TO_NATIVE #define UNI_TO_NATIVE(c) ((c) > 255 ? (c) : LATIN1_TO_NATIVE(c)) #endif #else #ifndef NATIVE_TO_LATIN1 #define NATIVE_TO_LATIN1(c) (c) #endif #ifndef LATIN1_TO_NATIVE #define LATIN1_TO_NATIVE(c) (c) #endif #ifndef NATIVE_TO_UNI #define NATIVE_TO_UNI(c) (c) #endif #ifndef UNI_TO_NATIVE #define UNI_TO_NATIVE(c) (c) #endif #endif #undef isPSXSPC #undef isPSXSPC_A #undef isPSXSPC_L1 #ifdef EBCDIC #if (PERL_BCDVERSION < 0x5022000) #undef isALNUM #undef isALNUM_A #undef isALNUM_L1 #undef isALNUMC #undef isALNUMC_A #undef isALNUMC_L1 #undef isALPHA #undef isALPHA_A #undef isALPHA_L1 #undef isALPHANUMERIC #undef isALPHANUMERIC_A #undef isALPHANUMERIC_L1 #undef isASCII #undef isASCII_A #undef isASCII_L1 #undef isBLANK #undef isBLANK_A #undef isBLANK_L1 #undef isCNTRL #undef isCNTRL_A #undef isCNTRL_L1 #undef isDIGIT #undef isDIGIT_A #undef isDIGIT_L1 #undef isGRAPH #undef isGRAPH_A #undef isGRAPH_L1 #undef isIDCONT #undef isIDCONT_A #undef isIDCONT_L1 #undef isIDFIRST #undef isIDFIRST_A #undef isIDFIRST_L1 #undef isLOWER #undef isLOWER_A #undef isLOWER_L1 #undef isOCTAL #undef isOCTAL_A #undef isOCTAL_L1 #undef isPRINT #undef isPRINT_A #undef isPRINT_L1 #undef isPUNCT #undef isPUNCT_A #undef isPUNCT_L1 #undef isSPACE #undef isSPACE_A #undef isSPACE_L1 #undef isUPPER #undef isUPPER_A #undef isUPPER_L1 #undef isWORDCHAR #undef isWORDCHAR_A #undef isWORDCHAR_L1 #undef isXDIGIT #undef isXDIGIT_A #undef isXDIGIT_L1 #endif #ifndef isASCII #define isASCII(c) (isCNTRL(c) || isPRINT(c)) #endif #ifndef isCNTRL #define isCNTRL(c) ( (c) == '\0' || (c) == '\a' || (c) == '\b' \ || (c) == '\f' || (c) == '\n' || (c) == '\r' \ || (c) == '\t' || (c) == '\v' \ || ((c) <= 3 && (c) >= 1) \ || (c) == 7 \ || ((c) <= 0x13 && (c) >= 0x0E) \ \ || (c) == 0x18 \ || (c) == 0x19 \ || ((c) <= 0x1F && (c) >= 0x1C) \ || (c) == 0x26 \ || (c) == 0x27 \ || (c) == 0x2D \ || (c) == 0x2E \ || (c) == 0x32 \ || (c) == 0x37 \ || (c) == 0x3C \ || (c) == 0x3D \ || (c) == 0x3F \ ) #endif #if '^' == 106 #define D_PPP_OUTLIER_CONTROL 0x5F #else #define D_PPP_OUTLIER_CONTROL 0xFF #endif #ifndef isCNTRL_L1 #define isCNTRL_L1(c) ((WIDEST_UTYPE) (c) < ' ' \ || (WIDEST_UTYPE) (c) == D_PPP_OUTLIER_CONTROL) #endif #ifndef isLOWER #define isLOWER(c) ( (c) >= 'a' && (c) <= 'z' \ && ( (c) <= 'i' \ || ((c) >= 'j' && (c) <= 'r') \ || (c) >= 's')) #endif #ifndef isUPPER #define isUPPER(c) ( (c) >= 'A' && (c) <= 'Z' \ && ( (c) <= 'I' \ || ((c) >= 'J' && (c) <= 'R') \ || (c) >= 'S')) #endif #else #if (PERL_BCDVERSION < 0x5004000) #undef isALNUM #undef isALNUM_A #undef isALPHA #undef isALPHA_A #undef isDIGIT #undef isDIGIT_A #undef isIDFIRST #undef isIDFIRST_A #undef isLOWER #undef isLOWER_A #undef isUPPER #undef isUPPER_A #endif #if (PERL_BCDVERSION == 0x5007000) #undef isGRAPH #endif #if (PERL_BCDVERSION < 0x5008000) #undef isCNTRL #endif #if (PERL_BCDVERSION < 0x5010000) #undef isPRINT #undef isPRINT_A #endif #if (PERL_BCDVERSION < 0x5014000) #undef isASCII #undef isASCII_A #endif #if (PERL_BCDVERSION < 0x5017008) #undef isPUNCT_L1 #endif #if (PERL_BCDVERSION < 0x5013007) #undef isALNUMC_L1 #endif #if (PERL_BCDVERSION < 0x5020000) #undef isSPACE #undef isSPACE_A #undef isSPACE_L1 #endif #ifndef isASCII #define isASCII(c) ((WIDEST_UTYPE) (c) <= 127) #endif #ifndef isCNTRL #define isCNTRL(c) ((WIDEST_UTYPE) (c) < ' ' || (c) == 127) #endif #ifndef isCNTRL_L1 #define isCNTRL_L1(c) ( (WIDEST_UTYPE) (c) < ' ' \ || inRANGE((c), 0x7F, 0x9F)) #endif #ifndef isLOWER #define isLOWER(c) inRANGE((c), 'a', 'z') #endif #ifndef isUPPER #define isUPPER(c) inRANGE((c), 'A', 'Z') #endif #endif #ifndef isASCII_L1 #define isASCII_L1(c) isASCII(c) #endif #ifndef isASCII_LC #define isASCII_LC(c) isASCII(c) #endif #ifndef isALNUM #define isALNUM(c) isWORDCHAR(c) #endif #ifndef isALNUMC #define isALNUMC(c) isALPHANUMERIC(c) #endif #ifndef isALNUMC_L1 #define isALNUMC_L1(c) isALPHANUMERIC_L1(c) #endif #ifndef isALPHA #define isALPHA(c) (isUPPER(c) || isLOWER(c)) #endif #ifndef isALPHA_L1 #define isALPHA_L1(c) (isUPPER_L1(c) || isLOWER_L1(c)) #endif #ifndef isALPHANUMERIC #define isALPHANUMERIC(c) (isALPHA(c) || isDIGIT(c)) #endif #ifndef isALPHANUMERIC_L1 #define isALPHANUMERIC_L1(c) (isALPHA_L1(c) || isDIGIT(c)) #endif #ifndef isALPHANUMERIC_LC #define isALPHANUMERIC_LC(c) (isALPHA_LC(c) || isDIGIT_LC(c)) #endif #ifndef isBLANK #define isBLANK(c) ((c) == ' ' || (c) == '\t') #endif #ifndef isBLANK_L1 #define isBLANK_L1(c) ( isBLANK(c) \ || ( FITS_IN_8_BITS(c) \ && NATIVE_TO_LATIN1((U8) c) == 0xA0)) #endif #ifndef isBLANK_LC #define isBLANK_LC(c) isBLANK(c) #endif #ifndef isDIGIT #define isDIGIT(c) inRANGE(c, '0', '9') #endif #ifndef isDIGIT_L1 #define isDIGIT_L1(c) isDIGIT(c) #endif #ifndef isGRAPH #define isGRAPH(c) (isWORDCHAR(c) || isPUNCT(c)) #endif #ifndef isGRAPH_L1 #define isGRAPH_L1(c) ( isPRINT_L1(c) \ && (c) != ' ' \ && NATIVE_TO_LATIN1((U8) c) != 0xA0) #endif #ifndef isIDCONT #define isIDCONT(c) isWORDCHAR(c) #endif #ifndef isIDCONT_L1 #define isIDCONT_L1(c) isWORDCHAR_L1(c) #endif #ifndef isIDCONT_LC #define isIDCONT_LC(c) isWORDCHAR_LC(c) #endif #ifndef isIDFIRST #define isIDFIRST(c) (isALPHA(c) || (c) == '_') #endif #ifndef isIDFIRST_L1 #define isIDFIRST_L1(c) (isALPHA_L1(c) || (U8) (c) == '_') #endif #ifndef isIDFIRST_LC #define isIDFIRST_LC(c) (isALPHA_LC(c) || (U8) (c) == '_') #endif #ifndef isLOWER_L1 #define isLOWER_L1(c) ( isLOWER(c) \ || ( FITS_IN_8_BITS(c) \ && ( ( NATIVE_TO_LATIN1((U8) c) >= 0xDF \ && NATIVE_TO_LATIN1((U8) c) != 0xF7) \ || NATIVE_TO_LATIN1((U8) c) == 0xAA \ || NATIVE_TO_LATIN1((U8) c) == 0xBA \ || NATIVE_TO_LATIN1((U8) c) == 0xB5))) #endif #ifndef isOCTAL #define isOCTAL(c) (((WIDEST_UTYPE)((c)) & ~7) == '0') #endif #ifndef isOCTAL_L1 #define isOCTAL_L1(c) isOCTAL(c) #endif #ifndef isPRINT #define isPRINT(c) (isGRAPH(c) || (c) == ' ') #endif #ifndef isPRINT_L1 #define isPRINT_L1(c) (FITS_IN_8_BITS(c) && ! isCNTRL_L1(c)) #endif #ifndef isPSXSPC #define isPSXSPC(c) isSPACE(c) #endif #ifndef isPSXSPC_L1 #define isPSXSPC_L1(c) isSPACE_L1(c) #endif #ifndef isPUNCT #define isPUNCT(c) ( (c) == '-' || (c) == '!' || (c) == '"' \ || (c) == '#' || (c) == '$' || (c) == '%' \ || (c) == '&' || (c) == '\'' || (c) == '(' \ || (c) == ')' || (c) == '*' || (c) == '+' \ || (c) == ',' || (c) == '.' || (c) == '/' \ || (c) == ':' || (c) == ';' || (c) == '<' \ || (c) == '=' || (c) == '>' || (c) == '?' \ || (c) == '@' || (c) == '[' || (c) == '\\' \ || (c) == ']' || (c) == '^' || (c) == '_' \ || (c) == '`' || (c) == '{' || (c) == '|' \ || (c) == '}' || (c) == '~') #endif #ifndef isPUNCT_L1 #define isPUNCT_L1(c) ( isPUNCT(c) \ || ( FITS_IN_8_BITS(c) \ && ( NATIVE_TO_LATIN1((U8) c) == 0xA1 \ || NATIVE_TO_LATIN1((U8) c) == 0xA7 \ || NATIVE_TO_LATIN1((U8) c) == 0xAB \ || NATIVE_TO_LATIN1((U8) c) == 0xB6 \ || NATIVE_TO_LATIN1((U8) c) == 0xB7 \ || NATIVE_TO_LATIN1((U8) c) == 0xBB \ || NATIVE_TO_LATIN1((U8) c) == 0xBF))) #endif #ifndef isSPACE #define isSPACE(c) ( isBLANK(c) || (c) == '\n' || (c) == '\r' \ || (c) == '\v' || (c) == '\f') #endif #ifndef isSPACE_L1 #define isSPACE_L1(c) ( isSPACE(c) \ || (FITS_IN_8_BITS(c) \ && ( NATIVE_TO_LATIN1((U8) c) == 0x85 \ || NATIVE_TO_LATIN1((U8) c) == 0xA0))) #endif #ifndef isUPPER_L1 #define isUPPER_L1(c) ( isUPPER(c) \ || (FITS_IN_8_BITS(c) \ && ( NATIVE_TO_LATIN1((U8) c) >= 0xC0 \ && NATIVE_TO_LATIN1((U8) c) <= 0xDE \ && NATIVE_TO_LATIN1((U8) c) != 0xD7))) #endif #ifndef isWORDCHAR #define isWORDCHAR(c) (isALPHANUMERIC(c) || (c) == '_') #endif #ifndef isWORDCHAR_L1 #define isWORDCHAR_L1(c) (isIDFIRST_L1(c) || isDIGIT(c)) #endif #ifndef isWORDCHAR_LC #define isWORDCHAR_LC(c) (isIDFIRST_LC(c) || isDIGIT_LC(c)) #endif #ifndef isXDIGIT #define isXDIGIT(c) ( isDIGIT(c) \ || inRANGE((c), 'a', 'f') \ || inRANGE((c), 'A', 'F')) #endif #ifndef isXDIGIT_L1 #define isXDIGIT_L1(c) isXDIGIT(c) #endif #ifndef isXDIGIT_LC #define isXDIGIT_LC(c) isxdigit(c) #endif #ifndef isALNUM_A #define isALNUM_A(c) isALNUM(c) #endif #ifndef isALNUMC_A #define isALNUMC_A(c) isALNUMC(c) #endif #ifndef isALPHA_A #define isALPHA_A(c) isALPHA(c) #endif #ifndef isALPHANUMERIC_A #define isALPHANUMERIC_A(c) isALPHANUMERIC(c) #endif #ifndef isASCII_A #define isASCII_A(c) isASCII(c) #endif #ifndef isBLANK_A #define isBLANK_A(c) isBLANK(c) #endif #ifndef isCNTRL_A #define isCNTRL_A(c) isCNTRL(c) #endif #ifndef isDIGIT_A #define isDIGIT_A(c) isDIGIT(c) #endif #ifndef isGRAPH_A #define isGRAPH_A(c) isGRAPH(c) #endif #ifndef isIDCONT_A #define isIDCONT_A(c) isIDCONT(c) #endif #ifndef isIDFIRST_A #define isIDFIRST_A(c) isIDFIRST(c) #endif #ifndef isLOWER_A #define isLOWER_A(c) isLOWER(c) #endif #ifndef isOCTAL_A #define isOCTAL_A(c) isOCTAL(c) #endif #ifndef isPRINT_A #define isPRINT_A(c) isPRINT(c) #endif #ifndef isPSXSPC_A #define isPSXSPC_A(c) isPSXSPC(c) #endif #ifndef isPUNCT_A #define isPUNCT_A(c) isPUNCT(c) #endif #ifndef isSPACE_A #define isSPACE_A(c) isSPACE(c) #endif #ifndef isUPPER_A #define isUPPER_A(c) isUPPER(c) #endif #ifndef isWORDCHAR_A #define isWORDCHAR_A(c) isWORDCHAR(c) #endif #ifndef isXDIGIT_A #define isXDIGIT_A(c) isXDIGIT(c) #endif #ifndef isASCII_utf8_safe #define isASCII_utf8_safe(s,e) (((e) - (s)) <= 0 ? 0 : isASCII(*(s))) #endif #ifndef isASCII_uvchr #define isASCII_uvchr(c) (FITS_IN_8_BITS(c) ? isASCII_L1(c) : 0) #endif #if (PERL_BCDVERSION >= 0x5006000) #ifdef isALPHA_uni #define D_PPP_is_ctype(upper, lower, c) \ (FITS_IN_8_BITS(c) \ ? is ## upper ## _L1(c) \ : is ## upper ## _uni((UV) (c))) #else #define D_PPP_is_ctype(upper, lower, c) \ (FITS_IN_8_BITS(c) \ ? is ## upper ## _L1(c) \ : is_uni_ ## lower((UV) (c))) #endif #ifndef isALPHA_uvchr #define isALPHA_uvchr(c) D_PPP_is_ctype(ALPHA, alpha, c) #endif #ifndef isALPHANUMERIC_uvchr #define isALPHANUMERIC_uvchr(c) (isALPHA_uvchr(c) || isDIGIT_uvchr(c)) #endif #ifdef is_uni_blank #ifndef isBLANK_uvchr #define isBLANK_uvchr(c) D_PPP_is_ctype(BLANK, blank, c) #endif #else #ifndef isBLANK_uvchr #define isBLANK_uvchr(c) (FITS_IN_8_BITS(c) \ ? isBLANK_L1(c) \ : ( (UV) (c) == 0x1680 \ || inRANGE((UV) (c), 0x2000, 0x200A) \ || (UV) (c) == 0x202F \ || (UV) (c) == 0x205F \ || (UV) (c) == 0x3000)) #endif #endif #ifndef isCNTRL_uvchr #define isCNTRL_uvchr(c) D_PPP_is_ctype(CNTRL, cntrl, c) #endif #ifndef isDIGIT_uvchr #define isDIGIT_uvchr(c) D_PPP_is_ctype(DIGIT, digit, c) #endif #ifndef isGRAPH_uvchr #define isGRAPH_uvchr(c) D_PPP_is_ctype(GRAPH, graph, c) #endif #ifndef isIDCONT_uvchr #define isIDCONT_uvchr(c) isWORDCHAR_uvchr(c) #endif #ifndef isIDFIRST_uvchr #define isIDFIRST_uvchr(c) D_PPP_is_ctype(IDFIRST, idfirst, c) #endif #ifndef isLOWER_uvchr #define isLOWER_uvchr(c) D_PPP_is_ctype(LOWER, lower, c) #endif #ifndef isPRINT_uvchr #define isPRINT_uvchr(c) D_PPP_is_ctype(PRINT, print, c) #endif #ifndef isPSXSPC_uvchr #define isPSXSPC_uvchr(c) isSPACE_uvchr(c) #endif #ifndef isPUNCT_uvchr #define isPUNCT_uvchr(c) D_PPP_is_ctype(PUNCT, punct, c) #endif #ifndef isSPACE_uvchr #define isSPACE_uvchr(c) D_PPP_is_ctype(SPACE, space, c) #endif #ifndef isUPPER_uvchr #define isUPPER_uvchr(c) D_PPP_is_ctype(UPPER, upper, c) #endif #ifndef isXDIGIT_uvchr #define isXDIGIT_uvchr(c) D_PPP_is_ctype(XDIGIT, xdigit, c) #endif #ifndef isWORDCHAR_uvchr #define isWORDCHAR_uvchr(c) (FITS_IN_8_BITS(c) \ ? isWORDCHAR_L1(c) : isALPHANUMERIC_uvchr(c)) #endif #ifndef isALPHA_utf8_safe #define isALPHA_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, ALPHA) #endif #ifdef isALPHANUMERIC_utf8 #ifndef isALPHANUMERIC_utf8_safe #define isALPHANUMERIC_utf8_safe(s,e) \ D_PPP_IS_GENERIC_UTF8_SAFE(s, e, ALPHANUMERIC) #endif #else #ifndef isALPHANUMERIC_utf8_safe #define isALPHANUMERIC_utf8_safe(s,e) \ (isALPHA_utf8_safe(s,e) || isDIGIT_utf8_safe(s,e)) #endif #endif #if 'A' == 65 #ifndef isBLANK_utf8_safe #define isBLANK_utf8_safe(s,e) \ ( ( LIKELY((e) > (s)) ) ? \ ( ( 0x09 == ((const U8*)s)[0] || 0x20 == ((const U8*)s)[0] ) ? 1 \ : ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) ? \ ( ( 0xC2 == ((const U8*)s)[0] ) ? \ ( ( 0xA0 == ((const U8*)s)[1] ) ? 2 : 0 ) \ : ( 0xE1 == ((const U8*)s)[0] ) ? \ ( ( ( 0x9A == ((const U8*)s)[1] ) && ( 0x80 == ((const U8*)s)[2] ) ) ? 3 : 0 )\ : ( 0xE2 == ((const U8*)s)[0] ) ? \ ( ( 0x80 == ((const U8*)s)[1] ) ? \ ( ( inRANGE(((const U8*)s)[2], 0x80, 0x8A ) || 0xAF == ((const U8*)s)[2] ) ? 3 : 0 )\ : ( ( 0x81 == ((const U8*)s)[1] ) && ( 0x9F == ((const U8*)s)[2] ) ) ? 3 : 0 )\ : ( ( ( 0xE3 == ((const U8*)s)[0] ) && ( 0x80 == ((const U8*)s)[1] ) ) && ( 0x80 == ((const U8*)s)[2] ) ) ? 3 : 0 )\ : 0 ) \ : 0 ) #endif #elif 'A' == 193 && '^' == 95 #ifndef isBLANK_utf8_safe #define isBLANK_utf8_safe(s,e) \ ( ( LIKELY((e) > (s)) ) ? \ ( ( 0x05 == ((const U8*)s)[0] || 0x40 == ((const U8*)s)[0] ) ? 1 \ : ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) ? \ ( ( 0x80 == ((const U8*)s)[0] ) ? \ ( ( 0x41 == ((const U8*)s)[1] ) ? 2 : 0 ) \ : ( 0xBC == ((const U8*)s)[0] ) ? \ ( ( ( 0x63 == ((const U8*)s)[1] ) && ( 0x41 == ((const U8*)s)[2] ) ) ? 3 : 0 )\ : ( 0xCA == ((const U8*)s)[0] ) ? \ ( ( 0x41 == ((const U8*)s)[1] ) ? \ ( ( inRANGE(((const U8*)s)[2], 0x41, 0x4A ) || 0x51 == ((const U8*)s)[2] ) ? 3 : 0 )\ : ( 0x42 == ((const U8*)s)[1] ) ? \ ( ( 0x56 == ((const U8*)s)[2] ) ? 3 : 0 ) \ : ( ( 0x43 == ((const U8*)s)[1] ) && ( 0x73 == ((const U8*)s)[2] ) ) ? 3 : 0 )\ : ( ( ( 0xCE == ((const U8*)s)[0] ) && ( 0x41 == ((const U8*)s)[1] ) ) && ( 0x41 == ((const U8*)s)[2] ) ) ? 3 : 0 )\ : 0 ) \ : 0 ) #endif #elif 'A' == 193 && '^' == 176 #ifndef isBLANK_utf8_safe #define isBLANK_utf8_safe(s,e) \ ( ( LIKELY((e) > (s)) ) ? \ ( ( 0x05 == ((const U8*)s)[0] || 0x40 == ((const U8*)s)[0] ) ? 1 \ : ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) ? \ ( ( 0x78 == ((const U8*)s)[0] ) ? \ ( ( 0x41 == ((const U8*)s)[1] ) ? 2 : 0 ) \ : ( 0xBD == ((const U8*)s)[0] ) ? \ ( ( ( 0x62 == ((const U8*)s)[1] ) && ( 0x41 == ((const U8*)s)[2] ) ) ? 3 : 0 )\ : ( 0xCA == ((const U8*)s)[0] ) ? \ ( ( 0x41 == ((const U8*)s)[1] ) ? \ ( ( inRANGE(((const U8*)s)[2], 0x41, 0x4A ) || 0x51 == ((const U8*)s)[2] ) ? 3 : 0 )\ : ( 0x42 == ((const U8*)s)[1] ) ? \ ( ( 0x56 == ((const U8*)s)[2] ) ? 3 : 0 ) \ : ( ( 0x43 == ((const U8*)s)[1] ) && ( 0x72 == ((const U8*)s)[2] ) ) ? 3 : 0 )\ : ( ( ( 0xCE == ((const U8*)s)[0] ) && ( 0x41 == ((const U8*)s)[1] ) ) && ( 0x41 == ((const U8*)s)[2] ) ) ? 3 : 0 )\ : 0 ) \ : 0 ) #endif #else #error Unknown character set #endif #ifndef isCNTRL_utf8_safe #define isCNTRL_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, CNTRL) #endif #ifndef isDIGIT_utf8_safe #define isDIGIT_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, DIGIT) #endif #ifndef isGRAPH_utf8_safe #define isGRAPH_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, GRAPH) #endif #ifdef isIDCONT_utf8 #ifndef isIDCONT_utf8_safe #define isIDCONT_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, IDCONT) #endif #else #ifndef isIDCONT_utf8_safe #define isIDCONT_utf8_safe(s,e) isWORDCHAR_utf8_safe(s,e) #endif #endif #ifndef isIDFIRST_utf8_safe #define isIDFIRST_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, IDFIRST) #endif #ifndef isLOWER_utf8_safe #define isLOWER_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, LOWER) #endif #ifndef isPRINT_utf8_safe #define isPRINT_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, PRINT) #endif #undef isPSXSPC_utf8_safe #ifndef isPSXSPC_utf8_safe #define isPSXSPC_utf8_safe(s,e) isSPACE_utf8_safe(s,e) #endif #ifndef isPUNCT_utf8_safe #define isPUNCT_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, PUNCT) #endif #ifndef isSPACE_utf8_safe #define isSPACE_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, SPACE) #endif #ifndef isUPPER_utf8_safe #define isUPPER_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, UPPER) #endif #ifdef isWORDCHAR_utf8 #ifndef isWORDCHAR_utf8_safe #define isWORDCHAR_utf8_safe(s,e) D_PPP_IS_GENERIC_UTF8_SAFE(s, e, WORDCHAR) #endif #else #ifndef isWORDCHAR_utf8_safe #define isWORDCHAR_utf8_safe(s,e) \ (isALPHANUMERIC_utf8_safe(s,e) || (*(s)) == '_') #endif #endif #if 'A' == 65 #ifndef isXDIGIT_utf8_safe #define isXDIGIT_utf8_safe(s,e) \ ( ( LIKELY((e) > (s)) ) ? \ ( ( inRANGE(((const U8*)s)[0], 0x30, 0x39 ) || inRANGE(((const U8*)s)[0], 0x41, 0x46 ) || inRANGE(((const U8*)s)[0], 0x61, 0x66 ) ) ? 1\ : ( ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xEF == ((const U8*)s)[0] ) ) ? ( ( 0xBC == ((const U8*)s)[1] ) ?\ ( ( inRANGE(((const U8*)s)[2], 0x90, 0x99 ) || inRANGE(((const U8*)s)[2], 0xA1, 0xA6 ) ) ? 3 : 0 )\ : ( ( 0xBD == ((const U8*)s)[1] ) && ( inRANGE(((const U8*)s)[2], 0x81, 0x86 ) ) ) ? 3 : 0 ) : 0 )\ : 0 ) #endif #elif 'A' == 193 && '^' == 95 #ifndef isXDIGIT_utf8_safe #define isXDIGIT_utf8_safe(s,e) \ ( ( LIKELY((e) > (s)) ) ? \ ( ( inRANGE(((const U8*)s)[0], 0x81, 0x86 ) || inRANGE(((const U8*)s)[0], 0xC1, 0xC6 ) || inRANGE(((const U8*)s)[0], 0xF0, 0xF9 ) ) ? 1\ : ( ( ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xDD == ((const U8*)s)[0] ) ) && ( 0x73 == ((const U8*)s)[1] ) ) ? ( ( 0x67 == ((const U8*)s)[2] ) ?\ ( ( inRANGE(((const U8*)s)[3], 0x57, 0x59 ) || inRANGE(((const U8*)s)[3], 0x62, 0x68 ) ) ? 4 : 0 )\ : ( ( inRANGE(((const U8*)s)[2], 0x68, 0x69 ) ) && ( inRANGE(((const U8*)s)[3], 0x42, 0x47 ) ) ) ? 4 : 0 ) : 0 )\ : 0 ) #endif #elif 'A' == 193 && '^' == 176 #ifndef isXDIGIT_utf8_safe #define isXDIGIT_utf8_safe(s,e) \ ( ( LIKELY((e) > (s)) ) ? \ ( ( inRANGE(((const U8*)s)[0], 0x81, 0x86 ) || inRANGE(((const U8*)s)[0], 0xC1, 0xC6 ) || inRANGE(((const U8*)s)[0], 0xF0, 0xF9 ) ) ? 1\ : ( ( ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xDD == ((const U8*)s)[0] ) ) && ( 0x72 == ((const U8*)s)[1] ) ) ? ( ( 0x66 == ((const U8*)s)[2] ) ?\ ( ( inRANGE(((const U8*)s)[3], 0x57, 0x59 ) || 0x5F == ((const U8*)s)[3] || inRANGE(((const U8*)s)[3], 0x62, 0x67 ) ) ? 4 : 0 )\ : ( ( inRANGE(((const U8*)s)[2], 0x67, 0x68 ) ) && ( inRANGE(((const U8*)s)[3], 0x42, 0x47 ) ) ) ? 4 : 0 ) : 0 )\ : 0 ) #endif #else #error Unknown character set #endif #ifndef isALPHA_LC_utf8_safe #define isALPHA_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, ALPHA) #endif #ifdef isALPHANUMERIC_utf8 #ifndef isALPHANUMERIC_LC_utf8_safe #define isALPHANUMERIC_LC_utf8_safe(s,e) \ D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, ALPHANUMERIC) #endif #else #ifndef isALPHANUMERIC_LC_utf8_safe #define isALPHANUMERIC_LC_utf8_safe(s,e) \ (isALPHA_LC_utf8_safe(s,e) || isDIGIT_LC_utf8_safe(s,e)) #endif #endif #ifndef isBLANK_LC_utf8_safe #define isBLANK_LC_utf8_safe(s,e) \ D_PPP_IS_GENERIC_LC_UTF8_SAFE_BROKEN(s, e, BLANK) #endif #ifndef isCNTRL_LC_utf8_safe #define isCNTRL_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, CNTRL) #endif #ifndef isDIGIT_LC_utf8_safe #define isDIGIT_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, DIGIT) #endif #ifndef isGRAPH_LC_utf8_safe #define isGRAPH_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, GRAPH) #endif #ifdef isIDCONT_utf8 #ifndef isIDCONT_LC_utf8_safe #define isIDCONT_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, IDCONT) #endif #else #ifndef isIDCONT_LC_utf8_safe #define isIDCONT_LC_utf8_safe(s,e) isWORDCHAR_LC_utf8_safe(s,e) #endif #endif #ifndef isIDFIRST_LC_utf8_safe #define isIDFIRST_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, IDFIRST) #endif #ifndef isLOWER_LC_utf8_safe #define isLOWER_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, LOWER) #endif #ifndef isPRINT_LC_utf8_safe #define isPRINT_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, PRINT) #endif #undef isPSXSPC_LC_utf8_safe #ifndef isPSXSPC_LC_utf8_safe #define isPSXSPC_LC_utf8_safe(s,e) isSPACE_LC_utf8_safe(s,e) #endif #ifndef isPUNCT_LC_utf8_safe #define isPUNCT_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, PUNCT) #endif #ifndef isSPACE_LC_utf8_safe #define isSPACE_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, SPACE) #endif #ifndef isUPPER_LC_utf8_safe #define isUPPER_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, UPPER) #endif #ifdef isWORDCHAR_utf8 #ifndef isWORDCHAR_LC_utf8_safe #define isWORDCHAR_LC_utf8_safe(s,e) D_PPP_IS_GENERIC_LC_UTF8_SAFE(s, e, WORDCHAR) #endif #else #ifndef isWORDCHAR_LC_utf8_safe #define isWORDCHAR_LC_utf8_safe(s,e) \ (isALPHANUMERIC_LC_utf8_safe(s,e) || (*(s)) == '_') #endif #endif #ifndef isXDIGIT_LC_utf8_safe #define isXDIGIT_LC_utf8_safe(s,e) \ D_PPP_IS_GENERIC_LC_UTF8_SAFE_BROKEN(s, e, XDIGIT) #endif #endif #define D_PPP_TOO_SHORT_MSG "Malformed UTF-8 character starting with:" \ " \\x%02x (too short; %d bytes available, need" \ " %d)\n" #if (PERL_BCDVERSION >= 0x5007003) #ifndef toLOWER_uvchr #define toLOWER_uvchr(c, s, l) UNI_TO_NATIVE(to_uni_lower(NATIVE_TO_UNI(c), s, l)) #endif #ifndef toUPPER_uvchr #define toUPPER_uvchr(c, s, l) UNI_TO_NATIVE(to_uni_upper(NATIVE_TO_UNI(c), s, l)) #endif #ifndef toTITLE_uvchr #define toTITLE_uvchr(c, s, l) UNI_TO_NATIVE(to_uni_title(NATIVE_TO_UNI(c), s, l)) #endif #ifndef toFOLD_uvchr #define toFOLD_uvchr(c, s, l) UNI_TO_NATIVE(to_uni_fold( NATIVE_TO_UNI(c), s, l)) #endif #if (PERL_BCDVERSION != 0x5015006) #if defined toLOWER_utf8 #define D_PPP_TO_LOWER_CALLEE(s,r,l) toLOWER_utf8(s,r,l) #else #define D_PPP_TO_LOWER_CALLEE(s,r,l) to_utf8_lower(s,r,l) #endif #if defined toTITLE_utf8 #define D_PPP_TO_TITLE_CALLEE(s,r,l) toTITLE_utf8(s,r,l) #else #define D_PPP_TO_TITLE_CALLEE(s,r,l) to_utf8_title(s,r,l) #endif #if defined toUPPER_utf8 #define D_PPP_TO_UPPER_CALLEE(s,r,l) toUPPER_utf8(s,r,l) #else #define D_PPP_TO_UPPER_CALLEE(s,r,l) to_utf8_upper(s,r,l) #endif #if defined toFOLD_utf8 #define D_PPP_TO_FOLD_CALLEE(s,r,l) toFOLD_utf8(s,r,l) #else #define D_PPP_TO_FOLD_CALLEE(s,r,l) to_utf8_fold(s,r,l) #endif #else #define D_PPP_TO_LOWER_CALLEE(s,r,l) \ Perl__to_utf8_lower_flags(aTHX_ s, r, l, 0, NULL) #define D_PPP_TO_TITLE_CALLEE(s,r,l) \ Perl__to_utf8_title_flags(aTHX_ s, r, l, 0, NULL) #define D_PPP_TO_UPPER_CALLEE(s,r,l) \ Perl__to_utf8_upper_flags(aTHX_ s, r, l, 0, NULL) #define D_PPP_TO_FOLD_CALLEE(s,r,l) \ Perl__to_utf8_fold_flags(aTHX_ s, r, l, FOLD_FLAGS_FULL, NULL) #endif #define D_PPP_GENERIC_MULTI_ARG_TO(name, s, e,r,l) \ (((((e) - (s)) <= 0) \ \ ? (croak("Attempting case change on zero length string"), \ 0) \ : ((e) - (s)) < UTF8SKIP(s)) \ ? (croak(D_PPP_TOO_SHORT_MSG, \ s[0], (int) ((e) - (s)), (int) UTF8SKIP(s)), \ 0) \ : D_PPP_TO_ ## name ## _CALLEE(s,r,l)) #ifndef toUPPER_utf8_safe #define toUPPER_utf8_safe(s,e,r,l) \ D_PPP_GENERIC_MULTI_ARG_TO(UPPER,s,e,r,l) #endif #ifndef toLOWER_utf8_safe #define toLOWER_utf8_safe(s,e,r,l) \ D_PPP_GENERIC_MULTI_ARG_TO(LOWER,s,e,r,l) #endif #ifndef toTITLE_utf8_safe #define toTITLE_utf8_safe(s,e,r,l) \ D_PPP_GENERIC_MULTI_ARG_TO(TITLE,s,e,r,l) #endif #ifndef toFOLD_utf8_safe #define toFOLD_utf8_safe(s,e,r,l) \ D_PPP_GENERIC_MULTI_ARG_TO(FOLD,s,e,r,l) #endif #elif (PERL_BCDVERSION >= 0x5006000) #ifdef uvchr_to_utf8 #define D_PPP_UV_TO_UTF8 uvchr_to_utf8 #else #define D_PPP_UV_TO_UTF8 uv_to_utf8 #endif #define D_PPP_GENERIC_SINGLE_ARG_TO_UVCHR(name, c, s, l) \ (*(l) = (D_PPP_UV_TO_UTF8(s, \ UNI_TO_NATIVE(to_uni_ ## name(NATIVE_TO_UNI(c)))) - (s)), \ UNI_TO_NATIVE(to_uni_ ## name(NATIVE_TO_UNI(c)))) #ifndef toLOWER_uvchr #define toLOWER_uvchr(c, s, l) \ D_PPP_GENERIC_SINGLE_ARG_TO_UVCHR(lower, c, s, l) #endif #ifndef toUPPER_uvchr #define toUPPER_uvchr(c, s, l) \ D_PPP_GENERIC_SINGLE_ARG_TO_UVCHR(upper, c, s, l) #endif #ifndef toTITLE_uvchr #define toTITLE_uvchr(c, s, l) \ D_PPP_GENERIC_SINGLE_ARG_TO_UVCHR(title, c, s, l) #endif #ifndef toFOLD_uvchr #define toFOLD_uvchr(c, s, l) toLOWER_uvchr(c, s, l) #endif #define D_PPP_GENERIC_SINGLE_ARG_TO_UTF8(name, s, e, r, l) \ (((((e) - (s)) <= 0) \ ? (croak("Attempting case change on zero length string"), \ 0) \ : ((e) - (s)) < UTF8SKIP(s)) \ ? (croak(D_PPP_TOO_SHORT_MSG, \ s[0], (int) ((e) - (s)), (int) UTF8SKIP(s)), \ 0) \ \ : D_PPP_UV_TO_UTF8(r, to_utf8_ ## name(s)), \ \ *(l) = UTF8SKIP(r), to_utf8_ ## name(r)) #ifndef toUPPER_utf8_safe #define toUPPER_utf8_safe(s,e,r,l) \ D_PPP_GENERIC_SINGLE_ARG_TO_UTF8(upper, s, e, r, l) #endif #ifndef toLOWER_utf8_safe #define toLOWER_utf8_safe(s,e,r,l) \ D_PPP_GENERIC_SINGLE_ARG_TO_UTF8(lower, s, e, r, l) #endif #ifndef toTITLE_utf8_safe #define toTITLE_utf8_safe(s,e,r,l) \ D_PPP_GENERIC_SINGLE_ARG_TO_UTF8(title, s, e, r, l) #endif #ifndef toFOLD_utf8_safe #define toFOLD_utf8_safe(s,e,r,l) toLOWER_utf8_safe(s,e,r,l) #endif #endif #if (PERL_BCDVERSION >= 0x5008000) #ifndef HeUTF8 #define HeUTF8(he) ((HeKLEN(he) == HEf_SVKEY) ? \ SvUTF8(HeKEY_sv(he)) : \ (U32)HeKUTF8(he)) #endif #endif #ifndef C_ARRAY_LENGTH #define C_ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0])) #endif #ifndef C_ARRAY_END #define C_ARRAY_END(a) ((a) + C_ARRAY_LENGTH(a)) #endif #ifndef LIKELY #define LIKELY(x) (x) #endif #ifndef UNLIKELY #define UNLIKELY(x) (x) #endif #ifndef MUTABLE_PTR #if defined(PERL_USE_GCC_BRACE_GROUPS) #define MUTABLE_PTR(p) ({ void *_p = (p); _p; }) #else #define MUTABLE_PTR(p) ((void *) (p)) #endif #endif #ifndef MUTABLE_AV #define MUTABLE_AV(p) ((AV *)MUTABLE_PTR(p)) #endif #ifndef MUTABLE_CV #define MUTABLE_CV(p) ((CV *)MUTABLE_PTR(p)) #endif #ifndef MUTABLE_GV #define MUTABLE_GV(p) ((GV *)MUTABLE_PTR(p)) #endif #ifndef MUTABLE_HV #define MUTABLE_HV(p) ((HV *)MUTABLE_PTR(p)) #endif #ifndef MUTABLE_IO #define MUTABLE_IO(p) ((IO *)MUTABLE_PTR(p)) #endif #ifndef MUTABLE_SV #define MUTABLE_SV(p) ((SV *)MUTABLE_PTR(p)) #endif #if (PERL_BCDVERSION >= 0x5004000) && !defined(vnewSVpvf) #if defined(PERL_USE_GCC_BRACE_GROUPS) #define vnewSVpvf(pat, args) ({ SV *_sv = newSV(0); sv_vsetpvfn(_sv, (pat), strlen((pat)), (args), Null(SV**), 0, Null(bool*)); _sv; }) #else #define vnewSVpvf(pat, args) ((PL_Sv = newSV(0)), sv_vsetpvfn(PL_Sv, (pat), strlen((pat)), (args), Null(SV**), 0, Null(bool*)), PL_Sv) #endif #endif #if (PERL_BCDVERSION >= 0x5004000) && !defined(sv_vcatpvf) #define sv_vcatpvf(sv, pat, args) sv_vcatpvfn(sv, pat, strlen(pat), args, Null(SV**), 0, Null(bool*)) #endif #if (PERL_BCDVERSION >= 0x5004000) && !defined(sv_vsetpvf) #define sv_vsetpvf(sv, pat, args) sv_vsetpvfn(sv, pat, strlen(pat), args, Null(SV**), 0, Null(bool*)) #endif #if (PERL_BCDVERSION >= 0x5004000) && !defined(sv_catpvf_mg) #if defined(NEED_sv_catpvf_mg) static void DPPP_(my_sv_catpvf_mg)(pTHX_ SV * const sv, const char * const pat, ...); static #else extern void DPPP_(my_sv_catpvf_mg)(pTHX_ SV * const sv, const char * const pat, ...); #endif #if defined(NEED_sv_catpvf_mg) || defined(NEED_sv_catpvf_mg_GLOBAL) #define Perl_sv_catpvf_mg DPPP_(my_sv_catpvf_mg) void DPPP_(my_sv_catpvf_mg)(pTHX_ SV * const sv, const char * const pat, ...) { va_list args; va_start(args, pat); sv_vcatpvfn(sv, pat, strlen(pat), &args, Null(SV**), 0, Null(bool*)); SvSETMAGIC(sv); va_end(args); } #endif #endif #ifdef PERL_IMPLICIT_CONTEXT #if (PERL_BCDVERSION >= 0x5004000) && !defined(sv_catpvf_mg_nocontext) #if defined(NEED_sv_catpvf_mg_nocontext) static void DPPP_(my_sv_catpvf_mg_nocontext)(SV * const sv, const char * const pat, ...); static #else extern void DPPP_(my_sv_catpvf_mg_nocontext)(SV * const sv, const char * const pat, ...); #endif #if defined(NEED_sv_catpvf_mg_nocontext) || defined(NEED_sv_catpvf_mg_nocontext_GLOBAL) #define sv_catpvf_mg_nocontext DPPP_(my_sv_catpvf_mg_nocontext) #define Perl_sv_catpvf_mg_nocontext DPPP_(my_sv_catpvf_mg_nocontext) void DPPP_(my_sv_catpvf_mg_nocontext)(SV * const sv, const char * const pat, ...) { dTHX; va_list args; va_start(args, pat); sv_vcatpvfn(sv, pat, strlen(pat), &args, Null(SV**), 0, Null(bool*)); SvSETMAGIC(sv); va_end(args); } #endif #endif #endif #ifndef sv_catpvf_mg #ifdef PERL_IMPLICIT_CONTEXT #define sv_catpvf_mg Perl_sv_catpvf_mg_nocontext #else #define sv_catpvf_mg Perl_sv_catpvf_mg #endif #endif #if (PERL_BCDVERSION >= 0x5004000) && !defined(sv_vcatpvf_mg) #define sv_vcatpvf_mg(sv, pat, args) \ STMT_START { \ sv_vcatpvfn(sv, pat, strlen(pat), args, Null(SV**), 0, Null(bool*)); \ SvSETMAGIC(sv); \ } STMT_END #endif #if (PERL_BCDVERSION >= 0x5004000) && !defined(sv_setpvf_mg) #if defined(NEED_sv_setpvf_mg) static void DPPP_(my_sv_setpvf_mg)(pTHX_ SV * const sv, const char * const pat, ...); static #else extern void DPPP_(my_sv_setpvf_mg)(pTHX_ SV * const sv, const char * const pat, ...); #endif #if defined(NEED_sv_setpvf_mg) || defined(NEED_sv_setpvf_mg_GLOBAL) #define Perl_sv_setpvf_mg DPPP_(my_sv_setpvf_mg) void DPPP_(my_sv_setpvf_mg)(pTHX_ SV * const sv, const char * const pat, ...) { va_list args; va_start(args, pat); sv_vsetpvfn(sv, pat, strlen(pat), &args, Null(SV**), 0, Null(bool*)); SvSETMAGIC(sv); va_end(args); } #endif #endif #ifdef PERL_IMPLICIT_CONTEXT #if (PERL_BCDVERSION >= 0x5004000) && !defined(sv_setpvf_mg_nocontext) #if defined(NEED_sv_setpvf_mg_nocontext) static void DPPP_(my_sv_setpvf_mg_nocontext)(SV * const sv, const char * const pat, ...); static #else extern void DPPP_(my_sv_setpvf_mg_nocontext)(SV * const sv, const char * const pat, ...); #endif #if defined(NEED_sv_setpvf_mg_nocontext) || defined(NEED_sv_setpvf_mg_nocontext_GLOBAL) #define sv_setpvf_mg_nocontext DPPP_(my_sv_setpvf_mg_nocontext) #define Perl_sv_setpvf_mg_nocontext DPPP_(my_sv_setpvf_mg_nocontext) void DPPP_(my_sv_setpvf_mg_nocontext)(SV * const sv, const char * const pat, ...) { dTHX; va_list args; va_start(args, pat); sv_vsetpvfn(sv, pat, strlen(pat), &args, Null(SV**), 0, Null(bool*)); SvSETMAGIC(sv); va_end(args); } #endif #endif #endif #ifndef sv_setpvf_mg #ifdef PERL_IMPLICIT_CONTEXT #define sv_setpvf_mg Perl_sv_setpvf_mg_nocontext #else #define sv_setpvf_mg Perl_sv_setpvf_mg #endif #endif #if (PERL_BCDVERSION >= 0x5004000) && !defined(sv_vsetpvf_mg) #define sv_vsetpvf_mg(sv, pat, args) \ STMT_START { \ sv_vsetpvfn(sv, pat, strlen(pat), args, Null(SV**), 0, Null(bool*)); \ SvSETMAGIC(sv); \ } STMT_END #endif #ifndef sv_2pv_nolen #define sv_2pv_nolen(sv) SvPV_nolen(sv) #endif #ifdef SvPVbyte #if (PERL_BCDVERSION < 0x5007000) #ifndef sv_2pvbyte #define sv_2pvbyte(sv, lp) (sv_utf8_downgrade((sv), 0), SvPV((sv), *(lp))) #endif #undef SvPVbyte #define SvPVbyte(sv, lp) \ ((SvFLAGS(sv) & (SVf_POK|SVf_UTF8)) == (SVf_POK) \ ? ((lp = SvCUR(sv)), SvPVX(sv)) : sv_2pvbyte(sv, &lp)) #endif #else #define SvPVbyte SvPV #define sv_2pvbyte sv_2pv #endif #ifndef sv_2pvbyte_nolen #define sv_2pvbyte_nolen(sv) sv_2pv_nolen(sv) #endif #ifndef SV_IMMEDIATE_UNREF #define SV_IMMEDIATE_UNREF 0 #endif #ifndef SV_GMAGIC #define SV_GMAGIC 0 #endif #ifndef SV_COW_DROP_PV #define SV_COW_DROP_PV 0 #endif #ifndef SV_UTF8_NO_ENCODING #define SV_UTF8_NO_ENCODING 0 #endif #ifndef SV_CONST_RETURN #define SV_CONST_RETURN 0 #endif #ifndef SV_MUTABLE_RETURN #define SV_MUTABLE_RETURN 0 #endif #ifndef SV_SMAGIC #define SV_SMAGIC 0 #endif #ifndef SV_HAS_TRAILING_NUL #define SV_HAS_TRAILING_NUL 0 #endif #ifndef SV_COW_SHARED_HASH_KEYS #define SV_COW_SHARED_HASH_KEYS 0 #endif #if defined(PERL_USE_GCC_BRACE_GROUPS) #ifndef sv_2pv_flags #define sv_2pv_flags(sv, lp, flags) ({ SV *_sv = (sv); const I32 _flags = (flags); STRLEN *_lp = lp; _lp = _lp ? : &PL_na; (!(_flags & SV_GMAGIC) && SvGMAGICAL(_sv)) ? ({ char *_pv; SvGMAGICAL_off(_sv); _pv = sv_2pv(_sv, _lp); SvGMAGICAL_on(_sv); _pv; }) : sv_2pv(_sv, _lp); }) #endif #ifndef sv_pvn_force_flags #define sv_pvn_force_flags(sv, lp, flags) ({ SV *_sv = (sv); const I32 _flags = (flags); STRLEN *_lp = lp; _lp = _lp ? : &PL_na; (!(_flags & SV_GMAGIC) && SvGMAGICAL(_sv)) ? ({ char *_pv; SvGMAGICAL_off(_sv); _pv = sv_pvn_force(_sv, _lp); SvGMAGICAL_on(_sv); _pv; }) : sv_pvn_force(_sv, _lp); }) #endif #else #ifndef sv_2pv_flags #define sv_2pv_flags(sv, lp, flags) ((PL_Sv = (sv)), (!((flags) & SV_GMAGIC) && SvGMAGICAL(PL_Sv)) ? (SvGMAGICAL_off(PL_Sv), (PL_Xpv = (XPV *)sv_2pv(PL_Sv, (lp) ? (lp) : &PL_na)), SvGMAGICAL_on(PL_Sv), (char *)PL_Xpv) : sv_2pv(PL_Sv, (lp) ? (lp) : &PL_na)) #endif #ifndef sv_pvn_force_flags #define sv_pvn_force_flags(sv, lp, flags) ((PL_Sv = (sv)), (!((flags) & SV_GMAGIC) && SvGMAGICAL(PL_Sv)) ? (SvGMAGICAL_off(PL_Sv), (PL_Xpv = (XPV *)sv_pvn_force(PL_Sv, (lp) ? (lp) : &PL_na)), SvGMAGICAL_on(PL_Sv), (char *)PL_Xpv) : sv_pvn_force(PL_Sv, (lp) ? (lp) : &PL_na)) #endif #endif #if (PERL_BCDVERSION < 0x5008008) || ( (PERL_BCDVERSION >= 0x5009000) && (PERL_BCDVERSION < 0x5009003) ) #define D_PPP_SVPV_NOLEN_LP_ARG &PL_na #else #define D_PPP_SVPV_NOLEN_LP_ARG 0 #endif #ifndef SvPV_const #define SvPV_const(sv, lp) SvPV_flags_const(sv, lp, SV_GMAGIC) #endif #ifndef SvPV_mutable #define SvPV_mutable(sv, lp) SvPV_flags_mutable(sv, lp, SV_GMAGIC) #endif #ifndef SvPV_flags #define SvPV_flags(sv, lp, flags) \ ((SvFLAGS(sv) & (SVf_POK)) == SVf_POK \ ? ((lp = SvCUR(sv)), SvPVX(sv)) : sv_2pv_flags(sv, &lp, flags)) #endif #ifndef SvPV_flags_const #define SvPV_flags_const(sv, lp, flags) \ ((SvFLAGS(sv) & (SVf_POK)) == SVf_POK \ ? ((lp = SvCUR(sv)), SvPVX_const(sv)) : \ (const char*) sv_2pv_flags(sv, &lp, flags|SV_CONST_RETURN)) #endif #ifndef SvPV_flags_const_nolen #define SvPV_flags_const_nolen(sv, flags) \ ((SvFLAGS(sv) & (SVf_POK)) == SVf_POK \ ? SvPVX_const(sv) : \ (const char*) sv_2pv_flags(sv, D_PPP_SVPV_NOLEN_LP_ARG, flags|SV_CONST_RETURN)) #endif #ifndef SvPV_flags_mutable #define SvPV_flags_mutable(sv, lp, flags) \ ((SvFLAGS(sv) & (SVf_POK)) == SVf_POK \ ? ((lp = SvCUR(sv)), SvPVX_mutable(sv)) : \ sv_2pv_flags(sv, &lp, flags|SV_MUTABLE_RETURN)) #endif #ifndef SvPV_force #define SvPV_force(sv, lp) SvPV_force_flags(sv, lp, SV_GMAGIC) #endif #ifndef SvPV_force_nolen #define SvPV_force_nolen(sv) SvPV_force_flags_nolen(sv, SV_GMAGIC) #endif #ifndef SvPV_force_mutable #define SvPV_force_mutable(sv, lp) SvPV_force_flags_mutable(sv, lp, SV_GMAGIC) #endif #ifndef SvPV_force_nomg #define SvPV_force_nomg(sv, lp) SvPV_force_flags(sv, lp, 0) #endif #ifndef SvPV_force_nomg_nolen #define SvPV_force_nomg_nolen(sv) SvPV_force_flags_nolen(sv, 0) #endif #ifndef SvPV_force_flags #define SvPV_force_flags(sv, lp, flags) \ ((SvFLAGS(sv) & (SVf_POK|SVf_THINKFIRST)) == SVf_POK \ ? ((lp = SvCUR(sv)), SvPVX(sv)) : sv_pvn_force_flags(sv, &lp, flags)) #endif #ifndef SvPV_force_flags_nolen #define SvPV_force_flags_nolen(sv, flags) \ ((SvFLAGS(sv) & (SVf_POK|SVf_THINKFIRST)) == SVf_POK \ ? SvPVX(sv) : sv_pvn_force_flags(sv, D_PPP_SVPV_NOLEN_LP_ARG, flags)) #endif #ifndef SvPV_force_flags_mutable #define SvPV_force_flags_mutable(sv, lp, flags) \ ((SvFLAGS(sv) & (SVf_POK|SVf_THINKFIRST)) == SVf_POK \ ? ((lp = SvCUR(sv)), SvPVX_mutable(sv)) \ : sv_pvn_force_flags(sv, &lp, flags|SV_MUTABLE_RETURN)) #endif #ifndef SvPV_nolen #define SvPV_nolen(sv) \ ((SvFLAGS(sv) & (SVf_POK)) == SVf_POK \ ? SvPVX(sv) : sv_2pv_flags(sv, D_PPP_SVPV_NOLEN_LP_ARG, SV_GMAGIC)) #endif #ifndef SvPV_nolen_const #define SvPV_nolen_const(sv) \ ((SvFLAGS(sv) & (SVf_POK)) == SVf_POK \ ? SvPVX_const(sv) : sv_2pv_flags(sv, D_PPP_SVPV_NOLEN_LP_ARG, SV_GMAGIC|SV_CONST_RETURN)) #endif #if defined(PERL_USE_GCC_BRACE_GROUPS) #ifndef SvPVx_nolen_const #define SvPVx_nolen_const(sv) ({SV *sV_ = (sv); SvPV_nolen_const(sV_); }) #endif #else #ifndef SvPVx_nolen_const #define SvPVx_nolen_const(sv) (PL_Sv = sv, SvPV_nolen_const(PL_Sv)) #endif #endif #ifndef SvPV_nomg #define SvPV_nomg(sv, lp) SvPV_flags(sv, lp, 0) #endif #ifndef SvPV_nomg_const #define SvPV_nomg_const(sv, lp) SvPV_flags_const(sv, lp, 0) #endif #ifndef SvPV_nomg_const_nolen #define SvPV_nomg_const_nolen(sv) SvPV_flags_const_nolen(sv, 0) #endif #ifndef SvPV_nomg_nolen #define SvPV_nomg_nolen(sv) ((SvFLAGS(sv) & (SVf_POK)) == SVf_POK \ ? SvPVX(sv) : sv_2pv_flags(sv, D_PPP_SVPV_NOLEN_LP_ARG, 0)) #endif #ifndef SvPV_renew #define SvPV_renew(sv,n) STMT_START { SvLEN_set(sv, n); \ SvPV_set((sv), (char *) saferealloc( \ (Malloc_t)SvPVX(sv), (MEM_SIZE)((n)))); \ } STMT_END #endif #ifndef SvPVCLEAR #define SvPVCLEAR(sv) sv_setpvs((sv), "") #endif #ifndef WARN_ALL #define WARN_ALL 0 #endif #ifndef WARN_CLOSURE #define WARN_CLOSURE 1 #endif #ifndef WARN_DEPRECATED #define WARN_DEPRECATED 2 #endif #ifndef WARN_EXITING #define WARN_EXITING 3 #endif #ifndef WARN_GLOB #define WARN_GLOB 4 #endif #ifndef WARN_IO #define WARN_IO 5 #endif #ifndef WARN_CLOSED #define WARN_CLOSED 6 #endif #ifndef WARN_EXEC #define WARN_EXEC 7 #endif #ifndef WARN_LAYER #define WARN_LAYER 8 #endif #ifndef WARN_NEWLINE #define WARN_NEWLINE 9 #endif #ifndef WARN_PIPE #define WARN_PIPE 10 #endif #ifndef WARN_UNOPENED #define WARN_UNOPENED 11 #endif #ifndef WARN_MISC #define WARN_MISC 12 #endif #ifndef WARN_NUMERIC #define WARN_NUMERIC 13 #endif #ifndef WARN_ONCE #define WARN_ONCE 14 #endif #ifndef WARN_OVERFLOW #define WARN_OVERFLOW 15 #endif #ifndef WARN_PACK #define WARN_PACK 16 #endif #ifndef WARN_PORTABLE #define WARN_PORTABLE 17 #endif #ifndef WARN_RECURSION #define WARN_RECURSION 18 #endif #ifndef WARN_REDEFINE #define WARN_REDEFINE 19 #endif #ifndef WARN_REGEXP #define WARN_REGEXP 20 #endif #ifndef WARN_SEVERE #define WARN_SEVERE 21 #endif #ifndef WARN_DEBUGGING #define WARN_DEBUGGING 22 #endif #ifndef WARN_INPLACE #define WARN_INPLACE 23 #endif #ifndef WARN_INTERNAL #define WARN_INTERNAL 24 #endif #ifndef WARN_MALLOC #define WARN_MALLOC 25 #endif #ifndef WARN_SIGNAL #define WARN_SIGNAL 26 #endif #ifndef WARN_SUBSTR #define WARN_SUBSTR 27 #endif #ifndef WARN_SYNTAX #define WARN_SYNTAX 28 #endif #ifndef WARN_AMBIGUOUS #define WARN_AMBIGUOUS 29 #endif #ifndef WARN_BAREWORD #define WARN_BAREWORD 30 #endif #ifndef WARN_DIGIT #define WARN_DIGIT 31 #endif #ifndef WARN_PARENTHESIS #define WARN_PARENTHESIS 32 #endif #ifndef WARN_PRECEDENCE #define WARN_PRECEDENCE 33 #endif #ifndef WARN_PRINTF #define WARN_PRINTF 34 #endif #ifndef WARN_PROTOTYPE #define WARN_PROTOTYPE 35 #endif #ifndef WARN_QW #define WARN_QW 36 #endif #ifndef WARN_RESERVED #define WARN_RESERVED 37 #endif #ifndef WARN_SEMICOLON #define WARN_SEMICOLON 38 #endif #ifndef WARN_TAINT #define WARN_TAINT 39 #endif #ifndef WARN_THREADS #define WARN_THREADS 40 #endif #ifndef WARN_UNINITIALIZED #define WARN_UNINITIALIZED 41 #endif #ifndef WARN_UNPACK #define WARN_UNPACK 42 #endif #ifndef WARN_UNTIE #define WARN_UNTIE 43 #endif #ifndef WARN_UTF8 #define WARN_UTF8 44 #endif #ifndef WARN_VOID #define WARN_VOID 45 #endif #ifndef WARN_ASSERTIONS #define WARN_ASSERTIONS 46 #endif #ifndef packWARN #define packWARN(a) (a) #endif #ifndef packWARN2 #define packWARN2(a,b) (packWARN(a) << 8 | (b)) #endif #ifndef packWARN3 #define packWARN3(a,b,c) (packWARN2(a,b) << 8 | (c)) #endif #ifndef packWARN4 #define packWARN4(a,b,c,d) (packWARN3(a,b,c) << 8 | (d)) #endif #ifndef ckWARN #ifdef G_WARN_ON #define ckWARN(a) (PL_dowarn & G_WARN_ON) #else #define ckWARN(a) PL_dowarn #endif #endif #ifndef ckWARN2 #define ckWARN2(a,b) (ckWARN(a) || ckWARN(b)) #endif #ifndef ckWARN3 #define ckWARN3(a,b,c) (ckWARN(c) || ckWARN2(a,b)) #endif #ifndef ckWARN4 #define ckWARN4(a,b,c,d) (ckWARN(d) || ckWARN3(a,b,c)) #endif #ifndef ckWARN_d #ifdef isLEXWARN_off #define ckWARN_d(a) (isLEXWARN_off || ckWARN(a)) #else #define ckWARN_d(a) 1 #endif #endif #ifndef ckWARN2_d #define ckWARN2_d(a,b) (ckWARN_d(a) || ckWARN_d(b)) #endif #ifndef ckWARN3_d #define ckWARN3_d(a,b,c) (ckWARN_d(c) || ckWARN2_d(a,b)) #endif #ifndef ckWARN4_d #define ckWARN4_d(a,b,c,d) (ckWARN_d(d) || ckWARN3_d(a,b,c)) #endif #ifndef vwarner #define vwarner(err, pat, argsp) \ STMT_START { SV *sv; \ PERL_UNUSED_ARG(err); \ sv = vnewSVpvf(pat, argsp); \ sv_2mortal(sv); \ warn("%s", SvPV_nolen(sv)); \ } STMT_END #endif #if (PERL_BCDVERSION >= 0x5004000) && !defined(warner) #if defined(NEED_warner) static void DPPP_(my_warner)(U32 err, const char * pat, ...); static #else extern void DPPP_(my_warner)(U32 err, const char * pat, ...); #endif #if defined(NEED_warner) || defined(NEED_warner_GLOBAL) #define Perl_warner DPPP_(my_warner) void DPPP_(my_warner)(U32 err, const char *pat, ...) { va_list args; va_start(args, pat); vwarner(err, pat, &args); va_end(args); } #define warner Perl_warner #define Perl_warner_nocontext Perl_warner #endif #endif #if (PERL_BCDVERSION >= 0x5004000) && !defined(ck_warner) #if defined(NEED_ck_warner) static void DPPP_(my_ck_warner)(pTHX_ U32 err, const char * pat, ...); static #else extern void DPPP_(my_ck_warner)(pTHX_ U32 err, const char * pat, ...); #endif #if defined(NEED_ck_warner) || defined(NEED_ck_warner_GLOBAL) #define Perl_ck_warner DPPP_(my_ck_warner) void DPPP_(my_ck_warner)(pTHX_ U32 err, const char *pat, ...) { va_list args; if ( ! ckWARN((err ) & 0xFF) && ! ckWARN((err >> 8) & 0xFF) && ! ckWARN((err >> 16) & 0xFF) && ! ckWARN((err >> 24) & 0xFF)) { return; } va_start(args, pat); vwarner(err, pat, &args); va_end(args); } #define ck_warner Perl_ck_warner #endif #endif #if (PERL_BCDVERSION >= 0x5004000) && !defined(ck_warner_d) #if defined(NEED_ck_warner_d) static void DPPP_(my_ck_warner_d)(pTHX_ U32 err, const char * pat, ...); static #else extern void DPPP_(my_ck_warner_d)(pTHX_ U32 err, const char * pat, ...); #endif #if defined(NEED_ck_warner_d) || defined(NEED_ck_warner_d_GLOBAL) #define Perl_ck_warner_d DPPP_(my_ck_warner_d) void DPPP_(my_ck_warner_d)(pTHX_ U32 err, const char *pat, ...) { va_list args; if ( ! ckWARN_d((err ) & 0xFF) && ! ckWARN_d((err >> 8) & 0xFF) && ! ckWARN_d((err >> 16) & 0xFF) && ! ckWARN_d((err >> 24) & 0xFF)) { return; } va_start(args, pat); vwarner(err, pat, &args); va_end(args); } #define ck_warner_d Perl_ck_warner_d #endif #endif #ifndef IVdf #if IVSIZE == LONGSIZE #define IVdf "ld" #define UVuf "lu" #define UVof "lo" #define UVxf "lx" #define UVXf "lX" #elif IVSIZE == INTSIZE #define IVdf "d" #define UVuf "u" #define UVof "o" #define UVxf "x" #define UVXf "X" #else #error "cannot define IV/UV formats" #endif #endif #ifndef NVef #if defined(USE_LONG_DOUBLE) && defined(HAS_LONG_DOUBLE) && \ defined(PERL_PRIfldbl) && (PERL_BCDVERSION != 0x5006000) #define NVef PERL_PRIeldbl #define NVff PERL_PRIfldbl #define NVgf PERL_PRIgldbl #else #define NVef "e" #define NVff "f" #define NVgf "g" #endif #endif #ifndef sv_setuv #define sv_setuv(sv, uv) \ STMT_START { \ UV TeMpUv = uv; \ if (TeMpUv <= IV_MAX) \ sv_setiv(sv, TeMpUv); \ else \ sv_setnv(sv, (double)TeMpUv); \ } STMT_END #endif #ifndef newSVuv #define newSVuv(uv) ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv)) #endif #if defined(PERL_USE_GCC_BRACE_GROUPS) #ifndef sv_2uv #define sv_2uv(sv) ({ SV *_sv = (sv); (UV) (SvNOK(_sv) ? SvNV(_sv) : sv_2nv(_sv)); }) #endif #else #ifndef sv_2uv #define sv_2uv(sv) ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv))) #endif #endif #ifndef SvUVX #define SvUVX(sv) ((UV)SvIVX(sv)) #endif #ifndef SvUVXx #define SvUVXx(sv) SvUVX(sv) #endif #ifndef SvUV #define SvUV(sv) (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv)) #endif #if defined(PERL_USE_GCC_BRACE_GROUPS) #ifndef SvUVx #define SvUVx(sv) ({ SV *_sv = (sv)); SvUV(_sv); }) #endif #else #ifndef SvUVx #define SvUVx(sv) ((PL_Sv = (sv)), SvUV(PL_Sv)) #endif #endif #ifndef sv_uv #define sv_uv(sv) SvUVx(sv) #endif #if !defined(SvUOK) && defined(SvIOK_UV) #define SvUOK(sv) SvIOK_UV(sv) #endif #ifndef XST_mUV #define XST_mUV(i,v) (ST(i) = sv_2mortal(newSVuv(v)) ) #endif #ifndef XSRETURN_UV #define XSRETURN_UV(v) STMT_START { XST_mUV(0,v); XSRETURN(1); } STMT_END #endif #ifndef PUSHu #define PUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG; } STMT_END #endif #ifndef XPUSHu #define XPUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END #endif #if !defined(my_strnlen) #if defined(NEED_my_strnlen) static Size_t DPPP_(my_my_strnlen)(const char * str, Size_t maxlen); static #else extern Size_t DPPP_(my_my_strnlen)(const char * str, Size_t maxlen); #endif #if defined(NEED_my_strnlen) || defined(NEED_my_strnlen_GLOBAL) #define my_strnlen DPPP_(my_my_strnlen) #define Perl_my_strnlen DPPP_(my_my_strnlen) Size_t DPPP_(my_my_strnlen)(const char *str, Size_t maxlen) { const char *p = str; while(maxlen-- && *p) p++; return p - str; } #endif #endif #ifdef HAS_MEMCMP #ifndef memNE #define memNE(s1,s2,l) (memcmp(s1,s2,l)) #endif #ifndef memEQ #define memEQ(s1,s2,l) (!memcmp(s1,s2,l)) #endif #else #ifndef memNE #define memNE(s1,s2,l) (bcmp(s1,s2,l)) #endif #ifndef memEQ #define memEQ(s1,s2,l) (!bcmp(s1,s2,l)) #endif #endif #ifndef memEQs #define memEQs(s1, l, s2) \ (sizeof(s2)-1 == l && memEQ(s1, (s2 ""), (sizeof(s2)-1))) #endif #ifndef memNEs #define memNEs(s1, l, s2) !memEQs(s1, l, s2) #endif #ifndef memCHRs #define memCHRs(s, c) ((const char *) memchr("" s "" , c, sizeof(s)-1)) #endif #ifndef MoveD #define MoveD(s,d,n,t) memmove((char*)(d),(char*)(s), (n) * sizeof(t)) #endif #ifndef CopyD #define CopyD(s,d,n,t) memcpy((char*)(d),(char*)(s), (n) * sizeof(t)) #endif #ifdef HAS_MEMSET #ifndef ZeroD #define ZeroD(d,n,t) memzero((char*)(d), (n) * sizeof(t)) #endif #else #ifndef ZeroD #define ZeroD(d,n,t) ((void)memzero((char*)(d), (n) * sizeof(t)), d) #endif #endif #ifndef PoisonWith #define PoisonWith(d,n,t,b) (void)memset((char*)(d), (U8)(b), (n) * sizeof(t)) #endif #ifndef PoisonNew #define PoisonNew(d,n,t) PoisonWith(d,n,t,0xAB) #endif #ifndef PoisonFree #define PoisonFree(d,n,t) PoisonWith(d,n,t,0xEF) #endif #ifndef Poison #define Poison(d,n,t) PoisonFree(d,n,t) #endif #ifndef Newx #define Newx(v,n,t) New(0,v,n,t) #endif #ifndef Newxc #define Newxc(v,n,t,c) Newc(0,v,n,t,c) #endif #ifndef Newxz #define Newxz(v,n,t) Newz(0,v,n,t) #endif #ifdef NEED_mess_sv #define NEED_mess #endif #ifdef NEED_mess #define NEED_mess_nocontext #define NEED_vmess #endif #ifndef croak_sv #if (PERL_BCDVERSION >= 0x5007003) || ( (PERL_BCDVERSION >= 0x5006001) && (PERL_BCDVERSION < 0x5007000) ) #if ( (PERL_BCDVERSION >= 0x5008000) && (PERL_BCDVERSION < 0x5008009) ) || ( (PERL_BCDVERSION >= 0x5009000) && (PERL_BCDVERSION < 0x5010001) ) #define D_PPP_FIX_UTF8_ERRSV_FOR_SV(sv) \ STMT_START { \ SV *_errsv = ERRSV; \ SvFLAGS(_errsv) = (SvFLAGS(_errsv) & ~SVf_UTF8) | \ (SvFLAGS(sv) & SVf_UTF8); \ } STMT_END #else #define D_PPP_FIX_UTF8_ERRSV_FOR_SV(sv) STMT_START {} STMT_END #endif #define croak_sv(sv) \ STMT_START { \ SV *_sv = (sv); \ if (SvROK(_sv)) { \ sv_setsv(ERRSV, _sv); \ croak(NULL); \ } else { \ D_PPP_FIX_UTF8_ERRSV_FOR_SV(_sv); \ croak("%" SVf, SVfARG(_sv)); \ } \ } STMT_END #elif (PERL_BCDVERSION >= 0x5004000) #define croak_sv(sv) croak("%" SVf, SVfARG(sv)) #else #define croak_sv(sv) croak("%s", SvPV_nolen(sv)) #endif #endif #ifndef die_sv #if defined(NEED_die_sv) static OP * DPPP_(my_die_sv)(pTHX_ SV * baseex); static #else extern OP * DPPP_(my_die_sv)(pTHX_ SV * baseex); #endif #if defined(NEED_die_sv) || defined(NEED_die_sv_GLOBAL) #ifdef die_sv #undef die_sv #endif #define die_sv(a) DPPP_(my_die_sv)(aTHX_ a) #define Perl_die_sv DPPP_(my_die_sv) OP * DPPP_(my_die_sv)(pTHX_ SV *baseex) { croak_sv(baseex); return (OP *)NULL; } #endif #endif #ifndef warn_sv #if (PERL_BCDVERSION >= 0x5004000) #define warn_sv(sv) warn("%" SVf, SVfARG(sv)) #else #define warn_sv(sv) warn("%s", SvPV_nolen(sv)) #endif #endif #if ! defined vmess && (PERL_BCDVERSION >= 0x5004000) #if defined(NEED_vmess) static SV * DPPP_(my_vmess)(pTHX_ const char * pat, va_list * args); static #else extern SV * DPPP_(my_vmess)(pTHX_ const char * pat, va_list * args); #endif #if defined(NEED_vmess) || defined(NEED_vmess_GLOBAL) #ifdef vmess #undef vmess #endif #define vmess(a,b) DPPP_(my_vmess)(aTHX_ a,b) #define Perl_vmess DPPP_(my_vmess) SV* DPPP_(my_vmess)(pTHX_ const char* pat, va_list* args) { mess(pat, args); return PL_mess_sv; } #endif #endif #if (PERL_BCDVERSION < 0x5006000) && (PERL_BCDVERSION >= 0x5004000) #undef mess #endif #if !defined(mess_nocontext) && !defined(Perl_mess_nocontext) && (PERL_BCDVERSION >= 0x5004000) #if defined(NEED_mess_nocontext) static SV * DPPP_(my_mess_nocontext)(const char * pat, ...); static #else extern SV * DPPP_(my_mess_nocontext)(const char * pat, ...); #endif #if defined(NEED_mess_nocontext) || defined(NEED_mess_nocontext_GLOBAL) #define mess_nocontext DPPP_(my_mess_nocontext) #define Perl_mess_nocontext DPPP_(my_mess_nocontext) SV* DPPP_(my_mess_nocontext)(const char* pat, ...) { dTHX; SV *sv; va_list args; va_start(args, pat); sv = vmess(pat, &args); va_end(args); return sv; } #endif #endif #ifndef mess #if defined(NEED_mess) static SV * DPPP_(my_mess)(pTHX_ const char * pat, ...); static #else extern SV * DPPP_(my_mess)(pTHX_ const char * pat, ...); #endif #if defined(NEED_mess) || defined(NEED_mess_GLOBAL) #define Perl_mess DPPP_(my_mess) SV* DPPP_(my_mess)(pTHX_ const char* pat, ...) { SV *sv; va_list args; va_start(args, pat); sv = vmess(pat, &args); va_end(args); return sv; } #ifdef mess_nocontext #define mess mess_nocontext #else #define mess Perl_mess_nocontext #endif #endif #endif #if ! defined mess_sv && (PERL_BCDVERSION >= 0x5004000) #if defined(NEED_mess_sv) static SV * DPPP_(my_mess_sv)(pTHX_ SV * basemsg, bool consume); static #else extern SV * DPPP_(my_mess_sv)(pTHX_ SV * basemsg, bool consume); #endif #if defined(NEED_mess_sv) || defined(NEED_mess_sv_GLOBAL) #ifdef mess_sv #undef mess_sv #endif #define mess_sv(a,b) DPPP_(my_mess_sv)(aTHX_ a,b) #define Perl_mess_sv DPPP_(my_mess_sv) SV * DPPP_(my_mess_sv)(pTHX_ SV *basemsg, bool consume) { SV *tmp; SV *ret; if (SvPOK(basemsg) && SvCUR(basemsg) && *(SvEND(basemsg)-1) == '\n') { if (consume) return basemsg; ret = mess(""); SvSetSV_nosteal(ret, basemsg); return ret; } if (consume) { sv_catsv(basemsg, mess("")); return basemsg; } ret = mess(""); tmp = newSVsv(ret); SvSetSV_nosteal(ret, basemsg); sv_catsv(ret, tmp); sv_dec(tmp); return ret; } #endif #endif #ifndef warn_nocontext #define warn_nocontext warn #endif #ifndef croak_nocontext #define croak_nocontext croak #endif #ifndef croak_no_modify #define croak_no_modify() croak_nocontext("%s", PL_no_modify) #define Perl_croak_no_modify() croak_no_modify() #endif #ifndef croak_memory_wrap #if (PERL_BCDVERSION >= 0x5009002) || ( (PERL_BCDVERSION >= 0x5008006) && (PERL_BCDVERSION < 0x5009000) ) #define croak_memory_wrap() croak_nocontext("%s", PL_memory_wrap) #else #define croak_memory_wrap() croak_nocontext("panic: memory wrap") #endif #endif #ifndef croak_xs_usage #if defined(NEED_croak_xs_usage) static void DPPP_(my_croak_xs_usage)(const CV * const cv, const char * const params); static #else extern void DPPP_(my_croak_xs_usage)(const CV * const cv, const char * const params); #endif #if defined(NEED_croak_xs_usage) || defined(NEED_croak_xs_usage_GLOBAL) #define croak_xs_usage DPPP_(my_croak_xs_usage) #define Perl_croak_xs_usage DPPP_(my_croak_xs_usage) #ifndef PERL_ARGS_ASSERT_CROAK_XS_USAGE #define PERL_ARGS_ASSERT_CROAK_XS_USAGE assert(cv); assert(params) void DPPP_(my_croak_xs_usage)(const CV *const cv, const char *const params) { dTHX; const GV *const gv = CvGV(cv); PERL_ARGS_ASSERT_CROAK_XS_USAGE; if (gv) { const char *const gvname = GvNAME(gv); const HV *const stash = GvSTASH(gv); const char *const hvname = stash ? HvNAME(stash) : NULL; if (hvname) croak("Usage: %s::%s(%s)", hvname, gvname, params); else croak("Usage: %s(%s)", gvname, params); } else { croak("Usage: CODE(0x%" UVxf ")(%s)", PTR2UV(cv), params); } } #endif #endif #endif #ifndef mPUSHs #define mPUSHs(s) PUSHs(sv_2mortal(s)) #endif #ifndef PUSHmortal #define PUSHmortal PUSHs(sv_newmortal()) #endif #ifndef mPUSHp #define mPUSHp(p,l) sv_setpvn(PUSHmortal, (p), (l)) #endif #ifndef mPUSHn #define mPUSHn(n) sv_setnv(PUSHmortal, (NV)(n)) #endif #ifndef mPUSHi #define mPUSHi(i) sv_setiv(PUSHmortal, (IV)(i)) #endif #ifndef mPUSHu #define mPUSHu(u) sv_setuv(PUSHmortal, (UV)(u)) #endif #ifndef mXPUSHs #define mXPUSHs(s) XPUSHs(sv_2mortal(s)) #endif #ifndef XPUSHmortal #define XPUSHmortal XPUSHs(sv_newmortal()) #endif #ifndef mXPUSHp #define mXPUSHp(p,l) STMT_START { EXTEND(sp,1); sv_setpvn(PUSHmortal, (p), (l)); } STMT_END #endif #ifndef mXPUSHn #define mXPUSHn(n) STMT_START { EXTEND(sp,1); sv_setnv(PUSHmortal, (NV)(n)); } STMT_END #endif #ifndef mXPUSHi #define mXPUSHi(i) STMT_START { EXTEND(sp,1); sv_setiv(PUSHmortal, (IV)(i)); } STMT_END #endif #ifndef mXPUSHu #define mXPUSHu(u) STMT_START { EXTEND(sp,1); sv_setuv(PUSHmortal, (UV)(u)); } STMT_END #endif #ifndef call_sv #define call_sv perl_call_sv #endif #ifndef call_pv #define call_pv perl_call_pv #endif #ifndef call_argv #define call_argv perl_call_argv #endif #ifndef call_method #define call_method perl_call_method #endif #ifndef eval_sv #define eval_sv perl_eval_sv #endif #if (PERL_BCDVERSION >= 0x5003098) && (PERL_BCDVERSION < 0x5006000) #ifndef eval_pv #define eval_pv perl_eval_pv #endif #endif #if (PERL_BCDVERSION < 0x5006000) #ifndef Perl_eval_sv #define Perl_eval_sv perl_eval_sv #endif #if (PERL_BCDVERSION >= 0x5003098) #ifndef Perl_eval_pv #define Perl_eval_pv perl_eval_pv #endif #endif #endif #ifndef G_LIST #define G_LIST G_ARRAY #endif #ifndef PERL_LOADMOD_DENY #define PERL_LOADMOD_DENY 0x1 #endif #ifndef PERL_LOADMOD_NOIMPORT #define PERL_LOADMOD_NOIMPORT 0x2 #endif #ifndef PERL_LOADMOD_IMPORT_OPS #define PERL_LOADMOD_IMPORT_OPS 0x4 #endif #if defined(PERL_USE_GCC_BRACE_GROUPS) #define D_PPP_CROAK_IF_ERROR(cond) ({ \ SV *_errsv; \ ( (cond) \ && (_errsv = ERRSV) \ && (SvROK(_errsv) || SvTRUE(_errsv)) \ && (croak_sv(_errsv), 1)); \ }) #else PERL_STATIC_INLINE void D_PPP_CROAK_IF_ERROR(int cond) { dTHX; SV *errsv; if (!cond) return; errsv = ERRSV; if (SvROK(errsv) || SvTRUE(errsv)) croak_sv(errsv); } #define D_PPP_CROAK_IF_ERROR(cond) D_PPP_CROAK_IF_ERROR(cond) #endif #ifndef G_METHOD #define G_METHOD 64 #ifdef call_sv #undef call_sv #endif #if (PERL_BCDVERSION < 0x5006000) #define call_sv(sv, flags) ((flags) & G_METHOD ? perl_call_method((char *) SvPV_nolen_const(sv), \ (flags) & ~G_METHOD) : perl_call_sv(sv, flags)) #else #define call_sv(sv, flags) ((flags) & G_METHOD ? Perl_call_method(aTHX_ (char *) SvPV_nolen_const(sv), \ (flags) & ~G_METHOD) : Perl_call_sv(aTHX_ sv, flags)) #endif #endif #ifndef G_RETHROW #define G_RETHROW 8192 #ifdef eval_sv #undef eval_sv #endif #if defined(PERL_USE_GCC_BRACE_GROUPS) #define eval_sv(sv, flags) ({ I32 _flags = (flags); I32 _ret = Perl_eval_sv(aTHX_ sv, (_flags & ~G_RETHROW)); D_PPP_CROAK_IF_ERROR(_flags & G_RETHROW); _ret; }) #else #define eval_sv(sv, flags) ((PL_na = Perl_eval_sv(aTHX_ sv, ((flags) & ~G_RETHROW))), D_PPP_CROAK_IF_ERROR((flags) & G_RETHROW), (I32)PL_na) #endif #endif #if (PERL_BCDVERSION < 0x5031002) #ifdef eval_pv #undef eval_pv #if defined(PERL_USE_GCC_BRACE_GROUPS) #define eval_pv(p, croak_on_error) ({ SV *_sv = Perl_eval_pv(aTHX_ p, 0); D_PPP_CROAK_IF_ERROR(croak_on_error); _sv; }) #else #define eval_pv(p, croak_on_error) ((PL_Sv = Perl_eval_pv(aTHX_ p, 0)), D_PPP_CROAK_IF_ERROR(croak_on_error), PL_Sv) #endif #endif #endif #ifndef eval_pv #if defined(NEED_eval_pv) static SV * DPPP_(my_eval_pv)(const char * p, I32 croak_on_error); static #else extern SV * DPPP_(my_eval_pv)(const char * p, I32 croak_on_error); #endif #if defined(NEED_eval_pv) || defined(NEED_eval_pv_GLOBAL) #ifdef eval_pv #undef eval_pv #endif #define eval_pv(a,b) DPPP_(my_eval_pv)(aTHX_ a,b) #define Perl_eval_pv DPPP_(my_eval_pv) SV* DPPP_(my_eval_pv)(const char *p, I32 croak_on_error) { dSP; SV* sv = newSVpv(p, 0); PUSHMARK(sp); eval_sv(sv, G_SCALAR); SvREFCNT_dec(sv); SPAGAIN; sv = POPs; PUTBACK; D_PPP_CROAK_IF_ERROR(croak_on_error); return sv; } #endif #endif #if ! defined(vload_module) && defined(start_subparse) #if defined(NEED_vload_module) static void DPPP_(my_vload_module)(U32 flags, SV * name, SV * ver, va_list * args); static #else extern void DPPP_(my_vload_module)(U32 flags, SV * name, SV * ver, va_list * args); #endif #if defined(NEED_vload_module) || defined(NEED_vload_module_GLOBAL) #ifdef vload_module #undef vload_module #endif #define vload_module(a,b,c,d) DPPP_(my_vload_module)(aTHX_ a,b,c,d) #define Perl_vload_module DPPP_(my_vload_module) void DPPP_(my_vload_module)(U32 flags, SV *name, SV *ver, va_list *args) { dTHR; dVAR; OP *veop, *imop; OP * const modname = newSVOP(OP_CONST, 0, name); SvREADONLY_off(((SVOP*)modname)->op_sv); modname->op_private |= OPpCONST_BARE; if (ver) { veop = newSVOP(OP_CONST, 0, ver); } else veop = NULL; if (flags & PERL_LOADMOD_NOIMPORT) { imop = sawparens(newNULLLIST()); } else if (flags & PERL_LOADMOD_IMPORT_OPS) { imop = va_arg(*args, OP*); } else { SV *sv; imop = NULL; sv = va_arg(*args, SV*); while (sv) { imop = append_elem(OP_LIST, imop, newSVOP(OP_CONST, 0, sv)); sv = va_arg(*args, SV*); } } { const line_t ocopline = PL_copline; COP * const ocurcop = PL_curcop; const int oexpect = PL_expect; utilize(!(flags & PERL_LOADMOD_DENY), start_subparse(FALSE, 0), #if (PERL_BCDVERSION > 0x5003000) veop, #endif modname, imop); PL_expect = oexpect; PL_copline = ocopline; PL_curcop = ocurcop; } } #endif #endif #ifndef load_module #if defined(NEED_load_module) static void DPPP_(my_load_module)(U32 flags, SV * name, SV * ver, ...); static #else extern void DPPP_(my_load_module)(U32 flags, SV * name, SV * ver, ...); #endif #if defined(NEED_load_module) || defined(NEED_load_module_GLOBAL) #ifdef load_module #undef load_module #endif #define load_module DPPP_(my_load_module) #define Perl_load_module DPPP_(my_load_module) void DPPP_(my_load_module)(U32 flags, SV *name, SV *ver, ...) { va_list args; va_start(args, ver); vload_module(flags, name, ver, &args); va_end(args); } #endif #endif #ifndef newRV_inc #define newRV_inc(sv) newRV(sv) #endif #ifndef newRV_noinc #if defined(PERL_USE_GCC_BRACE_GROUPS) #define newRV_noinc(sv) ({ SV *_sv = (SV *)newRV((sv)); SvREFCNT_dec((sv)); _sv; }) #else #define newRV_noinc(sv) ((PL_Sv = (SV *)newRV((sv))), SvREFCNT_dec((sv)), PL_Sv) #endif #endif #if defined(MULTIPLICITY) || defined(PERL_OBJECT) || \ defined(PERL_CAPI) || defined(PERL_IMPLICIT_CONTEXT) #ifndef START_MY_CXT #define START_MY_CXT #if (PERL_BCDVERSION < 0x5004068) #define dMY_CXT_SV \ SV *my_cxt_sv = get_sv(MY_CXT_KEY, FALSE) #else #define dMY_CXT_SV \ SV *my_cxt_sv = *hv_fetch(PL_modglobal, MY_CXT_KEY, \ sizeof(MY_CXT_KEY)-1, TRUE) #endif #define dMY_CXT \ dMY_CXT_SV; \ my_cxt_t *my_cxtp = INT2PTR(my_cxt_t*,SvUV(my_cxt_sv)) #define MY_CXT_INIT \ dMY_CXT_SV; \ \ my_cxt_t *my_cxtp = (my_cxt_t*)SvPVX(newSV(sizeof(my_cxt_t)-1));\ Zero(my_cxtp, 1, my_cxt_t); \ sv_setuv(my_cxt_sv, PTR2UV(my_cxtp)) #define MY_CXT (*my_cxtp) #define pMY_CXT my_cxt_t *my_cxtp #define pMY_CXT_ pMY_CXT, #define _pMY_CXT ,pMY_CXT #define aMY_CXT my_cxtp #define aMY_CXT_ aMY_CXT, #define _aMY_CXT ,aMY_CXT #endif #ifndef MY_CXT_CLONE #define MY_CXT_CLONE \ dMY_CXT_SV; \ my_cxt_t *my_cxtp = (my_cxt_t*)SvPVX(newSV(sizeof(my_cxt_t)-1));\ Copy(INT2PTR(my_cxt_t*, SvUV(my_cxt_sv)), my_cxtp, 1, my_cxt_t);\ sv_setuv(my_cxt_sv, PTR2UV(my_cxtp)) #endif #else #ifndef START_MY_CXT #define START_MY_CXT static my_cxt_t my_cxt; #define dMY_CXT_SV dNOOP #define dMY_CXT dNOOP #define MY_CXT_INIT NOOP #define MY_CXT my_cxt #define pMY_CXT void #define pMY_CXT_ #define _pMY_CXT #define aMY_CXT #define aMY_CXT_ #define _aMY_CXT #endif #ifndef MY_CXT_CLONE #define MY_CXT_CLONE NOOP #endif #endif #ifndef SvREFCNT_inc #ifdef PERL_USE_GCC_BRACE_GROUPS #define SvREFCNT_inc(sv) \ ({ \ SV * const _sv = (SV*)(sv); \ if (_sv) \ (SvREFCNT(_sv))++; \ _sv; \ }) #else #define SvREFCNT_inc(sv) \ ((PL_Sv=(SV*)(sv)) ? (++(SvREFCNT(PL_Sv)),PL_Sv) : NULL) #endif #endif #ifndef SvREFCNT_inc_simple #ifdef PERL_USE_GCC_BRACE_GROUPS #define SvREFCNT_inc_simple(sv) \ ({ \ if (sv) \ (SvREFCNT(sv))++; \ (SV *)(sv); \ }) #else #define SvREFCNT_inc_simple(sv) \ ((sv) ? (SvREFCNT(sv)++,(SV*)(sv)) : NULL) #endif #endif #ifndef SvREFCNT_inc_NN #ifdef PERL_USE_GCC_BRACE_GROUPS #define SvREFCNT_inc_NN(sv) \ ({ \ SV * const _sv = (SV*)(sv); \ SvREFCNT(_sv)++; \ _sv; \ }) #else #define SvREFCNT_inc_NN(sv) \ (PL_Sv=(SV*)(sv),++(SvREFCNT(PL_Sv)),PL_Sv) #endif #endif #ifndef SvREFCNT_inc_void #ifdef PERL_USE_GCC_BRACE_GROUPS #define SvREFCNT_inc_void(sv) \ ({ \ SV * const _sv = (SV*)(sv); \ if (_sv) \ (void)(SvREFCNT(_sv)++); \ }) #else #define SvREFCNT_inc_void(sv) \ (void)((PL_Sv=(SV*)(sv)) ? ++(SvREFCNT(PL_Sv)) : 0) #endif #endif #ifndef SvREFCNT_inc_simple_void #define SvREFCNT_inc_simple_void(sv) STMT_START { if (sv) SvREFCNT(sv)++; } STMT_END #endif #ifndef SvREFCNT_inc_simple_NN #define SvREFCNT_inc_simple_NN(sv) (++SvREFCNT(sv), (SV*)(sv)) #endif #ifndef SvREFCNT_inc_void_NN #define SvREFCNT_inc_void_NN(sv) (void)(++SvREFCNT((SV*)(sv))) #endif #ifndef SvREFCNT_inc_simple_void_NN #define SvREFCNT_inc_simple_void_NN(sv) (void)(++SvREFCNT((SV*)(sv))) #endif #ifndef newSV_type #if defined(PERL_USE_GCC_BRACE_GROUPS) #define newSV_type(t) ({ SV *_sv = newSV(0); sv_upgrade(_sv, (t)); _sv; }) #else #define newSV_type(t) ((PL_Sv = newSV(0)), sv_upgrade(PL_Sv, (t)), PL_Sv) #endif #endif #if (PERL_BCDVERSION < 0x5006000) #define D_PPP_CONSTPV_ARG(x) ((char *) (x)) #else #define D_PPP_CONSTPV_ARG(x) (x) #endif #ifndef newSVpvn #define newSVpvn(data,len) ((data) \ ? ((len) ? newSVpv((data), (len)) : newSVpv("", 0)) \ : newSV(0)) #endif #ifndef newSVpvn_utf8 #define newSVpvn_utf8(s, len, u) newSVpvn_flags((s), (len), (u) ? SVf_UTF8 : 0) #endif #ifndef SVf_UTF8 #define SVf_UTF8 0 #endif #ifndef newSVpvn_flags #if defined(PERL_USE_GCC_BRACE_GROUPS) #define newSVpvn_flags(s, len, flags) \ ({ \ SV * sv = newSVpvn(D_PPP_CONSTPV_ARG(s), (len)); \ SvFLAGS(sv) |= ((flags) & SVf_UTF8); \ if ((flags) & SVs_TEMP) sv = sv_2mortal(sv); \ sv; \ }) #else PERL_STATIC_INLINE SV* D_PPP_newSVpvn_flags(const char *const s, const STRLEN len, const U32 flags) { dTHX; SV * sv = newSVpvn(s, len); SvFLAGS(sv) |= (flags & SVf_UTF8); if (flags & SVs_TEMP) return sv_2mortal(sv); return sv; } #define newSVpvn_flags(s, len, flags) D_PPP_newSVpvn_flags((s), (len), (flags)) #endif #endif #ifndef SV_NOSTEAL #define SV_NOSTEAL 16 #endif #if ( (PERL_BCDVERSION >= 0x5007003) && (PERL_BCDVERSION < 0x5008007) ) || ( (PERL_BCDVERSION >= 0x5009000) && (PERL_BCDVERSION < 0x5009002) ) #undef sv_setsv_flags #if defined(PERL_USE_GCC_BRACE_GROUPS) #define sv_setsv_flags(dstr, sstr, flags) \ STMT_START { \ if (((flags) & SV_NOSTEAL) && (sstr) && (SvFLAGS((SV *)(sstr)) & SVs_TEMP)) { \ SvTEMP_off((SV *)(sstr)); \ Perl_sv_setsv_flags(aTHX_ (dstr), (sstr), (flags) & ~SV_NOSTEAL); \ SvTEMP_on((SV *)(sstr)); \ } else { \ Perl_sv_setsv_flags(aTHX_ (dstr), (sstr), (flags) & ~SV_NOSTEAL); \ } \ } STMT_END #else #define sv_setsv_flags(dstr, sstr, flags) \ ( \ (((flags) & SV_NOSTEAL) && (sstr) && (SvFLAGS((SV *)(sstr)) & SVs_TEMP)) ? ( \ SvTEMP_off((SV *)(sstr)), \ Perl_sv_setsv_flags(aTHX_ (dstr), (sstr), (flags) & ~SV_NOSTEAL), \ SvTEMP_on((SV *)(sstr)), \ 1 \ ) : ( \ Perl_sv_setsv_flags(aTHX_ (dstr), (sstr), (flags) & ~SV_NOSTEAL), \ 1 \ ) \ ) #endif #endif #if defined(PERL_USE_GCC_BRACE_GROUPS) #ifndef sv_setsv_flags #define sv_setsv_flags(dstr, sstr, flags) \ STMT_START { \ if (((flags) & SV_NOSTEAL) && (sstr) && (SvFLAGS((SV *)(sstr)) & SVs_TEMP)) { \ SvTEMP_off((SV *)(sstr)); \ if (!((flags) & SV_GMAGIC) && (sstr) && SvGMAGICAL((SV *)(sstr))) { \ SvGMAGICAL_off((SV *)(sstr)); \ sv_setsv((dstr), (sstr)); \ SvGMAGICAL_on((SV *)(sstr)); \ } else { \ sv_setsv((dstr), (sstr)); \ } \ SvTEMP_on((SV *)(sstr)); \ } else { \ if (!((flags) & SV_GMAGIC) && (sstr) && SvGMAGICAL((SV *)(sstr))) { \ SvGMAGICAL_off((SV *)(sstr)); \ sv_setsv((dstr), (sstr)); \ SvGMAGICAL_on((SV *)(sstr)); \ } else { \ sv_setsv((dstr), (sstr)); \ } \ } \ } STMT_END #endif #else #ifndef sv_setsv_flags #define sv_setsv_flags(dstr, sstr, flags) \ ( \ (((flags) & SV_NOSTEAL) && (sstr) && (SvFLAGS((SV *)(sstr)) & SVs_TEMP)) ? ( \ SvTEMP_off((SV *)(sstr)), \ (!((flags) & SV_GMAGIC) && (sstr) && SvGMAGICAL((SV *)(sstr))) ? ( \ SvGMAGICAL_off((SV *)(sstr)), \ sv_setsv((dstr), (sstr)), \ SvGMAGICAL_on((SV *)(sstr)), \ 1 \ ) : ( \ sv_setsv((dstr), (sstr)), \ 1 \ ), \ SvTEMP_on((SV *)(sstr)), \ 1 \ ) : ( \ (!((flags) & SV_GMAGIC) && (sstr) && SvGMAGICAL((SV *)(sstr))) ? ( \ SvGMAGICAL_off((SV *)(sstr)), \ sv_setsv((dstr), (sstr)), \ SvGMAGICAL_on((SV *)(sstr)), \ 1 \ ) : ( \ sv_setsv((dstr), (sstr)), \ 1 \ ) \ ) \ ) #endif #endif #ifndef newSVsv_flags #if defined(PERL_USE_GCC_BRACE_GROUPS) #define newSVsv_flags(sv, flags) \ ({ \ SV *n= newSV(0); \ sv_setsv_flags(n, (sv), (flags)); \ n; \ }) #else PERL_STATIC_INLINE SV* D_PPP_newSVsv_flags(SV *const old, I32 flags) { dTHX; SV *n= newSV(0); sv_setsv_flags(n, old, flags); return n; } #define newSVsv_flags(sv, flags) D_PPP_newSVsv_flags(sv, flags) #endif #endif #ifndef newSVsv_nomg #define newSVsv_nomg(sv) newSVsv_flags((sv), SV_NOSTEAL) #endif #if (PERL_BCDVERSION >= 0x5017005) #ifndef sv_mortalcopy_flags #define sv_mortalcopy_flags(sv, flags) Perl_sv_mortalcopy_flags(aTHX_ (sv), (flags)) #endif #else #ifndef sv_mortalcopy_flags #define sv_mortalcopy_flags(sv, flags) sv_2mortal(newSVsv_flags((sv), (flags))) #endif #endif #ifndef SvMAGIC_set #define SvMAGIC_set(sv, val) \ STMT_START { assert(SvTYPE(sv) >= SVt_PVMG); \ (((XPVMG*) SvANY(sv))->xmg_magic = (val)); } STMT_END #endif #if (PERL_BCDVERSION < 0x5009003) #ifndef SvPVX_const #define SvPVX_const(sv) ((const char*) (0 + SvPVX(sv))) #endif #ifndef SvPVX_mutable #define SvPVX_mutable(sv) (0 + SvPVX(sv)) #endif #ifndef SvRV_set #define SvRV_set(sv, val) \ STMT_START { assert(SvTYPE(sv) >= SVt_RV); \ (((XRV*) SvANY(sv))->xrv_rv = (val)); } STMT_END #endif #else #ifndef SvPVX_const #define SvPVX_const(sv) ((const char*)((sv)->sv_u.svu_pv)) #endif #ifndef SvPVX_mutable #define SvPVX_mutable(sv) ((sv)->sv_u.svu_pv) #endif #ifndef SvRV_set #define SvRV_set(sv, val) \ STMT_START { assert(SvTYPE(sv) >= SVt_RV); \ ((sv)->sv_u.svu_rv = (val)); } STMT_END #endif #endif #ifndef SvSTASH_set #define SvSTASH_set(sv, val) \ STMT_START { assert(SvTYPE(sv) >= SVt_PVMG); \ (((XPVMG*) SvANY(sv))->xmg_stash = (val)); } STMT_END #endif #if (PERL_BCDVERSION < 0x5004000) #ifndef SvUV_set #define SvUV_set(sv, val) \ STMT_START { assert(SvTYPE(sv) == SVt_IV || SvTYPE(sv) >= SVt_PVIV); \ (((XPVIV*) SvANY(sv))->xiv_iv = (IV) (val)); } STMT_END #endif #else #ifndef SvUV_set #define SvUV_set(sv, val) \ STMT_START { assert(SvTYPE(sv) == SVt_IV || SvTYPE(sv) >= SVt_PVIV); \ (((XPVUV*) SvANY(sv))->xuv_uv = (val)); } STMT_END #endif #endif #ifndef newSVpvn_share #if defined(NEED_newSVpvn_share) static SV * DPPP_(my_newSVpvn_share)(pTHX_ const char * s, I32 len, U32 hash); static #else extern SV * DPPP_(my_newSVpvn_share)(pTHX_ const char * s, I32 len, U32 hash); #endif #if defined(NEED_newSVpvn_share) || defined(NEED_newSVpvn_share_GLOBAL) #ifdef newSVpvn_share #undef newSVpvn_share #endif #define newSVpvn_share(a,b,c) DPPP_(my_newSVpvn_share)(aTHX_ a,b,c) #define Perl_newSVpvn_share DPPP_(my_newSVpvn_share) SV * DPPP_(my_newSVpvn_share)(pTHX_ const char *s, I32 len, U32 hash) { SV *sv; if (len < 0) len = -len; if (!hash) PERL_HASH(hash, (char*) s, len); sv = newSVpvn((char *) s, len); sv_upgrade(sv, SVt_PVIV); SvIVX(sv) = hash; SvREADONLY_on(sv); SvPOK_on(sv); return sv; } #endif #endif #ifndef SvSHARED_HASH #define SvSHARED_HASH(sv) (0 + SvUVX(sv)) #endif #ifndef HvNAME_get #define HvNAME_get(hv) HvNAME(hv) #endif #ifndef HvNAMELEN_get #define HvNAMELEN_get(hv) (HvNAME_get(hv) ? (I32)strlen(HvNAME_get(hv)) : 0) #endif #if (PERL_BCDVERSION >= 0x5009002) && (PERL_BCDVERSION <= 0x5009003) #undef gv_fetchpvn_flags #endif #ifdef GV_NOADD_MASK #define D_PPP_GV_NOADD_MASK GV_NOADD_MASK #else #define D_PPP_GV_NOADD_MASK 0xE0 #endif #ifndef gv_fetchpvn_flags #define gv_fetchpvn_flags(name, len, flags, sv_type) gv_fetchpv(SvPVX(sv_2mortal(newSVpvn((name), (len)))), ((flags) & D_PPP_GV_NOADD_MASK) ? FALSE : TRUE, (I32)(sv_type)) #endif #ifndef GvSVn #define GvSVn(gv) GvSV(gv) #endif #ifndef isGV_with_GP #define isGV_with_GP(gv) isGV(gv) #endif #ifndef gv_fetchsv #define gv_fetchsv(name, flags, svt) gv_fetchpv(SvPV_nolen_const(name), flags, svt) #endif #ifndef get_cvn_flags #define get_cvn_flags(name, namelen, flags) get_cv(name, flags) #endif #ifndef gv_init_pvn #define gv_init_pvn(gv, stash, ptr, len, flags) gv_init(gv, stash, ptr, len, flags & GV_ADDMULTI ? TRUE : FALSE) #endif #ifndef STR_WITH_LEN #define STR_WITH_LEN(s) (s ""), (sizeof(s)-1) #endif #ifndef newSVpvs #define newSVpvs(str) newSVpvn(str "", sizeof(str) - 1) #endif #ifndef newSVpvs_flags #define newSVpvs_flags(str, flags) newSVpvn_flags(str "", sizeof(str) - 1, flags) #endif #ifndef newSVpvs_share #define newSVpvs_share(str) newSVpvn_share(str "", sizeof(str) - 1, 0) #endif #ifndef sv_catpvs #define sv_catpvs(sv, str) sv_catpvn(sv, str "", sizeof(str) - 1) #endif #ifndef sv_setpvs #define sv_setpvs(sv, str) sv_setpvn(sv, str "", sizeof(str) - 1) #endif #ifndef hv_fetchs #define hv_fetchs(hv, key, lval) hv_fetch(hv, key "", sizeof(key) - 1, lval) #endif #ifndef hv_stores #define hv_stores(hv, key, val) hv_store(hv, key "", sizeof(key) - 1, val, 0) #endif #ifndef gv_fetchpvs #define gv_fetchpvs(name, flags, svt) gv_fetchpvn_flags(name "", sizeof(name) - 1, flags, svt) #endif #ifndef gv_stashpvs #define gv_stashpvs(name, flags) gv_stashpvn(name "", sizeof(name) - 1, flags) #endif #ifndef get_cvs #define get_cvs(name, flags) get_cvn_flags(name "", sizeof(name)-1, flags) #endif #undef SvGETMAGIC #ifndef SvGETMAGIC #define SvGETMAGIC(x) ((void)(UNLIKELY(SvGMAGICAL(x)) && mg_get(x))) #endif #ifndef sv_catpvn_nomg #define sv_catpvn_nomg sv_catpvn #endif #ifndef sv_catsv_nomg #define sv_catsv_nomg sv_catsv #endif #ifndef sv_setsv_nomg #define sv_setsv_nomg sv_setsv #endif #ifndef sv_pvn_nomg #define sv_pvn_nomg sv_pvn #endif #ifdef SVf_IVisUV #if defined(PERL_USE_GCC_BRACE_GROUPS) #ifndef SvIV_nomg #define SvIV_nomg(sv) (!SvGMAGICAL((sv)) ? SvIV((sv)) : ({ SV *_sviv = sv_mortalcopy_flags((sv), SV_NOSTEAL); IV _iv = SvIV(_sviv); SvFLAGS((sv)) = (SvFLAGS((sv)) & ~SVf_IVisUV) | (SvFLAGS(_sviv) & SVf_IVisUV); _iv; })) #endif #ifndef SvUV_nomg #define SvUV_nomg(sv) (!SvGMAGICAL((sv)) ? SvUV((sv)) : ({ SV *_svuv = sv_mortalcopy_flags((sv), SV_NOSTEAL); UV _uv = SvUV(_svuv); SvFLAGS((sv)) = (SvFLAGS((sv)) & ~SVf_IVisUV) | (SvFLAGS(_svuv) & SVf_IVisUV); _uv; })) #endif #else #ifndef SvIV_nomg #define SvIV_nomg(sv) (!SvGMAGICAL((sv)) ? SvIV((sv)) : ((PL_Sv = sv_mortalcopy_flags((sv), SV_NOSTEAL)), sv_upgrade(PL_Sv, SVt_PVIV), (SvIVX(PL_Sv) = SvIV(PL_Sv)), (SvFLAGS((sv)) = (SvFLAGS((sv)) & ~SVf_IVisUV) | (SvFLAGS(PL_Sv) & SVf_IVisUV)), SvIVX(PL_Sv))) #endif #ifndef SvUV_nomg #define SvUV_nomg(sv) (!SvGMAGICAL((sv)) ? SvIV((sv)) : ((PL_Sv = sv_mortalcopy_flags((sv), SV_NOSTEAL)), sv_upgrade(PL_Sv, SVt_PVIV), (SvUVX(PL_Sv) = SvUV(PL_Sv)), (SvFLAGS((sv)) = (SvFLAGS((sv)) & ~SVf_IVisUV) | (SvFLAGS(PL_Sv) & SVf_IVisUV)), SvUVX(PL_Sv))) #endif #endif #else #ifndef SvIV_nomg #define SvIV_nomg(sv) (!SvGMAGICAL((sv)) ? SvIV((sv)) : SvIVx(sv_mortalcopy_flags((sv), SV_NOSTEAL))) #endif #ifndef SvUV_nomg #define SvUV_nomg(sv) (!SvGMAGICAL((sv)) ? SvUV((sv)) : SvUVx(sv_mortalcopy_flags((sv), SV_NOSTEAL))) #endif #endif #ifndef SvNV_nomg #define SvNV_nomg(sv) (!SvGMAGICAL((sv)) ? SvNV((sv)) : SvNVx(sv_mortalcopy_flags((sv), SV_NOSTEAL))) #endif #ifndef SvTRUE_nomg #define SvTRUE_nomg(sv) (!SvGMAGICAL((sv)) ? SvTRUE((sv)) : SvTRUEx(sv_mortalcopy_flags((sv), SV_NOSTEAL))) #endif #ifndef sv_catpv_mg #define sv_catpv_mg(sv, ptr) \ STMT_START { \ SV *TeMpSv = sv; \ sv_catpv(TeMpSv,ptr); \ SvSETMAGIC(TeMpSv); \ } STMT_END #endif #ifndef sv_catpvn_mg #define sv_catpvn_mg(sv, ptr, len) \ STMT_START { \ SV *TeMpSv = sv; \ sv_catpvn(TeMpSv,ptr,len); \ SvSETMAGIC(TeMpSv); \ } STMT_END #endif #ifndef sv_catsv_mg #define sv_catsv_mg(dsv, ssv) \ STMT_START { \ SV *TeMpSv = dsv; \ sv_catsv(TeMpSv,ssv); \ SvSETMAGIC(TeMpSv); \ } STMT_END #endif #ifndef sv_setiv_mg #define sv_setiv_mg(sv, i) \ STMT_START { \ SV *TeMpSv = sv; \ sv_setiv(TeMpSv,i); \ SvSETMAGIC(TeMpSv); \ } STMT_END #endif #ifndef sv_setnv_mg #define sv_setnv_mg(sv, num) \ STMT_START { \ SV *TeMpSv = sv; \ sv_setnv(TeMpSv,num); \ SvSETMAGIC(TeMpSv); \ } STMT_END #endif #ifndef sv_setpv_mg #define sv_setpv_mg(sv, ptr) \ STMT_START { \ SV *TeMpSv = sv; \ sv_setpv(TeMpSv,ptr); \ SvSETMAGIC(TeMpSv); \ } STMT_END #endif #ifndef sv_setpvn_mg #define sv_setpvn_mg(sv, ptr, len) \ STMT_START { \ SV *TeMpSv = sv; \ sv_setpvn(TeMpSv,ptr,len); \ SvSETMAGIC(TeMpSv); \ } STMT_END #endif #ifndef sv_setsv_mg #define sv_setsv_mg(dsv, ssv) \ STMT_START { \ SV *TeMpSv = dsv; \ sv_setsv(TeMpSv,ssv); \ SvSETMAGIC(TeMpSv); \ } STMT_END #endif #ifndef sv_setuv_mg #define sv_setuv_mg(sv, i) \ STMT_START { \ SV *TeMpSv = sv; \ sv_setuv(TeMpSv,i); \ SvSETMAGIC(TeMpSv); \ } STMT_END #endif #ifndef sv_usepvn_mg #define sv_usepvn_mg(sv, ptr, len) \ STMT_START { \ SV *TeMpSv = sv; \ sv_usepvn(TeMpSv,ptr,len); \ SvSETMAGIC(TeMpSv); \ } STMT_END #endif #ifndef SvVSTRING_mg #define SvVSTRING_mg(sv) (SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_vstring) : NULL) #endif #if (PERL_BCDVERSION < 0x5004000) #elif (PERL_BCDVERSION < 0x5008000) #define sv_magic_portable(sv, obj, how, name, namlen) \ STMT_START { \ SV *SvMp_sv = (sv); \ char *SvMp_name = (char *) (name); \ I32 SvMp_namlen = (namlen); \ if (SvMp_name && SvMp_namlen == 0) \ { \ MAGIC *mg; \ sv_magic(SvMp_sv, obj, how, 0, 0); \ mg = SvMAGIC(SvMp_sv); \ mg->mg_len = -42; \ mg->mg_ptr = SvMp_name; \ } \ else \ { \ sv_magic(SvMp_sv, obj, how, SvMp_name, SvMp_namlen); \ } \ } STMT_END #else #define sv_magic_portable(a, b, c, d, e) sv_magic(a, b, c, d, e) #endif #if !defined(mg_findext) #if defined(NEED_mg_findext) static MAGIC * DPPP_(my_mg_findext)(const SV * sv, int type, const MGVTBL * vtbl); static #else extern MAGIC * DPPP_(my_mg_findext)(const SV * sv, int type, const MGVTBL * vtbl); #endif #if defined(NEED_mg_findext) || defined(NEED_mg_findext_GLOBAL) #define mg_findext DPPP_(my_mg_findext) #define Perl_mg_findext DPPP_(my_mg_findext) MAGIC * DPPP_(my_mg_findext)(const SV * sv, int type, const MGVTBL *vtbl) { if (sv) { MAGIC *mg; #ifdef AvPAD_NAMELIST assert(!(SvTYPE(sv) == SVt_PVAV && AvPAD_NAMELIST(sv))); #endif for (mg = SvMAGIC (sv); mg; mg = mg->mg_moremagic) { if (mg->mg_type == type && mg->mg_virtual == vtbl) return mg; } } return NULL; } #endif #endif #if !defined(sv_unmagicext) #if defined(NEED_sv_unmagicext) static int DPPP_(my_sv_unmagicext)(pTHX_ SV * const sv, const int type, MGVTBL * vtbl); static #else extern int DPPP_(my_sv_unmagicext)(pTHX_ SV * const sv, const int type, MGVTBL * vtbl); #endif #if defined(NEED_sv_unmagicext) || defined(NEED_sv_unmagicext_GLOBAL) #ifdef sv_unmagicext #undef sv_unmagicext #endif #define sv_unmagicext(a,b,c) DPPP_(my_sv_unmagicext)(aTHX_ a,b,c) #define Perl_sv_unmagicext DPPP_(my_sv_unmagicext) int DPPP_(my_sv_unmagicext)(pTHX_ SV *const sv, const int type, MGVTBL *vtbl) { MAGIC* mg; MAGIC** mgp; if (SvTYPE(sv) < SVt_PVMG || !SvMAGIC(sv)) return 0; mgp = &(SvMAGIC(sv)); for (mg = *mgp; mg; mg = *mgp) { const MGVTBL* const virt = mg->mg_virtual; if (mg->mg_type == type && virt == vtbl) { *mgp = mg->mg_moremagic; if (virt && virt->svt_free) virt->svt_free(aTHX_ sv, mg); if (mg->mg_ptr && mg->mg_type != PERL_MAGIC_regex_global) { if (mg->mg_len > 0) Safefree(mg->mg_ptr); else if (mg->mg_len == HEf_SVKEY) SvREFCNT_dec(MUTABLE_SV(mg->mg_ptr)); else if (mg->mg_type == PERL_MAGIC_utf8) Safefree(mg->mg_ptr); } if (mg->mg_flags & MGf_REFCOUNTED) SvREFCNT_dec(mg->mg_obj); Safefree(mg); } else mgp = &mg->mg_moremagic; } if (SvMAGIC(sv)) { if (SvMAGICAL(sv)) mg_magical(sv); } else { SvMAGICAL_off(sv); SvFLAGS(sv) |= (SvFLAGS(sv) & (SVp_IOK|SVp_NOK|SVp_POK)) >> PRIVSHIFT; } return 0; } #endif #endif #ifdef USE_ITHREADS #ifndef CopFILE #define CopFILE(c) ((c)->cop_file) #endif #ifndef CopFILEGV #define CopFILEGV(c) (CopFILE(c) ? gv_fetchfile(CopFILE(c)) : Nullgv) #endif #ifndef CopFILE_set #define CopFILE_set(c,pv) ((c)->cop_file = savepv(pv)) #endif #ifndef CopFILESV #define CopFILESV(c) (CopFILE(c) ? GvSV(gv_fetchfile(CopFILE(c))) : Nullsv) #endif #ifndef CopFILEAV #define CopFILEAV(c) (CopFILE(c) ? GvAV(gv_fetchfile(CopFILE(c))) : Nullav) #endif #ifndef CopSTASHPV #define CopSTASHPV(c) ((c)->cop_stashpv) #endif #ifndef CopSTASHPV_set #define CopSTASHPV_set(c,pv) ((c)->cop_stashpv = ((pv) ? savepv(pv) : Nullch)) #endif #ifndef CopSTASH #define CopSTASH(c) (CopSTASHPV(c) ? gv_stashpv(CopSTASHPV(c),GV_ADD) : Nullhv) #endif #ifndef CopSTASH_set #define CopSTASH_set(c,hv) CopSTASHPV_set(c, (hv) ? HvNAME(hv) : Nullch) #endif #ifndef CopSTASH_eq #define CopSTASH_eq(c,hv) ((hv) && (CopSTASHPV(c) == HvNAME(hv) \ || (CopSTASHPV(c) && HvNAME(hv) \ && strEQ(CopSTASHPV(c), HvNAME(hv))))) #endif #else #ifndef CopFILEGV #define CopFILEGV(c) ((c)->cop_filegv) #endif #ifndef CopFILEGV_set #define CopFILEGV_set(c,gv) ((c)->cop_filegv = (GV*)SvREFCNT_inc(gv)) #endif #ifndef CopFILE_set #define CopFILE_set(c,pv) CopFILEGV_set((c), gv_fetchfile(pv)) #endif #ifndef CopFILESV #define CopFILESV(c) (CopFILEGV(c) ? GvSV(CopFILEGV(c)) : Nullsv) #endif #ifndef CopFILEAV #define CopFILEAV(c) (CopFILEGV(c) ? GvAV(CopFILEGV(c)) : Nullav) #endif #ifndef CopFILE #define CopFILE(c) (CopFILESV(c) ? SvPVX(CopFILESV(c)) : Nullch) #endif #ifndef CopSTASH #define CopSTASH(c) ((c)->cop_stash) #endif #ifndef CopSTASH_set #define CopSTASH_set(c,hv) ((c)->cop_stash = (hv)) #endif #ifndef CopSTASHPV #define CopSTASHPV(c) (CopSTASH(c) ? HvNAME(CopSTASH(c)) : Nullch) #endif #ifndef CopSTASHPV_set #define CopSTASHPV_set(c,pv) CopSTASH_set((c), gv_stashpv(pv,GV_ADD)) #endif #ifndef CopSTASH_eq #define CopSTASH_eq(c,hv) (CopSTASH(c) == (hv)) #endif #endif #if (PERL_BCDVERSION >= 0x5006000) #ifndef caller_cx #if defined(NEED_caller_cx) || defined(NEED_caller_cx_GLOBAL) static I32 DPPP_dopoptosub_at(const PERL_CONTEXT *cxstk, I32 startingblock) { I32 i; for (i = startingblock; i >= 0; i--) { const PERL_CONTEXT * const cx = &cxstk[i]; switch (CxTYPE(cx)) { default: continue; case CXt_EVAL: case CXt_SUB: case CXt_FORMAT: return i; } } return i; } #endif #if defined(NEED_caller_cx) static const PERL_CONTEXT * DPPP_(my_caller_cx)(pTHX_ I32 level, const PERL_CONTEXT * * dbcxp); static #else extern const PERL_CONTEXT * DPPP_(my_caller_cx)(pTHX_ I32 level, const PERL_CONTEXT * * dbcxp); #endif #if defined(NEED_caller_cx) || defined(NEED_caller_cx_GLOBAL) #ifdef caller_cx #undef caller_cx #endif #define caller_cx(a,b) DPPP_(my_caller_cx)(aTHX_ a,b) #define Perl_caller_cx DPPP_(my_caller_cx) const PERL_CONTEXT * DPPP_(my_caller_cx)(pTHX_ I32 level, const PERL_CONTEXT **dbcxp) { I32 cxix = DPPP_dopoptosub_at(cxstack, cxstack_ix); const PERL_CONTEXT *cx; const PERL_CONTEXT *ccstack = cxstack; const PERL_SI *top_si = PL_curstackinfo; for (;;) { while (cxix < 0 && top_si->si_type != PERLSI_MAIN) { top_si = top_si->si_prev; ccstack = top_si->si_cxstack; cxix = DPPP_dopoptosub_at(ccstack, top_si->si_cxix); } if (cxix < 0) return NULL; if (PL_DBsub && GvCV(PL_DBsub) && cxix >= 0 && ccstack[cxix].blk_sub.cv == GvCV(PL_DBsub)) level++; if (!level--) break; cxix = DPPP_dopoptosub_at(ccstack, cxix - 1); } cx = &ccstack[cxix]; if (dbcxp) *dbcxp = cx; if (CxTYPE(cx) == CXt_SUB || CxTYPE(cx) == CXt_FORMAT) { const I32 dbcxix = DPPP_dopoptosub_at(ccstack, cxix - 1); if (PL_DBsub && GvCV(PL_DBsub) && dbcxix >= 0 && ccstack[dbcxix].blk_sub.cv == GvCV(PL_DBsub)) cx = &ccstack[dbcxix]; } return cx; } #endif #endif #endif #ifndef IN_PERL_COMPILETIME #define IN_PERL_COMPILETIME (PL_curcop == &PL_compiling) #endif #ifndef IN_LOCALE_RUNTIME #define IN_LOCALE_RUNTIME (PL_curcop->op_private & HINT_LOCALE) #endif #ifndef IN_LOCALE_COMPILETIME #define IN_LOCALE_COMPILETIME (PL_hints & HINT_LOCALE) #endif #ifndef IN_LOCALE #define IN_LOCALE (IN_PERL_COMPILETIME ? IN_LOCALE_COMPILETIME : IN_LOCALE_RUNTIME) #endif #ifndef IS_NUMBER_IN_UV #define IS_NUMBER_IN_UV 0x01 #endif #ifndef IS_NUMBER_GREATER_THAN_UV_MAX #define IS_NUMBER_GREATER_THAN_UV_MAX 0x02 #endif #ifndef IS_NUMBER_NOT_INT #define IS_NUMBER_NOT_INT 0x04 #endif #ifndef IS_NUMBER_NEG #define IS_NUMBER_NEG 0x08 #endif #ifndef IS_NUMBER_INFINITY #define IS_NUMBER_INFINITY 0x10 #endif #ifndef IS_NUMBER_NAN #define IS_NUMBER_NAN 0x20 #endif #ifndef GROK_NUMERIC_RADIX #define GROK_NUMERIC_RADIX(sp, send) grok_numeric_radix(sp, send) #endif #ifndef PERL_SCAN_GREATER_THAN_UV_MAX #define PERL_SCAN_GREATER_THAN_UV_MAX 0x02 #endif #ifndef PERL_SCAN_SILENT_ILLDIGIT #define PERL_SCAN_SILENT_ILLDIGIT 0x04 #endif #ifndef PERL_SCAN_ALLOW_UNDERSCORES #define PERL_SCAN_ALLOW_UNDERSCORES 0x01 #endif #ifndef PERL_SCAN_DISALLOW_PREFIX #define PERL_SCAN_DISALLOW_PREFIX 0x02 #endif #ifndef grok_numeric_radix #if defined(NEED_grok_numeric_radix) static bool DPPP_(my_grok_numeric_radix)(pTHX_ const char * * sp, const char * send); static #else extern bool DPPP_(my_grok_numeric_radix)(pTHX_ const char * * sp, const char * send); #endif #if defined(NEED_grok_numeric_radix) || defined(NEED_grok_numeric_radix_GLOBAL) #ifdef grok_numeric_radix #undef grok_numeric_radix #endif #define grok_numeric_radix(a,b) DPPP_(my_grok_numeric_radix)(aTHX_ a,b) #define Perl_grok_numeric_radix DPPP_(my_grok_numeric_radix) bool DPPP_(my_grok_numeric_radix)(pTHX_ const char **sp, const char *send) { #ifdef USE_LOCALE_NUMERIC #ifdef PL_numeric_radix_sv if (PL_numeric_radix_sv && IN_LOCALE) { STRLEN len; char* radix = SvPV(PL_numeric_radix_sv, len); if (*sp + len <= send && memEQ(*sp, radix, len)) { *sp += len; return TRUE; } } #else #include <locale.h> dTHR; struct lconv *lc = localeconv(); char *radix = lc->decimal_point; if (radix && IN_LOCALE) { STRLEN len = strlen(radix); if (*sp + len <= send && memEQ(*sp, radix, len)) { *sp += len; return TRUE; } } #endif #endif if (*sp < send && **sp == '.') { ++*sp; return TRUE; } return FALSE; } #endif #endif #ifndef grok_number #if defined(NEED_grok_number) static int DPPP_(my_grok_number)(pTHX_ const char * pv, STRLEN len, UV * valuep); static #else extern int DPPP_(my_grok_number)(pTHX_ const char * pv, STRLEN len, UV * valuep); #endif #if defined(NEED_grok_number) || defined(NEED_grok_number_GLOBAL) #ifdef grok_number #undef grok_number #endif #define grok_number(a,b,c) DPPP_(my_grok_number)(aTHX_ a,b,c) #define Perl_grok_number DPPP_(my_grok_number) int DPPP_(my_grok_number)(pTHX_ const char *pv, STRLEN len, UV *valuep) { const char *s = pv; const char *send = pv + len; const UV max_div_10 = UV_MAX / 10; const char max_mod_10 = UV_MAX % 10; int numtype = 0; int sawinf = 0; int sawnan = 0; while (s < send && isSPACE(*s)) s++; if (s == send) { return 0; } else if (*s == '-') { s++; numtype = IS_NUMBER_NEG; } else if (*s == '+') s++; if (s == send) return 0; if (isDIGIT(*s)) { UV value = *s - '0'; if (++s < send) { int digit = *s - '0'; if (digit >= 0 && digit <= 9) { value = value * 10 + digit; if (++s < send) { digit = *s - '0'; if (digit >= 0 && digit <= 9) { value = value * 10 + digit; if (++s < send) { digit = *s - '0'; if (digit >= 0 && digit <= 9) { value = value * 10 + digit; if (++s < send) { digit = *s - '0'; if (digit >= 0 && digit <= 9) { value = value * 10 + digit; if (++s < send) { digit = *s - '0'; if (digit >= 0 && digit <= 9) { value = value * 10 + digit; if (++s < send) { digit = *s - '0'; if (digit >= 0 && digit <= 9) { value = value * 10 + digit; if (++s < send) { digit = *s - '0'; if (digit >= 0 && digit <= 9) { value = value * 10 + digit; if (++s < send) { digit = *s - '0'; if (digit >= 0 && digit <= 9) { value = value * 10 + digit; if (++s < send) { digit = *s - '0'; while (digit >= 0 && digit <= 9 && (value < max_div_10 || (value == max_div_10 && digit <= max_mod_10))) { value = value * 10 + digit; if (++s < send) digit = *s - '0'; else break; } if (digit >= 0 && digit <= 9 && (s < send)) { do { s++; } while (s < send && isDIGIT(*s)); numtype |= IS_NUMBER_GREATER_THAN_UV_MAX; goto skip_value; } } } } } } } } } } } } } } } } } } numtype |= IS_NUMBER_IN_UV; if (valuep) *valuep = value; skip_value: if (GROK_NUMERIC_RADIX(&s, send)) { numtype |= IS_NUMBER_NOT_INT; while (s < send && isDIGIT(*s)) s++; } } else if (GROK_NUMERIC_RADIX(&s, send)) { numtype |= IS_NUMBER_NOT_INT | IS_NUMBER_IN_UV; if (s < send && isDIGIT(*s)) { do { s++; } while (s < send && isDIGIT(*s)); if (valuep) { *valuep = 0; } } else return 0; } else if (*s == 'I' || *s == 'i') { s++; if (s == send || (*s != 'N' && *s != 'n')) return 0; s++; if (s == send || (*s != 'F' && *s != 'f')) return 0; s++; if (s < send && (*s == 'I' || *s == 'i')) { s++; if (s == send || (*s != 'N' && *s != 'n')) return 0; s++; if (s == send || (*s != 'I' && *s != 'i')) return 0; s++; if (s == send || (*s != 'T' && *s != 't')) return 0; s++; if (s == send || (*s != 'Y' && *s != 'y')) return 0; s++; } sawinf = 1; } else if (*s == 'N' || *s == 'n') { s++; if (s == send || (*s != 'A' && *s != 'a')) return 0; s++; if (s == send || (*s != 'N' && *s != 'n')) return 0; s++; sawnan = 1; } else return 0; if (sawinf) { numtype &= IS_NUMBER_NEG; numtype |= IS_NUMBER_INFINITY | IS_NUMBER_NOT_INT; } else if (sawnan) { numtype &= IS_NUMBER_NEG; numtype |= IS_NUMBER_NAN | IS_NUMBER_NOT_INT; } else if (s < send) { if (*s == 'e' || *s == 'E') { numtype &= IS_NUMBER_NEG; numtype |= IS_NUMBER_NOT_INT; s++; if (s < send && (*s == '-' || *s == '+')) s++; if (s < send && isDIGIT(*s)) { do { s++; } while (s < send && isDIGIT(*s)); } else return 0; } } while (s < send && isSPACE(*s)) s++; if (s >= send) return numtype; if (len == 10 && memEQ(pv, "0 but true", 10)) { if (valuep) *valuep = 0; return IS_NUMBER_IN_UV; } return 0; } #endif #endif #ifndef grok_bin #if defined(NEED_grok_bin) static UV DPPP_(my_grok_bin)(pTHX_ const char * start, STRLEN * len_p, I32 * flags, NV * result); static #else extern UV DPPP_(my_grok_bin)(pTHX_ const char * start, STRLEN * len_p, I32 * flags, NV * result); #endif #if defined(NEED_grok_bin) || defined(NEED_grok_bin_GLOBAL) #ifdef grok_bin #undef grok_bin #endif #define grok_bin(a,b,c,d) DPPP_(my_grok_bin)(aTHX_ a,b,c,d) #define Perl_grok_bin DPPP_(my_grok_bin) UV DPPP_(my_grok_bin)(pTHX_ const char *start, STRLEN *len_p, I32 *flags, NV *result) { const char *s = start; STRLEN len = *len_p; UV value = 0; NV value_nv = 0; const UV max_div_2 = UV_MAX / 2; bool allow_underscores = *flags & PERL_SCAN_ALLOW_UNDERSCORES; bool overflowed = FALSE; if (!(*flags & PERL_SCAN_DISALLOW_PREFIX)) { if (len >= 1) { if (s[0] == 'b') { s++; len--; } else if (len >= 2 && s[0] == '0' && s[1] == 'b') { s+=2; len-=2; } } } for (; len-- && *s; s++) { char bit = *s; if (bit == '0' || bit == '1') { redo: if (!overflowed) { if (value <= max_div_2) { value = (value << 1) | (bit - '0'); continue; } warn("Integer overflow in binary number"); overflowed = TRUE; value_nv = (NV) value; } value_nv *= 2.0; value_nv += (NV)(bit - '0'); continue; } if (bit == '_' && len && allow_underscores && (bit = s[1]) && (bit == '0' || bit == '1')) { --len; ++s; goto redo; } if (!(*flags & PERL_SCAN_SILENT_ILLDIGIT)) warn("Illegal binary digit '%c' ignored", *s); break; } if ( ( overflowed && value_nv > 4294967295.0) #if UVSIZE > 4 || (!overflowed && value > 0xffffffff ) #endif ) { warn("Binary number > 0b11111111111111111111111111111111 non-portable"); } *len_p = s - start; if (!overflowed) { *flags = 0; return value; } *flags = PERL_SCAN_GREATER_THAN_UV_MAX; if (result) *result = value_nv; return UV_MAX; } #endif #endif #ifndef grok_hex #if defined(NEED_grok_hex) static UV DPPP_(my_grok_hex)(pTHX_ const char * start, STRLEN * len_p, I32 * flags, NV * result); static #else extern UV DPPP_(my_grok_hex)(pTHX_ const char * start, STRLEN * len_p, I32 * flags, NV * result); #endif #if defined(NEED_grok_hex) || defined(NEED_grok_hex_GLOBAL) #ifdef grok_hex #undef grok_hex #endif #define grok_hex(a,b,c,d) DPPP_(my_grok_hex)(aTHX_ a,b,c,d) #define Perl_grok_hex DPPP_(my_grok_hex) UV DPPP_(my_grok_hex)(pTHX_ const char *start, STRLEN *len_p, I32 *flags, NV *result) { const char *s = start; STRLEN len = *len_p; UV value = 0; NV value_nv = 0; const UV max_div_16 = UV_MAX / 16; bool allow_underscores = *flags & PERL_SCAN_ALLOW_UNDERSCORES; bool overflowed = FALSE; const char *xdigit; if (!(*flags & PERL_SCAN_DISALLOW_PREFIX)) { if (len >= 1) { if (s[0] == 'x') { s++; len--; } else if (len >= 2 && s[0] == '0' && s[1] == 'x') { s+=2; len-=2; } } } for (; len-- && *s; s++) { xdigit = strchr((char *) PL_hexdigit, *s); if (xdigit) { redo: if (!overflowed) { if (value <= max_div_16) { value = (value << 4) | ((xdigit - PL_hexdigit) & 15); continue; } warn("Integer overflow in hexadecimal number"); overflowed = TRUE; value_nv = (NV) value; } value_nv *= 16.0; value_nv += (NV)((xdigit - PL_hexdigit) & 15); continue; } if (*s == '_' && len && allow_underscores && s[1] && (xdigit = strchr((char *) PL_hexdigit, s[1]))) { --len; ++s; goto redo; } if (!(*flags & PERL_SCAN_SILENT_ILLDIGIT)) warn("Illegal hexadecimal digit '%c' ignored", *s); break; } if ( ( overflowed && value_nv > 4294967295.0) #if UVSIZE > 4 || (!overflowed && value > 0xffffffff ) #endif ) { warn("Hexadecimal number > 0xffffffff non-portable"); } *len_p = s - start; if (!overflowed) { *flags = 0; return value; } *flags = PERL_SCAN_GREATER_THAN_UV_MAX; if (result) *result = value_nv; return UV_MAX; } #endif #endif #ifndef grok_oct #if defined(NEED_grok_oct) static UV DPPP_(my_grok_oct)(pTHX_ const char * start, STRLEN * len_p, I32 * flags, NV * result); static #else extern UV DPPP_(my_grok_oct)(pTHX_ const char * start, STRLEN * len_p, I32 * flags, NV * result); #endif #if defined(NEED_grok_oct) || defined(NEED_grok_oct_GLOBAL) #ifdef grok_oct #undef grok_oct #endif #define grok_oct(a,b,c,d) DPPP_(my_grok_oct)(aTHX_ a,b,c,d) #define Perl_grok_oct DPPP_(my_grok_oct) UV DPPP_(my_grok_oct)(pTHX_ const char *start, STRLEN *len_p, I32 *flags, NV *result) { const char *s = start; STRLEN len = *len_p; UV value = 0; NV value_nv = 0; const UV max_div_8 = UV_MAX / 8; bool allow_underscores = *flags & PERL_SCAN_ALLOW_UNDERSCORES; bool overflowed = FALSE; for (; len-- && *s; s++) { int digit = *s - '0'; if (digit >= 0 && digit <= 7) { redo: if (!overflowed) { if (value <= max_div_8) { value = (value << 3) | digit; continue; } warn("Integer overflow in octal number"); overflowed = TRUE; value_nv = (NV) value; } value_nv *= 8.0; value_nv += (NV)digit; continue; } if (digit == ('_' - '0') && len && allow_underscores && (digit = s[1] - '0') && (digit >= 0 && digit <= 7)) { --len; ++s; goto redo; } if (digit == 8 || digit == 9) { if (!(*flags & PERL_SCAN_SILENT_ILLDIGIT)) warn("Illegal octal digit '%c' ignored", *s); } break; } if ( ( overflowed && value_nv > 4294967295.0) #if UVSIZE > 4 || (!overflowed && value > 0xffffffff ) #endif ) { warn("Octal number > 037777777777 non-portable"); } *len_p = s - start; if (!overflowed) { *flags = 0; return value; } *flags = PERL_SCAN_GREATER_THAN_UV_MAX; if (result) *result = value_nv; return UV_MAX; } #endif #endif #if !defined(my_snprintf) #if defined(NEED_my_snprintf) static int DPPP_(my_my_snprintf)(char * buffer, const Size_t len, const char * format, ...); static #else extern int DPPP_(my_my_snprintf)(char * buffer, const Size_t len, const char * format, ...); #endif #if defined(NEED_my_snprintf) || defined(NEED_my_snprintf_GLOBAL) #define my_snprintf DPPP_(my_my_snprintf) #define Perl_my_snprintf DPPP_(my_my_snprintf) int DPPP_(my_my_snprintf)(char *buffer, const Size_t len, const char *format, ...) { dTHX; int retval; va_list ap; va_start(ap, format); #ifdef HAS_VSNPRINTF retval = vsnprintf(buffer, len, format, ap); #else retval = vsprintf(buffer, format, ap); #endif va_end(ap); if (retval < 0 || (len > 0 && (Size_t)retval >= len)) Perl_croak(aTHX_ "panic: my_snprintf buffer overflow"); return retval; } #endif #endif #if !defined(my_sprintf) #if defined(NEED_my_sprintf) static int DPPP_(my_my_sprintf)(char * buffer, const char * pat, ...); static #else extern int DPPP_(my_my_sprintf)(char * buffer, const char * pat, ...); #endif #if defined(NEED_my_sprintf) || defined(NEED_my_sprintf_GLOBAL) #define my_sprintf DPPP_(my_my_sprintf) int DPPP_(my_my_sprintf)(char *buffer, const char* pat, ...) { va_list args; va_start(args, pat); vsprintf(buffer, pat, args); va_end(args); return strlen(buffer); } #endif #endif #ifdef NO_XSLOCKS #ifdef dJMPENV #define dXCPT dJMPENV; int rEtV = 0 #define XCPT_TRY_START JMPENV_PUSH(rEtV); if (rEtV == 0) #define XCPT_TRY_END JMPENV_POP; #define XCPT_CATCH if (rEtV != 0) #define XCPT_RETHROW JMPENV_JUMP(rEtV) #else #define dXCPT Sigjmp_buf oldTOP; int rEtV = 0 #define XCPT_TRY_START Copy(top_env, oldTOP, 1, Sigjmp_buf); rEtV = Sigsetjmp(top_env, 1); if (rEtV == 0) #define XCPT_TRY_END Copy(oldTOP, top_env, 1, Sigjmp_buf); #define XCPT_CATCH if (rEtV != 0) #define XCPT_RETHROW Siglongjmp(top_env, rEtV) #endif #endif #if !defined(my_strlcat) #if defined(NEED_my_strlcat) static Size_t DPPP_(my_my_strlcat)(char * dst, const char * src, Size_t size); static #else extern Size_t DPPP_(my_my_strlcat)(char * dst, const char * src, Size_t size); #endif #if defined(NEED_my_strlcat) || defined(NEED_my_strlcat_GLOBAL) #define my_strlcat DPPP_(my_my_strlcat) #define Perl_my_strlcat DPPP_(my_my_strlcat) Size_t DPPP_(my_my_strlcat)(char *dst, const char *src, Size_t size) { Size_t used, length, copy; used = strlen(dst); length = strlen(src); if (size > 0 && used < size - 1) { copy = (length >= size - used) ? size - used - 1 : length; memcpy(dst + used, src, copy); dst[used + copy] = '\0'; } return used + length; } #endif #endif #if !defined(my_strlcpy) #if defined(NEED_my_strlcpy) static Size_t DPPP_(my_my_strlcpy)(char * dst, const char * src, Size_t size); static #else extern Size_t DPPP_(my_my_strlcpy)(char * dst, const char * src, Size_t size); #endif #if defined(NEED_my_strlcpy) || defined(NEED_my_strlcpy_GLOBAL) #define my_strlcpy DPPP_(my_my_strlcpy) #define Perl_my_strlcpy DPPP_(my_my_strlcpy) Size_t DPPP_(my_my_strlcpy)(char *dst, const char *src, Size_t size) { Size_t length, copy; length = strlen(src); if (size > 0) { copy = (length >= size) ? size - 1 : length; memcpy(dst, src, copy); dst[copy] = '\0'; } return length; } #endif #endif #ifdef SVf_UTF8 #ifndef SvUTF8 #define SvUTF8(sv) (SvFLAGS(sv) & SVf_UTF8) #endif #endif #if (PERL_BCDVERSION == 0x5019001) #undef UTF8f #endif #ifdef SVf_UTF8 #ifndef UTF8f #define UTF8f SVf #endif #ifndef UTF8fARG #define UTF8fARG(u,l,p) newSVpvn_flags((p), (l), ((u) ? SVf_UTF8 : 0) | SVs_TEMP) #endif #endif #define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b)) #ifndef UNICODE_REPLACEMENT #define UNICODE_REPLACEMENT 0xFFFD #endif #ifdef UTF8_MAXLEN #ifndef UTF8_MAXBYTES #define UTF8_MAXBYTES UTF8_MAXLEN #endif #endif #ifndef UTF_START_MARK #define UTF_START_MARK(len) \ (((len) > 7) ? 0xFF : (0xFF & (0xFE << (7-(len))))) #endif #if (PERL_BCDVERSION < 0x5018000) #undef UTF8_MAXBYTES_CASE #endif #if 'A' == 65 #define D_PPP_BYTE_INFO_BITS 6 #ifndef UTF8_MAXBYTES_CASE #define UTF8_MAXBYTES_CASE 13 #endif #else #define D_PPP_BYTE_INFO_BITS 5 #ifndef UTF8_MAXBYTES_CASE #define UTF8_MAXBYTES_CASE 15 #endif #endif #ifndef UTF_ACCUMULATION_SHIFT #define UTF_ACCUMULATION_SHIFT D_PPP_BYTE_INFO_BITS #endif #ifdef NATIVE_TO_UTF #ifndef NATIVE_UTF8_TO_I8 #define NATIVE_UTF8_TO_I8(c) NATIVE_TO_UTF(c) #endif #else #ifndef NATIVE_UTF8_TO_I8 #define NATIVE_UTF8_TO_I8(c) (c) #endif #endif #ifdef UTF_TO_NATIVE #ifndef I8_TO_NATIVE_UTF8 #define I8_TO_NATIVE_UTF8(c) UTF_TO_NATIVE(c) #endif #else #ifndef I8_TO_NATIVE_UTF8 #define I8_TO_NATIVE_UTF8(c) (c) #endif #endif #ifndef UTF_START_MASK #define UTF_START_MASK(len) \ (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2))) #endif #ifndef UTF_IS_CONTINUATION_MASK #define UTF_IS_CONTINUATION_MASK \ ((U8) (0xFF << UTF_ACCUMULATION_SHIFT)) #endif #ifndef UTF_CONTINUATION_MARK #define UTF_CONTINUATION_MARK \ (UTF_IS_CONTINUATION_MASK & 0xB0) #endif #ifndef UTF_MIN_START_BYTE #define UTF_MIN_START_BYTE \ ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) #endif #ifndef UTF_MIN_ABOVE_LATIN1_BYTE #define UTF_MIN_ABOVE_LATIN1_BYTE \ ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) #endif #if (PERL_BCDVERSION < 0x5007000) #undef UTF8_IS_DOWNGRADEABLE_START #endif #ifndef UTF8_IS_DOWNGRADEABLE_START #define UTF8_IS_DOWNGRADEABLE_START(c) \ inRANGE(NATIVE_UTF8_TO_I8(c), \ UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1) #endif #ifndef UTF_CONTINUATION_MASK #define UTF_CONTINUATION_MASK \ ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1)) #endif #ifndef UTF8_ACCUMULATE #define UTF8_ACCUMULATE(base, added) \ (((base) << UTF_ACCUMULATION_SHIFT) \ | ((NATIVE_UTF8_TO_I8(added)) \ & UTF_CONTINUATION_MASK)) #endif #ifndef UTF8_ALLOW_ANYUV #define UTF8_ALLOW_ANYUV 0 #endif #ifndef UTF8_ALLOW_EMPTY #define UTF8_ALLOW_EMPTY 0x0001 #endif #ifndef UTF8_ALLOW_CONTINUATION #define UTF8_ALLOW_CONTINUATION 0x0002 #endif #ifndef UTF8_ALLOW_NON_CONTINUATION #define UTF8_ALLOW_NON_CONTINUATION 0x0004 #endif #ifndef UTF8_ALLOW_SHORT #define UTF8_ALLOW_SHORT 0x0008 #endif #ifndef UTF8_ALLOW_LONG #define UTF8_ALLOW_LONG 0x0010 #endif #ifndef UTF8_ALLOW_OVERFLOW #define UTF8_ALLOW_OVERFLOW 0x0080 #endif #ifndef UTF8_ALLOW_ANY #define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \ |UTF8_ALLOW_NON_CONTINUATION \ |UTF8_ALLOW_SHORT \ |UTF8_ALLOW_LONG \ |UTF8_ALLOW_OVERFLOW) #endif #if defined UTF8SKIP #undef UTF8_SAFE_SKIP #undef UTF8_CHK_SKIP #ifndef UTF8_SAFE_SKIP #define UTF8_SAFE_SKIP(s, e) ( \ ((((e) - (s)) <= 0) \ ? 0 \ : D_PPP_MIN(((e) - (s)), UTF8SKIP(s)))) #endif #ifndef UTF8_CHK_SKIP #define UTF8_CHK_SKIP(s) \ (s[0] == '\0' ? 1 : ((U8) D_PPP_MIN(my_strnlen((char *) (s), UTF8SKIP(s)), \ UTF8SKIP(s)))) #endif #ifndef UTF8_SKIP #define UTF8_SKIP(s) UTF8SKIP(s) #endif #endif #if 'A' == 65 #ifndef UTF8_IS_INVARIANT #define UTF8_IS_INVARIANT(c) isASCII(c) #endif #else #ifndef UTF8_IS_INVARIANT #define UTF8_IS_INVARIANT(c) (isASCII(c) || isCNTRL_L1(c)) #endif #endif #ifndef UVCHR_IS_INVARIANT #define UVCHR_IS_INVARIANT(c) UTF8_IS_INVARIANT(c) #endif #ifdef UVCHR_IS_INVARIANT #if 'A' != 65 || UVSIZE < 8 #define D_PPP_UVCHR_SKIP_UPPER(c) 7 #else #define D_PPP_UVCHR_SKIP_UPPER(c) \ (((WIDEST_UTYPE) (c)) < \ (((WIDEST_UTYPE) 1) << (6 * D_PPP_BYTE_INFO_BITS)) ? 7 : 13) #endif #ifndef UVCHR_SKIP #define UVCHR_SKIP(c) \ UVCHR_IS_INVARIANT(c) ? 1 : \ (WIDEST_UTYPE) (c) < (32 * (1U << ( D_PPP_BYTE_INFO_BITS))) ? 2 : \ (WIDEST_UTYPE) (c) < (16 * (1U << (2 * D_PPP_BYTE_INFO_BITS))) ? 3 : \ (WIDEST_UTYPE) (c) < ( 8 * (1U << (3 * D_PPP_BYTE_INFO_BITS))) ? 4 : \ (WIDEST_UTYPE) (c) < ( 4 * (1U << (4 * D_PPP_BYTE_INFO_BITS))) ? 5 : \ (WIDEST_UTYPE) (c) < ( 2 * (1U << (5 * D_PPP_BYTE_INFO_BITS))) ? 6 : \ D_PPP_UVCHR_SKIP_UPPER(c) #endif #endif #ifdef is_ascii_string #ifndef is_invariant_string #define is_invariant_string(s,l) is_ascii_string(s,l) #endif #ifndef is_utf8_invariant_string #define is_utf8_invariant_string(s,l) is_ascii_string(s,l) #endif #endif #ifdef ibcmp_utf8 #ifndef foldEQ_utf8 #define foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2) \ cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2)) #endif #endif #if defined(is_utf8_string) && defined(UTF8SKIP) #ifndef isUTF8_CHAR #define isUTF8_CHAR(s, e) ( \ (e) <= (s) || ! is_utf8_string(s, UTF8_SAFE_SKIP(s, e)) \ ? 0 \ : UTF8SKIP(s)) #endif #endif #if 'A' == 65 #ifndef BOM_UTF8 #define BOM_UTF8 "\xEF\xBB\xBF" #endif #ifndef REPLACEMENT_CHARACTER_UTF8 #define REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD" #endif #elif '^' == 95 #ifndef BOM_UTF8 #define BOM_UTF8 "\xDD\x73\x66\x73" #endif #ifndef REPLACEMENT_CHARACTER_UTF8 #define REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71" #endif #elif '^' == 176 #ifndef BOM_UTF8 #define BOM_UTF8 "\xDD\x72\x65\x72" #endif #ifndef REPLACEMENT_CHARACTER_UTF8 #define REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70" #endif #else #error Unknown character set #endif #if (PERL_BCDVERSION < 0x5035010) #undef utf8_to_uvchr_buf #endif #if (PERL_BCDVERSION >= 0x5006001) && ! defined(utf8_to_uvchr_buf) #if defined(utf8n_to_uvchr) || defined(utf8_to_uvchr) || defined(utf8_to_uv) #if defined(utf8n_to_uvchr) #define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr #elif \ defined(utf8_to_uv) && defined(utf8_to_uv_simple) #define D_PPP_utf8_to_uvchr_buf_callee utf8_to_uv #elif defined(utf8_to_uvchr) #define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \ utf8_to_uvchr((U8 *)(s), (retlen)) #else #define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \ utf8_to_uv((U8 *)(s), (retlen)) #endif #endif #if defined(NEED_utf8_to_uvchr_buf) static UV DPPP_(my_utf8_to_uvchr_buf)(pTHX_ const U8 * s, const U8 * send, STRLEN * retlen); static #else extern UV DPPP_(my_utf8_to_uvchr_buf)(pTHX_ const U8 * s, const U8 * send, STRLEN * retlen); #endif #if defined(NEED_utf8_to_uvchr_buf) || defined(NEED_utf8_to_uvchr_buf_GLOBAL) #ifdef utf8_to_uvchr_buf #undef utf8_to_uvchr_buf #endif #define utf8_to_uvchr_buf(a,b,c) DPPP_(my_utf8_to_uvchr_buf)(aTHX_ a,b,c) #define Perl_utf8_to_uvchr_buf DPPP_(my_utf8_to_uvchr_buf) UV DPPP_(my_utf8_to_uvchr_buf)(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) { #if (PERL_BCDVERSION >= 0x5031004) #if (PERL_BCDVERSION != 0x5035009) if (send <= s) s = send = (U8 *) "?"; return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen); #else if (send > s) return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen); if (! ckWARN_d(WARN_UTF8)) { if (retlen) *retlen = 0; return UNICODE_REPLACEMENT; } else { s = send = (U8 *) "?"; (void) Perl__utf8n_to_uvchr_msgs_helper(s, 0, NULL, 0, NULL, NULL); if (retlen) *retlen = (STRLEN) -1; return 0; } #endif #else UV ret; STRLEN curlen; bool overflows = 0; const U8 *cur_s = s; const bool do_warnings = ckWARN_d(WARN_UTF8); #if (PERL_BCDVERSION < 0x5026000) && ! defined(EBCDIC) STRLEN overflow_length = 0; #endif if (send > s) { curlen = send - s; } else { assert(0); curlen = 0; if (! do_warnings) { if (retlen) *retlen = 0; return UNICODE_REPLACEMENT; } } #if (PERL_BCDVERSION < 0x5026000) && ! defined(EBCDIC) if (curlen > 0 && UNLIKELY(*s >= 0xFE)) { if (sizeof(ret) < 8) { overflows = 1; overflow_length = (*s == 0xFE) ? 7 : 13; } else { const U8 highest[] = "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"; const U8 *cur_h = highest; for (cur_s = s; cur_s < send; cur_s++, cur_h++) { if (UNLIKELY(*cur_s == *cur_h)) { continue; } overflows = *cur_s > *cur_h; break; } overflow_length = 13; } } if (UNLIKELY(overflows)) { ret = 0; if (! do_warnings && retlen) { *retlen = overflow_length; } } else #endif ret = D_PPP_utf8_to_uvchr_buf_callee( (U8 *) s, curlen, retlen, (UTF8_ALLOW_ANYUV & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY))); #if (PERL_BCDVERSION >= 0x5026000) && (PERL_BCDVERSION < 0x5028000) if (UNLIKELY(ret > IV_MAX)) { overflows = 1; } #endif if (UNLIKELY(overflows)) { if (! do_warnings) { if (retlen) { *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s)); *retlen = D_PPP_MIN(*retlen, curlen); } return UNICODE_REPLACEMENT; } else { Perl_warner(aTHX_ packWARN(WARN_UTF8), "Malformed UTF-8 character (overflow at 0x%" UVxf ", byte 0x%02x, after start byte 0x%02x)", ret, *cur_s, *s); if (retlen) { *retlen = (STRLEN) -1; } return 0; } } if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) { if (do_warnings) { if (retlen) { *retlen = (STRLEN) -1; } } else { ret = D_PPP_utf8_to_uvchr_buf_callee( (U8 *) s, curlen, retlen, UTF8_ALLOW_ANY); ret = UNICODE_REPLACEMENT; #if (PERL_BCDVERSION < 0x5016000) if (retlen && (IV) *retlen >= 0) { unsigned int i = 1; *retlen = D_PPP_MIN(*retlen, curlen); *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s)); do { #ifdef UTF8_IS_CONTINUATION if (! UTF8_IS_CONTINUATION(s[i])) #else if (s[i] < 0x80 || s[i] > 0xBF) #endif { *retlen = i; break; } } while (++i < *retlen); } #endif } } return ret; #endif } #endif #endif #if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf) #undef utf8_to_uvchr #ifndef utf8_to_uvchr #define utf8_to_uvchr(s, lp) \ ((*(s) == '\0') \ ? utf8_to_uvchr_buf(s,((s)+1), lp) \ : utf8_to_uvchr_buf(s, (s) + UTF8_CHK_SKIP(s), (lp))) #endif #endif #ifdef sv_len_utf8 #if (PERL_BCDVERSION >= 0x5017005) #ifndef sv_len_utf8_nomg #if defined(PERL_USE_GCC_BRACE_GROUPS) #define sv_len_utf8_nomg(sv) \ ({ \ SV *sv_ = (sv); \ sv_len_utf8(!SvGMAGICAL(sv_) \ ? sv_ \ : sv_mortalcopy_flags(sv_, SV_NOSTEAL)); \ }) #else PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv) { dTHX; if (SvGMAGICAL(sv)) return sv_len_utf8(sv_mortalcopy_flags(sv, SV_NOSTEAL)); else return sv_len_utf8(sv); } #define sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv) #endif #endif #else #undef sv_len_utf8 #if defined(PERL_USE_GCC_BRACE_GROUPS) #define sv_len_utf8_nomg(sv) \ ({ \ SV *sv2 = (sv); \ STRLEN len; \ if (SvUTF8(sv2)) { \ if (SvGMAGICAL(sv2)) \ len = Perl_sv_len_utf8(aTHX_ \ sv_mortalcopy_flags(sv2, \ SV_NOSTEAL));\ else \ len = Perl_sv_len_utf8(aTHX_ sv2); \ } \ else SvPV_nomg(sv2, len); \ len; \ }) #define sv_len_utf8(sv) ({ SV *_sv1 = (sv); \ SvGETMAGIC(_sv1); \ sv_len_utf8_nomg(_sv1); \ }) #else PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv) { dTHX; STRLEN len; if (SvUTF8(sv)) { if (SvGMAGICAL(sv)) len = Perl_sv_len_utf8(aTHX_ sv_mortalcopy_flags(sv, SV_NOSTEAL)); else len = Perl_sv_len_utf8(aTHX_ sv); } else SvPV_nomg(sv, len); return len; } #define sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv) PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8(SV * sv) { dTHX; SvGETMAGIC(sv); return sv_len_utf8_nomg(sv); } #define sv_len_utf8(sv) D_PPP_sv_len_utf8(sv) #endif #endif #endif #ifndef PERL_PV_ESCAPE_QUOTE #define PERL_PV_ESCAPE_QUOTE 0x0001 #endif #ifndef PERL_PV_PRETTY_QUOTE #define PERL_PV_PRETTY_QUOTE PERL_PV_ESCAPE_QUOTE #endif #ifndef PERL_PV_PRETTY_ELLIPSES #define PERL_PV_PRETTY_ELLIPSES 0x0002 #endif #ifndef PERL_PV_PRETTY_LTGT #define PERL_PV_PRETTY_LTGT 0x0004 #endif #ifndef PERL_PV_ESCAPE_FIRSTCHAR #define PERL_PV_ESCAPE_FIRSTCHAR 0x0008 #endif #ifndef PERL_PV_ESCAPE_UNI #define PERL_PV_ESCAPE_UNI 0x0100 #endif #ifndef PERL_PV_ESCAPE_UNI_DETECT #define PERL_PV_ESCAPE_UNI_DETECT 0x0200 #endif #ifndef PERL_PV_ESCAPE_ALL #define PERL_PV_ESCAPE_ALL 0x1000 #endif #ifndef PERL_PV_ESCAPE_NOBACKSLASH #define PERL_PV_ESCAPE_NOBACKSLASH 0x2000 #endif #ifndef PERL_PV_ESCAPE_NOCLEAR #define PERL_PV_ESCAPE_NOCLEAR 0x4000 #endif #ifndef PERL_PV_ESCAPE_RE #define PERL_PV_ESCAPE_RE 0x8000 #endif #ifndef PERL_PV_PRETTY_NOCLEAR #define PERL_PV_PRETTY_NOCLEAR PERL_PV_ESCAPE_NOCLEAR #endif #ifndef PERL_PV_PRETTY_DUMP #define PERL_PV_PRETTY_DUMP PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE #endif #ifndef PERL_PV_PRETTY_REGPROP #define PERL_PV_PRETTY_REGPROP PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_LTGT|PERL_PV_ESCAPE_RE #endif #ifndef pv_escape #if defined(NEED_pv_escape) static char * DPPP_(my_pv_escape)(pTHX_ SV * dsv, char const * const str, const STRLEN count, const STRLEN max, STRLEN * const escaped, const U32 flags); static #else extern char * DPPP_(my_pv_escape)(pTHX_ SV * dsv, char const * const str, const STRLEN count, const STRLEN max, STRLEN * const escaped, const U32 flags); #endif #if defined(NEED_pv_escape) || defined(NEED_pv_escape_GLOBAL) #ifdef pv_escape #undef pv_escape #endif #define pv_escape(a,b,c,d,e,f) DPPP_(my_pv_escape)(aTHX_ a,b,c,d,e,f) #define Perl_pv_escape DPPP_(my_pv_escape) char * DPPP_(my_pv_escape)(pTHX_ SV *dsv, char const * const str, const STRLEN count, const STRLEN max, STRLEN * const escaped, const U32 flags) { const char esc = flags & PERL_PV_ESCAPE_RE ? '%' : '\\'; const char dq = flags & PERL_PV_ESCAPE_QUOTE ? '"' : esc; char octbuf[32] = "%123456789ABCDF"; STRLEN wrote = 0; STRLEN chsize = 0; STRLEN readsize = 1; #if defined(is_utf8_string) && defined(utf8_to_uvchr_buf) bool isuni = flags & PERL_PV_ESCAPE_UNI ? 1 : 0; #endif const char *pv = str; const char * const end = pv + count; octbuf[0] = esc; if (!(flags & PERL_PV_ESCAPE_NOCLEAR)) sv_setpvs(dsv, ""); #if defined(is_utf8_string) && defined(utf8_to_uvchr_buf) if ((flags & PERL_PV_ESCAPE_UNI_DETECT) && is_utf8_string((U8*)pv, count)) isuni = 1; #endif for (; pv < end && (!max || wrote < max) ; pv += readsize) { const UV u = #if defined(is_utf8_string) && defined(utf8_to_uvchr_buf) isuni ? utf8_to_uvchr_buf((U8*)pv, end, &readsize) : #endif (U8)*pv; const U8 c = (U8)u & 0xFF; if (u > 255 || (flags & PERL_PV_ESCAPE_ALL)) { if (flags & PERL_PV_ESCAPE_FIRSTCHAR) chsize = my_snprintf(octbuf, sizeof octbuf, "%" UVxf, u); else chsize = my_snprintf(octbuf, sizeof octbuf, "%cx{%" UVxf "}", esc, u); } else if (flags & PERL_PV_ESCAPE_NOBACKSLASH) { chsize = 1; } else { if (c == dq || c == esc || !isPRINT(c)) { chsize = 2; switch (c) { case '\\' : case '%' : if (c == esc) octbuf[1] = esc; else chsize = 1; break; case '\v' : octbuf[1] = 'v'; break; case '\t' : octbuf[1] = 't'; break; case '\r' : octbuf[1] = 'r'; break; case '\n' : octbuf[1] = 'n'; break; case '\f' : octbuf[1] = 'f'; break; case '"' : if (dq == '"') octbuf[1] = '"'; else chsize = 1; break; default: chsize = my_snprintf(octbuf, sizeof octbuf, pv < end && isDIGIT((U8)*(pv+readsize)) ? "%c%03o" : "%c%o", esc, c); } } else { chsize = 1; } } if (max && wrote + chsize > max) { break; } else if (chsize > 1) { sv_catpvn(dsv, octbuf, chsize); wrote += chsize; } else { char tmp[2]; my_snprintf(tmp, sizeof tmp, "%c", c); sv_catpvn(dsv, tmp, 1); wrote++; } if (flags & PERL_PV_ESCAPE_FIRSTCHAR) break; } if (escaped != NULL) *escaped= pv - str; return SvPVX(dsv); } #endif #endif #ifndef pv_pretty #if defined(NEED_pv_pretty) static char * DPPP_(my_pv_pretty)(pTHX_ SV * dsv, char const * const str, const STRLEN count, const STRLEN max, char const * const start_color, char const * const end_color, const U32 flags); static #else extern char * DPPP_(my_pv_pretty)(pTHX_ SV * dsv, char const * const str, const STRLEN count, const STRLEN max, char const * const start_color, char const * const end_color, const U32 flags); #endif #if defined(NEED_pv_pretty) || defined(NEED_pv_pretty_GLOBAL) #ifdef pv_pretty #undef pv_pretty #endif #define pv_pretty(a,b,c,d,e,f,g) DPPP_(my_pv_pretty)(aTHX_ a,b,c,d,e,f,g) #define Perl_pv_pretty DPPP_(my_pv_pretty) char * DPPP_(my_pv_pretty)(pTHX_ SV *dsv, char const * const str, const STRLEN count, const STRLEN max, char const * const start_color, char const * const end_color, const U32 flags) { const U8 dq = (flags & PERL_PV_PRETTY_QUOTE) ? '"' : '%'; STRLEN escaped; if (!(flags & PERL_PV_PRETTY_NOCLEAR)) sv_setpvs(dsv, ""); if (dq == '"') sv_catpvs(dsv, "\""); else if (flags & PERL_PV_PRETTY_LTGT) sv_catpvs(dsv, "<"); if (start_color != NULL) sv_catpv(dsv, D_PPP_CONSTPV_ARG(start_color)); pv_escape(dsv, str, count, max, &escaped, flags | PERL_PV_ESCAPE_NOCLEAR); if (end_color != NULL) sv_catpv(dsv, D_PPP_CONSTPV_ARG(end_color)); if (dq == '"') sv_catpvs(dsv, "\""); else if (flags & PERL_PV_PRETTY_LTGT) sv_catpvs(dsv, ">"); if ((flags & PERL_PV_PRETTY_ELLIPSES) && escaped < count) sv_catpvs(dsv, "..."); return SvPVX(dsv); } #endif #endif #ifndef pv_display #if defined(NEED_pv_display) static char * DPPP_(my_pv_display)(pTHX_ SV * dsv, const char * pv, STRLEN cur, STRLEN len, STRLEN pvlim); static #else extern char * DPPP_(my_pv_display)(pTHX_ SV * dsv, const char * pv, STRLEN cur, STRLEN len, STRLEN pvlim); #endif #if defined(NEED_pv_display) || defined(NEED_pv_display_GLOBAL) #ifdef pv_display #undef pv_display #endif #define pv_display(a,b,c,d,e) DPPP_(my_pv_display)(aTHX_ a,b,c,d,e) #define Perl_pv_display DPPP_(my_pv_display) char * DPPP_(my_pv_display)(pTHX_ SV *dsv, const char *pv, STRLEN cur, STRLEN len, STRLEN pvlim) { pv_pretty(dsv, pv, cur, pvlim, NULL, NULL, PERL_PV_PRETTY_DUMP); if (len > cur && pv[cur] == '\0') sv_catpvs(dsv, "\\0"); return SvPVX(dsv); } #endif #endif #if PERL_VERSION_LT(5,27,9) #ifndef LC_NUMERIC_LOCK #define LC_NUMERIC_LOCK #endif #ifndef LC_NUMERIC_UNLOCK #define LC_NUMERIC_UNLOCK #endif #if PERL_VERSION_LT(5,19,0) #undef STORE_LC_NUMERIC_SET_STANDARD #undef RESTORE_LC_NUMERIC #undef DECLARATION_FOR_LC_NUMERIC_MANIPULATION #ifdef USE_LOCALE #ifndef DECLARATION_FOR_LC_NUMERIC_MANIPULATION #define DECLARATION_FOR_LC_NUMERIC_MANIPULATION char *LoC_ #endif #ifndef STORE_NUMERIC_SET_STANDARD #define STORE_NUMERIC_SET_STANDARD() \ LoC_ = savepv(setlocale(LC_NUMERIC, NULL)); \ SAVEFREEPV(LoC_); \ setlocale(LC_NUMERIC, "C"); #endif #ifndef RESTORE_LC_NUMERIC #define RESTORE_LC_NUMERIC() \ setlocale(LC_NUMERIC, LoC_); #endif #else #ifndef DECLARATION_FOR_LC_NUMERIC_MANIPULATION #define DECLARATION_FOR_LC_NUMERIC_MANIPULATION #endif #ifndef STORE_LC_NUMERIC_SET_STANDARD #define STORE_LC_NUMERIC_SET_STANDARD() #endif #ifndef RESTORE_LC_NUMERIC #define RESTORE_LC_NUMERIC() #endif #endif #endif #endif #ifndef LOCK_NUMERIC_STANDARD #define LOCK_NUMERIC_STANDARD() #endif #ifndef UNLOCK_NUMERIC_STANDARD #define UNLOCK_NUMERIC_STANDARD() #endif #ifndef LOCK_LC_NUMERIC_STANDARD #define LOCK_LC_NUMERIC_STANDARD LOCK_NUMERIC_STANDARD #endif #ifndef UNLOCK_LC_NUMERIC_STANDARD #define UNLOCK_LC_NUMERIC_STANDARD UNLOCK_NUMERIC_STANDARD #endif #ifndef switch_to_global_locale #define switch_to_global_locale() #endif #ifdef sync_locale #if (PERL_BCDVERSION < 0x5027009) #if (PERL_BCDVERSION >= 0x5021003) #undef sync_locale #define sync_locale() (Perl_sync_locale(aTHX), 1) #elif defined(sync_locale) #undef sync_locale #define sync_locale() (new_ctype(setlocale(LC_CTYPE, NULL)), \ new_collate(setlocale(LC_COLLATE, NULL)), \ set_numeric_local(), \ new_numeric(setlocale(LC_NUMERIC, NULL)), \ 1) #elif defined(new_ctype) && defined(LC_CTYPE) #define sync_locale() (new_ctype(setlocale(LC_CTYPE, NULL)), 1) #endif #endif #endif #ifndef sync_locale #define sync_locale() 1 #endif #endif ��������������������������������������������������������������������HTML-Parser-3.81/t/���������������������������������������������������������������������������������000755 �000765 �000024 �00000000000 14366103441 015167� 5����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/entities.html����������������������������������������������������������������������000644 �000765 �000024 �00000007074 14366103441 017446� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������<html> <body> <h1>Entities expansion test page</h1> <p>The purpose of this table is to test how various browsers expands HTML entities in various edge cases in order to improve compatiblity for <a href="http://search.cpan.org/dist/HTML-Parser">HTML-Parser</a>. <p> <table border=1> <tr><th>ent</th><th>with</th><th>without</th><th>dec</th><th>hex</th><th>hexx</th><th>ref</th></tr> <tr><td>&amp;amp;</td><td>&amp;</td><td>&amp&ampamp</td><td>&#38;&#38</td><td>&#x26;&#x26</td><td>&#x26x</td><td><a href="http://example.com?a=1&amp=2&ampamp=3">ref</a></td></tr> <tr><td>&amp;apos;</td><td>&apos;</td><td>&apos&aposapos</td><td>&#39;&#39</td><td>&#x27;&#x27</td><td>&#x27x</td><td><a href="http://example.com?a=1&apos=2&aposapos=3">ref</a></td></tr> <tr><td>&amp;times;</td><td>&times;</td><td>&times&timestimes</td><td>&#215;&#215</td><td>&#XD7;&#XD7</td><td>&#XD7x</td><td><a href="http://example.com?a=1&times=2&timestimes=3">ref</a></td></tr> <tr><td>&amp;euro;</td><td>&euro;</td><td>&euro&euroeuro</td><td>&#8364;&#8364</td><td>&#x20ac;&#x20ac</td><td>&#x20acx</td><td><a href="http://example.com?a=1&euro=2&euroeuro=3">ref</a></td></tr> <tr><td>&amp;hearts;</td><td>&hearts;</td><td>&hearts&heartshearts</td><td>&#9829;&#9829</td><td>&#x2665;&#x2665</td><td>&#x2665x</td><td><a href="http://example.com?a=1&heart=2&heartheart=3">ref</a></td></tr> </table> <p> overflow: 3=&#3; 33=&#33; 333=&#333; 3333=&#3333; 33333=&#33333; 333333=&#333333; 3333333=&#3333333; 33333333=&#33333333; 333333333=&#333333333; 3333333333=&#3333333333; 33333333333=&#33333333333; <p> ref: <a href="http://example.com?a=&#xFx">&amp;#xFx</a> <a href="http://example.com?a=&#xFFx">&amp;#xFFx</a> <a href="http://example.com?a=&#xFFFx">&amp;#xFFFx</a> <a href="http://example.com?a=&#xFFFFx">&amp;#xFFFFx</a> <a href="http://example.com?a=&#xFFFFFx">&amp;#xFFFFFx</a> <h2>Results</h2> <p>This table records observations of how various browsers behave with regards to expanding entities. The term "Latin-1 entites" means named entities that map to characters in the 0 .. 255 range. The term "Unicode entities" means named entities that are not "Latin-1 entities". <p> <table border=1> <tr> <th>feature</th> <th>do</th> <th>don't</th> </tr> <tr> <td>expand &amp;apos;</td> <td>HTML-Parser >= 3.22, FireFox 1.5, Konqueror 3.4.3, Safari 2.0</td> <td>HTML-Parser <= 3.21, MSIE 7.0 beta 2</td> </tr> <tr> <td>expand Latin-1 entities that are not terminated by semicolon</td> <td>HTML-Parser, FireFox 1.5, Konqueror 3.4.3, MSIE 7.0 beta 2, Safari 2.0</td> <td>&nbsp;</td> </tr> <tr> <td>expand prefix of Latin-1 entities in text content</td> <td>HTML-Parser >= 3.40, Konqueror 3.4.3, MSIE 7.0 beta 2</td> <td>HTML-Parser <= 3.38, FireFox 1.5, Safari 2.0</td> </tr> <tr> <td>expand Unicode entities that are not terminated by semicolon in text content</td> <td>HTML-Parser <= 3.50, FireFox 1.5, Safari 2.0</td> <td>HTML-Parser >= 3.51, Konqueror 3.4.3, MSIE 7.0 beta 2</td> </tr> <tr> <td>expand Unicode entities that are not terminated by semicolon in attributes</td> <td>HTML-Parser <= 3.50</td> <td>HTML-Parser >= 3.51, FireFox 1.5, Konqueror 3.4.3, MSIE 7.0 beta 2, Safari 2.0</td> </tr> <tr> <td>expand hex entities that are not terminated by semicolon in text content</td> <td>HTML-Parser, FireFox 1.5, Konqueror 3.4.3, Safari 2.0</td> <td>MSIE 7.0 beta 2</td> </tr> <tr> <td>expand hex entities that are not terminated by semicolon in attributes</td> <td>HTML-Parser, FireFox 1.5, Konqueror 3.4.3, MSIE 7.0 beta 2, Safari 2.0</td> <td>&nbsp;</td> </tr> </table> </body> </html> ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/xt/��������������������������������������������������������������������������������000755 �000765 �000024 �00000000000 14366103441 015357� 5����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/META.yml���������������������������������������������������������������������������000644 �000765 �000024 �00000005161 14366103441 016200� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������--- abstract: 'HTML parser class' author: - 'Gisle Aas <gaas@cpan.org>' build_requires: Config: '0' ExtUtils::MakeMaker: '0' File::Spec: '0' FileHandle: '0' IO::File: '0' SelectSaver: '0' Test::More: '0' URI: '0' perl: '5.008' strict: '0' configure_requires: ExtUtils::MakeMaker: '6.52' perl: '5.008' dynamic_config: 0 generated_by: 'Dist::Zilla version 6.025, CPAN::Meta::Converter version 2.150010' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: '1.4' name: HTML-Parser no_index: directory: - eg - examples - inc - share - t - xt provides: HTML::Entities: file: lib/HTML/Entities.pm version: '3.81' HTML::Filter: file: lib/HTML/Filter.pm version: '3.81' x_deprecated: 1 HTML::HeadParser: file: lib/HTML/HeadParser.pm version: '3.81' HTML::LinkExtor: file: lib/HTML/LinkExtor.pm version: '3.81' HTML::Parser: file: lib/HTML/Parser.pm version: '3.81' HTML::PullParser: file: lib/HTML/PullParser.pm version: '3.81' HTML::TokeParser: file: lib/HTML/TokeParser.pm version: '3.81' requires: Carp: '0' Exporter: '0' HTML::Tagset: '0' HTTP::Headers: '0' IO::File: '0' URI: '0' URI::URL: '0' XSLoader: '0' perl: '5.008' strict: '0' resources: bugtracker: https://github.com/libwww-perl/HTML-Parser/issues homepage: https://github.com/libwww-perl/HTML-Parser repository: https://github.com/libwww-perl/HTML-Parser.git version: '3.81' x_contributors: - 'Antonio Radici <antonio@dyne.org>' - 'Barbie <barbie@missbarbell.co.uk>' - 'bulk88 <bulk88@hotmail.com>' - 'Chase Whitener <capoeirab@cpan.org>' - 'Chip Salzenberg <chip@pobox.com>' - 'Damyan Ivanov <dmn@debian.org>' - 'David Steinbrunner <dsteinbrunner@pobox.com>' - 'François Perrad <francois.perrad@gadz.org>' - 'Gisle Aas <gisle@aas.no>' - 'Graham Knop <haarg@haarg.org>' - 'Jacques Germishuys <jacquesg@striata.com>' - 'James Raspass <jraspass@gmail.com>' - 'Jess Robinson <castaway@desert-island.me.uk>' - 'Jon Jensen <jon@endpoint.com>' - 'Michal Josef Špaček <mspacek@redhat.com>' - 'Mike South <msouth@gmail.com>' - 'Nicholas Clark <nick@ccl4.org>' - 'Nicolas R <nicolas@atoomic.org>' - 'Olaf Alders <olaf@wundersolutions.com>' - 'Salvatore Bonaccorso <salvatore.bonaccorso@gmail.com>' - 'Todd Rinaldo <toddr@cpan.org>' - 'Ville Skyttä <ville.skytta@iki.fi>' - 'Yves Orton <demerphq@gmail.com>' - 'Zefram <zefram@fysh.org>' x_generated_by_perl: v5.36.0 x_serialization_backend: 'YAML::Tiny version 1.73' x_spdx_expression: 'Artistic-1.0-Perl OR GPL-1.0-or-later' ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/META.json��������������������������������������������������������������������������000644 �000765 �000024 �00000011361 14366103441 016347� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������{ "abstract" : "HTML parser class", "author" : [ "Gisle Aas <gaas@cpan.org>" ], "dynamic_config" : 0, "generated_by" : "Dist::Zilla version 6.025, CPAN::Meta::Converter version 2.150010", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : 2 }, "name" : "HTML-Parser", "no_index" : { "directory" : [ "eg", "examples", "inc", "share", "t", "xt" ] }, "prereqs" : { "configure" : { "requires" : { "ExtUtils::MakeMaker" : "6.52", "perl" : "5.008" }, "suggests" : { "JSON::PP" : "2.27300" } }, "develop" : { "requires" : { "Dist::Zilla" : "0", "Dist::Zilla::Plugin::MinimumPerl" : "0", "Dist::Zilla::PluginBundle::Starter" : "v4.0.0", "File::Spec" : "0", "IO::Handle" : "0", "IPC::Open3" : "0", "Pod::Coverage::TrustPod" : "0", "Test::CPAN::Changes" : "0.4", "Test::CPAN::Meta" : "0", "Test::CheckManifest" : "1.29", "Test::Kwalitee" : "1.22", "Test::More" : "0.88", "Test::Pod" : "1.41", "Test::Pod::Coverage" : "1.08", "Test::Pod::Spelling::CommonMistakes" : "1.000", "Test::Spelling" : "0.12", "Test::Version" : "2.00" } }, "runtime" : { "requires" : { "Carp" : "0", "Exporter" : "0", "HTML::Tagset" : "0", "HTTP::Headers" : "0", "IO::File" : "0", "URI" : "0", "URI::URL" : "0", "XSLoader" : "0", "perl" : "5.008", "strict" : "0" } }, "test" : { "recommends" : { "CPAN::Meta" : "2.120900" }, "requires" : { "Config" : "0", "ExtUtils::MakeMaker" : "0", "File::Spec" : "0", "FileHandle" : "0", "IO::File" : "0", "SelectSaver" : "0", "Test::More" : "0", "URI" : "0", "perl" : "5.008", "strict" : "0" } } }, "provides" : { "HTML::Entities" : { "file" : "lib/HTML/Entities.pm", "version" : "3.81" }, "HTML::Filter" : { "file" : "lib/HTML/Filter.pm", "version" : "3.81", "x_deprecated" : 1 }, "HTML::HeadParser" : { "file" : "lib/HTML/HeadParser.pm", "version" : "3.81" }, "HTML::LinkExtor" : { "file" : "lib/HTML/LinkExtor.pm", "version" : "3.81" }, "HTML::Parser" : { "file" : "lib/HTML/Parser.pm", "version" : "3.81" }, "HTML::PullParser" : { "file" : "lib/HTML/PullParser.pm", "version" : "3.81" }, "HTML::TokeParser" : { "file" : "lib/HTML/TokeParser.pm", "version" : "3.81" } }, "release_status" : "stable", "resources" : { "bugtracker" : { "web" : "https://github.com/libwww-perl/HTML-Parser/issues" }, "homepage" : "https://github.com/libwww-perl/HTML-Parser", "repository" : { "type" : "git", "url" : "https://github.com/libwww-perl/HTML-Parser.git", "web" : "https://github.com/libwww-perl/HTML-Parser" } }, "version" : "3.81", "x_contributors" : [ "Antonio Radici <antonio@dyne.org>", "Barbie <barbie@missbarbell.co.uk>", "bulk88 <bulk88@hotmail.com>", "Chase Whitener <capoeirab@cpan.org>", "Chip Salzenberg <chip@pobox.com>", "Damyan Ivanov <dmn@debian.org>", "David Steinbrunner <dsteinbrunner@pobox.com>", "Fran\u00e7ois Perrad <francois.perrad@gadz.org>", "Gisle Aas <gisle@aas.no>", "Graham Knop <haarg@haarg.org>", "Jacques Germishuys <jacquesg@striata.com>", "James Raspass <jraspass@gmail.com>", "Jess Robinson <castaway@desert-island.me.uk>", "Jon Jensen <jon@endpoint.com>", "Michal Josef \u0160pa\u010dek <mspacek@redhat.com>", "Mike South <msouth@gmail.com>", "Nicholas Clark <nick@ccl4.org>", "Nicolas R <nicolas@atoomic.org>", "Olaf Alders <olaf@wundersolutions.com>", "Salvatore Bonaccorso <salvatore.bonaccorso@gmail.com>", "Todd Rinaldo <toddr@cpan.org>", "Ville Skytt\u00e4 <ville.skytta@iki.fi>", "Yves Orton <demerphq@gmail.com>", "Zefram <zefram@fysh.org>" ], "x_generated_by_perl" : "v5.36.0", "x_serialization_backend" : "Cpanel::JSON::XS version 4.32", "x_spdx_expression" : "Artistic-1.0-Perl OR GPL-1.0-or-later" } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/typemap����������������������������������������������������������������������������000644 �000765 �000024 �00000000103 14366103441 016320� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������PSTATE* T_PSTATE INPUT T_PSTATE $var = get_pstate_hv(aTHX_ $arg) �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/hctype.h���������������������������������������������������������������������������000644 �000765 �000024 �00000005571 14366103441 016401� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* This file is autogenerated by mkhctype */ #define HCTYPE_SPACE 0x01 #define HCTYPE_NAME_FIRST 0x02 #define HCTYPE_NAME_CHAR 0x04 #define HCTYPE_NOT_SPACE_GT 0x08 #define HCTYPE_NOT_SPACE_EQ_GT 0x10 #define HCTYPE_NOT_SPACE_SLASH_GT 0x20 #define HCTYPE_NOT_SPACE_EQ_SLASH_GT 0x40 #define HCTYPE(c) hctype[(unsigned char)(c)] #define isHCTYPE(c, w) (HCTYPE(c) & (w)) #define isHSPACE(c) isHCTYPE(c, HCTYPE_SPACE) #define isHNAME_FIRST(c) isHCTYPE(c, HCTYPE_NAME_FIRST) #define isHNAME_CHAR(c) isHCTYPE(c, HCTYPE_NAME_CHAR) #define isHNOT_SPACE_GT(c) isHCTYPE(c, HCTYPE_NOT_SPACE_GT) typedef unsigned char hctype_t; static hctype_t hctype[] = { 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 0 - 7 */ 0x78, 0x01, 0x01, 0x01, 0x01, 0x01, 0x78, 0x78, /* 8 - 15 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 16 - 23 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 24 - 31 */ 0x01, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 32 - 39 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x7c, 0x7c, 0x58, /* 40 - 47 */ 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, /* 48 - 55 */ 0x7c, 0x7c, 0x7e, 0x78, 0x78, 0x28, 0x00, 0x78, /* 56 - 63 */ 0x78, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, /* 64 - 71 */ 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, /* 72 - 79 */ 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, /* 80 - 87 */ 0x7e, 0x7e, 0x7e, 0x78, 0x78, 0x78, 0x78, 0x7e, /* 88 - 95 */ 0x78, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, /* 96 - 103 */ 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, /* 104 - 111 */ 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, /* 112 - 119 */ 0x7e, 0x7e, 0x7e, 0x78, 0x78, 0x78, 0x78, 0x78, /* 120 - 127 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 128 - 135 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 136 - 143 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 144 - 151 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 152 - 159 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 160 - 167 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 168 - 175 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 176 - 183 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 184 - 191 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 192 - 199 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 200 - 207 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 208 - 215 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 216 - 223 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 224 - 231 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 232 - 239 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 240 - 247 */ 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, 0x78, /* 248 - 255 */ }; ���������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/pfunc.h����������������������������������������������������������������������������000644 �000765 �000024 �00000017226 14366103441 016220� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* This file is autogenerated by mkpfunc */ typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, U32 utf8, SV* self); static PFUNC parsefunc[] = { parse_null, /* 0 */ parse_null, /* 1 */ parse_null, /* 2 */ parse_null, /* 3 */ parse_null, /* 4 */ parse_null, /* 5 */ parse_null, /* 6 */ parse_null, /* 7 */ parse_null, /* 8 */ parse_null, /* 9 */ parse_null, /* 10 */ parse_null, /* 11 */ parse_null, /* 12 */ parse_null, /* 13 */ parse_null, /* 14 */ parse_null, /* 15 */ parse_null, /* 16 */ parse_null, /* 17 */ parse_null, /* 18 */ parse_null, /* 19 */ parse_null, /* 20 */ parse_null, /* 21 */ parse_null, /* 22 */ parse_null, /* 23 */ parse_null, /* 24 */ parse_null, /* 25 */ parse_null, /* 26 */ parse_null, /* 27 */ parse_null, /* 28 */ parse_null, /* 29 */ parse_null, /* 30 */ parse_null, /* 31 */ parse_null, /* 32 */ parse_decl, /* 33 */ parse_null, /* 34 */ parse_null, /* 35 */ parse_null, /* 36 */ parse_null, /* 37 */ parse_null, /* 38 */ parse_null, /* 39 */ parse_null, /* 40 */ parse_null, /* 41 */ parse_null, /* 42 */ parse_null, /* 43 */ parse_null, /* 44 */ parse_null, /* 45 */ parse_null, /* 46 */ parse_end, /* 47 */ parse_null, /* 48 */ parse_null, /* 49 */ parse_null, /* 50 */ parse_null, /* 51 */ parse_null, /* 52 */ parse_null, /* 53 */ parse_null, /* 54 */ parse_null, /* 55 */ parse_null, /* 56 */ parse_null, /* 57 */ parse_null, /* 58 */ parse_null, /* 59 */ parse_null, /* 60 */ parse_null, /* 61 */ parse_null, /* 62 */ parse_process, /* 63 */ parse_null, /* 64 */ parse_start, /* 65 */ parse_start, /* 66 */ parse_start, /* 67 */ parse_start, /* 68 */ parse_start, /* 69 */ parse_start, /* 70 */ parse_start, /* 71 */ parse_start, /* 72 */ parse_start, /* 73 */ parse_start, /* 74 */ parse_start, /* 75 */ parse_start, /* 76 */ parse_start, /* 77 */ parse_start, /* 78 */ parse_start, /* 79 */ parse_start, /* 80 */ parse_start, /* 81 */ parse_start, /* 82 */ parse_start, /* 83 */ parse_start, /* 84 */ parse_start, /* 85 */ parse_start, /* 86 */ parse_start, /* 87 */ parse_start, /* 88 */ parse_start, /* 89 */ parse_start, /* 90 */ parse_null, /* 91 */ parse_null, /* 92 */ parse_null, /* 93 */ parse_null, /* 94 */ parse_null, /* 95 */ parse_null, /* 96 */ parse_start, /* 97 */ parse_start, /* 98 */ parse_start, /* 99 */ parse_start, /* 100 */ parse_start, /* 101 */ parse_start, /* 102 */ parse_start, /* 103 */ parse_start, /* 104 */ parse_start, /* 105 */ parse_start, /* 106 */ parse_start, /* 107 */ parse_start, /* 108 */ parse_start, /* 109 */ parse_start, /* 110 */ parse_start, /* 111 */ parse_start, /* 112 */ parse_start, /* 113 */ parse_start, /* 114 */ parse_start, /* 115 */ parse_start, /* 116 */ parse_start, /* 117 */ parse_start, /* 118 */ parse_start, /* 119 */ parse_start, /* 120 */ parse_start, /* 121 */ parse_start, /* 122 */ parse_null, /* 123 */ parse_null, /* 124 */ parse_null, /* 125 */ parse_null, /* 126 */ parse_null, /* 127 */ parse_null, /* 128 */ parse_null, /* 129 */ parse_null, /* 130 */ parse_null, /* 131 */ parse_null, /* 132 */ parse_null, /* 133 */ parse_null, /* 134 */ parse_null, /* 135 */ parse_null, /* 136 */ parse_null, /* 137 */ parse_null, /* 138 */ parse_null, /* 139 */ parse_null, /* 140 */ parse_null, /* 141 */ parse_null, /* 142 */ parse_null, /* 143 */ parse_null, /* 144 */ parse_null, /* 145 */ parse_null, /* 146 */ parse_null, /* 147 */ parse_null, /* 148 */ parse_null, /* 149 */ parse_null, /* 150 */ parse_null, /* 151 */ parse_null, /* 152 */ parse_null, /* 153 */ parse_null, /* 154 */ parse_null, /* 155 */ parse_null, /* 156 */ parse_null, /* 157 */ parse_null, /* 158 */ parse_null, /* 159 */ parse_null, /* 160 */ parse_null, /* 161 */ parse_null, /* 162 */ parse_null, /* 163 */ parse_null, /* 164 */ parse_null, /* 165 */ parse_null, /* 166 */ parse_null, /* 167 */ parse_null, /* 168 */ parse_null, /* 169 */ parse_null, /* 170 */ parse_null, /* 171 */ parse_null, /* 172 */ parse_null, /* 173 */ parse_null, /* 174 */ parse_null, /* 175 */ parse_null, /* 176 */ parse_null, /* 177 */ parse_null, /* 178 */ parse_null, /* 179 */ parse_null, /* 180 */ parse_null, /* 181 */ parse_null, /* 182 */ parse_null, /* 183 */ parse_null, /* 184 */ parse_null, /* 185 */ parse_null, /* 186 */ parse_null, /* 187 */ parse_null, /* 188 */ parse_null, /* 189 */ parse_null, /* 190 */ parse_null, /* 191 */ parse_null, /* 192 */ parse_null, /* 193 */ parse_null, /* 194 */ parse_null, /* 195 */ parse_null, /* 196 */ parse_null, /* 197 */ parse_null, /* 198 */ parse_null, /* 199 */ parse_null, /* 200 */ parse_null, /* 201 */ parse_null, /* 202 */ parse_null, /* 203 */ parse_null, /* 204 */ parse_null, /* 205 */ parse_null, /* 206 */ parse_null, /* 207 */ parse_null, /* 208 */ parse_null, /* 209 */ parse_null, /* 210 */ parse_null, /* 211 */ parse_null, /* 212 */ parse_null, /* 213 */ parse_null, /* 214 */ parse_null, /* 215 */ parse_null, /* 216 */ parse_null, /* 217 */ parse_null, /* 218 */ parse_null, /* 219 */ parse_null, /* 220 */ parse_null, /* 221 */ parse_null, /* 222 */ parse_null, /* 223 */ parse_null, /* 224 */ parse_null, /* 225 */ parse_null, /* 226 */ parse_null, /* 227 */ parse_null, /* 228 */ parse_null, /* 229 */ parse_null, /* 230 */ parse_null, /* 231 */ parse_null, /* 232 */ parse_null, /* 233 */ parse_null, /* 234 */ parse_null, /* 235 */ parse_null, /* 236 */ parse_null, /* 237 */ parse_null, /* 238 */ parse_null, /* 239 */ parse_null, /* 240 */ parse_null, /* 241 */ parse_null, /* 242 */ parse_null, /* 243 */ parse_null, /* 244 */ parse_null, /* 245 */ parse_null, /* 246 */ parse_null, /* 247 */ parse_null, /* 248 */ parse_null, /* 249 */ parse_null, /* 250 */ parse_null, /* 251 */ parse_null, /* 252 */ parse_null, /* 253 */ parse_null, /* 254 */ parse_null, /* 255 */ }; ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/mkpfunc����������������������������������������������������������������������������000755 �000765 �000024 �00000001063 14366103441 016315� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl ($progname = $0) =~ s,.*/,,; print "/* This file is autogenerated by $progname */\n"; print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, U32 utf8, SV* self);\n"; print "static PFUNC parsefunc[] = {\n"; for my $c (0..255) { local $_ = chr($c); my $func = "null"; if (/^[A-Za-z]$/) { $func = "start"; } elsif ($_ eq "/") { $func = "end"; } elsif ($_ eq "!") { $func = "decl"; } elsif ($_ eq "?") { $func = "process"; } printf " %-15s /* %3d */\n", "parse_$func,", $c; } print "};\n"; �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/hparser.h��������������������������������������������������������������������������000644 �000765 �000024 �00000005167 14366103441 016552� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * Copyright 1999-2016, Gisle Aas * Copyright 1999-2000, Michael A. Chase * * This library is free software; you can redistribute it and/or * modify it under the same terms as Perl itself. */ /* * Declare various structures and constants. The main thing * is 'struct p_state' that contains various fields to represent * the state of the parser. */ #ifdef MARKED_SECTION enum marked_section_t { MS_NONE = 0, MS_INCLUDE, MS_RCDATA, MS_CDATA, MS_IGNORE }; #endif /* MARKED_SECTION */ #define P_SIGNATURE 0x16091964 /* tag struct p_state for safer cast */ enum event_id { E_DECLARATION = 0, E_COMMENT, E_START, E_END, E_TEXT, E_PROCESS, E_START_DOCUMENT, E_END_DOCUMENT, E_DEFAULT, /**/ EVENT_COUNT, E_NONE /* used for reporting skipped text (non-events) */ }; typedef enum event_id event_id_t; /* must match event_id_t above */ static char* event_id_str[] = { "declaration", "comment", "start", "end", "text", "process", "start_document", "end_document", "default", }; struct p_handler { SV* cb; SV* argspec; }; struct p_state { U32 signature; /* state */ SV* buf; STRLEN offset; STRLEN line; STRLEN column; bool start_document; bool parsing; bool eof; /* special parsing modes */ char* literal_mode; bool is_cdata; bool no_dash_dash_comment_end; char *pending_end_tag; /* unbroken_text option needs a buffer of pending text */ SV* pend_text; bool pend_text_is_cdata; STRLEN pend_text_offset; STRLEN pend_text_line; STRLEN pend_text_column; /* skipped text is accumulated here */ SV* skipped_text; #ifdef MARKED_SECTION /* marked section support */ enum marked_section_t ms; AV* ms_stack; bool marked_sections; #endif /* various boolean configuration attributes */ bool strict_comment; bool strict_names; bool strict_end; bool xml_mode; bool unbroken_text; bool attr_encoded; bool case_sensitive; bool closing_plaintext; bool utf8_mode; bool empty_element_tags; bool xml_pic; bool backquote; /* other configuration stuff */ SV* bool_attr_val; struct p_handler handlers[EVENT_COUNT]; int argspec_entity_decode; /* filters */ HV* report_tags; HV* ignore_tags; HV* ignore_elements; /* these are set when we are currently inside an element we want to ignore */ SV* ignoring_element; int ignore_depth; /* cache */ HV* entity2char; /* %HTML::Entities::entity2char */ SV* tmp; }; typedef struct p_state PSTATE; ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/lib/�������������������������������������������������������������������������������000755 �000765 �000024 �00000000000 14366103441 015472� 5����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/eg/��������������������������������������������������������������������������������000755 �000765 �000024 �00000000000 14366103441 015317� 5����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/mkhctype���������������������������������������������������������������������������000755 �000765 �000024 �00000002447 14366103441 016505� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl ($progname = $0) =~ s,.*/,,; print "/* This file is autogenerated by $progname */\n"; print <<'EOT'; #define HCTYPE_SPACE 0x01 #define HCTYPE_NAME_FIRST 0x02 #define HCTYPE_NAME_CHAR 0x04 #define HCTYPE_NOT_SPACE_GT 0x08 #define HCTYPE_NOT_SPACE_EQ_GT 0x10 #define HCTYPE_NOT_SPACE_SLASH_GT 0x20 #define HCTYPE_NOT_SPACE_EQ_SLASH_GT 0x40 #define HCTYPE(c) hctype[(unsigned char)(c)] #define isHCTYPE(c, w) (HCTYPE(c) & (w)) #define isHSPACE(c) isHCTYPE(c, HCTYPE_SPACE) #define isHNAME_FIRST(c) isHCTYPE(c, HCTYPE_NAME_FIRST) #define isHNAME_CHAR(c) isHCTYPE(c, HCTYPE_NAME_CHAR) #define isHNOT_SPACE_GT(c) isHCTYPE(c, HCTYPE_NOT_SPACE_GT) typedef unsigned char hctype_t; EOT print "static hctype_t hctype[] = {\n"; for my $c (0 .. 255) { print " " unless $c % 8; local $_ = chr($c); my $v = 0; if (/^\s$/) { # isSPACE $v |= 0x1 } elsif ($_ ne ">") { $v |= 0x08; $v |= 0x10 if $_ ne "="; $v |= 0x20 if $_ ne "/"; $v |= 0x40 if $_ ne "="; } if (/^[\w.\-:]$/) { $v |= 0x4; $v |= 0x2 unless /^[\d.-]$/; # XML allow /[:_]/ as first char } printf "0x%02x, ", $v; unless (($c+1) % 8) { printf " /* %3d - %3d */\n", $c - 7, $c; } } print "};\n"; �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/hints/�����������������������������������������������������������������������������000755 �000765 �000024 �00000000000 14366103441 016051� 5����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/Makefile.PL������������������������������������������������������������������������000644 �000765 �000024 �00000003425 14366103441 016702� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������# This Makefile.PL for HTML-Parser was generated by # Dist::Zilla::Plugin::MakeMaker::Awesome 0.49. # Don't edit it but the dist.ini and plugins used to construct it. use strict; use warnings; use ExtUtils::MakeMaker; my %WriteMakefileArgs = ( "ABSTRACT" => "HTML parser class", "AUTHOR" => "Gisle Aas <gaas\@cpan.org>", "CONFIGURE_REQUIRES" => { "ExtUtils::MakeMaker" => "6.52" }, "DISTNAME" => "HTML-Parser", "LICENSE" => "perl", "NAME" => "HTML::Parser", "PREREQ_PM" => { "Carp" => 0, "Exporter" => 0, "HTML::Tagset" => 0, "HTTP::Headers" => 0, "IO::File" => 0, "URI" => 0, "URI::URL" => 0, "XSLoader" => 0, "strict" => 0 }, "TEST_REQUIRES" => { "Config" => 0, "ExtUtils::MakeMaker" => 0, "File::Spec" => 0, "FileHandle" => 0, "IO::File" => 0, "SelectSaver" => 0, "Test::More" => 0, "URI" => 0, "strict" => 0 }, "VERSION" => "3.81", "test" => { "TESTS" => "t/*.t" } ); %WriteMakefileArgs = ( %WriteMakefileArgs, DEFINE => "-DMARKED_SECTION", H => [ qw(hparser.h hctype.h tokenpos.h pfunc.h hparser.c util.c) ], ); my %FallbackPrereqs = ( "Carp" => 0, "Config" => 0, "Exporter" => 0, "ExtUtils::MakeMaker" => 0, "File::Spec" => 0, "FileHandle" => 0, "HTML::Tagset" => 0, "HTTP::Headers" => 0, "IO::File" => 0, "SelectSaver" => 0, "Test::More" => 0, "URI" => 0, "URI::URL" => 0, "XSLoader" => 0, "strict" => 0 ); unless ( eval { ExtUtils::MakeMaker->VERSION('6.63_03') } ) { delete $WriteMakefileArgs{TEST_REQUIRES}; delete $WriteMakefileArgs{BUILD_REQUIRES}; $WriteMakefileArgs{PREREQ_PM} = \%FallbackPrereqs; } delete $WriteMakefileArgs{CONFIGURE_REQUIRES} unless eval { ExtUtils::MakeMaker->VERSION(6.52) }; WriteMakefile(%WriteMakefileArgs); �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/dist.ini���������������������������������������������������������������������������000644 �000765 �000024 �00000003675 14366103441 016403� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������ name = HTML-Parser author = Gisle Aas <gaas@cpan.org> license = Perl_5 copyright_holder = Gisle Aas <gaas@cpan.org> and Michael A. Chase <mchase@ix.netcom.com> copyright_year = 1996 [ReadmeAnyFromPod / Markdown_Readme] type = gfm source_filename = lib/HTML/Parser.pm filename = README.md location = root [Regenerate::AfterReleasers] plugin = Markdown_Readme [@Starter::Git] revision = 4 managed_versions = 1 installer = MakeMaker::Awesome MakeMaker::Awesome.WriteMakefile_arg[0] = DEFINE => "-DMARKED_SECTION" MakeMaker::Awesome.WriteMakefile_arg[1] = H => [ qw(hparser.h hctype.h tokenpos.h pfunc.h hparser.c util.c) ] RewriteVersion.global = 1 NextRelease.format = %-9v %{yyyy-MM-dd}d regenerate = Makefile.PL regenerate = META.json regenerate = README.md regenerate = LICENSE regenerate = t/00-report-prereqs.t Git::Check.allow_dirty = META.json [Prereqs::FromCPANfile] [MinimumPerl] perl = 5.008 [Deprecated] module = HTML::Filter [Git::Contributors] [GithubMeta] issues = 1 user = libwww-perl [GitHubREADME::Badge] badges = github_actions/linux badges = github_actions/macos badges = github_actions/windows [CheckChangeLog] [CheckChangesHasContent] [Test::ChangesHasContent] [Test::Kwalitee] skiptest = no_symlinks [Test::Version] filename_match = qr/Parser\.pm$/ [Test::Pod::Coverage::Configurable] trustme = HTML::Entities => qr/^(?:UNICODE_SUPPORT|decode|encode|encode_numeric|encode_numerically|num_entity)$/ trustme = HTML::Filter => qr/^(?:output)$/ trustme = HTML::HeadParser => qr/^(?:as_string|flush_text)$/ trustme = HTML::Parser => qr/^(?:init|netscape_buggy_comment)$/ [Test::PodSpelling] wordlist = Pod::Wordlist spell_cmd = aspell list stopword = CPAN stopword = MSIE stopword = argspec stopword = tagname stopword = Isindex stopword = undecoded stopword = IMG stopword = textified stopword = Textification stopword = argspecs stopword = Attr stopword = Attrseq stopword = Dtext stopword = Tokenpos stopword = Unterminated stopword = CDATA �������������������������������������������������������������������HTML-Parser-3.81/hints/solaris.pl�������������������������������������������������������������������000644 �000765 �000024 �00000000165 14366103441 020064� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������if ($Config{gccversion}) { print "Turning off optimizations to avoid compiler bug\n"; $self->{OPTIMIZE} = " "; } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/eg/hbody���������������������������������������������������������������������������000755 �000765 �000024 �00000001331 14366103441 016350� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl use strict; use warnings; use HTML::Parser (); my $doc = <<'EOT'; <!-- This is not where <BODY> starts --> <title>foo</title> <script language="Perl" description="Print out <BODY>"> open(BODY, "body.txt"); while (<BODY>) { print; } </script> <!-- The next thing will be <BODY> the body --> <Body> Howdy! </body> EOT my $body_offset; HTML::Parser->new( start_h => [ sub { return unless shift eq "body"; $body_offset = shift; shift->eof; # tell the parser to stop }, "tagname,offset,self" ] )->parse($doc); die "No <body> found" unless defined $body_offset; my $head = substr($doc, 0, $body_offset, ""); print $doc; �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/eg/hdisable������������������������������������������������������������������������000755 �000765 �000024 �00000001344 14366103441 017022� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl use strict; use warnings; use HTML::Parser (); use HTML::Entities qw(encode_entities); sub disable_tags_but { my ($text, $allowed_tags) = @_; my @text; my %allowed_tag = map { $_ => 1 } @{$allowed_tags || []}; my $tag_h = sub { my ($tag, $text) = @_; $text = encode_entities($text, "<") unless $allowed_tag{$tag}; push(@text, $text); }; HTML::Parser->new( start_h => [$tag_h, 'tagname, text'], end_h => [$tag_h, 'tagname, text'], default_h => [\@text, '@{text}'], )->parse($text)->eof; return join("", @text); } # # Test it # print disable_tags_but(<<EOT, [qw(a br)]) unless caller; Test <foo> <a href="...">...</a> </bar> EOT ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/eg/hlc�����������������������������������������������������������������������������000755 �000765 �000024 �00000001315 14366103441 016013� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl # This script will assume that the first command line argument # is a file containing HTML, and return a version # where all the tags are converted to lowercase. use strict; use warnings; use HTML::Parser (); HTML::Parser->new( start_h => [\&start_lc, "tokenpos, text"], end_h => [sub { print lc shift }, "text"], default_h => [sub { print shift }, "text"], )->parse_file(shift) || die "Can't open file: $!\n"; sub start_lc { my ($tpos, $text) = @_; for (my $i = 0; $i < @$tpos; $i += 2) { next if $i && ($i / 2) % 2 == 0; # skip attribute values $_ = lc $_ for substr($text, $tpos->[$i], $tpos->[$i + 1]); } print $text; } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/eg/hform���������������������������������������������������������������������������000755 �000765 �000024 �00000004706 14366103441 016367� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl # Print information about forms and their controls present in the HTML. # See also HTML::Form module use strict; use warnings; use HTML::PullParser (); use HTML::Entities qw(decode_entities); use Data::Dumper qw(Dumper); my @FORM_TAGS = qw(form input textarea button select option); my $p = HTML::PullParser->new( file => shift || "xxx.html", start => 'tag, attr', end => 'tag', text => '@{text}', report_tags => \@FORM_TAGS, ) || die "$!"; # a little helper function sub get_text { my ($p, $stop) = @_; my $text; while (defined(my $t = $p->get_token)) { if (ref $t) { $p->unget_token($t) unless $t->[0] eq $stop; last; } else { $text .= $t; } } return $text; } my @forms; while (defined(my $t = $p->get_token)) { next unless ref $t; # skip text if ($t->[0] eq "form") { shift @$t; push(@forms, $t); while (defined(my $t = $p->get_token)) { next unless ref $t; # skip text last if $t->[0] eq "/form"; if ($t->[0] eq "select") { my $sel = $t; push(@{$forms[-1]}, $t); while (defined(my $t = $p->get_token)) { next unless ref $t; # skip text last if $t->[0] eq "/select"; #print "select ", Dumper($t), "\n"; if ($t->[0] eq "option") { my $value = $t->[1]->{value}; my $text = get_text($p, "/option"); unless (defined $value) { $value = decode_entities($text); } push(@$sel, $value); } else { warn "$t->[0] inside select"; } } } elsif ($t->[0] =~ /^\/?option$/) { warn "option tag outside select"; } elsif ($t->[0] eq "textarea") { push(@{$forms[-1]}, $t); $t->[1]{value} = get_text($p, "/textarea"); } elsif ($t->[0] =~ m,^/,) { warn "stray $t->[0] tag"; } else { push(@{$forms[-1]}, $t); } } } else { warn "form tag $t->[0] outside form"; } } print Dumper(\@forms), "\n"; ����������������������������������������������������������HTML-Parser-3.81/eg/hrefsub�������������������������������������������������������������������������000755 �000765 �000024 �00000005774 14366103441 016720� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl # Perform transformations on link attributes in an HTML document. # Examples: # # $ hrefsub 's/foo/bar/g' index.html # $ hrefsub '$_=URI->new_abs($_, "http://foo")' index.html # # The first argument is a perl expression that might modify $_. # It is called for each link in the document with $_ set to # the original value of the link URI. The variables $tag and # $attr can be used to access the tagname and attributename # within the tag where the current link is found. # # The second argument is the name of a file to process. use strict; use warnings; use HTML::Parser (); use HTML::Tagset (); use URI; # Construct a hash of tag names that may have links. my %link_attr; { # To simplify things, reformat the %HTML::Tagset::linkElements # hash so that it is always a hash of hashes. while (my ($k, $v) = each %HTML::Tagset::linkElements) { if (ref($v)) { $v = {map { $_ => 1 } @$v}; } else { $v = {$v => 1}; } $link_attr{$k} = $v; } # Uncomment this to see what HTML::Tagset::linkElements thinks are # the tags with link attributes #use Data::Dump; Data::Dump::dump(\%link_attr); exit; } # Create a subroutine named 'edit' to perform the operation # passed in from the command line. The code should modify $_ # to change things. my $code = shift; $code = 'sub edit { local $_ = shift; my($attr, $tag) = @_; no strict; ' . ($code // '') . '; $_; }'; #print $code; eval $code; die $@ if $@; # Set up the parser. my $p = HTML::Parser->new(api_version => 3); # The default is to print everything as is. $p->handler(default => sub { print @_ }, "text"); # All links are found in start tags. This handler will evaluate # &edit for each link attribute found. $p->handler( start => sub { my ($tagname, $pos, $text) = @_; if (my $link_attr = $link_attr{$tagname}) { while (4 <= @$pos) { # use attribute sets from right to left # to avoid invalidating the offsets # when replacing the values my ($k_offset, $k_len, $v_offset, $v_len) = splice(@$pos, -4); my $attrname = lc(substr($text, $k_offset, $k_len)); next unless $link_attr->{$attrname}; next unless $v_offset; # 0 v_offset means no value my $v = substr($text, $v_offset, $v_len); $v =~ s/^([\'\"])(.*)\1$/$2/; my $new_v = edit($v, $attrname, $tagname); next if $new_v eq $v; $new_v =~ s/\"/&quot;/g; # since we quote with "" substr($text, $v_offset, $v_len) = qq("$new_v"); } } print $text; }, "tagname, tokenpos, text" ); # Parse the file passed in from the command line my $file = shift || usage(); $p->parse_file($file) || die "Can't open file $file: $!\n"; sub usage { my $progname = $0; $progname =~ s,^.*/,,; die "Usage: $progname <perlexpr> <filename>\n"; } ����HTML-Parser-3.81/eg/hstrip��������������������������������������������������������������������������000755 �000765 �000024 �00000003347 14366103441 016565� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl # This script cleans up an HTML document use strict; use warnings; use HTML::Parser (); # configure these values my @ignore_attr = qw(bgcolor background color face style link alink vlink text onblur onchange onclick ondblclick onfocus onkeydown onkeyup onload onmousedown onmousemove onmouseout onmouseover onmouseup onreset onselect onunload ); my @ignore_tags = qw(font big small b i); my @ignore_elements = qw(script style); # make it easier to look up attributes my %ignore_attr = map { $_ => 1 } @ignore_attr; sub tag { my ($pos, $text) = @_; if (@$pos >= 4) { # kill some attributes my ($k_offset, $k_len, $v_offset, $v_len) = @{$pos}[-4 .. -1]; my $next_attr = $v_offset ? $v_offset + $v_len : $k_offset + $k_len; my $edited; while (@$pos >= 4) { ($k_offset, $k_len, $v_offset, $v_len) = splice @$pos, -4; if ($ignore_attr{lc substr($text, $k_offset, $k_len)}) { substr($text, $k_offset, $next_attr - $k_offset) = ""; $edited++; } $next_attr = $k_offset; } # if we killed all attributed, kill any extra whitespace too $text =~ s/^(<\w+)\s+>$/$1>/ if $edited; } print $text; } sub decl { my $type = shift; print shift if $type eq "doctype"; } sub text { print shift; } HTML::Parser->new( api_version => 3, start_h => [\&tag, "tokenpos, text"], process_h => ["", ""], comment_h => ["", ""], declaration_h => [\&decl, "tagname, text"], default_h => [\&text, "text"], ignore_tags => \@ignore_tags, ignore_elements => \@ignore_elements, )->parse_file(shift) || die "Can't open file: $!\n"; �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/eg/hdump���������������������������������������������������������������������������000755 �000765 �000024 �00000001304 14366103441 016360� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl # This script will output event information as it parses the HTML document. # This gives the user a "Parser's eye view" of an HTML document. use strict; use warnings; use HTML::Parser (); use Data::Dumper qw(Dumper); sub h { my ($event, $line, $column, $text, $tagname, $attr) = @_; my @d = (uc(substr($event, 0, 1)) . " L$line C$column"); substr($text, 40) = "..." if length($text) > 40; push(@d, $text); push(@d, $tagname) if defined $tagname; push(@d, $attr) if $attr; print Dumper(@d), "\n"; } my $p = HTML::Parser->new(api_version => 3); $p->handler(default => \&h, "event, line, column, text, tagname, attr"); $p->parse_file(@ARGV ? shift : *STDIN); ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/eg/htitle��������������������������������������������������������������������������000755 �000765 �000024 �00000000715 14366103441 016541� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl # This program will print out the title of an HTML document. use strict; use warnings; use HTML::Parser (); sub title_handler { my $self = shift; $self->handler(text => sub { print @_ }, "dtext"); $self->handler(end => "eof", "self"); } my $p = HTML::Parser->new( api_version => 3, start_h => [\&title_handler, "self"], report_tags => ['title'], ); $p->parse_file(shift || die) || die $!; print "\n"; ���������������������������������������������������HTML-Parser-3.81/eg/htextsub������������������������������������������������������������������������000755 �000765 �000024 �00000001650 14366103441 017115� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl # Shows how to mangle all plain text in an HTML document, using an arbitrary # Perl expression. Plain text is all text not within a tag declaration, i.e. # not in <p ...>, but possibly between <p> and </p> # Example (Reverse 'Debian' in all text) : # lynx -dump -source -raw http://www/debian.org > /tmp/a.txt # ./htextsub '$_ =~ s/Debian/Naibed/gi' /tmp/a.txt use strict; use warnings; use HTML::Parser (); my $code = shift || usage(); $code = 'sub edit_print { local $_ = shift; ' . $code . '; print }'; #print $code; eval $code; die $@ if $@; my $p = HTML::Parser->new( unbroken_text => 1, default_h => [sub { print @_; }, "text"], text_h => [\&edit_print, "text"], ); my $file = shift || usage(); $p->parse_file($file) || die "Can't open file $file: $!\n"; sub usage { my $progname = $0; $progname =~ s,^.*/,,; die "Usage: $progname <perlexpr> <filename>\n"; } ����������������������������������������������������������������������������������������HTML-Parser-3.81/eg/htext���������������������������������������������������������������������������000755 �000765 �000024 �00000001144 14366103441 016401� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl # Extract all plain text from an HTML file use strict; use warnings; use Encode (); use HTML::Parser (); my %inside; sub tag { my ($tag, $num) = @_; $inside{$tag} += $num; print " "; # not for all tags } sub text { return if $inside{script} || $inside{style}; print encode('utf8', $_[0]); } HTML::Parser->new( api_version => 3, handlers => [ start => [\&tag, "tagname, '+1'"], end => [\&tag, "tagname, '-1'"], text => [\&text, "dtext"], ], marked_sections => 1, )->parse_file(shift) || die "Can't open file: $!\n"; ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/eg/hanchors������������������������������������������������������������������������000755 �000765 �000024 �00000002254 14366103441 017055� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl use strict; use warnings; # This program will print out all <a href=".."> links in a # document together with the text that goes with it. # # See also HTML::LinkExtor use Encode; use HTML::Parser; my $p = HTML::Parser->new( api_version => 3, start_h => [\&a_start_handler, "self,tagname,attr"], report_tags => [qw(a img)], ); $p->parse_file(shift || die) || die $!; sub a_start_handler { my ($self, $tag, $attr) = @_; return unless $tag eq "a"; return unless exists $attr->{href}; print "A $attr->{href}\n"; $self->handler(text => [], '@{dtext}'); $self->handler(start => \&img_handler); $self->handler(end => \&a_end_handler, "self,tagname"); } sub img_handler { my ($self, $tag, $attr) = @_; return unless $tag eq "img"; push(@{$self->handler("text")}, $attr->{alt} || "[IMG]"); } sub a_end_handler { my ($self, $tag) = @_; my $text = encode('utf8', join("", @{$self->handler("text")})); $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; print "T $text\n"; $self->handler("text", undef); $self->handler("start", \&a_start_handler); $self->handler("end", undef); } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/lib/HTML/��������������������������������������������������������������������������000755 �000765 �000024 �00000000000 14366103441 016236� 5����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/lib/HTML/LinkExtor.pm��������������������������������������������������������������000644 �000765 �000024 �00000010630 14366103441 020513� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::LinkExtor; require HTML::Parser; our @ISA = qw(HTML::Parser); our $VERSION = '3.81'; =head1 NAME HTML::LinkExtor - Extract links from an HTML document =head1 SYNOPSIS require HTML::LinkExtor; $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/"); sub cb { my($tag, %links) = @_; print "$tag @{[%links]}\n"; } $p->parse_file("index.html"); =head1 DESCRIPTION I<HTML::LinkExtor> is an HTML parser that extracts links from an HTML document. The I<HTML::LinkExtor> is a subclass of I<HTML::Parser>. This means that the document should be given to the parser by calling the $p->parse() or $p->parse_file() methods. =cut use strict; use HTML::Tagset (); # legacy (some applications grabs this hash directly) our %LINK_ELEMENT; *LINK_ELEMENT = \%HTML::Tagset::linkElements; =over 4 =item $p = HTML::LinkExtor->new =item $p = HTML::LinkExtor->new( $callback ) =item $p = HTML::LinkExtor->new( $callback, $base ) The constructor takes two optional arguments. The first is a reference to a callback routine. It will be called as links are found. If a callback is not provided, then links are just accumulated internally and can be retrieved by calling the $p->links() method. The $base argument is an optional base URL used to absolutize all URLs found. You need to have the I<URI> module installed if you provide $base. The callback is called with the lowercase tag name as first argument, and then all link attributes as separate key/value pairs. All non-link attributes are removed. =cut sub new { my($class, $cb, $base) = @_; my $self = $class->SUPER::new( start_h => ["_start_tag", "self,tagname,attr"], report_tags => [keys %HTML::Tagset::linkElements], ); $self->{extractlink_cb} = $cb; if ($base) { require URI; $self->{extractlink_base} = URI->new($base); } $self; } sub _start_tag { my($self, $tag, $attr) = @_; my $base = $self->{extractlink_base}; my $links = $HTML::Tagset::linkElements{$tag}; $links = [$links] unless ref $links; my @links; my $a; for $a (@$links) { next unless exists $attr->{$a}; (my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5 push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link); } return unless @links; $self->_found_link($tag, @links); } sub _found_link { my $self = shift; my $cb = $self->{extractlink_cb}; if ($cb) { &$cb(@_); } else { push(@{$self->{'links'}}, [@_]); } } =item $p->links Returns a list of all links found in the document. The returned values will be anonymous arrays with the following elements: [$tag, $attr => $url1, $attr2 => $url2,...] The $p->links method will also truncate the internal link list. This means that if the method is called twice without any parsing between them the second call will return an empty list. Also note that $p->links will always be empty if a callback routine was provided when the I<HTML::LinkExtor> was created. =cut sub links { my $self = shift; exists($self->{'links'}) ? @{delete $self->{'links'}} : (); } # We override the parse_file() method so that we can clear the links # before we start a new file. sub parse_file { my $self = shift; delete $self->{'links'}; $self->SUPER::parse_file(@_); } =back =head1 EXAMPLE This is an example showing how you can extract links from a document received using LWP: use LWP::UserAgent; use HTML::LinkExtor; use URI::URL; $url = "http://www.perl.org/"; # for instance $ua = LWP::UserAgent->new; # Set up a callback that collect image links my @imgs = (); sub callback { my($tag, %attr) = @_; return if $tag ne 'img'; # we only look closer at <img ...> push(@imgs, values %attr); } # Make the parser. Unfortunately, we don't know the base yet # (it might be different from $url) $p = HTML::LinkExtor->new(\&callback); # Request document and parse it as it arrives $res = $ua->request(HTTP::Request->new(GET => $url), sub {$p->parse($_[0])}); # Expand all image URLs to absolute ones my $base = $res->base; @imgs = map { $_ = url($_, $base)->abs; } @imgs; # Print them out print join("\n", @imgs), "\n"; =head1 SEE ALSO L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL> =head1 COPYRIGHT Copyright 1996-2001 Gisle Aas. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1; ��������������������������������������������������������������������������������������������������������HTML-Parser-3.81/lib/HTML/Entities.pm���������������������������������������������������������������000644 �000765 �000024 �00000035155 14366103441 020371� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::Entities; =encoding utf8 =head1 NAME HTML::Entities - Encode or decode strings with HTML entities =head1 SYNOPSIS use HTML::Entities; $a = "V&aring;re norske tegn b&oslash;r &#230res"; decode_entities($a); encode_entities($a, "\200-\377"); For example, this: $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé"; print encode_entities($input), "\n" Prints this out: vis-&agrave;-vis Beyonc&eacute;'s na&iuml;ve papier-m&acirc;ch&eacute; r&eacute;sum&eacute; =head1 DESCRIPTION This module deals with encoding and decoding of strings with HTML character entities. The module provides the following functions: =over 4 =item decode_entities( $string, ... ) This routine replaces HTML entities found in the $string with the corresponding Unicode character. Unrecognized entities are left alone. If multiple strings are provided as argument they are each decoded separately and the same number of strings are returned. If called in void context the arguments are decoded in-place. This routine is exported by default. =item _decode_entities( $string, \%entity2char ) =item _decode_entities( $string, \%entity2char, $expand_prefix ) This will in-place replace HTML entities in $string. The %entity2char hash must be provided. Named entities not found in the %entity2char hash are left alone. Numeric entities are expanded unless their value overflow. The keys in %entity2char are the entity names to be expanded and their values are what they should expand into. The values do not have to be single character strings. If a key has ";" as suffix, then occurrences in $string are only expanded if properly terminated with ";". Entities without ";" will be expanded regardless of how they are terminated for compatibility with how common browsers treat entities in the Latin-1 range. If $expand_prefix is TRUE then entities without trailing ";" in %entity2char will even be expanded as a prefix of a longer unrecognized name. The longest matching name in %entity2char will be used. This is mainly present for compatibility with an MSIE misfeature. $string = "foo&nbspbar"; _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1); print $string; # will print "foo bar" This routine is exported by default. =item encode_entities( $string ) =item encode_entities( $string, $unsafe_chars ) This routine replaces unsafe characters in $string with their entity representation. A second argument can be given to specify which characters to consider unsafe. The unsafe characters is specified using the regular expression character class syntax (what you find within brackets in regular expressions). The default set of characters to encode are control chars, high-bit chars, and the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this, for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters: $encoded = encode_entities($input, '<>&"'); and this would only encode non-plain ASCII: $encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e'); This routine is exported by default. =item encode_entities_numeric( $string ) =item encode_entities_numeric( $string, $unsafe_chars ) This routine works just like encode_entities, except that the replacement entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le". This routine is I<not> exported by default. But you can always export it with C<use HTML::Entities qw(encode_entities_numeric);> or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);> =back All these routines modify the string passed as the first argument, if called in a void context. In scalar and array contexts, the encoded or decoded string is returned (without changing the input string). If you prefer not to import these routines into your namespace, you can call them as: use HTML::Entities (); $decoded = HTML::Entities::decode($a); $encoded = HTML::Entities::encode($a); $encoded = HTML::Entities::encode_numeric($a); The module can also export the %char2entity and the %entity2char hashes, which contain the mapping from all characters to the corresponding entities (and vice versa, respectively). =head1 COPYRIGHT Copyright 1995-2006 Gisle Aas. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut use strict; our $VERSION = '3.81'; our (%entity2char, %char2entity); require 5.004; require Exporter; our @ISA = qw(Exporter); our @EXPORT = qw(encode_entities decode_entities _decode_entities); our @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric); sub Version { $VERSION; } require HTML::Parser; # for fast XS implemented decode_entities %entity2char = ( # Some normal chars that have special meaning in SGML context amp => '&', # ampersand 'gt' => '>', # greater than 'lt' => '<', # less than quot => '"', # double quote apos => "'", # single quote # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML AElig => chr(198), # capital AE diphthong (ligature) Aacute => chr(193), # capital A, acute accent Acirc => chr(194), # capital A, circumflex accent Agrave => chr(192), # capital A, grave accent Aring => chr(197), # capital A, ring Atilde => chr(195), # capital A, tilde Auml => chr(196), # capital A, dieresis or umlaut mark Ccedil => chr(199), # capital C, cedilla ETH => chr(208), # capital Eth, Icelandic Eacute => chr(201), # capital E, acute accent Ecirc => chr(202), # capital E, circumflex accent Egrave => chr(200), # capital E, grave accent Euml => chr(203), # capital E, dieresis or umlaut mark Iacute => chr(205), # capital I, acute accent Icirc => chr(206), # capital I, circumflex accent Igrave => chr(204), # capital I, grave accent Iuml => chr(207), # capital I, dieresis or umlaut mark Ntilde => chr(209), # capital N, tilde Oacute => chr(211), # capital O, acute accent Ocirc => chr(212), # capital O, circumflex accent Ograve => chr(210), # capital O, grave accent Oslash => chr(216), # capital O, slash Otilde => chr(213), # capital O, tilde Ouml => chr(214), # capital O, dieresis or umlaut mark THORN => chr(222), # capital THORN, Icelandic Uacute => chr(218), # capital U, acute accent Ucirc => chr(219), # capital U, circumflex accent Ugrave => chr(217), # capital U, grave accent Uuml => chr(220), # capital U, dieresis or umlaut mark Yacute => chr(221), # capital Y, acute accent aacute => chr(225), # small a, acute accent acirc => chr(226), # small a, circumflex accent aelig => chr(230), # small ae diphthong (ligature) agrave => chr(224), # small a, grave accent aring => chr(229), # small a, ring atilde => chr(227), # small a, tilde auml => chr(228), # small a, dieresis or umlaut mark ccedil => chr(231), # small c, cedilla eacute => chr(233), # small e, acute accent ecirc => chr(234), # small e, circumflex accent egrave => chr(232), # small e, grave accent eth => chr(240), # small eth, Icelandic euml => chr(235), # small e, dieresis or umlaut mark iacute => chr(237), # small i, acute accent icirc => chr(238), # small i, circumflex accent igrave => chr(236), # small i, grave accent iuml => chr(239), # small i, dieresis or umlaut mark ntilde => chr(241), # small n, tilde oacute => chr(243), # small o, acute accent ocirc => chr(244), # small o, circumflex accent ograve => chr(242), # small o, grave accent oslash => chr(248), # small o, slash otilde => chr(245), # small o, tilde ouml => chr(246), # small o, dieresis or umlaut mark szlig => chr(223), # small sharp s, German (sz ligature) thorn => chr(254), # small thorn, Icelandic uacute => chr(250), # small u, acute accent ucirc => chr(251), # small u, circumflex accent ugrave => chr(249), # small u, grave accent uuml => chr(252), # small u, dieresis or umlaut mark yacute => chr(253), # small y, acute accent yuml => chr(255), # small y, dieresis or umlaut mark # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96) copy => chr(169), # copyright sign reg => chr(174), # registered sign nbsp => chr(160), # non breaking space # Additional ISO-8859/1 entities listed in rfc1866 (section 14) iexcl => chr(161), cent => chr(162), pound => chr(163), curren => chr(164), yen => chr(165), brvbar => chr(166), sect => chr(167), uml => chr(168), ordf => chr(170), laquo => chr(171), 'not' => chr(172), # not is a keyword in perl shy => chr(173), macr => chr(175), deg => chr(176), plusmn => chr(177), sup1 => chr(185), sup2 => chr(178), sup3 => chr(179), acute => chr(180), micro => chr(181), para => chr(182), middot => chr(183), cedil => chr(184), ordm => chr(186), raquo => chr(187), frac14 => chr(188), frac12 => chr(189), frac34 => chr(190), iquest => chr(191), 'times' => chr(215), # times is a keyword in perl divide => chr(247), ( $] > 5.007 ? ( 'OElig;' => chr(338), 'oelig;' => chr(339), 'Scaron;' => chr(352), 'scaron;' => chr(353), 'Yuml;' => chr(376), 'fnof;' => chr(402), 'circ;' => chr(710), 'tilde;' => chr(732), 'Alpha;' => chr(913), 'Beta;' => chr(914), 'Gamma;' => chr(915), 'Delta;' => chr(916), 'Epsilon;' => chr(917), 'Zeta;' => chr(918), 'Eta;' => chr(919), 'Theta;' => chr(920), 'Iota;' => chr(921), 'Kappa;' => chr(922), 'Lambda;' => chr(923), 'Mu;' => chr(924), 'Nu;' => chr(925), 'Xi;' => chr(926), 'Omicron;' => chr(927), 'Pi;' => chr(928), 'Rho;' => chr(929), 'Sigma;' => chr(931), 'Tau;' => chr(932), 'Upsilon;' => chr(933), 'Phi;' => chr(934), 'Chi;' => chr(935), 'Psi;' => chr(936), 'Omega;' => chr(937), 'alpha;' => chr(945), 'beta;' => chr(946), 'gamma;' => chr(947), 'delta;' => chr(948), 'epsilon;' => chr(949), 'zeta;' => chr(950), 'eta;' => chr(951), 'theta;' => chr(952), 'iota;' => chr(953), 'kappa;' => chr(954), 'lambda;' => chr(955), 'mu;' => chr(956), 'nu;' => chr(957), 'xi;' => chr(958), 'omicron;' => chr(959), 'pi;' => chr(960), 'rho;' => chr(961), 'sigmaf;' => chr(962), 'sigma;' => chr(963), 'tau;' => chr(964), 'upsilon;' => chr(965), 'phi;' => chr(966), 'chi;' => chr(967), 'psi;' => chr(968), 'omega;' => chr(969), 'thetasym;' => chr(977), 'upsih;' => chr(978), 'piv;' => chr(982), 'ensp;' => chr(8194), 'emsp;' => chr(8195), 'thinsp;' => chr(8201), 'zwnj;' => chr(8204), 'zwj;' => chr(8205), 'lrm;' => chr(8206), 'rlm;' => chr(8207), 'ndash;' => chr(8211), 'mdash;' => chr(8212), 'lsquo;' => chr(8216), 'rsquo;' => chr(8217), 'sbquo;' => chr(8218), 'ldquo;' => chr(8220), 'rdquo;' => chr(8221), 'bdquo;' => chr(8222), 'dagger;' => chr(8224), 'Dagger;' => chr(8225), 'bull;' => chr(8226), 'hellip;' => chr(8230), 'permil;' => chr(8240), 'prime;' => chr(8242), 'Prime;' => chr(8243), 'lsaquo;' => chr(8249), 'rsaquo;' => chr(8250), 'oline;' => chr(8254), 'frasl;' => chr(8260), 'euro;' => chr(8364), 'image;' => chr(8465), 'weierp;' => chr(8472), 'real;' => chr(8476), 'trade;' => chr(8482), 'alefsym;' => chr(8501), 'larr;' => chr(8592), 'uarr;' => chr(8593), 'rarr;' => chr(8594), 'darr;' => chr(8595), 'harr;' => chr(8596), 'crarr;' => chr(8629), 'lArr;' => chr(8656), 'uArr;' => chr(8657), 'rArr;' => chr(8658), 'dArr;' => chr(8659), 'hArr;' => chr(8660), 'forall;' => chr(8704), 'part;' => chr(8706), 'exist;' => chr(8707), 'empty;' => chr(8709), 'nabla;' => chr(8711), 'isin;' => chr(8712), 'notin;' => chr(8713), 'ni;' => chr(8715), 'prod;' => chr(8719), 'sum;' => chr(8721), 'minus;' => chr(8722), 'lowast;' => chr(8727), 'radic;' => chr(8730), 'prop;' => chr(8733), 'infin;' => chr(8734), 'ang;' => chr(8736), 'and;' => chr(8743), 'or;' => chr(8744), 'cap;' => chr(8745), 'cup;' => chr(8746), 'int;' => chr(8747), 'there4;' => chr(8756), 'sim;' => chr(8764), 'cong;' => chr(8773), 'asymp;' => chr(8776), 'ne;' => chr(8800), 'equiv;' => chr(8801), 'le;' => chr(8804), 'ge;' => chr(8805), 'sub;' => chr(8834), 'sup;' => chr(8835), 'nsub;' => chr(8836), 'sube;' => chr(8838), 'supe;' => chr(8839), 'oplus;' => chr(8853), 'otimes;' => chr(8855), 'perp;' => chr(8869), 'sdot;' => chr(8901), 'lceil;' => chr(8968), 'rceil;' => chr(8969), 'lfloor;' => chr(8970), 'rfloor;' => chr(8971), 'lang;' => chr(9001), 'rang;' => chr(9002), 'loz;' => chr(9674), 'spades;' => chr(9824), 'clubs;' => chr(9827), 'hearts;' => chr(9829), 'diams;' => chr(9830), ) : ()) ); # Make the opposite mapping while (my($entity, $char) = each(%entity2char)) { $entity =~ s/;\z//; $char2entity{$char} = "&$entity;"; } delete $char2entity{"'"}; # only one-way decoding # Fill in missing entities for (0 .. 255) { next if exists $char2entity{chr($_)}; $char2entity{chr($_)} = "&#$_;"; } my %subst; # compiled encoding regexps sub encode_entities { return undef unless defined $_[0]; my $ref; if (defined wantarray) { my $x = $_[0]; $ref = \$x; # copy } else { $ref = \$_[0]; # modify in-place } if (defined $_[1] and length $_[1]) { unless (exists $subst{$_[1]}) { # Because we can't compile regex we fake it with a cached sub my $chars = $_[1]; $chars =~ s,(?<!\\)([]/]),\\$1,g; $chars =~ s,(?<!\\)\\\z,\\\\,; my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }"; $subst{$_[1]} = eval $code; die( $@ . " while trying to turn range: \"$_[1]\"\n " . "into code: $code\n " ) if $@; } &{$subst{$_[1]}}($$ref); } else { # Encode control chars, high bit chars and '<', '&', '>', ''' and '"' $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge; } $$ref; } sub encode_entities_numeric { local %char2entity; return &encode_entities; # a goto &encode_entities wouldn't work } sub num_entity { sprintf "&#x%X;", ord($_[0]); } # Set up aliases *encode = \&encode_entities; *encode_numeric = \&encode_entities_numeric; *encode_numerically = \&encode_entities_numeric; *decode = \&decode_entities; 1; �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/lib/HTML/Filter.pm�����������������������������������������������������������������000644 �000765 �000024 �00000005146 14366103441 020027� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::Filter; use strict; require HTML::Parser; our @ISA = qw(HTML::Parser); our $VERSION = '3.81'; sub declaration { $_[0]->output("<!$_[1]>") } sub process { $_[0]->output($_[2]) } sub comment { $_[0]->output("<!--$_[1]-->") } sub start { $_[0]->output($_[4]) } sub end { $_[0]->output($_[2]) } sub text { $_[0]->output($_[1]) } sub output { print $_[1] } 1; __END__ =head1 NAME HTML::Filter - Filter HTML text through the parser =head1 NOTE B<This module is deprecated.> The C<HTML::Parser> now provides the functionally of C<HTML::Filter> much more efficiently with the C<default> handler. =head1 SYNOPSIS require HTML::Filter; $p = HTML::Filter->new->parse_file("index.html"); =head1 DESCRIPTION C<HTML::Filter> is an HTML parser that by default prints the original text of each HTML element (a slow version of cat(1) basically). The callback methods may be overridden to modify the filtering for some HTML elements and you can override output() method which is called to print the HTML text. C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that the document should be given to the parser by calling the $p->parse() or $p->parse_file() methods. =head1 EXAMPLES The first example is a filter that will remove all comments from an HTML file. This is achieved by simply overriding the comment method to do nothing. package CommentStripper; require HTML::Filter; @ISA=qw(HTML::Filter); sub comment { } # ignore comments The second example shows a filter that will remove any E<lt>TABLE>s found in the HTML file. We specialize the start() and end() methods to count table tags and then make output not happen when inside a table. package TableStripper; require HTML::Filter; @ISA=qw(HTML::Filter); sub start { my $self = shift; $self->{table_seen}++ if $_[0] eq "table"; $self->SUPER::start(@_); } sub end { my $self = shift; $self->SUPER::end(@_); $self->{table_seen}-- if $_[0] eq "table"; } sub output { my $self = shift; unless ($self->{table_seen}) { $self->SUPER::output(@_); } } If you want to collect the parsed text internally you might want to do something like this: package FilterIntoString; require HTML::Filter; @ISA=qw(HTML::Filter); sub output { push(@{$_[0]->{fhtml}}, $_[1]) } sub filtered_html { join("", @{$_[0]->{fhtml}}) } =head1 SEE ALSO L<HTML::Parser> =head1 COPYRIGHT Copyright 1997-1999 Gisle Aas. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/lib/HTML/Parser.pm�����������������������������������������������������������������000644 �000765 �000024 �00000115422 14366103441 020035� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::Parser; use strict; our $VERSION = '3.81'; require HTML::Entities; require XSLoader; XSLoader::load('HTML::Parser', $VERSION); sub new { my $class = shift; my $self = bless {}, $class; return $self->init(@_); } sub init { my $self = shift; $self->_alloc_pstate; my %arg = @_; my $api_version = delete $arg{api_version} || (@_ ? 3 : 2); if ($api_version >= 4) { require Carp; Carp::croak("API version $api_version not supported " . "by HTML::Parser $VERSION"); } if ($api_version < 3) { # Set up method callbacks compatible with HTML-Parser-2.xx $self->handler(text => "text", "self,text,is_cdata"); $self->handler(end => "end", "self,tagname,text"); $self->handler(process => "process", "self,token0,text"); $self->handler(start => "start", "self,tagname,attr,attrseq,text"); $self->handler(comment => sub { my($self, $tokens) = @_; for (@$tokens) { $self->comment($_); } }, "self,tokens"); $self->handler(declaration => sub { my $self = shift; $self->declaration(substr($_[0], 2, -1)); }, "self,text"); } if (my $h = delete $arg{handlers}) { $h = {@$h} if ref($h) eq "ARRAY"; while (my($event, $cb) = each %$h) { $self->handler($event => @$cb); } } # In the end we try to assume plain attribute or handler while (my($option, $val) = each %arg) { if ($option =~ /^(\w+)_h$/) { $self->handler($1 => @$val); } elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) { require Carp; Carp::croak("Bad constructor option '$option'"); } else { $self->$option($val); } } return $self; } sub parse_file { my($self, $file) = @_; my $opened; if (!ref($file) && ref(\$file) ne "GLOB") { # Assume $file is a filename local(*F); open(F, "<", $file) || return undef; binmode(F); # should we? good for byte counts $opened++; $file = *F; } my $chunk = ''; while (read($file, $chunk, 512)) { $self->parse($chunk) || last; } close($file) if $opened; $self->eof; } sub netscape_buggy_comment # legacy { my $self = shift; require Carp; Carp::carp("netscape_buggy_comment() is deprecated. " . "Please use the strict_comment() method instead"); my $old = !$self->strict_comment; $self->strict_comment(!shift) if @_; return $old; } # set up method stubs sub text { } *start = \&text; *end = \&text; *comment = \&text; *declaration = \&text; *process = \&text; 1; __END__ =head1 NAME HTML::Parser - HTML parser class =head1 SYNOPSIS use strict; use warnings; use HTML::Parser (); # Create parser object my $p = HTML::Parser->new( api_version => 3, start_h => [\&start, "tagname, attr"], end_h => [\&end, "tagname"], marked_sections => 1, ); # Parse document text chunk by chunk $p->parse($chunk1); $p->parse($chunk2); # ... # signal end of document $p->eof; # Parse directly from file $p->parse_file("foo.html"); # or open(my $fh, "<:utf8", "foo.html") || die; $p->parse_file($fh); =head1 DESCRIPTION Objects of the C<HTML::Parser> class will recognize markup and separate it from plain text (alias data content) in HTML documents. As different kinds of markup and text are recognized, the corresponding event handlers are invoked. C<HTML::Parser> is not a generic SGML parser. We have tried to make it able to deal with the HTML that is actually "out there", and it normally parses as closely as possible to the way the popular web browsers do it instead of strictly following one of the many HTML specifications from W3C. Where there is disagreement, there is often an option that you can enable to get the official behaviour. The document to be parsed may be supplied in arbitrary chunks. This makes on-the-fly parsing as documents are received from the network possible. If event driven parsing does not feel right for your application, you might want to use C<HTML::PullParser>. This is an C<HTML::Parser> subclass that allows a more conventional program structure. =head1 METHODS The following method is used to construct a new C<HTML::Parser> object: =over =item $p = HTML::Parser->new( %options_and_handlers ) This class method creates a new C<HTML::Parser> object and returns it. Key/value argument pairs may be provided to assign event handlers or initialize parser options. The handlers and parser options can also be set or modified later by the method calls described below. If a top level key is in the form "<event>_h" (e.g., "text_h") then it assigns a handler to that event, otherwise it initializes a parser option. The event handler specification value must be an array reference. Multiple handlers may also be assigned with the 'handlers => [%handlers]' option. See examples below. If new() is called without any arguments, it will create a parser that uses callback methods compatible with version 2 of C<HTML::Parser>. See the section on "version 2 compatibility" below for details. The special constructor option 'api_version => 2' can be used to initialize version 2 callbacks while still setting other options and handlers. The 'api_version => 3' option can be used if you don't want to set any options and don't want to fall back to v2 compatible mode. Examples: $p = HTML::Parser->new( api_version => 3, text_h => [ sub {...}, "dtext" ] ); This creates a new parser object with a text event handler subroutine that receives the original text with general entities decoded. $p = HTML::Parser->new( api_version => 3, start_h => [ 'my_start', "self,tokens" ] ); This creates a new parser object with a start event handler method that receives the $p and the tokens array. $p = HTML::Parser->new( api_version => 3, handlers => { text => [\@array, "event,text"], comment => [\@array, "event,text"], } ); This creates a new parser object that stores the event type and the original text in @array for text and comment events. =back The following methods feed the HTML document to the C<HTML::Parser> object: =over =item $p->parse( $string ) Parse $string as the next chunk of the HTML document. Handlers invoked should not attempt to modify the $string in-place until $p->parse returns. If an invoked event handler aborts parsing by calling $p->eof, then $p->parse() will return a FALSE value. Otherwise the return value is a reference to the parser object ($p). =item $p->parse( $code_ref ) If a code reference is passed as the argument to be parsed, then the chunks to be parsed are obtained by invoking this function repeatedly. Parsing continues until the function returns an empty (or undefined) result. When this happens $p->eof is automatically signaled. Parsing will also abort if one of the event handlers calls $p->eof. The effect of this is the same as: while (1) { my $chunk = &$code_ref(); if (!defined($chunk) || !length($chunk)) { $p->eof; return $p; } $p->parse($chunk) || return undef; } But it is more efficient as this loop runs internally in XS code. =item $p->parse_file( $file ) Parse text directly from a file. The $file argument can be a filename, an open file handle, or a reference to an open file handle. If $file contains a filename and the file can't be opened, then the method returns an undefined value and $! tells why it failed. Otherwise the return value is a reference to the parser object. If a file handle is passed as the $file argument, then the file will normally be read until EOF, but not closed. If an invoked event handler aborts parsing by calling $p->eof, then $p->parse_file() may not have read the entire file. On systems with multi-byte line terminators, the values passed for the offset and length argspecs may be too low if parse_file() is called on a file handle that is not in binary mode. If a filename is passed in, then parse_file() will open the file in binary mode. =item $p->eof Signals the end of the HTML document. Calling the $p->eof method outside a handler callback will flush any remaining buffered text (which triggers the C<text> event if there is any remaining text). Calling $p->eof inside a handler will terminate parsing at that point and cause $p->parse to return a FALSE value. This also terminates parsing by $p->parse_file(). After $p->eof has been called, the parse() and parse_file() methods can be invoked to feed new documents with the parser object. The return value from eof() is a reference to the parser object. =back Most parser options are controlled by boolean attributes. Each boolean attribute is enabled by calling the corresponding method with a TRUE argument and disabled with a FALSE argument. The attribute value is left unchanged if no argument is given. The return value from each method is the old attribute value. Methods that can be used to get and/or set parser options are: =over =item $p->attr_encoded =item $p->attr_encoded( $bool ) By default, the C<attr> and C<@attr> argspecs will have general entities for attribute values decoded. Enabling this attribute leaves entities alone. =item $p->backquote =item $p->backquote( $bool ) By default, only ' and " are recognized as quote characters around attribute values. MSIE also recognizes backquotes for some reason. Enabling this attribute provides compatibility with this behaviour. =item $p->boolean_attribute_value( $val ) This method sets the value reported for boolean attributes inside HTML start tags. By default, the name of the attribute is also used as its value. This affects the values reported for C<tokens> and C<attr> argspecs. =item $p->case_sensitive =item $p->case_sensitive( $bool ) By default, tag names and attribute names are down-cased. Enabling this attribute leaves them as found in the HTML source document. =item $p->closing_plaintext =item $p->closing_plaintext( $bool ) By default, C<plaintext> element can never be closed. Everything up to the end of the document is parsed in CDATA mode. This historical behaviour is what at least MSIE does. Enabling this attribute makes closing C< </plaintext> > tag effective and the parsing process will resume after seeing this tag. This emulates early gecko-based browsers. =item $p->empty_element_tags =item $p->empty_element_tags( $bool ) By default, empty element tags are not recognized as such and the "/" before ">" is just treated like a normal name character (unless C<strict_names> is enabled). Enabling this attribute make C<HTML::Parser> recognize these tags. Empty element tags look like start tags, but end with the character sequence "/>" instead of ">". When recognized by C<HTML::Parser> they cause an artificial end event in addition to the start event. The C<text> for the artificial end event will be empty and the C<tokenpos> array will be undefined even though the token array will have one element containing the tag name. =item $p->marked_sections =item $p->marked_sections( $bool ) By default, section markings like <![CDATA[...]]> are treated like ordinary text. When this attribute is enabled section markings are honoured. There are currently no events associated with the marked section markup, but the text can be returned as C<skipped_text>. =item $p->strict_comment =item $p->strict_comment( $bool ) By default, comments are terminated by the first occurrence of "-->". This is the behaviour of most popular browsers (like Mozilla, Opera and MSIE), but it is not correct according to the official HTML standard. Officially, you need an even number of "--" tokens before the closing ">" is recognized and there may not be anything but whitespace between an even and an odd "--". The official behaviour is enabled by enabling this attribute. Enabling of 'strict_comment' also disables recognizing these forms as comments: </ comment> <! comment> =item $p->strict_end =item $p->strict_end( $bool ) By default, attributes and other junk are allowed to be present on end tags in a manner that emulates MSIE's behaviour. The official behaviour is enabled with this attribute. If enabled, only whitespace is allowed between the tagname and the final ">". =item $p->strict_names =item $p->strict_names( $bool ) By default, almost anything is allowed in tag and attribute names. This is the behaviour of most popular browsers and allows us to parse some broken tags with invalid attribute values like: <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> By default, "LIST]" is parsed as a boolean attribute, not as part of the ALT value as was clearly intended. This is also what Mozilla sees. The official behaviour is enabled by enabling this attribute. If enabled, it will cause the tag above to be reported as text since "LIST]" is not a legal attribute name. =item $p->unbroken_text =item $p->unbroken_text( $bool ) By default, blocks of text are given to the text handler as soon as possible (but the parser takes care always to break text at a boundary between whitespace and non-whitespace so single words and entities can always be decoded safely). This might create breaks that make it hard to do transformations on the text. When this attribute is enabled, blocks of text are always reported in one piece. This will delay the text event until the following (non-text) event has been recognized by the parser. Note that the C<offset> argspec will give you the offset of the first segment of text and C<length> is the combined length of the segments. Since there might be ignored tags in between, these numbers can't be used to directly index in the original document file. =item $p->utf8_mode =item $p->utf8_mode( $bool ) Enable this option when parsing raw undecoded UTF-8. This tells the parser that the entities expanded for strings reported by C<attr>, C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end up compatible with the surrounding text. If C<utf8_mode> is enabled then it is an error to pass strings containing characters with code above 255 to the parse() method, and the parse() method will croak if you try. Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8 encoded. The character can also be represented by the entity "&hearts;" or "&#x2665". If we feed the parser: $p->parse("\xE2\x99\xA5&hearts;"); then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled. The later string is what you want. This option is only available with perl-5.8 or better. =item $p->xml_mode =item $p->xml_mode( $bool ) Enabling this attribute changes the parser to allow some XML constructs. This enables the behaviour controlled by individually by the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and C<xml_pic> attributes and also suppresses special treatment of elements that are parsed as CDATA for HTML. =item $p->xml_pic =item $p->xml_pic( $bool ) By default, I<processing instructions> are terminated by ">". When this attribute is enabled, processing instructions are terminated by "?>" instead. =back As markup and text is recognized, handlers are invoked. The following method is used to set up handlers for different events: =over =item $p->handler( event => \&subroutine, $argspec ) =item $p->handler( event => $method_name, $argspec ) =item $p->handler( event => \@accum, $argspec ) =item $p->handler( event => "" ); =item $p->handler( event => undef ); =item $p->handler( event ); This method assigns a subroutine, method, or array to handle an event. Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>, C<process>, C<start_document>, C<end_document> or C<default>. The C<\&subroutine> is a reference to a subroutine which is called to handle the event. The C<$method_name> is the name of a method of $p which is called to handle the event. The C<@accum> is an array that will hold the event information as sub-arrays. If the second argument is "", the event is ignored. If it is undef, the default handler is invoked for the event. The C<$argspec> is a string that describes the information to be reported for the event. Any requested information that does not apply to a specific event is passed as C<undef>. If argspec is omitted, then it is left unchanged. The return value from $p->handler is the old callback routine or a reference to the accumulator array. Any return values from handler callback routines/methods are always ignored. A handler callback can request parsing to be aborted by invoking the $p->eof method. A handler callback is not allowed to invoke the $p->parse() or $p->parse_file() method. An exception will be raised if it tries. Examples: $p->handler(start => "start", 'self, attr, attrseq, text' ); This causes the "start" method of object C<$p> to be called for 'start' events. The callback signature is C<< $p->start(\%attr, \@attr_seq, $text) >>. $p->handler(start => \&start, 'attr, attrseq, text' ); This causes subroutine start() to be called for 'start' events. The callback signature is start(\%attr, \@attr_seq, $text). $p->handler(start => \@accum, '"S", attr, attrseq, text' ); This causes 'start' event information to be saved in @accum. The array elements will be ['S', \%attr, \@attr_seq, $text]. $p->handler(start => ""); This causes 'start' events to be ignored. It also suppresses invocations of any default handler for start events. It is in most cases equivalent to $p->handler(start => sub {}), but is more efficient. It is different from the empty-sub-handler in that C<skipped_text> is not reset by it. $p->handler(start => undef); This causes no handler to be associated with start events. If there is a default handler it will be invoked. =back Filters based on tags can be set up to limit the number of events reported. The main bottleneck during parsing is often the huge number of callbacks made from the parser. Applying filters can improve performance significantly. The following methods control filters: =over =item $p->ignore_elements( @tags ) Both the C<start> event and the C<end> event as well as any events that would be reported in between are suppressed. The ignored elements can contain nested occurrences of itself. Example: $p->ignore_elements(qw(script style)); The C<script> and C<style> tags will always nest properly since their content is parsed in CDATA mode. For most other tags C<ignore_elements> must be used with caution since HTML is often not I<well formed>. =item $p->ignore_tags( @tags ) Any C<start> and C<end> events involving any of the tags given are suppressed. To reset the filter (i.e. don't suppress any C<start> and C<end> events), call C<ignore_tags> without an argument. =item $p->report_tags( @tags ) Any C<start> and C<end> events involving any of the tags I<not> given are suppressed. To reset the filter (i.e. report all C<start> and C<end> events), call C<report_tags> without an argument. =back Internally, the system has two filter lists, one for C<report_tags> and one for C<ignore_tags>, and both filters are applied. This effectively gives C<ignore_tags> precedence over C<report_tags>. Examples: $p->ignore_tags(qw(style)); $p->report_tags(qw(script style)); results in only C<script> events being reported. =head2 Argspec Argspec is a string containing a comma-separated list that describes the information reported by the event. The following argspec identifier names can be used: =over =item C<attr> Attr causes a reference to a hash of attribute name/value pairs to be passed. Boolean attributes' values are either the value set by $p->boolean_attribute_value, or the attribute name if no value has been set by $p->boolean_attribute_value. This passes undef except for C<start> events. Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute names are forced to lower case. General entities are decoded in the attribute values and one layer of matching quotes enclosing the attribute values is removed. The Unicode character set is assumed for entity decoding. =item C<@attr> Basically the same as C<attr>, but keys and values are passed as individual arguments and the original sequence of the attributes is kept. The parameters passed will be the same as the @attr calculated here: @attr = map { $_ => $attr->{$_} } @$attrseq; assuming $attr and $attrseq here are the hash and array passed as the result of C<attr> and C<attrseq> argspecs. This passes no values for events besides C<start>. =item C<attrseq> Attrseq causes a reference to an array of attribute names to be passed. This can be useful if you want to walk the C<attr> hash in the original sequence. This passes undef except for C<start> events. Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute names are forced to lower case. =item C<column> Column causes the column number of the start of the event to be passed. The first column on a line is 0. =item C<dtext> Dtext causes the decoded text to be passed. General entities are automatically decoded unless the event was inside a CDATA section or was between literal start and end tags (C<script>, C<style>, C<xmp>, C<iframe>, C<title>, C<textarea> and C<plaintext>). The Unicode character set is assumed for entity decoding. With Perl version 5.6 or earlier only the Latin-1 range is supported, and entities for characters outside the range 0..255 are left unchanged. This passes undef except for C<text> events. =item C<event> Event causes the event name to be passed. The event name is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>, C<process>, C<start_document> or C<end_document>. =item C<is_cdata> Is_cdata causes a TRUE value to be passed if the event is inside a CDATA section or between literal start and end tags (C<script>, C<style>, C<xmp>, C<iframe>, C<title>, C<textarea> and C<plaintext>). if the flag is FALSE for a text event, then you should normally either use C<dtext> or decode the entities yourself before the text is processed further. =item C<length> Length causes the number of bytes of the source text of the event to be passed. =item C<line> Line causes the line number of the start of the event to be passed. The first line in the document is 1. Line counting doesn't start until at least one handler requests this value to be reported. =item C<offset> Offset causes the byte position in the HTML document of the start of the event to be passed. The first byte in the document has offset 0. =item C<offset_end> Offset_end causes the byte position in the HTML document of the end of the event to be passed. This is the same as C<offset> + C<length>. =item C<self> Self causes the current object to be passed to the handler. If the handler is a method, this must be the first element in the argspec. An alternative to passing self as an argspec is to register closures that capture $self by themselves as handlers. Unfortunately this creates circular references which prevent the HTML::Parser object from being garbage collected. Using the C<self> argspec avoids this problem. =item C<skipped_text> Skipped_text returns the concatenated text of all the events that have been skipped since the last time an event was reported. Events might be skipped because no handler is registered for them or because some filter applies. Skipped text also includes marked section markup, since there are no events that can catch it. If an C<"">-handler is registered for an event, then the text for this event is not included in C<skipped_text>. Skipped text both before and after the C<"">-event is included in the next reported C<skipped_text>. =item C<tag> Same as C<tagname>, but prefixed with "/" if it belongs to an C<end> event and "!" for a declaration. The C<tag> does not have any prefix for C<start> events, and is in this case identical to C<tagname>. =item C<tagname> This is the element name (or I<generic identifier> in SGML jargon) for start and end tags. Since HTML is case insensitive, this name is forced to lower case to ease string matching. Since XML is case sensitive, the tagname case is not changed when C<xml_mode> is enabled. The same happens if the C<case_sensitive> attribute is set. The declaration type of declaration elements is also passed as a tagname, even if that is a bit strange. In fact, in the current implementation tagname is identical to C<token0> except that the name may be forced to lower case. =item C<token0> Token0 causes the original text of the first token string to be passed. This should always be the same as $tokens->[0]. For C<declaration> events, this is the declaration type. For C<start> and C<end> events, this is the tag name. For C<process> and non-strict C<comment> events, this is everything inside the tag. This passes undef if there are no tokens in the event. =item C<tokenpos> Tokenpos causes a reference to an array of token positions to be passed. For each string that appears in C<tokens>, this array contains two numbers. The first number is the offset of the start of the token in the original C<text> and the second number is the length of the token. Boolean attributes in a C<start> event will have (0,0) for the attribute value offset and length. This passes undef if there are no tokens in the event (e.g., C<text>) and for artificial C<end> events triggered by empty element tags. If you are using these offsets and lengths to modify C<text>, you should either work from right to left, or be very careful to calculate the changes to the offsets. =item C<tokens> Tokens causes a reference to an array of token strings to be passed. The strings are exactly as they were found in the original text, no decoding or case changes are applied. For C<declaration> events, the array contains each word, comment, and delimited string starting with the declaration type. For C<comment> events, this contains each sub-comment. If $p->strict_comments is disabled, there will be only one sub-comment. For C<start> events, this contains the original tag name followed by the attribute name/value pairs. The values of boolean attributes will be either the value set by $p->boolean_attribute_value, or the attribute name if no value has been set by $p->boolean_attribute_value. For C<end> events, this contains the original tag name (always one token). For C<process> events, this contains the process instructions (always one token). This passes C<undef> for C<text> events. =item C<text> Text causes the source text (including markup element delimiters) to be passed. =item C<undef> Pass an undefined value. Useful as padding where the same handler routine is registered for multiple events. =item C<'...'> A literal string of 0 to 255 characters enclosed in single (') or double (") quotes is passed as entered. =back The whole argspec string can be wrapped up in C<'@{...}'> to signal that the resulting event array should be flattened. This only makes a difference if an array reference is used as the handler target. Consider this example: $p->handler(text => [], 'text'); $p->handler(text => [], '@{text}']); With two text events; C<"foo">, C<"bar">; then the first example will end up with [["foo"], ["bar"]] and the second with ["foo", "bar"] in the handler target array. =head2 Events Handlers for the following events can be registered: =over =item C<comment> This event is triggered when a markup comment is recognized. Example: <!-- This is a comment -- -- So is this --> =item C<declaration> This event is triggered when a I<markup declaration> is recognized. For typical HTML documents, the only declaration you are likely to find is <!DOCTYPE ...>. Example: <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> DTDs inside <!DOCTYPE ...> will confuse HTML::Parser. =item C<default> This event is triggered for events that do not have a specific handler. You can set up a handler for this event to catch stuff you did not want to catch explicitly. =item C<end> This event is triggered when an end tag is recognized. Example: </A> =item C<end_document> This event is triggered when $p->eof is called and after any remaining text is flushed. There is no document text associated with this event. =item C<process> This event is triggered when a processing instructions markup is recognized. The format and content of processing instructions are system and application dependent. Examples: <? HTML processing instructions > <? XML processing instructions ?> =item C<start> This event is triggered when a start tag is recognized. Example: <A HREF="http://www.perl.com/"> =item C<start_document> This event is triggered before any other events for a new document. A handler for it can be used to initialize stuff. There is no document text associated with this event. =item C<text> This event is triggered when plain text (characters) is recognized. The text may contain multiple lines. A sequence of text may be broken between several text events unless $p->unbroken_text is enabled. The parser will make sure that it does not break a word or a sequence of whitespace between two text events. =back =head2 Unicode C<HTML::Parser> can parse Unicode strings when running under perl-5.8 or better. If Unicode is passed to $p->parse() then chunks of Unicode will be reported to the handlers. The offset and length argspecs will also report their position in terms of characters. It is safe to parse raw undecoded UTF-8 if you either avoid decoding entities and make sure to not use I<argspecs> that do, or enable the C<utf8_mode> for the parser. Parsing of undecoded UTF-8 might be useful when parsing from a file where you need the reported offsets and lengths to match the byte offsets in the file. If a filename is passed to $p->parse_file() then the file will be read in binary mode. This will be fine if the file contains only ASCII or Latin-1 characters. If the file contains UTF-8 encoded text then care must be taken when decoding entities as described in the previous paragraph, but better is to open the file with the UTF-8 layer so that it is decoded properly: open(my $fh, "<:utf8", "index.html") || die "...: $!"; $p->parse_file($fh); If the file contains text encoded in a charset besides ASCII, Latin-1 or UTF-8 then decoding will always be needed. =head1 VERSION 2 COMPATIBILITY When an C<HTML::Parser> object is constructed with no arguments, a set of handlers is automatically provided that is compatible with the old HTML::Parser version 2 callback methods. This is equivalent to the following method calls: $p->handler(start => "start", "self, tagname, attr, attrseq, text"); $p->handler(end => "end", "self, tagname, text"); $p->handler(text => "text", "self, text, is_cdata"); $p->handler(process => "process", "self, token0, text"); $p->handler( comment => sub { my ($self, $tokens) = @_; for (@$tokens) { $self->comment($_); } }, "self, tokens" ); $p->handler( declaration => sub { my $self = shift; $self->declaration(substr($_[0], 2, -1)); }, "self, text" ); Setting up these handlers can also be requested with the "api_version => 2" constructor option. =head1 SUBCLASSING The C<HTML::Parser> class is able to be subclassed. Parser objects are plain hashes and C<HTML::Parser> reserves only hash keys that start with "_hparser". The parser state can be set up by invoking the init() method, which takes the same arguments as new(). =head1 EXAMPLES The first simple example shows how you might strip out comments from an HTML document. We achieve this by setting up a comment handler that does nothing and a default handler that will print out anything else: use HTML::Parser (); HTML::Parser->new( default_h => [sub { print shift }, 'text'], comment_h => [""], )->parse_file(shift || die) || die $!; An alternative implementation is: use HTML::Parser (); HTML::Parser->new( end_document_h => [sub { print shift }, 'skipped_text'], comment_h => [""], )->parse_file(shift || die) || die $!; This will in most cases be much more efficient since only a single callback will be made. The next example prints out the text that is inside the <title> element of an HTML document. Here we start by setting up a start handler. When it sees the title start tag it enables a text handler that prints any text found and an end handler that will terminate parsing as soon as the title end tag is seen: use HTML::Parser (); sub start_handler { return if shift ne "title"; my $self = shift; $self->handler(text => sub { print shift }, "dtext"); $self->handler( end => sub { shift->eof if shift eq "title"; }, "tagname,self" ); } my $p = HTML::Parser->new(api_version => 3); $p->handler(start => \&start_handler, "tagname,self"); $p->parse_file(shift || die) || die $!; print "\n"; More examples are found in the F<eg/> directory of the C<HTML-Parser> distribution: the program C<hrefsub> shows how you can edit all links found in a document; the program C<htextsub> shows how to edit the text only; the program C<hstrip> shows how you can strip out certain tags/elements and/or attributes; and the program C<htext> show how to obtain the plain text, but not any script/style content. You can browse the F<eg/> directory online from the I<[Browse]> link on the http://search.cpan.org/~gaas/HTML-Parser/ page. =head1 BUGS The <style> and <script> sections do not end with the first "</", but need the complete corresponding end tag. The standard behaviour is not really practical. When the I<strict_comment> option is enabled, we still recognize comments where there is something other than whitespace between even and odd "--" markers. Once $p->boolean_attribute_value has been set, there is no way to restore the default behaviour. There is currently no way to get both quote characters into the same literal argspec. Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them to repeat the previous start tag or close the previous start tag respectively. NET tags, e.g. "code/.../" are not recognized. This is SGML shorthand for "<code>...</code>". Incomplete start or end tags, e.g. "<tt<b>...</b</tt>" are not recognized. =head1 DIAGNOSTICS The following messages may be produced by HTML::Parser. The notation in this listing is the same as used in L<perldiag>: =over =item Not a reference to a hash (F) The object blessed into or subclassed from HTML::Parser is not a hash as required by the HTML::Parser methods. =item Bad signature in parser state object at %p (F) The _hparser_xs_state element does not refer to a valid state structure. Something must have changed the internal value stored in this hash element, or the memory has been overwritten. =item _hparser_xs_state element is not a reference (F) The _hparser_xs_state element has been destroyed. =item Can't find '_hparser_xs_state' element in HTML::Parser hash (F) The _hparser_xs_state element is missing from the parser hash. It was either deleted, or not created when the object was created. =item API version %s not supported by HTML::Parser %s (F) The constructor option 'api_version' with an argument greater than or equal to 4 is reserved for future extensions. =item Bad constructor option '%s' (F) An unknown constructor option key was passed to the new() or init() methods. =item Parse loop not allowed (F) A handler invoked the parse() or parse_file() method. This is not permitted. =item marked sections not supported (F) The $p->marked_sections() method was invoked in a HTML::Parser module that was compiled without support for marked sections. =item Unknown boolean attribute (%d) (F) Something is wrong with the internal logic that set up aliases for boolean attributes. =item Only code or array references allowed as handler (F) The second argument for $p->handler must be either a subroutine reference, then name of a subroutine or method, or a reference to an array. =item No handler for %s events (F) The first argument to $p->handler must be a valid event name; i.e. one of "start", "end", "text", "process", "declaration" or "comment". =item Unrecognized identifier %s in argspec (F) The identifier is not a known argspec name. Use one of the names mentioned in the argspec section above. =item Literal string is longer than 255 chars in argspec (F) The current implementation limits the length of literals in an argspec to 255 characters. Make the literal shorter. =item Backslash reserved for literal string in argspec (F) The backslash character "\" is not allowed in argspec literals. It is reserved to permit quoting inside a literal in a later version. =item Unterminated literal string in argspec (F) The terminating quote character for a literal was not found. =item Bad argspec (%s) (F) Only identifier names, literals, spaces and commas are allowed in argspecs. =item Missing comma separator in argspec (F) Identifiers in an argspec must be separated with ",". =item Parsing of undecoded UTF-8 will give garbage when decoding entities (W) The first chunk parsed appears to contain undecoded UTF-8 and one or more argspecs that decode entities are used for the callback handlers. The result of decoding will be a mix of encoded and decoded characters for any entities that expand to characters with code above 127. This is not a good thing. The recommended solution is to apply Encode::decode_utf8() on the data before feeding it to the $p->parse(). For $p->parse_file() pass a file that has been opened in ":utf8" mode. The alternative solution is to enable the C<utf8_mode> and not decode before passing strings to $p->parse(). The parser can process raw undecoded UTF-8 sanely if the C<utf8_mode> is enabled, or if the C<attr>, C<@attr> or C<dtext> argspecs are avoided. =item Parsing string decoded with wrong endian selection (W) The first character in the document is U+FFFE. This is not a legal Unicode character but a byte swapped C<BOM>. The result of parsing will likely be garbage. =item Parsing of undecoded UTF-32 (W) The parser found the Unicode UTF-32 C<BOM> signature at the start of the document. The result of parsing will likely be garbage. =item Parsing of undecoded UTF-16 (W) The parser found the Unicode UTF-16 C<BOM> signature at the start of the document. The result of parsing will likely be garbage. =back =head1 SEE ALSO L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>, L<HTML::LinkExtor>, L<HTML::Form> L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution) L<http://www.w3.org/TR/html4/> More information about marked sections and processing instructions may be found at L<http://www.is-thought.co.uk/book/sgml-8.htm>. =head1 COPYRIGHT Copyright 1996-2016 Gisle Aas. All rights reserved. Copyright 1999-2000 Michael A. Chase. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/lib/HTML/PullParser.pm�������������������������������������������������������������000644 �000765 �000024 �00000013074 14366103441 020672� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::PullParser; use strict; require HTML::Parser; our @ISA = qw(HTML::Parser); our $VERSION = '3.81'; use Carp (); sub new { my($class, %cnf) = @_; # Construct argspecs for the various events my %argspec; for (qw(start end text declaration comment process default)) { my $tmp = delete $cnf{$_}; next unless defined $tmp; $argspec{$_} = $tmp; } Carp::croak("Info not collected for any events") unless %argspec; my $file = delete $cnf{file}; my $doc = delete $cnf{doc}; Carp::croak("Can't parse from both 'doc' and 'file' at the same time") if defined($file) && defined($doc); Carp::croak("No 'doc' or 'file' given to parse from") unless defined($file) || defined($doc); # Create object $cnf{api_version} = 3; my $self = $class->SUPER::new(%cnf); my $accum = $self->{pullparser_accum} = []; while (my($event, $argspec) = each %argspec) { $self->SUPER::handler($event => $accum, $argspec); } if (defined $doc) { $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc; $self->{pullparser_str_pos} = 0; } else { if (!ref($file) && ref(\$file) ne "GLOB") { require IO::File; $file = IO::File->new($file, "r") || return; } $self->{pullparser_file} = $file; } $self; } sub handler { Carp::croak("Can't set handlers for HTML::PullParser"); } sub get_token { my $self = shift; while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) { if (my $f = $self->{pullparser_file}) { # must try to parse more from the file my $buf; if (read($f, $buf, 512)) { $self->parse($buf); } else { $self->eof; $self->{pullparser_eof}++; delete $self->{pullparser_file}; } } elsif (my $sref = $self->{pullparser_str_ref}) { # must try to parse more from the scalar my $pos = $self->{pullparser_str_pos}; my $chunk = substr($$sref, $pos, 512); $self->parse($chunk); $pos += length($chunk); if ($pos < length($$sref)) { $self->{pullparser_str_pos} = $pos; } else { $self->eof; $self->{pullparser_eof}++; delete $self->{pullparser_str_ref}; delete $self->{pullparser_str_pos}; } } else { die; } } shift @{$self->{pullparser_accum}}; } sub unget_token { my $self = shift; unshift @{$self->{pullparser_accum}}, @_; $self; } 1; __END__ =head1 NAME HTML::PullParser - Alternative HTML::Parser interface =head1 SYNOPSIS use HTML::PullParser; $p = HTML::PullParser->new(file => "index.html", start => 'event, tagname, @attr', end => 'event, tagname', ignore_elements => [qw(script style)], ) || die "Can't open: $!"; while (my $token = $p->get_token) { #...do something with $token } =head1 DESCRIPTION The HTML::PullParser is an alternative interface to the HTML::Parser class. It basically turns the HTML::Parser inside out. You associate a file (or any IO::Handle object or string) with the parser at construction time and then repeatedly call $parser->get_token to obtain the tags and text found in the parsed document. The following methods are provided: =over 4 =item $p = HTML::PullParser->new( file => $file, %options ) =item $p = HTML::PullParser->new( doc => \$doc, %options ) A C<HTML::PullParser> can be made to parse from either a file or a literal document based on whether the C<file> or C<doc> option is passed to the parser's constructor. The C<file> passed in can either be a file name or a file handle object. If a file name is passed, and it can't be opened for reading, then the constructor will return an undefined value and $! will tell you why it failed. Otherwise the argument is taken to be some object that the C<HTML::PullParser> can read() from when it needs more data. The stream will be read() until EOF, but not closed. A C<doc> can be passed plain or as a reference to a scalar. If a reference is passed then the value of this scalar should not be changed before all tokens have been extracted. Next the information to be returned for the different token types must be set up. This is done by simply associating an argspec (as defined in L<HTML::Parser>) with the events you have an interest in. For instance, if you want C<start> tokens to be reported as the string C<'S'> followed by the tagname and the attributes you might pass an C<start>-option like this: $p = HTML::PullParser->new( doc => $document_to_parse, start => '"S", tagname, @attr', end => '"E", tagname', ); At last other C<HTML::Parser> options, like C<ignore_tags>, and C<unbroken_text>, can be passed in. Note that you should not use the I<event>_h options to set up parser handlers. That would confuse the inner logic of C<HTML::PullParser>. =item $token = $p->get_token This method will return the next I<token> found in the HTML document, or C<undef> at the end of the document. The token is returned as an array reference. The content of this array match the argspec set up during C<HTML::PullParser> construction. =item $p->unget_token( @tokens ) If you find out you have read too many tokens you can push them back, so that they are returned again the next time $p->get_token is called. =back =head1 EXAMPLES The 'eg/hform' script shows how we might parse the form section of HTML::Documents using HTML::PullParser. =head1 SEE ALSO L<HTML::Parser>, L<HTML::TokeParser> =head1 COPYRIGHT Copyright 1998-2001 Gisle Aas. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/lib/HTML/TokeParser.pm�������������������������������������������������������������000644 �000765 �000024 �00000023604 14366103441 020660� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::TokeParser; use strict; require HTML::PullParser; our @ISA = qw(HTML::PullParser); our $VERSION = '3.81'; use Carp (); use HTML::Entities qw(decode_entities); use HTML::Tagset (); my %ARGS = ( start => "'S',tagname,attr,attrseq,text", end => "'E',tagname,text", text => "'T',text,is_cdata", process => "'PI',token0,text", comment => "'C',text", declaration => "'D',text", # options that default on unbroken_text => 1, ); sub new { my $class = shift; my %cnf; if (@_ == 1) { my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; %cnf = ($type => $_[0]); } else { unshift @_, (ref($_[0]) eq "SCALAR") ? "doc" : "file" if(scalar(@_) % 2 == 1); %cnf = @_; } my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; my $self = $class->SUPER::new(%ARGS, %cnf) || return undef; $self->{textify} = $textify; $self; } sub get_tag { my $self = shift; my $token; while (1) { $token = $self->get_token || return undef; my $type = shift @$token; next unless $type eq "S" || $type eq "E"; substr($token->[0], 0, 0) = "/" if $type eq "E"; return $token unless @_; for (@_) { return $token if $token->[0] eq $_; } } } sub _textify { my($self, $token) = @_; my $tag = $token->[1]; return undef unless exists $self->{textify}{$tag}; my $alt = $self->{textify}{$tag}; my $text; if (ref($alt)) { $text = &$alt(@$token); } else { $text = $token->[2]{$alt || "alt"}; $text = "[\U$tag]" unless defined $text; } return $text; } sub get_text { my $self = shift; my @text; while (my $token = $self->get_token) { my $type = $token->[0]; if ($type eq "T") { my $text = $token->[1]; decode_entities($text) unless $token->[2]; push(@text, $text); } elsif ($type =~ /^[SE]$/) { my $tag = $token->[1]; if ($type eq "S") { if (defined(my $text = _textify($self, $token))) { push(@text, $text); next; } } else { $tag = "/$tag"; } if (!@_ || grep $_ eq $tag, @_) { $self->unget_token($token); last; } push(@text, " ") if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]}; } } join("", @text); } sub get_trimmed_text { my $self = shift; my $text = $self->get_text(@_); $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; $text; } sub get_phrase { my $self = shift; my @text; while (my $token = $self->get_token) { my $type = $token->[0]; if ($type eq "T") { my $text = $token->[1]; decode_entities($text) unless $token->[2]; push(@text, $text); } elsif ($type =~ /^[SE]$/) { my $tag = $token->[1]; if ($type eq "S") { if (defined(my $text = _textify($self, $token))) { push(@text, $text); next; } } if (!$HTML::Tagset::isPhraseMarkup{$tag}) { $self->unget_token($token); last; } push(@text, " ") if $tag eq "br"; } } my $text = join("", @text); $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; $text; } 1; __END__ =head1 NAME HTML::TokeParser - Alternative HTML::Parser interface =head1 SYNOPSIS require HTML::TokeParser; $p = HTML::TokeParser->new("index.html") || die "Can't open: $!"; $p->empty_element_tags(1); # configure its behaviour while (my $token = $p->get_token) { #... } =head1 DESCRIPTION The C<HTML::TokeParser> is an alternative interface to the C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a predeclared set of token types. If you wish the tokens to be reported differently you probably want to use the C<HTML::PullParser> directly. The following methods are available: =over 4 =item $p = HTML::TokeParser->new( $filename, %opt ); =item $p = HTML::TokeParser->new( $filehandle, %opt ); =item $p = HTML::TokeParser->new( \$document, %opt ); The object constructor argument is either a file name, a file handle object, or the complete document to be parsed. Extra options can be provided as key/value pairs and are processed as documented by the base classes. If the argument is a plain scalar, then it is taken as the name of a file to be opened and parsed. If the file can't be opened for reading, then the constructor will return C<undef> and $! will tell you why it failed. If the argument is a reference to a plain scalar, then this scalar is taken to be the literal document to parse. The value of this scalar should not be changed before all tokens have been extracted. Otherwise the argument is taken to be some object that the C<HTML::TokeParser> can read() from when it needs more data. Typically it will be a filehandle of some kind. The stream will be read() until EOF, but not closed. A newly constructed C<HTML::TokeParser> differ from its base classes by having the C<unbroken_text> attribute enabled by default. See L<HTML::Parser> for a description of this and other attributes that influence how the document is parsed. It is often a good idea to enable C<empty_element_tags> behaviour. Note that the parsing result will likely not be valid if raw undecoded UTF-8 is used as a source. When parsing UTF-8 encoded files turn on UTF-8 decoding: open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!"; my $p = HTML::TokeParser->new( $fh ); # ... If a $filename is passed to the constructor the file will be opened in raw mode and the parsing result will only be valid if its content is Latin-1 or pure ASCII. If parsing from an UTF-8 encoded string buffer decode it first: utf8::decode($document); my $p = HTML::TokeParser->new( \$document ); # ... =item $p->get_token This method will return the next I<token> found in the HTML document, or C<undef> at the end of the document. The token is returned as an array reference. The first element of the array will be a string denoting the type of this token: "S" for start tag, "E" for end tag, "T" for text, "C" for comment, "D" for declaration, and "PI" for process instructions. The rest of the token array depend on the type like this: ["S", $tag, $attr, $attrseq, $text] ["E", $tag, $text] ["T", $text, $is_data] ["C", $text] ["D", $text] ["PI", $token0, $text] where $attr is a hash reference, $attrseq is an array reference and the rest are plain scalars. The L<HTML::Parser/Argspec> explains the details. =item $p->unget_token( @tokens ) If you find you have read too many tokens you can push them back, so that they are returned the next time $p->get_token is called. =item $p->get_tag =item $p->get_tag( @tags ) This method returns the next start or end tag (skipping any other tokens), or C<undef> if there are no more tags in the document. If one or more arguments are given, then we skip tokens until one of the specified tag types is found. For example: $p->get_tag("font", "/font"); will find the next start or end tag for a font-element. The tag information is returned as an array reference in the same form as for $p->get_token above, but the type code (first element) is missing. A start tag will be returned like this: [$tag, $attr, $attrseq, $text] The tagname of end tags are prefixed with "/", i.e. end tag is returned like this: ["/$tag", $text] =item $p->get_text =item $p->get_text( @endtags ) This method returns all text found at the current position. It will return a zero length string if the next token is not text. Any entities will be converted to their corresponding character. If one or more arguments are given, then we return all text occurring before the first of the specified tags found. For example: $p->get_text("p", "br"); will return the text up to either a paragraph of line break element. The text might span tags that should be I<textified>. This is controlled by the $p->{textify} attribute, which is a hash that defines how certain tags can be treated as text. If the name of a start tag matches a key in this hash then this tag is converted to text. The hash value is used to specify which tag attribute to obtain the text from. If this tag attribute is missing, then the upper case name of the tag enclosed in brackets is returned, e.g. "[IMG]". The hash value can also be a subroutine reference. In this case the routine is called with the start tag token content as its argument and the return value is treated as the text. The default $p->{textify} value is: {img => "alt", applet => "alt"} This means that <IMG> and <APPLET> tags are treated as text, and that the text to substitute can be found in the ALT attribute. =item $p->get_trimmed_text =item $p->get_trimmed_text( @endtags ) Same as $p->get_text above, but will collapse any sequences of white space to a single space character. Leading and trailing white space is removed. =item $p->get_phrase This will return all text found at the current position ignoring any phrasal-level tags. Text is extracted until the first non phrasal-level tag. Textification of tags is the same as for get_text(). This method will collapse white space in the same way as get_trimmed_text() does. The definition of <i>phrasal-level tags</i> is obtained from the HTML::Tagset module. =back =head1 EXAMPLES This example extracts all links from a document. It will print one line for each link, containing the URL and the textual description between the <A>...</A> tags: use HTML::TokeParser; $p = HTML::TokeParser->new(shift||"index.html"); while (my $token = $p->get_tag("a")) { my $url = $token->[1]{href} || "-"; my $text = $p->get_trimmed_text("/a"); print "$url\t$text\n"; } This example extract the <TITLE> from the document: use HTML::TokeParser; $p = HTML::TokeParser->new(shift||"index.html"); if ($p->get_tag("title")) { my $title = $p->get_trimmed_text; print "Title: $title\n"; } =head1 SEE ALSO L<HTML::PullParser>, L<HTML::Parser> =head1 COPYRIGHT Copyright 1998-2005 Gisle Aas. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut ����������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/lib/HTML/HeadParser.pm�������������������������������������������������������������000644 �000765 �000024 �00000020503 14366103441 020612� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::HeadParser; =head1 NAME HTML::HeadParser - Parse <HEAD> section of a HTML document =head1 SYNOPSIS require HTML::HeadParser; $p = HTML::HeadParser->new; $p->parse($text) and print "not finished"; $p->header('Title') # to access <title>....</title> $p->header('Content-Base') # to access <base href="http://..."> $p->header('Foo') # to access <meta http-equiv="Foo" content="..."> $p->header('X-Meta-Author') # to access <meta name="author" content="..."> $p->header('X-Meta-Charset') # to access <meta charset="..."> =head1 DESCRIPTION The C<HTML::HeadParser> is a specialized (and lightweight) C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD> section of an HTML document. The parse() method will return a FALSE value as soon as some E<lt>BODY> element or body text are found, and should not be called again after this. Note that the C<HTML::HeadParser> might get confused if raw undecoded UTF-8 is passed to the parse() method. Make sure the strings are properly decoded before passing them on. The C<HTML::HeadParser> keeps a reference to a header object, and the parser will update this header object as the various elements of the E<lt>HEAD> section of the HTML document are recognized. The following header fields are affected: =over 4 =item Content-Base: The I<Content-Base> header is initialized from the E<lt>base href="..."> element. =item Title: The I<Title> header is initialized from the E<lt>title>...E<lt>/title> element. =item Isindex: The I<Isindex> header will be added if there is a E<lt>isindex> element in the E<lt>head>. The header value is initialized from the I<prompt> attribute if it is present. If no I<prompt> attribute is given it will have '?' as the value. =item X-Meta-Foo: All E<lt>meta> elements containing a C<name> attribute will result in headers using the prefix C<X-Meta-> appended with the value of the C<name> attribute as the name of the header, and the value of the C<content> attribute as the pushed header value. E<lt>meta> elements containing a C<http-equiv> attribute will result in headers as in above, but without the C<X-Meta-> prefix in the header name. E<lt>meta> elements containing a C<charset> attribute will result in an C<X-Meta-Charset> header, using the value of the C<charset> attribute as the pushed header value. The ':' character can't be represented in header field names, so if the meta element contains this char it's substituted with '-' before forming the field name. =back =head1 METHODS The following methods (in addition to those provided by the superclass) are available: =over 4 =cut require HTML::Parser; our @ISA = qw(HTML::Parser); use HTML::Entities (); use strict; our $DEBUG; #$DEBUG = 1; our $VERSION = '3.81'; =item $hp = HTML::HeadParser->new =item $hp = HTML::HeadParser->new( $header ) The object constructor. The optional $header argument should be a reference to an object that implement the header() and push_header() methods as defined by the C<HTTP::Headers> class. Normally it will be of some class that is a or delegates to the C<HTTP::Headers> class. If no $header is given C<HTML::HeadParser> will create an C<HTTP::Headers> object by itself (initially empty). =cut sub new { my($class, $header) = @_; unless ($header) { require HTTP::Headers; $header = HTTP::Headers->new; } my $self = $class->SUPER::new(api_version => 3, start_h => ["start", "self,tagname,attr"], end_h => ["end", "self,tagname"], text_h => ["text", "self,text"], ignore_elements => [qw(script style)], ); $self->{'header'} = $header; $self->{'tag'} = ''; # name of active element that takes textual content $self->{'text'} = ''; # the accumulated text associated with the element $self; } =item $hp->header; Returns a reference to the header object. =item $hp->header( $key ) Returns a header value. It is just a shorter way to write C<$hp-E<gt>header-E<gt>header($key)>. =cut sub header { my $self = shift; return $self->{'header'} unless @_; $self->{'header'}->header(@_); } sub as_string # legacy { my $self = shift; $self->{'header'}->as_string; } sub flush_text # internal { my $self = shift; my $tag = $self->{'tag'}; my $text = $self->{'text'}; $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; print "FLUSH $tag => '$text'\n" if $DEBUG; if ($tag eq 'title') { my $decoded; $decoded = utf8::decode($text) if $self->utf8_mode && defined &utf8::decode; HTML::Entities::decode($text); utf8::encode($text) if $decoded; $self->{'header'}->push_header(Title => $text); } $self->{'tag'} = $self->{'text'} = ''; } # This is an quote from the HTML3.2 DTD which shows which elements # that might be present in a <HEAD>...</HEAD>. Also note that the # <HEAD> tags themselves might be missing: # # <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? & # SCRIPT* & META* & LINK*"> # # <!ELEMENT HEAD O O (%head.content)> # # From HTML 4.01: # # <!ENTITY % head.misc "SCRIPT|STYLE|META|LINK|OBJECT"> # <!ENTITY % head.content "TITLE & BASE?"> # <!ELEMENT HEAD O O (%head.content;) +(%head.misc;)> # # From HTML 5 as of WD-html5-20090825: # # One or more elements of metadata content, [...] # => base, command, link, meta, noscript, script, style, title sub start { my($self, $tag, $attr) = @_; # $attr is reference to a HASH print "START[$tag]\n" if $DEBUG; $self->flush_text if $self->{'tag'}; if ($tag eq 'meta') { my $key = $attr->{'http-equiv'}; if (!defined($key) || !length($key)) { if ($attr->{name}) { $key = "X-Meta-\u$attr->{name}"; } elsif ($attr->{charset}) { # HTML 5 <meta charset="..."> $key = "X-Meta-Charset"; $self->{header}->push_header($key => $attr->{charset}); return; } else { return; } } $key =~ s/:/-/g; $self->{'header'}->push_header($key => $attr->{content}); } elsif ($tag eq 'base') { return unless exists $attr->{href}; (my $base = $attr->{href}) =~ s/^\s+//; $base =~ s/\s+$//; # HTML5 $self->{'header'}->push_header('Content-Base' => $base); } elsif ($tag eq 'isindex') { # This is a non-standard header. Perhaps we should just ignore # this element $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?'); } elsif ($tag =~ /^(?:title|noscript|object|command)$/) { # Just remember tag. Initialize header when we see the end tag. $self->{'tag'} = $tag; } elsif ($tag eq 'link') { return unless exists $attr->{href}; # <link href="http:..." rel="xxx" rev="xxx" title="xxx"> my $href = delete($attr->{href}); $href =~ s/^\s+//; $href =~ s/\s+$//; # HTML5 my $h_val = "<$href>"; for (sort keys %{$attr}) { next if $_ eq "/"; # XHTML junk $h_val .= qq(; $_="$attr->{$_}"); } $self->{'header'}->push_header(Link => $h_val); } elsif ($tag eq 'head' || $tag eq 'html') { # ignore } else { # stop parsing $self->eof; } } sub end { my($self, $tag) = @_; print "END[$tag]\n" if $DEBUG; $self->flush_text if $self->{'tag'}; $self->eof if $tag eq 'head'; } sub text { my($self, $text) = @_; print "TEXT[$text]\n" if $DEBUG; unless ($self->{first_chunk}) { # drop Unicode BOM if found if ($self->utf8_mode) { $text =~ s/^\xEF\xBB\xBF//; } else { $text =~ s/^\x{FEFF}//; } $self->{first_chunk}++; } my $tag = $self->{tag}; if (!$tag && $text =~ /\S/) { # Normal text means start of body $self->eof; return; } return if $tag ne 'title'; $self->{'text'} .= $text; } BEGIN { *utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT; } 1; __END__ =back =head1 EXAMPLE $h = HTTP::Headers->new; $p = HTML::HeadParser->new($h); $p->parse(<<EOT); <title>Stupid example</title> <base href="http://www.linpro.no/lwp/"> Normal text starts here. EOT undef $p; print $h->title; # should print "Stupid example" =head1 SEE ALSO L<HTML::Parser>, L<HTTP::Headers> The C<HTTP::Headers> class is distributed as part of the I<libwww-perl> package. If you don't have that distribution installed you need to provide the $header argument to the C<HTML::HeadParser> constructor with your own object that implements the documented protocol. =head1 COPYRIGHT Copyright 1996-2001 Gisle Aas. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/xt/author/�������������������������������������������������������������������������000755 �000765 �000024 �00000000000 14366103441 016661� 5����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/xt/release/������������������������������������������������������������������������000755 �000765 �000024 �00000000000 14366103441 016777� 5����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/xt/release/kwalitee.t��������������������������������������������������������������000644 �000765 �000024 �00000000321 14366103441 020765� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������# this test was generated with Dist::Zilla::Plugin::Test::Kwalitee 2.12 use strict; use warnings; use Test::More 0.88; use Test::Kwalitee 1.21 'kwalitee_ok'; kwalitee_ok( qw( -no_symlinks ) ); done_testing; ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/xt/release/changes_has_content.t���������������������������������������������������000644 �000765 �000024 �00000002100 14366103441 023152� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use Test::More tests => 2; note 'Checking Changes'; my $changes_file = 'Changes'; my $newver = '3.81'; my $trial_token = '-TRIAL'; my $encoding = 'UTF-8'; SKIP: { ok(-e $changes_file, "$changes_file file exists") or skip 'Changes is missing', 1; ok(_get_changes($newver), "$changes_file has content for $newver"); } done_testing; sub _get_changes { my $newver = shift; # parse changelog to find commit message open(my $fh, '<', $changes_file) or die "cannot open $changes_file: $!"; my $changelog = join('', <$fh>); if ($encoding) { require Encode; $changelog = Encode::decode($encoding, $changelog, Encode::FB_CROAK()); } close $fh; my @content = grep { /^$newver(?:$trial_token)?(?:\s+|$)/ ... /^\S/ } # from newver to un-indented split /\n/, $changelog; shift @content; # drop the version line # drop unindented last line and trailing blank lines pop @content while ( @content && $content[-1] =~ /^(?:\S|\s*$)/ ); # return number of non-blank lines return scalar @content; } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/xt/author/test-version.t�����������������������������������������������������������000644 �000765 �000024 �00000000711 14366103441 021507� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use Test::More; # generated by Dist::Zilla::Plugin::Test::Version 1.09 use Test::Version; my @imports = qw( version_all_ok ); my $params = { is_strict => 0, has_version => 1, multiple => 0, filename_match => [ qr/Parser\.pm$/ ], }; push @imports, $params if version->parse( $Test::Version::VERSION ) >= version->parse('1.002'); Test::Version->import(@imports); version_all_ok; done_testing; �������������������������������������������������������HTML-Parser-3.81/xt/author/00-compile.t�������������������������������������������������������������000644 �000765 �000024 �00000002770 14366103441 020721� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use 5.006; use strict; use warnings; # this test was generated with Dist::Zilla::Plugin::Test::Compile 2.058 use Test::More; plan tests => 8; my @module_files = ( 'HTML/Entities.pm', 'HTML/Filter.pm', 'HTML/HeadParser.pm', 'HTML/LinkExtor.pm', 'HTML/Parser.pm', 'HTML/PullParser.pm', 'HTML/TokeParser.pm' ); # no fake home requested my @switches = ( -d 'blib' ? '-Mblib' : '-Ilib', ); use File::Spec; use IPC::Open3; use IO::Handle; open my $stdin, '<', File::Spec->devnull or die "can't open devnull: $!"; my @warnings; for my $lib (@module_files) { # see L<perlfaq8/How can I capture STDERR from an external command?> my $stderr = IO::Handle->new; diag('Running: ', join(', ', map { my $str = $_; $str =~ s/'/\\'/g; q{'} . $str . q{'} } $^X, @switches, '-e', "require q[$lib]")) if $ENV{PERL_COMPILE_TEST_DEBUG}; my $pid = open3($stdin, '>&STDERR', $stderr, $^X, @switches, '-e', "require q[$lib]"); binmode $stderr, ':crlf' if $^O eq 'MSWin32'; my @_warnings = <$stderr>; waitpid($pid, 0); is($?, 0, "$lib loaded ok"); shift @_warnings if @_warnings and $_warnings[0] =~ /^Using .*\bblib/ and not eval { +require blib; blib->VERSION('1.01') }; if (@_warnings) { warn @_warnings; push @warnings, @_warnings; } } is(scalar(@warnings), 0, 'no warnings found') or diag 'got warnings: ', ( Test::More->can('explain') ? Test::More::explain(\@warnings) : join("\n", '', @warnings) ); ��������HTML-Parser-3.81/xt/author/pod-syntax.t�������������������������������������������������������������000644 �000765 �000024 �00000000252 14366103441 021153� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!perl # This file was automatically generated by Dist::Zilla::Plugin::PodSyntaxTests. use strict; use warnings; use Test::More; use Test::Pod 1.41; all_pod_files_ok(); ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/xt/author/pod-spell.t��������������������������������������������������������������000644 �000765 �000024 �00000001723 14366103441 020750� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use Test::More; # generated by Dist::Zilla::Plugin::Test::PodSpelling 2.007005 use Test::Spelling 0.12; use Pod::Wordlist; set_spell_cmd('aspell list'); add_stopwords(<DATA>); all_pod_files_spelling_ok( qw( bin lib ) ); __DATA__ Aas Alders Antonio Attr Attrseq Barbie Bonaccorso CDATA CPAN Chase Chip Clark Damyan David Dtext Entities Filter François Germishuys Gisle Graham HTML HeadParser IMG Isindex Ivanov Jacques James Jensen Jess Jon Josef Knop LinkExtor MSIE Michael Michal Mike Nicholas Nicolas Olaf Orton Parser Perrad PullParser Radici Raspass Rinaldo Robinson Salvatore Salzenberg Skyttä South Steinbrunner Textification Todd TokeParser Tokenpos Unterminated Ville Whitener Yves Zefram and antonio argspec argspecs barbie bulk88 capoeirab castaway chip demerphq dmn dsteinbrunner francois gaas gisle haarg jacquesg jon jraspass lib mchase msouth mspacek nick nicolas olaf salvatore tagname textified toddr undecoded ville zefram Špaček ���������������������������������������������HTML-Parser-3.81/xt/author/pod-coverage.t�����������������������������������������������������������000644 �000765 �000024 �00000003050 14366103441 021417� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!perl # This file was automatically generated by Dist::Zilla::Plugin::Test::Pod::Coverage::Configurable 0.07. use Test::Pod::Coverage 1.08; use Test::More 0.88; BEGIN { if ( $] <= 5.008008 ) { plan skip_all => 'These tests require Pod::Coverage::TrustPod, which only works with Perl 5.8.9+'; } } use Pod::Coverage::TrustPod; my %skip = map { $_ => 1 } qw( ); my @modules; for my $module ( all_modules() ) { next if $skip{$module}; push @modules, $module; } plan skip_all => 'All the modules we found were excluded from POD coverage test.' unless @modules; plan tests => scalar @modules; my %trustme = ( 'HTML::Entities' => [ qr/^(?:UNICODE_SUPPORT|decode|encode|encode_numeric|encode_numerically|num_entity)$/ ], 'HTML::Filter' => [ qr/^(?:output)$/ ], 'HTML::HeadParser' => [ qr/^(?:as_string|flush_text)$/ ], 'HTML::Parser' => [ qr/^(?:init|netscape_buggy_comment)$/ ] ); my @also_private; for my $module ( sort @modules ) { pod_coverage_ok( $module, { coverage_class => 'Pod::Coverage::TrustPod', also_private => \@also_private, trustme => $trustme{$module} || [], }, "pod coverage for $module" ); } done_testing(); ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/pullparser.t���������������������������������������������������������������������000644 �000765 �000024 �00000001752 14366103441 017552� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::PullParser (); use Test::More tests => 3; my $doc = <<'EOT'; <title>Title</title> <style> h1 { background: white } <foo> </style> <H1 ID="3">Heading</H1> <!-- ignore this --> This is a text with a <A HREF="http://www.sol.no" name="l1">link</a>. EOT my $p = HTML::PullParser->new( doc => $doc, start => 'event,tagname,@attr', end => 'event,tagname', text => 'event,dtext', ignore_elements => [qw(script style)], unbroken_text => 1, boolean_attribute_value => 1, ); my $t = $p->get_token; is($t->[0], "start"); is($t->[1], "title"); $p->unget_token($t); my @data; while (my $t = $p->get_token) { for (@$t) { s/\s/./g; } push(@data, join("|", @$t)); } my $res = join("\n", @data, ""); #diag $res; is($res, <<'EOT'); start|title text|Title end|title text|.. start|h1|id|3 text|Heading end|h1 text|...This.is.a.text.with.a. start|a|href|http://www.sol.no|name|l1 text|link end|a text|.. EOT ����������������������HTML-Parser-3.81/t/ignore.t�������������������������������������������������������������������������000644 �000765 �000024 �00000000775 14366103441 016650� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 4; my $html = '<A href="foo">text</A>'; my $text = ''; my $p = HTML::Parser->new(default_h => [sub { $text .= shift; }, 'text']); $p->parse($html)->eof; is($text, $html); $text = ''; $p->handler(start => ""); $p->parse($html)->eof; is($text, 'text</A>'); $text = ''; $p->handler(end => 0); $p->parse($html)->eof; is($text, 'text'); $text = ''; $p->handler(start => undef); $p->parse($html)->eof; is($text, '<A href="foo">text'); ���HTML-Parser-3.81/t/unicode-bom.t��������������������������������������������������������������������000644 �000765 �000024 �00000002254 14366103441 017560� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!perl -w use strict; use warnings; use utf8; use HTML::Parser (); use Test::More tests => 2; my @parsed; my $p = HTML::Parser->new(api_version => 3, start_h => [\@parsed, 'tag, attr'],); my @warn; $SIG{__WARN__} = sub { push(@warn, $_[0]); }; $p->parse("\xEF\xBB\xBF<head>Hi there</head>"); $p->eof; #use Encode; $p->parse("\xEF\xBB\xBF<head>Hi there</head>" . chr(0x263A)); $p->eof; $p->parse("\xFF\xFE<head>Hi there</head>"); $p->eof; $p->parse("\xFE\xFF<head>Hi there</head>"); $p->eof; $p->parse("\0\0\xFF\xFE<head>Hi there</head>"); $p->eof; $p->parse("\xFE\xFF\0\0<head>Hi there</head>"); $p->eof; for (@warn) { s/line (\d+)/line ##/g; } is(join("", @warn), <<EOT); Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line ##. Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line ##. Parsing of undecoded UTF-16 at $0 line ##. Parsing of undecoded UTF-16 at $0 line ##. Parsing of undecoded UTF-32 at $0 line ##. Parsing of undecoded UTF-32 at $0 line ##. EOT @warn = (); $p = HTML::Parser->new(api_version => 3, start_h => [\@parsed, 'tag'],); $p->parse("\xEF\xBB\xBF<head>Hi there</head>"); $p->eof; ok(!@warn); ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/free.t���������������������������������������������������������������������������000644 �000765 �000024 �00000000334 14366103441 016275� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 1; my $p; $p = HTML::Parser->new( start_h => [ sub { undef $p; } ], ); $p->parse(q(<foo>)); pass 'no SEGV'; ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/unbroken-text.t������������������������������������������������������������������000644 �000765 �000024 �00000002224 14366103441 020161� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 3; my $text = ""; sub text { my $cdata = shift() ? "CDATA" : "TEXT"; my ($offset, $line, $col, $t) = @_; $text .= "[$cdata:$offset:$line.$col:$t]"; } sub tag { $text .= shift; } my $p = HTML::Parser->new( unbroken_text => 1, text_h => [\&text, "is_cdata,offset,line,column,text"], start_h => [\&tag, "text"], end_h => [\&tag, "text"], ); $p->parse("foo "); $p->parse("bar "); $p->parse("<foo>"); $p->parse("bar\n"); $p->parse("</foo>"); $p->parse("<xmp>xmp</xmp>"); $p->parse("atend"); #diag $text; is($text, "[TEXT:0:1.0:foo bar ]<foo>[TEXT:13:1.13:bar\n]</foo><xmp>[CDATA:28:2.11:xmp]</xmp>" ); $text = ""; $p->eof; #diag $text; is($text, "[TEXT:37:2.20:atend]"); $p = HTML::Parser->new( unbroken_text => 1, text_h => [\&text, "is_cdata,offset,line,column,text"], ); $text = ""; $p->parse("foo"); $p->parse("<foo"); $p->parse(">bar\n"); $p->parse("foo<xm"); $p->parse("p>xmp"); $p->parse("</xmp"); $p->parse(">bar"); $p->eof; #diag $text; is($text, "[TEXT:0:1.0:foobar\nfoo][CDATA:20:2.8:xmp][TEXT:29:2.17:bar]"); ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/filter-methods.t�����������������������������������������������������������������000644 �000765 �000024 �00000011101 14366103441 020274� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use utf8; use HTML::Parser (); use Test::More tests => 12; my $p = HTML::Parser->new(api_version => 3, ignore_tags => [qw(b i em tt)],); $p->ignore_elements("script"); $p->unbroken_text(1); $p->handler(default => [], "event, text"); $p->parse(<<"EOT")->eof; <html><head><title>foo</title><Script language="Perl"> while (<B>) { # ... } </Script><body> This is an <i>italic</i> and <b>bold</b> text. </body> </html> EOT my $t = join("||", map join("|", @$_), @{$p->handler("default")}); #diag $t; is( $t, "start_document|||start|<html>||start|<head>||start|<title>||text|foo||end|</title>||start|<body>||text| This is an italic and bold text. ||end|</body>||text| ||end|</html>||text| ||end_document|", 'ignore_elements' ); #------------------------------------------------------ $p = HTML::Parser->new(api_version => 3); $p->report_tags("a"); $p->handler( start => sub { my ($tagname, %attr) = @_; ok($tagname eq "a" && $attr{href} eq "#a", 'report_tags start'); }, 'tagname, @attr' ); $p->handler( end => sub { my $tagname = shift; is($tagname, "a", 'report_tags end'); }, 'tagname' ); $p->parse(<<EOT)->eof; <h1>Next example</h1> This is <a href="#a">very nice</a> example. EOT #------------------------------------------------------ my @tags; $p = HTML::Parser->new(api_version => 3); $p->report_tags(qw(a em)); $p->ignore_tags(qw(em)); $p->handler(end => sub { push @tags, @_; }, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> This is <em>yet another</em> <a href="#a">very nice</a> example. EOT is(join('|', @tags), 'a', 'report_tags followed by ignore_tags'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->report_tags(qw(h1)); $p->report_tags(); $p->handler(end => sub { push @tags, @_; }, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> EOT is(join('|', @tags), 'h1|h2', 'reset report_tags filter'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->report_tags(qw(h1 h2)); $p->ignore_tags(qw(h2)); $p->report_tags(qw(h1 h2)); $p->handler(end => sub { push @tags, @_; }, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> EOT is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->report_tags(qw(h1 h2)); $p->ignore_tags(qw(h2)); $p->report_tags(); $p->handler(end => sub { push @tags, @_; }, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> EOT is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->report_tags(qw(h1 h2)); $p->report_tags(qw(h3)); $p->handler(end => sub { push @tags, @_; }, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> <h3>Next example</h3> EOT is(join('|', @tags), 'h3', 'report_tags replaces filter'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->ignore_tags(qw(h1 h2)); $p->ignore_tags(qw(h3)); $p->handler(end => sub { push @tags, @_; }, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> <h3>Next example</h3> EOT is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->ignore_tags(qw(h2)); $p->ignore_tags(); $p->handler(end => sub { push @tags, @_; }, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> EOT is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->ignore_tags(qw(h2)); $p->report_tags(qw(h1 h2)); $p->handler(end => sub { push @tags, @_; }, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> EOT is(join('|', @tags), 'h1', 'ignore_tags before report_tags'); #------------------------------------------------------ $p = HTML::Parser->new(api_version => 3); $p->ignore_elements("script"); my $res = ""; $p->handler(default => sub { $res .= $_[0]; }, 'text'); $p->parse(<<'EOT')->eof; A <script> B </script> C </script> D <script> E </script> F EOT is($res, "A C D F\n", "ignore </script> without <script> correctly"); ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/argspec2.t�����������������������������������������������������������������������000644 �000765 �000024 �00000000561 14366103441 017064� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 2; my @start; my @text; my $p = HTML::Parser->new(api_version => 3); $p->handler(start => \@start, '@{tagname, @attr}'); $p->handler(text => \@text, '@{dtext}'); $p->parse(<<EOT)->eof; Hi <a href="abc">Foo</a><b>:-)</b> EOT is("@start", "a href abc b"); is(join("", @text), "Hi\nFoo:-)\n"); �����������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/script.t�������������������������������������������������������������������������000644 �000765 �000024 �00000002605 14366103441 016663� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 1; my $TEXT = ""; sub h { my ($event, $tagname, $text) = @_; for ($event, $tagname, $text) { if (defined) { s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge; } else { $_ = "<undef>"; } } $TEXT .= "[$event,$tagname,$text]\n"; } my $p = HTML::Parser->new( default_h => [\&h, "event,tagname,text"], empty_element_tags => 1 ); $p->parse( q(<tr><td align="center" height="100"><script src="whatever"/><SCRIPT language="JavaScript1.1">bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');</SCRIPT></td></tr>) ); $p->eof; is($TEXT, <<'EOT'); [start_document,<undef>,] [start,tr,<tr>] [start,td,<td align="center" height="100">] [start,script,<script src="whatever"/>] [end,script,] [start,script,<SCRIPT language="JavaScript1.1">] [text,<undef>,bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');] [end,script,</SCRIPT>] [end,td,</td>] [end,tr,</tr>] [end_document,<undef>,] EOT ���������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/magic.t��������������������������������������������������������������������������000644 �000765 �000024 �00000002606 14366103441 016440� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 5; # Check that the magic signature at the top of struct p_state works and that we # catch modifications to _hparser_xs_state gracefully my $p = HTML::Parser->new(api_version => 3); $p->xml_mode(1); # We should not be able to simply modify this stuff { local $@; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { ${$p->{_hparser_xs_state}} += 4; 1; }; #>>> like($error, qr/^Modification of a read-only value attempted/); } my $x = delete $p->{_hparser_xs_state}; { local $@; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { $p->xml_mode(1); 1; }; #>>> like($error, qr/^Can't find '_hparser_xs_state'/); } $p->{_hparser_xs_state} = \($$x + 16); { local $@; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { $p->xml_mode(1); 1; }; #>>> like($error, qr/^Lost parser state magic/); } $p->{_hparser_xs_state} = 33; { local $@; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { $p->xml_mode(1); 1; }; #>>> like($error, qr/^_hparser_xs_state element is not a reference/); } $p->{_hparser_xs_state} = $x; ok($p->xml_mode(0)); ��������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/uentities.t����������������������������������������������������������������������000644 �000765 �000024 �00000004057 14366103441 017373� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use utf8; use HTML::Entities qw(decode_entities encode_entities); use Test::More tests => 26; # Test Unicode entities SKIP: { skip "Unicode entities not selected", 26 if !&HTML::Entities::UNICODE_SUPPORT; is(decode_entities("&euro"), "&euro"); is(decode_entities("&euro;"), "\x{20AC}"); is(decode_entities("&aring"), "å"); is(decode_entities("&aring;"), "å"); is(decode_entities("&#500000"), chr(500000)); is(decode_entities("&#x10FFFD"), "\x{10FFFD}"); is(decode_entities("&#xFFFC"), "\x{FFFC}"); is(decode_entities("&#xFDD0"), "\x{FFFD}"); is(decode_entities("&#xFDD1"), "\x{FFFD}"); is(decode_entities("&#xFDE0"), "\x{FFFD}"); is(decode_entities("&#xFDEF"), "\x{FFFD}"); is(decode_entities("&#xFFFF"), "&#xFFFF"); is(decode_entities("&#x10FFFF"), "\x{FFFD}"); is(decode_entities("&#x110000"), "&#x110000"); is(decode_entities("&#XFFFFFFFF"), "&#XFFFFFFFF"); is(decode_entities("&#0"), "&#0"); is(decode_entities("&#0;"), "&#0;"); is(decode_entities("&#x0"), "&#x0"); is(decode_entities("&#X0;"), "&#X0;"); is(decode_entities("&#&aring&#229&#229;&#xFFF"), "&#ååå\x{FFF}"); # This might fail when we get more than 64 bit UVs is(decode_entities("&#0009999999999999999999999999999;"), "&#0009999999999999999999999999999;"); is(decode_entities("&#xFFFF0000FFFF0000FFFF1"), "&#xFFFF0000FFFF0000FFFF1"); my $err; for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) { my $x = join("", map chr, $_->[0] .. $_->[1]); my $e = encode_entities($x); my $d = decode_entities($e); unless ($d eq $x) { diag "Wrong decoding in range $_->[0] .. $_->[1]"; # use Devel::Peek; Dump($x); Dump($d); $err++; } } ok(!$err); is(decode_entities("&#56256;&#56453;"), chr(0x100085)); is(decode_entities("&#56256"), chr(0xFFFD)); is(decode_entities("\260&rsquo;\260"), "\x{b0}\x{2019}\x{b0}"); } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/declaration.t��������������������������������������������������������������������000644 �000765 �000024 �00000002032 14366103441 017636� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 2; my $res = ""; sub decl { my $t = shift; $res .= "[" . join("\n", map "<$_>", @$t) . "]"; } sub text { $res .= shift; } my $p = HTML::Parser->new( declaration_h => [\&decl, "tokens"], default_h => [\&text, "text"], ); $p->parse(<<EOT)->eof; <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" --<comment>-- "http://www.w3.org/TR/html40/strict.dtd"> <!ENTITY foo "<!-- foo -->"> <!Entity foo "<!-- foo -->"> <!row --> foo EOT is($res, <<EOT); [<DOCTYPE> <HTML> <PUBLIC> <"-//W3C//DTD HTML 4.01//EN"> <--<comment>--> <"http://www.w3.org/TR/html40/strict.dtd">] [<ENTITY> <foo> <"<!-- foo -->">] [<Entity> <foo> <"<!-- foo -->">] <!row --> foo EOT $res = ""; $p->parse(<<EOT)->eof; <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[]> EOT is($res, <<EOT); [<DOCTYPE> <html> <PUBLIC> <"-//W3C//DTD XHTML 1.0 Strict//EN"> <"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <[]>] EOT ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/handler-eof.t��������������������������������������������������������������������000644 �000765 �000024 �00000002750 14366103441 017544� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 6; my $p = HTML::Parser->new(api_version => 3); $p->handler(start => sub { my $attr = shift; is($attr->{testno}, 1) }, "attr"); $p->handler(end => sub { shift->eof }, "self"); my $text; $p->handler(text => sub { $text = shift }, "text"); is($p->parse("<foo testno=1>"), $p); $text = ''; ok(!$p->parse("</foo><foo testno=999>")); ok(!$text); $p->handler(end => sub { $p->parse("foo"); }, ""); { local $@; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { $p->parse("</foo>"); 1; }; #>>> like($error, qr/^Parse loop not allowed/); } # We used to get into an infinite loop if the eof triggered # handler called ->eof $p = HTML::Parser->new(api_version => 3); my $i; $p->handler( "default" => sub { my $p = shift; #++$i; diag "$i @_"; $p->eof; }, "self, event" ); $p->parse("Foo"); $p->eof; # We used to sometimes trigger events after a handler signaled eof my $title = ''; $p = HTML::Parser->new(api_version => 3,); $p->handler(start => \&title_handler, 'tagname, self'); $p->parse("<head><title>foo</title>\n</head>"); is($title, "foo"); sub title_handler { return if shift ne 'title'; my $self = shift; $self->handler(text => sub { $title .= shift }, 'dtext'); $self->handler( end => sub { shift->eof if shift eq 'title' }, 'tagname, self' ); } ������������������������HTML-Parser-3.81/t/parser.t�������������������������������������������������������������������������000644 �000765 �000024 �00000010035 14366103441 016647� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use utf8; use FileHandle (); use HTML::Parser (); use Test::More tests => 7; my $HTML = <<'HTML'; <!DOCTYPE HTML> <body> Various entities. The parser must never break them in the middle: &#x2F &#x2F; &#200 &#3030; &#XFFFF; &aring-&Aring; <ul> <li><a href="foo 'bar' baz>" id=33>This is a link</a> <li><a href='foo "bar" baz> &aring' id=34>This is another one</a> </ul> <p><div align="center"><img src="http://www.perl.com/perl.gif" alt="camel"></div> <!-- this is a comment --> and this is not. <!-- this is the kind of >comment< -- --> that Netscape hates --> < this > was not a tag. <this is/not either> </body> HTML #------------------------------------------------------------------- { package P; require HTML::Parser; our @ISA = qw(HTML::Parser); our $OUT = ''; our $COUNT = 0; sub new { my $class = shift; my $self = $class->SUPER::new; $OUT = ''; die "Can only have one" if $COUNT++; $self; } sub DESTROY { my $self = shift; eval { $self->SUPER::DESTROY; }; $COUNT--; } sub declaration { my ($self, $decl) = @_; $OUT .= "[[$decl]]|"; } sub start { my ($self, $tag, $attr) = @_; $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr); $attr = "/$attr" if length $attr; $OUT .= "<<$tag$attr>>|"; } sub end { my ($self, $tag) = @_; $OUT .= ">>$tag<<|"; } sub comment { my ($self, $comment) = @_; $OUT .= "##$comment##|"; } sub text { my ($self, $text) = @_; #$text =~ s/\n/\\n/g; #$text =~ s/\t/\\t/g; #$text =~ s/ /�/g; $OUT .= "$text|"; } sub result { $OUT; } } my $last_res; for my $chunksize (64 * 1024, 64, 13, 3, 1, "file", "filehandle") { #for $chunksize (1) { if ($chunksize =~ /^file/) { #print "Parsing from $chunksize"; } else { #print "Parsing using $chunksize byte chunks"; } my $p = P->new; if ($chunksize =~ /^file/) { # First we must create the file my $tmpfile = "tmp-$$.html"; my $file = $tmpfile; die "$file already exists" if -e $file; open(my $fh, '>', $file) or die "Can't create $file: $!"; binmode $fh; print {$fh} $HTML; close($fh); if ($chunksize eq "filehandle") { my $fh = FileHandle->new($file) || die "Can't open $file: $!"; $file = $fh; } # then we can parse it. $p->parse_file($file); close $file if $chunksize eq "filehandle"; unlink($tmpfile) || warn "Can't unlink $tmpfile: $!"; } else { my $copy = $HTML; while (length $copy) { my $chunk = substr($copy, 0, $chunksize); substr($copy, 0, $chunksize) = ''; $p->parse($chunk); } $p->eof; } my $res = $p->result; my $bad; # Then we start looking for things that should not happen if ($res =~ /\s\|\s/) { diag "broken space"; $bad++; } for ( # Make sure entities are not broken '&#x2F', '&#x2F;', '&#200', '&#3030;', '&#XFFFF;', '&aring', '&Aring', # Some elements that should be produced "|[[DOCTYPE HTML]]|", "|## this is\na comment ##|", "|<<ul>>|\n|<<li>>|<<a/href=foo 'bar' baz>/id=33>>|", '|<<li>>|<<a/href=foo "bar" baz> å/id=34>>', "|>>ul<<|", "|>>body<<|\n\n|", ) { if ($res !~ /\Q$_\E/) { diag "Can't find '$_' in parsed document"; $bad++; } } diag $res if $bad || $ENV{PRINT_RESULTS}; # And we check that we get the same result all the time $res =~ s/\|//g; # remove all break marks if ($last_res && $res ne $last_res) { diag "The result is not the same as last time"; $bad++; } $last_res = $res; unless ($res =~ /Various entities/) { diag "Some text must be missing"; $bad++; } ok(!$bad); } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/crashme.t������������������������������������������������������������������������000644 �000765 �000024 �00000001426 14366103441 017001� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More; my $no_tests = shift || 3; plan tests => $no_tests; my $file = "junk$$.html"; die if -e $file; for (1 .. $no_tests) { open(my $junk, '>', $file) || die; for (1 .. rand(5000)) { for (1 .. rand(200)) { print {$junk} pack("N", rand(2**32)); } print {$junk} ("<", "&", ">")[rand(3)]; # make these a bit more likely } close($junk); #diag "Parse @{[-s $file]} bytes of junk"; HTML::Parser->new->parse_file($file); pass(); #print_mem(); } unlink($file); sub print_mem { # this probably only works on Linux open(my $stat, "/proc/self/status") || return; while (defined(my $line = <$stat>)) { diag $line if $line =~ /^VmSize/; } } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/argspec-bad.t��������������������������������������������������������������������000644 �000765 �000024 �00000001556 14366103441 017533� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 6; my $p = HTML::Parser->new(api_version => 3); sub test_error { my ($arg, $check_exp) = @_; my $error; { local $@; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { $p->handler(end => "end", $arg); 1 }; #>>> } like($error, $check_exp); } test_error(q(xyzzy), qr/^Unrecognized identifier xyzzy/); test_error(q(tagname text), qr/^Missing comma separator/); test_error(q(tagname, "text), qr/^Unterminated literal string/); test_error(q(tagname, "t\\t"), qr/^Backslash reserved for literal string/); test_error('"' . ("x" x 256) . '"', qr/^Literal string is longer than 255 chars/); $p->handler(end => sub { is(length(shift), 255) }, '"' . ("x" x 255) . '"'); $p->parse("</x>"); ��������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/offset.t�������������������������������������������������������������������������000644 �000765 �000024 �00000002421 14366103441 016641� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 1; my $HTML = <<'EOT'; <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html40/strict.dtd"> <foo bar baz=3>heisan </foo> <?process> <!-- comment --> <xmp>xmp</xmp> EOT my $p = HTML::Parser->new(api_version => 3); my $sum_len = 0; my $count = 0; my $err; $p->handler( default => sub { my ($offset, $length, $offset_end, $line, $col, $text) = @_; my $copy = $text; $copy =~ s/\n/\\n/g; substr($copy, 30) = "..." if length($copy) > 32; #diag sprintf ">>> %d.%d %s", $line, $col, $copy; if ($offset != $sum_len) { diag "offset mismatch $offset vs $sum_len"; $err++; } if ($offset_end != $offset + $length) { diag "offset_end $offset_end wrong"; $err++; } if ($length != length($text)) { diag "length mismatch"; $err++; } if (substr($HTML, $offset, $length) ne $text) { diag "content mismatch"; $err++; } $sum_len += $length; $count++; }, 'offset,length,offset_end,line,column,text' ); for (split(//, $HTML)) { $p->parse($_); } $p->eof; ok($count > 5 && !$err); �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/linkextor-rel.t������������������������������������������������������������������000644 �000765 �000024 �00000001603 14366103441 020153� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::LinkExtor (); use Test::More tests => 4; my $HTML = <<HTML; <head> <base href="http://www.sn.no/"> </head> <body background="http://www.sn.no/sn.gif"> This is <A HREF="link.html">link</a> and an <img SRC="img.jpg" lowsrc="img.gif" alt="Image">. HTML # Try the callback interface my $links = ""; my $p = HTML::LinkExtor->new(sub { my($tag, %links) = @_; #diag "$tag @{[%links]}"; $links .= "$tag @{[%links]}\n"; }); $p->parse($HTML); $p->eof; ok($links =~ m|^base href http://www\.sn\.no/$|m); ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m); ok($links =~ m|^a href link\.html$|m); # Used to be problems when using the links method on a document with # no links it it. This is a test to prove that it works. $p = new HTML::LinkExtor; $p->parse("this is a document with no links"); $p->eof; my @links = $p->links; is(@links, 0); �����������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/comment.t������������������������������������������������������������������������000644 �000765 �000024 �00000001465 14366103441 017024� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 1; my $p = HTML::Parser->new(api_version => 3); my @com; $p->handler( comment => sub { push(@com, shift); }, "token0" ); $p->handler( default => sub { push(@com, shift() . "[" . shift() . "]"); }, "event, text" ); $p->parse("<foo><><!><!-><!--><!---><!----><!-----><!------>"); $p->parse("<!--+--"); $p->parse("\n\n"); $p->parse(">"); $p->parse("<!a'b>"); $p->parse("<!--foo--->"); $p->parse("<!--foo---->"); $p->parse("<!--foo----->-->"); $p->parse("<foo>"); $p->parse("<!3453><!-3456><!FOO><>"); $p->eof; my $com = join(":", @com); my $res = "start_document[]:start[<foo>]:text[<>]::-:><!-::-:--:+:a'b:foo-:foo--:foo---:text[-->]:start[<foo>]:3453:-3456:FOO:text[<>]:end_document[]"; is($com, $res); �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/case-sensitive.t�����������������������������������������������������������������000644 �000765 �000024 �00000003245 14366103441 020302� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 8; my $p = HTML::Parser->new(); $p->case_sensitive(1); my $text = ""; $p->handler( start => sub { my ($tag, $attr, $attrseq) = @_; $text .= "S[$tag"; for my $k (sort keys %$attr) { my $v = $attr->{$k}; $text .= " $k=$v"; } if (@$attrseq) { $text .= " Order:"; } for my $k (@$attrseq) { $text .= " $k"; } $text .= "]"; }, "tagname,attr,attrseq" ); $p->handler( end => sub { my ($tag) = @_; $text .= "E[$tag]"; }, "tagname" ); my $html = <<'EOT'; <tAg aRg="Value" arg="other value"></tAg> EOT my $cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]'; my $ci = 'S[tag arg=Value Order: arg arg]E[tag]'; $p->parse($html)->eof; is($text, $cs); $text = ""; $p->case_sensitive(0); $p->parse($html)->eof; is($text, $ci); $text = ""; $p->case_sensitive(1); $p->xml_mode(1); $p->parse($html)->eof; is($text, $cs); $text = ""; $p->case_sensitive(0); $p->parse($html)->eof; is($text, $cs); $html = <<'EOT'; <tAg aRg="Value" arg="other value"></tAg> <iGnOrE></ignore> EOT $p->ignore_tags('ignore'); $cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]S[iGnOrE]'; $ci = 'S[tag arg=Value Order: arg arg]E[tag]'; $text = ""; $p->case_sensitive(0); $p->xml_mode(0); $p->parse($html)->eof; is($text, $ci); $text = ""; $p->case_sensitive(1); $p->xml_mode(0); $p->parse($html)->eof; is($text, $cs); $text = ""; $p->case_sensitive(0); $p->xml_mode(1); $p->parse($html)->eof; is($text, $cs); $text = ""; $p->case_sensitive(1); $p->xml_mode(1); $p->parse($html)->eof; is($text, $cs); �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/threads.t������������������������������������������������������������������������000644 �000765 �000024 �00000001254 14366103441 017010� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use Config; use HTML::Parser (); use Test::More; # Verify thread safety. BEGIN { plan(skip_all => "Not configured for threads") unless $Config{useithreads}; plan(skip_all => "Threads are not reliable on older perls") unless "$]" >= 5.010001; plan(tests => 1); } use threads; my $ok = 0; sub start { my ($tag, $attr) = @_; $ok += ($tag eq "foo"); $ok += (defined($attr->{param}) && $attr->{param} eq "bar"); } my $p = HTML::Parser->new( api_version => 3, handlers => {start => [\&start, "tagname,attr"],} ); $p->parse("<foo pa"); $ok = async { $p->parse("ram=bar>"); $ok; } ->join(); is($ok, 2); ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/xml-mode.t�����������������������������������������������������������������������000644 �000765 �000024 �00000005173 14366103441 017104� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 8; my $p = HTML::Parser->new(xml_mode => 1,); my $text = ""; $p->handler( start => sub { my ($tag, $attr) = @_; $text .= "S[$tag"; for my $k (sort keys %$attr) { my $v = $attr->{$k}; $text .= " $k=$v"; } $text .= "]"; }, "tagname,attr" ); $p->handler( end => sub { $text .= "E[" . shift() . "]"; }, "tagname" ); $p->handler( process => sub { $text .= "PI[" . shift() . "]"; }, "token0" ); $p->handler( text => sub { $text .= shift; }, "text" ); my $xml = <<'EOT'; <?xml version="1.0"?> <?IS10744:arch name="html"?><!-- comment --> <DOC> <title html="h1">My first architectual document</title> <author html="address">Geir Ove Gronmo, grove@infotek.no</author> <para>This is the first paragraph in this document</para> <para html="p">This is the second paragraph</para> <para/> <xmp><foo></foo></xmp> </DOC> EOT $p->parse($xml)->eof; is($text, <<'EOT'); PI[xml version="1.0"] PI[IS10744:arch name="html"] S[DOC] S[title html=h1]My first architectual documentE[title] S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author] S[para]This is the first paragraph in this documentE[para] S[para html=p]This is the second paragraphE[para] S[para]E[para] S[xmp]S[foo]E[foo]E[xmp] E[DOC] EOT $text = ""; $p->xml_mode(0); $p->parse($xml)->eof; is($text, <<'EOT'); PI[xml version="1.0"?] PI[IS10744:arch name="html"?] S[doc] S[title html=h1]My first architectual documentE[title] S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author] S[para]This is the first paragraph in this documentE[para] S[para html=p]This is the second paragraphE[para] S[para/] S[xmp]<foo></foo>E[xmp] E[doc] EOT # Test that we get an empty tag back $p = HTML::Parser->new(api_version => 3, xml_mode => 1); $p->handler( "end" => sub { my ($tagname, $text) = @_; is($tagname, "Xyzzy"); ok(!length($text)); }, "tagname,text" ); $p->parse("<Xyzzy foo=bar/>and some more")->eof; # Test that we get an empty tag back $p = HTML::Parser->new(api_version => 3, empty_element_tags => 1); $p->handler( "end" => sub { my ($tagname, $text) = @_; is($tagname, "xyzzy"); ok(!length($text)); }, "tagname,text" ); $p->parse("<Xyzzy foo=bar/>and some more")->eof; $p = HTML::Parser->new(api_version => 3, xml_pic => 1,); $p->handler( "process" => sub { my ($text, $t0) = @_; is($text, "<?foo > bar?>"); is($t0, "foo > bar"); }, "text, token0" ); $p->parse("<?foo > bar?> and then")->eof; �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/headparser.t���������������������������������������������������������������������000644 �000765 �000024 �00000012524 14366103441 017476� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use utf8; use HTML::HeadParser (); use Test::More tests => 17; { package H; use strict; use warnings; sub new { bless {}, shift; } sub header { my $self = shift; my $key = uc(shift); die if $key =~ /:/; my $old = $self->{$key}; if (@_) { $self->{$key} = shift; } $old; } sub push_header { my ($self, $k, $v) = @_; $k = uc($k); die if $k =~ /:/; if (exists $self->{$k}) { $self->{$k} = [$self->{$k}] unless ref $self->{$k}; push(@{$self->{$k}}, $v); } else { $self->{$k} = $v; } } sub as_string { my $self = shift; my $str = ""; for (sort keys %$self) { if (ref($self->{$_})) { my $v; for $v (@{$self->{$_}}) { $str .= "$_: $v\n"; } } else { $str .= "$_: $self->{$_}\n"; } } $str; } } my $HTML = <<'EOT'; <title>&Aring være eller &#229; ikke være</title> <meta http-equiv="Expires" content="Soon"> <meta http-equiv="Foo" content="Bar"> <meta name='twitter:card' content='photo' /> <link href="mailto:gisle@aas.no" rev=made title="Gisle Aas"> <script> ignore this </script> <noscript> ... and this </noscript> <object classid="foo"> <base href="http://www.sn.no"> <meta name="Keywords" content="test, test, test,..."> <meta name="Keywords" content="more"> <meta charset="ISO-8859-1"><!-- HTML 5 --> Dette er vanlig tekst. Denne teksten definerer også slutten på &lt;head> delen av dokumentet. <style> ignore this too </style> <isindex> Dette er også vanlig tekst som ikke skal blir parset i det hele tatt. EOT $| = 1; #$HTML::HeadParser::DEBUG = 1; my $p = HTML::HeadParser->new(H->new); if ($p->parse($HTML)) { fail("Need more data which should not happen"); } else { #diag $p->as_string; pass(); } like($p->header('Title'), qr/Å være eller å ikke være/); is($p->header('Expires'), 'Soon'); is($p->header('Content-Base'), 'http://www.sn.no'); is_deeply($p->header('X-Meta-Keywords'), ['test, test, test,...', 'more']); is($p->header('X-Meta-Charset'), 'ISO-8859-1'); is($p->header('X-Meta-Twitter-Card'), 'photo'); like($p->header('Link'), qr/<mailto:gisle\@aas.no>/); # This header should not be present because the head ended ok(!$p->header('Isindex')); # Try feeding one char at a time my $expected = $p->as_string; my $nl = 1; $p = HTML::HeadParser->new(H->new); while ($HTML =~ /(.)/sg) { #print STDERR '#' if $nl; #print STDERR $1; $nl = $1 eq "\n"; $p->parse($1) or last; } is($p->as_string, $expected); # Try reading it from a file my $file = "hptest$$.html"; die "$file already exists" if -e $file; { open(my $fh, '>', $file) or die "Can't create $file: $!"; binmode($fh); print {$fh} $HTML; print {$fh} "<p>This is more content...</p>\n" x 2000; print {$fh} "<title>Buuuh!</title>\n" x 200; close $fh or die "Can't close $file: $!"; } $p = HTML::HeadParser->new(H->new); $p->parse_file($file); unlink($file) or warn "Can't unlink $file: $!"; is($p->header("Title"), "Å være eller å ikke være"); # We got into an infinite loop on data without tags and no EOL. # This was actually a HTML::Parser bug. { open(my $fh, '>', $file) or die "Can't create $file: $!"; print {$fh} "Foo"; close($fh); } $p = HTML::HeadParser->new(H->new); $p->parse_file($file); unlink($file) or warn "Can't unlink $file: $!"; ok(!$p->as_string); SKIP: { # Test that the Unicode BOM does not confuse us? $p = HTML::HeadParser->new(H->new); ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>")); $p->eof; is($p->header("title"), "Hi <foo>"); $p = HTML::HeadParser->new(H->new); $p->utf8_mode(1); $p->parse( <<"EOT"); # example from http://rt.cpan.org/Ticket/Display.html?id=27522 \xEF\xBB\xBF<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html> <head> <title> Parkinson's disease</title> <meta name="Keywords" content="brain,disease,dopamine,drug,levodopa,parkinson,patients,symptoms,,Medications, Medications"> </meta> \t \t<link href="../../css/ummAdam.css" rel="stylesheet" type="text/css" /> \t<link rel="stylesheet" rev="stylesheet" href="../../css/ummprint.css" media="print" /> \t \t </head> <body> EOT $p->eof; is($p->header("title"), "Parkinson's disease"); is($p->header("link")->[0], '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"'); $p = HTML::HeadParser->new(H->new); $p->utf8_mode(1); $p->parse(<<"EOT"); # example from http://www.mjw.com.pl/ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r \r <head profile="http://gmpg.org/xfn/11">\r <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r \r <title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa &#8211; MJW</title>\r <link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r EOT $p->eof; is($p->header("title"), "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW" ); } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/tokeparser.t���������������������������������������������������������������������000644 �000765 �000024 �00000006432 14366103441 017540� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::TokeParser (); use Test::More tests => 17; # First we create an HTML document to test my $file = "ttest$$.htm"; die "$file already exists" if -e $file; { open(my $fh, '>', $file) or die "Can't create $file: $!"; print {$fh} <<'EOT'; <!--This is a test--> <html><head><title> This is the &lt;title&gt; </title> <base href="http://www.perl.com"> </head> <body background="bg.gif"> <h1>This is the <b>title</b> again </h1> And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl">&nbsp;<!--nice isn't it-->Institute</a> <br/><? process instruction > </body> </html> EOT close($fh); } END { unlink($file) || warn "Can't unlink $file: $!"; } my $p; $p = HTML::TokeParser->new($file) || die "Can't open $file: $!"; ok($p->unbroken_text); if ($p->get_tag("foo", "title")) { my $title = $p->get_trimmed_text; #diag "Title: $title"; is($title, "This is the <title>"); } undef($p); my $scount = 0; my $ecount = 0; my $tcount = 0; my $pcount = 0; # Test with reference to glob { open(my $fh, '<', $file) || die "Can't open $file: $!"; $p = HTML::TokeParser->new($fh); while (my $token = $p->get_token) { $scount++ if $token->[0] eq "S"; $ecount++ if $token->[0] eq "E"; $pcount++ if $token->[0] eq "PI"; } undef($p); close $fh; } # Test with glob { open(my $fh, $file) || die "Can't open $file: $!"; $p = HTML::TokeParser->new($fh); $tcount++ while $p->get_tag; undef($p); close $fh; } # Test with plain file name $p = HTML::TokeParser->new($file) || die; $tcount++ while $p->get_tag; undef($p); #diag "Number of tokens found: $tcount/2 = $scount + $ecount"; is($tcount, 34); is($scount, 10); is($ecount, 7); is($pcount, 1); is($tcount / 2, $scount + $ecount); ok(!HTML::TokeParser->new("/noT/thEre/$$")); $p = HTML::TokeParser->new($file) || die; $p->get_tag("a"); my $atext = $p->get_text; undef($p); is($atext, "Perl\240Institute"); # test parsing of embeded document $p = HTML::TokeParser->new(\<<HTML); <title>Title</title> <H1> Heading </h1> HTML ok($p->get_tag("h1")); is($p->get_trimmed_text, "Heading"); undef($p); # test parsing of large embedded documents my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022; #use Time::HiRes qw(time); my $start = time; $p = HTML::TokeParser->new(\$doc); #diag "Construction time: ", time - $start; my $count; while (my $t = $p->get_token) { $count++ if $t->[0] eq "S"; } #diag "Parse time: ", time - $start; is($count, 2022); $p = HTML::TokeParser->new(\<<'EOT'); <H1>This is a heading</H1> This is s<b>o</b>me<hr>text. <br /> This is some more text. <p> This is even some more. EOT $p->get_tag("/h1"); my $t = $p->get_trimmed_text("br", "p"); is($t, "This is some text."); $p->get_tag; $t = $p->get_trimmed_text("br", "p"); is($t, "This is some more text."); undef($p); $p = HTML::TokeParser->new(\<<'EOT'); <H1>This is a <b>bold</b> heading</H1> This is some <i>italic</i> text.<br />This is some <span id=x>more text</span>. <p> This is even some more. EOT $p->get_tag("h1"); $t = $p->get_phrase; is($t, "This is a bold heading"); $t = $p->get_phrase; is($t, ""); $p->get_tag; $t = $p->get_phrase; is($t, "This is some italic text. This is some more text."); undef($p); ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/options.t������������������������������������������������������������������������000644 �000765 �000024 �00000001333 14366103441 017047� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������ use strict; use warnings; use HTML::Parser (); use Test::More tests => 10; # Test option setting methods my $p = HTML::Parser->new(api_version => 3, xml_mode => 1); my $old; $old = $p->boolean_attribute_value("foo"); ok(!defined $old); $old = $p->boolean_attribute_value(); is($old, "foo"); $old = $p->boolean_attribute_value(undef); is($old, "foo"); ok(!defined($p->boolean_attribute_value)); ok($p->xml_mode(0)); ok(!$p->xml_mode); my $seen_buggy_comment_warning; $SIG{__WARN__} = sub { local $_ = shift; $seen_buggy_comment_warning++ if /^netscape_buggy_comment\(\) is deprecated/; }; ok(!$p->strict_comment(1)); ok($p->strict_comment); ok(!$p->netscape_buggy_comment); ok($seen_buggy_comment_warning); �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/unicode.t������������������������������������������������������������������������000644 �000765 �000024 �00000013174 14366103441 017010� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!perl -w use strict; use warnings; use utf8; use HTML::Parser (); use Test::More tests => 107; my @warn; $SIG{__WARN__} = sub { push(@warn, $_[0]); }; my @parsed; my $p = HTML::Parser->new( api_version => 3, default_h => [ \@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr' ], ); my $doc = "<title>\x{263A}</title><h1 id=\x{2600} f>Smile &#x263a</h1>\x{0420}"; is(length($doc), 46); $p->parse($doc)->eof; #use Data::Dump; Data::Dump::dump(@parsed); is(@parsed, 9); is($parsed[0][0], "start_document"); is($parsed[1][0], "start"); is($parsed[1][1], "<title>"); SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8"); } is($parsed[1][3], 0); is($parsed[1][4], 7); is($parsed[2][0], "text"); is(ord($parsed[2][1]), 0x263A); is($parsed[2][2], chr(0x263A)); is($parsed[2][3], 7); is($parsed[2][4], 1); is($parsed[2][5], 8); is($parsed[2][6], 7); is($parsed[3][0], "end"); is($parsed[3][1], "</title>"); is($parsed[3][3], 8); is($parsed[3][6], 8); is($parsed[4][0], "start"); is($parsed[4][1], "<h1 id=\x{2600} f>"); is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0"); is($parsed[4][8]{id}, "\x{2600}"); is($parsed[5][0], "text"); is($parsed[5][1], "Smile &#x263a"); is($parsed[5][2], "Smile \x{263A}"); is($parsed[7][0], "text"); is($parsed[7][1], "\x{0420}"); is($parsed[7][2], "\x{0420}"); is($parsed[8][0], "end_document"); is($parsed[8][3], length($doc)); is($parsed[8][5], length($doc)); is($parsed[8][6], length($doc)); is(@warn, 0); # Try to parse it as an UTF8 encoded string utf8::encode($doc); is(length($doc), 51); @parsed = (); $p->parse($doc)->eof; #use Data::Dump; Data::Dump::dump(@parsed); is(@parsed, 9); is($parsed[0][0], "start_document"); is($parsed[1][0], "start"); is($parsed[1][1], "<title>"); SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8"); } is($parsed[1][3], 0); is($parsed[1][4], 7); is($parsed[2][0], "text"); is(ord($parsed[2][1]), 226); is($parsed[2][1], "\xE2\x98\xBA"); is($parsed[2][2], "\xE2\x98\xBA"); is($parsed[2][3], 7); is($parsed[2][4], 3); is($parsed[2][5], 10); is($parsed[2][6], 7); is($parsed[3][0], "end"); is($parsed[3][1], "</title>"); is($parsed[3][3], 10); is($parsed[3][6], 10); is($parsed[4][0], "start"); is($parsed[4][1], "<h1 id=\xE2\x98\x80 f>"); is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0"); is($parsed[4][8]{id}, "\xE2\x98\x80"); is($parsed[5][0], "text"); is($parsed[5][1], "Smile &#x263a"); is($parsed[5][2], "Smile \x{263A}"); is($parsed[8][0], "end_document"); is($parsed[8][3], length($doc)); is($parsed[8][5], length($doc)); is($parsed[8][6], length($doc)); is(@warn, 1); like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/); my $file = "test-$$.html"; open(my $fh, ">:utf8", $file) || die; print $fh <<EOT; \x{FEFF} <title>\x{263A} Love! </title> <h1 id=&hearts;\x{2665}>&hearts; Love \x{2665}<h1> EOT close($fh) || die; @warn = (); @parsed = (); $p->parse_file($file); is(@parsed, "11"); is($parsed[6][0], "start"); is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5"); is($parsed[7][0], "text"); is($parsed[7][1], "&hearts; Love \xE2\x99\xA5"); is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage is($parsed[10][3], -s $file); is(@warn, 1); like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/); @warn = (); @parsed = (); open($fh, "<:raw:utf8", $file) || die; $p->parse_file($fh); is(@parsed, "11"); is($parsed[6][0], "start"); is($parsed[6][8]{id}, "\x{2665}\x{2665}"); is($parsed[7][0], "text"); is($parsed[7][1], "&hearts; Love \x{2665}"); is($parsed[7][2], "\x{2665} Love \x{2665}"); is($parsed[10][3], (-s $file) - 2 * 4); is(@warn, 0); @warn = (); @parsed = (); open($fh, "<:raw", $file) || die; $p->utf8_mode(1); $p->parse_file($fh); is(@parsed, "11"); is($parsed[6][0], "start"); is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5"); is($parsed[7][0], "text"); is($parsed[7][1], "&hearts; Love \xE2\x99\xA5"); is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5"); is($parsed[10][3], -s $file); is(@warn, 0); unlink($file); @parsed = (); $p->parse(q(<a href="a=1&lang=2&times=3">foo</a>))->eof; is(@parsed, "5"); is($parsed[1][0], "start"); is($parsed[1][8]{href}, "a=1&lang=2\xC3\x97=3"); ok(!HTML::Entities::_probably_utf8_chunk("")); ok(!HTML::Entities::_probably_utf8_chunk("f")); ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5")); ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o")); ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2")); ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99")); ok(!HTML::Entities::_probably_utf8_chunk("f\xE2")); ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99")); $p = HTML::Parser->new( api_version => 3, default_h => [\@parsed, 'event, text, tag, attr'], attr_encoded => 1, ); @warn = (); @parsed = (); $p->parse($doc)->eof; ok(!@warn); is(@parsed, 9); @parsed = (); $p = HTML::Parser->new( api_version => 3, utf8_mode => 1, unbroken_text => 1, default_h => [\@parsed, 'event,dtext'], ); $p->parse( "<p>R\xC3\xA9ductions jusqu'&agrave; -70%.<p>R&eacute;ductions jusqu'&agrave; -70%." ); $p->eof; is($parsed[2][1], "R\xC3\xA9ductions jusqu'\xC3\xA0 -70%."); is($parsed[4][1], "R\xC3\xA9ductions jusqu'\xC3\xA0 -70%."); ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/marked-sect.t��������������������������������������������������������������������000644 �000765 �000024 �00000005500 14366103441 017553� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use utf8; use HTML::Parser (); use Test::More tests => 14; my $tag; my $text; my $p = HTML::Parser->new( start_h => [sub { $tag = shift }, "tagname"], text_h => [sub { $text .= shift }, "dtext"], ); my $error; { local $@; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { $p->marked_sections(1); 1; }; #>>> } SKIP: { skip $error, 14 if $error; $p->parse("<![[foo]]>"); is($text, "foo"); $p->parse("<![TEMP INCLUDE[bar]]>"); is($text, "foobar"); $p->parse("<![ INCLUDE -- IGNORE -- [foo<![IGNORE[bar]]>]]>\n<br>"); is($text, "foobarfoo\n"); $text = ""; $p->parse("<![ CDATA [&lt;foo"); $p->parse("<![IGNORE[bar]]>,bar&gt;]]><br>"); is($text, "&lt;foo<![IGNORE[bar,bar>]]>"); $text = ""; $p->parse("<![ RCDATA [&aring;<a>]]><![CDATA[&aring;<a>]]>&aring;<a><br>"); is($text, "å<a>&aring;<a>å"); is($tag, "br"); $text = ""; $p->parse("<![INCLUDE RCDATA CDATA IGNORE [foo&aring;<a>]]><br>"); is($text, ""); $text = ""; $p->parse("<![INCLUDE RCDATA CDATA [foo&aring;<a>]]><br>"); is($text, "foo&aring;<a>"); $text = ""; $p->parse("<![INCLUDE RCDATA [foo&aring;<a>]]><br>"); is($text, "fooå<a>"); $text = ""; $p->parse("<![INCLUDE [foo&aring;<a>]]><br>"); is($text, "fooå"); $text = ""; $p->parse("<![[foo&aring;<a>]]><br>"); is($text, "fooå"); # offsets/line/column numbers $p = HTML::Parser->new( default_h => [\&x, "line,column,offset,event,text"], marked_sections => 1, ); $p->parse(<<'EOT')->eof; <title>Test</title> <![CDATA [foo&aring;<a> ]]> <![[ INCLUDE STUFF ]]> <h1>Test</h1> EOT my @x; sub x { my ($line, $col, $offset, $event, $text) = @_; $text =~ s/\n/\\n/g; $text =~ s/ /./g; push(@x, "$line.$col:$offset $event \"$text\"\n"); } #diag @x; is(join("", @x), <<'EOT'); 1.0:0 start_document "" 1.0:0 start "<title>" 1.7:7 text "Test" 1.11:11 end "</title>" 1.19:19 text "\n" 3.3:32 text "foo&aring;<a>\n" 4.3:49 text "\n" 5.4:54 text "\nINCLUDE\nSTUFF\n" 8.3:72 text "\n.." 9.2:75 start "<h1>" 9.6:79 text "Test" 9.10:83 end "</h1>" 9.15:88 text "\n" 10.0:89 end_document "" EOT my $doc = "<Tag><![CDATA[This is cdata]]></Tag>"; my $result = ""; $p = HTML::Parser->new( marked_sections => 1, handlers => { default => [sub { $result .= join("", @_); }, "skipped_text,text"] } )->parse($doc)->eof; is($doc, $result); $text = ""; $p = HTML::Parser->new( text_h => [sub { $text .= shift }, "dtext"], marked_sections => 1, ); $p->parse("<![CDATA[foo [1]]]>"); is($text, "foo [1]", "CDATA text ending in square bracket"); } # SKIP ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/largetags.t����������������������������������������������������������������������000644 �000765 �000024 �00000001315 14366103441 017325� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 2; # Exercise the tokenpos buffer allocation routines by feeding it # very large tags. my $p = HTML::Parser->new(api_version => 3); $p->handler( "start" => sub { my $tp = shift; #diag int(@$tp), " - ", join(", ", @$tp); is(@$tp, 2 + 26 * 6 * 4); }, "tokenpos" ); $p->handler( "declaration" => sub { my $t = shift; #diag int(@$t), " - @$t"; is(@$t, 26 * 6 * 2 + 1); }, "tokens" ); $p->parse("<a "); for ("aa" .. "fz") { $p->parse("$_=1 "); } $p->parse(">"); $p->parse("<!DOCTYPE "); for ("aa" .. "fz") { $p->parse("$_ -- $_ -- "); } $p->parse(">"); $p->eof; �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/entities.t�����������������������������������������������������������������������000644 �000765 �000024 �00000022777 14366103441 017217� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use utf8; use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric); use Test::More tests => 20; my $x = "V&aring;re norske tegn b&oslash;r &#230res"; decode_entities($x); is($x, "Våre norske tegn bør æres"); encode_entities($x); is($x, "V&aring;re norske tegn b&oslash;r &aelig;res"); decode_entities($x); encode_entities_numeric($x); is($x, "V&#xE5;re norske tegn b&#xF8;r &#xE6;res"); $x = "<&>\"'"; is(encode_entities($x), "&lt;&amp;&gt;&quot;&#39;"); is(encode_entities_numeric($x), "&#x3C;&#x26;&#x3E;&#x22;&#x27;"); $x = "abcdef"; is(encode_entities($x, 'a-c'), "&#97;&#98;&#99;def"); $x = "[24/7]\\"; is(encode_entities($x, '/'), "[24&#47;7]\\"); is(encode_entities($x, '\\/'), "[24&#47;7]\\"); is(encode_entities($x, '\\'), "[24/7]&#92;"); is(encode_entities($x, ']\\'), "[24/7&#93;&#92;"); # See how well it does against rfc1866... my $ent = ''; my $plain = ''; while (<DATA>) { next unless /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/; $ent .= "&$1;"; $plain .= chr($2); } $x = $ent; decode_entities($x); is($x, $plain); # Try decoding when the ";" are left out $x = $ent; $x =~ s/;//g; decode_entities($x); is($x, $plain); $x = $plain; encode_entities($x); is($x, $ent); #RT #84144 - https://rt.cpan.org/Public/Bug/Display.html?id=84144 { my %hash = ("V&aring;re norske tegn b&oslash;r &#230res" => "Våre norske tegn bør æres",); local $@; my $got; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { $got = decode_entities((keys %hash)[0]); 1; }; #>>> ok(!$error, "decode_entitites() when processing a key as input"); is($got, (values %hash)[0], "decode_entities() decodes a key properly"); } # From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au> # Subject: HTML entities problem with 5.11 # To: libwww-perl@ics.uci.edu # Date: Fri, 05 Sep 1997 16:56:55 +1000 # Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU> # # Hi. I've got a problem that has surfaced with the changes to # HTML::Entities.pm for 5.11 (it doesn't happen with 5.08). It's happening # in the process of encoding then decoding special entities. Eg, what goes # in as "abc&def&ghi" comes out as "abc&def;&ghi;". is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;"); # Decoding of &apos; is(decode_entities("&apos;"), "'"); is(encode_entities("'", "'"), "&#39;"); is( decode_entities( "Attention Home&#959&#969n&#1257rs...1&#1109t T&#1110&#1084e E&#957&#1257&#1075" ), "Attention Home\x{3BF}\x{3C9}n\x{4E9}rs...1\x{455}t T\x{456}\x{43C}e E\x{3BD}\x{4E9}\x{433}" ); is(decode_entities("{&#38;amp;&#x26;amp;&amp; also &#x42f;&#339;}"), "{&amp;&amp;& also \x{42F}\x{153}}"); __END__ # Quoted from rfc1866.txt 14. Proposed Entities The HTML DTD references the "Added Latin 1" entity set, which only supplies named entities for a subset of the non-ASCII characters in [ISO-8859-1], namely the accented characters. The following entities should be supported so that all ISO 8859-1 characters may only be referenced symbolically. The names for these entities are taken from the appendixes of [SGML]. <!ENTITY nbsp CDATA "&#160;" -- no-break space --> <!ENTITY iexcl CDATA "&#161;" -- inverted exclamation mark --> <!ENTITY cent CDATA "&#162;" -- cent sign --> <!ENTITY pound CDATA "&#163;" -- pound sterling sign --> <!ENTITY curren CDATA "&#164;" -- general currency sign --> <!ENTITY yen CDATA "&#165;" -- yen sign --> <!ENTITY brvbar CDATA "&#166;" -- broken (vertical) bar --> <!ENTITY sect CDATA "&#167;" -- section sign --> <!ENTITY uml CDATA "&#168;" -- umlaut (dieresis) --> <!ENTITY copy CDATA "&#169;" -- copyright sign --> <!ENTITY ordf CDATA "&#170;" -- ordinal indicator, feminine --> <!ENTITY laquo CDATA "&#171;" -- angle quotation mark, left --> <!ENTITY not CDATA "&#172;" -- not sign --> <!ENTITY shy CDATA "&#173;" -- soft hyphen --> <!ENTITY reg CDATA "&#174;" -- registered sign --> <!ENTITY macr CDATA "&#175;" -- macron --> <!ENTITY deg CDATA "&#176;" -- degree sign --> <!ENTITY plusmn CDATA "&#177;" -- plus-or-minus sign --> <!ENTITY sup2 CDATA "&#178;" -- superscript two --> <!ENTITY sup3 CDATA "&#179;" -- superscript three --> <!ENTITY acute CDATA "&#180;" -- acute accent --> <!ENTITY micro CDATA "&#181;" -- micro sign --> <!ENTITY para CDATA "&#182;" -- pilcrow (paragraph sign) --> <!ENTITY middot CDATA "&#183;" -- middle dot --> <!ENTITY cedil CDATA "&#184;" -- cedilla --> <!ENTITY sup1 CDATA "&#185;" -- superscript one --> <!ENTITY ordm CDATA "&#186;" -- ordinal indicator, masculine --> <!ENTITY raquo CDATA "&#187;" -- angle quotation mark, right --> <!ENTITY frac14 CDATA "&#188;" -- fraction one-quarter --> <!ENTITY frac12 CDATA "&#189;" -- fraction one-half --> <!ENTITY frac34 CDATA "&#190;" -- fraction three-quarters --> <!ENTITY iquest CDATA "&#191;" -- inverted question mark --> <!ENTITY Agrave CDATA "&#192;" -- capital A, grave accent --> <!ENTITY Aacute CDATA "&#193;" -- capital A, acute accent --> <!ENTITY Acirc CDATA "&#194;" -- capital A, circumflex accent --> Berners-Lee & Connolly Standards Track [Page 75] RFC 1866 Hypertext Markup Language - 2.0 November 1995 <!ENTITY Atilde CDATA "&#195;" -- capital A, tilde --> <!ENTITY Auml CDATA "&#196;" -- capital A, dieresis or umlaut mark --> <!ENTITY Aring CDATA "&#197;" -- capital A, ring --> <!ENTITY AElig CDATA "&#198;" -- capital AE diphthong (ligature) --> <!ENTITY Ccedil CDATA "&#199;" -- capital C, cedilla --> <!ENTITY Egrave CDATA "&#200;" -- capital E, grave accent --> <!ENTITY Eacute CDATA "&#201;" -- capital E, acute accent --> <!ENTITY Ecirc CDATA "&#202;" -- capital E, circumflex accent --> <!ENTITY Euml CDATA "&#203;" -- capital E, dieresis or umlaut mark --> <!ENTITY Igrave CDATA "&#204;" -- capital I, grave accent --> <!ENTITY Iacute CDATA "&#205;" -- capital I, acute accent --> <!ENTITY Icirc CDATA "&#206;" -- capital I, circumflex accent --> <!ENTITY Iuml CDATA "&#207;" -- capital I, dieresis or umlaut mark --> <!ENTITY ETH CDATA "&#208;" -- capital Eth, Icelandic --> <!ENTITY Ntilde CDATA "&#209;" -- capital N, tilde --> <!ENTITY Ograve CDATA "&#210;" -- capital O, grave accent --> <!ENTITY Oacute CDATA "&#211;" -- capital O, acute accent --> <!ENTITY Ocirc CDATA "&#212;" -- capital O, circumflex accent --> <!ENTITY Otilde CDATA "&#213;" -- capital O, tilde --> <!ENTITY Ouml CDATA "&#214;" -- capital O, dieresis or umlaut mark --> <!ENTITY times CDATA "&#215;" -- multiply sign --> <!ENTITY Oslash CDATA "&#216;" -- capital O, slash --> <!ENTITY Ugrave CDATA "&#217;" -- capital U, grave accent --> <!ENTITY Uacute CDATA "&#218;" -- capital U, acute accent --> <!ENTITY Ucirc CDATA "&#219;" -- capital U, circumflex accent --> <!ENTITY Uuml CDATA "&#220;" -- capital U, dieresis or umlaut mark --> <!ENTITY Yacute CDATA "&#221;" -- capital Y, acute accent --> <!ENTITY THORN CDATA "&#222;" -- capital THORN, Icelandic --> <!ENTITY szlig CDATA "&#223;" -- small sharp s, German (sz ligature) --> <!ENTITY agrave CDATA "&#224;" -- small a, grave accent --> <!ENTITY aacute CDATA "&#225;" -- small a, acute accent --> <!ENTITY acirc CDATA "&#226;" -- small a, circumflex accent --> <!ENTITY atilde CDATA "&#227;" -- small a, tilde --> <!ENTITY auml CDATA "&#228;" -- small a, dieresis or umlaut mark --> <!ENTITY aring CDATA "&#229;" -- small a, ring --> <!ENTITY aelig CDATA "&#230;" -- small ae diphthong (ligature) --> <!ENTITY ccedil CDATA "&#231;" -- small c, cedilla --> <!ENTITY egrave CDATA "&#232;" -- small e, grave accent --> <!ENTITY eacute CDATA "&#233;" -- small e, acute accent --> <!ENTITY ecirc CDATA "&#234;" -- small e, circumflex accent --> <!ENTITY euml CDATA "&#235;" -- small e, dieresis or umlaut mark --> <!ENTITY igrave CDATA "&#236;" -- small i, grave accent --> <!ENTITY iacute CDATA "&#237;" -- small i, acute accent --> <!ENTITY icirc CDATA "&#238;" -- small i, circumflex accent --> <!ENTITY iuml CDATA "&#239;" -- small i, dieresis or umlaut mark --> <!ENTITY eth CDATA "&#240;" -- small eth, Icelandic --> <!ENTITY ntilde CDATA "&#241;" -- small n, tilde --> <!ENTITY ograve CDATA "&#242;" -- small o, grave accent --> Berners-Lee & Connolly Standards Track [Page 76] RFC 1866 Hypertext Markup Language - 2.0 November 1995 <!ENTITY oacute CDATA "&#243;" -- small o, acute accent --> <!ENTITY ocirc CDATA "&#244;" -- small o, circumflex accent --> <!ENTITY otilde CDATA "&#245;" -- small o, tilde --> <!ENTITY ouml CDATA "&#246;" -- small o, dieresis or umlaut mark --> <!ENTITY divide CDATA "&#247;" -- divide sign --> <!ENTITY oslash CDATA "&#248;" -- small o, slash --> <!ENTITY ugrave CDATA "&#249;" -- small u, grave accent --> <!ENTITY uacute CDATA "&#250;" -- small u, acute accent --> <!ENTITY ucirc CDATA "&#251;" -- small u, circumflex accent --> <!ENTITY uuml CDATA "&#252;" -- small u, dieresis or umlaut mark --> <!ENTITY yacute CDATA "&#253;" -- small y, acute accent --> <!ENTITY thorn CDATA "&#254;" -- small thorn, Icelandic --> <!ENTITY yuml CDATA "&#255;" -- small y, dieresis or umlaut mark --> �HTML-Parser-3.81/t/linkextor-base.t�����������������������������������������������������������������000644 �000765 �000024 �00000001610 14366103441 020301� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::LinkExtor (); use URI (); use Test::More tests => 5; # This test that HTML::LinkExtor really absolutize links correctly # when a base URL is given to the constructor. # Try with base URL and the $p->links interface. my $p = HTML::LinkExtor->new(undef, "http://www.sn.no/foo/foo.html"); $p->parse(<<HTML)->eof; <head> <base href="http://www.sn.no/"> </head> <body> <form action="/post_here"> </form> This is <A HREF="link.html">link</a> and an <img SRC="img.jpg" lowsrc="img.gif" alt="Image">. HTML my @links = $p->links; # There should be 4 links in the document is(@links, 4); my $t; my %attr; for (@links) { ($t, %attr) = @$_ if $_->[0] eq 'img'; } is($t, 'img'); is(delete $attr{src}, "http://www.sn.no/foo/img.jpg"); is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif"); # there should be no more attributes ok(!scalar(keys %attr)); ������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/skipped-text.t�������������������������������������������������������������������000644 �000765 �000024 �00000003324 14366103441 017777� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use Test::More tests => 4; use strict; use HTML::Parser; my $p = HTML::Parser->new(api_version => 3); $p->report_tags("a"); my @doc; $p->handler(start => \&a_handler, "skipped_text, text"); $p->handler(end_document => \@doc, '@{skipped_text}'); $p->parse(<<EOT)->eof; <title>hi</title> <h1><a href="foo">link</a></h1> and <a foo="">some</a> text. EOT sub a_handler { push(@doc, shift); my $text = shift; push(@doc, uc($text)); } is(join("", @doc), <<'EOT'); <title>hi</title> <h1><A HREF="FOO">link</a></h1> and <A FOO="">some</a> text. EOT # # Comment stripper. Interaction with "" handlers. # my $doc = <<EOT; <html>text</html> <!-- comment --> and some more <b>text</b>. EOT (my $expected = $doc) =~ s/<!--.*?-->//; $p = HTML::Parser->new(api_version => 3); $p->handler(comment => ""); $p->handler( end_document => sub { my $stripped = shift; #diag $stripped; is($stripped, $expected); }, "skipped_text" ); for (split(//, $doc)) { $p->parse($_); } $p->eof; # # Interaction with unbroken text # my @x; $p = HTML::Parser->new(api_version => 3, unbroken_text => 1); $p->handler(text => \@x, '@{"X", skipped_text, text}'); $p->handler(end => ""); $p->handler(end_document => \@x, '@{"Y", skipped_text}'); $doc = "a a<a>b b</a>c c<x>d d</x>e"; for (split(//, $doc)) { $p->parse($_); } $p->eof; #diag join(":", @x); is(join(":", @x), "X::a a:X:<a>:b bc c:X:<x>:d de:Y:"); # # The crash that Chip found # my $skipped; $p = HTML::Parser->new( ignore_tags => ["foo"], start_h => [sub { $skipped = shift }, "skipped_text"], ); $p->parse("\x{100}<foo>"); $p->parse("plain"); $p->parse("<bar>"); $p->eof; is($skipped, "\x{100}<foo>plain"); ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/document.t�����������������������������������������������������������������������000644 �000765 �000024 �00000001376 14366103441 017201� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use File::Spec (); use HTML::Parser (); use Test::More tests => 6; my $events; my $p = HTML::Parser->new(default_h => [sub { $events .= "$_[0]\n"; }, "event"]); $events = ""; $p->eof; is($events, "start_document\nend_document\n"); $events = ""; $p->parse_file(File::Spec->devnull); is($events, "start_document\nend_document\n"); $events = ""; $p->parse(""); $p->eof; is($events, "start_document\nend_document\n"); $events = ""; $p->parse(""); $p->parse(""); $p->eof; is($events, "start_document\nend_document\n"); $events = ""; $p->parse(""); $p->parse("<a>"); $p->eof; is($events, "start_document\nstart\nend_document\n"); $events = ""; $p->parse("<a> "); $p->eof; is($events, "start_document\nstart\ntext\nend_document\n"); ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/api_version.t��������������������������������������������������������������������000644 �000765 �000024 �00000001102 14366103441 017664� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 4; my $p = HTML::Parser->new(api_version => 3); ok(!$p->handler("start"), "API version 3"); my $error; my $success = 0; { local $@; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { my $p = HTML::Parser->new(api_version => 4); $success = 1; }; #>>> } like($error, qr/^API version 4 not supported/, 'API v4 error'); ok(!$success, "!API version 4"); $p = HTML::Parser->new(api_version => 2); is($p->handler("start"), "start", "API version 2"); ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/msie-compat.t��������������������������������������������������������������������000644 �000765 �000024 �00000003053 14366103441 017573� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 4; my $TEXT = ""; sub h { my ($event, $tagname, $text, @attr) = @_; for ($event, $tagname, $text, @attr) { if (defined) { s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge; } else { $_ = "<undef>"; } } $TEXT .= "[$event,$tagname,$text," . join(":", @attr) . "]\n"; } my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text,\@attr"]); $p->parse("<a>"); $p->parse("</a f>"); $p->parse("</a 'foo<>' 'bar>' x>"); $p->parse("</a \"foo<>\""); $p->parse(" \"bar>\" x>"); $p->parse("</ foo bar>"); $p->parse("</ \"<>\" >"); $p->parse("<!--comment>text<!--comment><p"); $p->eof; is($TEXT, <<'EOT'); [start_document,<undef>,,] [start,a,<a>,] [end,a,</a f>,] [end,a,</a 'foo<>' 'bar>' x>,] [end,a,</a "foo<>" "bar>" x>,] [comment, foo bar,</ foo bar>,] [comment, "<>" ,</ "<>" >,] [comment,comment,<!--comment>,] [text,<undef>,text,] [comment,comment,<!--comment>,] [comment,p,<p,] [end_document,<undef>,,] EOT $TEXT = ""; $p->parse("<!comment>"); $p->eof; is($TEXT, <<'EOT'); [start_document,<undef>,,] [comment,comment,<!comment>,] [end_document,<undef>,,] EOT $TEXT = ""; $p->parse(q(<a name=`foo bar`>)); $p->eof; is($TEXT, <<'EOT'); [start_document,<undef>,,] [start,a,<a name=`foo bar`>,name:`foo:bar`:bar`] [end_document,<undef>,,] EOT $p->backquote(1); $TEXT = ""; $p->parse(q(<a name=`foo bar`>)); $p->eof; is($TEXT, <<'EOT'); [start_document,<undef>,,] [start,a,<a name=`foo bar`>,name:foo bar] [end_document,<undef>,,] EOT �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/headparser-http.t����������������������������������������������������������������000644 �000765 �000024 �00000000460 14366103441 020447� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::HeadParser (); use Test::More tests => 1; my $h; my $p = HTML::HeadParser->new($h); $p->parse(<<EOT); <title>Stupid example</title> <base href="http://www.sn.no/libwww-perl/"> Normal text starts here. EOT $h = $p->header; undef $p; is($h->title, "Stupid example"); ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/process.t������������������������������������������������������������������������000644 �000765 �000024 �00000001231 14366103441 017027� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 12; my $pi; my $orig; my $p = HTML::Parser->new( process_h => [sub { $pi = shift; $orig = shift; }, "token0,text"]); $p->parse("<a><?foo><a>"); is($pi, "foo"); is($orig, "<?foo>"); $p->parse("<a><?><a>"); is($pi, ""); is($orig, "<?>"); $p->parse( "<a><? foo ><a>" ); is($pi, "\nfoo\n"); is($orig, "<?\nfoo\n>"); for (qw(< a > < ? b a r > < a >)) { $p->parse($_); } is($pi, "bar"); is($orig, "<?bar>"); $p->xml_mode(1); $p->parse("<a><?foo>bar??><a>"); is($pi, "foo>bar?"); is($orig, "<?foo>bar??>"); $p->parse("<a><??></a>"); is($pi, ""); is($orig, "<??>"); �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/00-report-prereqs.t��������������������������������������������������������������000644 �000765 �000024 �00000013452 14366103441 020570� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!perl use strict; use warnings; # This test was generated by Dist::Zilla::Plugin::Test::ReportPrereqs 0.028 use Test::More tests => 1; use ExtUtils::MakeMaker; use File::Spec; # from $version::LAX my $lax_version_re = qr/(?: undef | (?: (?:[0-9]+) (?: \. | (?:\.[0-9]+) (?:_[0-9]+)? )? | (?:\.[0-9]+) (?:_[0-9]+)? ) | (?: v (?:[0-9]+) (?: (?:\.[0-9]+)+ (?:_[0-9]+)? )? | (?:[0-9]+)? (?:\.[0-9]+){2,} (?:_[0-9]+)? ) )/x; # hide optional CPAN::Meta modules from prereq scanner # and check if they are available my $cpan_meta = "CPAN::Meta"; my $cpan_meta_pre = "CPAN::Meta::Prereqs"; my $HAS_CPAN_META = eval "require $cpan_meta; $cpan_meta->VERSION('2.120900')" && eval "require $cpan_meta_pre"; ## no critic # Verify requirements? my $DO_VERIFY_PREREQS = 1; sub _max { my $max = shift; $max = ( $_ > $max ) ? $_ : $max for @_; return $max; } sub _merge_prereqs { my ($collector, $prereqs) = @_; # CPAN::Meta::Prereqs object if (ref $collector eq $cpan_meta_pre) { return $collector->with_merged_prereqs( CPAN::Meta::Prereqs->new( $prereqs ) ); } # Raw hashrefs for my $phase ( keys %$prereqs ) { for my $type ( keys %{ $prereqs->{$phase} } ) { for my $module ( keys %{ $prereqs->{$phase}{$type} } ) { $collector->{$phase}{$type}{$module} = $prereqs->{$phase}{$type}{$module}; } } } return $collector; } my @include = qw( ); my @exclude = qw( ); # Add static prereqs to the included modules list my $static_prereqs = do './t/00-report-prereqs.dd'; # Merge all prereqs (either with ::Prereqs or a hashref) my $full_prereqs = _merge_prereqs( ( $HAS_CPAN_META ? $cpan_meta_pre->new : {} ), $static_prereqs ); # Add dynamic prereqs to the included modules list (if we can) my ($source) = grep { -f } 'MYMETA.json', 'MYMETA.yml'; my $cpan_meta_error; if ( $source && $HAS_CPAN_META && (my $meta = eval { CPAN::Meta->load_file($source) } ) ) { $full_prereqs = _merge_prereqs($full_prereqs, $meta->prereqs); } else { $cpan_meta_error = $@; # capture error from CPAN::Meta->load_file($source) $source = 'static metadata'; } my @full_reports; my @dep_errors; my $req_hash = $HAS_CPAN_META ? $full_prereqs->as_string_hash : $full_prereqs; # Add static includes into a fake section for my $mod (@include) { $req_hash->{other}{modules}{$mod} = 0; } for my $phase ( qw(configure build test runtime develop other) ) { next unless $req_hash->{$phase}; next if ($phase eq 'develop' and not $ENV{AUTHOR_TESTING}); for my $type ( qw(requires recommends suggests conflicts modules) ) { next unless $req_hash->{$phase}{$type}; my $title = ucfirst($phase).' '.ucfirst($type); my @reports = [qw/Module Want Have/]; for my $mod ( sort keys %{ $req_hash->{$phase}{$type} } ) { next if $mod eq 'perl'; next if grep { $_ eq $mod } @exclude; my $file = $mod; $file =~ s{::}{/}g; $file .= ".pm"; my ($prefix) = grep { -e File::Spec->catfile($_, $file) } @INC; my $want = $req_hash->{$phase}{$type}{$mod}; $want = "undef" unless defined $want; $want = "any" if !$want && $want == 0; my $req_string = $want eq 'any' ? 'any version required' : "version '$want' required"; if ($prefix) { my $have = MM->parse_version( File::Spec->catfile($prefix, $file) ); $have = "undef" unless defined $have; push @reports, [$mod, $want, $have]; if ( $DO_VERIFY_PREREQS && $HAS_CPAN_META && $type eq 'requires' ) { if ( $have !~ /\A$lax_version_re\z/ ) { push @dep_errors, "$mod version '$have' cannot be parsed ($req_string)"; } elsif ( ! $full_prereqs->requirements_for( $phase, $type )->accepts_module( $mod => $have ) ) { push @dep_errors, "$mod version '$have' is not in required range '$want'"; } } } else { push @reports, [$mod, $want, "missing"]; if ( $DO_VERIFY_PREREQS && $type eq 'requires' ) { push @dep_errors, "$mod is not installed ($req_string)"; } } } if ( @reports ) { push @full_reports, "=== $title ===\n\n"; my $ml = _max( map { length $_->[0] } @reports ); my $wl = _max( map { length $_->[1] } @reports ); my $hl = _max( map { length $_->[2] } @reports ); if ($type eq 'modules') { splice @reports, 1, 0, ["-" x $ml, "", "-" x $hl]; push @full_reports, map { sprintf(" %*s %*s\n", -$ml, $_->[0], $hl, $_->[2]) } @reports; } else { splice @reports, 1, 0, ["-" x $ml, "-" x $wl, "-" x $hl]; push @full_reports, map { sprintf(" %*s %*s %*s\n", -$ml, $_->[0], $wl, $_->[1], $hl, $_->[2]) } @reports; } push @full_reports, "\n"; } } } if ( @full_reports ) { diag "\nVersions for all modules listed in $source (including optional ones):\n\n", @full_reports; } if ( $cpan_meta_error || @dep_errors ) { diag "\n*** WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING ***\n"; } if ( $cpan_meta_error ) { my ($orig_source) = grep { -f } 'MYMETA.json', 'MYMETA.yml'; diag "\nCPAN::Meta->load_file('$orig_source') failed with: $cpan_meta_error\n"; } if ( @dep_errors ) { diag join("\n", "\nThe following REQUIRED prerequisites were not satisfied:\n", @dep_errors, "\n" ); } pass('Reported prereqs'); # vim: ts=4 sts=4 sw=4 et: ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/plaintext.t����������������������������������������������������������������������000644 �000765 �000024 �00000002147 14366103441 017370� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 3; my @data; my $p = HTML::Parser->new(api_version => 3); $p->handler(default => \@data, '@{event, text, is_cdata}'); $p->parse(<<EOT)->eof; <xmp><foo></xmp>x<plaintext><foo> </plaintext> foo EOT for (@data) { $_ = "" unless defined; } my $doc = join(":", @data); #diag $doc; is( $doc, "start_document:::start:<xmp>::text:<foo>:1:end:</xmp>::text:x::start:<plaintext>::text:<foo> </plaintext> foo :1:end_document::" ); @data = (); $p->closing_plaintext('yep, emulate gecko'); $p->parse(<<EOT)->eof; <plaintext><foo> </plaintext>foo<b></b> EOT for (@data) { $_ = "" unless defined; } $doc = join(":", @data); #diag $doc; is( $doc, "start_document:::start:<plaintext>::text:<foo> :1:end:</plaintext>::text:foo::start:<b>::end:</b>::text: ::end_document::" ); @data = (); $p->closing_plaintext('yep, emulate gecko (2)'); $p->parse(<<EOT)->eof; <plaintext><foo> foo<b></b> EOT $doc = join(":", map { defined $_ ? $_ : "" } @data); is( $doc, "start_document:::start:<plaintext>::text:<foo> foo<b></b> :1:end_document::" ); �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/handler.t������������������������������������������������������������������������000644 �000765 �000024 �00000003323 14366103441 016772� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 11; { package MyParser; use strict; use warnings; require HTML::Parser; our @ISA = qw(HTML::Parser); sub foo { Test::More::is($_[1]{testno}, Test::More->builder->current_test + 1); } sub bar { Test::More::is($_[1], Test::More->builder->current_test + 1); } 1; } my $p = MyParser->new(api_version => 3); { local $@; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { $p->handler(foo => "foo", "foo"); 1; }; #>>> like($error, qr/^No handler for foo events/); } { local $@; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { $p->handler(start => "foo", "foo"); 1; }; #>>> like($error, qr/^Unrecognized identifier foo in argspec/); } my $h = $p->handler(start => "foo", "self,tagname"); ok(!defined($h)); my $x = \substr("xfoo", 1); $p->handler(start => $$x, "self,attr"); $p->parse("<a testno=4>"); $p->handler(start => \&MyParser::foo, "self,attr"); $p->parse("<a testno=5>"); $p->handler(start => "foo"); $p->parse("<a testno=6>"); $p->handler(start => "bar", "self,'7'"); $p->parse("<a>"); { local $@; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { $p->handler(start => {}, "self"); 1; }; #>>> like($error, qr/^Only code or array references allowed as handler/); } $x = []; $p->handler(start => $x); $h = $p->handler("start"); is($p->handler("start", "foo"), $x); is($p->handler("start", \&MyParser::foo, ""), "foo"); is($p->handler("start"), \&MyParser::foo); �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/00-report-prereqs.dd�������������������������������������������������������������000644 �000765 �000024 �00000006204 14366103441 020711� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������do { my $x = { 'configure' => { 'requires' => { 'ExtUtils::MakeMaker' => '6.52', 'perl' => '5.008' }, 'suggests' => { 'JSON::PP' => '2.27300' } }, 'develop' => { 'requires' => { 'Dist::Zilla' => '0', 'Dist::Zilla::Plugin::MinimumPerl' => '0', 'Dist::Zilla::PluginBundle::Starter' => 'v4.0.0', 'File::Spec' => '0', 'IO::Handle' => '0', 'IPC::Open3' => '0', 'Pod::Coverage::TrustPod' => '0', 'Test::CPAN::Changes' => '0.4', 'Test::CPAN::Meta' => '0', 'Test::CheckManifest' => '1.29', 'Test::Kwalitee' => '1.22', 'Test::More' => '0.88', 'Test::Pod' => '1.41', 'Test::Pod::Coverage' => '1.08', 'Test::Pod::Spelling::CommonMistakes' => '1.000', 'Test::Spelling' => '0.12', 'Test::Version' => '2.00' } }, 'runtime' => { 'requires' => { 'Carp' => '0', 'Exporter' => '0', 'HTML::Tagset' => '0', 'HTTP::Headers' => '0', 'IO::File' => '0', 'URI' => '0', 'URI::URL' => '0', 'XSLoader' => '0', 'perl' => '5.008', 'strict' => '0' } }, 'test' => { 'recommends' => { 'CPAN::Meta' => '2.120900' }, 'requires' => { 'Config' => '0', 'ExtUtils::MakeMaker' => '0', 'File::Spec' => '0', 'FileHandle' => '0', 'IO::File' => '0', 'SelectSaver' => '0', 'Test::More' => '0', 'URI' => '0', 'perl' => '5.008', 'strict' => '0' } } }; $x; }��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/parsefile.t����������������������������������������������������������������������000644 �000765 �000024 �00000002163 14366103441 017330� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use IO::File (); use Test::More tests => 6; my $filename = "file$$.htm"; die "$filename is already there" if -e $filename; { open(my $fh, '>', $filename) || die "Can't create $filename: $!"; print {$fh} "<title>Heisan</title>\n"; close($fh); } { package MyParser; use strict; use warnings; require HTML::Parser; our @ISA = qw(HTML::Parser); sub start { my ($self, $tag, $attr) = @_; Test::More::is($tag, "title"); } 1; } MyParser->new->parse_file($filename); open(my $fh, $filename) || die; MyParser->new->parse_file($fh); seek($fh, 0, 0) || die; MyParser->new->parse_file($fh); close($fh); my $io = IO::File->new($filename) || die; MyParser->new->parse_file($io); $io->seek(0, 0) || die; MyParser->new->parse_file(*$io); my $text = ''; $io->seek(0, 0) || die; MyParser->new( start_h => [sub { shift->eof; }, "self"], text_h => [sub { $text = shift; }, "text"] )->parse_file(*$io); ok(!$text); close($io); # needed because of bug in perl undef($io); unlink($filename) or warn "Can't unlink $filename: $!"; �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/entities2.t����������������������������������������������������������������������000644 �000765 �000024 �00000003102 14366103441 017256� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use utf8; use HTML::Entities qw(_decode_entities); use Test::More tests => 9; { local $@; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { _decode_entities("&lt;", undef); 1; }; #>>> like($error, qr/^(?:Can't inline decode readonly string|Modification of a read-only value attempted)/ ); } { local $@; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { my $x = ""; _decode_entities($x, $x); 1; }; #>>> like($error, qr/^2nd argument must be hash reference/); } { local $@; my $error; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { my $x = ""; _decode_entities($x, []); 1; }; #>>> like($error, qr/^2nd argument must be hash reference/); } my $x = "&lt;"; _decode_entities($x, undef); is($x, "&lt;"); _decode_entities($x, {"lt" => "<"}); is($x, "<"); $x = "x" x 20; my $err; for (":", ":a", "a:", "a:a", "a:a:a", "a:::a") { my $x = $_; $x =~ s/:/&a;/g; my $y = $_; $y =~ s/:/$x/g; _decode_entities($x, {"a" => $x}); if ($x ne $y) { diag "Something went wrong with '$_'"; $err++; } } ok(!$err); $x = "foo&nbsp;bar"; _decode_entities($x, \%HTML::Entities::entity2char); is($x, "foo\xA0bar"); $x = "foo&nbspbar"; _decode_entities($x, \%HTML::Entities::entity2char); is($x, "foo&nbspbar"); _decode_entities($x, \%HTML::Entities::entity2char, 1); is($x, "foo\xA0bar"); ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/stack-realloc.t������������������������������������������������������������������000644 �000765 �000024 �00000000625 14366103441 020103� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 1; # HTML-Parser 3.33 and older used to core dump on this program because # of missing SPAGAIN calls in parse() XS code. It was not prepared for # the stack to get realloced. $| = 1; my $x = HTML::Parser->new(api_version => 3); my @row; $x->handler(end => sub { push(@row, (1) x 505); 1 }, "tagname"); $x->parse("</TD>"); pass; �����������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/stack-realloc-eof.t��������������������������������������������������������������000644 �000765 �000024 �00000033602 14366103441 020653� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 2; # HTML-Parser core dumps on this because # of missing SPAGAIN calls in parse() XS code. It was not prepared for # the stack to get realloced. my $em = <<'EOF'; <html><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /></head><body style='font-size: 10pt; font-family: Verdana,Geneva,sans-serif'> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> <p><span style="font-size: 10.0pt; font-family: 'Verdana','sans-serif';">cat</span></p> </body></html> EOF sub handle_doc_end { my ($self) = @_; # We need to construct a large list and then splice it in array context, this will # cause splice to regrow the stack and mess up the stack pointer in Parser.xs's eof my @list; for (1..150) { push @list, 1; # { $_ => 1 }; } # ok(1, 'splicing'); foreach my $i (splice(@list)) { } # ok(1, 'done splicing'); } sub extract { my $markup = shift; my $parser = HTML::Parser->new( api_version => 3, handlers => { end_document => [\&handle_doc_end => 'self'] }, ); $parser->empty_element_tags(1); $parser->parse($markup); $parser->eof(); return 1; } ok(extract($em), 'first call okay'); ok(extract($em), 'second call okay'); ������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/filter.t�������������������������������������������������������������������������000644 �000765 �000024 �00000002024 14366103441 016637� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Filter; use SelectSaver; use Test::More tests => 3; my $HTML = <<EOT; <!DOCTYPE HTML> <!-- comment <h1>Foo</h1> --> <H1 >Bar</H1 > <Table><tr><td>1<td>2<td>3 <tr> </table> <?process> EOT my $tmpfile = "test-$$.htm"; die "$tmpfile already exists" if -e $tmpfile; { open(my $fh, '>', $tmpfile) or die "$!"; my $save = SelectSaver->new($fh); HTML::Filter->new->parse($HTML)->eof; close($fh); } { open(my $fh, '<', $tmpfile) or die "$!"; local $/; my $FILTERED = <$fh>; close($fh); #print $FILTERED; is($FILTERED, $HTML); } { package MyFilter; use strict; use warnings; require HTML::Filter; our @ISA = qw(HTML::Filter); sub comment { } sub output { push(@{$_[0]->{fhtml}}, $_[1]) } sub filtered_html { join("", @{$_[0]->{fhtml}}) } 1; } my $f2 = MyFilter->new->parse_file($tmpfile)->filtered_html; unlink($tmpfile) or warn "Can't unlink $tmpfile: $!"; #diag $f2; unlike($f2, qr/Foo/); like($f2, qr/Bar/); ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/attr-encoded.t�������������������������������������������������������������������000644 �000765 �000024 �00000001124 14366103441 017723� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 2; my $p = HTML::Parser->new(); $p->attr_encoded(1); my $text = ""; $p->handler( start => sub { my ($tag, $attr) = @_; $text .= "S[$tag"; for my $k (sort keys %$attr) { my $v = $attr->{$k}; $text .= " $k=$v"; } $text .= "]"; }, "tagname,attr" ); my $html = <<'EOT'; <tag arg="&amp;&lt;&gt"> EOT $p->parse($html)->eof; is($text, 'S[tag arg=&amp;&lt;&gt]'); $text = ""; $p->attr_encoded(0); $p->parse($html)->eof; is($text, 'S[tag arg=&<>]'); ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/callback.t�����������������������������������������������������������������������000644 �000765 �000024 �00000002066 14366103441 017114� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 47; my @expected; my $p = HTML::Parser->new( api_version => 3, unbroken_text => 1, default_h => [\@expected, '@{event, text}'], ); my $doc = <<'EOT'; <title>Hi</title> <h1>Ho ho</h1> <--comment-> EOT $p->parse($doc)->eof; #use Data::Dump; Data::Dump::dump(@expected); for my $i (1 .. length($doc)) { my @t; $p->handler(default => \@t); $p->parse(chunk($doc, $i)); # check that we got the same stuff #diag "X:", join(":", @t); #diag "Y:", join(":", @expected); is(join(":", @t), join(":", @expected)); } sub chunk { my $str = shift; my $size = shift || 1; sub { my $res = substr($str, 0, $size); #diag "...$res"; substr($str, 0, $size) = ""; $res; } } # Test croking behaviour $p->handler(default => []); my $error; { local $@; #<<< do not let perltidy touch this $error = $@ || 'Error' unless eval { $p->parse(sub { die "Hi" }); 1 }; #>>> } like($error, qr/^Hi/); ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/textarea.t�����������������������������������������������������������������������000644 �000765 �000024 �00000002617 14366103441 017177� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 1; my $html = <<'EOT'; <html> <title>This is a <nice> title</title> <!--comment--> <script language="perl">while (<DATA>) { &amp; }</script> <FORM> <textarea name="foo" cols=50 rows=10> foo <foo> <!--comment--> &amp; foo </FORM> </textarea> </FORM> </html> EOT my $dump = ""; sub tdump { my @data = @_; for (@data) { $_ = "<undef>" unless defined; s/\n/\\n/g; } $dump .= join("|", @data) . "\n"; } my $p = HTML::Parser->new(default_h => [\&tdump, "event,text,dtext,is_cdata"]); $p->parse($html)->eof; #diag $dump; is($dump, <<'EOT'); start_document||<undef>|<undef> start|<html>|<undef>|<undef> text|\n|\n| start|<title>|<undef>|<undef> text|This is a <nice> title|This is a <nice> title| end|</title>|<undef>|<undef> text|\n|\n| comment|<!--comment-->|<undef>|<undef> text|\n|\n| start|<script language="perl">|<undef>|<undef> text|while (<DATA>) { &amp; }|while (<DATA>) { &amp; }|1 end|</script>|<undef>|<undef> text|\n\n|\n\n| start|<FORM>|<undef>|<undef> text|\n\n|\n\n| start|<textarea name="foo" cols=50 rows=10>|<undef>|<undef> text|\n\nfoo\n<foo>\n<!--comment-->\n&amp;\nfoo\n</FORM>\n\n|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n| end|</textarea>|<undef>|<undef> text|\n\n|\n\n| end|</FORM>|<undef>|<undef> text|\n\n|\n\n| end|</html>|<undef>|<undef> text|\n|\n| end_document||<undef>|<undef> EOT �����������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/dtext.t��������������������������������������������������������������������������000644 �000765 �000024 �00000002054 14366103441 016505� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use utf8; use HTML::Parser (); use Test::More tests => 2; my $dtext = ""; my $text = ""; sub append { $dtext .= shift; $text .= shift; } my $p = HTML::Parser->new( text_h => [\&append, "dtext, text"], default_h => [\&append, "text, text" ], ); my $doc = <<'EOT'; <title>&aring</title> <a href="foo&aring">&aring&aring;&#65&#65;&lt&#65&gt;&#x41&#X41;</a> <?&aring> foo&nbsp;bar foo&nbspbar &xyzzy &xyzzy; <!-- &#0; --> &#1; &#255; &#xFF &#xFFG <!-- &#256; --> &#40000000000000000000000000000; &#x400000000000000000000000000000000; & &# &#x <xmp>&aring</xmp> <script>&aring</script> <ScRIPT>&aring</scRIPT> <skript>&aring</script> EOT $p->parse($doc)->eof; is($text, $doc); is($dtext, <<"EOT"); <title>å</title> <a href="foo&aring">ååAA<A>AA</a> <?&aring> foo\240bar foo\240bar &xyzzy &xyzzy; <!-- &#0; --> \1 \377 \377 \377G <!-- &#256; --> &#40000000000000000000000000000; &#x400000000000000000000000000000000; & &# &#x <xmp>&aring</xmp> <script>&aring</script> <ScRIPT>&aring</scRIPT> <skript>å</script> EOT ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/argspec.t������������������������������������������������������������������������000644 �000765 �000024 �00000010512 14366103441 016777� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More; my $decl = '<!ENTITY nbsp CDATA "&#160;" -- no-break space -->'; my $com1 = '<!-- Comment -->'; my $com2 = '<!-- Comment -- -- Comment -->'; my $start = '<a href="foo">'; my $end = '</a>'; my $empty = "<IMG SRC='foo'/>"; my $proc = '<? something completely different ?>'; my @argspec = qw( self offset length event tagname tag token0 text is_cdata dtext tokens tokenpos attr attrseq ); my @result; my $p = HTML::Parser->new( default_h => [\@result, join(',', @argspec)], strict_comment => 1, xml_mode => 1 ); my @tests = ( # string, expected results $decl => [ [ $p, 0, 52, 'declaration', 'ENTITY', '!ENTITY', 'ENTITY', '<!ENTITY nbsp CDATA "&#160;" -- no-break space -->', undef, undef, ['ENTITY', 'nbsp', 'CDATA', '"&#160;"', '-- no-break space --'], [2, 6, 9, 4, 16, 5, 22, 8, 31, 20], undef, undef ] ], $com1 => [ [ $p, 0, 16, 'comment', ' Comment ', '# Comment ', ' Comment ', '<!-- Comment -->', undef, undef, [' Comment '], [4, 9], undef, undef ] ], $com2 => [ [ $p, 0, 30, 'comment', ' Comment ', '# Comment ', ' Comment ', '<!-- Comment -- -- Comment -->', undef, undef, [' Comment ', ' Comment '], [4, 9, 18, 9], undef, undef ] ], $start => [ [ $p, 0, 14, 'start', 'a', 'a', 'a', '<a href="foo">', undef, undef, ['a', 'href', '"foo"'], [1, 1, 3, 4, 8, 5], {'href', 'foo'}, ['href'] ] ], $end => [ [ $p, 0, 4, 'end', 'a', '/a', 'a', '</a>', undef, undef, ['a'], [2, 1], undef, undef ] ], $empty => [ [ $p, 0, 16, 'start', 'IMG', 'IMG', 'IMG', "<IMG SRC='foo'/>", undef, undef, ['IMG', 'SRC', "'foo'"], [1, 3, 5, 3, 9, 5], {'SRC', 'foo'}, ['SRC'] ], [ $p, 16, 0, 'end', 'IMG', '/IMG', 'IMG', '', undef, undef, ['IMG'], undef, undef, undef ], ], $proc => [ [ $p, 0, 36, 'process', ' something completely different ', '? something completely different ', ' something completely different ', '<? something completely different ?>', undef, undef, [' something completely different '], [2, 32], undef, undef ] ], "$end\n$end" => [ [ $p, 0, 4, 'end', 'a', '/a', 'a', '</a>', undef, undef, ['a'], [2, 1], undef, undef ], [ $p, 4, 1, 'text', undef, undef, undef, "\n", '', "\n", undef, undef, undef, undef ], [ $p, 5, 4, 'end', 'a', '/a', 'a', '</a>', undef, undef, ['a'], [2, 1], undef, undef ] ], ); plan tests => @tests / 2; sub string_tag { my (@pieces) = @_; my $part; foreach $part (@pieces) { if (!defined $part) { $part = 'undef'; } elsif (!ref $part) { $part = "'$part'" if $part !~ /^\d+$/; } elsif ('ARRAY' eq ref $part) { $part = '[' . join(', ', string_tag(@$part)) . ']'; } elsif ('HASH' eq ref $part) { $part = '{' . join(',', string_tag(%$part)) . '}'; } else { $part = '<' . ref($part) . '>'; } } return join(", ", @pieces); } my $i = 0; TEST: while (@tests) { my ($html, $expected) = splice @tests, 0, 2; ++$i; @result = (); $p->parse($html)->eof; shift(@result) if $result[0][3] eq "start_document"; pop(@result) if $result[-1][3] eq "end_document"; # Compare results for each element expected foreach (@$expected) { my $want = string_tag($_); my $got = string_tag(shift @result); if ($want ne $got) { is($want, $got); next TEST; } } pass; } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/default.t������������������������������������������������������������������������000644 �000765 �000024 �00000001403 14366103441 016776� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use HTML::Parser (); use Test::More tests => 3; my $text = ""; my $p = HTML::Parser->new(default_h => [sub { $text .= shift }, "text"],); my $html = <<'EOT'; <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html40/strict.dtd"> <title>foo</title> <!-- comment <a> --> <?process instruction> EOT $p->parse($html)->eof; is($text, $html); $text = ""; $p->handler(start => sub { }, ""); $p->handler(declaration => sub { }, ""); $p->parse($html)->eof; my $html2; $html2 = $html; $html2 =~ s/<title>//; $html2 =~ s/<!DOCTYPE[^>]*>//; is($text, $html2); $text = ""; $p->handler(start => undef); $p->parse($html)->eof; $html2 = $html; $html2 =~ s/<!DOCTYPE[^>]*>//; is($text, $html2); �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-Parser-3.81/t/cases.t��������������������������������������������������������������������������000644 �000765 �000024 �00000006462 14366103441 016462� 0����������������������������������������������������������������������������������������������������ustar�00olafalders����������������������staff���������������������������000000 �000000 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������use strict; use warnings; use utf8; use HTML::Parser (); use Test::More; my @result; { package P; use strict; use warnings; require HTML::Parser; our @ISA = qw(HTML::Parser); my $start = 0; my $end = 0; my $text = 0; my $comment = 0; my $declaration = 0; sub start { my ($self, $tag, $attr) = @_; push @result, "START[$tag]"; for (sort keys %$attr) { push @result, "\t$_: " . $attr->{$_}; } $start++; } sub end { my ($self, $tag) = @_; push @result, "END[$tag]"; $end++; } sub text { my $self = shift; push @result, "TEXT[$_[0]]"; $text++; } sub comment { my $self = shift; push @result, "COMMENT[$_[0]]"; $comment++; } sub declaration { my $self = shift; push @result, "DECLARATION[$_[0]]"; $declaration++; } 1; } my @tests = ( '<a ">' => ['START[a]', "\t\": \""], '<a/>' => ['START[a/]',], '<a />' => ['START[a]', "\t/: /"], '<a a/>' => ['START[a]', "\ta/: a/"], '<a a/=/>' => ['START[a]', "\ta/: /"], '<a x="foo&nbsp;bar">' => ['START[a]', "\tx: foo\xA0bar"], '<a x="foo&nbspbar">' => ['START[a]', "\tx: foo&nbspbar"], '<å >' => ['TEXT[<å]', 'TEXT[ >]'], '2 < 5' => ['TEXT[2 ]', 'TEXT[<]', 'TEXT[ 5]'], '2 <5> 2' => ['TEXT[2 ]', 'TEXT[<5>]', 'TEXT[ 2]'], '2 <a' => ['TEXT[2 ]', 'TEXT[<a]'], '2 <a> 2' => ['TEXT[2 ]', 'START[a]', 'TEXT[ 2]'], '2 <a href=foo' => ['TEXT[2 ]', 'TEXT[<a href=foo]'], "2 <a href='foo bar'> 2" => ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'], '2 <a href=foo bar> 2' => ['TEXT[2 ]', 'START[a]', "\tbar: bar", "\thref: foo", 'TEXT[ 2]'], '2 <a href="foo bar"> 2' => ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'], '2 <a href="foo\'bar"> 2' => ['TEXT[2 ]', 'START[a]', "\thref: foo'bar", 'TEXT[ 2]'], "2 <a href='foo\"bar'> 2" => ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'], "2 <a href='foo&quot;bar'> 2" => ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'], '2 <a.b> 2' => ['TEXT[2 ]', 'START[a.b]', 'TEXT[ 2]'], '2 <a.b-12 a.b = 2 a> 2' => ['TEXT[2 ]', 'START[a.b-12]', "\ta: a", "\ta.b: 2", 'TEXT[ 2]'], '2 <a_b> 2' => ['TEXT[2 ]', 'START[a_b]', 'TEXT[ 2]'], '<!ENTITY nbsp CDATA "&#160;" -- no-break space -->' => ['DECLARATION[ENTITY nbsp CDATA "&#160;" -- no-break space --]'], '<!-- comment -->' => ['COMMENT[ comment ]'], '<!-- comment -- --- comment -->' => ['COMMENT[ comment ]', 'COMMENT[- comment ]'], '<!-- comment <!-- not comment --> comment -->' => ['COMMENT[ comment <!]', 'COMMENT[> comment ]'], '<!-- <a href="foo"> -->' => ['COMMENT[ <a href="foo"> ]'], ); plan tests => @tests / 2; my $i = 0; TEST: while (@tests) { ++$i; my ($html, $expected) = splice @tests, 0, 2; @result = (); my $p = P->new; $p->strict_comment(1); $p->parse($html)->eof; ok(eq_array($expected, \@result)) or diag("Expected: @$expected\n", "Got: @result\n"); } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������