html5-parser-0.4.10/ 0000755 0001750 0001750 00000000000 14122552315 014371 5 ustar kovid kovid 0000000 0000000 html5-parser-0.4.10/gentags.py 0000755 0001750 0001750 00000004622 13475205144 016407 0 ustar kovid kovid 0000000 0000000 #!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import re
import subprocess
# Absolute path of this script; main() changes into the directory containing it.
self_path = os.path.abspath(__file__)
# Comment banner placed at the top of the generated header files. The ``{}``
# placeholder is filled in through ``HEADER.format(...)`` with the base name
# of the ``.in`` file the output was produced from (this script passes 'tag').
HEADER = '''\
// Do not edit
// Generated from {}.in (see gentags.py)
'''
def generate_tag_headers():
    """Emit the tag string, enum and size headers from ``gumbo/tag.in``.

    Every line of ``gumbo/tag.in`` is treated as one tag name. For each tag
    a quoted string is appended to ``gumbo/tag_strings.h``, a ``GUMBO_TAG_*``
    enumerator to ``gumbo/tag_enum.h`` and the length of the decoded name to
    ``gumbo/tag_sizes.h``. All paths are relative to the current working
    directory, and all three outputs are overwritten.
    """
    with open("gumbo/tag_strings.h", "wb") as strings_out, \
            open("gumbo/tag_enum.h", "wb") as enum_out, \
            open("gumbo/tag_sizes.h", "wb") as sizes_out, \
            open('gumbo/tag.in', 'rb') as tag_source:
        # Every generated file starts with the same "do not edit" banner.
        banner = HEADER.format('tag').encode('utf-8')
        strings_out.write(banner)
        enum_out.write(banner)
        sizes_out.write(banner)
        for raw_line in tag_source:
            name = raw_line.decode('utf-8').strip()
            # Enumerator names are upper-case with '-' mapped to '_'.
            enum_suffix = name.upper().replace('-', '_')
            strings_out.write(('"%s",\n' % name).encode('utf-8'))
            enum_out.write(('GUMBO_TAG_%s,\n' % enum_suffix).encode('utf-8'))
            sizes_out.write(('%d, ' % len(name)).encode('utf-8'))
        # All sizes share one line; terminate it once every tag is written.
        sizes_out.write(b'\n')
def generate_tag_perfect_hash(repetitions=200):
    """Generate ``gumbo/tag_perf.h`` from ``gumbo/tag.in`` using gperf.

    Runs ``gperf`` on ``gumbo/tag.in`` (relative to the current working
    directory), keeps the part of its output that precedes ``in_word_set``
    (which contains the ``tag_hash`` function), and converts gperf's
    ``wordlist`` string table into a ``kGumboTagMap`` array of ``GUMBO_TAG_*``
    enumerators, using ``GUMBO_TAG_LAST`` for empty slots. Everything in front
    of ``tag_hash`` is replaced by the "do not edit" banner and a
    ``static inline unsigned int`` return type.

    :param repetitions: Value passed to gperf's ``-m`` option.
    :raises SystemExit: If the gperf output lacks the expected
        ``in_word_set`` function, ``wordlist`` table or ``tag_hash`` function.
    :raises subprocess.CalledProcessError: If gperf exits with a non-zero
        status.
    """
    command = [
        'gperf', '-LANSI-C', '--ignore-case', '-H', 'tag_hash',
        '-m{}'.format(repetitions), 'gumbo/tag.in',
    ]
    raw = subprocess.check_output(command).decode('utf-8').splitlines()
    for i, line in enumerate(raw):
        if line.startswith('in_word_set'):
            break
    else:
        raise SystemExit('Failed to find in_word_set()')
    # Cut one line above ``in_word_set`` so that the line preceding it
    # (presumably the function's return type) is dropped from the kept part
    # together with the function itself.
    split_at = i - 1
    lines = raw[:split_at]
    tail = '\n'.join(raw[split_at:])
    # Raw string: ``\[`` and ``\s`` are regex escapes, not string escapes.
    wordlist = re.search(r'wordlist\[\]\s+=\s+\{(.*?)\}', tail, re.DOTALL)
    if wordlist is None:
        raise SystemExit('Failed to find wordlist')
    wordlist = [w.strip().replace('"', '') for w in wordlist.group(1).split(',')]
    # Empty wordlist slots become GUMBO_TAG_LAST.
    taglist = ["\tGUMBO_TAG_" + (w.upper().replace('-', '_') if w else 'LAST') for w in wordlist]
    processed = '\n'.join(lines) + '\n\n'
    processed += 'static const GumboTag kGumboTagMap[] = {\n%s\n};' % '\n,'.join(taglist)
    # The greedy ``.+`` (with DOTALL) swallows everything up to the last line
    # that starts with ``tag_hash``; that prefix is replaced by the banner and
    # the desired return type.
    processed, replaced = re.subn(
        r'.+^tag_hash',
        HEADER.format('tag') + 'static inline unsigned int\ntag_hash',
        processed,
        flags=re.DOTALL | re.MULTILINE)
    if not replaced:
        # Without this check an unexpected gperf layout would be written out
        # unmodified instead of being reported.
        raise SystemExit('Failed to find tag_hash()')
    with open('gumbo/tag_perf.h', 'wb') as f:
        f.write(processed.encode('utf-8'))
        f.write(b'\n')
def main():
    """Change into this script's directory and regenerate the tag headers."""
    # The generators use paths relative to the working directory, so anchor
    # it at the directory that contains this script first.
    script_dir = os.path.dirname(self_path)
    os.chdir(script_dir)
    generate_tag_headers()
# Script entry point: only runs when the file is executed directly.
if __name__ == '__main__':
    # main() changes into the script's directory and writes the tag headers.
    main()
    # Must come after main(): it opens 'gumbo/...' paths relative to the
    # working directory that main() has just set.
    generate_tag_perfect_hash()
html5-parser-0.4.10/src/ 0000755 0001750 0001750 00000000000 14122552315 015160 5 ustar kovid kovid 0000000 0000000 html5-parser-0.4.10/src/attr_sizes.h 0000644 0001750 0001750 00000002422 13475205503 017524 0 ustar kovid kovid 0000000 0000000 // Do not edit
// Generated by genattrs.py
13, 6, 14, 9, 10, 6, 8, 5, 18, 12, 10, 3, 9, 11, 6, 5, 13, 13, 11, 12, 9, 8, 8, 7, 13, 11, 14, 4, 5, 7, 4, 6, 8, 2, 8, 10, 9, 7, 7, 4, 5, 4, 9, 9, 13, 4, 8, 5, 19, 27, 13, 15, 4, 7, 7, 17, 16, 15, 11, 8, 6, 6, 2, 2, 1, 4, 12, 8, 10, 7, 5, 7, 15, 3, 9, 7, 8, 7, 7, 17, 8, 9, 8, 3, 2, 2, 8, 9, 17, 7, 3, 8, 25, 4, 12, 9, 6, 9, 11, 11, 13, 11, 9, 16, 12, 10, 12, 11, 3, 4, 10, 6, 2, 4, 2, 2, 2, 2, 10, 28, 26, 8, 17, 13, 7, 7, 6, 6, 4, 11, 14, 4, 8, 10, 4, 2, 11, 15, 2, 3, 9, 9, 5, 8, 1, 2, 2, 2, 2, 12, 16, 7, 9, 10, 8, 7, 4, 5, 4, 8, 12, 14, 14, 17, 4, 5, 4, 3, 8, 10, 10, 12, 12, 11, 11, 4, 16, 9, 12, 3, 9, 5, 6, 3, 9, 4, 8, 5, 4, 10, 10, 6, 7, 10, 7, 7, 5, 7, 9, 10, 6, 11, 11, 10, 11, 9, 8, 8, 8, 8, 7, 4, 8, 7, 5, 6, 11, 6, 8, 17, 18, 11, 8, 10, 7, 19, 16, 12, 4, 11, 14, 6, 9, 9, 9, 6, 7, 13, 19, 14, 1, 10, 6, 8, 4, 4, 3, 16, 11, 9, 8, 18, 16, 7, 6, 8, 6, 4, 7, 2, 2, 7, 5, 5, 6, 8, 4, 8, 5, 15, 4, 5, 5, 4, 7, 4, 16, 16, 5, 10, 12, 3, 6, 7, 6, 5, 11, 12, 5, 5, 4, 11, 10, 12, 22, 23, 6, 6, 16, 17, 14, 15, 17, 14, 12, 5, 7, 12, 14, 8, 11, 6, 7, 7, 11, 15, 14, 10, 5, 2, 9, 4, 2, 2, 18, 19, 7, 12, 13, 12, 6, 12, 9, 13, 14, 5, 6, 7, 10, 13, 13, 7, 10, 10, 5, 6, 12, 4, 12, 1, 8, 2, 2, 16, 13, 13, 10, 10, 10, 11, 10, 8, 8, 9, 1, 2, 2, 16, 1, 10,
html5-parser-0.4.10/src/attr_perf.h 0000644 0001750 0001750 00000057706 13475205503 017342 0 ustar kovid kovid 0000000 0000000 // Do not edit
// Generated by genattrs.py
/*
 * Hash the first `len` bytes of `str`.
 *
 * The result is `len` plus table contributions for selected bytes of the
 * input: byte 9 (len >= 10), byte 8 (len >= 9), byte 2 looked up at
 * index + 1 (len >= 3), byte 1 (len >= 2), byte 0, and finally the last
 * byte (str[len - 1]), which is always added.
 *
 * NOTE(review): there is no `case 0`, so len == 0 takes the `default`
 * branch and reads str[9], str[8], ... as well as str[len - 1]; callers
 * presumably guarantee len >= 1 -- confirm.
 * NOTE(review): the returned value is presumably used to index
 * HTML_ATTR_MAP -- confirm against the caller.
 */
static inline unsigned int
attr_hash (register const char *str, register size_t len)
{
/*
 * Per-byte contribution to the hash, indexed by an unsigned char value.
 * The table has 257 entries so that the `+ 1` offset used for str[2]
 * (maximum index 256) stays in bounds.
 */
static unsigned short asso_values[] =
{
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 312, 1190, 1190, 1190, 124,
127, 32, 34, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 31, 1190, 33, 1190, 32,
31, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 36, 1190,
1190, 1190, 31, 37, 31, 40, 1190, 1190, 74, 49,
32, 1190, 1190, 1190, 1190, 1190, 1190, 58, 298, 49,
35, 31, 225, 326, 175, 112, 54, 339, 308, 125,
38, 41, 66, 207, 39, 32, 34, 59, 104, 213,
264, 172, 65, 32, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190, 1190,
1190, 1190, 1190, 1190, 1190, 1190, 1190
};
/* The length itself is the starting value of the hash. */
register unsigned int hval = len;
/*
 * Dispatch on the length; every case falls through to the ones below it,
 * so longer inputs accumulate all contributions listed after their entry.
 */
switch (hval)
{
default:
hval += asso_values[(unsigned char)str[9]];
/*FALLTHROUGH*/
case 9:
hval += asso_values[(unsigned char)str[8]];
/*FALLTHROUGH*/
/* Lengths 3..8 contribute no byte beyond index 2. */
case 8:
case 7:
case 6:
case 5:
case 4:
case 3:
/* Note the `+ 1`: this byte is looked up one slot further into the table. */
hval += asso_values[(unsigned char)str[2]+1];
/*FALLTHROUGH*/
case 2:
hval += asso_values[(unsigned char)str[1]];
/*FALLTHROUGH*/
case 1:
hval += asso_values[(unsigned char)str[0]];
break;
}
/* The final byte always contributes, regardless of the length. */
return hval + asso_values[(unsigned char)str[len - 1]];
}
static const HTMLAttr HTML_ATTR_MAP[] = {
HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_D
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_R
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_TO
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_Z
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_STROKE
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_END
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_DESCENT
, HTML_ATTR_ENCTYPE
, HTML_ATTR_LAST
, HTML_ATTR_RESULT
, HTML_ATTR_RESTART
, HTML_ATTR_SRCSET
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_ONRESIZE
, HTML_ATTR_LAST
, HTML_ATTR_REQUIRED
, HTML_ATTR_ONREPEAT
, HTML_ATTR_LAST
, HTML_ATTR_ORDER
, HTML_ATTR_CODE
, HTML_ATTR_LAST
, HTML_ATTR_SRC
, HTML_ATTR_LAST
, HTML_ATTR_CODEBASE
, HTML_ATTR_SRCDOC
, HTML_ATTR_LAST
, HTML_ATTR_ADDITIVE
, HTML_ATTR_TARGET
, HTML_ATTR_ASCENT
, HTML_ATTR_RADIUS
, HTML_ATTR_STYLE
, HTML_ATTR_DUR
, HTML_ATTR_NAME
, HTML_ATTR_LAST
, HTML_ATTR_CONTROLS
, HTML_ATTR_CONTENT
, HTML_ATTR_ONBEGIN
, HTML_ATTR_ORIENT
, HTML_ATTR_LAST
, HTML_ATTR_ROTATE
, HTML_ATTR_LAST
, HTML_ATTR_ORIGIN
, HTML_ATTR_DROPZONE
, HTML_ATTR_TARGETY
, HTML_ATTR_LAST
, HTML_ATTR_ACCEPT
, HTML_ATTR_SCOPE
, HTML_ATTR_ID
, HTML_ATTR_CURSOR
, HTML_ATTR_POSTER
, HTML_ATTR_LAST
, HTML_ATTR_SCOPED
, HTML_ATTR_UNICODE
, HTML_ATTR_IN
, HTML_ATTR_DATETIME
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_COORDS
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_POINTS
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_TARGETX
, HTML_ATTR_LAST
, HTML_ATTR_DECELERATE
, HTML_ATTR_LAST
, HTML_ATTR_ASYNC
, HTML_ATTR_ACTION
, HTML_ATTR_SIZE
, HTML_ATTR_VERSION
, HTML_ATTR_SIZES
, HTML_ATTR_DATA
, HTML_ATTR_AUTOSAVE
, HTML_ATTR_LAST
, HTML_ATTR_DIRNAME
, HTML_ATTR_LAST
, HTML_ATTR_REQUIREDFEATURES
, HTML_ATTR_LAST
, HTML_ATTR_DIR
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_DISABLED
, HTML_ATTR_LAST
, HTML_ATTR_ONUNLOAD
, HTML_ATTR_LAST
, HTML_ATTR_PATTERN
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_SELECTED
, HTML_ATTR_MODE
, HTML_ATTR_STRIKETHROUGH_THICKNESS
, HTML_ATTR_POINTSATZ
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_STRIKETHROUGH_POSITION
, HTML_ATTR_STROKE_DASHOFFSET
, HTML_ATTR_NUMOCTAVES
, HTML_ATTR_TITLE
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_ONLOAD
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_ACCUMULATE
, HTML_ATTR_AUTOFOCUS
, HTML_ATTR_MEDIA
, HTML_ATTR_COLS
, HTML_ATTR_LAST
, HTML_ATTR_ISMAP
, HTML_ATTR_ONMOUSEOUT
, HTML_ATTR_CITE
, HTML_ATTR_METHOD
, HTML_ATTR_ATTRIBUTETYPE
, HTML_ATTR_ONMOUSEUP
, HTML_ATTR_COLOR
, HTML_ATTR_COLSPAN
, HTML_ATTR_STOP_COLOR
, HTML_ATTR_ATTRIBUTENAME
, HTML_ATTR_LAST
, HTML_ATTR_DIRECTION
, HTML_ATTR_LAST
, HTML_ATTR_MANIFEST
, HTML_ATTR_CONTENTSCRIPTTYPE
, HTML_ATTR_POINTSATY
, HTML_ATTR_ICON
, HTML_ATTR_CONTEXTMENU
, HTML_ATTR_CALCMODE
, HTML_ATTR_LAST
, HTML_ATTR_SURFACESCALE
, HTML_ATTR_TABLEVALUES
, HTML_ATTR_LAST
, HTML_ATTR_PRIMITIVEUNITS
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_EDGEMODE
, HTML_ATTR_LAST
, HTML_ATTR_DATA_REACTID
, HTML_ATTR_MUTED
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_INTERCEPT
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_AUTOREVERSE
, HTML_ATTR_MASKUNITS
, HTML_ATTR_LAST
, HTML_ATTR_UNICODE_RANGE
, HTML_ATTR_STDDEVIATION
, HTML_ATTR_PAINT_ORDER
, HTML_ATTR_STROKE_MITERLIMIT
, HTML_ATTR_LAST
, HTML_ATTR_PANOSE_1
, HTML_ATTR_OPTIMUM
, HTML_ATTR_LAST
, HTML_ATTR_HEIGHT
, HTML_ATTR_STROKE_LINEJOIN
, HTML_ATTR_ONMOUSEOVER
, HTML_ATTR_LAST
, HTML_ATTR_ONMOUSEMOVE
, HTML_ATTR_FR
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_SUMMARY
, HTML_ATTR_U1
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_IN2
, HTML_ATTR_PATTERNCONTENTUNITS
, HTML_ATTR_LAST
, HTML_ATTR_U2
, HTML_ATTR_ZOOMANDPAN
, HTML_ATTR_LAST
, HTML_ATTR_POINTSATX
, HTML_ATTR_MIN
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_VALUE
, HTML_ATTR_CONTENTEDITABLE
, HTML_ATTR_VALUES
, HTML_ATTR_REVERSED
, HTML_ATTR_SEED
, HTML_ATTR_STROKE_LINECAP
, HTML_ATTR_RADIOGROUP
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_ACCENT_HEIGHT
, HTML_ATTR_MARKER_END
, HTML_ATTR_MASKCONTENTUNITS
, HTML_ATTR_LAST
, HTML_ATTR_ORIENTATION
, HTML_ATTR_COLOR_INTERPOLATION
, HTML_ATTR_FORMAT
, HTML_ATTR_COLOR_INTERPOLATION_FILTERS
, HTML_ATTR_FOR
, HTML_ATTR_LAST
, HTML_ATTR_POINTER_EVENTS
, HTML_ATTR_LAST
, HTML_ATTR_ONEND
, HTML_ATTR_Y
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_MULTIPLE
, HTML_ATTR_LAST
, HTML_ATTR_ONERROR
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_MARKER_START
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_AUTOPLAY
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_AZIMUTH
, HTML_ATTR_DISPLAY
, HTML_ATTR_STEP
, HTML_ATTR_HIDDEN
, HTML_ATTR_SPEED
, HTML_ATTR_REPEATDUR
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_TEXT_DECORATION
, HTML_ATTR_LAST
, HTML_ATTR_PRELOAD
, HTML_ATTR_LAST
, HTML_ATTR_OPEN
, HTML_ATTR_LAST
, HTML_ATTR_STROKE_DASHARRAY
, HTML_ATTR_PATTERNUNITS
, HTML_ATTR_FONT_SIZE
, HTML_ATTR_OPERATOR
, HTML_ATTR_ROWS
, HTML_ATTR_DY
, HTML_ATTR_LAST
, HTML_ATTR_DOWNLOAD
, HTML_ATTR_CONTENTSTYLETYPE
, HTML_ATTR_RY
, HTML_ATTR_SPREADMETHOD
, HTML_ATTR_LAST
, HTML_ATTR_USEMAP
, HTML_ATTR_ROWSPAN
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_WIDTHS
, HTML_ATTR_CY
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_NOVALIDATE
, HTML_ATTR_LAST
, HTML_ATTR_STEMV
, HTML_ATTR_SEAMLESS
, HTML_ATTR_SANDBOX
, HTML_ATTR_START
, HTML_ATTR_MARKERUNITS
, HTML_ATTR_K3
, HTML_ATTR_DIVISOR
, HTML_ATTR_MARKER_MID
, HTML_ATTR_STROKE_OPACITY
, HTML_ATTR_K4
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_TABINDEX
, HTML_ATTR_LAST
, HTML_ATTR_SCALE
, HTML_ATTR_BORDER
, HTML_ATTR_LAST
, HTML_ATTR_ONABORT
, HTML_ATTR_REPEATCOUNT
, HTML_ATTR_ONMOUSEDOWN
, HTML_ATTR_PATTERNTRANSFORM
, HTML_ATTR_Y1
, HTML_ATTR_ACCEPT_CHARSET
, HTML_ATTR_LAST
, HTML_ATTR_FORMACTION
, HTML_ATTR_LAST
, HTML_ATTR_FORM
, HTML_ATTR_Y2
, HTML_ATTR_ONSCROLL
, HTML_ATTR_STRING
, HTML_ATTR_LAST
, HTML_ATTR_STROKE_WIDTH
, HTML_ATTR_DEFAULT
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_DEFER
, HTML_ATTR_LAST
, HTML_ATTR_SPAN
, HTML_ATTR_SRCLANG
, HTML_ATTR_YCHANNELSELECTOR
, HTML_ATTR_LAST
, HTML_ATTR_SLOPE
, HTML_ATTR_DRAGGABLE
, HTML_ATTR_SLOT
, HTML_ATTR_ITEMPROP
, HTML_ATTR_LANGUAGE
, HTML_ATTR_SPECULARCONSTANT
, HTML_ATTR_TYPE
, HTML_ATTR_REFY
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_PRESERVEASPECTRATIO
, HTML_ATTR_LAST
, HTML_ATTR_REQUIREDEXTENSIONS
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_FROM
, HTML_ATTR_ONCLICK
, HTML_ATTR_AMPLITUDE
, HTML_ATTR_ALT
, HTML_ATTR_ALIGN
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_MARKERWIDTH
, HTML_ATTR_STEMH
, HTML_ATTR_STARTOFFSET
, HTML_ATTR_KEYTYPE
, HTML_ATTR_REFX
, HTML_ATTR_KEYTIMES
, HTML_ATTR_EXTERNALRESOURCESREQUIRED
, HTML_ATTR_STOP_OPACITY
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_CLIP
, HTML_ATTR_CLIP_RULE
, HTML_ATTR_ONACTIVATE
, HTML_ATTR_LAST
, HTML_ATTR_LOOP
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_ONFOCUSIN
, HTML_ATTR_CHECKED
, HTML_ATTR_LIST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_ACCESSKEY
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_STITCHTILES
, HTML_ATTR_TEXT_ANCHOR
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_REL
, HTML_ATTR_FILTER
, HTML_ATTR_KEYPOINTS
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_X1
, HTML_ATTR_WRITING_MODE
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_X2
, HTML_ATTR_LAST
, HTML_ATTR_UNDERLINE_THICKNESS
, HTML_ATTR_FONT_VARIANT
, HTML_ATTR_LAST
, HTML_ATTR_COLOR_PROFILE
, HTML_ATTR_ENABLE_BACKGROUND
, HTML_ATTR_UNDERLINE_POSITION
, HTML_ATTR_LAST
, HTML_ATTR_X
, HTML_ATTR_LAST
, HTML_ATTR_KIND
, HTML_ATTR_XCHANNELSELECTOR
, HTML_ATTR_FILL_RULE
, HTML_ATTR_CLIPPATHUNITS
, HTML_ATTR_FILTERRES
, HTML_ATTR_WIDTH
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LENGTHADJUST
, HTML_ATTR_KEYSPLINES
, HTML_ATTR_SHAPE
, HTML_ATTR_ONFOCUSOUT
, HTML_ATTR_HEADERS
, HTML_ATTR_EXPONENT
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_BEGIN
, HTML_ATTR_READONLY
, HTML_ATTR_PING
, HTML_ATTR_VIEWTARGET
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_XML_BASE
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_AUTOCOMPLETE
, HTML_ATTR_FONT_STRETCH
, HTML_ATTR_MASK
, HTML_ATTR_LAST
, HTML_ATTR_INTEGRITY
, HTML_ATTR_CHARSET
, HTML_ATTR_LAST
, HTML_ATTR_DX
, HTML_ATTR_LAST
, HTML_ATTR_CAP_HEIGHT
, HTML_ATTR_LAST
, HTML_ATTR_RX
, HTML_ATTR_LAST
, HTML_ATTR_FY
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_G1
, HTML_ATTR_PATHLENGTH
, HTML_ATTR_LAST
, HTML_ATTR_CX
, HTML_ATTR_LIMITINGCONEANGLE
, HTML_ATTR_LAST
, HTML_ATTR_G2
, HTML_ATTR_LAST
, HTML_ATTR_OPACITY
, HTML_ATTR_XML_SPACE
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_K1
, HTML_ATTR_LAST
, HTML_ATTR_OVERFLOW
, HTML_ATTR_LAST
, HTML_ATTR_CHALLENGE
, HTML_ATTR_LAST
, HTML_ATTR_K2
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_HTTP_EQUIV
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_DIFFUSECONSTANT
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_HANGING
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_ARABIC_FORM
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_ALLOWREORDER
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_WRAP
, HTML_ATTR_LAST
, HTML_ATTR_MAX
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_COLOR_RENDERING
, HTML_ATTR_TRANSFORM
, HTML_ATTR_TEXTLENGTH
, HTML_ATTR_OFFSET
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_MINLENGTH
, HTML_ATTR_VISIBILITY
, HTML_ATTR_UNITS_PER_EM
, HTML_ATTR_LAST
, HTML_ATTR_HIGH
, HTML_ATTR_LAST
, HTML_ATTR_TEXT_RENDERING
, HTML_ATTR_BY
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_SYSTEMLANGUAGE
, HTML_ATTR_LAST
, HTML_ATTR_ELEVATION
, HTML_ATTR_LAST
, HTML_ATTR_FILTERUNITS
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_V_IDEOGRAPHIC
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_BASEFREQUENCY
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_HREF
, HTML_ATTR_SPECULAREXPONENT
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_K
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_UNICODE_BIDI
, HTML_ATTR_LAST
, HTML_ATTR_FONT_STYLE
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_CLASS
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LOCAL
, HTML_ATTR_LAST
, HTML_ATTR_MARKERHEIGHT
, HTML_ATTR_FONT_SIZE_ADJUST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_BGCOLOR
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_VIEWBOX
, HTML_ATTR_LAST
, HTML_ATTR_MAXLENGTH
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_IDEOGRAPHIC
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_MATHEMATICAL
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_BUFFERED
, HTML_ATTR_LAST
, HTML_ATTR_LABEL
, HTML_ATTR_SPACING
, HTML_ATTR_X_HEIGHT
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_KERNING
, HTML_ATTR_LAST
, HTML_ATTR_LANG
, HTML_ATTR_LAST
, HTML_ATTR_KERNELUNITLENGTH
, HTML_ATTR_PRESERVEALPHA
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_BIAS
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_KERNELMATRIX
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_FX
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_XLINK_ARCROLE
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_XLINK_ACTUATE
, HTML_ATTR_XLINK_TYPE
, HTML_ATTR_LAST
, HTML_ATTR_OVERLINE_THICKNESS
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_CLIP_PATH
, HTML_ATTR_LAST
, HTML_ATTR_DOMINANT_BASELINE
, HTML_ATTR_HREFLANG
, HTML_ATTR_FILL
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_BASELINE_SHIFT
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_WORD_SPACING
, HTML_ATTR_GRADIENTUNITS
, HTML_ATTR_LAST
, HTML_ATTR_PLACEHOLDER
, HTML_ATTR_LAST
, HTML_ATTR_VERT_ORIGIN_Y
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_ALPHABETIC
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_RENDERING_INTENT
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_OVERLINE_POSITION
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_FILL_OPACITY
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_ALIGNMENT_BASELINE
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LOW
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_VERT_ADV_Y
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_V_ALPHABETIC
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_XML_LANG
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_BASEPROFILE
, HTML_ATTR_FONT_WEIGHT
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LETTER_SPACING
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_V_MATHEMATICAL
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_GRADIENTTRANSFORM
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_VERT_ORIGIN_X
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_FLOOD_OPACITY
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_GLYPH_NAME
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_FONT_FAMILY
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_SHAPE_RENDERING
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_BBOX
, HTML_ATTR_LAST
, HTML_ATTR_GLYPHREF
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_HORIZ_ADV_X
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_IMAGE_RENDERING
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_HORIZ_ORIGIN_X
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_FLOOD_COLOR
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_XLINK_ROLE
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LIGHTING_COLOR
, HTML_ATTR_XLINK_TITLE
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_SPELLCHECK
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_XLINK_SHOW
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_XLINK_HREF
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_GLYPH_ORIENTATION_VERTICAL
, HTML_ATTR_LAST
, HTML_ATTR_GLYPH_ORIENTATION_HORIZONTAL
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_LAST
, HTML_ATTR_V_HANGING
};
html5-parser-0.4.10/src/attr_enum.h 0000644 0001750 0001750 00000017016 13475205503 017340 0 ustar kovid kovid 0000000 0000000 // Do not edit
// Generated by genattrs.py
HTML_ATTR_ACCENT_HEIGHT,
HTML_ATTR_ACCEPT,
HTML_ATTR_ACCEPT_CHARSET,
HTML_ATTR_ACCESSKEY,
HTML_ATTR_ACCUMULATE,
HTML_ATTR_ACTION,
HTML_ATTR_ADDITIVE,
HTML_ATTR_ALIGN,
HTML_ATTR_ALIGNMENT_BASELINE,
HTML_ATTR_ALLOWREORDER,
HTML_ATTR_ALPHABETIC,
HTML_ATTR_ALT,
HTML_ATTR_AMPLITUDE,
HTML_ATTR_ARABIC_FORM,
HTML_ATTR_ASCENT,
HTML_ATTR_ASYNC,
HTML_ATTR_ATTRIBUTENAME,
HTML_ATTR_ATTRIBUTETYPE,
HTML_ATTR_AUTOREVERSE,
HTML_ATTR_AUTOCOMPLETE,
HTML_ATTR_AUTOFOCUS,
HTML_ATTR_AUTOPLAY,
HTML_ATTR_AUTOSAVE,
HTML_ATTR_AZIMUTH,
HTML_ATTR_BASEFREQUENCY,
HTML_ATTR_BASEPROFILE,
HTML_ATTR_BASELINE_SHIFT,
HTML_ATTR_BBOX,
HTML_ATTR_BEGIN,
HTML_ATTR_BGCOLOR,
HTML_ATTR_BIAS,
HTML_ATTR_BORDER,
HTML_ATTR_BUFFERED,
HTML_ATTR_BY,
HTML_ATTR_CALCMODE,
HTML_ATTR_CAP_HEIGHT,
HTML_ATTR_CHALLENGE,
HTML_ATTR_CHARSET,
HTML_ATTR_CHECKED,
HTML_ATTR_CITE,
HTML_ATTR_CLASS,
HTML_ATTR_CLIP,
HTML_ATTR_CLIP_PATH,
HTML_ATTR_CLIP_RULE,
HTML_ATTR_CLIPPATHUNITS,
HTML_ATTR_CODE,
HTML_ATTR_CODEBASE,
HTML_ATTR_COLOR,
HTML_ATTR_COLOR_INTERPOLATION,
HTML_ATTR_COLOR_INTERPOLATION_FILTERS,
HTML_ATTR_COLOR_PROFILE,
HTML_ATTR_COLOR_RENDERING,
HTML_ATTR_COLS,
HTML_ATTR_COLSPAN,
HTML_ATTR_CONTENT,
HTML_ATTR_CONTENTSCRIPTTYPE,
HTML_ATTR_CONTENTSTYLETYPE,
HTML_ATTR_CONTENTEDITABLE,
HTML_ATTR_CONTEXTMENU,
HTML_ATTR_CONTROLS,
HTML_ATTR_COORDS,
HTML_ATTR_CURSOR,
HTML_ATTR_CX,
HTML_ATTR_CY,
HTML_ATTR_D,
HTML_ATTR_DATA,
HTML_ATTR_DATA_REACTID,
HTML_ATTR_DATETIME,
HTML_ATTR_DECELERATE,
HTML_ATTR_DEFAULT,
HTML_ATTR_DEFER,
HTML_ATTR_DESCENT,
HTML_ATTR_DIFFUSECONSTANT,
HTML_ATTR_DIR,
HTML_ATTR_DIRECTION,
HTML_ATTR_DIRNAME,
HTML_ATTR_DISABLED,
HTML_ATTR_DISPLAY,
HTML_ATTR_DIVISOR,
HTML_ATTR_DOMINANT_BASELINE,
HTML_ATTR_DOWNLOAD,
HTML_ATTR_DRAGGABLE,
HTML_ATTR_DROPZONE,
HTML_ATTR_DUR,
HTML_ATTR_DX,
HTML_ATTR_DY,
HTML_ATTR_EDGEMODE,
HTML_ATTR_ELEVATION,
HTML_ATTR_ENABLE_BACKGROUND,
HTML_ATTR_ENCTYPE,
HTML_ATTR_END,
HTML_ATTR_EXPONENT,
HTML_ATTR_EXTERNALRESOURCESREQUIRED,
HTML_ATTR_FILL,
HTML_ATTR_FILL_OPACITY,
HTML_ATTR_FILL_RULE,
HTML_ATTR_FILTER,
HTML_ATTR_FILTERRES,
HTML_ATTR_FILTERUNITS,
HTML_ATTR_FLOOD_COLOR,
HTML_ATTR_FLOOD_OPACITY,
HTML_ATTR_FONT_FAMILY,
HTML_ATTR_FONT_SIZE,
HTML_ATTR_FONT_SIZE_ADJUST,
HTML_ATTR_FONT_STRETCH,
HTML_ATTR_FONT_STYLE,
HTML_ATTR_FONT_VARIANT,
HTML_ATTR_FONT_WEIGHT,
HTML_ATTR_FOR,
HTML_ATTR_FORM,
HTML_ATTR_FORMACTION,
HTML_ATTR_FORMAT,
HTML_ATTR_FR,
HTML_ATTR_FROM,
HTML_ATTR_FX,
HTML_ATTR_FY,
HTML_ATTR_G1,
HTML_ATTR_G2,
HTML_ATTR_GLYPH_NAME,
HTML_ATTR_GLYPH_ORIENTATION_HORIZONTAL,
HTML_ATTR_GLYPH_ORIENTATION_VERTICAL,
HTML_ATTR_GLYPHREF,
HTML_ATTR_GRADIENTTRANSFORM,
HTML_ATTR_GRADIENTUNITS,
HTML_ATTR_HANGING,
HTML_ATTR_HEADERS,
HTML_ATTR_HEIGHT,
HTML_ATTR_HIDDEN,
HTML_ATTR_HIGH,
HTML_ATTR_HORIZ_ADV_X,
HTML_ATTR_HORIZ_ORIGIN_X,
HTML_ATTR_HREF,
HTML_ATTR_HREFLANG,
HTML_ATTR_HTTP_EQUIV,
HTML_ATTR_ICON,
HTML_ATTR_ID,
HTML_ATTR_IDEOGRAPHIC,
HTML_ATTR_IMAGE_RENDERING,
HTML_ATTR_IN,
HTML_ATTR_IN2,
HTML_ATTR_INTEGRITY,
HTML_ATTR_INTERCEPT,
HTML_ATTR_ISMAP,
HTML_ATTR_ITEMPROP,
HTML_ATTR_K,
HTML_ATTR_K1,
HTML_ATTR_K2,
HTML_ATTR_K3,
HTML_ATTR_K4,
HTML_ATTR_KERNELMATRIX,
HTML_ATTR_KERNELUNITLENGTH,
HTML_ATTR_KERNING,
HTML_ATTR_KEYPOINTS,
HTML_ATTR_KEYSPLINES,
HTML_ATTR_KEYTIMES,
HTML_ATTR_KEYTYPE,
HTML_ATTR_KIND,
HTML_ATTR_LABEL,
HTML_ATTR_LANG,
HTML_ATTR_LANGUAGE,
HTML_ATTR_LENGTHADJUST,
HTML_ATTR_LETTER_SPACING,
HTML_ATTR_LIGHTING_COLOR,
HTML_ATTR_LIMITINGCONEANGLE,
HTML_ATTR_LIST,
HTML_ATTR_LOCAL,
HTML_ATTR_LOOP,
HTML_ATTR_LOW,
HTML_ATTR_MANIFEST,
HTML_ATTR_MARKER_END,
HTML_ATTR_MARKER_MID,
HTML_ATTR_MARKER_START,
HTML_ATTR_MARKERHEIGHT,
HTML_ATTR_MARKERUNITS,
HTML_ATTR_MARKERWIDTH,
HTML_ATTR_MASK,
HTML_ATTR_MASKCONTENTUNITS,
HTML_ATTR_MASKUNITS,
HTML_ATTR_MATHEMATICAL,
HTML_ATTR_MAX,
HTML_ATTR_MAXLENGTH,
HTML_ATTR_MEDIA,
HTML_ATTR_METHOD,
HTML_ATTR_MIN,
HTML_ATTR_MINLENGTH,
HTML_ATTR_MODE,
HTML_ATTR_MULTIPLE,
HTML_ATTR_MUTED,
HTML_ATTR_NAME,
HTML_ATTR_NOVALIDATE,
HTML_ATTR_NUMOCTAVES,
HTML_ATTR_OFFSET,
HTML_ATTR_ONABORT,
HTML_ATTR_ONACTIVATE,
HTML_ATTR_ONBEGIN,
HTML_ATTR_ONCLICK,
HTML_ATTR_ONEND,
HTML_ATTR_ONERROR,
HTML_ATTR_ONFOCUSIN,
HTML_ATTR_ONFOCUSOUT,
HTML_ATTR_ONLOAD,
HTML_ATTR_ONMOUSEDOWN,
HTML_ATTR_ONMOUSEMOVE,
HTML_ATTR_ONMOUSEOUT,
HTML_ATTR_ONMOUSEOVER,
HTML_ATTR_ONMOUSEUP,
HTML_ATTR_ONREPEAT,
HTML_ATTR_ONRESIZE,
HTML_ATTR_ONSCROLL,
HTML_ATTR_ONUNLOAD,
HTML_ATTR_OPACITY,
HTML_ATTR_OPEN,
HTML_ATTR_OPERATOR,
HTML_ATTR_OPTIMUM,
HTML_ATTR_ORDER,
HTML_ATTR_ORIENT,
HTML_ATTR_ORIENTATION,
HTML_ATTR_ORIGIN,
HTML_ATTR_OVERFLOW,
HTML_ATTR_OVERLINE_POSITION,
HTML_ATTR_OVERLINE_THICKNESS,
HTML_ATTR_PAINT_ORDER,
HTML_ATTR_PANOSE_1,
HTML_ATTR_PATHLENGTH,
HTML_ATTR_PATTERN,
HTML_ATTR_PATTERNCONTENTUNITS,
HTML_ATTR_PATTERNTRANSFORM,
HTML_ATTR_PATTERNUNITS,
HTML_ATTR_PING,
HTML_ATTR_PLACEHOLDER,
HTML_ATTR_POINTER_EVENTS,
HTML_ATTR_POINTS,
HTML_ATTR_POINTSATX,
HTML_ATTR_POINTSATY,
HTML_ATTR_POINTSATZ,
HTML_ATTR_POSTER,
HTML_ATTR_PRELOAD,
HTML_ATTR_PRESERVEALPHA,
HTML_ATTR_PRESERVEASPECTRATIO,
HTML_ATTR_PRIMITIVEUNITS,
HTML_ATTR_R,
HTML_ATTR_RADIOGROUP,
HTML_ATTR_RADIUS,
HTML_ATTR_READONLY,
HTML_ATTR_REFX,
HTML_ATTR_REFY,
HTML_ATTR_REL,
HTML_ATTR_RENDERING_INTENT,
HTML_ATTR_REPEATCOUNT,
HTML_ATTR_REPEATDUR,
HTML_ATTR_REQUIRED,
HTML_ATTR_REQUIREDEXTENSIONS,
HTML_ATTR_REQUIREDFEATURES,
HTML_ATTR_RESTART,
HTML_ATTR_RESULT,
HTML_ATTR_REVERSED,
HTML_ATTR_ROTATE,
HTML_ATTR_ROWS,
HTML_ATTR_ROWSPAN,
HTML_ATTR_RX,
HTML_ATTR_RY,
HTML_ATTR_SANDBOX,
HTML_ATTR_SCALE,
HTML_ATTR_SCOPE,
HTML_ATTR_SCOPED,
HTML_ATTR_SEAMLESS,
HTML_ATTR_SEED,
HTML_ATTR_SELECTED,
HTML_ATTR_SHAPE,
HTML_ATTR_SHAPE_RENDERING,
HTML_ATTR_SIZE,
HTML_ATTR_SIZES,
HTML_ATTR_SLOPE,
HTML_ATTR_SLOT,
HTML_ATTR_SPACING,
HTML_ATTR_SPAN,
HTML_ATTR_SPECULARCONSTANT,
HTML_ATTR_SPECULAREXPONENT,
HTML_ATTR_SPEED,
HTML_ATTR_SPELLCHECK,
HTML_ATTR_SPREADMETHOD,
HTML_ATTR_SRC,
HTML_ATTR_SRCDOC,
HTML_ATTR_SRCLANG,
HTML_ATTR_SRCSET,
HTML_ATTR_START,
HTML_ATTR_STARTOFFSET,
HTML_ATTR_STDDEVIATION,
HTML_ATTR_STEMH,
HTML_ATTR_STEMV,
HTML_ATTR_STEP,
HTML_ATTR_STITCHTILES,
HTML_ATTR_STOP_COLOR,
HTML_ATTR_STOP_OPACITY,
HTML_ATTR_STRIKETHROUGH_POSITION,
HTML_ATTR_STRIKETHROUGH_THICKNESS,
HTML_ATTR_STRING,
HTML_ATTR_STROKE,
HTML_ATTR_STROKE_DASHARRAY,
HTML_ATTR_STROKE_DASHOFFSET,
HTML_ATTR_STROKE_LINECAP,
HTML_ATTR_STROKE_LINEJOIN,
HTML_ATTR_STROKE_MITERLIMIT,
HTML_ATTR_STROKE_OPACITY,
HTML_ATTR_STROKE_WIDTH,
HTML_ATTR_STYLE,
HTML_ATTR_SUMMARY,
HTML_ATTR_SURFACESCALE,
HTML_ATTR_SYSTEMLANGUAGE,
HTML_ATTR_TABINDEX,
HTML_ATTR_TABLEVALUES,
HTML_ATTR_TARGET,
HTML_ATTR_TARGETX,
HTML_ATTR_TARGETY,
HTML_ATTR_TEXT_ANCHOR,
HTML_ATTR_TEXT_DECORATION,
HTML_ATTR_TEXT_RENDERING,
HTML_ATTR_TEXTLENGTH,
HTML_ATTR_TITLE,
HTML_ATTR_TO,
HTML_ATTR_TRANSFORM,
HTML_ATTR_TYPE,
HTML_ATTR_U1,
HTML_ATTR_U2,
HTML_ATTR_UNDERLINE_POSITION,
HTML_ATTR_UNDERLINE_THICKNESS,
HTML_ATTR_UNICODE,
HTML_ATTR_UNICODE_BIDI,
HTML_ATTR_UNICODE_RANGE,
HTML_ATTR_UNITS_PER_EM,
HTML_ATTR_USEMAP,
HTML_ATTR_V_ALPHABETIC,
HTML_ATTR_V_HANGING,
HTML_ATTR_V_IDEOGRAPHIC,
HTML_ATTR_V_MATHEMATICAL,
HTML_ATTR_VALUE,
HTML_ATTR_VALUES,
HTML_ATTR_VERSION,
HTML_ATTR_VERT_ADV_Y,
HTML_ATTR_VERT_ORIGIN_X,
HTML_ATTR_VERT_ORIGIN_Y,
HTML_ATTR_VIEWBOX,
HTML_ATTR_VIEWTARGET,
HTML_ATTR_VISIBILITY,
HTML_ATTR_WIDTH,
HTML_ATTR_WIDTHS,
HTML_ATTR_WORD_SPACING,
HTML_ATTR_WRAP,
HTML_ATTR_WRITING_MODE,
HTML_ATTR_X,
HTML_ATTR_X_HEIGHT,
HTML_ATTR_X1,
HTML_ATTR_X2,
HTML_ATTR_XCHANNELSELECTOR,
HTML_ATTR_XLINK_ACTUATE,
HTML_ATTR_XLINK_ARCROLE,
HTML_ATTR_XLINK_HREF,
HTML_ATTR_XLINK_ROLE,
HTML_ATTR_XLINK_SHOW,
HTML_ATTR_XLINK_TITLE,
HTML_ATTR_XLINK_TYPE,
HTML_ATTR_XML_BASE,
HTML_ATTR_XML_LANG,
HTML_ATTR_XML_SPACE,
HTML_ATTR_Y,
HTML_ATTR_Y1,
HTML_ATTR_Y2,
HTML_ATTR_YCHANNELSELECTOR,
HTML_ATTR_Z,
HTML_ATTR_ZOOMANDPAN,
html5-parser-0.4.10/src/stack.h 0000644 0001750 0001750 00000003231 13475205503 016441 0 ustar kovid kovid 0000000 0000000 /*
* Copyright (C) 2017 Kovid Goyal
*
* Distributed under terms of the Apache 2.0 license.
*/
// This header is a template: the including file must define the macros Item1,
// Item2, StackItemClass and StackClass before including it.
// Each stack entry pairs one Item1 value (member "gumbo") with one Item2 value (member "xml").
#define STACK_ITEM_CLASS(name) typedef struct { Item1 gumbo; Item2 xml; } name;
STACK_ITEM_CLASS(StackItemClass)
#undef STACK_ITEM_CLASS
// The stack itself: number of items in use, number of allocated slots, and the item array.
#define STACK_CLASS(name) typedef struct { size_t length; size_t capacity; StackItemClass *items; } name;
STACK_CLASS(StackClass)
#undef STACK_CLASS
// FNAME(x) builds the function name <StackClass>_x. The extra EVAL level makes the
// preprocessor expand the StackClass macro before the tokens are pasted together.
#define CONC(a, b) a ## _ ## b
#define EVAL(x, y) CONC(x, y)
#define FNAME(x) EVAL(StackClass, x)
// Allocate an empty stack with room for sz items.
//
// A request for zero items is rounded up to one so that the stack always owns
// a real buffer and a non-zero capacity. Requests whose byte count would
// overflow size_t are rejected.
//
// Returns the new stack, or NULL if memory could not be allocated. The result
// must be released with the matching free function.
static inline StackClass*
FNAME(alloc)(size_t sz) {
    // malloc(0) may legally return NULL, which would be misreported as out of memory
    if (sz == 0) sz = 1;
    // Guard the multiplication below against wrapping around
    if (sz > ((size_t)-1) / sizeof(StackItemClass)) return NULL;
    // calloc zeroes the struct, so length starts at 0
    StackClass *ans = calloc(1, sizeof(StackClass));
    if (ans == NULL) return NULL;
    ans->items = (StackItemClass*)malloc(sizeof(StackItemClass) * sz);
    if (ans->items == NULL) { free(ans); return NULL; }
    ans->capacity = sz;
    return ans;
}
// Release a stack and its item buffer. Passing NULL is a no-op.
static inline void
FNAME(free)(StackClass *s) {
    if (s == NULL) return;
    free(s->items);
    free(s);
}
// Remove the top entry and copy its two members into *g and *x.
// No emptiness check is performed here; the caller must make sure length > 0.
static inline void
FNAME(pop)(StackClass *s, Item1 *g, Item2 *x) {
    s->length -= 1;
    StackItemClass *top = s->items + s->length;
    *g = top->gumbo;
    *x = top->xml;
}
#ifndef SAFE_REALLOC_DEFINED
#define SAFE_REALLOC_DEFINED
// realloc() wrapper that never leaks the original block: when the resize
// fails the original block is freed and NULL is returned.
//
// A size of zero is handled explicitly: the block is freed and NULL is
// returned. Passing zero to realloc() is implementation-defined; on
// implementations that free the block and return NULL, freeing it again
// afterwards would be a double free.
static inline void*
safe_realloc(void *p, size_t sz) {
    if (sz == 0) {
        free(p);
        return NULL;
    }
    void *ans = realloc(p, sz);
    // On failure realloc leaves p untouched, so it is still ours to free
    if (ans == NULL) free(p);
    return ans;
}
#endif
// Push the pair (g, x) onto the stack, growing the item buffer when it is full.
//
// Returns true on success and false if the buffer could not be grown. When
// the reallocation itself fails the old buffer has already been released, so
// the stack is reset to an empty, zero-capacity state; it can still be passed
// safely to the matching free function.
static inline bool
FNAME(push)(StackClass *s, Item1 g, Item2 x) {
    if (s->length >= s->capacity) {
        // Doubling a capacity of zero would stay at zero and the write below
        // would land outside the allocation, so start from a single slot.
        size_t new_capacity = s->capacity ? s->capacity * 2 : 1;
        // Reject growth that wraps around, either in the item count or the byte count
        if (new_capacity <= s->capacity || new_capacity > ((size_t)-1) / sizeof(StackItemClass)) return false;
        s->items = (StackItemClass*)safe_realloc(s->items, new_capacity * sizeof(StackItemClass));
        if (!s->items) {
            s->length = 0;
            s->capacity = 0;
            return false;
        }
        s->capacity = new_capacity;
    }
    StackItemClass *si = &(s->items[(s->length)++]);
    si->gumbo = g; si->xml = x;
    return true;
}
#undef EVAL
#undef CONC
#undef FNAME
html5-parser-0.4.10/src/as-libxml.c 0000644 0001750 0001750 00000036752 13557445205 017243 0 ustar kovid kovid 0000000 0000000 /*
* as-libxml.c
* Copyright (C) 2017 Kovid Goyal
*
* Distributed under terms of the Apache 2.0 license.
*/
#include
#include
#define NEEDS_SANITIZE_NAME 1
#include "as-libxml.h"
#include
#include
// Namespace constants, indexed by GumboNamespaceEnum.
// create_element() indexes this table with elem->tag_namespace to obtain the
// URI for the default namespace declaration it adds to an element.
static const char* kLegalXmlns[] = {
"http://www.w3.org/1999/xhtml",
"http://www.w3.org/2000/svg",
"http://www.w3.org/1998/Math/MathML"
};
// Per-conversion state. An instance lives on the stack of
// convert_gumbo_tree_to_libxml_tree() and is reachable from the helpers
// through doc->_private for the duration of the conversion.
typedef struct {
// Cached xlink: and xml: namespace objects, filled in lazily by ensure_xlink_ns() / ensure_xml_ns()
xmlNsPtr xlink, xml;
// The converted root node; NULL until the first node has been converted
xmlNodePtr root;
// Copied from opts->gumbo_opts.use_xhtml_rules and opts->sanitize_names
bool maybe_xhtml, sanitize_names;
// Error message reported back to the caller, NULL if none was recorded
const char* errmsg;
// standard_tags: cache of dictionary-interned tag names, indexed by GumboTag (see lookup_standard_tag())
// lang_attribute: the dictionary-interned string "lang"
const xmlChar* standard_tags[GUMBO_TAG_LAST], *lang_attribute;
} ParseData;
// Stack {{{
#define Item1 GumboNode*
#define Item2 xmlNodePtr
#define StackItemClass StackItem
#define StackClass Stack
#include "stack.h"
// }}}
// Queue every child of elem for conversion, each paired with the libxml node
// that will become its parent. Children are pushed last-to-first so that they
// are popped, and therefore converted, in document order.
// Returns false if the stack could not grow.
static inline bool
push_children(xmlNodePtr parent, GumboElement *elem, Stack *stack) {
    int idx = elem->children.length;
    while (--idx >= 0) {
        if (!Stack_push(stack, elem->children.data[idx], parent)) return false;
    }
    return true;
}
// Return the xml: namespace for this conversion, looking it up on first use
// and caching it in pd->xml. The lookup starts from the converted root when
// there is one, otherwise from node. May return NULL if the lookup fails.
static inline xmlNsPtr
ensure_xml_ns(xmlDocPtr doc, ParseData *pd, xmlNodePtr node) {
    if (LIKELY(pd->xml != NULL)) return pd->xml;
    // By default libxml2 docs do not have the xml: namespace defined.
    xmlNodePtr search_from = pd->root;
    if (search_from == NULL) search_from = node;
    pd->xml = xmlSearchNs(doc, search_from, BAD_CAST "xml");
    return pd->xml;
}
// Return the xlink: namespace for this conversion, caching it in pd->xlink.
// On first use an existing "xlink" prefix is searched for, starting from the
// converted root when there is one (otherwise from node); if none exists a
// new declaration is created on that same node. May return NULL on failure.
static inline xmlNsPtr
ensure_xlink_ns(xmlDocPtr doc, ParseData *pd, xmlNodePtr node) {
    if (LIKELY(pd->xlink != NULL)) return pd->xlink;
    xmlNodePtr anchor = pd->root ? pd->root : node;
    xmlNsPtr found = xmlSearchNs(doc, anchor, BAD_CAST "xlink");
    if (UNLIKELY(found == NULL)) {
        found = xmlNewNs(anchor, BAD_CAST "http://www.w3.org/1999/xlink", BAD_CAST "xlink");
    }
    pd->xlink = found;
    return found;
}
// Look up a namespace by prefix, first as seen from node and then, if that
// fails and a parent is available, as seen from xml_parent.
// Returns NULL when the prefix is not declared in either place.
static inline xmlNsPtr
find_namespace_by_prefix(xmlDocPtr doc, xmlNodePtr node, xmlNodePtr xml_parent, const char* prefix) {
    xmlNsPtr found = xmlSearchNs(doc, node, BAD_CAST prefix);
    if (found == NULL && xml_parent != NULL) {
        found = xmlSearchNs(doc, xml_parent, BAD_CAST prefix);
    }
    return found;
}
// Sentinel: an attribute whose original_name.data equals REPROCESS.data was
// deferred during the first pass of create_attributes() and must be handled
// in the second (reprocess) pass.
static GumboStringPiece REPROCESS = {"", 0};
// Copy the attributes of a gumbo element onto the libxml node.
//
// doc             - document being built; doc->_private must point to the ParseData
// node            - libxml element receiving the attributes
// elem            - gumbo element supplying the attributes
// xml_parent      - libxml parent of node (may be NULL), used as a fallback for prefix lookups
// reprocess       - false for the first pass; true for the second pass, which only
//                   handles attributes that the first pass marked with REPROCESS
// needs_reprocess - set to true by the first pass when at least one attribute was deferred
//
// Returns false when a namespace, an interned name or a property could not be
// created, true otherwise.
//
// Note that attribute names may be modified in place (a ':' is temporarily or
// permanently overwritten, and sanitize_name() is given a non-const pointer).
static inline bool
create_attributes(xmlDocPtr doc, xmlNodePtr node, GumboElement *elem, xmlNodePtr xml_parent, bool reprocess, bool *needs_reprocess) {
GumboAttribute* attr;
const xmlChar *attr_name;
const char *aname;
char buf[50] = {0};
ParseData *pd = (ParseData*)doc->_private;
xmlNsPtr ns;
// 0: no lang attribute written yet, 1: written from an xml:lang attribute,
// 2: written (or overwritten) from a plain lang attribute
int added_lang = 0;
for (unsigned int i = 0; i < elem->attributes.length; ++i) {
attr = elem->attributes.data[i];
// In the second pass only the attributes deferred by the first pass are handled
if (reprocess && attr->original_name.data != REPROCESS.data) continue;
aname = attr->name;
ns = NULL;
switch (attr->attr_namespace) {
case GUMBO_ATTR_NAMESPACE_XLINK:
ns = ensure_xlink_ns(doc, pd, node);
if (UNLIKELY(!ns)) return false;
break;
case GUMBO_ATTR_NAMESPACE_XML:
ns = ensure_xml_ns(doc, pd, node);
if (UNLIKELY(!ns)) return false;
// In XHTML mode xml:lang is stored as a plain, un-namespaced lang attribute
if (UNLIKELY(pd->maybe_xhtml && strcmp(aname, "lang") == 0)) {
if (!added_lang) {
added_lang = 1;
if (UNLIKELY(!xmlNewNsPropEatName(node, NULL, (xmlChar*)pd->lang_attribute, BAD_CAST attr->value))) return false;
}
continue;
}
break;
case GUMBO_ATTR_NAMESPACE_XMLNS:
if (strncmp(aname, "xlink", 5) == 0) {
if (!ensure_xlink_ns(doc, pd, node)) return false;
// We ignore the value of this attribute since we dont want
// the xlink namespace to be redefined
continue;
} else if (strncmp(aname, "xmlns", 5) == 0) {
// discard since we dont support changing the default
// namespace, namespace are decided by tag names alone.
continue;
}
break;
default:
// Attribute without a gumbo namespace: handle literal "xml:lang" and "xmlns..." names
if (UNLIKELY(pd->maybe_xhtml && strncmp(aname, "xml:lang", 8) == 0)) {
if (!added_lang) {
added_lang = 1;
if (UNLIKELY(!xmlNewNsPropEatName(node, ns, (xmlChar*)pd->lang_attribute, BAD_CAST attr->value))) return false;
}
continue;
} else if (UNLIKELY(strncmp("xmlns", aname, 5) == 0)) {
size_t len = strlen(aname);
if (len == 5) continue; // ignore xmlns
if (aname[5] == ':') {
if (len == 6) continue; //ignore xmlns:
if (pd->maybe_xhtml) {
xmlNewNs(node, BAD_CAST attr->value, BAD_CAST aname + 6);
// We ignore failure to create the namespace as the most likely
// cause is the prefix already exists in this context and xmlNewNs
// does not allow replacing prefixes. We could in theory find the
// existing namespace, but I dont care enough
continue;
} else {
// Outside XHTML mode the declaration is kept as an ordinary attribute
// named xmlns_<prefix>; the name is truncated to fit buf
snprintf(buf, sizeof(buf) - 1, "xmlns_%s", aname + 6);
aname = buf;
}
}
}
break;
}
if (pd->maybe_xhtml) {
// Resolve a "prefix:local" attribute name against the declared namespaces
char *colon = strchr(aname, ':');
if (colon && strlen(colon + 1) > 0) {
// Temporarily terminate the string at the colon so aname is just the prefix
*colon = 0;
ns = find_namespace_by_prefix(doc, node, xml_parent, aname);
*colon = ':';
if (!ns) {
if (!reprocess) {
// Unknown prefix in the first pass: defer, since a later attribute of
// this element may still declare the prefix
attr->original_name.data = REPROCESS.data;
*needs_reprocess = true;
continue;
}
// Still unknown in the second pass: keep the attribute with ':' replaced by '_'
*colon = '_';
} else aname = colon + 1;
}
}
attr_name = xmlDictLookup(doc->dict, BAD_CAST aname, (pd->sanitize_names ? sanitize_name((char*)aname) : strlen(aname))); // we deliberately discard const, for performance
if (UNLIKELY(!attr_name)) return false;
// Interned strings can be compared by pointer: this detects a plain "lang" attribute
if (UNLIKELY(pd->maybe_xhtml && attr_name == pd->lang_attribute)) {
if (added_lang == 2) continue;
added_lang = 2;
// xmlSetNsProp replaces a lang value previously written from xml:lang, if any
xmlSetNsProp(node, NULL, attr_name, BAD_CAST attr->value);
} else {
if (UNLIKELY(!xmlNewNsPropEatName(node, ns, (xmlChar*)attr_name, BAD_CAST attr->value))) return false;
}
}
return true;
}
// Split a "prefix:local" tag name in place.
//
// *tag points at the name and *sz holds its length. When the first *sz bytes
// contain a ':' that is followed by at least one more byte, the colon is
// replaced by a NUL terminator, *tag is advanced to the local part, *sz is
// reduced to the length of the local part, and a pointer to the
// (now NUL-terminated) prefix is returned.
//
// Otherwise nothing is modified and NULL is returned.
static inline char*
check_for_namespace_prefix(char **tag, uint8_t *sz) {
    char *start = *tag;
    char *sep = (char*)memchr(start, ':', *sz);
    if (sep == NULL) return NULL;
    // Number of bytes taken up by the prefix together with the colon
    size_t consumed = (size_t)(sep - start) + 1;
    if (consumed >= *sz) return NULL;
    *sep = '\0';
    *sz = (uint8_t)(*sz - consumed);
    *tag = sep + 1;
    return start;
}
// Return the dictionary-interned name for a known gumbo tag. The result is
// cached in pd->standard_tags so the dictionary is consulted only on first use.
// May return NULL if the dictionary lookup fails.
static inline const xmlChar*
lookup_standard_tag(xmlDocPtr doc, ParseData *pd, GumboTag tag) {
    const xmlChar *cached = pd->standard_tags[tag];
    if (LIKELY(cached != NULL)) return cached;
    uint8_t name_len;
    const char *name = gumbo_normalized_tagname_and_size(tag, &name_len);
    cached = xmlDictLookup(doc->dict, BAD_CAST name, name_len);
    pd->standard_tags[tag] = cached;
    return cached;
}
// Create the libxml element corresponding to a gumbo element.
//
// doc        - document being built; doc->_private must point to the ParseData
// xml_parent - libxml node that will become the parent (NULL for the root)
// parent     - gumbo parent of the element, used to detect a namespace change
// elem       - gumbo element to convert
// opts       - conversion options (line_number_attr, namespace_elements)
//
// The node is created detached; the caller is responsible for linking it into
// the tree. Returns NULL on failure, in which case any partially built node
// has been freed.
static inline xmlNodePtr
create_element(xmlDocPtr doc, xmlNodePtr xml_parent, GumboNode *parent, GumboElement *elem, Options *opts) {
#define ABORT { ok = false; goto end; }
    xmlNodePtr result = NULL;
    bool ok = true;
    const xmlChar *tag_name = NULL;
    const char *tag;
    char buf[MAX_TAG_NAME_SZ] = {0};
    // The line number gets its own buffer: nsprefix (below) points into buf
    // and is still needed after the line number has been formatted.
    char line_buf[32] = {0};
    char *nsprefix = NULL;
    xmlNsPtr namespace = NULL;
    ParseData *pd = (ParseData*)doc->_private;

    if (UNLIKELY(elem->tag >= GUMBO_TAG_UNKNOWN)) {
        // Unknown tag: take the name from the original source text
        gumbo_tag_from_original_text(&(elem->original_tag));
        uint8_t tag_sz = (uint8_t)(MIN(sizeof(buf) - 1, elem->original_tag.length));
        memcpy(buf, elem->original_tag.data, tag_sz);
        tag = buf;
        if (pd->maybe_xhtml) {
            // Splits "prefix:local" inside buf; nsprefix then points at the start of buf
            char *temp = buf;
            nsprefix = check_for_namespace_prefix(&temp, &tag_sz);
            tag = temp;
        }
        tag_sz = (uint8_t)(pd->sanitize_names ? sanitize_name((char*)tag) : strlen(tag));
        tag_name = xmlDictLookup(doc->dict, BAD_CAST tag, tag_sz);
    } else if (UNLIKELY(elem->tag_namespace == GUMBO_NAMESPACE_SVG)) {
        gumbo_tag_from_original_text(&(elem->original_tag));
        tag = gumbo_normalize_svg_tagname(&(elem->original_tag));
        if (tag == NULL) tag_name = lookup_standard_tag(doc, pd, elem->tag);
        else tag_name = xmlDictLookup(doc->dict, BAD_CAST tag, elem->original_tag.length);
    } else tag_name = lookup_standard_tag(doc, pd, elem->tag);
    if (UNLIKELY(!tag_name)) ABORT;

    // Must use xmlNewDocNodeEatName as we are using a dict string and without this
    // if an error occurs and we have to call xmlFreeNode before adding this node to the doc
    // we get a segfault.
    result = xmlNewDocNodeEatName(doc, NULL, (xmlChar*)tag_name, NULL);
    if (UNLIKELY(!result)) ABORT;
    result->line = elem->start_pos.line;
    if (opts->line_number_attr) {
        snprintf(line_buf, sizeof(line_buf), "%u", elem->start_pos.line);
        if (UNLIKELY(!xmlNewNsPropEatName(result, NULL, (xmlChar*)opts->line_number_attr, BAD_CAST line_buf))) ABORT;
    }

    if (opts->namespace_elements) {
        if (UNLIKELY(parent->type == GUMBO_NODE_DOCUMENT || elem->tag_namespace != parent->v.element.tag_namespace)) {
            // Default namespace has changed
            namespace = xmlNewNs(
                result, BAD_CAST kLegalXmlns[elem->tag_namespace], NULL);
            if (UNLIKELY(!namespace)) ABORT;
        }
        xmlSetNs(result, namespace ? namespace :xml_parent->ns);
    }

    // First pass over the attributes; attributes with a not-yet-declared
    // prefix are deferred and handled by a second pass.
    bool needs_reprocess = false;
    if (UNLIKELY(!create_attributes(doc, result, elem, xml_parent, false, &needs_reprocess))) ABORT;
    if (UNLIKELY(needs_reprocess)) {
        if (UNLIKELY(!create_attributes(doc, result, elem, xml_parent, true, &needs_reprocess))) ABORT;
    }

    // Resolve the tag's own prefix last, so that namespaces declared by the
    // element's attributes are visible to the lookup.
    if (UNLIKELY(nsprefix)) {
        namespace = xmlSearchNs(doc, result, BAD_CAST nsprefix);
        if (!namespace && xml_parent) namespace = xmlSearchNs(doc, xml_parent, BAD_CAST nsprefix);
        if (namespace) xmlSetNs(result, namespace);
    }
#undef ABORT
end:
    if (UNLIKELY(!ok)) {
        if(result) xmlFreeNode(result);
        result = NULL;
    }
    return result;
}
// Convert a single gumbo node into a detached libxml node.
//
// *elem is set to the gumbo element when the node is an element or template
// (so the caller can queue its children) and to NULL for every other type.
// Returns NULL on failure; for an unrecognised node type an error message is
// also recorded in the ParseData.
static inline xmlNodePtr
convert_node(xmlDocPtr doc, xmlNodePtr xml_parent, GumboNode* node, GumboElement **elem, Options *opts) {
    *elem = NULL;
    switch (node->type) {
        case GUMBO_NODE_ELEMENT:
        case GUMBO_NODE_TEMPLATE:
            *elem = &node->v.element;
            return create_element(doc, xml_parent, node->parent, *elem, opts);
        case GUMBO_NODE_TEXT:
        case GUMBO_NODE_WHITESPACE:
            return xmlNewText(BAD_CAST node->v.text.text);
        case GUMBO_NODE_COMMENT:
            return xmlNewComment(BAD_CAST node->v.text.text);
        case GUMBO_NODE_CDATA: {
            // TODO: deriving the length from original_text.length would probably
            // be faster than strlen, but that has not been verified to be correct
            // in all cases.
            const char *cdata = node->v.text.text;
            return xmlNewCDataBlock(doc, BAD_CAST cdata, (int)strlen(cdata));
        }
        default:
            break;
    }
    ParseData *pd = (ParseData*)doc->_private;
    pd->errmsg = ERRMSG("unknown gumbo node type");
    return NULL;
}
// Allocate an empty libxml document that owns a string dictionary and is
// marked as UTF-8 encoded.
//
// As a side effect opts->line_number_attr, when set, is replaced by its
// interned copy in the document dictionary, so it can later be used as a
// dictionary-owned attribute name.
//
// Returns NULL if the document, its dictionary or the interned attribute name
// could not be allocated; nothing is leaked in that case.
static inline xmlDocPtr
alloc_doc(Options *opts) {
    xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0");
    if (doc == NULL) return NULL;
    if (!doc->dict) {
        doc->dict = xmlDictCreate();
        if (doc->dict == NULL) {
            // Bail out immediately: doc must not be touched after being freed
            xmlFreeDoc(doc);
            return NULL;
        }
        if (opts->line_number_attr) {
            opts->line_number_attr = xmlDictLookup(doc->dict, BAD_CAST opts->line_number_attr, -1);
            if (opts->line_number_attr == NULL) {
                // xmlFreeDoc also releases the dictionary attached to the document
                xmlFreeDoc(doc);
                return NULL;
            }
        }
    }
    doc->encoding = xmlStrdup(BAD_CAST "UTF-8");
    return doc;
}
// Copy the comments that are direct children of the gumbo document (that is,
// outside the root element) into the libxml tree as siblings of pd->root.
// Comments found before the root in the document's child list are inserted
// with xmlAddPrevSibling(), those found after it with xmlAddSibling().
//
// Returns false, with pd->errmsg set, if a comment could not be created or
// attached.
static inline bool
add_root_comments(ParseData *pd, GumboDocument *document, GumboNode *root) {
    GumboVector *top_level = &(document->children);
    bool seen_root = false;
    for (unsigned int idx = 0; idx < top_level->length; idx++) {
        GumboNode *candidate = (GumboNode*)top_level->data[idx];
        if (candidate == root) {
            seen_root = true;
            continue;
        }
        if (candidate->type != GUMBO_NODE_COMMENT) continue;

        xmlNodePtr comment = xmlNewComment(BAD_CAST candidate->v.text.text);
        if (UNLIKELY(comment == NULL)) {
            pd->errmsg = ERRMSG("Out of memory allocating comment");
            return false;
        }
        xmlNodePtr attached;
        if (seen_root) attached = xmlAddSibling(pd->root, comment);
        else attached = xmlAddPrevSibling(pd->root, comment);
        if (UNLIKELY(attached == NULL)) {
            pd->errmsg = ERRMSG("Failed to add sibling to root node");
            xmlFreeNode(comment);
            return false;
        }
    }
    return true;
}
// Convert a complete gumbo parse result into a new libxml document.
//
// output - gumbo parse result to convert
// opts   - conversion options; opts->line_number_attr may be replaced by an
//          interned copy (see alloc_doc())
// errmsg - receives a static error message, or NULL when no message was
//          recorded (for example on plain allocation failure)
//
// The tree is walked iteratively using an explicit stack of
// (gumbo node, libxml parent) pairs, so deeply nested documents do not
// consume C stack. Returns the new document, or NULL on failure.
libxml_doc*
convert_gumbo_tree_to_libxml_tree(GumboOutput *output, Options *opts, char **errmsg) {
#define ABORT { ok = false; goto end; }
    xmlDocPtr doc = NULL;
    xmlNodePtr parent = NULL, child = NULL;
    GumboNode *gumbo = NULL, *root = output->root;
    ParseData parse_data = {0};
    GumboElement *elem;
    bool ok = true;
    *errmsg = NULL;
    Stack *stack = Stack_alloc(opts->stack_size);
    if (stack == NULL) return NULL;
    // The root has no libxml parent. The push can fail if the stack cannot grow.
    if (!Stack_push(stack, root, NULL)) ABORT;
    doc = alloc_doc(opts);
    if (doc == NULL) ABORT;
    GumboDocument* document = &(output->document->v.document);
    if (opts->keep_doctype && output->document->v.document.has_doctype) {
        if(!xmlCreateIntSubset(doc, BAD_CAST document->name, BAD_CAST document->public_identifier, BAD_CAST document->system_identifier)) ABORT;
    }
    parse_data.maybe_xhtml = opts->gumbo_opts.use_xhtml_rules;
    parse_data.sanitize_names = opts->sanitize_names;
    // Make the per-conversion state reachable from the helper functions
    doc->_private = (void*)&parse_data;
    parse_data.lang_attribute = xmlDictLookup(doc->dict, BAD_CAST "lang", 4);
    if (!parse_data.lang_attribute) ABORT;

    while(stack->length > 0) {
        Stack_pop(stack, &gumbo, &parent);
        child = convert_node(doc, parent, gumbo, &elem, opts);
        if (UNLIKELY(!child)) ABORT;
        if (LIKELY(parent)) {
            if (UNLIKELY(!xmlAddChild(parent, child))) ABORT;
        } else parse_data.root = child;
        if (elem != NULL) {
            if (!push_children(child, elem, stack)) ABORT;
        }
    }
    if (parse_data.maybe_xhtml) {
        // Add xml:lang to the root element if it has lang
        xmlChar *root_lang = xmlGetNsProp(parse_data.root, parse_data.lang_attribute, NULL);
        if (root_lang) {
            ensure_xml_ns(doc, &parse_data, parse_data.root);
            if (parse_data.xml) xmlNewNsPropEatName(parse_data.root, parse_data.xml, (xmlChar*)parse_data.lang_attribute, root_lang);
            xmlFree(root_lang);
        }
    }
    xmlDocSetRootElement(doc, parse_data.root);
    // Add any comments that are outside the root element
    if (!add_root_comments(&parse_data, document, root)) ABORT;
#undef ABORT
end:
    // parse_data lives on this function's stack, so do not leave a pointer to it behind
    if (doc) doc->_private = NULL;
    Stack_free(stack);
    *errmsg = (char*)parse_data.errmsg;
    if (!ok) {
        if (parse_data.root) {
            // The root may already have been attached to the document. Detach it
            // first (a no-op for an unattached node) so that xmlFreeDoc below does
            // not free the same subtree a second time.
            xmlUnlinkNode(parse_data.root);
            xmlFreeNode(parse_data.root);
        }
        if (doc) xmlFreeDoc(doc);
        doc = NULL;
    }
    return doc;
}
// Return a copy of doc made with xmlCopyDoc(), or NULL if the copy failed.
libxml_doc*
copy_libxml_doc(libxml_doc* doc) {
    // The second argument (1) asks libxml for a recursive copy
    libxml_doc *duplicate = xmlCopyDoc(doc, 1);
    return duplicate;
}
// Release a document by handing it to xmlFreeDoc().
// NOTE(review): the declared return type is libxml_doc and nothing is returned;
// presumably libxml_doc is a typedef of void (see as-libxml.h) - confirm.
libxml_doc
free_libxml_doc(libxml_doc* doc) { xmlFreeDoc(doc); }
// Return the version of the libxml2 library in use, obtained by converting
// the xmlParserVersion string to an integer.
int
get_libxml_version(void) {
    const char *version_text = xmlParserVersion;
    return atoi(version_text);
}
html5-parser-0.4.10/src/html5_parser.egg-info/ 0000755 0001750 0001750 00000000000 14122552315 021257 5 ustar kovid kovid 0000000 0000000 html5-parser-0.4.10/src/html5_parser.egg-info/top_level.txt 0000644 0001750 0001750 00000000015 14122552315 024005 0 ustar kovid kovid 0000000 0000000 html5_parser
html5-parser-0.4.10/src/html5_parser.egg-info/SOURCES.txt 0000644 0001750 0001750 00000002602 14122552315 023143 0 ustar kovid kovid 0000000 0000000 LICENSE
MANIFEST.in
README.rst
build.py
genattrs.py
genencodings.py
gentags.py
run_tests.py
setup.cfg
setup.py
gumbo/attribute.c
gumbo/attribute.h
gumbo/char_ref.c
gumbo/char_ref.h
gumbo/char_ref.rl
gumbo/error.c
gumbo/error.h
gumbo/gumbo.h
gumbo/gumbo_edit.c
gumbo/gumbo_edit.h
gumbo/insertion_mode.h
gumbo/parser.c
gumbo/parser.h
gumbo/replacement.h
gumbo/string_buffer.c
gumbo/string_buffer.h
gumbo/string_piece.c
gumbo/string_piece.h
gumbo/svg_attrs.c
gumbo/svg_tags.c
gumbo/tag.c
gumbo/tag_enum.h
gumbo/tag_perf.h
gumbo/tag_sizes.h
gumbo/tag_strings.h
gumbo/token_type.h
gumbo/tokenizer.c
gumbo/tokenizer.h
gumbo/tokenizer_states.h
gumbo/utf8.c
gumbo/utf8.h
gumbo/util.c
gumbo/util.h
gumbo/vector.c
gumbo/vector.h
src/as-libxml.c
src/as-libxml.h
src/as-python-tree.c
src/as-python-tree.h
src/attr_enum.h
src/attr_perf.h
src/attr_sizes.h
src/attr_strings.h
src/data-types.h
src/python-wrapper.c
src/stack.h
src/html5_parser/__init__.py
src/html5_parser/dom.py
src/html5_parser/encoding_names.py
src/html5_parser/encoding_parser.py
src/html5_parser/soup.py
src/html5_parser/stdlib_etree.py
src/html5_parser.egg-info/PKG-INFO
src/html5_parser.egg-info/SOURCES.txt
src/html5_parser.egg-info/dependency_links.txt
src/html5_parser.egg-info/requires.txt
src/html5_parser.egg-info/top_level.txt
test/__init__.py
test/adapt.py
test/basic.py
test/html5lib_adapter.py
test/malformed.py
test/namespace.py
test/soup.py html5-parser-0.4.10/src/html5_parser.egg-info/requires.txt 0000644 0001750 0001750 00000000053 14122552315 023655 0 ustar kovid kovid 0000000 0000000 chardet
lxml>=3.8.0
[soup]
beautifulsoup4
html5-parser-0.4.10/src/html5_parser.egg-info/PKG-INFO 0000644 0001750 0001750 00000001536 14122552315 022361 0 ustar kovid kovid 0000000 0000000 Metadata-Version: 2.1
Name: html5-parser
Version: 0.4.10
Summary: Fast C based HTML 5 parsing for python
Home-page: https://html5-parser.readthedocs.io
Author: Kovid Goyal
Author-email: redacted@acme.com
License: Apache 2.0
Download-URL: https://pypi.python.org/packages/source/m/html5-parser/html5-parser-0.4.10.tar.gz
Description: UNKNOWN
Platform: any
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Natural Language :: English
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Topic :: Text Processing
Classifier: Topic :: Text Processing :: Markup
Classifier: Topic :: Text Processing :: Markup :: HTML
Classifier: Topic :: Text Processing :: Markup :: XML
Provides-Extra: soup
html5-parser-0.4.10/src/html5_parser.egg-info/dependency_links.txt 0000644 0001750 0001750 00000000001 14122552315 025325 0 ustar kovid kovid 0000000 0000000
html5-parser-0.4.10/src/python-wrapper.c 0000644 0001750 0001750 00000017560 14122551647 020343 0 ustar kovid kovid 0000000 0000000 /*
* python-wrapper.c
* Copyright (C) 2017 Kovid Goyal
*
* Distributed under terms of the Apache 2.0 license.
*/
#define PY_SSIZE_T_CLEAN
#include
#include "../gumbo/gumbo.h"
#include "as-libxml.h"
#include "as-python-tree.h"
#define MAJOR 0
#define MINOR 4
#define PATCH 10
// Name given to every capsule created by encapsulate().
static char *NAME = "libxml2:xmlDoc";
// Context stored on the capsule by encapsulate(). free_encapsulated_doc() only
// frees the document while the capsule context is still this exact pointer.
static char *DESTRUCTOR = "destructor:xmlFreeDoc";
// Convert a gumbo parse result into a libxml document with the GIL released.
// On failure a Python exception is set (carrying the converter's message when
// it provided one, MemoryError otherwise) and NULL is returned.
static inline libxml_doc*
convert_tree(GumboOutput *output, Options *opts) {
    char *failure_reason = NULL;
    libxml_doc *converted = NULL;

    Py_BEGIN_ALLOW_THREADS;
    converted = convert_gumbo_tree_to_libxml_tree(output, opts, &failure_reason);
    Py_END_ALLOW_THREADS;

    if (converted != NULL) return converted;
    if (failure_reason == NULL) PyErr_NoMemory();
    else PyErr_SetString(PyExc_Exception, failure_reason);
    return NULL;
}
// Parse a buffer with gumbo (GIL released) and convert the result to a libxml
// document. The gumbo output is always destroyed before returning.
// Returns NULL with a Python exception set on failure.
static libxml_doc*
parse_with_options(const char* buffer, size_t buffer_length, Options *opts, const GumboTag context, GumboNamespaceEnum context_namespace) {
    GumboOutput *output = NULL;

    Py_BEGIN_ALLOW_THREADS;
    output = gumbo_parse_fragment(&(opts->gumbo_opts), buffer, buffer_length, context, context_namespace);
    Py_END_ALLOW_THREADS;

    if (output == NULL) {
        PyErr_NoMemory();
        return NULL;
    }
    libxml_doc *doc = convert_tree(output, opts);
    gumbo_destroy_output(output);
    return doc;
}
// Capsule destructor. The wrapped document is freed only while the capsule
// context is still the DESTRUCTOR marker set by encapsulate(); with any other
// context the document is left alone.
static void
free_encapsulated_doc(PyObject *capsule) {
    libxml_doc *doc = (libxml_doc*)PyCapsule_GetPointer(capsule, NAME);
    if (doc == NULL) return;
    char *ctx = PyCapsule_GetContext(capsule);
    if (ctx != DESTRUCTOR) return;
    free_libxml_doc(doc);
}
// Wrap a libxml document in a capsule named NAME whose context is DESTRUCTOR.
//
// Ownership of doc passes to this function: on success the capsule owns the
// document, on failure the document is freed here. Returns a new reference,
// or NULL with a Python exception set.
static inline PyObject*
encapsulate(libxml_doc* doc) {
    PyObject *ans = PyCapsule_New(doc, NAME, free_encapsulated_doc);
    if (ans == NULL) { free_libxml_doc(doc); return NULL; }
    if (PyCapsule_SetContext(ans, DESTRUCTOR) != 0) {
        // The capsule destructor only frees the document when the context is
        // DESTRUCTOR, which was not set, so dropping the capsule is not enough:
        // the document has to be freed explicitly.
        Py_DECREF(ans);
        free_libxml_doc(doc);
        return NULL;
    }
    return ans;
}
// parse(data, namespace_elements=False, keep_doctype=True, maybe_xhtml=False,
//       line_number_attr=None, sanitize_names=True, stack_size=16*1024,
//       fragment_context=None, fragment_namespace=GUMBO_NAMESPACE_HTML)
//
// Parse the given UTF-8 data and return a capsule wrapping the resulting
// libxml document. Returns NULL with a Python exception set on failure:
// KeyError for an unknown fragment_context tag or a non-HTML
// fragment_namespace, or whatever exception the parsing/conversion raised.
static PyObject *
parse(PyObject UNUSED *self, PyObject *args, PyObject *kwds) {
    libxml_doc *doc = NULL;
    const char *buffer = NULL;
    Py_ssize_t sz = 0;
    Options opts = {0};
    opts.stack_size = 16 * 1024;
    PyObject *kd = Py_True, *mx = Py_False, *ne = Py_False, *sn = Py_True;
    char *fragment_context = NULL; Py_ssize_t fragment_context_sz = 0;
    opts.gumbo_opts = kGumboDefaultOptions;
    opts.gumbo_opts.max_errors = 0;  // We discard errors since we are not reporting them anyway
    GumboNamespaceEnum fragment_namespace = GUMBO_NAMESPACE_HTML;

    static char *kwlist[] = {"data", "namespace_elements", "keep_doctype", "maybe_xhtml", "line_number_attr", "sanitize_names", "stack_size", "fragment_context", "fragment_namespace", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, kwds, "s#|OOOzOIz#i", kwlist, &buffer, &sz, &ne, &kd, &mx, &(opts.line_number_attr), &sn, &(opts.stack_size), &fragment_context, &fragment_context_sz, &fragment_namespace)) return NULL;

    // PyObject_IsTrue returns -1 with an exception set when truth testing
    // fails; propagate that instead of silently treating it as true.
    int flag;
    if ((flag = PyObject_IsTrue(ne)) < 0) return NULL;
    opts.namespace_elements = flag;
    if ((flag = PyObject_IsTrue(kd)) < 0) return NULL;
    opts.keep_doctype = flag;
    if ((flag = PyObject_IsTrue(sn)) < 0) return NULL;
    opts.sanitize_names = flag;
    if ((flag = PyObject_IsTrue(mx)) < 0) return NULL;
    opts.gumbo_opts.use_xhtml_rules = flag;

    // GUMBO_TAG_LAST is what gets passed on when no fragment context was requested
    GumboTag context = GUMBO_TAG_LAST;
    if (fragment_context && fragment_context_sz > 0) {
        context = gumbo_tagn_enum(fragment_context, fragment_context_sz);
        if (context == GUMBO_TAG_UNKNOWN) {
            PyErr_Format(PyExc_KeyError, "Unknown fragment_context tag name: %s", fragment_context);
            return NULL;
        }
    }
    if (fragment_namespace != GUMBO_NAMESPACE_HTML) {
        // causes infinite loops in gumbo, enable the non html fragment context tests
        // in html5lib_adapter.py to trigger
        PyErr_SetString(PyExc_KeyError, "Fragment parsing with non-HTML namespaces is not supported");
        return NULL;
    }

    doc = parse_with_options(buffer, (size_t)sz, &opts, context, fragment_namespace);
    if (!doc) return NULL;
    return encapsulate(doc);
}
// parse_and_build(data, new_tag, new_comment, new_string, append, new_doctype, stack_size=16*1024)
//
// Parse the given UTF-8 data with gumbo and build a tree by calling the
// supplied Python callables. When new_doctype is not None and the document
// has a doctype, new_doctype(name, public_identifier, system_identifier) is
// called first and its result discarded.
//
// Returns whatever as_python_tree() returns, or NULL with a Python exception
// set on failure.
static PyObject *
parse_and_build(PyObject UNUSED *self, PyObject *args) {
    const char *buffer = NULL;
    Py_ssize_t sz = 0;
    GumboOutput *output = NULL;
    PyObject *new_tag, *new_comment, *ans, *new_doctype, *append, *new_string, *ret;
    Options opts = {0};
    opts.stack_size = 16 * 1024;
    opts.gumbo_opts = kGumboDefaultOptions;
    opts.gumbo_opts.max_errors = 0;  // We discard errors since we are not reporting them anyway

    if (!PyArg_ParseTuple(args, "s#OOOOO|I", &buffer, &sz, &new_tag, &new_comment, &new_string, &append, &new_doctype, &(opts.stack_size))) return NULL;

    Py_BEGIN_ALLOW_THREADS;
    output = gumbo_parse_with_options(&(opts.gumbo_opts), buffer, (size_t)sz);
    Py_END_ALLOW_THREADS;
    // Stop here on failure: output is dereferenced immediately below
    if (output == NULL) return PyErr_NoMemory();

    GumboDocument* document = &(output->document->v.document);
    if (new_doctype != Py_None && document->has_doctype) {
        ret = PyObject_CallFunction(new_doctype, "sss", document->name, document->public_identifier, document->system_identifier);
        if (ret == NULL) { gumbo_destroy_output(output); return NULL; }
        Py_CLEAR(ret);
    }
    ans = as_python_tree(output, &opts, new_tag, new_comment, new_string, append);
    gumbo_destroy_output(output);
    return ans;
}
// clone_doc(capsule)
//
// Copy the document held by a capsule and return the copy wrapped in a new
// capsule. Raises TypeError when the argument is not a capsule and
// MemoryError when the copy fails.
static PyObject *
clone_doc(PyObject UNUSED *self, PyObject *capsule) {
    if (!PyCapsule_CheckExact(capsule)) {
        PyErr_SetString(PyExc_TypeError, "Must specify a capsule as the argument");
        return NULL;
    }
    // The capsule's own name is used for the lookup, so the name is not validated here
    const char *capsule_name = PyCapsule_GetName(capsule);
    libxml_doc *source = PyCapsule_GetPointer(capsule, capsule_name);
    if (source == NULL) return NULL;

    libxml_doc *duplicate = copy_libxml_doc(source);
    if (duplicate == NULL) return PyErr_NoMemory();
    return encapsulate(duplicate);
}
// Method table for the extension module, terminated by an all-NULL sentinel entry.
static PyMethodDef
methods[] = {
// parse() accepts keyword arguments, so its function pointer is cast via PyCFunctionWithKeywords
{"parse", (PyCFunction)(void(*)(void))(PyCFunctionWithKeywords)(parse), METH_VARARGS | METH_KEYWORDS,
"parse()\n\nParse specified bytestring which must be in the UTF-8 encoding."
},
{"parse_and_build", (PyCFunction)parse_and_build, METH_VARARGS,
"parse_and_build()\n\nParse specified bytestring which must be in the UTF-8 encoding and build a tree using the specified functions."
},
// METH_O: clone_doc() receives its single argument directly
{"clone_doc", clone_doc, METH_O,
"clone_doc()\n\nClone the specified document. Which must be a document returned by the parse() function."
},
{NULL, NULL, 0, NULL}
};
#define MODULE_NAME "html_parser"
#define MODULE_DOC "HTML parser in C for speed."

// Module initialisation, written to compile under both Python 2 and Python 3.
// It registers the version numbers, the gumbo namespace constants, the libxml
// version and the KNOWN_TAG_NAMES / KNOWN_ATTR_NAMES tuples on the module.
// On failure it returns via INITERROR with whatever exception is pending.
#if PY_MAJOR_VERSION >= 3
static struct PyModuleDef
moduledef = {
    PyModuleDef_HEAD_INIT,
    MODULE_NAME,
    MODULE_DOC,
    0,
    methods,
    NULL,
    NULL,
    NULL,
    NULL
};
#define INITERROR return NULL
EXPORTED PyMODINIT_FUNC
PyInit_html_parser(void) {
#else
#define INITERROR return
EXPORTED PyMODINIT_FUNC
inithtml_parser(void) {
#endif
    PyObject *m, *known_tag_names, *known_attr_names;
#if PY_MAJOR_VERSION >= 3
    m = PyModule_Create(&moduledef);
#else
    m = Py_InitModule3(MODULE_NAME, methods, MODULE_DOC);
#endif
    if (m == NULL) INITERROR;
    if (PyModule_AddIntMacro(m, MAJOR) != 0) INITERROR;
    if (PyModule_AddIntMacro(m, MINOR) != 0) INITERROR;
    if (PyModule_AddIntMacro(m, PATCH) != 0) INITERROR;
    if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_HTML) != 0) INITERROR;
    if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_SVG) != 0) INITERROR;
    if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_MATHML) != 0) INITERROR;
    if (PyModule_AddIntConstant(m, "LIBXML_VERSION", get_libxml_version()) != 0) INITERROR;

    // PyModule_AddObject steals the reference only on success, so the tuple is
    // released here only when the call fails.
    known_tag_names = PyTuple_New(GUMBO_TAG_UNKNOWN);
    if (known_tag_names == NULL) INITERROR;
    if (PyModule_AddObject(m, "KNOWN_TAG_NAMES", known_tag_names) != 0) { Py_CLEAR(known_tag_names); INITERROR; }
    known_attr_names = PyTuple_New(HTML_ATTR_LAST);
    if (known_attr_names == NULL) INITERROR;
    if (PyModule_AddObject(m, "KNOWN_ATTR_NAMES", known_attr_names) != 0) { Py_CLEAR(known_attr_names); INITERROR; }

    // Both tuples are now owned by the module; the pointers held here are
    // borrowed and must not be released, even if filling them in fails.
    if (!set_known_tag_names(known_tag_names, known_attr_names)) INITERROR;
#if PY_MAJOR_VERSION >= 3
    return m;
#endif
}
html5-parser-0.4.10/src/html5_parser/ 0000755 0001750 0001750 00000000000 14122552315 017565 5 ustar kovid kovid 0000000 0000000 html5-parser-0.4.10/src/html5_parser/soup.py 0000644 0001750 0001750 00000010170 13516232523 021126 0 ustar kovid kovid 0000000 0000000 #!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal
from __future__ import absolute_import, division, print_function, unicode_literals
# The text type: with unicode_literals in effect '' is a unicode string on
# both Python 2 and Python 3.
unicode = type('')
# Lazily filled by init_bs4_cdata_list_attributes(): maps a tag name to the
# frozenset of attribute names that map_list_attributes() splits into lists.
cdata_list_attributes = None
# Lazily filled by init_bs4_cdata_list_attributes(): the '*' entry of the
# table above, consulted for every tag.
universal_cdata_list_attributes = None
# Shared empty default used when a tag has no entry in cdata_list_attributes.
empty = ()
def init_bs4_cdata_list_attributes():
    """Populate the module-level tables of list-valued attributes from bs4.

    Reads ``HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`` and falls back to
    ``HTMLTreeBuilder.cdata_list_attributes`` when the former does not exist.
    Every value is frozen into a ``frozenset``; the ``'*'`` entry is stored
    separately as ``universal_cdata_list_attributes``.
    """
    global cdata_list_attributes, universal_cdata_list_attributes
    from bs4.builder import HTMLTreeBuilder

    try:
        source = HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES
    except AttributeError:
        source = HTMLTreeBuilder.cdata_list_attributes

    frozen = {}
    for tag_name, attr_names in source.items():
        frozen[tag_name] = frozenset(attr_names)

    cdata_list_attributes = frozen
    universal_cdata_list_attributes = cdata_list_attributes['*']
def map_list_attributes(tag_name, name, val):
    """Return *val* split on whitespace if *name* is a list-valued attribute.

    An attribute is list-valued when it is in the universal table or in the
    table entry for *tag_name*; any other value is returned unchanged.
    """
    is_list_valued = (
        name in universal_cdata_list_attributes or
        name in cdata_list_attributes.get(tag_name, empty)
    )
    return val.split() if is_list_valued else val
def soup_module():
    """Return the BeautifulSoup module in use, importing it on first call.

    ``bs4`` is preferred; if it cannot be imported the legacy
    ``BeautifulSoup`` (version 3) module is imported instead. The result is
    cached on the function attribute ``soup_module.ans``.
    """
    cached = soup_module.ans
    if cached is not None:
        return cached
    try:
        import bs4 as module
    except ImportError:
        import BeautifulSoup as module
    soup_module.ans = module
    return soup_module.ans


soup_module.ans = None
def set_soup_module(val):
    """Override the module returned by :func:`soup_module` with *val*."""
    setattr(soup_module, 'ans', val)
def bs4_fast_append(self, new_child):
    """Append *new_child* as the last child of *self* (BeautifulSoup 4 names).

    Wires up ``parent``, the sibling links and the document-order
    ``previous_element`` / ``next_element`` links by hand, then adds the
    node to ``self.contents``.
    """
    new_child.parent = self
    children = self.contents
    if children:
        last_sibling = children[-1]
        new_child.previous_sibling = last_sibling
        last_sibling.next_sibling = new_child
        # In document order the new node follows the deepest last descendant
        # of its previous sibling.
        new_child.previous_element = last_sibling._last_descendant(False)
    else:
        new_child.previous_sibling = None
        new_child.previous_element = self
    predecessor = new_child.previous_element
    predecessor.next_element = new_child
    # The new node is now the last one, both among siblings and in the document.
    new_child.next_sibling = new_child.next_element = None
    children.append(new_child)
def bs4_new_tag(Tag, soup):
    """Return a ``new_tag(name, attrs)`` factory building bs4 tags for *soup*.

    The factory passes every attribute value through
    :func:`map_list_attributes` before constructing the tag.
    """
    tree_builder = soup.builder

    def new_tag(name, attrs):
        mapped = {}
        for attr_name, attr_value in attrs.items():
            mapped[attr_name] = map_list_attributes(name, attr_name, attr_value)
        return Tag(soup, name=name, attrs=mapped, builder=tree_builder)

    return new_tag
def bs3_fast_append(self, newChild):
    """Append *newChild* as the last child of *self* (BeautifulSoup 3 names).

    Wires up ``parent``, the sibling links (``previousSibling`` /
    ``nextSibling``) and the document-order links (``previous`` / ``next``)
    by hand, then adds the node to ``self.contents``.
    """
    newChild.parent = self
    if self.contents:
        previousChild = self.contents[-1]
        newChild.previousSibling = previousChild
        previousChild.nextSibling = newChild
        # In document order the new node follows the deepest last descendant
        # of its previous sibling.
        newChild.previous = previousChild._lastRecursiveChild()
    else:
        newChild.previousSibling = None
        newChild.previous = self
    newChild.previous.next = newChild
    # The new node is the last one in the document, so its forward pointers
    # must be reset. In BeautifulSoup 3 the document-order pointer is `next`,
    # the counterpart of the `previous` used above; the original code reset
    # only the BS4-style `next_element`, leaving `next` uninitialised.
    # `next_element` is still set so the attribute it created before remains.
    newChild.nextSibling = newChild.next = newChild.next_element = None
    self.contents.append(newChild)
def bs3_new_tag(Tag, soup):
    """Return a ``new_tag(name, attrs)`` factory building BeautifulSoup 3 tags.

    The created tag gets ``attrs`` set to the items of the attribute mapping
    and ``attrMap`` set to the mapping itself.
    """
    def new_tag(name, attrs):
        tag = Tag(soup, name)
        tag.attrs, tag.attrMap = attrs.items(), attrs
        return tag

    return new_tag
# Tag names treated as self-closing by the BeautifulSoup 3 setup in init_soup().
VOID_ELEMENTS = frozenset(
'area base br col embed hr img input keygen link menuitem meta param source track wbr'.split())
def is_bs3():
    """Return True if the active soup module reports a 3.x ``__version__``."""
    version = soup_module().__version__
    return version.startswith('3.')
def init_soup():
    """Create an empty soup plus the callbacks needed to build a tree in it.

    Returns the tuple ``(module, soup, new_tag, Comment, append,
    NavigableString)`` using the BeautifulSoup 3 or BeautifulSoup 4 flavour
    of each helper, depending on which module is active.
    """
    bs = soup_module()
    if not is_bs3():
        soup = bs.BeautifulSoup('', 'lxml')
        new_tag = bs4_new_tag(bs.Tag, soup)
        append = bs4_fast_append
        # The bs4 attribute tables are loaded once, on first use.
        if universal_cdata_list_attributes is None:
            init_bs4_cdata_list_attributes()
    else:
        soup = bs.BeautifulSoup()
        new_tag = bs3_new_tag(bs.Tag, soup)
        append = bs3_fast_append
        soup.isSelfClosing = lambda self, name: name in VOID_ELEMENTS
    return bs, soup, new_tag, bs.Comment, append, bs.NavigableString
def parse(utf8_data, stack_size=16 * 1024, keep_doctype=False, return_root=True):
    """Parse *utf8_data* into a BeautifulSoup tree.

    :param utf8_data: The document; text is encoded to UTF-8 before parsing.
    :param stack_size: Passed through to ``html_parser.parse_and_build``.
    :param keep_doctype: If true, and the soup module provides ``Doctype``,
        the doctype is appended to the soup.
    :param return_root: If true return the root tag, otherwise the soup object.
    """
    from . import html_parser
    bs, soup, new_tag, Comment, append, NavigableString = init_soup()
    if not isinstance(utf8_data, bytes):
        utf8_data = utf8_data.encode('utf-8')

    def add_doctype(name, public_id, system_id):
        # Empty identifiers are passed on as None.
        doctype = bs.Doctype.for_name_and_ids(name, public_id or None, system_id or None)
        soup.append(doctype)

    if keep_doctype and hasattr(bs, 'Doctype'):
        doctype_callback = add_doctype
    else:
        doctype_callback = None

    root = html_parser.parse_and_build(
        utf8_data, new_tag, Comment, NavigableString, append, doctype_callback, stack_size)
    soup.append(root)
    if return_root:
        return root
    return soup
html5-parser-0.4.10/src/html5_parser/encoding_names.py 0000644 0001750 0001750 00000014266 13475205626 023133 0 ustar kovid kovid 0000000 0000000 # Do not edit
# Generated by genencodings.py
encodings = {
"866": "ibm866",
"ansi_x3.4-1968": "windows-1252",
"arabic": "iso-8859-6",
"ascii": "windows-1252",
"asmo-708": "iso-8859-6",
"big5": "big5",
"big5-hkscs": "big5",
"chinese": "gbk",
"cn-big5": "big5",
"cp1250": "windows-1250",
"cp1251": "windows-1251",
"cp1252": "windows-1252",
"cp1253": "windows-1253",
"cp1254": "windows-1254",
"cp1255": "windows-1255",
"cp1256": "windows-1256",
"cp1257": "windows-1257",
"cp1258": "windows-1258",
"cp819": "windows-1252",
"cp866": "ibm866",
"csbig5": "big5",
"cseuckr": "euc-kr",
"cseucpkdfmtjapanese": "euc-jp",
"csgb2312": "gbk",
"csibm866": "ibm866",
"csiso2022jp": "iso-2022-jp",
"csiso2022kr": "replacement",
"csiso58gb231280": "gbk",
"csiso88596e": "iso-8859-6",
"csiso88596i": "iso-8859-6",
"csiso88598e": "iso-8859-8",
"csiso88598i": "iso-8859-8-i",
"csisolatin1": "windows-1252",
"csisolatin2": "iso-8859-2",
"csisolatin3": "iso-8859-3",
"csisolatin4": "iso-8859-4",
"csisolatin5": "windows-1254",
"csisolatin6": "iso-8859-10",
"csisolatin9": "iso-8859-15",
"csisolatinarabic": "iso-8859-6",
"csisolatincyrillic": "iso-8859-5",
"csisolatingreek": "iso-8859-7",
"csisolatinhebrew": "iso-8859-8",
"cskoi8r": "koi8-r",
"csksc56011987": "euc-kr",
"csmacintosh": "macintosh",
"csshiftjis": "shift_jis",
"cyrillic": "iso-8859-5",
"dos-874": "windows-874",
"ecma-114": "iso-8859-6",
"ecma-118": "iso-8859-7",
"elot_928": "iso-8859-7",
"euc-jp": "euc-jp",
"euc-kr": "euc-kr",
"gb18030": "gb18030",
"gb2312": "gbk",
"gb_2312": "gbk",
"gb_2312-80": "gbk",
"gbk": "gbk",
"greek": "iso-8859-7",
"greek8": "iso-8859-7",
"hebrew": "iso-8859-8",
"hz-gb-2312": "replacement",
"ibm819": "windows-1252",
"ibm866": "ibm866",
"iso-2022-cn": "replacement",
"iso-2022-cn-ext": "replacement",
"iso-2022-jp": "iso-2022-jp",
"iso-2022-kr": "replacement",
"iso-8859-1": "windows-1252",
"iso-8859-10": "iso-8859-10",
"iso-8859-11": "windows-874",
"iso-8859-13": "iso-8859-13",
"iso-8859-14": "iso-8859-14",
"iso-8859-15": "iso-8859-15",
"iso-8859-16": "iso-8859-16",
"iso-8859-2": "iso-8859-2",
"iso-8859-3": "iso-8859-3",
"iso-8859-4": "iso-8859-4",
"iso-8859-5": "iso-8859-5",
"iso-8859-6": "iso-8859-6",
"iso-8859-6-e": "iso-8859-6",
"iso-8859-6-i": "iso-8859-6",
"iso-8859-7": "iso-8859-7",
"iso-8859-8": "iso-8859-8",
"iso-8859-8-e": "iso-8859-8",
"iso-8859-8-i": "iso-8859-8-i",
"iso-8859-9": "windows-1254",
"iso-ir-100": "windows-1252",
"iso-ir-101": "iso-8859-2",
"iso-ir-109": "iso-8859-3",
"iso-ir-110": "iso-8859-4",
"iso-ir-126": "iso-8859-7",
"iso-ir-127": "iso-8859-6",
"iso-ir-138": "iso-8859-8",
"iso-ir-144": "iso-8859-5",
"iso-ir-148": "windows-1254",
"iso-ir-149": "euc-kr",
"iso-ir-157": "iso-8859-10",
"iso-ir-58": "gbk",
"iso8859-1": "windows-1252",
"iso8859-10": "iso-8859-10",
"iso8859-11": "windows-874",
"iso8859-13": "iso-8859-13",
"iso8859-14": "iso-8859-14",
"iso8859-15": "iso-8859-15",
"iso8859-2": "iso-8859-2",
"iso8859-3": "iso-8859-3",
"iso8859-4": "iso-8859-4",
"iso8859-5": "iso-8859-5",
"iso8859-6": "iso-8859-6",
"iso8859-7": "iso-8859-7",
"iso8859-8": "iso-8859-8",
"iso8859-9": "windows-1254",
"iso88591": "windows-1252",
"iso885910": "iso-8859-10",
"iso885911": "windows-874",
"iso885913": "iso-8859-13",
"iso885914": "iso-8859-14",
"iso885915": "iso-8859-15",
"iso88592": "iso-8859-2",
"iso88593": "iso-8859-3",
"iso88594": "iso-8859-4",
"iso88595": "iso-8859-5",
"iso88596": "iso-8859-6",
"iso88597": "iso-8859-7",
"iso88598": "iso-8859-8",
"iso88599": "windows-1254",
"iso_8859-1": "windows-1252",
"iso_8859-15": "iso-8859-15",
"iso_8859-1:1987": "windows-1252",
"iso_8859-2": "iso-8859-2",
"iso_8859-2:1987": "iso-8859-2",
"iso_8859-3": "iso-8859-3",
"iso_8859-3:1988": "iso-8859-3",
"iso_8859-4": "iso-8859-4",
"iso_8859-4:1988": "iso-8859-4",
"iso_8859-5": "iso-8859-5",
"iso_8859-5:1988": "iso-8859-5",
"iso_8859-6": "iso-8859-6",
"iso_8859-6:1987": "iso-8859-6",
"iso_8859-7": "iso-8859-7",
"iso_8859-7:1987": "iso-8859-7",
"iso_8859-8": "iso-8859-8",
"iso_8859-8:1988": "iso-8859-8",
"iso_8859-9": "windows-1254",
"iso_8859-9:1989": "windows-1254",
"koi": "koi8-r",
"koi8": "koi8-r",
"koi8-r": "koi8-r",
"koi8-ru": "koi8-u",
"koi8-u": "koi8-u",
"koi8_r": "koi8-r",
"korean": "euc-kr",
"ks_c_5601-1987": "euc-kr",
"ks_c_5601-1989": "euc-kr",
"ksc5601": "euc-kr",
"ksc_5601": "euc-kr",
"l1": "windows-1252",
"l2": "iso-8859-2",
"l3": "iso-8859-3",
"l4": "iso-8859-4",
"l5": "windows-1254",
"l6": "iso-8859-10",
"l9": "iso-8859-15",
"latin1": "windows-1252",
"latin2": "iso-8859-2",
"latin3": "iso-8859-3",
"latin4": "iso-8859-4",
"latin5": "windows-1254",
"latin6": "iso-8859-10",
"logical": "iso-8859-8-i",
"mac": "macintosh",
"macintosh": "macintosh",
"ms932": "shift_jis",
"ms_kanji": "shift_jis",
"shift-jis": "shift_jis",
"shift_jis": "shift_jis",
"sjis": "shift_jis",
"sun_eu_greek": "iso-8859-7",
"tis-620": "windows-874",
"unicode-1-1-utf-8": "utf-8",
"us-ascii": "windows-1252",
"utf-16": "utf-16le",
"utf-16be": "utf-16be",
"utf-16le": "utf-16le",
"utf-8": "utf-8",
"utf8": "utf-8",
"visual": "iso-8859-8",
"windows-1250": "windows-1250",
"windows-1251": "windows-1251",
"windows-1252": "windows-1252",
"windows-1253": "windows-1253",
"windows-1254": "windows-1254",
"windows-1255": "windows-1255",
"windows-1256": "windows-1256",
"windows-1257": "windows-1257",
"windows-1258": "windows-1258",
"windows-31j": "shift_jis",
"windows-874": "windows-874",
"windows-949": "euc-kr",
"x-cp1250": "windows-1250",
"x-cp1251": "windows-1251",
"x-cp1252": "windows-1252",
"x-cp1253": "windows-1253",
"x-cp1254": "windows-1254",
"x-cp1255": "windows-1255",
"x-cp1256": "windows-1256",
"x-cp1257": "windows-1257",
"x-cp1258": "windows-1258",
"x-euc-jp": "euc-jp",
"x-gbk": "gbk",
"x-mac-cyrillic": "x-mac-cyrillic",
"x-mac-roman": "macintosh",
"x-mac-ukrainian": "x-mac-cyrillic",
"x-sjis": "shift_jis",
"x-user-defined": "x-user-defined",
"x-x-big5": "big5",
}
html5-parser-0.4.10/src/html5_parser/stdlib_etree.py 0000644 0001750 0001750 00000002765 13475205626 022630 0 ustar kovid kovid 0000000 0000000 #!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
from lxml.etree import _Comment
if sys.version_info.major < 3:
from xml.etree.cElementTree import Element, SubElement, ElementTree, Comment, register_namespace
else:
from xml.etree.ElementTree import Element, SubElement, ElementTree, Comment, register_namespace
# Register fixed prefixes for the SVG and XLink namespaces with ElementTree
# at import time.
register_namespace('svg', "http://www.w3.org/2000/svg")
register_namespace('xlink', "http://www.w3.org/1999/xlink")
def convert_elem(src, parent=None):
    """Create an ElementTree element with the tag and attributes of *src*.

    With *parent* given the new element is created as its last child
    (``SubElement``); otherwise a detached ``Element`` is returned.
    """
    tag = src.tag
    attributes = dict(src.items())
    if parent is not None:
        return SubElement(parent, tag, attributes)
    return Element(tag, attributes)
def adapt(src_tree, return_root=True, **kw):
    """Convert an lxml tree into a stdlib ``xml.etree.ElementTree`` tree.

    :param src_tree: The source tree; must provide ``getroot()`` and elements
        supporting ``iterchildren()``.
    :param return_root: If true return the converted root element, otherwise
        an ``ElementTree`` wrapping it.
    :param kw: Accepted and ignored.
    """
    src_root = src_tree.getroot()
    dest_root = convert_elem(src_root)
    # Text directly inside the root element was previously dropped, because
    # text is otherwise only copied for children inside the loop below.
    dest_root.text = src_root.text
    stack = [(src_root, dest_root)]
    while stack:
        src, dest = stack.pop()
        for src_child in src.iterchildren():
            if isinstance(src_child, _Comment):
                dest_child = Comment(src_child.text)
                dest_child.tail = src_child.tail
                dest.append(dest_child)
            else:
                # convert_elem() attaches the new element to dest itself.
                dest_child = convert_elem(src_child, dest)
                dest_child.text, dest_child.tail = src_child.text, src_child.tail
                stack.append((src_child, dest_child))
    return dest_root if return_root else ElementTree(dest_root)
html5-parser-0.4.10/src/html5_parser/dom.py 0000644 0001750 0001750 00000005111 13475205626 020726 0 ustar kovid kovid 0000000 0000000 #!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal
from __future__ import absolute_import, division, print_function, unicode_literals
from xml.dom.minidom import getDOMImplementation
from lxml.etree import _Comment
# DOM implementation used by adapt() to create the destination document.
impl = getDOMImplementation()
# Unbound "iterate over items" function: dict.iteritems where it exists
# (Python 2), dict.items otherwise.
dict_items = getattr(dict, 'iteritems', dict.items)
def elem_name_parts(elem):
    """Split the tag of *elem* into ``(namespace_uri, qualified_name)``.

    A tag in ``{uri}name`` notation yields the URI and the local name,
    prefixed with ``elem.prefix`` and a colon when the element has a prefix.
    A tag without a namespace yields ``(None, tag)``.
    """
    tag = elem.tag
    if not tag.startswith('{'):
        return None, tag
    braced_uri, _, local_name = tag.rpartition('}')
    prefix = elem.prefix
    qualified = (prefix + ':' + local_name) if prefix else local_name
    return braced_uri[1:], qualified
def attr_name_parts(name, elem, val):
    """Return ``(namespace_uri, qualified_name, val)`` for an attribute.

    For a name in ``{uri}name`` notation the prefix is the first one in
    ``elem.nsmap`` bound to that URI; if a non-empty prefix is found it is
    prepended to the local name. Names without a namespace yield
    ``(None, name, val)``.
    """
    if not name.startswith('{'):
        return None, name, val
    braced_uri, _, name = name.rpartition('}')
    uri = braced_uri[1:]
    prefix = next(
        (candidate for candidate, bound_uri in dict_items(elem.nsmap) if bound_uri == uri),
        None)
    if prefix:
        name = prefix + ':' + name
    return uri, name, val
def add_namespace_declarations(src, dest):
    """Copy the namespace declarations of *src* onto *dest* as attributes.

    When *src* has a parent, only the bindings that differ from the parent's
    ``nsmap`` are written. Each binding becomes an ``xmlns`` or
    ``xmlns:prefix`` attribute set via ``dest.setAttributeNS``.
    """
    declarations = src.nsmap
    if not declarations:
        return
    parent = src.getparent()
    if parent is not None:
        # Only add namespace declarations different from the parent's
        inherited = parent.nsmap or {}
        declarations = {
            prefix: uri for prefix, uri in dict_items(declarations)
            if uri != inherited.get(prefix)
        }
    for prefix, uri in dict_items(declarations):
        if prefix:
            attr = 'xmlns:' + prefix
        else:
            attr = 'xmlns'
        dest.setAttributeNS('xmlns', attr, uri)
def adapt(source_tree, return_root=True, **kw):
    """Convert an lxml tree into an ``xml.dom.minidom`` document.

    :param source_tree: The source tree; must provide ``getroot()`` and
        ``docinfo.doctype``.
    :param return_root: If true return the document element, otherwise the
        document itself.
    :param kw: Accepted and ignored.
    """
    source_root = source_tree.getroot()
    root_uri, root_qname = elem_name_parts(source_root)
    document = impl.createDocument(root_uri, root_qname, None)
    document.doctype = source_tree.docinfo.doctype
    document_root = document.documentElement

    pending = [(source_root, document_root)]
    while pending:
        src, dest = pending.pop()
        if src.text:
            dest.appendChild(document.createTextNode(src.text))
        add_namespace_declarations(src, dest)
        for attr_name, attr_value in src.items():
            uri, qname, value = attr_name_parts(attr_name, src, attr_value)
            dest.setAttributeNS(uri, qname, value)
        for child in src.iterchildren():
            if isinstance(child, _Comment):
                # '--' is replaced in the comment text before the node is created.
                comment_text = (child.text or '').replace('--', '—')
                converted = document.createComment(comment_text)
            else:
                child_uri, child_qname = elem_name_parts(child)
                converted = document.createElementNS(child_uri, child_qname)
                pending.append((child, converted))
            dest.appendChild(converted)
            if child.tail:
                dest.appendChild(document.createTextNode(child.tail))

    if return_root:
        return document_root
    return document
html5-parser-0.4.10/src/html5_parser/encoding_parser.py 0000644 0001750 0001750 00000026542 13475205626 023324 0 ustar kovid kovid 0000000 0000000 #!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal
from __future__ import (absolute_import, division, print_function, unicode_literals)
import string
from .encoding_names import encodings
# Whitespace characters: tab, line feed, form feed, space, carriage return.
space_chars = frozenset(("\t", "\n", "\u000C", " ", "\r"))
# The same characters as single-byte strings, for comparison against slices
# of the byte data.
space_chars_bytes = frozenset(item.encode("ascii") for item in space_chars)
# ASCII letters and ASCII uppercase letters as single-byte strings.
ascii_letters_bytes = frozenset(item.encode("ascii") for item in string.ascii_letters)
ascii_uppercase_bytes = frozenset(item.encode("ascii") for item in string.ascii_uppercase)
# Bytes that terminate an unquoted attribute value in get_attribute().
spaces_angle_brackets = space_chars_bytes | frozenset((b">", b"<"))
# Bytes skipped by get_attribute() before an attribute name: whitespace and "/".
skip1 = space_chars_bytes | frozenset((b"/", ))
# Encoding names from the encodings table that codec_name() replaces with a
# different name for use as a Python codec.
PYTHON_NAMES = {
'iso-8859-8-i': 'iso-8859-8',
'x-mac-cyrillic': 'mac-cyrillic',
'macintosh': 'mac-roman',
'windows-874': 'cp874'}
def codec_name(encoding):
    """Return the Python codec name for an encoding label, or None.

    :param encoding: The label as text or ASCII bytes. Leading and trailing
        ASCII whitespace is ignored and the label is matched ASCII
        case-insensitively, so ``'UTF-8'`` and ``' utf-8 '`` both resolve.
    :return: The codec name, or None if the label is empty, is not ASCII or
        does not name a known encoding.
    """
    if isinstance(encoding, bytes):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None
    if not encoding:
        return None
    # All keys of the encodings table are lowercase, so the label has to be
    # lower-cased for the lookup; previously mixed-case labels never matched.
    label = encoding.strip('\t\n\f\r ').lower()
    enc = encodings.get(label)
    if enc is None:
        return None
    return PYTHON_NAMES.get(enc, enc)
class EncodingBytes(bytes):
    """Lower-cased byte string with a cursor and scanning helpers.

    The cursor starts before the first byte (-1). Once it has moved past the
    end of the data, reading or changing it raises StopIteration.
    """

    def __new__(cls, value):
        return bytes.__new__(cls, value.lower())

    def __init__(self, value):
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        self._position += 1
        pos = self._position
        if pos >= len(self):
            raise StopIteration
        if pos < 0:
            raise TypeError
        return self[pos:pos + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        """Step the cursor back by one and return the byte there."""
        pos = self._position
        if pos >= len(self):
            raise StopIteration
        if pos < 0:
            raise TypeError
        pos -= 1
        self._position = pos
        return self[pos:pos + 1]

    @property
    def position(self):
        """The cursor; None while it is still before the first byte."""
        current = self._position
        if current >= len(self):
            raise StopIteration
        if current >= 0:
            return current

    @position.setter
    def position(self, position):
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    @property
    def current_byte(self):
        """The single byte under the cursor."""
        return self[self.position:self.position + 1]

    def _scan(self, chars, stop_on_member):
        """Advance from the cursor to the first byte whose membership in
        *chars* equals *stop_on_member*; return it, or None at end of data."""
        pos = self.position  # use property for the error-checking
        while pos < len(self):
            byte = self[pos:pos + 1]
            if (byte in chars) == stop_on_member:
                self._position = pos
                return byte
            pos += 1
        self._position = pos
        return None

    def skip(self, chars=space_chars_bytes):
        """Skip past a run of bytes from *chars*; return the first other byte."""
        return self._scan(chars, False)

    def skip_until(self, chars):
        """Skip forward to the first byte from *chars* and return it."""
        return self._scan(chars, True)

    def match_bytes(self, bytes):
        """Test whether the data at the cursor starts with *bytes*.

        On a match the cursor is advanced by ``len(bytes)`` and True is
        returned; otherwise the cursor is left alone and False is returned.
        """
        start = self.position
        matched = self[start:start + len(bytes)].startswith(bytes)
        if matched:
            self.position = start + len(bytes)
        return matched

    def jump_to(self, bytes):
        """Move the cursor to the last byte of the next occurrence of *bytes*.

        Returns True when found; raises StopIteration when there is no
        further occurrence.
        """
        offset = self[self.position:].find(bytes)
        if offset < 0:
            raise StopIteration
        # A cursor still before the data counts as index 0 for the arithmetic.
        base = 0 if self._position == -1 else self._position
        self._position = base + (offset + len(bytes) - 1)
        return True
class ContentAttrParser(object):
    """Extract the ``charset=...`` value from the bytes given as *data*.

    *data* is expected to offer the cursor interface of ``EncodingBytes``
    (``jump_to``, ``skip``, ``skip_until``, ``position``, ``current_byte``).
    """

    def __init__(self, data):
        self.data = data

    def parse(self):
        """Return the charset value as bytes, or None if there is none."""
        data = self.data
        try:
            # Find "charset"; jump_to raises StopIteration if it is absent
            data.jump_to(b"charset")
            data.position += 1
            data.skip()
            if data.current_byte != b"=":
                # If there is no = sign keep looking for attrs
                return None
            data.position += 1
            data.skip()

            opening = data.current_byte
            if opening in (b'"', b"'"):
                # Quoted value: everything up to the matching quote mark
                data.position += 1
                start = data.position
                if data.jump_to(opening):
                    return data[start:data.position]
                return None

            # Unquoted value: runs up to the next whitespace byte
            start = data.position
            try:
                data.skip_until(space_chars_bytes)
                return data[start:data.position]
            except StopIteration:
                # Return the whole remaining value
                return data[start:]
        except StopIteration:
            return None
class EncodingParser(object):
"""Mini parser for detecting character encoding from meta elements"""
def __init__(self, data):
    """Wrap *data* for scanning; no encoding has been detected yet.

    :param data: the bytes to examine for an encoding declaration.
    """
    self.encoding = None
    self.data = EncodingBytes(data)
def __call__(self):
dispatch = ((b"")
def handle_meta(self):
if self.data.current_byte not in space_chars_bytes:
# if we have ")
def get_attribute(self):
    """Read the next attribute from the stream.

    Returns a ``(name, value)`` pair of lower-cased bytes, or None when no
    attribute is found. Running off the end of the data raises
    StopIteration, exactly as the underlying iterator does.
    """
    data = self.data
    name_parts = []
    value_parts = []

    def name_only():
        return b"".join(name_parts), b""

    def name_and_value():
        return b"".join(name_parts), b"".join(value_parts)

    # Step 1 (skip chars)
    c = data.skip(skip1)
    assert c is None or len(c) == 1
    # Step 2
    if c in (b">", None):
        return None

    # Steps 3-5: collect the attribute name
    while True:
        if c == b"=" and name_parts:
            break
        if c in space_chars_bytes:
            # Step 6: whitespace ends the name; skip it
            c = data.skip()
            break
        if c in (b"/", b">"):
            return name_only()
        if c is None:
            return None
        # lower() leaves every byte except ASCII uppercase letters unchanged
        name_parts.append(c.lower())
        c = next(data)

    # Step 7: a name that is not followed by "=" has no value
    if c != b"=":
        data.previous()
        return name_only()
    # Step 8
    next(data)
    # Step 9
    c = data.skip()

    # Step 10
    if c in (b"'", b'"'):
        # Quoted value: read up to the matching quote
        quote_char = c
        while True:
            c = next(data)
            if c == quote_char:
                next(data)
                return name_and_value()
            value_parts.append(c.lower())
    if c == b">":
        return name_only()
    if c is None:
        return None
    value_parts.append(c.lower())

    # Step 11: unquoted value, ends at whitespace or an angle bracket
    while True:
        c = next(data)
        if c in spaces_angle_brackets:
            return name_and_value()
        if c is None:
            return None
        value_parts.append(c.lower())
html5-parser-0.4.10/src/html5_parser/__init__.py 0000644 0001750 0001750 00000022056 14122552135 021703 0 ustar kovid kovid 0000000 0000000 #!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal
from __future__ import absolute_import, division, print_function, unicode_literals
import codecs
import importlib
import sys
from collections import namedtuple
from locale import getpreferredencoding
# Everything below needs the compiled extension and lxml, so it is skipped
# when sys.generating_docs_via_sphinx is set.
if not hasattr(sys, 'generating_docs_via_sphinx'):
    from lxml import etree  # Must be imported before html_parser to initialize libxml
    from . import html_parser

    # Version of the C extension as a named tuple.
    version = namedtuple('Version', 'major minor patch')(
        html_parser.MAJOR, html_parser.MINOR, html_parser.PATCH)

    if not hasattr(etree, 'adopt_external_document'):
        raise ImportError('Your version of lxml is too old, version 3.8.0 is minimum')

    # html_parser.LIBXML_VERSION packs major/minor/patch as two decimal
    # digits each; unpack it into a 3-tuple.
    LIBXML_VERSION = tuple(
        (html_parser.LIBXML_VERSION // divisor) % 100 for divisor in (10000, 100, 1))
    # The extension and lxml must agree on the libxml2 major and minor version.
    if LIBXML_VERSION[:2] != etree.LIBXML_VERSION[:2]:
        raise RuntimeError(
            'html5-parser and lxml are using different versions of libxml2.'
            ' This happens commonly when using pip installed versions of lxml.'
            ' Use pip install --no-binary lxml lxml instead.'
            ' libxml2 versions: html5-parser: {} != lxml: {}'.format(
                LIBXML_VERSION, etree.LIBXML_VERSION))
# Byte order marks recognised by check_bom(), in the order they are tried.
BOMS = (codecs.BOM_UTF8, codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)


def check_bom(data):
    """Return the byte order mark that *data* starts with, or None."""
    return next((bom for bom in BOMS if data.startswith(bom)), None)
def check_for_meta_charset(raw):
    """Return the encoding declared in the first 10 KiB of *raw*, if any.

    The value comes from ``EncodingParser``; a declared UTF-16 variant is
    reported as ``"utf-8"`` instead.
    """
    from .encoding_parser import EncodingParser  # delay load
    declared = EncodingParser(raw[:10 * 1024])()
    return "utf-8" if declared in ("utf-16", "utf-16be", "utf-16le") else declared
def detect_encoding(raw):
    """Return chardet's guess for the encoding of the first 50 KiB of *raw*."""
    from chardet import detect  # delay load
    sample = raw[:50 * 1024]
    guess = detect(sample)
    return guess['encoding']
# Lower-case encoding names for which as_utf8() passes the bytes through
# without transcoding.
passthrough_encodings = frozenset(('utf-8', 'utf8', 'ascii'))
def safe_get_preferred_encoding():
    """Return the codec name of the locale's preferred encoding, or None.

    None is returned when the preferred encoding cannot be determined or is
    not a codec known to Python.
    """
    try:
        preferred = getpreferredencoding(False)
    except Exception:
        return None
    try:
        return codecs.lookup(preferred).name
    except LookupError:
        return None
def as_utf8(bytes_or_unicode, transport_encoding=None, fallback_encoding=None):
    """Return the input as UTF-8 encoded bytes.

    :param bytes_or_unicode: The document, as bytes or as text. Text is simply
        encoded to UTF-8.
    :param transport_encoding: If given, the bytes are assumed to be in this
        encoding and no detection is performed.
    :param fallback_encoding: Used when no encoding is found by the meta
        charset scan or by chardet.
    :return: UTF-8 encoded bytes.

    Without a transport encoding the encoding is determined, in order, from a
    byte order mark, a meta charset declaration, chardet, *fallback_encoding*,
    the locale's preferred encoding and finally cp1252.
    """
    if not isinstance(bytes_or_unicode, bytes):
        return bytes_or_unicode.encode('utf-8')

    data = bytes_or_unicode
    if transport_encoding:
        if transport_encoding.lower() not in passthrough_encodings:
            data = data.decode(transport_encoding).encode('utf-8')
        return data

    # See
    # https://www.w3.org/TR/2011/WD-html5-20110113/parsing.html#determining-the-character-encoding
    bom = check_bom(data)
    if bom is not None:
        data = data[len(bom):]
        if bom != codecs.BOM_UTF8:
            # The BOM itself is not a codec name: pick the UTF-16 codec whose
            # byte order the BOM announces.
            bom_codec = 'utf-16-be' if bom == codecs.BOM_UTF16_BE else 'utf-16-le'
            data = data.decode(bom_codec).encode('utf-8')
        return data

    # 'cp1252' (not 'cp-1252', which Python does not recognise) is the last resort.
    encoding = (
        check_for_meta_charset(data) or detect_encoding(data) or fallback_encoding or
        safe_get_preferred_encoding() or 'cp1252')
    if encoding and encoding.lower() not in passthrough_encodings:
        if encoding == 'x-user-defined':
            # https://encoding.spec.whatwg.org/#x-user-defined
            # Bytes above 0x7F map to U+F780 + (byte - 0x80).
            try:
                to_char = unichr  # Python 2
            except NameError:
                to_char = chr
            text = ''.join(
                to_char(b if b <= 0x7F else 0xF780 + b - 0x80) for b in bytearray(data))
            # The result must be UTF-8 bytes like every other branch.
            data = text.encode('utf-8')
        else:
            data = data.decode(encoding).encode('utf-8')
    return data
def normalize_treebuilder(x):
    """Return the canonical treebuilder name for *x*.

    Values with a ``lower`` method are lower-cased; the aliases
    ``lxml.etree`` and ``etree`` map to ``lxml`` and ``stdlib_etree``.
    Anything else is returned as is.
    """
    aliases = {'lxml.etree': 'lxml', 'etree': 'stdlib_etree'}
    name = x.lower() if hasattr(x, 'lower') else x
    return aliases.get(name, name)
# Treebuilders for which parse() honours namespace_elements; for any other
# builder parse() forces namespace_elements to False.
NAMESPACE_SUPPORTING_BUILDERS = frozenset('lxml stdlib_etree dom lxml_html'.split())
def parse(
    html,
    transport_encoding=None,
    namespace_elements=False,
    treebuilder='lxml',
    fallback_encoding=None,
    keep_doctype=True,
    maybe_xhtml=False,
    return_root=True,
    line_number_attr=None,
    sanitize_names=True,
    stack_size=16 * 1024,
    fragment_context=None,
):
    '''
    Parse the specified :attr:`html` and return the parsed representation.

    :param html: The HTML to be parsed. Can be either bytes or a unicode string.

    :param transport_encoding: If specified, assume the passed in bytes are in this encoding.
        Ignored if :attr:`html` is unicode.

    :param namespace_elements:
        Add XML namespaces when parsing so that the resulting tree is XHTML.

    :param treebuilder:
        The type of tree to return. Note that only the lxml treebuilder is fast, as all
        other treebuilders are implemented in python, not C. Supported values are:

          * ``lxml`` -- the default, and fastest
          * ``lxml_html`` -- tree of lxml.html.HtmlElement, same speed as lxml
            (new in *0.4.10*)
          * ``etree`` (the python stdlib :mod:`xml.etree.ElementTree`)
          * ``dom`` (the python stdlib :mod:`xml.dom.minidom`)
          * ``soup`` -- BeautifulSoup, which must be installed or it will raise
            an :class:`ImportError`

    :param fallback_encoding: If no encoding could be detected, then use this encoding.
        Defaults to an encoding based on system locale.

    :param keep_doctype: Keep the ``<!DOCTYPE>`` declaration (if any).

    :param maybe_xhtml: Useful when it is unknown if the HTML to be parsed is
        actually XHTML. Changes the HTML 5 parsing algorithm to be more
        suitable for XHTML. In particular handles self-closed CDATA elements.
        So a self-closed ``<title/>`` or ``<style/>`` in the HTML will not
        completely break parsing. Also preserves namespaced tags and attributes
        even for namespaces not supported by HTML 5 (this works only with the
        ``lxml`` and ``lxml_html`` treebuilders).
        Note that setting this also implicitly sets ``namespace_elements``.

    :param return_root: If True, return the root node of the document, otherwise
        return the tree object for the document.

    :param line_number_attr: The optional name of an attribute used to store the line number
        of every element. If set, this attribute will be added to each element with the
        element's line number.

    :param sanitize_names: Ensure tag and attributes contain only ASCII alphanumeric
        characters, underscores, hyphens and periods. This ensures that the resulting
        tree is also valid XML. Any characters outside this set are replaced by
        underscores. Note that this is not strictly HTML 5 spec compliant, so turn it
        off if you need strict spec compliance.

    :param stack_size: The initial size (number of items) in the stack. The
        default is sufficient to avoid memory allocations for all but the
        largest documents.

    :param fragment_context: the tag name under which to parse the HTML when the html
        is a fragment. Common choices are ``div`` or ``body``. To use SVG or MATHML tags
        prefix the tag name with ``svg:`` or ``math:`` respectively. Note that currently
        using a non-HTML fragment_context is not supported. New in *0.4.10*.
    '''
    data = as_utf8(html or b'', transport_encoding, fallback_encoding)
    treebuilder = normalize_treebuilder(treebuilder)

    # BeautifulSoup trees are built by a separate code path.
    if treebuilder == 'soup':
        from .soup import parse as soup_parse
        return soup_parse(
            data, return_root=return_root, keep_doctype=keep_doctype, stack_size=stack_size)

    if treebuilder not in NAMESPACE_SUPPORTING_BUILDERS:
        namespace_elements = False

    # A fragment context of the form "prefix:tag" selects the namespace by prefix.
    fragment_namespace = html_parser.GUMBO_NAMESPACE_HTML
    if fragment_context:
        fragment_context = fragment_context.lower()
        prefix, separator, local_name = fragment_context.partition(':')
        if separator:
            fragment_context = local_name
            namespaces_by_prefix = {
                'svg': html_parser.GUMBO_NAMESPACE_SVG,
                'math': html_parser.GUMBO_NAMESPACE_MATHML,
                'html': html_parser.GUMBO_NAMESPACE_HTML,
            }
            fragment_namespace = namespaces_by_prefix[prefix]

    capsule = html_parser.parse(
        data,
        namespace_elements=namespace_elements or maybe_xhtml,
        keep_doctype=keep_doctype,
        maybe_xhtml=maybe_xhtml,
        line_number_attr=line_number_attr,
        sanitize_names=sanitize_names,
        stack_size=stack_size,
        fragment_context=fragment_context,
        fragment_namespace=fragment_namespace,
    )

    # Hand the libxml document over to lxml; lxml_html uses lxml's HTMLParser
    # so that the elements are lxml.html classes.
    interpreter = None
    if treebuilder == 'lxml_html':
        from lxml.html import HTMLParser
        interpreter = HTMLParser()
    ans = etree.adopt_external_document(capsule, parser=interpreter)
    if treebuilder in ('lxml', 'lxml_html'):
        if return_root:
            return ans.getroot()
        return ans

    # Every other treebuilder is a module in this package exposing adapt().
    adapter = importlib.import_module('html5_parser.' + treebuilder)
    return adapter.adapt(ans, return_root=return_root)
html5-parser-0.4.10/src/as-libxml.h 0000644 0001750 0001750 00000000631 13475205503 017225 0 ustar kovid kovid 0000000 0000000 /*
* Copyright (C) 2017 Kovid Goyal
*
* Distributed under terms of the Apache 2.0 license.
*/
#pragma once
#include "data-types.h"
/* Opaque document type: callers only ever handle libxml_doc pointers. */
typedef void libxml_doc;
/* Copy a document. NOTE(review): callers treat a NULL result as out-of-memory. */
libxml_doc* copy_libxml_doc(libxml_doc* doc);
/* Release a document. The return type is the void typedef, i.e. nothing is returned. */
libxml_doc free_libxml_doc(libxml_doc* doc);
/* Returns the libxml version as an int (exposed to Python as LIBXML_VERSION). */
int get_libxml_version(void);
/* Convert a gumbo parse result to a libxml document.
 * NOTE(review): presumably *errmsg receives a message on failure -- confirm in the implementation. */
libxml_doc* convert_gumbo_tree_to_libxml_tree(GumboOutput *output, Options *opts, char **errmsg);
html5-parser-0.4.10/src/as-python-tree.c 0000644 0001750 0001750 00000015572 13475205503 020221 0 ustar kovid kovid 0000000 0000000 /*
* as-python-tree.c
* Copyright (C) 2017 Kovid Goyal
*
* Distributed under terms of the GPL3 license.
*/
#include "as-python-tree.h"
// Stack {{{
// Parameters for stack.h, which is included once these macros are set.
// NOTE(review): presumably stack.h generates the Stack type and Stack_push()
// used below, holding (GumboNode*, PyObject*) pairs -- confirm in stack.h.
#define Item1 GumboNode*
#define Item2 PyObject*
#define StackItemClass StackItem
#define StackClass Stack
#include "stack.h"
// }}}
// Tuples of pre-built Python strings for known tag and attribute names.
// Set by set_known_tag_names(); the pointers are stored without an INCREF.
static PyObject *KNOWN_TAG_NAMES, *KNOWN_ATTR_NAMES;
// Known attribute names, indexed by HTMLAttr value; the list itself comes
// from attr_strings.h, followed by an empty entry for HTML_ATTR_LAST.
const char* ATTR_NAMES[] = {
#include "attr_strings.h"
"", // ATTR_LAST
};
// Length of each name in ATTR_NAMES, same indexing.
static const uint8_t ATTR_SIZES[] = {
#include "attr_sizes.h"
0, // ATTR_LAST
};
// NOTE(review): attr_perf.h presumably provides attr_hash() and HTML_ATTR_MAP
// used by attr_num() below -- confirm in the generated header.
#include "attr_perf.h"
// Number of entries in HTML_ATTR_MAP.
#define ATTR_MAP_SIZE (sizeof(HTML_ATTR_MAP) / sizeof(HTML_ATTR_MAP[0]))
/*
 * Map an attribute name (length bytes at attr) to its HTMLAttr value.
 * Returns HTML_ATTR_LAST for the empty name and for names not in the table.
 */
static inline HTMLAttr
attr_num(const char *attr, unsigned int length) {
    if (UNLIKELY(!length)) return HTML_ATTR_LAST;

    const unsigned int slot = attr_hash(attr, length);
    if (slot >= ATTR_MAP_SIZE) return HTML_ATTR_LAST;

    /* The hash only nominates a candidate; confirm length and bytes match. */
    const HTMLAttr candidate = HTML_ATTR_MAP[slot];
    const int idx = (int) candidate;
    if (LIKELY(length == ATTR_SIZES[idx] && !strncmp(attr, ATTR_NAMES[idx], length))) return candidate;

    return HTML_ATTR_LAST;
}
/*
 * Remember the two tuples in KNOWN_TAG_NAMES / KNOWN_ATTR_NAMES and fill
 * them with Python strings: one per gumbo tag below GUMBO_TAG_UNKNOWN and
 * one per attribute below HTML_ATTR_LAST. Returns false as soon as a string
 * cannot be created (the Python exception is left set), true otherwise.
 */
bool
set_known_tag_names(PyObject *val, PyObject *attr_val) {
    PyObject *name = NULL;
    int idx;

    KNOWN_TAG_NAMES = val;
    for (idx = 0; idx < GUMBO_TAG_UNKNOWN; idx++) {
        name = PyUnicode_FromString(gumbo_normalized_tagname(idx));
        if (name == NULL) return false;
        /* PyTuple_SET_ITEM takes over the reference to name. */
        PyTuple_SET_ITEM(KNOWN_TAG_NAMES, idx, name);
    }

    KNOWN_ATTR_NAMES = attr_val;
    for (idx = 0; idx < HTML_ATTR_LAST; idx++) {
        name = PyUnicode_FromString(ATTR_NAMES[idx]);
        if (name == NULL) return false;
        PyTuple_SET_ITEM(KNOWN_ATTR_NAMES, idx, name);
    }

    return true;
}
/*
 * Push every child of elem onto stack, paired with parent. Children are
 * pushed last to first. Returns false as soon as a push fails.
 */
static inline bool
push_children(PyObject *parent, GumboElement *elem, Stack *stack) {
    int idx = elem->children.length;
    while (--idx >= 0) {
        if (!Stack_push(stack, elem->children.data[idx], parent)) return false;
    }
    return true;
}
/*
 * Return a new reference to a Python string for the attribute name aname.
 * Known names reuse the string stored in KNOWN_ATTR_NAMES; any other name
 * gets a freshly created string (NULL on failure).
 */
static inline PyObject*
create_attr_name(const char *aname) {
    const size_t len = strlen(aname);
    const HTMLAttr known = attr_num(aname, len);

    if (known < HTML_ATTR_LAST) {
        PyObject *shared = PyTuple_GET_ITEM(KNOWN_ATTR_NAMES, (int)known);
        Py_INCREF(shared);
        return shared;
    }
    return PyUnicode_FromStringAndSize(aname, len);
}
/*
 * Build a Python dict mapping attribute names to values for elem.
 * Attributes in the xlink, xml and xmlns namespaces get the matching
 * "prefix:" prepended to their name. Returns NULL if the dict or any
 * name/value cannot be created or inserted.
 */
static inline PyObject*
create_attributes(GumboElement *elem) {
    char prefixed[MAX_TAG_NAME_SZ];
    PyObject *result = PyDict_New();
    if (result == NULL) return NULL;

    for (unsigned int idx = 0; idx < elem->attributes.length; ++idx) {
        GumboAttribute *attr = elem->attributes.data[idx];
        const char *name = attr->name;

        switch (attr->attr_namespace) {
            case GUMBO_ATTR_NAMESPACE_XLINK:
                snprintf(prefixed, MAX_TAG_NAME_SZ - 1, "xlink:%s", name);
                name = prefixed;
                break;
            case GUMBO_ATTR_NAMESPACE_XML:
                snprintf(prefixed, MAX_TAG_NAME_SZ - 1, "xml:%s", name);
                name = prefixed;
                break;
            case GUMBO_ATTR_NAMESPACE_XMLNS:
                snprintf(prefixed, MAX_TAG_NAME_SZ - 1, "xmlns:%s", name);
                name = prefixed;
                break;
            default:
                break;
        }

        /* Both objects are created before either is checked. */
        PyObject *key = create_attr_name(name);
        PyObject *value = PyUnicode_FromString(attr->value);
        bool stored = key != NULL && value != NULL && PyDict_SetItem(result, key, value) == 0;

        /* The dict holds its own references; ours are dropped either way. */
        Py_XDECREF(key);
        Py_XDECREF(value);
        if (UNLIKELY(!stored)) {
            Py_CLEAR(result);
            break;
        }
    }
    return result;
}
/*
 * Create the Python object for a Gumbo element by calling
 * new_tag(tag_name, attributes).
 *
 * The tag name is taken from the original source text for unknown tags, from
 * gumbo_normalize_svg_tagname() for SVG elements when it yields a name, and
 * from the KNOWN_TAG_NAMES cache otherwise. Note that
 * gumbo_tag_from_original_text() is applied to elem->original_tag in place.
 * Returns a new reference, or NULL on failure.
 */
static inline PyObject*
create_element(GumboElement *elem, PyObject *new_tag) {
    PyObject *name = NULL;

    if (UNLIKELY(elem->tag >= GUMBO_TAG_UNKNOWN)) {
        gumbo_tag_from_original_text(&(elem->original_tag));
        name = PyUnicode_FromStringAndSize(elem->original_tag.data, elem->original_tag.length);
    } else {
        const char *svg_name = NULL;
        if (UNLIKELY(elem->tag_namespace == GUMBO_NAMESPACE_SVG)) {
            gumbo_tag_from_original_text(&(elem->original_tag));
            svg_name = gumbo_normalize_svg_tagname(&(elem->original_tag));
        }
        if (svg_name) {
            name = PyUnicode_FromStringAndSize(svg_name, elem->original_tag.length);
        } else {
            // Borrowed from the cache tuple, so take our own reference.
            name = PyTuple_GET_ITEM(KNOWN_TAG_NAMES, elem->tag);
            Py_INCREF(name);
        }
    }
    if (UNLIKELY(name == NULL)) return NULL;

    PyObject *attrs = create_attributes(elem);
    if (UNLIKELY(attrs == NULL)) {
        Py_DECREF(name);
        return NULL;
    }

    PyObject *element = PyObject_CallFunctionObjArgs(new_tag, name, attrs, NULL);
    Py_DECREF(name);
    Py_DECREF(attrs);
    return element;  // NULL when the call failed
}
/*
 * Convert a single Gumbo node into a Python object.
 *
 * Element and template nodes are built with create_element() and *elem is set
 * to the node's GumboElement; for every other node type *elem is NULL.
 * Text, whitespace and CDATA nodes are passed as a str to new_string, comment
 * nodes to new_comment. Any other node type raises TypeError.
 * Returns a new reference, or NULL with a Python exception set.
 */
static inline PyObject*
convert_node(GumboNode* node, GumboElement **elem, PyObject *new_tag, PyObject *new_comment, PyObject *new_string) {
    PyObject *factory = NULL;
    *elem = NULL;

    switch (node->type) {
        case GUMBO_NODE_ELEMENT:
        case GUMBO_NODE_TEMPLATE:
            *elem = &node->v.element;
            return create_element(*elem, new_tag);
        case GUMBO_NODE_TEXT:
        case GUMBO_NODE_WHITESPACE:
        case GUMBO_NODE_CDATA:
            factory = new_string;
            break;
        case GUMBO_NODE_COMMENT:
            factory = new_comment;
            break;
        default:
            PyErr_SetString(PyExc_TypeError, "unknown gumbo node type");
            return NULL;
    }

    // Shared path for all string-like nodes: wrap the text and hand it to the
    // selected factory callable.
    PyObject *text = PyUnicode_FromString(node->v.text.text);
    if (UNLIKELY(text == NULL)) return NULL;
    PyObject *result = PyObject_CallFunctionObjArgs(factory, text, NULL);
    Py_DECREF(text);
    return result;
}
/*
 * Convert a parsed Gumbo document into a tree of Python objects without
 * recursion, using an explicit work stack sized from opts->stack_size.
 *
 * new_tag is called as new_tag(name, attributes_dict) for element/template
 * nodes, new_string(text) for text/whitespace/CDATA nodes and
 * new_comment(text) for comments. append(parent, child) is called to attach
 * every non-root object to its parent.
 *
 * Returns a new reference to the object created for gumbo_output->root, or
 * NULL with a Python exception set on failure.
 */
PyObject*
as_python_tree(GumboOutput *gumbo_output, Options *opts, PyObject *new_tag, PyObject *new_comment, PyObject *new_string, PyObject *append) {
#define ABORT { ok = false; goto end; }
    bool ok = true;
    GumboNode *gumbo;
    GumboElement *elem;
    PyObject *parent, *child, *ans = NULL, *ret;
    Stack *stack = Stack_alloc(opts->stack_size);
    if (stack == NULL) return PyErr_NoMemory();

    // A failed push of the root would otherwise skip the loop and return NULL
    // without any exception set; report it the same way push_children
    // failures are reported below.
    if (UNLIKELY(!Stack_push(stack, gumbo_output->root, NULL))) { PyErr_NoMemory(); ABORT; }

    while (stack->length > 0) {
        Stack_pop(stack, &gumbo, &parent);
        child = convert_node(gumbo, &elem, new_tag, new_comment, new_string);
        if (UNLIKELY(!child)) ABORT;
        if (LIKELY(parent)) {
            ret = PyObject_CallFunctionObjArgs(append, parent, child, NULL);
            // NOTE(review): child is still used below after this DECREF; this
            // presumably relies on append() making the parent keep a
            // reference to child -- confirm against the callers.
            Py_DECREF(child);
            if (UNLIKELY(ret == NULL)) ABORT;
            Py_DECREF(ret);
        } else ans = child;  // the root: its reference is what we return
        if (elem != NULL) {
            if (UNLIKELY(!push_children(child, elem, stack))) { PyErr_NoMemory(); ABORT; }
        }
    }
end:
    Stack_free(stack);
    if (!ok) { Py_CLEAR(ans); }
    return ans;
#undef ABORT
}
html5-parser-0.4.10/src/data-types.h 0000644 0001750 0001750 00000003774 13475205503 017423 0 ustar kovid kovid 0000000 0000000 /*
* Copyright (C) 2017 Kovid Goyal
*
* Distributed under terms of the Apache 2.0 license.
*/
#pragma once
#include "../gumbo/gumbo.h"
#include
// Compiler-specific attribute macros.
//   UNUSED   - marks a declaration as intentionally unused (expands to nothing
//              under MSVC).
//   EXPORTED - marks a symbol as exported: dllexport under MSVC, default
//              visibility otherwise.
#ifdef _MSC_VER
#define UNUSED
#define EXPORTED __declspec(dllexport)
#else
#define UNUSED __attribute__ ((unused))
#define EXPORTED __attribute__ ((visibility ("default")))
#endif
// Branch-prediction hints for conditions that are almost always true (LIKELY)
// or almost always false (UNLIKELY).
//
// __builtin_expect is a compiler builtin, not a preprocessor macro, so testing
// it with #ifdef never succeeds and the hints were silently disabled
// everywhere. Detect the compilers that provide the builtin instead.
//
// With the builtin, the macros evaluate to 0 or 1 rather than to the original
// value of x, so use them only in boolean contexts (if/while conditions).
#if defined(__GNUC__) || defined(__clang__)
#define LIKELY(x) __builtin_expect (!!(x), 1)
#define UNLIKELY(x) __builtin_expect (!!(x), 0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
// Smaller / larger of two values. Both macros evaluate one of their arguments
// twice, so do not pass expressions with side effects.
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))
// Size in bytes of the scratch buffers used to build tag/attribute names.
#define MAX_TAG_NAME_SZ 100
// Settings that control how a document is parsed and converted.
typedef struct {
// Size passed to Stack_alloc() for the work stack used when converting the
// tree to Python objects.
unsigned int stack_size;
// Boolean switches; their consumers are not visible in this header.
// NOTE(review): presumably control doctype retention, namespacing of elements
// and name sanitization respectively -- confirm against the parser code.
bool keep_doctype, namespace_elements, sanitize_names;
// Opaque pointer; NOTE(review): presumably identifies the attribute used to
// record line numbers -- confirm.
const void* line_number_attr;
// Options for the underlying Gumbo parser.
GumboOptions gumbo_opts;
} Options;
// Numeric identifiers for the known attribute names. The enumerators come
// from the included attr_enum.h; values at or beyond HTML_ATTR_LAST mean
// "not a known attribute".
typedef enum {
#include "attr_enum.h"
// A marker value to indicate the end of the enum, for iterating over it.
HTML_ATTR_LAST,
} HTMLAttr;
// We only allow a subset of the valid characters defined in the XML spec for
// performance, as the following tests can be run directly on UTF-8 without
// decoding. Also, the other characters are never actually used successfully in
// the wild.
//
// VALID_FIRST_CHAR accepts ASCII letters and '_'; VALID_CHAR additionally
// accepts ASCII digits, '-' and '.'. Both macros evaluate their argument
// several times, so do not pass expressions with side effects.
#define VALID_FIRST_CHAR(c) ( \
(c >= 'a' && c <= 'z') || \
(c >= 'A' && c <= 'Z') || \
c == '_'\
)
#define VALID_CHAR(c) ( \
(c >= 'a' && c <= 'z') || \
(c >= '0' && c <= '9') || \
(c == '-') || \
(c >= 'A' && c <= 'Z') || \
(c == '_') || (c == '.') \
)
// Two-level stringification: STRFY2 expands its argument first (so __LINE__
// becomes the line number) and STRFY then turns the result into a string.
#define STRFY(x) #x
#define STRFY2(x) STRFY(x)
// Builds a string literal of the form "File: <file> Line: <line>: <x>".
// x must itself be a string literal because it is joined by literal
// concatenation.
#define ERRMSG(x) ("File: " __FILE__ " Line: " STRFY2(__LINE__) ": " x)
// Ready-made out-of-memory message.
#define NOMEM (ERRMSG("Out of memory"))
#ifdef NEEDS_SANITIZE_NAME
/*
 * Rewrite name in place so that it contains only permitted characters:
 * a first byte rejected by VALID_FIRST_CHAR and any later byte rejected by
 * VALID_CHAR is replaced with '_'. Returns the length of the name in bytes
 * (0 for an empty name).
 */
static inline size_t
sanitize_name(char *name) {
    char *cursor = name;
    if (UNLIKELY(*cursor == 0)) return 0;
    if (UNLIKELY(!VALID_FIRST_CHAR(*cursor))) *cursor = '_';
    for (cursor = name + 1; *cursor != 0; cursor++) {
        if (UNLIKELY(!VALID_CHAR(*cursor))) *cursor = '_';
    }
    return (size_t)(cursor - name);
}
#endif
html5-parser-0.4.10/src/as-python-tree.h 0000644 0001750 0001750 00000000661 13475205503 020217 0 ustar kovid kovid 0000000 0000000 /*
* Copyright (C) 2017 Kovid Goyal
*
* Distributed under terms of the GPL3 license.
*/
#pragma once
#define PY_SSIZE_T_CLEAN
#include
#include "data-types.h"
#include "../gumbo/gumbo.h"
/*
 * Convert the parsed document in gumbo_output into a tree of Python objects.
 *
 * opts->stack_size sizes the internal work stack. new_tag is called with
 * (tag name, attribute dict), new_comment and new_string with a single str,
 * and append with (parent, child) to attach each non-root object.
 *
 * Returns a new reference to the object built for the root node, or NULL with
 * a Python exception set on failure.
 */
PyObject*
as_python_tree(GumboOutput *gumbo_output, Options *opts, PyObject *new_tag, PyObject *new_comment, PyObject *new_string, PyObject *append);
/*
 * Install and populate the caches of known tag and attribute name strings
 * used when building the Python tree.
 *
 * NOTE(review): the first argument is presumably a pre-allocated tuple for tag
 * names and the second one for attribute names (items are stored with
 * PyTuple_SET_ITEM) -- confirm against the definition.
 * Returns false if a name string could not be created, true otherwise.
 */
bool
set_known_tag_names(PyObject *val, PyObject*);
html5-parser-0.4.10/src/attr_strings.h 0000644 0001750 0001750 00000011176 13475205503 020066 0 ustar kovid kovid 0000000 0000000 // Do not edit
// Generated by genattrs.py
"accent-height",
"accept",
"accept-charset",
"accesskey",
"accumulate",
"action",
"additive",
"align",
"alignment-baseline",
"allowReorder",
"alphabetic",
"alt",
"amplitude",
"arabic-form",
"ascent",
"async",
"attributeName",
"attributeType",
"autoReverse",
"autocomplete",
"autofocus",
"autoplay",
"autosave",
"azimuth",
"baseFrequency",
"baseProfile",
"baseline-shift",
"bbox",
"begin",
"bgcolor",
"bias",
"border",
"buffered",
"by",
"calcMode",
"cap-height",
"challenge",
"charset",
"checked",
"cite",
"class",
"clip",
"clip-path",
"clip-rule",
"clipPathUnits",
"code",
"codebase",
"color",
"color-interpolation",
"color-interpolation-filters",
"color-profile",
"color-rendering",
"cols",
"colspan",
"content",
"contentScriptType",
"contentStyleType",
"contenteditable",
"contextmenu",
"controls",
"coords",
"cursor",
"cx",
"cy",
"d",
"data",
"data-reactid",
"datetime",
"decelerate",
"default",
"defer",
"descent",
"diffuseConstant",
"dir",
"direction",
"dirname",
"disabled",
"display",
"divisor",
"dominant-baseline",
"download",
"draggable",
"dropzone",
"dur",
"dx",
"dy",
"edgeMode",
"elevation",
"enable-background",
"enctype",
"end",
"exponent",
"externalResourcesRequired",
"fill",
"fill-opacity",
"fill-rule",
"filter",
"filterRes",
"filterUnits",
"flood-color",
"flood-opacity",
"font-family",
"font-size",
"font-size-adjust",
"font-stretch",
"font-style",
"font-variant",
"font-weight",
"for",
"form",
"formaction",
"format",
"fr",
"from",
"fx",
"fy",
"g1",
"g2",
"glyph-name",
"glyph-orientation-horizontal",
"glyph-orientation-vertical",
"glyphRef",
"gradientTransform",
"gradientUnits",
"hanging",
"headers",
"height",
"hidden",
"high",
"horiz-adv-x",
"horiz-origin-x",
"href",
"hreflang",
"http-equiv",
"icon",
"id",
"ideographic",
"image-rendering",
"in",
"in2",
"integrity",
"intercept",
"ismap",
"itemprop",
"k",
"k1",
"k2",
"k3",
"k4",
"kernelMatrix",
"kernelUnitLength",
"kerning",
"keyPoints",
"keySplines",
"keyTimes",
"keytype",
"kind",
"label",
"lang",
"language",
"lengthAdjust",
"letter-spacing",
"lighting-color",
"limitingConeAngle",
"list",
"local",
"loop",
"low",
"manifest",
"marker-end",
"marker-mid",
"marker-start",
"markerHeight",
"markerUnits",
"markerWidth",
"mask",
"maskContentUnits",
"maskUnits",
"mathematical",
"max",
"maxlength",
"media",
"method",
"min",
"minlength",
"mode",
"multiple",
"muted",
"name",
"novalidate",
"numOctaves",
"offset",
"onabort",
"onactivate",
"onbegin",
"onclick",
"onend",
"onerror",
"onfocusin",
"onfocusout",
"onload",
"onmousedown",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onrepeat",
"onresize",
"onscroll",
"onunload",
"opacity",
"open",
"operator",
"optimum",
"order",
"orient",
"orientation",
"origin",
"overflow",
"overline-position",
"overline-thickness",
"paint-order",
"panose-1",
"pathLength",
"pattern",
"patternContentUnits",
"patternTransform",
"patternUnits",
"ping",
"placeholder",
"pointer-events",
"points",
"pointsAtX",
"pointsAtY",
"pointsAtZ",
"poster",
"preload",
"preserveAlpha",
"preserveAspectRatio",
"primitiveUnits",
"r",
"radiogroup",
"radius",
"readonly",
"refX",
"refY",
"rel",
"rendering-intent",
"repeatCount",
"repeatDur",
"required",
"requiredExtensions",
"requiredFeatures",
"restart",
"result",
"reversed",
"rotate",
"rows",
"rowspan",
"rx",
"ry",
"sandbox",
"scale",
"scope",
"scoped",
"seamless",
"seed",
"selected",
"shape",
"shape-rendering",
"size",
"sizes",
"slope",
"slot",
"spacing",
"span",
"specularConstant",
"specularExponent",
"speed",
"spellcheck",
"spreadMethod",
"src",
"srcdoc",
"srclang",
"srcset",
"start",
"startOffset",
"stdDeviation",
"stemh",
"stemv",
"step",
"stitchTiles",
"stop-color",
"stop-opacity",
"strikethrough-position",
"strikethrough-thickness",
"string",
"stroke",
"stroke-dasharray",
"stroke-dashoffset",
"stroke-linecap",
"stroke-linejoin",
"stroke-miterlimit",
"stroke-opacity",
"stroke-width",
"style",
"summary",
"surfaceScale",
"systemLanguage",
"tabindex",
"tableValues",
"target",
"targetX",
"targetY",
"text-anchor",
"text-decoration",
"text-rendering",
"textLength",
"title",
"to",
"transform",
"type",
"u1",
"u2",
"underline-position",
"underline-thickness",
"unicode",
"unicode-bidi",
"unicode-range",
"units-per-em",
"usemap",
"v-alphabetic",
"v-hanging",
"v-ideographic",
"v-mathematical",
"value",
"values",
"version",
"vert-adv-y",
"vert-origin-x",
"vert-origin-y",
"viewBox",
"viewTarget",
"visibility",
"width",
"widths",
"word-spacing",
"wrap",
"writing-mode",
"x",
"x-height",
"x1",
"x2",
"xChannelSelector",
"xlink:actuate",
"xlink:arcrole",
"xlink:href",
"xlink:role",
"xlink:show",
"xlink:title",
"xlink:type",
"xml:base",
"xml:lang",
"xml:space",
"y",
"y1",
"y2",
"yChannelSelector",
"z",
"zoomAndPan",
html5-parser-0.4.10/gumbo/ 0000755 0001750 0001750 00000000000 14122552315 015502 5 ustar kovid kovid 0000000 0000000 html5-parser-0.4.10/gumbo/util.h 0000644 0001750 0001750 00000003761 13475205503 016643 0 ustar kovid kovid 0000000 0000000 // Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
//
// This contains some utility functions that didn't fit into any of the other
// headers.
#ifndef GUMBO_UTIL_H_
#define GUMBO_UTIL_H_
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define strcasecmp _stricmp
#define strncasecmp _strnicmp
#else
#include
#endif
#include
#include
#include
#include
#ifdef __cplusplus
extern "C" {
#endif
// Allocation hooks used by the gumbo_malloc/gumbo_realloc/gumbo_free wrappers
// in this header. gumbo_user_allocator is called as (NULL, size) to allocate
// and as (ptr, size) to resize, i.e. with realloc-style arguments;
// gumbo_user_free releases a block.
extern void *(* gumbo_user_allocator)(void *, size_t);
extern void (* gumbo_user_free)(void *);
static inline void *gumbo_malloc(size_t size)
{
return gumbo_user_allocator(NULL, size);
}
static inline void *gumbo_realloc(void *ptr, size_t size)
{
return gumbo_user_allocator(ptr, size);
}
// Returns a copy of the NUL-terminated string str, allocated with
// gumbo_malloc(); release it with gumbo_free(). Returns NULL if the
// allocation fails (previously the copy was written through the unchecked
// pointer, which is undefined behaviour for a NULL result).
static inline char *gumbo_strdup(const char *str)
{
    size_t len = strlen(str) + 1;  // include the terminating NUL
    char *copy = (char *)gumbo_malloc(len);
    if (copy == NULL) {
        return NULL;
    }
    memcpy(copy, str, len);
    return copy;
}
static inline void gumbo_free(void *ptr)
{
gumbo_user_free(ptr);
}
// Locale-independent lower-casing: maps 'A'..'Z' to 'a'..'z' and returns every
// other value unchanged.
static inline int gumbo_tolower(int c)
{
    if (c >= 'A' && c <= 'Z') {
        return c | 0x20;
    }
    return c;
}
// Locale-independent test for ASCII letters: true for 'a'..'z' and 'A'..'Z'.
static inline bool gumbo_isalpha(int c)
{
    const int folded = c | 0x20;  // fold upper case onto lower case
    return folded >= 'a' && folded <= 'z';
}
#ifdef GUMBO_DEBUG
// Debug wrapper for printf, to make it easier to turn off debugging info when
// required. Output goes to stderr.
#define gumbo_debug(...) fprintf(stderr, __VA_ARGS__)
#else
// Without GUMBO_DEBUG the macro expands to nothing, so its arguments are never
// evaluated.
#define gumbo_debug(...)
#endif
#ifdef __cplusplus
}
#endif
#endif // GUMBO_UTIL_H_
html5-parser-0.4.10/gumbo/error.h 0000644 0001750 0001750 00000017133 13475205503 017015 0 ustar kovid kovid 0000000 0000000 // Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
//
// Error types, enums, and handling functions.
#ifndef GUMBO_ERROR_H_
#define GUMBO_ERROR_H_
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#endif
#include
#include "gumbo.h"
#include "insertion_mode.h"
#include "string_buffer.h"
#include "token_type.h"
#ifdef __cplusplus
extern "C" {
#endif
struct GumboInternalParser;
// The kind of error recorded in GumboError.type. The value determines which
// member of the GumboError.v union (if any) carries extra information.
typedef enum {
GUMBO_ERR_UTF8_INVALID,
GUMBO_ERR_UTF8_TRUNCATED,
GUMBO_ERR_UTF8_NULL,
GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
GUMBO_ERR_NAMED_CHAR_REF_INVALID,
GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
GUMBO_ERR_TAG_EOF,
GUMBO_ERR_TAG_INVALID,
GUMBO_ERR_CLOSE_TAG_EMPTY,
GUMBO_ERR_CLOSE_TAG_EOF,
GUMBO_ERR_CLOSE_TAG_INVALID,
GUMBO_ERR_SCRIPT_EOF,
GUMBO_ERR_ATTR_NAME_EOF,
GUMBO_ERR_ATTR_NAME_INVALID,
GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
GUMBO_ERR_ATTR_UNQUOTED_EOF,
GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
GUMBO_ERR_ATTR_AFTER_EOF,
GUMBO_ERR_ATTR_AFTER_INVALID,
GUMBO_ERR_DUPLICATE_ATTR,
GUMBO_ERR_SOLIDUS_EOF,
GUMBO_ERR_SOLIDUS_INVALID,
GUMBO_ERR_DASHES_OR_DOCTYPE,
GUMBO_ERR_COMMENT_EOF,
GUMBO_ERR_COMMENT_INVALID,
GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
GUMBO_ERR_COMMENT_END_BANG_EOF,
GUMBO_ERR_DOCTYPE_EOF,
GUMBO_ERR_DOCTYPE_INVALID,
GUMBO_ERR_DOCTYPE_SPACE,
GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
GUMBO_ERR_DOCTYPE_END,
GUMBO_ERR_PARSER,
GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
} GumboErrorType;
// Additional data for duplicated attributes (GUMBO_ERR_DUPLICATE_ATTR); stored
// in GumboError.v.duplicate_attr.
typedef struct GumboInternalDuplicateAttrError {
// The name of the attribute. Owned by this struct.
const char* name;
// The (0-based) index within the attributes vector of the original
// occurrence.
unsigned int original_index;
// The (0-based) index where the new occurrence would be.
unsigned int new_index;
} GumboDuplicateAttrError;
// A simplified representation of the tokenizer state, designed to be more
// useful to clients of this library than the internal representation. This
// condenses the actual states used in the tokenizer state machine into a few
// values that will be familiar to users of HTML.
// Reported through GumboTokenizerError.state.
typedef enum {
GUMBO_ERR_TOKENIZER_DATA,
GUMBO_ERR_TOKENIZER_CHAR_REF,
GUMBO_ERR_TOKENIZER_RCDATA,
GUMBO_ERR_TOKENIZER_RAWTEXT,
GUMBO_ERR_TOKENIZER_PLAINTEXT,
GUMBO_ERR_TOKENIZER_SCRIPT,
GUMBO_ERR_TOKENIZER_TAG,
GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
GUMBO_ERR_TOKENIZER_ATTR_NAME,
GUMBO_ERR_TOKENIZER_ATTR_VALUE,
GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
GUMBO_ERR_TOKENIZER_COMMENT,
GUMBO_ERR_TOKENIZER_DOCTYPE,
GUMBO_ERR_TOKENIZER_CDATA,
} GumboTokenizerErrorState;
// Additional data for tokenizer errors; stored in GumboError.v.tokenizer.
// This records the current state and codepoint encountered - this is usually
// enough to reconstruct what went wrong and provide a friendly error message.
typedef struct GumboInternalTokenizerError {
// The bad codepoint encountered.
int codepoint;
// The state that the tokenizer was in at the time.
GumboTokenizerErrorState state;
} GumboTokenizerError;
// Additional data for parse errors; stored in GumboError.v.parser.
typedef struct GumboInternalParserError {
// The type of input token that resulted in this error.
GumboTokenType input_type;
// The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
GumboTag input_tag;
// The insertion mode that the parser was in at the time.
GumboInsertionMode parser_state;
// The tag stack at the point of the error. Note that this is a GumboVector
// of GumboTags *stored by value* - cast the void* to a GumboTag directly to
// get at the tag.
GumboVector /* GumboTag */ tag_stack;
} GumboParserError;
// The overall error struct representing an error in decoding/tokenizing/parsing
// the HTML. This contains an enumerated type flag, a source position, and then
// a union of fields containing data specific to the error.
typedef struct GumboInternalError {
// The type of error.
GumboErrorType type;
// The position within the source file where the error occurred.
GumboSourcePosition position;
// A pointer to the byte within the original source file text where the error
// occurred (note that this is not the same as position.offset, as that gives
// character-based instead of byte-based offsets).
const char* original_text;
// Type-specific error information. Which member is valid depends on 'type'.
union {
// The code point we encountered, for:
// * GUMBO_ERR_UTF8_INVALID
// * GUMBO_ERR_UTF8_TRUNCATED
// * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
// * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
uint64_t codepoint;
// Tokenizer errors.
GumboTokenizerError tokenizer;
// Short textual data, for:
// * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
// * GUMBO_ERR_NAMED_CHAR_REF_INVALID
GumboStringPiece text;
// Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
GumboDuplicateAttrError duplicate_attr;
// Parser state, for GUMBO_ERR_PARSER and
// GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG.
struct GumboInternalParserError parser;
} v;
} GumboError;
// Adds a new error to the parser's error list, and returns a pointer to it so
// that clients can fill out the rest of its fields. May return NULL if we're
// already over the max_errors field specified in GumboOptions.
GumboError* gumbo_add_error(struct GumboInternalParser* parser);
// Initializes the errors vector in the parser.
void gumbo_init_errors(struct GumboInternalParser* errors);
// Frees all the errors in the 'errors_' field of the parser.
void gumbo_destroy_errors(struct GumboInternalParser* errors);
// Frees the memory used for a single GumboError.
void gumbo_error_destroy(GumboError* error);
// Prints an error to a string. This fills an empty GumboStringBuffer with a
// freshly-allocated buffer containing the error message text. The caller is
// responsible for deleting the buffer. (Note that the buffer is allocated with
// the allocator specified in the GumboParser config and hence should be freed
// by gumbo_free().)
void gumbo_error_to_string(const GumboError* error, GumboStringBuffer* output);
// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
// with a freshly-allocated buffer containing the error message text. The
// caller is responsible for deleting the buffer.
// NOTE(review): this comment previously said the buffer should be freed by
// gumbo_parser_deallocate(), which is not declared in the headers visible
// here; gumbo_free() (util.h) is presumably the intended deallocator, as
// stated for gumbo_error_to_string() -- confirm.
void gumbo_caret_diagnostic_to_string(const GumboError* error,
const char* source_text, GumboStringBuffer* output);
// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
// of writing to a string.
void gumbo_print_caret_diagnostic(
const GumboError* error, const char* source_text);
#ifdef __cplusplus
}
#endif
#endif // GUMBO_ERROR_H_
html5-parser-0.4.10/gumbo/tag.c 0000644 0001750 0001750 00000005735 13475205503 016437 0 ustar kovid kovid 0000000 0000000 // Copyright 2011 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
#include "gumbo.h"
#include "util.h"
#include
#include
// Normalized tag names, indexed by GumboTag. The entries for the known tags
// come from the generated tag_strings.h; the two trailing empty strings cover
// GUMBO_TAG_UNKNOWN and GUMBO_TAG_LAST.
const char* kGumboTagNames[] = {
#include "tag_strings.h"
"", // TAG_UNKNOWN
"", // TAG_LAST
};
// Length of each entry of kGumboTagNames, indexed by GumboTag. The values for
// the known tags come from the generated tag_sizes.h.
static const uint8_t kGumboTagSizes[] = {
#include "tag_sizes.h"
0, // TAG_UNKNOWN
0, // TAG_LAST
};
// Returns the normalized name for tag from kGumboTagNames; this is the empty
// string for GUMBO_TAG_UNKNOWN and GUMBO_TAG_LAST.
const char* gumbo_normalized_tagname(GumboTag tag) {
    assert(tag <= GUMBO_TAG_LAST);
    const char* const name = kGumboTagNames[tag];
    return name;
}
// Like gumbo_normalized_tagname(), but additionally stores the length of the
// returned name in *sz.
const char* gumbo_normalized_tagname_and_size(GumboTag tag, uint8_t* sz) {
    assert(tag <= GUMBO_TAG_LAST);
    const char* const name = kGumboTagNames[tag];
    *sz = kGumboTagSizes[tag];
    return name;
}
// Narrows text, which must span a complete tag from '<' to '>', down to just
// the tag name. The piece is modified in place; a piece with NULL data is left
// untouched.
void gumbo_tag_from_original_text(GumboStringPiece* text) {
    if (text->data == NULL) {
        return;
    }
    assert(text->length >= 2);
    assert(text->data[0] == '<');
    assert(text->data[text->length - 1] == '>');

    if (text->data[1] == '/') {
        // End tag: drop the leading "</" and the trailing '>'.
        assert(text->length >= 3);
        text->data += 2;
        text->length -= 3;
        return;
    }

    // Start tag: drop the leading '<' and the trailing '>'.
    text->data += 1;
    text->length -= 2;

    // Cut the name at the first whitespace or '/' character, i.e. the
    // characters that end a name in the Tag Name State. Only these characters
    // are tested rather than isspace(); see
    // https://github.com/google/gumbo-parser/pull/375/
    const char* const stop = text->data + text->length;
    for (const char* c = text->data; c != stop; ++c) {
        switch (*c) {
            case '\t':
            case '\n':
            case '\f':
            case ' ':
            case '/':
                text->length = c - text->data;
                return;
            default:
                break;
        }
    }
}
// tag_perf.h is generated (see gentags.py); it is expected to provide
// tag_hash() and the kGumboTagMap table used below.
#include "tag_perf.h"
// Number of entries in kGumboTagMap.
#define TAG_MAP_SIZE (sizeof(kGumboTagMap) / sizeof(kGumboTagMap[0]))
// Compares the first n bytes of s1 and s2, ignoring ASCII case. Returns 0 when
// they match, otherwise the difference between the first pair of lower-cased
// bytes that differ.
static int case_memcmp(const char* s1, const char* s2, unsigned int n) {
    for (unsigned int i = 0; i < n; i++) {
        unsigned char lhs = gumbo_tolower(s1[i]);
        unsigned char rhs = gumbo_tolower(s2[i]);
        if (lhs != rhs) {
            return (int) lhs - (int) rhs;
        }
    }
    return 0;
}
// Looks up the tag whose name equals the first 'length' bytes of tagname,
// ignoring ASCII case. Returns GUMBO_TAG_UNKNOWN when length is zero or no
// known tag matches.
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
    if (length == 0) {
        return GUMBO_TAG_UNKNOWN;
    }
    const unsigned int slot = tag_hash(tagname, length);
    if (slot >= TAG_MAP_SIZE) {
        return GUMBO_TAG_UNKNOWN;
    }
    // The hash only nominates a candidate; verify length and spelling.
    const GumboTag candidate = kGumboTagMap[slot];
    if (length != kGumboTagSizes[(int) candidate]) {
        return GUMBO_TAG_UNKNOWN;
    }
    if (case_memcmp(tagname, kGumboTagNames[(int) candidate], length) != 0) {
        return GUMBO_TAG_UNKNOWN;
    }
    return candidate;
}
// Convenience wrapper around gumbo_tagn_enum() for NUL-terminated tag names.
GumboTag gumbo_tag_enum(const char* tagname) {
    const unsigned int length = (unsigned int) strlen(tagname);
    return gumbo_tagn_enum(tagname, length);
}
html5-parser-0.4.10/gumbo/utf8.h 0000644 0001750 0001750 00000011512 13475205503 016545 0 ustar kovid kovid 0000000 0000000 // Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
//
// This contains an implementation of a UTF8 iterator and decoder suitable for
// an HTML5 parser. This does a bit more than straight UTF-8 decoding. The
// HTML5 spec specifies that:
// 1. Decoding errors are parse errors.
// 2. Certain other codepoints (eg. control characters) are parse errors.
// 3. Carriage returns and CR/LF groups are converted to line feeds.
// https://encoding.spec.whatwg.org/#utf-8-decode
//
// Also, we want to keep track of source positions for error handling. As a
// result, we fold all that functionality into this decoder, and can't use an
// off-the-shelf library.
//
// This header is internal-only, which is why we prefix functions with only
// utf8_ or utf8_iterator_ instead of gumbo_utf8_.
#ifndef GUMBO_UTF8_H_
#define GUMBO_UTF8_H_
#include
#include
#include "gumbo.h"
#ifdef __cplusplus
extern "C" {
#endif
struct GumboInternalError;
struct GumboInternalParser;
// Unicode replacement char.
extern const int kUtf8ReplacementChar;
// State of the UTF-8 iterator/decoder: the current code point and its location
// in the input buffer, a saved "mark" location, and the source positions that
// belong to both.
typedef struct GumboInternalUtf8Iterator {
// Points at the start of the code point most recently read into 'current'.
const char* _start;
// Points at the mark. The mark is initially set to the beginning of the
// input.
const char* _mark;
// Points past the end of the iter, like a past-the-end iterator in the STL.
const char* _end;
// The code point under the cursor.
int _current;
// The width in bytes of the current code point.
int _width;
// The SourcePosition for the current location.
GumboSourcePosition _pos;
// The SourcePosition for the mark.
GumboSourcePosition _mark_pos;
// Pointer back to the GumboParser instance, for configuration options and
// error recording.
struct GumboInternalParser* _parser;
} Utf8Iterator;
// Returns true if this Unicode code point is in the list of characters
// forbidden by the HTML5 spec, such as NUL bytes and undefined control chars.
bool utf8_is_invalid_code_point(int c);
// Initializes a new Utf8Iterator from the given byte buffer. The source does
// not have to be NUL-terminated, but the length must be passed in explicitly.
void utf8iterator_init(
struct GumboInternalParser* parser, const char* source,
size_t source_length, Utf8Iterator* iter);
// Advances the current position by one code point.
void utf8iterator_next(Utf8Iterator* iter);
// Returns the current code point as an integer.
int utf8iterator_current(const Utf8Iterator* iter);
// Retrieves and fills the output parameter with the current source position.
void utf8iterator_get_position(
const Utf8Iterator* iter, GumboSourcePosition* output);
// Retrieves a character pointer to the start of the current character.
const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
// Retrieves a character pointer to 1 past the end of the buffer. This is
// necessary for certain state machines and string comparisons that would like
// to look directly for ASCII text in the buffer without going through the
// decoder.
const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
// If the upcoming text in the buffer matches the specified prefix (which has
// length 'length'), consume it and return true. Otherwise, return false with
// no other effects. If the length of the string would overflow the buffer,
// this returns false. Note that prefix should not contain null bytes because
// of the use of strncmp/strncasecmp internally. All existing use-cases adhere
// to this.
bool utf8iterator_maybe_consume_match(
Utf8Iterator* iter, const char* prefix, size_t length, bool case_sensitive);
// "Marks" a particular location of interest in the input stream, so that it can
// later be reset() to. There's also the ability to record an error at the
// point that was marked, as oftentimes that's more useful than the last
// character before the error was detected.
void utf8iterator_mark(Utf8Iterator* iter);
// Returns the current input stream position to the mark.
void utf8iterator_reset(Utf8Iterator* iter);
// Sets the position and original text fields of an error to the value at the
// mark.
void utf8iterator_fill_error_at_mark(
Utf8Iterator* iter, struct GumboInternalError* error);
#ifdef __cplusplus
}
#endif
#endif // GUMBO_UTF8_H_
html5-parser-0.4.10/gumbo/string_piece.c 0000644 0001750 0001750 00000002714 13475205503 020331 0 ustar kovid kovid 0000000 0000000 // Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
#include "string_piece.h"
#include
#include
#include
#include "util.h"
struct GumboInternalParser;
// The empty string piece: NULL data pointer and zero length.
const GumboStringPiece kGumboEmptyString = {NULL, 0};
// Returns true when both pieces have the same length and identical bytes.
bool gumbo_string_equals(
    const GumboStringPiece* str1, const GumboStringPiece* str2) {
    if (str1->length != str2->length) {
        return false;
    }
    return memcmp(str1->data, str2->data, str1->length) == 0;
}
// Returns true when both pieces have the same length and compare equal with
// strncasecmp() over that length.
bool gumbo_string_equals_ignore_case(
    const GumboStringPiece* str1, const GumboStringPiece* str2) {
    if (str1->length != str2->length) {
        return false;
    }
    return strncasecmp(str1->data, str2->data, str1->length) == 0;
}
// Copies source into dest, giving dest its own buffer allocated with
// gumbo_malloc(). The caller owns dest->data afterwards.
//
// If the allocation yields NULL, dest is set to the empty piece (NULL data,
// length 0) instead of copying through the NULL pointer. For a zero-length
// source the copy step is skipped, because source->data may then be NULL and
// passing a null pointer to memcpy() is undefined even for a zero size.
void gumbo_string_copy(GumboStringPiece* dest, const GumboStringPiece* source) {
    char* buffer = gumbo_malloc(source->length);
    if (buffer == NULL) {
        dest->data = NULL;
        dest->length = 0;
        return;
    }
    if (source->length > 0) {
        memcpy(buffer, source->data, source->length);
    }
    dest->data = buffer;
    dest->length = source->length;
}
html5-parser-0.4.10/gumbo/gumbo.h 0000644 0001750 0001750 00000055652 13475205503 017005 0 ustar kovid kovid 0000000 0000000 // Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
//
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
// GUMBO_ as a prefix for enum constants (static constants get the Google-style
// kGumbo prefix).
/**
* @file
* @mainpage Gumbo HTML Parser
*
* This provides a conformant, no-dependencies implementation of the HTML5
* parsing algorithm. It supports only UTF8; if you need to parse a different
* encoding, run a preprocessing step to convert to UTF8. It returns a parse
* tree made of the structs in this file.
*
* Example:
* @code
* GumboOutput* output = gumbo_parse(input);
* do_something_with_doctype(output->document);
* do_something_with_html_tree(output->root);
* gumbo_destroy_output(&options, output);
* @endcode
* HTML5 Spec:
*
* http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
*/
#pragma once
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define fileno _fileno
#endif
#include
#include
#include
#ifdef __cplusplus
extern "C" {
#endif
/**
* A struct representing a character position within the original text buffer.
* Line and column numbers are 1-based and offsets are 0-based, which matches
* how most editors and command-line tools work. Also, columns measure
* positions in terms of characters while offsets measure by bytes; this is
* because the offset field is often used to pull out a particular region of
* text (which in most languages that bind to C implies pointer arithmetic on a
* buffer of bytes), while the column field is often used to reference a
* particular column on a printable display, which nowadays is usually UTF-8.
*/
typedef struct {
/** Line number, 1-based. */
unsigned int line;
/** Column number, 1-based. */
unsigned int column;
/** Offset into the original text buffer, 0-based. */
unsigned int offset;
} GumboSourcePosition;
/**
* The GumboSourcePosition used for elements that have no source position,
* i.e. parser-inserted elements. Only declared here; the definition lives
* outside this header.
*/
extern const GumboSourcePosition kGumboEmptySourcePosition;
/**
* A string, or part of a string, represented as a pointer plus a length.
*
* The char* points into an existing data buffer owned by some other code
* (often the original input), so a GumboStringPiece does not own its bytes.
* GumboStringPieces are assumed (by convention) to be immutable, because they
* may share data. Use GumboStringBuffer if you need to construct a string.
* Clients should assume that the data is not NUL-terminated, and should
* always use the explicit length when manipulating it.
*/
typedef struct {
/** A pointer to the beginning of the string. NULL iff length == 0. */
const char* data;
/** The length of the string fragment, in bytes. May be zero. */
size_t length;
} GumboStringPiece;
/**
* A constant representing the 0-length null string. Only declared here; the
* definition lives outside this header.
*/
extern const GumboStringPiece kGumboEmptyString;
/**
* Compares two GumboStringPieces for equality.
*
* @param str1 First string piece to compare.
* @param str2 Second string piece to compare.
* @return true if the two pieces are equal, false otherwise. Use
*         gumbo_string_equals_ignore_case() for a comparison that ignores
*         case.
*/
bool gumbo_string_equals(
const GumboStringPiece* str1, const GumboStringPiece* str2);
/**
* Compares two GumboStringPieces for equality, ignoring case.
*
* @param str1 First string piece to compare.
* @param str2 Second string piece to compare.
* @return true if the two pieces are equal when case is ignored, false
*         otherwise.
*/
bool gumbo_string_equals_ignore_case(
const GumboStringPiece* str1, const GumboStringPiece* str2);
/**
* A simple vector implementation: a pointer to a data array plus a length and
* a capacity.
*
* All elements are stored as void*; client code must cast to the appropriate
* type. Overflows upon addition result in reallocation of the data array, with
* the size doubling to maintain O(1) amortized cost. There is no removal
* function, as this isn't needed for any of the operations within this
* library. Iteration can be done by inspecting the structure directly in a
* for-loop over indices 0 .. length - 1 of data.
*/
typedef struct {
/** Data elements. This points to a dynamically-allocated array of capacity
* elements, each a void* to the element itself.
*/
void** data;
/** Number of elements currently in the vector. */
unsigned int length;
/** Current array capacity, i.e. the number of void* slots in data. */
unsigned int capacity;
} GumboVector;
/**
* An empty (0-length, 0-capacity) GumboVector. Only declared here; the
* definition lives outside this header.
*/
extern const GumboVector kGumboEmptyVector;
/**
* Finds an element in a vector, testing by pointer equality (the pointed-to
* contents are not compared).
*
* @param vector The vector to search.
* @param element The element pointer to look for.
* @return The first index at which element appears in the vector, or -1 if
*         it never does.
*/
int gumbo_vector_index_of(GumboVector* vector, const void* element);
/**
* An enum for all the tags defined in the HTML5 standard. These correspond to
* the tag names themselves. Enum constants exist only for tags which appear in
* the spec itself (or for tags with special handling in the SVG and MathML
* namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
* name can be obtained through original_tag.
*
* This is mostly for API convenience, so that clients of this library don't
* need to perform a strcasecmp to find the normalized tag name. It also has
* efficiency benefits, by letting the parser work with enums instead of
* strings.
*
* The per-tag enumerators are not written out here: they are pulled in from
* the generated header tag_enum.h, so the two trailing enumerators below
* always come after every known tag.
*/
typedef enum {
// Load all the tags from an external source, generated from tag.in.
# include "tag_enum.h"
// Used for all tags that don't have special handling in HTML. Add new tags
// to the end of tag.in so as to preserve backwards-compatibility.
GUMBO_TAG_UNKNOWN,
// A marker value to indicate the end of the enum, for iterating over it.
// Also used as the terminator for varargs functions that take tags.
GUMBO_TAG_LAST,
} GumboTag;
/**
* Returns the normalized (usually all-lowercased, except for foreign content)
* tag name for a GumboTag enum. Return value is static data owned by the
* library, so the caller must not free or modify it.
*
* @param tag The tag enum to look up.
* @return The normalized tag name.
*/
const char* gumbo_normalized_tagname(GumboTag tag);
/**
* Variant of gumbo_normalized_tagname() that takes an additional uint8_t
* out-parameter.
*
* NOTE(review): presumably *sz receives the length of the returned name in
* bytes -- confirm against the implementation.
*
* @param tag The tag enum to look up.
* @param sz Output parameter (see note above).
* @return The normalized tag name.
*/
const char* gumbo_normalized_tagname_and_size(GumboTag tag, uint8_t *sz);
/**
* Extracts the tag name from the original_text field of an element or token by
* stripping off the </> characters and attributes and adjusting the passed-in
* GumboStringPiece appropriately (the piece is modified in place). The tag
* name is in the original case and shares a buffer with the original text, to
* simplify memory management.
*
* Behavior is undefined if a string-piece that doesn't represent an HTML tag
* (<tagname> or </tagname>) is passed in. If the string piece is completely
* empty (NULL data pointer), then this function will exit successfully as a
* no-op.
*
* @param text In/out: on entry the original text of a tag; on return the tag
*        name within that same buffer.
*/
void gumbo_tag_from_original_text(GumboStringPiece* text);
/**
* Fixes the case of SVG elements that are not all lowercase.
* https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
*
* This is not done at parse time because there's no place to store a mutated
* tag name. tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags
* without special handling), while original_tag_name is a pointer into the
* original buffer. Instead, we provide this helper function that clients can
* use to rename SVG tags as appropriate.
*
* @param tagname The tag name to normalize.
* @return The case-normalized SVG tagname if a replacement is found, or NULL
*         if no normalization is called for. The return value is static data
*         owned by the library.
*/
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
/**
* Converts a tag name string (which may be in upper or mixed case) to a tag
* enum.
*
* gumbo_tag_enum() expects `tagname` to be NUL-terminated, while
* gumbo_tagn_enum() takes the length of `tagname` explicitly.
*
* NOTE(review): presumably names that match no known tag yield
* GUMBO_TAG_UNKNOWN -- confirm against the implementation.
*
* @param tagname The tag name to convert.
* @param length (gumbo_tagn_enum only) The length of `tagname`.
* @return The GumboTag corresponding to `tagname`.
*/
GumboTag gumbo_tag_enum(const char* tagname);
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
/**
* Attribute namespaces.
* HTML includes special handling for XLink, XML, and XMLNS namespaces on
* attributes. Everything else goes in the generic "NONE" namespace.
*/
typedef enum {
/** Generic namespace, used for every attribute without special handling. */
GUMBO_ATTR_NAMESPACE_NONE,
/** Attribute in the XLink namespace. */
GUMBO_ATTR_NAMESPACE_XLINK,
/** Attribute in the XML namespace. */
GUMBO_ATTR_NAMESPACE_XML,
/** Attribute in the XMLNS namespace. */
GUMBO_ATTR_NAMESPACE_XMLNS,
} GumboAttributeNamespaceEnum;
/**
* A struct representing a single attribute on an HTML tag. This is a
* name-value pair, but also includes information about source locations and
* original source text.
*
* The name and value fields are normalized copies in their own buffers, while
* the original_* fields are GumboStringPieces that point into the original
* source buffer and are therefore not NUL-terminated.
*/
typedef struct {
/**
* The namespace for the attribute. This will usually be
* GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
* values, per:
* https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
*/
GumboAttributeNamespaceEnum attr_namespace;
/**
* The name of the attribute. This is in a freshly-allocated buffer to deal
* with case-normalization, and is NUL-terminated.
*/
const char* name;
/**
* The original text of the attribute name, as a pointer into the original
* source buffer.
*/
GumboStringPiece original_name;
/**
* The value of the attribute. This is in a freshly-allocated buffer to deal
* with unescaping, and is NUL-terminated. It does not include any quotes
* that surround the attribute. If the attribute has no value (for example,
* 'selected' on a checkbox), this will be an empty string.
*/
const char* value;
/**
* The original text of the value of the attribute. This points into the
* original source buffer. It includes any quotes that surround the
* attribute, and you can look at original_value.data[0] and
* original_value.data[original_value.length - 1] to determine what the quote
* characters were. If the attribute has no value, this will be a 0-length
* string.
*/
GumboStringPiece original_value;
/** The starting position of the attribute name. */
GumboSourcePosition name_start;
/**
* The ending position of the attribute name. This is not always derivable
* from the starting position of the value because of the possibility of
* whitespace around the = sign.
*/
GumboSourcePosition name_end;
/** The starting position of the attribute value. */
GumboSourcePosition value_start;
/** The ending position of the attribute value. */
GumboSourcePosition value_end;
} GumboAttribute;
/**
* Given a vector of GumboAttributes, look up the one with the specified name.
* This uses a case-insensitive match, as HTML is case-insensitive.
*
* @param attrs Vector whose elements are GumboAttribute pointers.
* @param name The attribute name to search for.
* @return The matching attribute, or NULL if no such attribute exists.
*/
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
/**
* Enum denoting the type of node. This determines which member of the node.v
* union is valid, as noted on each enumerator.
*/
typedef enum {
/** Document node. v will be a GumboDocument. */
GUMBO_NODE_DOCUMENT,
/** Element node. v will be a GumboElement. */
GUMBO_NODE_ELEMENT,
/** Text node. v will be a GumboText. */
GUMBO_NODE_TEXT,
/** CDATA node. v will be a GumboText. */
GUMBO_NODE_CDATA,
/** Comment node. v will be a GumboText, excluding comment delimiters. */
GUMBO_NODE_COMMENT,
/** Text node whose contents are all whitespace. v will be a GumboText. */
GUMBO_NODE_WHITESPACE,
/** Template node. This is separate from GUMBO_NODE_ELEMENT because many
* client libraries will want to ignore the contents of template nodes, as
* the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
* here, while clients that want to include template contents should also
* check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
GUMBO_NODE_TEMPLATE
} GumboNodeType;
/**
* Forward declaration of GumboNode so it can be used recursively in
* GumboNode.parent. This only introduces the type name; struct
* GumboInternalNode is not defined at this point in the header.
*/
typedef struct GumboInternalNode GumboNode;
/**
* The quirks mode of a document, as defined by the DOM standard:
* https://dom.spec.whatwg.org/#concept-document-quirks
*/
typedef enum {
/** No-quirks mode. */
GUMBO_DOCTYPE_NO_QUIRKS,
/** Quirks mode. */
GUMBO_DOCTYPE_QUIRKS,
/** Limited-quirks mode. */
GUMBO_DOCTYPE_LIMITED_QUIRKS
} GumboQuirksModeEnum;
/**
* Namespaces.
* Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
* anything inside an