htmlmin-0.1.12/ 0000775 0001750 0001750 00000000000 13221466656 013140 5 ustar dave dave 0000000 0000000 htmlmin-0.1.12/PKG-INFO 0000664 0001750 0001750 00000001705 13221466656 014240 0 ustar dave dave 0000000 0000000 Metadata-Version: 1.1 Name: htmlmin Version: 0.1.12 Summary: An HTML Minifier Home-page: https://htmlmin.readthedocs.io/en/latest/ Author: Dave Mankoff Author-email: mankyd@gmail.com License: BSD Download-URL: https://github.com/mankyd/htmlmin Description-Content-Type: UNKNOWN Description: A configurable HTML Minifier with safety features. .. image:: https://travis-ci.org/mankyd/htmlmin.png?branch=master :target: http://travis-ci.org/mankyd/htmlmin Documentation: https://htmlmin.readthedocs.io/en/latest/ Platform: UNKNOWN Classifier: Development Status :: 4 - Beta Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: BSD License Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3.2 Classifier: Topic :: Text Processing :: Markup :: HTML htmlmin-0.1.12/htmlmin/ 0000775 0001750 0001750 00000000000 13221466656 014610 5 ustar dave dave 0000000 0000000 htmlmin-0.1.12/htmlmin/command.py 0000775 0001750 0001750 00000013377 13147667524 016622 0 ustar dave dave 0000000 0000000 #!/usr/bin/env python """ Copyright (c) 2013, Dave Mankoff All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Dave Mankoff nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ import argparse import codecs import locale import io import sys #import htmlmin from . import Minifier parser = argparse.ArgumentParser( description='Minify HTML', formatter_class=argparse.RawTextHelpFormatter ) parser.add_argument('input_file', nargs='?', metavar='INPUT', help='File path to html file to minify. Defaults to stdin.', ) parser.add_argument('output_file', nargs='?', metavar='OUTPUT', help="File path to output to. Defaults to stdout.", ) parser.add_argument('-c', '--remove-comments', help=( '''When set, comments will be removed. They can be kept on an individual basis by starting them with a '!': . The '!' will be removed from the final output. If you want a '!' as the leading character of your comment, put two of them: . '''), action='store_true') parser.add_argument('-s', '--remove-empty-space', help=( '''When set, this removes empty space betwen tags in certain cases. Specifically, it will remove empty space if and only if there a newline character occurs within the space. Thus, code like 'x y' will be left alone, but code such as ' ...
...' will become '......'. Note that this CAN break your html if you spread two inline tags over two lines. Use with caution. '''), action='store_true') parser.add_argument('--remove-all-empty-space', help=( '''When set, this removes ALL empty space betwen tags. WARNING: this can and likely will cause unintended consequences. For instance, 'X Y' will become 'XY'. Putting whitespace along with other text will avoid this problem. Only use if you are confident in the result. Whitespace is not removed from inside of tags, thus ' ' will be left alone. '''), action='store_true') parser.add_argument('--keep-optional-attribute-quotes', help=( '''When set, this keeps all attribute quotes, even if they are optional. '''), action='store_true') parser.add_argument('-H', '--in-head', help=( '''If you are parsing only a fragment of HTML, and the fragment occurs in the head of the document, setting this will remove some extra whitespace. '''), action='store_true') parser.add_argument('-k', '--keep-pre-attr', help=( '''HTMLMin supports the propietary attribute 'pre' that can be added to elements to prevent minification. This attribute is removed by default. Set this flag to keep the 'pre' attributes in place. '''), action='store_true') parser.add_argument('-a', '--pre-attr', help=( '''The attribute htmlmin looks for to find blocks of HTML that it should not minify. This attribute will be removed from the HTML unless '-k' is specified. Defaults to 'pre'. '''), default='pre') parser.add_argument('-p', '--pre-tags', metavar='TAG', help=( '''By default, the contents of 'pre', and 'textarea' tags are left unminified. You can specify different tags using the --pre-tags option. 'script' and 'style' tags are always left unmininfied. '''), nargs='*', default=['pre', 'textarea']) parser.add_argument('-e', '--encoding', help=("Encoding to read and write with. Default 'utf-8'." 
def main():
    """Run the ``htmlmin`` command-line interface.

    Parses ``sys.argv`` with the module-level ``parser``, feeds the input
    (a file, or stdin when no input path is given) to a ``Minifier`` and
    writes ``minifier.output`` to the output file (or stdout when no output
    path is given).

    Encoding selection:

    * Files are read and written with ``--encoding``, defaulting to UTF-8.
    * stdin/stdout use ``--encoding`` if given, otherwise the stream's own
      encoding, then the locale's preferred encoding, then UTF-8.
    """
    args = parser.parse_args()
    minifier = Minifier(
        remove_comments=args.remove_comments,
        remove_empty_space=args.remove_empty_space,
        # This flag used to be parsed and then silently dropped, so
        # --remove-all-empty-space had no effect from the command line.
        remove_all_empty_space=args.remove_all_empty_space,
        remove_optional_attribute_quotes=(
            not args.keep_optional_attribute_quotes),
        pre_tags=args.pre_tags,
        keep_pre=args.keep_pre_attr,
        pre_attr=args.pre_attr,
    )
    # NOTE(review): -H/--in-head is accepted by the parser but is not used
    # anywhere in this function -- confirm how it is meant to reach the
    # minifier before wiring it up.
    default_encoding = args.encoding or 'utf-8'

    if args.input_file:
        inp = codecs.open(args.input_file, encoding=default_encoding)
    else:
        encoding = (args.encoding or sys.stdin.encoding
                    or locale.getpreferredencoding() or default_encoding)
        # closefd=False: release our wrapper without closing the process's
        # real stdin descriptor.
        inp = io.open(sys.stdin.fileno(), encoding=encoding, closefd=False)
    with inp:
        for line in inp:
            minifier.input(line)

    if args.output_file:
        out = codecs.open(args.output_file, 'w', encoding=default_encoding)
    else:
        encoding = (args.encoding or sys.stdout.encoding
                    or locale.getpreferredencoding() or default_encoding)
        # closefd=False: leaving the ``with`` block flushes our wrapper but
        # keeps the real stdout descriptor open for the rest of the process.
        out = io.open(sys.stdout.fileno(), 'w', encoding=encoding,
                      closefd=False)
    with out:
        out.write(minifier.output)
def htmlmin(*args, **kwargs):
    """Minifies HTML that is returned by a function.

    A simple decorator that minifies the HTML output of any function that it
    decorates. Keyword arguments are passed straight to ``Minifier``. It can
    be used bare, in which case ``Minifier``'s defaults apply::

        @htmlmin
        def foobar():
            return '<h1>  minify me!  </h1>'

    or called with keyword options::

        @htmlmin(remove_comments=True)
        def foobar():
            return '<h1>  minify me!  </h1>'

    :raises RuntimeError: if positional arguments are given, other than the
      single decorated callable of the bare ``@htmlmin`` form.
    """
    # Imported locally so this function does not depend on a module-level
    # import being added elsewhere in the file.
    import functools

    def _decorator(fn):
        # One Minifier per decorated function, created at decoration time.
        minify = Minifier(**kwargs).minify
        # Copy only the metadata ``fn`` actually has: on Python 2,
        # functools.wraps raises AttributeError for callables that lack
        # __name__ (e.g. functools.partial objects).
        assigned = [attr for attr in functools.WRAPPER_ASSIGNMENTS
                    if hasattr(fn, attr)]

        @functools.wraps(fn, assigned=assigned)
        def wrapper(*a, **kw):
            return minify(fn(*a, **kw))
        return wrapper

    # Bare ``@htmlmin``: the decorated function is the only argument.
    if len(args) == 1 and callable(args[0]) and not kwargs:
        return _decorator(args[0])
    if args:
        raise RuntimeError(
            'htmlmin decorator does not accept positional arguments')
    # ``@htmlmin(...)``: hand back the real decorator.
    return _decorator
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Dave Mankoff nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ from __future__ import unicode_literals import logging import sys import re from .python3html.parser import HTMLParser from . 
import escape # https://www.w3.org/TR/html5/single-page.html#space-character HTML_SPACE_RE = re.compile('[\x20\x09\x0a\x0c\x0d]+') HTML_ALL_SPACE_RE = re.compile('^[\x20\x09\x0a\x0c\x0d]+$') HTML_LEADING_SPACE_RE = re.compile( '^[\x20\x09\x0a\x0c\x0d]+') HTML_TRAILING_SPACE_RE = re.compile( '[\x20\x09\x0a\x0c\x0d]+$') HTML_LEADING_TRAILING_SPACE_RE = re.compile( '(^[\x20\x09\x0a\x0c\x0d]+)|([\x20\x09\x0a\x0c\x0d]+$)') PRE_TAGS = ('pre', 'textarea') # styles and scripts are never minified # http://www.w3.org/TR/html51/syntax.html#elements-0 NO_CLOSE_TAGS = ('area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr') # http://www.w3.org/TR/html51/index.html#attributes-1 BOOLEAN_ATTRIBUTES = { 'audio': ('autoplay', 'controls', 'hidden', 'loop', 'muted',), 'button': ('autofocus', 'disabled', 'formnovalidate', 'hidden',), 'command': ('checked', 'disabled', 'hidden'), 'dialog': ('hidden', 'open',), 'fieldset': ('disabled', 'hidden',), 'form': ('hidden', 'novalidate',), 'iframe': ('hidden', 'seamless',), 'img': ('hidden', 'ismap',), 'input': ('autofocus', 'checked', 'disabled', 'formnovalidate', 'hidden', 'multiple', 'readonly', 'required',), 'keygen': ('autofocus', 'disabled', 'hidden',), 'object': ('hidden', 'typesmustmatch',), 'ol': ('hidden', 'reversed',), 'optgroup': ('disabled', 'hidden',), 'option': ('disabled', 'hidden', 'selected',), 'script': ('async', 'defer', 'hidden',), 'select': ('autofocus', 'disabled', 'hidden', 'multiple', 'required',), 'style': ('hidden', 'scoped',), 'textarea': ('autofocus', 'disabled', 'hidden', 'readonly', 'required',), 'track': ('default', 'hidden', ), 'video': ('autoplay', 'controls', 'hidden', 'loop', 'muted',), '*': ('hidden',), } # a list of tags and tags that they are closed by TAG_SETS = { 'li': ('li',), 'dd': ('dd', 'dt'), 'rp': ('rp', 'rt'), 'p': ('address', 'article', 'aside', 'blockquote', 'dir', 'div', 'dl', 'fieldset', 'footer', 'form', 'h1', 'h2', 
def __init__(self,
             remove_comments=False,
             remove_empty_space=False,
             remove_all_empty_space=False,
             reduce_empty_attributes=True,
             reduce_boolean_attributes=False,
             remove_optional_attribute_quotes=True,
             convert_charrefs=True,
             keep_pre=False,
             pre_tags=PRE_TAGS,
             pre_attr='pre'):
    """Store the minification options and reset the parser state.

    :param remove_comments: Drop HTML comments from the output.
    :param remove_empty_space: Drop whitespace-only text that contains a
      newline or carriage return.
    :param remove_all_empty_space: Drop all whitespace-only text.
    :param reduce_empty_attributes: Emit attributes with an empty value as
      the bare attribute name.
    :param reduce_boolean_attributes: Emit attributes listed in
      ``BOOLEAN_ATTRIBUTES`` for the tag as the bare attribute name.
    :param remove_optional_attribute_quotes: Omit quotes around attribute
      values when they are not required.
    :param convert_charrefs: Unescape character references in attribute
      values before re-escaping them.
    :param keep_pre: Keep the ``pre_attr`` attribute in the output instead
      of removing it.
    :param pre_tags: Tag names stored as ``self.pre_tags``.
    :param pre_attr: Name of the attribute that marks markup to be left
      unminified; ``<pre_attr>-`` prefixed attributes are also recognised.
    """
    # convert_charrefs was introduced in Python 3.4 and is True by default
    # from 3.5. Comparing the version as a tuple (rather than major and
    # minor separately) stays correct for any future major version whose
    # minor component is below 4.
    if sys.version_info >= (3, 4):
        HTMLParser.__init__(self, convert_charrefs=False)
    else:
        HTMLParser.__init__(self)
    self.keep_pre = keep_pre
    self.pre_tags = pre_tags
    self.remove_comments = remove_comments
    self.remove_empty_space = remove_empty_space
    self.remove_all_empty_space = remove_all_empty_space
    self.reduce_empty_attributes = reduce_empty_attributes
    self.reduce_boolean_attributes = reduce_boolean_attributes
    self.remove_optional_attribute_quotes = remove_optional_attribute_quotes
    # Deliberately assigned after HTMLParser.__init__, so this value
    # replaces whatever the base initialiser stored under the same name.
    self.convert_charrefs = convert_charrefs
    self.pre_attr = pre_attr
    self.reset()
Escaping.') (v, q) = escape.escape_attr_value( v, double_quote=not self.remove_optional_attribute_quotes) else: (v, q) = escape.escape_attr_value( v, double_quote=not self.remove_optional_attribute_quotes) if q == escape.NO_QUOTES: attrs[i] = '%s=%s' % (k, v) if v[-1] != '/': last_no_slash = i else: q = '"' if q == escape.DOUBLE_QUOTE else "'" attrs[i] = '%s=%s%s%s' % (k, q, v, q) last_quoted = i i += 1 if i != len(attrs): del attrs[i:] # 1. If there are no attributes, no additional space is necessary. # 2. If last attribute is quoted, no additional space is necessary. # 3. Two things are happening here: # a) according to the standard,tags don't get closed when a parent a # tag closes them. Here's some logic that addresses this. if tag == 'a': contains_p = False for i, t in enumerate(self._tag_stack): if t[0] == 'p': contains_p = True elif t[0] == 'a': break if contains_p: # the p tag, and all its children should be left open a_tag = self._tag_stack.pop(i) if a_tag[1]: self._in_pre_tag -= 1 else: if tag == 'head': # TODO: Did we know that we were in an head tag?! If not, we need to # reminify everything to remove extra spaces. self._in_head = False elif tag == 'title': self._in_title = False self._title_newly_opened = False try: self._in_pre_tag -= self._close_tags_up_to(tag) except OpenTagNotFoundError: # Some tags don't require a start tag. Most do. Either way, we leave # closing tags along since they affect output. For instance, a '
def handle_startendtag(self, tag, attrs):
    """Emit the markup for a tag written in start-and-end form.

    Clears the "just after the doctype" flag, asks ``build_tag`` to render
    the tag (passing whether ``tag`` is absent from ``NO_CLOSE_TAGS`` as its
    third argument) and appends the rendered markup to the output buffer.
    """
    self._after_doctype = False
    wants_close = tag not in NO_CLOSE_TAGS
    built = self.build_tag(tag, attrs, wants_close)
    # Only the second element of build_tag's result is written out.
    # NOTE(review): presumably the first element is the has-pre flag --
    # confirm against build_tag's return statement.
    self._data_buffer.append(built[1])
def reset(self):
    """Discard all buffered output and parsing state, then reset the base.

    Every piece of minifier bookkeeping is returned to its initial value
    before control is handed to ``HTMLParser.reset``.
    """
    # Output fragments collected so far; ``result`` joins them.
    self._data_buffer = []
    # Open-tag bookkeeping used when closing tags.
    self._tag_stack = []
    # Counter, not a flag: text is passed through untouched while it is > 0.
    self._in_pre_tag = 0
    # Position flags.
    self._in_head = self._in_title = self._after_doctype = False
    # Title whitespace-trimming state.
    self._title_newly_opened = self.__title_trailing_whitespace = False
    HTMLParser.reset(self)
######## # This file is based on sgmllib.py, but the API is slightly different. # XXX There should be a way to distinguish between PCDATA (parsed # character data -- the normal case), RCDATA (replaceable character # data -- only char and entity references and end tags are special) # and CDATA (character data -- only end tags are special). import re import warnings try: import _markupbase as markupbase except ImportError: import markupbase from . import unescape __all__ = ['HTMLParser'] # Regular expressions used for parsing interesting_normal = re.compile('[&<]') incomplete = re.compile('&[a-zA-Z#]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') commentclose = re.compile(r'--\s*>') # Note: # 1) if you change tagfind/attrfind remember to update locatestarttagend too; # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will # explode, so don't do it. # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') attrfind_tolerant = re.compile( r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) (?:\s*,)* # possibly followed by a comma )?(?:\s|/(?!>))* )* )? 
\s* # trailing whitespace """, re.VERBOSE) endendtag = re.compile('>') # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between # and the tag name, so maybe this should be fixed endtagfind = re.compile(r'\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') class HTMLParser(markupbase.ParserBase): """Find tags and other markup and call handler functions. Usage: p = HTMLParser() p.feed(data) ... p.close() Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). If convert_charrefs is True the character references are converted automatically to the corresponding Unicode character (and self.handle_data() is no longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument. """ CDATA_CONTENT_ELEMENTS = ("script", "style") def __init__(self, convert_charrefs=True): """Initialize and reset this instance. If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters. """ self.convert_charrefs = convert_charrefs self.reset() def reset(self): """Reset this instance. Loses all unprocessed data.""" self.rawdata = '' self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None markupbase.ParserBase.reset(self) def feed(self, data): r"""Feed data to the parser. Call this as often as you want, with as little or as much text as you want (may include '\n'). 
""" self.rawdata = self.rawdata + data self.goahead(0) def close(self): """Handle any buffered data.""" self.goahead(1) __starttag_text = None def get_starttag_text(self): """Return full source of start tag: '<...>'.""" return self.__starttag_text def set_cdata_mode(self, elem): self.cdata_elem = elem.lower() self.interesting = re.compile(r'\s*%s\s*>' % self.cdata_elem, re.I) def clear_cdata_mode(self): self.interesting = interesting_normal self.cdata_elem = None # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. def goahead(self, end): rawdata = self.rawdata i = 0 n = len(rawdata) while i < n: if self.convert_charrefs and not self.cdata_elem: j = rawdata.find('<', i) if j < 0: # if we can't find the next <, either we are at the end # or there's more text incoming. If the latter is True, # we can't pass the text to handle_data in case we have # a charref cut in half at end. Try to determine if # this is the case before proceeding by looking for an # & near the end and see if it's followed by a space or ;. amppos = rawdata.rfind('&', max(i, n-34)) if (amppos >= 0 and not re.compile(r'[\s;]').search(rawdata, amppos)): break # wait till we get all the text j = n else: match = self.interesting.search(rawdata, i) # < or & if match: j = match.start() else: if self.cdata_elem: break j = n if i < j: if self.convert_charrefs and not self.cdata_elem: self.handle_data(self.unescape(rawdata[i:j])) else: self.handle_data(rawdata[i:j]) i = self.updatepos(i, j) if i == n: break startswith = rawdata.startswith if startswith('<', i): if starttagopen.match(rawdata, i): # < + letter k = self.parse_starttag(i) elif startswith("", i): k = self.parse_endtag(i) elif startswith(" Will become simply:: The added exclamation is removed. 
:param remove_empty_space: Remove empty space found in HTML between an opening and a closing tag and when it contains a newline or carriage return. If whitespace is found that is only spaces and/or tabs, it will be turned into a single space. Be careful, this can have unintended consequences. :param remove_all_empty_space: A more extreme version of ``remove_empty_space``, this removes all empty whitespace found between tags. This is almost guaranteed to break your HTML unless you are very careful. :param reduce_boolean_attributes: Where allowed by the HTML5 specification, attributes such as 'disabled' and 'readonly' will have their value removed, so 'disabled="true"' will simply become 'disabled'. This is generally a good option to turn on except when JavaScript relies on the values. :param remove_optional_attribute_quotes: When True, optional quotes around attributes are removed. When False, all attribute quotes are left intact. Defaults to True. :param convert_charrefs: Decode character references such as ``&amp;`` and ``&#46;`` to their single character values where safe. This currently only applies to attributes. Data content between tags will be left encoded. :param keep_pre: By default, htmlmin uses the special attribute ``pre`` to allow you to demarcate areas of HTML that should not be minified. It removes this attribute as it finds it. Setting this value to ``True`` tells htmlmin to leave the attribute in the output. :param pre_tags: A list of tag names that should never be minified. You are free to change this list as you see fit, but you will probably want to include ``pre`` and ``textarea`` if you make any changes to the list. Note that ``