htmlmin-0.1.12/0000775000175000017500000000000013221466656013140 5ustar davedave00000000000000htmlmin-0.1.12/PKG-INFO0000664000175000017500000000170513221466656014240 0ustar davedave00000000000000Metadata-Version: 1.1 Name: htmlmin Version: 0.1.12 Summary: An HTML Minifier Home-page: https://htmlmin.readthedocs.io/en/latest/ Author: Dave Mankoff Author-email: mankyd@gmail.com License: BSD Download-URL: https://github.com/mankyd/htmlmin Description-Content-Type: UNKNOWN Description: A configurable HTML Minifier with safety features. .. image:: https://travis-ci.org/mankyd/htmlmin.png?branch=master :target: http://travis-ci.org/mankyd/htmlmin Documentation: https://htmlmin.readthedocs.io/en/latest/ Platform: UNKNOWN Classifier: Development Status :: 4 - Beta Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: BSD License Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3.2 Classifier: Topic :: Text Processing :: Markup :: HTML htmlmin-0.1.12/htmlmin/0000775000175000017500000000000013221466656014610 5ustar davedave00000000000000htmlmin-0.1.12/htmlmin/command.py0000775000175000017500000001337713147667524016622 0ustar davedave00000000000000#!/usr/bin/env python """ Copyright (c) 2013, Dave Mankoff All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Dave Mankoff nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ import argparse import codecs import locale import io import sys #import htmlmin from . import Minifier parser = argparse.ArgumentParser( description='Minify HTML', formatter_class=argparse.RawTextHelpFormatter ) parser.add_argument('input_file', nargs='?', metavar='INPUT', help='File path to html file to minify. Defaults to stdin.', ) parser.add_argument('output_file', nargs='?', metavar='OUTPUT', help="File path to output to. Defaults to stdout.", ) parser.add_argument('-c', '--remove-comments', help=( '''When set, comments will be removed. They can be kept on an individual basis by starting them with a '!': . The '!' will be removed from the final output. If you want a '!' as the leading character of your comment, put two of them: . '''), action='store_true') parser.add_argument('-s', '--remove-empty-space', help=( '''When set, this removes empty space betwen tags in certain cases. Specifically, it will remove empty space if and only if there a newline character occurs within the space. Thus, code like 'x y' will be left alone, but code such as ' ... ...' will become '......'. Note that this CAN break your html if you spread two inline tags over two lines. Use with caution. '''), action='store_true') parser.add_argument('--remove-all-empty-space', help=( '''When set, this removes ALL empty space betwen tags. WARNING: this can and likely will cause unintended consequences. For instance, 'X Y' will become 'XY'. Putting whitespace along with other text will avoid this problem. Only use if you are confident in the result. Whitespace is not removed from inside of tags, thus ' ' will be left alone. '''), action='store_true') parser.add_argument('--keep-optional-attribute-quotes', help=( '''When set, this keeps all attribute quotes, even if they are optional. '''), action='store_true') parser.add_argument('-H', '--in-head', help=( '''If you are parsing only a fragment of HTML, and the fragment occurs in the head of the document, setting this will remove some extra whitespace. '''), action='store_true') parser.add_argument('-k', '--keep-pre-attr', help=( '''HTMLMin supports the propietary attribute 'pre' that can be added to elements to prevent minification. This attribute is removed by default. Set this flag to keep the 'pre' attributes in place. '''), action='store_true') parser.add_argument('-a', '--pre-attr', help=( '''The attribute htmlmin looks for to find blocks of HTML that it should not minify. This attribute will be removed from the HTML unless '-k' is specified. Defaults to 'pre'. '''), default='pre') parser.add_argument('-p', '--pre-tags', metavar='TAG', help=( '''By default, the contents of 'pre', and 'textarea' tags are left unminified. You can specify different tags using the --pre-tags option. 'script' and 'style' tags are always left unmininfied. '''), nargs='*', default=['pre', 'textarea']) parser.add_argument('-e', '--encoding', help=("Encoding to read and write with. Default 'utf-8'." " When reading from stdin, attempts to use the system's" " encoding before defaulting to utf-8.\n\n"), default=None, ) def main(): args = parser.parse_args() minifier = Minifier( remove_comments=args.remove_comments, remove_empty_space=args.remove_empty_space, remove_optional_attribute_quotes=not args.keep_optional_attribute_quotes, pre_tags=args.pre_tags, keep_pre=args.keep_pre_attr, pre_attr=args.pre_attr, ) default_encoding = args.encoding or 'utf-8' if args.input_file: inp = codecs.open(args.input_file, encoding=default_encoding) else: encoding = args.encoding or sys.stdin.encoding \ or locale.getpreferredencoding() or default_encoding inp = io.open(sys.stdin.fileno(), encoding=encoding) for line in inp.readlines(): minifier.input(line) if args.output_file: codecs.open( args.output_file, 'w', encoding=default_encoding).write(minifier.output) else: encoding = args.encoding or sys.stdout.encoding \ or locale.getpreferredencoding() or default_encoding io.open(sys.stdout.fileno(), 'w', encoding=encoding).write(minifier.output) if __name__ == '__main__': main() htmlmin-0.1.12/htmlmin/decorator.py0000664000175000017500000000470612110442242017130 0ustar davedave00000000000000""" Copyright (c) 2013, Dave Mankoff All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Dave Mankoff nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ from .main import Minifier def htmlmin(*args, **kwargs): """Minifies HTML that is returned by a function. A simple decorator that minifies the HTML output of any function that it decorates. It supports all the same options that :class:`htmlmin.minify` has. With no options, it uses ``minify``'s default settings:: @htmlmin def foobar(): return ' minify me! ' or:: @htmlmin(remove_comments=True) def foobar(): return ' minify me! ' """ def _decorator(fn): minify = Minifier(**kwargs).minify def wrapper(*a, **kw): return minify(fn(*a, **kw)) return wrapper if len(args) == 1: if callable(args[0]) and not kwargs: return _decorator(args[0]) else: raise RuntimeError( 'htmlmin decorator does accept positional arguments') elif len(args) > 1: raise RuntimeError( 'htmlmin decorator does accept positional arguments') else: return _decorator htmlmin-0.1.12/htmlmin/parser.py0000664000175000017500000003610213212125436016444 0ustar davedave00000000000000""" Copyright (c) 2013, Dave Mankoff All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Dave Mankoff nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ from __future__ import unicode_literals import logging import sys import re from .python3html.parser import HTMLParser from . import escape # https://www.w3.org/TR/html5/single-page.html#space-character HTML_SPACE_RE = re.compile('[\x20\x09\x0a\x0c\x0d]+') HTML_ALL_SPACE_RE = re.compile('^[\x20\x09\x0a\x0c\x0d]+$') HTML_LEADING_SPACE_RE = re.compile( '^[\x20\x09\x0a\x0c\x0d]+') HTML_TRAILING_SPACE_RE = re.compile( '[\x20\x09\x0a\x0c\x0d]+$') HTML_LEADING_TRAILING_SPACE_RE = re.compile( '(^[\x20\x09\x0a\x0c\x0d]+)|([\x20\x09\x0a\x0c\x0d]+$)') PRE_TAGS = ('pre', 'textarea') # styles and scripts are never minified # http://www.w3.org/TR/html51/syntax.html#elements-0 NO_CLOSE_TAGS = ('area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr') # http://www.w3.org/TR/html51/index.html#attributes-1 BOOLEAN_ATTRIBUTES = { 'audio': ('autoplay', 'controls', 'hidden', 'loop', 'muted',), 'button': ('autofocus', 'disabled', 'formnovalidate', 'hidden',), 'command': ('checked', 'disabled', 'hidden'), 'dialog': ('hidden', 'open',), 'fieldset': ('disabled', 'hidden',), 'form': ('hidden', 'novalidate',), 'iframe': ('hidden', 'seamless',), 'img': ('hidden', 'ismap',), 'input': ('autofocus', 'checked', 'disabled', 'formnovalidate', 'hidden', 'multiple', 'readonly', 'required',), 'keygen': ('autofocus', 'disabled', 'hidden',), 'object': ('hidden', 'typesmustmatch',), 'ol': ('hidden', 'reversed',), 'optgroup': ('disabled', 'hidden',), 'option': ('disabled', 'hidden', 'selected',), 'script': ('async', 'defer', 'hidden',), 'select': ('autofocus', 'disabled', 'hidden', 'multiple', 'required',), 'style': ('hidden', 'scoped',), 'textarea': ('autofocus', 'disabled', 'hidden', 'readonly', 'required',), 'track': ('default', 'hidden', ), 'video': ('autoplay', 'controls', 'hidden', 'loop', 'muted',), '*': ('hidden',), } # a list of tags and tags that they are closed by TAG_SETS = { 'li': ('li',), 'dd': ('dd', 'dt'), 'rp': ('rp', 'rt'), 'p': ('address', 'article', 'aside', 'blockquote', 'dir', 'div', 'dl', 'fieldset', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'menu', 'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul'), 'optgroup': ('optgroup',), 'option': ('option', 'optgroup'), 'colgroup': '*', 'tbody': ('tbody', 'tfoot'), 'tfoot': ('tbody',), 'tr': ('tr',), 'td': ('td', 'th'), } TAG_SETS['dt'] = TAG_SETS['dd'] TAG_SETS['rt'] = TAG_SETS['rp'] TAG_SETS['thead'] = TAG_SETS['tbody'] TAG_SETS['th'] = TAG_SETS['td'] # Tag omission rules: # http://www.w3.org/TR/html51/syntax.html#optional-tags class HTMLMinError(Exception): pass class ParseError(HTMLMinError): pass class OpenTagNotFoundError(ParseError): pass class HTMLMinParser(HTMLParser): def __init__(self, remove_comments=False, remove_empty_space=False, remove_all_empty_space=False, reduce_empty_attributes=True, reduce_boolean_attributes=False, remove_optional_attribute_quotes=True, convert_charrefs=True, keep_pre=False, pre_tags=PRE_TAGS, pre_attr='pre'): if sys.version_info[0] >= 3 and sys.version_info[1] >= 4: # convert_charrefs is True by default in Python 3.5.0 and newer. It was # introduced in 3.4. HTMLParser.__init__(self, convert_charrefs=False) else: HTMLParser.__init__(self) self.keep_pre = keep_pre self.pre_tags = pre_tags self.remove_comments = remove_comments self.remove_empty_space = remove_empty_space self.remove_all_empty_space = remove_all_empty_space self.reduce_empty_attributes = reduce_empty_attributes self.reduce_boolean_attributes = reduce_boolean_attributes self.remove_optional_attribute_quotes = remove_optional_attribute_quotes self.convert_charrefs = convert_charrefs self.pre_attr = pre_attr self.reset() def _tag_lang(self): return self._tag_stack[0][2] if self._tag_stack else None def build_tag(self, tag, attrs, close_tag): has_pre = False if self.reduce_boolean_attributes: bool_attrs = BOOLEAN_ATTRIBUTES.get(tag, BOOLEAN_ATTRIBUTES['*']) else: bool_attrs = False lang = self._tag_lang() attrs = list(attrs) # We're modifying it in place last_quoted = last_no_slash = i = -1 for k, v in attrs: pre_prefix = k.startswith("{}-".format(self.pre_attr)) if pre_prefix: k = k[len(self.pre_attr)+1:] if k == self.pre_attr: has_pre = True if not self.keep_pre and not pre_prefix: continue if v and self.convert_charrefs and not pre_prefix: v = HTMLParser.unescape(self, v) if k == 'lang': lang = v if v == self._tag_lang(): continue i += 1 if not pre_prefix: k = escape.escape_attr_name(k) if (v is None or (not v and self.reduce_empty_attributes) or (bool_attrs and k in bool_attrs)): # For our use case, we treat boolean attributes as quoted because they # don't require space between them and "/>" in closing tags. attrs[i] = k last_quoted = i else: if pre_prefix: has_double_quotes = '"' in v has_single_quotes = "'" in v if not has_double_quotes: if not has_single_quotes and self.remove_optional_attribute_quotes: q = escape.NO_QUOTES else: q = escape.DOUBLE_QUOTE elif not has_single_quotes: q = escape.SINGLE_QUOTES else: logging.error('Unsafe content found in pre-attribute. Escaping.') (v, q) = escape.escape_attr_value( v, double_quote=not self.remove_optional_attribute_quotes) else: (v, q) = escape.escape_attr_value( v, double_quote=not self.remove_optional_attribute_quotes) if q == escape.NO_QUOTES: attrs[i] = '%s=%s' % (k, v) if v[-1] != '/': last_no_slash = i else: q = '"' if q == escape.DOUBLE_QUOTE else "'" attrs[i] = '%s=%s%s%s' % (k, q, v, q) last_quoted = i i += 1 if i != len(attrs): del attrs[i:] # 1. If there are no attributes, no additional space is necessary. # 2. If last attribute is quoted, no additional space is necessary. # 3. Two things are happening here: # a) according to the standard, should be treated as so space is necessary if this is self-closing tag, # however # b) reportedly (https://github.com/mankyd/htmlmin/pull/12), older # versions of WebKit interpret as self-closing tag so # we need the space if the last argument ends with a slash. space_maybe = '' if attrs: needs_space = lambda last_attr: (last_attr[-1] not in '"\'' and (close_tag or last_attr[-1] == '/')) if needs_space(attrs[-1][-1]): # If moving attributes around can help, do it. Otherwise bite the # bullet and put the space in. i = last_no_slash if last_quoted == -1 else last_quoted if i == -1 or needs_space(attrs[i]): space_maybe = ' ' else: attrs.append(attrs[i]) del attrs[i] return has_pre, '<%s%s%s%s%s>' % (escape.escape_tag(tag), ' ' if attrs else '', ' '.join(attrs), space_maybe, '/' if close_tag else ''), lang def handle_decl(self, decl): if (len(self._data_buffer) == 1 and HTML_SPACE_RE.match(self._data_buffer[0][0])): self._data_buffer = [] self._data_buffer.append('') self._after_doctype = True def _close_tags_up_to(self, tag): num_pres = 0 i = 0 for i, t in enumerate(self._tag_stack): if t[1]: num_pres += 1 if t[0] == tag: break # Only the html tag can close out everything. Put on the brakes if # we encounter a closing tag that we didn't recognize. if tag != 'html' and t[0] in ('body', 'html', 'head'): raise OpenTagNotFoundError() self._tag_stack = self._tag_stack[i+1:] return num_pres def handle_starttag(self, tag, attrs): self._after_doctype = False if tag == 'head': self._in_head = True elif self._in_head and tag == 'title': self._in_title = True self._title_newly_opened = True for t in self._tag_stack: closed_by_tags = TAG_SETS.get(t[0]) if closed_by_tags and (closed_by_tags == '*' or tag in closed_by_tags): self._in_pre_tag -= self._close_tags_up_to(t[0]) break has_pre, data, lang = self.build_tag(tag, attrs, False) start_pre = False if (has_pre or self._in_pre_tag > 0 or tag == 'script' or tag == 'style' or tag in self.pre_tags): self._in_pre_tag += 1 start_pre = True self._tag_stack.insert(0, (tag, start_pre, lang)) self._data_buffer.append(data) def handle_endtag(self, tag): # According to the spec,

tags don't get closed when a parent a # tag closes them. Here's some logic that addresses this. if tag == 'a': contains_p = False for i, t in enumerate(self._tag_stack): if t[0] == 'p': contains_p = True elif t[0] == 'a': break if contains_p: # the p tag, and all its children should be left open a_tag = self._tag_stack.pop(i) if a_tag[1]: self._in_pre_tag -= 1 else: if tag == 'head': # TODO: Did we know that we were in an head tag?! If not, we need to # reminify everything to remove extra spaces. self._in_head = False elif tag == 'title': self._in_title = False self._title_newly_opened = False try: self._in_pre_tag -= self._close_tags_up_to(tag) except OpenTagNotFoundError: # Some tags don't require a start tag. Most do. Either way, we leave # closing tags along since they affect output. For instance, a '

' # results in a '

' in Chrome. pass if tag not in NO_CLOSE_TAGS: self._data_buffer.extend(['']) def handle_startendtag(self, tag, attrs): self._after_doctype = False data = self.build_tag(tag, attrs, tag not in NO_CLOSE_TAGS)[1] self._data_buffer.append(data) def handle_comment(self, data): if not self.remove_comments or re.match(r'^(?:!|\[if\s)', data): self._data_buffer.append(''.format( data[1:] if len(data) and data[0] == '!' else data)) def handle_data(self, data): if self._in_pre_tag > 0: self._data_buffer.append(data) else: # remove_all_empty_space matches everything. remove_empty_space only # matches if there's a newline involved. if self.remove_all_empty_space or self._in_head or self._after_doctype: if HTML_ALL_SPACE_RE.match(data): return elif (self.remove_empty_space and HTML_ALL_SPACE_RE.match(data) and ('\n' in data or '\r' in data)): return # if we're in the title, remove leading and trailing whitespace. # note that the title may be parsed in chunks if entityref's or charrefs # are encountered. if self._in_title: if self.__title_trailing_whitespace: self._data_buffer.append(' ') self.__title_trailing_whitespace = ( HTML_ALL_SPACE_RE.match(data[-1]) is not None) if self._title_newly_opened: self._title_newly_opened = False data = HTML_LEADING_TRAILING_SPACE_RE.sub('', data) else: data = HTML_TRAILING_SPACE_RE.sub( '', HTML_LEADING_TRAILING_SPACE_RE.sub(' ', data)) data = HTML_SPACE_RE.sub(' ', data) if not data: return if self._in_pre_tag == 0 and self._data_buffer: # If we're not in a pre block, its possible that we append two spaces # together, which we want to avoid. For instance, if we remove a comment # from between two blocks of text: a c => a c. if data[0] == ' ' and self._data_buffer[-1][-1] == ' ': data = data[1:] if not data: return self._data_buffer.append(data) def handle_entityref(self, data): if self._in_title: if not self._title_newly_opened and self.__title_trailing_whitespace: self._data_buffer.append(' ') self.__title_trailing_whitespace = False self._title_newly_opened = False self._data_buffer.append('&{};'.format(data)) def handle_charref(self, data): if self._in_title: if not self._title_newly_opened and self.__title_trailing_whitespace: self._data_buffer.append(' ') self.__title_trailing_whitespace = False self._title_newly_opened = False self._data_buffer.append('&#{};'.format(data)) def handle_pi(self, data): self._data_buffer.append('') def unknown_decl(self, data): self._data_buffer.append('') def reset(self): self._data_buffer = [] self._in_pre_tag = 0 self._in_head = False self._in_title = False self._after_doctype = False self._tag_stack = [] self._title_newly_opened = False self.__title_trailing_whitespace = False HTMLParser.reset(self) def unescape(self, val): """Override this method so that we can handle char ref conversion ourself. """ return val @property def result(self): return ''.join(self._data_buffer) htmlmin-0.1.12/htmlmin/python3html/0000775000175000017500000000000013221466656017101 5ustar davedave00000000000000htmlmin-0.1.12/htmlmin/python3html/parser.py0000664000175000017500000004316713163734553020761 0ustar davedave00000000000000"""A parser for HTML and XHTML.""" ######## # This is copied from Python3 and the slightly modified to support needed # features. The original file can be found at: # https://github.com/python/cpython/blob/44b548dda872c0d4f30afd6b44fd74b053a55ad8/Lib/html/parser.py # # The largest difference is the reinstatment of the unescape method in # HTMLParser, which is needed for features in htmlmin. Changes are also # made to ensure Python2.7 compatability. ######## # This file is based on sgmllib.py, but the API is slightly different. # XXX There should be a way to distinguish between PCDATA (parsed # character data -- the normal case), RCDATA (replaceable character # data -- only char and entity references and end tags are special) # and CDATA (character data -- only end tags are special). import re import warnings try: import _markupbase as markupbase except ImportError: import markupbase from . import unescape __all__ = ['HTMLParser'] # Regular expressions used for parsing interesting_normal = re.compile('[&<]') incomplete = re.compile('&[a-zA-Z#]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') commentclose = re.compile(r'--\s*>') # Note: # 1) if you change tagfind/attrfind remember to update locatestarttagend too; # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will # explode, so don't do it. # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') attrfind_tolerant = re.compile( r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) (?:\s*,)* # possibly followed by a comma )?(?:\s|/(?!>))* )* )? \s* # trailing whitespace """, re.VERBOSE) endendtag = re.compile('>') # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between # ') class HTMLParser(markupbase.ParserBase): """Find tags and other markup and call handler functions. Usage: p = HTMLParser() p.feed(data) ... p.close() Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). If convert_charrefs is True the character references are converted automatically to the corresponding Unicode character (and self.handle_data() is no longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument. """ CDATA_CONTENT_ELEMENTS = ("script", "style") def __init__(self, convert_charrefs=True): """Initialize and reset this instance. If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters. """ self.convert_charrefs = convert_charrefs self.reset() def reset(self): """Reset this instance. Loses all unprocessed data.""" self.rawdata = '' self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None markupbase.ParserBase.reset(self) def feed(self, data): r"""Feed data to the parser. Call this as often as you want, with as little or as much text as you want (may include '\n'). """ self.rawdata = self.rawdata + data self.goahead(0) def close(self): """Handle any buffered data.""" self.goahead(1) __starttag_text = None def get_starttag_text(self): """Return full source of start tag: '<...>'.""" return self.__starttag_text def set_cdata_mode(self, elem): self.cdata_elem = elem.lower() self.interesting = re.compile(r'' % self.cdata_elem, re.I) def clear_cdata_mode(self): self.interesting = interesting_normal self.cdata_elem = None # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. def goahead(self, end): rawdata = self.rawdata i = 0 n = len(rawdata) while i < n: if self.convert_charrefs and not self.cdata_elem: j = rawdata.find('<', i) if j < 0: # if we can't find the next <, either we are at the end # or there's more text incoming. If the latter is True, # we can't pass the text to handle_data in case we have # a charref cut in half at end. Try to determine if # this is the case before proceeding by looking for an # & near the end and see if it's followed by a space or ;. amppos = rawdata.rfind('&', max(i, n-34)) if (amppos >= 0 and not re.compile(r'[\s;]').search(rawdata, amppos)): break # wait till we get all the text j = n else: match = self.interesting.search(rawdata, i) # < or & if match: j = match.start() else: if self.cdata_elem: break j = n if i < j: if self.convert_charrefs and not self.cdata_elem: self.handle_data(self.unescape(rawdata[i:j])) else: self.handle_data(rawdata[i:j]) i = self.updatepos(i, j) if i == n: break startswith = rawdata.startswith if startswith('<', i): if starttagopen.match(rawdata, i): # < + letter k = self.parse_starttag(i) elif startswith("', i + 1) if k < 0: k = rawdata.find('<', i + 1) if k < 0: k = i + 1 else: k += 1 if self.convert_charrefs and not self.cdata_elem: self.handle_data(self.unescape(rawdata[i:k])) else: self.handle_data(rawdata[i:k]) i = self.updatepos(i, k) elif startswith("&#", i): match = charref.match(rawdata, i) if match: name = match.group()[2:-1] self.handle_charref(name) k = match.end() if not startswith(';', k-1): k = k - 1 i = self.updatepos(i, k) continue else: if ";" in rawdata[i:]: # bail by consuming &# self.handle_data(rawdata[i:i+2]) i = self.updatepos(i, i+2) break elif startswith('&', i): match = entityref.match(rawdata, i) if match: name = match.group(1) self.handle_entityref(name) k = match.end() if not startswith(';', k-1): k = k - 1 i = self.updatepos(i, k) continue match = incomplete.match(rawdata, i) if match: # match.group() will contain at least 2 chars if end and match.group() == rawdata[i:]: k = match.end() if k <= i: k = n i = self.updatepos(i, i + 1) # incomplete break elif (i + 1) < n: # not the end of the buffer, and can't be confused # with some other construct self.handle_data("&") i = self.updatepos(i, i + 1) else: break else: assert 0, "interesting.search() lied" # end while if end and i < n and not self.cdata_elem: if self.convert_charrefs and not self.cdata_elem: self.handle_data(self.unescape(rawdata[i:n])) else: self.handle_data(rawdata[i:n]) i = self.updatepos(i, n) self.rawdata = rawdata[i:] # Internal -- parse html declarations, return length or -1 if not terminated # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state # See also parse_declaration in _markupbase def parse_html_declaration(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == ' gtpos = rawdata.find('>', i+9) if gtpos == -1: return -1 self.handle_decl(rawdata[i+2:gtpos]) return gtpos+1 else: return self.parse_bogus_comment(i) # Internal -- parse bogus comment, return length or -1 if not terminated # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state def parse_bogus_comment(self, i, report=1): rawdata = self.rawdata assert rawdata[i:i+2] in ('', i+2) if pos == -1: return -1 if report: self.handle_comment(rawdata[i+2:pos]) return pos + 1 # Internal -- parse processing instr, return end or -1 if not terminated def parse_pi(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == ' if not match: return -1 j = match.start() self.handle_pi(rawdata[i+2: j]) j = match.end() return j # Internal -- handle starttag, return end or -1 if not terminated def parse_starttag(self, i): self.__starttag_text = None endpos = self.check_for_whole_start_tag(i) if endpos < 0: return endpos rawdata = self.rawdata self.__starttag_text = rawdata[i:endpos] # Now parse the data between i+1 and j into a tag and attrs attrs = [] match = tagfind_tolerant.match(rawdata, i+1) assert match, 'unexpected call to parse_starttag()' k = match.end() self.lasttag = tag = match.group(1).lower() while k < endpos: m = attrfind_tolerant.match(rawdata, k) if not m: break attrname, rest, attrvalue = m.group(1, 2, 3) if not rest: attrvalue = None elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] if attrvalue: attrvalue = self.unescape(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() end = rawdata[k:endpos].strip() if end not in (">", "/>"): lineno, offset = self.getpos() if "\n" in self.__starttag_text: lineno = lineno + self.__starttag_text.count("\n") offset = len(self.__starttag_text) \ - self.__starttag_text.rfind("\n") else: offset = offset + len(self.__starttag_text) self.handle_data(rawdata[i:endpos]) return endpos if end.endswith('/>'): # XHTML-style empty tag: self.handle_startendtag(tag, attrs) else: self.handle_starttag(tag, attrs) if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag) return endpos # Internal -- check to see if we have a complete starttag; return end # or -1 if incomplete. def check_for_whole_start_tag(self, i): rawdata = self.rawdata m = locatestarttagend_tolerant.match(rawdata, i) if m: j = m.end() next = rawdata[j:j+1] if next == ">": return j + 1 if next == "/": if rawdata.startswith("/>", j): return j + 2 if rawdata.startswith("/", j): # buffer boundary return -1 # else bogus input if j > i: return j else: return i + 1 if next == "": # end of input return -1 if next in ("abcdefghijklmnopqrstuvwxyz=/" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): # end of input in or before attribute value, or we have the # '/' from a '/>' ending return -1 if j > i: return j else: return i + 1 raise AssertionError("we should not get here!") # Internal -- parse endtag, return end or -1 if incomplete def parse_endtag(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == " if not match: return -1 gtpos = match.end() match = endtagfind.match(rawdata, i) # if not match: if self.cdata_elem is not None: self.handle_data(rawdata[i:gtpos]) return gtpos # find the name: w3.org/TR/html5/tokenization.html#tag-name-state namematch = tagfind_tolerant.match(rawdata, i+2) if not namematch: # w3.org/TR/html5/tokenization.html#end-tag-open-state if rawdata[i:i+3] == '': return i+3 else: return self.parse_bogus_comment(i) tagname = namematch.group(1).lower() # consume and ignore other stuff between the name and the > # Note: this is not 100% correct, since we might have things like # , but looking for > after tha name should cover # most of the cases and is much simpler gtpos = rawdata.find('>', namematch.end()) self.handle_endtag(tagname) return gtpos+1 elem = match.group(1).lower() # script or style if self.cdata_elem is not None: if elem != self.cdata_elem: self.handle_data(rawdata[i:gtpos]) return gtpos self.handle_endtag(elem.lower()) self.clear_cdata_mode() return gtpos # Overridable -- finish processing of start+end tag: def handle_startendtag(self, tag, attrs): self.handle_starttag(tag, attrs) self.handle_endtag(tag) # Overridable -- handle start tag def handle_starttag(self, tag, attrs): pass # Overridable -- handle end tag def handle_endtag(self, tag): pass # Overridable -- handle character reference def handle_charref(self, name): pass # Overridable -- handle entity reference def handle_entityref(self, name): pass # Overridable -- handle data def handle_data(self, data): pass # Overridable -- handle comment def handle_comment(self, data): pass # Overridable -- handle declaration def handle_decl(self, decl): pass # Overridable -- handle processing instruction def handle_pi(self, data): pass def unknown_decl(self, data): pass # Internal -- helper to remove special character quoting def unescape(self, s): return unescape(s) htmlmin-0.1.12/htmlmin/python3html/__init__.py0000664000175000017500000001153213163734553021213 0ustar davedave00000000000000""" General functions for HTML manipulation. """ import re as _re try: from html.entities import html5 as _html5 unichr = chr except ImportError: import htmlentitydefs _html5 = {'apos;':u"'"} for k, v in htmlentitydefs.name2codepoint.iteritems(): _html5[k + ';'] = unichr(v) __all__ = ['escape', 'unescape'] def escape(s, quote=True): """ Replace special characters "&", "<" and ">" to HTML-safe sequences. If the optional flag quote is true (the default), the quotation mark characters, both double quote (") and single quote (') characters are also translated. """ s = s.replace("&", "&") # Must be done first! s = s.replace("<", "<") s = s.replace(">", ">") if quote: s = s.replace('"', """) s = s.replace('\'', "'") return s # see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references _invalid_charrefs = { 0x00: '\ufffd', # REPLACEMENT CHARACTER 0x0d: '\r', # CARRIAGE RETURN 0x80: '\u20ac', # EURO SIGN 0x81: '\x81', # 0x82: '\u201a', # SINGLE LOW-9 QUOTATION MARK 0x83: '\u0192', # LATIN SMALL LETTER F WITH HOOK 0x84: '\u201e', # DOUBLE LOW-9 QUOTATION MARK 0x85: '\u2026', # HORIZONTAL ELLIPSIS 0x86: '\u2020', # DAGGER 0x87: '\u2021', # DOUBLE DAGGER 0x88: '\u02c6', # MODIFIER LETTER CIRCUMFLEX ACCENT 0x89: '\u2030', # PER MILLE SIGN 0x8a: '\u0160', # LATIN CAPITAL LETTER S WITH CARON 0x8b: '\u2039', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK 0x8c: '\u0152', # LATIN CAPITAL LIGATURE OE 0x8d: '\x8d', # 0x8e: '\u017d', # LATIN CAPITAL LETTER Z WITH CARON 0x8f: '\x8f', # 0x90: '\x90', # 0x91: '\u2018', # LEFT SINGLE QUOTATION MARK 0x92: '\u2019', # RIGHT SINGLE QUOTATION MARK 0x93: '\u201c', # LEFT DOUBLE QUOTATION MARK 0x94: '\u201d', # RIGHT DOUBLE QUOTATION MARK 0x95: '\u2022', # BULLET 0x96: '\u2013', # EN DASH 0x97: '\u2014', # EM DASH 0x98: '\u02dc', # SMALL TILDE 0x99: '\u2122', # TRADE MARK SIGN 0x9a: '\u0161', # LATIN SMALL LETTER S WITH CARON 0x9b: '\u203a', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 0x9c: '\u0153', # LATIN SMALL LIGATURE OE 0x9d: '\x9d', # 0x9e: '\u017e', # LATIN SMALL LETTER Z WITH CARON 0x9f: '\u0178', # LATIN CAPITAL LETTER Y WITH DIAERESIS } _invalid_codepoints = { # 0x0001 to 0x0008 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, # 0x000E to 0x001F 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, # 0x007F to 0x009F 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, # 0xFDD0 to 0xFDEF 0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8, 0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1, 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea, 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef, # others 0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff, 0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff, 0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff, 0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff, 0x10fffe, 0x10ffff } def _replace_charref(s): s = s.group(1) if s[0] == '#': # numeric charref if s[1] in 'xX': num = int(s[2:].rstrip(';'), 16) else: num = int(s[1:].rstrip(';')) if num in _invalid_charrefs: return _invalid_charrefs[num] if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF: return '\uFFFD' if num in _invalid_codepoints: return '' return unichr(num) else: # named charref if s in _html5: return _html5[s] # find the longest matching name (as defined by the standard) for x in range(len(s)-1, 1, -1): if s[:x] in _html5: return _html5[s[:x]] + s[x:] else: return '&' + s _charref = _re.compile(r'&(#[0-9]+;?' r'|#[xX][0-9a-fA-F]+;?' r'|[^\t\n\f <&#;]{1,32};?)') def unescape(s): """ Convert all named and numeric character references (e.g. >, >, &x3e;) in the string s to the corresponding unicode characters. This function uses the rules defined by the HTML 5 standard for both valid and invalid character references, and the list of HTML 5 named character references defined in html.entities.html5. """ if '&' not in s: return s return _charref.sub(_replace_charref, s) htmlmin-0.1.12/htmlmin/middleware.py0000664000175000017500000001003112110442225017250 0ustar davedave00000000000000""" Copyright (c) 2013, Dave Mankoff All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Dave Mankoff nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ from .main import Minifier class HTMLMinMiddleware(object): """WSGI Middleware that minifies html on the way out. :param by_default: Specifies if minification should be turned on or off by default. Defaults to ``True``. :param keep_header: The middleware recognizes one custom HTTP header that can be used to turn minification on or off on a per-request basis: ``X-HTML-Min-Enable``. Setting the header to ``true`` will turn minfication on; anything else will turn minification off. If ``by_default`` is set to ``False``, this header is how you would turn minification back on. The middleware, by default, removes the header from the output. Setting this to ``True`` leaves the header in tact. :param debug: A quick setting to turn all minification off. The middleware is effectively bypassed. This simple middleware minifies any HTML content that passes through it. Any additional keyword arguments beyond the three settings the middleware has are passed on to the internal minifier. The documentation for the options can be found under :class:`htmlmin.minify`. """ def __init__(self, app, by_default=True, keep_header=False, debug=False, **kwargs): self.app = app self.by_default = by_default self.debug = debug self.keep_header = keep_header self.minifier = Minifier(**kwargs) def __call__(self, environ, start_response): if self.debug: return self.app(environ, start_response) should_minify = [] # need to use a mutable object so we can change it # in a different scope. def minified_start_response(status, headers, exc_info=None): should_minify.append(self.should_minify(headers)) if not self.keep_header: headers = [(header, value) for header, value in headers if header != 'X-HTML-Min-Enable'] start_response(status, headers, exc_info) html = [i for i in self.app(environ, minified_start_response)] if should_minify[0]: return [self.minifier.minify(*html)] return html def should_minify(self, headers): is_html = False flag_header = None for header, value in headers: if not is_html and header == 'Content-Type' and value == 'text/html': is_html = True if flag_header is not None: break if flag_header is None and header == 'X-HTML-Min-Enable': flag_header = (value.lower() == 'true') if is_html: break return is_html and ( (self.by_default and flag_header != False) or (not self.by_default and flag_header)) htmlmin-0.1.12/htmlmin/main.py0000664000175000017500000002012113163755274016104 0ustar davedave00000000000000""" Copyright (c) 2013, Dave Mankoff All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Dave Mankoff nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ import cgi from . import parser def minify(input, remove_comments=False, remove_empty_space=False, remove_all_empty_space=False, reduce_empty_attributes=True, reduce_boolean_attributes=False, remove_optional_attribute_quotes=True, convert_charrefs=True, keep_pre=False, pre_tags=parser.PRE_TAGS, pre_attr='pre', cls=parser.HTMLMinParser): """Minifies HTML in one shot. :param input: A string containing the HTML to be minified. :param remove_comments: Remove comments found in HTML. Individual comments can be maintained by putting a ``!`` as the first character inside the comment. Thus:: Will become simply:: The added exclamation is removed. :param remove_empty_space: Remove empty space found in HTML between an opening and a closing tag and when it contains a newline or carriage return. If whitespace is found that is only spaces and/or tabs, it will be turned into a single space. Be careful, this can have unintended consequences. :param remove_all_empty_space: A more extreme version of ``remove_empty_space``, this removes all empty whitespace found between tags. This is almost guaranteed to break your HTML unless you are very careful. :param reduce_boolean_attributes: Where allowed by the HTML5 specification, attributes such as 'disabled' and 'readonly' will have their value removed, so 'disabled="true"' will simply become 'disabled'. This is generally a good option to turn on except when JavaScript relies on the values. :param remove_optional_attribute_quotes: When True, optional quotes around attributes are removed. When False, all attribute quotes are left intact. Defaults to True. :param conver_charrefs: Decode character references such as & and . to their single charater values where safe. This currently only applies to attributes. Data content between tags will be left encoded. :param keep_pre: By default, htmlmin uses the special attribute ``pre`` to allow you to demarcate areas of HTML that should not be minified. It removes this attribute as it finds it. Setting this value to ``True`` tells htmlmin to leave the attribute in the output. :param pre_tags: A list of tag names that should never be minified. You are free to change this list as you see fit, but you will probably want to include ``pre`` and ``textarea`` if you make any changes to the list. Note that ``