pynliner-0.8.0/0000775000175000017500000000000013035233730014514 5ustar tannerntannern00000000000000pynliner-0.8.0/pynliner/0000775000175000017500000000000013035233730016354 5ustar tannerntannern00000000000000pynliner-0.8.0/pynliner/soupselect.py0000664000175000017500000002134413035227640021123 0ustar tannerntannern00000000000000""" # Included with pynliner since it isn't on PyPI # soupselect.py CSS selector support for BeautifulSoup. soup = BeautifulSoup('...') select(soup, 'div') - returns a list of div elements select(soup, 'div#main ul a') - returns a list of links inside a ul inside div#main patched to support multiple class selectors here http://code.google.com/p/soupselect/issues/detail?id=4#c0 """ import re import operator as operator_ from functools import partial import bs4 ATTRIBUTE_PATTERN = re.compile(r'\[(?P[^\s\]=~\|\^\$\*]+)(?P[=~\|\^\$\*]?)=?["\']?(?P[^\]"]*)["\']?\]') PSEUDO_CLASS_PATTERN = re.compile(u':(([^:.#(*\\[]|\\([^)]+\\))+)') SELECTOR_TOKEN_PATTERN = re.compile(r'([_0-9a-zA-Z-#.:*]+|\[[^\]]+\])$') def get_attribute_checker(operator, attribute, value=''): """ Takes an operator, attribute and optional value; returns a function that will return True for elements that match that combination. """ return { '=': lambda el: el.get(attribute) == value, # attribute includes value as one of a set of space separated tokens '~': lambda el: value in el.get(attribute, '').split(), # attribute starts with value '^': lambda el: el.get(attribute, '').startswith(value), # attribute ends with value '$': lambda el: el.get(attribute, '').endswith(value), # attribute contains value '*': lambda el: value in el.get(attribute, ''), # attribute is either exactly value or starts with value- '|': lambda el: el.get(attribute, '') == value \ or el.get(attribute, '').startswith('%s-' % value), }.get(operator, lambda el: el.has_attr(attribute)) def is_white_space(el): if isinstance(el, bs4.NavigableString) and str(el).strip() == '': return True if isinstance(el, bs4.Comment): return True return False def is_last_content_node(el): result = False if el is None: result = True elif is_white_space(el): result = is_last_content_node(el.nextSibling) return result def is_first_content_node(el): result = False if el is None: result = True if is_white_space(el): result = is_first_content_node(el.previousSibling) return result def get_pseudo_class_checker(psuedo_class): """ Takes a psuedo_class, like "first-child" or "last-child" and returns a function that will check if the element satisfies that psuedo class """ return { 'first-child': lambda el: is_first_content_node(getattr(el, 'previousSibling', None)), 'last-child': lambda el: is_last_content_node(getattr(el, 'nextSibling', None)) }.get(psuedo_class, lambda el: False) def contains_all_classes_checker(required_classes, el): if not el: return False actual_classes = el.get('class', []) test_has_class = partial(operator_.contains, actual_classes) return all(map(test_has_class, required_classes)) def get_checker(functions): def checker(el): for func in functions: if not func(el): return False return el return checker def select(soup, selector): """ soup should be a BeautifulSoup instance; selector is a CSS selector specifying the elements you want to retrieve. """ handle_token = True current_context = [(soup, [])] operator = None while selector: if handle_token: # Get the rightmost token handle_token = False match = SELECTOR_TOKEN_PATTERN.search(selector) if not match: raise Exception("No match was found. We're done or something is broken") token = match.groups(1)[0] # remove this token from the selector selector = selector.rsplit(token, 1)[0].rstrip() checker_functions = [] # # Get attribute selectors from token # matches = ATTRIBUTE_PATTERN.findall(token) for match in matches: checker_functions.append(get_attribute_checker(match[1], match[0], match[2])) # # Get pseudo classes from token # for match in PSEUDO_CLASS_PATTERN.finditer(token): checker_functions.append(get_pseudo_class_checker(match.groups(1)[0])) checker = get_checker(checker_functions) # # Get tag # tag = re.findall('^([a-zA-Z0-9]+)', token) if len(tag) == 0: tag = True elif len(tag) == 1: tag = tag[0] else: raise Exception("Multiple tags found (invalid CSS)") # # Get ID # ids = re.findall('#([a-zA-Z0-9_-]+)', token) if len(ids) > 1: raise Exception("Only single # OK") # # Get classes # classes = re.findall('\.([a-zA-Z0-9_-]+)', token) checker_functions.append(partial(contains_all_classes_checker, classes)) # # Search contexts for matches # found = [] find_dict = {} if ids: find_dict['id'] = ids if classes: find_dict['class'] = partial(operator_.contains, classes) if operator is None: # This is the first token: simply find all matches for context in current_context: context_matches = [el for el in context[0].find_all(tag, find_dict) if checker(el)] for context_match in context_matches: found.append( (context_match, [context_match]), ) elif operator == ' ': # for each context in current_context, ensure there # exists an element somewhere above that element that # matches the provided token # ("descendant" selector) for context in current_context: context_matches = [] for el in context[1]: if checker(el.findParent(tag, find_dict)): context_matches.append(el) if context_matches: found.append( (context[0], context_matches), ) elif operator == '>': # for each context in current_context, # check if the parent satisfies the provided # arguments. for context in current_context: context_matches = [] for el in context[1]: if checker(el.findParent(tag, find_dict)) == el.parent: context_matches.append(el.parent) if context_matches: found.append( (context[0], context_matches), ) elif operator == '~': # for each context in current_context # check raise NotImplementedError("~ operator is not implemented. Sad face :(") elif operator == '+': # for each context in current_context # check if the preceding sibling satisfies the # provided arguments for context in current_context: context_matches = [] for el in context[1]: if checker(el.findPreviousSibling(tag, find_dict)) == el.previousSibling: context_matches.append(el.previousSibling) if context_matches: found.append( (context[0], context_matches) ) current_context = found else: # Get the next operator (whitespace, >, ~, +) handle_token = True match = re.search('([>~+]+)$', selector) if match: operator = match.groups(1)[0] selector = selector.rsplit(operator, 1)[0].rstrip() else: operator = ' ' return [entry[0] for entry in current_context] def monkeypatch(BeautifulSoupClass=None): """ If you don't explicitly state the class to patch, defaults to the most common import location for BeautifulSoup. """ if not BeautifulSoupClass: from bs4 import BeautifulSoup as BeautifulSoupClass BeautifulSoupClass.findSelect = select def unmonkeypatch(BeautifulSoupClass=None): if not BeautifulSoupClass: from bs4 import BeautifulSoup as BeautifulSoupClass delattr(BeautifulSoupClass, 'findSelect') pynliner-0.8.0/pynliner/__init__.py0000664000175000017500000002531313035232401020463 0ustar tannerntannern00000000000000#!/usr/bin/env python # -*- coding: utf-8 -*- """Pynliner : Convert CSS to inline styles Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and cssutils Copyright (c) 2011-2016 Tanner Netterville Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. The generated output of this software shall not be used in a mass marketing service. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import re import cssutils from bs4 import BeautifulSoup from .soupselect import select try: from urllib.parse import urljoin from urllib.request import urlopen unicode = str except ImportError: from urlparse import urljoin from urllib2 import urlopen __version__ = "0.8.0" # this pattern may be too aggressive HTML_ENTITY_PATTERN = re.compile(r'&(#([0-9]+|x[a-fA-F0-9]+)|[a-zA-Z][^\s;]+);') SUBSTITUTION_FORMAT = '[pynlinerSubstitute:{0}]' SUBSTITUTION_PATTERN = re.compile(r'\[pynlinerSubstitute:(\d+)\]') class Pynliner(object): """Pynliner class""" soup = False style_string = False stylesheet = False output = False def __init__(self, log=None, allow_conditional_comments=False, preserve_entities=True): self.log = log cssutils.log.enabled = False if log is None else True self.extra_style_strings = [] self.allow_conditional_comments = allow_conditional_comments self.preserve_entities = preserve_entities self.root_url = None self.relative_url = None self._substitutions = None def from_url(self, url): """Gets remote HTML page for conversion Downloads HTML page from `url` as a string and passes it to the `from_string` method. Also sets `self.root_url` and `self.relative_url` for use in importing elements. Returns self. >>> p = Pynliner() >>> p.from_url('http://somewebsite.com/file.html') """ self.url = url self.relative_url = '/'.join(url.split('/')[:-1]) + '/' self.root_url = '/'.join(url.split('/')[:3]) self.source_string = self._get_url(self.url) return self def from_string(self, string): """Generates a Pynliner object from the given HTML string. Returns self. >>> p = Pynliner() >>> p.from_string('

Hi

') """ self.source_string = string return self def with_cssString(self, css_string): """Adds external CSS to the Pynliner object. Can be "chained". Returns self. >>> html = "

Hello World!

" >>> css = "h1 { color:#ffcc00; }" >>> p = Pynliner() >>> p.from_string(html).with_cssString(css) """ self.extra_style_strings.append(css_string) return self def run(self): """Applies each step of the process if they have not already been performed. Returns Unicode output with applied styles. >>> html = "

Hello World!

" >>> Pynliner().from_string(html).run() u'

Hello World!

' """ self._substitutions = [] if self.preserve_entities: self._substitute_entities() if not self.soup: self._get_soup() if not self.stylesheet: self._get_styles() self._apply_styles() self._insert_media_rules() self._get_output() self._unsubstitute_output() return self.output def _store_substitute(self, value): """ store a string and return it's substitute """ index = len(self._substitutions) self._substitutions.append(value) return SUBSTITUTION_FORMAT.format(index) def _get_url(self, url): """Returns the response content from the given url """ return urlopen(url).read().decode() def _substitute_entities(self): """ Add HTML entities to the substitutions list and replace with placeholders in HTML source """ self.source_string = re.sub( HTML_ENTITY_PATTERN, lambda m: self._store_substitute(m.group(0)), self.source_string ) def _unsubstitute_output(self): """ Put substitutions back into the output """ self.output = re.sub( SUBSTITUTION_PATTERN, lambda m: self._substitutions[int(m.group(1))], self.output ) def _get_soup(self): """Convert source string to BeautifulSoup object. Sets it to self.soup. If using mod_wgsi, use html5 parsing to prevent BeautifulSoup incompatibility. """ # Check if mod_wsgi is running # - see http://code.google.com/p/modwsgi/wiki/TipsAndTricks try: from mod_wsgi import version self.soup = BeautifulSoup(self.source_string, "html5lib") except ImportError: self.soup = BeautifulSoup(self.source_string, "html.parser") def _get_styles(self): """Gets all CSS content from and removes all and ", "html.parser" ) target = self.soup.body or self.soup target.insert(0, style) def _apply_styles(self): """Steps through CSS rules and applies each to all the proper elements as @style attributes prepending any current @style attributes. """ rules = self.stylesheet.cssRules.rulesOfType(1) elem_prop_map = {} elem_style_map = {} # build up a property list for every styled element for rule in rules: for selector in rule.selectorList: for element in select(self.soup, selector.selectorText): element_tuple = (element, id(element)) if element_tuple not in elem_prop_map: elem_prop_map[element_tuple] = [] elem_prop_map[element_tuple].append({ 'specificity': selector.specificity, 'props': rule.style.getProperties(), }) # build up another property list using selector specificity for elem_tuple, props in elem_prop_map.items(): elem, elem_id = elem_tuple if elem_tuple not in elem_style_map: elem_style_map[elem_tuple] = cssutils.css.CSSStyleDeclaration() # ascending sort of prop_lists based on specificity props = sorted(props, key=lambda p: p['specificity']) # for each prop_list, apply to CSSStyleDeclaration for prop_list in map(lambda obj: obj['props'], props): for prop in prop_list: elem_style_map[elem_tuple].removeProperty(prop.name) elem_style_map[elem_tuple].setProperty(prop.name, prop.value) # apply rules to elements for elem_tuple, style_declaration in elem_style_map.items(): elem, elem_id = elem_tuple if elem.has_attr('style'): elem['style'] = u'%s; %s' % (style_declaration.cssText.replace('\n', ' '), elem['style']) else: elem['style'] = style_declaration.cssText.replace('\n', ' ') def _get_output(self): """Generate Unicode string of `self.soup` and set it to `self.output` Returns self.output """ self.output = unicode(self.soup) return self.output def fromURL(url, **kwargs): """Shortcut Pynliner constructor. Equivalent to: >>> Pynliner().from_url(someURL).run() Returns processed HTML string. """ return Pynliner(**kwargs).from_url(url).run() def fromString(string, **kwargs): """Shortcut Pynliner constructor. Equivalent to: >>> Pynliner().from_string(someString).run() Returns processed HTML string. """ return Pynliner(**kwargs).from_string(string).run() pynliner-0.8.0/pynliner.egg-info/0000775000175000017500000000000013035233730020046 5ustar tannerntannern00000000000000pynliner-0.8.0/pynliner.egg-info/requires.txt0000664000175000017500000000005113035233730022442 0ustar tannerntannern00000000000000BeautifulSoup4 >= 4.4.1 cssutils >=0.9.7 pynliner-0.8.0/pynliner.egg-info/dependency_links.txt0000664000175000017500000000000113035233730024114 0ustar tannerntannern00000000000000 pynliner-0.8.0/pynliner.egg-info/PKG-INFO0000664000175000017500000000147013035233730021145 0ustar tannerntannern00000000000000Metadata-Version: 1.1 Name: pynliner Version: 0.8.0 Summary: Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and cssutils Home-page: UNKNOWN Author: Tanner Netterville Author-email: tannern@gmail.com License: MIT Description: UNKNOWN Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Topic :: Text Processing :: Markup :: HTML Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2.6 Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 pynliner-0.8.0/pynliner.egg-info/SOURCES.txt0000664000175000017500000000033513035233730021733 0ustar tannerntannern00000000000000setup.cfg setup.py pynliner/__init__.py pynliner/soupselect.py pynliner.egg-info/PKG-INFO pynliner.egg-info/SOURCES.txt pynliner.egg-info/dependency_links.txt pynliner.egg-info/requires.txt pynliner.egg-info/top_level.txtpynliner-0.8.0/pynliner.egg-info/top_level.txt0000664000175000017500000000001113035233730022570 0ustar tannerntannern00000000000000pynliner pynliner-0.8.0/PKG-INFO0000664000175000017500000000147013035233730015613 0ustar tannerntannern00000000000000Metadata-Version: 1.1 Name: pynliner Version: 0.8.0 Summary: Python CSS-to-inline-styles conversion tool for HTML using BeautifulSoup and cssutils Home-page: UNKNOWN Author: Tanner Netterville Author-email: tannern@gmail.com License: MIT Description: UNKNOWN Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Topic :: Text Processing :: Markup :: HTML Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2.6 Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 pynliner-0.8.0/setup.py0000664000175000017500000000216113035233572016232 0ustar tannerntannern00000000000000#!/usr/bin/env python # -*- coding: utf-8 -*- from setuptools import setup setup(name='pynliner', version='0.8.0', description='Python CSS-to-inline-styles conversion tool for HTML using' ' BeautifulSoup and cssutils', author='Tanner Netterville', author_email='tannern@gmail.com', install_requires=[ 'BeautifulSoup4 >= 4.4.1', 'cssutils >=0.9.7', ], tests_require=[ 'mock' ], test_suite='tests', packages=['pynliner'], license='MIT', classifiers=[ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'License :: OSI Approved :: MIT License', 'Topic :: Text Processing :: Markup :: HTML', 'Programming Language :: Python', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5' ]) pynliner-0.8.0/setup.cfg0000664000175000017500000000013013035233730016327 0ustar tannerntannern00000000000000[bdist_wheel] universal = 1 [egg_info] tag_build = tag_date = 0 tag_svn_revision = 0