cssselect-1.1.0/0000775000372000037200000000000013523235335014347 5ustar travistravis00000000000000cssselect-1.1.0/cssselect/0000775000372000037200000000000013523235335016337 5ustar travistravis00000000000000cssselect-1.1.0/cssselect/parser.py0000664000372000037200000006304113523235306020207 0ustar travistravis00000000000000# -*- coding: utf-8 -*- """ cssselect.parser ================ Tokenizer, parser and parsed objects for CSS selectors. :copyright: (c) 2007-2012 Ian Bicking and contributors. See AUTHORS for more details. :license: BSD, see LICENSE for more details. """ import sys import re import operator if sys.version_info[0] < 3: _unicode = unicode _unichr = unichr else: _unicode = str _unichr = chr def ascii_lower(string): """Lower-case, but only in the ASCII range.""" return string.encode('utf8').lower().decode('utf8') class SelectorError(Exception): """Common parent for :class:`SelectorSyntaxError` and :class:`ExpressionError`. You can just use ``except SelectorError:`` when calling :meth:`~GenericTranslator.css_to_xpath` and handle both exceptions types. """ class SelectorSyntaxError(SelectorError, SyntaxError): """Parsing a selector that does not match the grammar.""" #### Parsed objects class Selector(object): """ Represents a parsed selector. :meth:`~GenericTranslator.selector_to_xpath` accepts this object, but ignores :attr:`pseudo_element`. It is the user’s responsibility to account for pseudo-elements and reject selectors with unknown or unsupported pseudo-elements. """ def __init__(self, tree, pseudo_element=None): self.parsed_tree = tree if pseudo_element is not None and not isinstance( pseudo_element, FunctionalPseudoElement): pseudo_element = ascii_lower(pseudo_element) #: A :class:`FunctionalPseudoElement`, #: or the identifier for the pseudo-element as a string, # or ``None``. #: #: +-------------------------+----------------+--------------------------------+ #: | | Selector | Pseudo-element | #: +=========================+================+================================+ #: | CSS3 syntax | ``a::before`` | ``'before'`` | #: +-------------------------+----------------+--------------------------------+ #: | Older syntax | ``a:before`` | ``'before'`` | #: +-------------------------+----------------+--------------------------------+ #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'`` | #: | not in Selectors3 | | | #: +-------------------------+----------------+--------------------------------+ #: | Invalid pseudo-class | ``li:marker`` | ``None`` | #: +-------------------------+----------------+--------------------------------+ #: | Functional | ``a::foo(2)`` | ``FunctionalPseudoElement(…)`` | #: +-------------------------+----------------+--------------------------------+ #: #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement self.pseudo_element = pseudo_element def __repr__(self): if isinstance(self.pseudo_element, FunctionalPseudoElement): pseudo_element = repr(self.pseudo_element) elif self.pseudo_element: pseudo_element = '::%s' % self.pseudo_element else: pseudo_element = '' return '%s[%r%s]' % ( self.__class__.__name__, self.parsed_tree, pseudo_element) def canonical(self): """Return a CSS representation for this selector (a string) """ if isinstance(self.pseudo_element, FunctionalPseudoElement): pseudo_element = '::%s' % self.pseudo_element.canonical() elif self.pseudo_element: pseudo_element = '::%s' % self.pseudo_element else: pseudo_element = '' res = '%s%s' % (self.parsed_tree.canonical(), pseudo_element) if len(res) > 1: res = res.lstrip('*') return res def specificity(self): """Return the specificity_ of this selector as a tuple of 3 integers. .. _specificity: http://www.w3.org/TR/selectors/#specificity """ a, b, c = self.parsed_tree.specificity() if self.pseudo_element: c += 1 return a, b, c class Class(object): """ Represents selector.class_name """ def __init__(self, selector, class_name): self.selector = selector self.class_name = class_name def __repr__(self): return '%s[%r.%s]' % ( self.__class__.__name__, self.selector, self.class_name) def canonical(self): return '%s.%s' % (self.selector.canonical(), self.class_name) def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c class FunctionalPseudoElement(object): """ Represents selector::name(arguments) .. attribute:: name The name (identifier) of the pseudo-element, as a string. .. attribute:: arguments The arguments of the pseudo-element, as a list of tokens. **Note:** tokens are not part of the public API, and may change between cssselect versions. Use at your own risks. """ def __init__(self, name, arguments): self.name = ascii_lower(name) self.arguments = arguments def __repr__(self): return '%s[::%s(%r)]' % ( self.__class__.__name__, self.name, [token.value for token in self.arguments]) def argument_types(self): return [token.type for token in self.arguments] def canonical(self): args = ''.join(token.css() for token in self.arguments) return '%s(%s)' % (self.name, args) def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c class Function(object): """ Represents selector:name(expr) """ def __init__(self, selector, name, arguments): self.selector = selector self.name = ascii_lower(name) self.arguments = arguments def __repr__(self): return '%s[%r:%s(%r)]' % ( self.__class__.__name__, self.selector, self.name, [token.value for token in self.arguments]) def argument_types(self): return [token.type for token in self.arguments] def canonical(self): args = ''.join(token.css() for token in self.arguments) return '%s:%s(%s)' % (self.selector.canonical(), self.name, args) def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c class Pseudo(object): """ Represents selector:ident """ def __init__(self, selector, ident): self.selector = selector self.ident = ascii_lower(ident) def __repr__(self): return '%s[%r:%s]' % ( self.__class__.__name__, self.selector, self.ident) def canonical(self): return '%s:%s' % (self.selector.canonical(), self.ident) def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c class Negation(object): """ Represents selector:not(subselector) """ def __init__(self, selector, subselector): self.selector = selector self.subselector = subselector def __repr__(self): return '%s[%r:not(%r)]' % ( self.__class__.__name__, self.selector, self.subselector) def canonical(self): subsel = self.subselector.canonical() if len(subsel) > 1: subsel = subsel.lstrip('*') return '%s:not(%s)' % (self.selector.canonical(), subsel) def specificity(self): a1, b1, c1 = self.selector.specificity() a2, b2, c2 = self.subselector.specificity() return a1 + a2, b1 + b2, c1 + c2 class Attrib(object): """ Represents selector[namespace|attrib operator value] """ def __init__(self, selector, namespace, attrib, operator, value): self.selector = selector self.namespace = namespace self.attrib = attrib self.operator = operator self.value = value def __repr__(self): if self.namespace: attrib = '%s|%s' % (self.namespace, self.attrib) else: attrib = self.attrib if self.operator == 'exists': return '%s[%r[%s]]' % ( self.__class__.__name__, self.selector, attrib) else: return '%s[%r[%s %s %r]]' % ( self.__class__.__name__, self.selector, attrib, self.operator, self.value.value) def canonical(self): if self.namespace: attrib = '%s|%s' % (self.namespace, self.attrib) else: attrib = self.attrib if self.operator == 'exists': op = attrib else: op = '%s%s%s' % (attrib, self.operator, self.value.css()) return '%s[%s]' % (self.selector.canonical(), op) def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c class Element(object): """ Represents namespace|element `None` is for the universal selector '*' """ def __init__(self, namespace=None, element=None): self.namespace = namespace self.element = element def __repr__(self): return '%s[%s]' % (self.__class__.__name__, self.canonical()) def canonical(self): element = self.element or '*' if self.namespace: element = '%s|%s' % (self.namespace, element) return element def specificity(self): if self.element: return 0, 0, 1 else: return 0, 0, 0 class Hash(object): """ Represents selector#id """ def __init__(self, selector, id): self.selector = selector self.id = id def __repr__(self): return '%s[%r#%s]' % ( self.__class__.__name__, self.selector, self.id) def canonical(self): return '%s#%s' % (self.selector.canonical(), self.id) def specificity(self): a, b, c = self.selector.specificity() a += 1 return a, b, c class CombinedSelector(object): def __init__(self, selector, combinator, subselector): assert selector is not None self.selector = selector self.combinator = combinator self.subselector = subselector def __repr__(self): if self.combinator == ' ': comb = '' else: comb = self.combinator return '%s[%r %s %r]' % ( self.__class__.__name__, self.selector, comb, self.subselector) def canonical(self): subsel = self.subselector.canonical() if len(subsel) > 1: subsel = subsel.lstrip('*') return '%s %s %s' % ( self.selector.canonical(), self.combinator, subsel) def specificity(self): a1, b1, c1 = self.selector.specificity() a2, b2, c2 = self.subselector.specificity() return a1 + a2, b1 + b2, c1 + c2 #### Parser # foo _el_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$') # foo#bar or #bar _id_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$') # foo.bar or .bar _class_re = re.compile( r'^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$') def parse(css): """Parse a CSS *group of selectors*. If you don't care about pseudo-elements or selector specificity, you can skip this and use :meth:`~GenericTranslator.css_to_xpath`. :param css: A *group of selectors* as an Unicode string. :raises: :class:`SelectorSyntaxError` on invalid selectors. :returns: A list of parsed :class:`Selector` objects, one for each selector in the comma-separated group. """ # Fast path for simple cases match = _el_re.match(css) if match: return [Selector(Element(element=match.group(1)))] match = _id_re.match(css) if match is not None: return [Selector(Hash(Element(element=match.group(1) or None), match.group(2)))] match = _class_re.match(css) if match is not None: return [Selector(Class(Element(element=match.group(1) or None), match.group(2)))] stream = TokenStream(tokenize(css)) stream.source = css return list(parse_selector_group(stream)) # except SelectorSyntaxError: # e = sys.exc_info()[1] # message = "%s at %s -> %r" % ( # e, stream.used, stream.peek()) # e.msg = message # e.args = tuple([message]) # raise def parse_selector_group(stream): stream.skip_whitespace() while 1: yield Selector(*parse_selector(stream)) if stream.peek() == ('DELIM', ','): stream.next() stream.skip_whitespace() else: break def parse_selector(stream): result, pseudo_element = parse_simple_selector(stream) while 1: stream.skip_whitespace() peek = stream.peek() if peek in (('EOF', None), ('DELIM', ',')): break if pseudo_element: raise SelectorSyntaxError( 'Got pseudo-element ::%s not at the end of a selector' % pseudo_element) if peek.is_delim('+', '>', '~'): # A combinator combinator = stream.next().value stream.skip_whitespace() else: # By exclusion, the last parse_simple_selector() ended # at peek == ' ' combinator = ' ' next_selector, pseudo_element = parse_simple_selector(stream) result = CombinedSelector(result, combinator, next_selector) return result, pseudo_element def parse_simple_selector(stream, inside_negation=False): stream.skip_whitespace() selector_start = len(stream.used) peek = stream.peek() if peek.type == 'IDENT' or peek == ('DELIM', '*'): if peek.type == 'IDENT': namespace = stream.next().value else: stream.next() namespace = None if stream.peek() == ('DELIM', '|'): stream.next() element = stream.next_ident_or_star() else: element = namespace namespace = None else: element = namespace = None result = Element(namespace, element) pseudo_element = None while 1: peek = stream.peek() if peek.type in ('S', 'EOF') or peek.is_delim(',', '+', '>', '~') or ( inside_negation and peek == ('DELIM', ')')): break if pseudo_element: raise SelectorSyntaxError( 'Got pseudo-element ::%s not at the end of a selector' % pseudo_element) if peek.type == 'HASH': result = Hash(result, stream.next().value) elif peek == ('DELIM', '.'): stream.next() result = Class(result, stream.next_ident()) elif peek == ('DELIM', '|'): stream.next() result = Element(None, stream.next_ident()) elif peek == ('DELIM', '['): stream.next() result = parse_attrib(result, stream) elif peek == ('DELIM', ':'): stream.next() if stream.peek() == ('DELIM', ':'): stream.next() pseudo_element = stream.next_ident() if stream.peek() == ('DELIM', '('): stream.next() pseudo_element = FunctionalPseudoElement( pseudo_element, parse_arguments(stream)) continue ident = stream.next_ident() if ident.lower() in ('first-line', 'first-letter', 'before', 'after'): # Special case: CSS 2.1 pseudo-elements can have a single ':' # Any new pseudo-element must have two. pseudo_element = _unicode(ident) continue if stream.peek() != ('DELIM', '('): result = Pseudo(result, ident) if result.__repr__() == 'Pseudo[Element[*]:scope]': if not (len(stream.used) == 2 or (len(stream.used) == 3 and stream.used[0].type == 'S')): raise SelectorSyntaxError( 'Got immediate child pseudo-element ":scope" ' 'not at the start of a selector') continue stream.next() stream.skip_whitespace() if ident.lower() == 'not': if inside_negation: raise SelectorSyntaxError('Got nested :not()') argument, argument_pseudo_element = parse_simple_selector( stream, inside_negation=True) next = stream.next() if argument_pseudo_element: raise SelectorSyntaxError( 'Got pseudo-element ::%s inside :not() at %s' % (argument_pseudo_element, next.pos)) if next != ('DELIM', ')'): raise SelectorSyntaxError("Expected ')', got %s" % (next,)) result = Negation(result, argument) else: result = Function(result, ident, parse_arguments(stream)) else: raise SelectorSyntaxError( "Expected selector, got %s" % (peek,)) if len(stream.used) == selector_start: raise SelectorSyntaxError( "Expected selector, got %s" % (stream.peek(),)) return result, pseudo_element def parse_arguments(stream): arguments = [] while 1: stream.skip_whitespace() next = stream.next() if next.type in ('IDENT', 'STRING', 'NUMBER') or next in [ ('DELIM', '+'), ('DELIM', '-')]: arguments.append(next) elif next == ('DELIM', ')'): return arguments else: raise SelectorSyntaxError( "Expected an argument, got %s" % (next,)) def parse_attrib(selector, stream): stream.skip_whitespace() attrib = stream.next_ident_or_star() if attrib is None and stream.peek() != ('DELIM', '|'): raise SelectorSyntaxError( "Expected '|', got %s" % (stream.peek(),)) if stream.peek() == ('DELIM', '|'): stream.next() if stream.peek() == ('DELIM', '='): namespace = None stream.next() op = '|=' else: namespace = attrib attrib = stream.next_ident() op = None else: namespace = op = None if op is None: stream.skip_whitespace() next = stream.next() if next == ('DELIM', ']'): return Attrib(selector, namespace, attrib, 'exists', None) elif next == ('DELIM', '='): op = '=' elif next.is_delim('^', '$', '*', '~', '|', '!') and ( stream.peek() == ('DELIM', '=')): op = next.value + '=' stream.next() else: raise SelectorSyntaxError( "Operator expected, got %s" % (next,)) stream.skip_whitespace() value = stream.next() if value.type not in ('IDENT', 'STRING'): raise SelectorSyntaxError( "Expected string or ident, got %s" % (value,)) stream.skip_whitespace() next = stream.next() if next != ('DELIM', ']'): raise SelectorSyntaxError( "Expected ']', got %s" % (next,)) return Attrib(selector, namespace, attrib, op, value) def parse_series(tokens): """ Parses the arguments for :nth-child() and friends. :raises: A list of tokens :returns: :``(a, b)`` """ for token in tokens: if token.type == 'STRING': raise ValueError('String tokens not allowed in series.') s = ''.join(token.value for token in tokens).strip() if s == 'odd': return 2, 1 elif s == 'even': return 2, 0 elif s == 'n': return 1, 0 if 'n' not in s: # Just b return 0, int(s) a, b = s.split('n', 1) if not a: a = 1 elif a == '-' or a == '+': a = int(a+'1') else: a = int(a) if not b: b = 0 else: b = int(b) return a, b #### Token objects class Token(tuple): def __new__(cls, type_, value, pos): obj = tuple.__new__(cls, (type_, value)) obj.pos = pos return obj def __repr__(self): return "<%s '%s' at %i>" % (self.type, self.value, self.pos) def is_delim(self, *values): return self.type == 'DELIM' and self.value in values type = property(operator.itemgetter(0)) value = property(operator.itemgetter(1)) def css(self): if self.type == 'STRING': return repr(self.value) else: return self.value class EOFToken(Token): def __new__(cls, pos): return Token.__new__(cls, 'EOF', None, pos) def __repr__(self): return '<%s at %i>' % (self.type, self.pos) #### Tokenizer class TokenMacros: unicode_escape = r'\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?' escape = unicode_escape + r'|\\[^\n\r\f0-9a-f]' string_escape = r'\\(?:\n|\r\n|\r|\f)|' + escape nonascii = r'[^\0-\177]' nmchar = '[_a-z0-9-]|%s|%s' % (escape, nonascii) nmstart = '[_a-z]|%s|%s' % (escape, nonascii) def _compile(pattern): return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match _match_whitespace = _compile(r'[ \t\r\n\f]+') _match_number = _compile(r'[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)') _match_hash = _compile('#(?:%(nmchar)s)+') _match_ident = _compile('-?(?:%(nmstart)s)(?:%(nmchar)s)*') _match_string_by_quote = { "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"), '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'), } _sub_simple_escape = re.compile(r'\\(.)').sub _sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.I).sub _sub_newline_escape =re.compile(r'\\(?:\n|\r\n|\r|\f)').sub # Same as r'\1', but faster on CPython _replace_simple = operator.methodcaller('group', 1) def _replace_unicode(match): codepoint = int(match.group(1), 16) if codepoint > sys.maxunicode: codepoint = 0xFFFD return _unichr(codepoint) def unescape_ident(value): value = _sub_unicode_escape(_replace_unicode, value) value = _sub_simple_escape(_replace_simple, value) return value def tokenize(s): pos = 0 len_s = len(s) while pos < len_s: match = _match_whitespace(s, pos=pos) if match: yield Token('S', ' ', pos) pos = match.end() continue match = _match_ident(s, pos=pos) if match: value = _sub_simple_escape(_replace_simple, _sub_unicode_escape(_replace_unicode, match.group())) yield Token('IDENT', value, pos) pos = match.end() continue match = _match_hash(s, pos=pos) if match: value = _sub_simple_escape(_replace_simple, _sub_unicode_escape(_replace_unicode, match.group()[1:])) yield Token('HASH', value, pos) pos = match.end() continue quote = s[pos] if quote in _match_string_by_quote: match = _match_string_by_quote[quote](s, pos=pos + 1) assert match, 'Should have found at least an empty match' end_pos = match.end() if end_pos == len_s: raise SelectorSyntaxError('Unclosed string at %s' % pos) if s[end_pos] != quote: raise SelectorSyntaxError('Invalid string at %s' % pos) value = _sub_simple_escape(_replace_simple, _sub_unicode_escape(_replace_unicode, _sub_newline_escape('', match.group()))) yield Token('STRING', value, pos) pos = end_pos + 1 continue match = _match_number(s, pos=pos) if match: value = match.group() yield Token('NUMBER', value, pos) pos = match.end() continue pos2 = pos + 2 if s[pos:pos2] == '/*': pos = s.find('*/', pos2) if pos == -1: pos = len_s else: pos += 2 continue yield Token('DELIM', s[pos], pos) pos += 1 assert pos == len_s yield EOFToken(pos) class TokenStream(object): def __init__(self, tokens, source=None): self.used = [] self.tokens = iter(tokens) self.source = source self.peeked = None self._peeking = False try: self.next_token = self.tokens.next except AttributeError: # Python 3 self.next_token = self.tokens.__next__ def next(self): if self._peeking: self._peeking = False self.used.append(self.peeked) return self.peeked else: next = self.next_token() self.used.append(next) return next def peek(self): if not self._peeking: self.peeked = self.next_token() self._peeking = True return self.peeked def next_ident(self): next = self.next() if next.type != 'IDENT': raise SelectorSyntaxError('Expected ident, got %s' % (next,)) return next.value def next_ident_or_star(self): next = self.next() if next.type == 'IDENT': return next.value elif next == ('DELIM', '*'): return None else: raise SelectorSyntaxError( "Expected ident or '*', got %s" % (next,)) def skip_whitespace(self): peek = self.peek() if peek.type == 'S': self.next() cssselect-1.1.0/cssselect/xpath.py0000664000372000037200000006714113523235306020044 0ustar travistravis00000000000000# -*- coding: utf-8 -*- """ cssselect.xpath =============== Translation of parsed CSS selectors to XPath expressions. :copyright: (c) 2007-2012 Ian Bicking and contributors. See AUTHORS for more details. :license: BSD, see LICENSE for more details. """ import sys import re from cssselect.parser import parse, parse_series, SelectorError if sys.version_info[0] < 3: _basestring = basestring _unicode = unicode else: _basestring = str _unicode = str def _unicode_safe_getattr(obj, name, default=None): # getattr() with a non-ASCII name fails on Python 2.x name = name.encode('ascii', 'replace').decode('ascii') return getattr(obj, name, default) class ExpressionError(SelectorError, RuntimeError): """Unknown or unsupported selector (eg. pseudo-class).""" #### XPath Helpers class XPathExpr(object): def __init__(self, path='', element='*', condition='', star_prefix=False): self.path = path self.element = element self.condition = condition def __str__(self): path = _unicode(self.path) + _unicode(self.element) if self.condition: path += '[%s]' % self.condition return path def __repr__(self): return '%s[%s]' % (self.__class__.__name__, self) def add_condition(self, condition): if self.condition: self.condition = '%s and (%s)' % (self.condition, condition) else: self.condition = condition return self def add_name_test(self): if self.element == '*': # We weren't doing a test anyway return self.add_condition( "name() = %s" % GenericTranslator.xpath_literal(self.element)) self.element = '*' def add_star_prefix(self): """ Append '*/' to the path to keep the context constrained to a single parent. """ self.path += '*/' def join(self, combiner, other): path = _unicode(self) + combiner # Any "star prefix" is redundant when joining. if other.path != '*/': path += other.path self.path = path self.element = other.element self.condition = other.condition return self split_at_single_quotes = re.compile("('+)").split # The spec is actually more permissive than that, but don’t bother. # This is just for the fast path. # http://www.w3.org/TR/REC-xml/#NT-NameStartChar is_safe_name = re.compile('^[a-zA-Z_][a-zA-Z0-9_.-]*$').match # Test that the string is not empty and does not contain whitespace is_non_whitespace = re.compile(r'^[^ \t\r\n\f]+$').match #### Translation class GenericTranslator(object): """ Translator for "generic" XML documents. Everything is case-sensitive, no assumption is made on the meaning of element names and attribute names. """ #### #### HERE BE DRAGONS #### #### You are welcome to hook into this to change some behavior, #### but do so at your own risks. #### Until it has received a lot more work and review, #### I reserve the right to change this API in backward-incompatible ways #### with any minor version of cssselect. #### See https://github.com/scrapy/cssselect/pull/22 #### -- Simon Sapin. #### combinator_mapping = { ' ': 'descendant', '>': 'child', '+': 'direct_adjacent', '~': 'indirect_adjacent', } attribute_operator_mapping = { 'exists': 'exists', '=': 'equals', '~=': 'includes', '|=': 'dashmatch', '^=': 'prefixmatch', '$=': 'suffixmatch', '*=': 'substringmatch', '!=': 'different', # XXX Not in Level 3 but meh } #: The attribute used for ID selectors depends on the document language: #: http://www.w3.org/TR/selectors/#id-selectors id_attribute = 'id' #: The attribute used for ``:lang()`` depends on the document language: #: http://www.w3.org/TR/selectors/#lang-pseudo lang_attribute = 'xml:lang' #: The case sensitivity of document language element names, #: attribute names, and attribute values in selectors depends #: on the document language. #: http://www.w3.org/TR/selectors/#casesens #: #: When a document language defines one of these as case-insensitive, #: cssselect assumes that the document parser makes the parsed values #: lower-case. Making the selector lower-case too makes the comparaison #: case-insensitive. #: #: In HTML, element names and attributes names (but not attribute values) #: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4 #: and HTMLParser make them lower-case in their parse result, so #: the assumption holds. lower_case_element_names = False lower_case_attribute_names = False lower_case_attribute_values = False # class used to represent and xpath expression xpathexpr_cls = XPathExpr def css_to_xpath(self, css, prefix='descendant-or-self::'): """Translate a *group of selectors* to XPath. Pseudo-elements are not supported here since XPath only knows about "real" elements. :param css: A *group of selectors* as an Unicode string. :param prefix: This string is prepended to the XPath expression for each selector. The default makes selectors scoped to the context node’s subtree. :raises: :class:`SelectorSyntaxError` on invalid selectors, :class:`ExpressionError` on unknown/unsupported selectors, including pseudo-elements. :returns: The equivalent XPath 1.0 expression as an Unicode string. """ return ' | '.join(self.selector_to_xpath(selector, prefix, translate_pseudo_elements=True) for selector in parse(css)) def selector_to_xpath(self, selector, prefix='descendant-or-self::', translate_pseudo_elements=False): """Translate a parsed selector to XPath. :param selector: A parsed :class:`Selector` object. :param prefix: This string is prepended to the resulting XPath expression. The default makes selectors scoped to the context node’s subtree. :param translate_pseudo_elements: Unless this is set to ``True`` (as :meth:`css_to_xpath` does), the :attr:`~Selector.pseudo_element` attribute of the selector is ignored. It is the caller's responsibility to reject selectors with pseudo-elements, or to account for them somehow. :raises: :class:`ExpressionError` on unknown/unsupported selectors. :returns: The equivalent XPath 1.0 expression as an Unicode string. """ tree = getattr(selector, 'parsed_tree', None) if not tree: raise TypeError('Expected a parsed selector, got %r' % (selector,)) xpath = self.xpath(tree) assert isinstance(xpath, self.xpathexpr_cls) # help debug a missing 'return' if translate_pseudo_elements and selector.pseudo_element: xpath = self.xpath_pseudo_element(xpath, selector.pseudo_element) return (prefix or '') + _unicode(xpath) def xpath_pseudo_element(self, xpath, pseudo_element): """Translate a pseudo-element. Defaults to not supporting pseudo-elements at all, but can be overridden by sub-classes. """ raise ExpressionError('Pseudo-elements are not supported.') @staticmethod def xpath_literal(s): s = _unicode(s) if "'" not in s: s = "'%s'" % s elif '"' not in s: s = '"%s"' % s else: s = "concat(%s)" % ','.join([ (("'" in part) and '"%s"' or "'%s'") % part for part in split_at_single_quotes(s) if part ]) return s def xpath(self, parsed_selector): """Translate any parsed selector object.""" type_name = type(parsed_selector).__name__ method = getattr(self, 'xpath_%s' % type_name.lower(), None) if method is None: raise ExpressionError('%s is not supported.' % type_name) return method(parsed_selector) # Dispatched by parsed object type def xpath_combinedselector(self, combined): """Translate a combined selector.""" combinator = self.combinator_mapping[combined.combinator] method = getattr(self, 'xpath_%s_combinator' % combinator) return method(self.xpath(combined.selector), self.xpath(combined.subselector)) def xpath_negation(self, negation): xpath = self.xpath(negation.selector) sub_xpath = self.xpath(negation.subselector) sub_xpath.add_name_test() if sub_xpath.condition: return xpath.add_condition('not(%s)' % sub_xpath.condition) else: return xpath.add_condition('0') def xpath_function(self, function): """Translate a functional pseudo-class.""" method = 'xpath_%s_function' % function.name.replace('-', '_') method = _unicode_safe_getattr(self, method, None) if not method: raise ExpressionError( "The pseudo-class :%s() is unknown" % function.name) return method(self.xpath(function.selector), function) def xpath_pseudo(self, pseudo): """Translate a pseudo-class.""" method = 'xpath_%s_pseudo' % pseudo.ident.replace('-', '_') method = _unicode_safe_getattr(self, method, None) if not method: # TODO: better error message for pseudo-elements? raise ExpressionError( "The pseudo-class :%s is unknown" % pseudo.ident) return method(self.xpath(pseudo.selector)) def xpath_attrib(self, selector): """Translate an attribute selector.""" operator = self.attribute_operator_mapping[selector.operator] method = getattr(self, 'xpath_attrib_%s' % operator) if self.lower_case_attribute_names: name = selector.attrib.lower() else: name = selector.attrib safe = is_safe_name(name) if selector.namespace: name = '%s:%s' % (selector.namespace, name) safe = safe and is_safe_name(selector.namespace) if safe: attrib = '@' + name else: attrib = 'attribute::*[name() = %s]' % self.xpath_literal(name) if selector.value is None: value = None elif self.lower_case_attribute_values: value = selector.value.value.lower() else: value = selector.value.value return method(self.xpath(selector.selector), attrib, value) def xpath_class(self, class_selector): """Translate a class selector.""" # .foo is defined as [class~=foo] in the spec. xpath = self.xpath(class_selector.selector) return self.xpath_attrib_includes( xpath, '@class', class_selector.class_name) def xpath_hash(self, id_selector): """Translate an ID selector.""" xpath = self.xpath(id_selector.selector) return self.xpath_attrib_equals(xpath, '@id', id_selector.id) def xpath_element(self, selector): """Translate a type or universal selector.""" element = selector.element if not element: element = '*' safe = True else: safe = is_safe_name(element) if self.lower_case_element_names: element = element.lower() if selector.namespace: # Namespace prefixes are case-sensitive. # http://www.w3.org/TR/css3-namespace/#prefixes element = '%s:%s' % (selector.namespace, element) safe = safe and is_safe_name(selector.namespace) xpath = self.xpathexpr_cls(element=element) if not safe: xpath.add_name_test() return xpath # CombinedSelector: dispatch by combinator def xpath_descendant_combinator(self, left, right): """right is a child, grand-child or further descendant of left""" return left.join('/descendant-or-self::*/', right) def xpath_child_combinator(self, left, right): """right is an immediate child of left""" return left.join('/', right) def xpath_direct_adjacent_combinator(self, left, right): """right is a sibling immediately after left""" xpath = left.join('/following-sibling::', right) xpath.add_name_test() return xpath.add_condition('position() = 1') def xpath_indirect_adjacent_combinator(self, left, right): """right is a sibling after left, immediately or not""" return left.join('/following-sibling::', right) # Function: dispatch by function/pseudo-class name def xpath_nth_child_function(self, xpath, function, last=False, add_name_test=True): try: a, b = parse_series(function.arguments) except ValueError: raise ExpressionError("Invalid series: '%r'" % function.arguments) # From https://www.w3.org/TR/css3-selectors/#structural-pseudos: # # :nth-child(an+b) # an+b-1 siblings before # # :nth-last-child(an+b) # an+b-1 siblings after # # :nth-of-type(an+b) # an+b-1 siblings with the same expanded element name before # # :nth-last-of-type(an+b) # an+b-1 siblings with the same expanded element name after # # So, # for :nth-child and :nth-of-type # # count(preceding-sibling::) = an+b-1 # # for :nth-last-child and :nth-last-of-type # # count(following-sibling::) = an+b-1 # # therefore, # count(...) - (b-1) ≡ 0 (mod a) # # if a == 0: # ~~~~~~~~~~ # count(...) = b-1 # # if a < 0: # ~~~~~~~~~ # count(...) - b +1 <= 0 # -> count(...) <= b-1 # # if a > 0: # ~~~~~~~~~ # count(...) - b +1 >= 0 # -> count(...) >= b-1 # work with b-1 instead b_min_1 = b - 1 # early-exit condition 1: # ~~~~~~~~~~~~~~~~~~~~~~~ # for a == 1, nth-*(an+b) means n+b-1 siblings before/after, # and since n ∈ {0, 1, 2, ...}, if b-1<=0, # there is always an "n" matching any number of siblings (maybe none) if a == 1 and b_min_1 <=0: return xpath # early-exit condition 2: # ~~~~~~~~~~~~~~~~~~~~~~~ # an+b-1 siblings with a<0 and (b-1)<0 is not possible if a < 0 and b_min_1 < 0: return xpath.add_condition('0') # `add_name_test` boolean is inverted and somewhat counter-intuitive: # # nth_of_type() calls nth_child(add_name_test=False) if add_name_test: nodetest = '*' else: nodetest = '%s' % xpath.element # count siblings before or after the element if not last: siblings_count = 'count(preceding-sibling::%s)' % nodetest else: siblings_count = 'count(following-sibling::%s)' % nodetest # special case of fixed position: nth-*(0n+b) # if a == 0: # ~~~~~~~~~~ # count(***-sibling::***) = b-1 if a == 0: return xpath.add_condition('%s = %s' % (siblings_count, b_min_1)) expr = [] if a > 0: # siblings count, an+b-1, is always >= 0, # so if a>0, and (b-1)<=0, an "n" exists to satisfy this, # therefore, the predicate is only interesting if (b-1)>0 if b_min_1 > 0: expr.append('%s >= %s' % (siblings_count, b_min_1)) else: # if a<0, and (b-1)<0, no "n" satisfies this, # this is tested above as an early exist condition # otherwise, expr.append('%s <= %s' % (siblings_count, b_min_1)) # operations modulo 1 or -1 are simpler, one only needs to verify: # # - either: # count(***-sibling::***) - (b-1) = n = 0, 1, 2, 3, etc., # i.e. count(***-sibling::***) >= (b-1) # # - or: # count(***-sibling::***) - (b-1) = -n = 0, -1, -2, -3, etc., # i.e. count(***-sibling::***) <= (b-1) # we we just did above. # if abs(a) != 1: # count(***-sibling::***) - (b-1) ≡ 0 (mod a) left = siblings_count # apply "modulo a" on 2nd term, -(b-1), # to simplify things like "(... +6) % -3", # and also make it positive with |a| b_neg = (-b_min_1) % abs(a) if b_neg != 0: b_neg = '+%s' % b_neg left = '(%s %s)' % (left, b_neg) expr.append('%s mod %s = 0' % (left, a)) xpath.add_condition(' and '.join(expr)) return xpath def xpath_nth_last_child_function(self, xpath, function): return self.xpath_nth_child_function(xpath, function, last=True) def xpath_nth_of_type_function(self, xpath, function): if xpath.element == '*': raise ExpressionError( "*:nth-of-type() is not implemented") return self.xpath_nth_child_function(xpath, function, add_name_test=False) def xpath_nth_last_of_type_function(self, xpath, function): if xpath.element == '*': raise ExpressionError( "*:nth-of-type() is not implemented") return self.xpath_nth_child_function(xpath, function, last=True, add_name_test=False) def xpath_contains_function(self, xpath, function): # Defined there, removed in later drafts: # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors if function.argument_types() not in (['STRING'], ['IDENT']): raise ExpressionError( "Expected a single string or ident for :contains(), got %r" % function.arguments) value = function.arguments[0].value return xpath.add_condition( 'contains(., %s)' % self.xpath_literal(value)) def xpath_lang_function(self, xpath, function): if function.argument_types() not in (['STRING'], ['IDENT']): raise ExpressionError( "Expected a single string or ident for :lang(), got %r" % function.arguments) value = function.arguments[0].value return xpath.add_condition( "lang(%s)" % (self.xpath_literal(value))) # Pseudo: dispatch by pseudo-class name def xpath_root_pseudo(self, xpath): return xpath.add_condition("not(parent::*)") # CSS immediate children (CSS ":scope > div" to XPath "child::div" or "./div") # Works only at the start of a selector # Needed to get immediate children of a processed selector in Scrapy # for product in response.css('.product'): # description = product.css(':scope > div::text').get() def xpath_scope_pseudo(self, xpath): return xpath.add_condition("1") def xpath_first_child_pseudo(self, xpath): return xpath.add_condition('count(preceding-sibling::*) = 0') def xpath_last_child_pseudo(self, xpath): return xpath.add_condition('count(following-sibling::*) = 0') def xpath_first_of_type_pseudo(self, xpath): if xpath.element == '*': raise ExpressionError( "*:first-of-type is not implemented") return xpath.add_condition('count(preceding-sibling::%s) = 0' % xpath.element) def xpath_last_of_type_pseudo(self, xpath): if xpath.element == '*': raise ExpressionError( "*:last-of-type is not implemented") return xpath.add_condition('count(following-sibling::%s) = 0' % xpath.element) def xpath_only_child_pseudo(self, xpath): return xpath.add_condition('count(parent::*/child::*) = 1') def xpath_only_of_type_pseudo(self, xpath): if xpath.element == '*': raise ExpressionError( "*:only-of-type is not implemented") return xpath.add_condition('count(parent::*/child::%s) = 1' % xpath.element) def xpath_empty_pseudo(self, xpath): return xpath.add_condition("not(*) and not(string-length())") def pseudo_never_matches(self, xpath): """Common implementation for pseudo-classes that never match.""" return xpath.add_condition("0") xpath_link_pseudo = pseudo_never_matches xpath_visited_pseudo = pseudo_never_matches xpath_hover_pseudo = pseudo_never_matches xpath_active_pseudo = pseudo_never_matches xpath_focus_pseudo = pseudo_never_matches xpath_target_pseudo = pseudo_never_matches xpath_enabled_pseudo = pseudo_never_matches xpath_disabled_pseudo = pseudo_never_matches xpath_checked_pseudo = pseudo_never_matches # Attrib: dispatch by attribute operator def xpath_attrib_exists(self, xpath, name, value): assert not value xpath.add_condition(name) return xpath def xpath_attrib_equals(self, xpath, name, value): xpath.add_condition('%s = %s' % (name, self.xpath_literal(value))) return xpath def xpath_attrib_different(self, xpath, name, value): # FIXME: this seems like a weird hack... if value: xpath.add_condition('not(%s) or %s != %s' % (name, name, self.xpath_literal(value))) else: xpath.add_condition('%s != %s' % (name, self.xpath_literal(value))) return xpath def xpath_attrib_includes(self, xpath, name, value): if is_non_whitespace(value): xpath.add_condition( "%s and contains(concat(' ', normalize-space(%s), ' '), %s)" % (name, name, self.xpath_literal(' '+value+' '))) else: xpath.add_condition('0') return xpath def xpath_attrib_dashmatch(self, xpath, name, value): # Weird, but true... xpath.add_condition('%s and (%s = %s or starts-with(%s, %s))' % ( name, name, self.xpath_literal(value), name, self.xpath_literal(value + '-'))) return xpath def xpath_attrib_prefixmatch(self, xpath, name, value): if value: xpath.add_condition('%s and starts-with(%s, %s)' % ( name, name, self.xpath_literal(value))) else: xpath.add_condition('0') return xpath def xpath_attrib_suffixmatch(self, xpath, name, value): if value: # Oddly there is a starts-with in XPath 1.0, but not ends-with xpath.add_condition( '%s and substring(%s, string-length(%s)-%s) = %s' % (name, name, name, len(value)-1, self.xpath_literal(value))) else: xpath.add_condition('0') return xpath def xpath_attrib_substringmatch(self, xpath, name, value): if value: # Attribute selectors are case sensitive xpath.add_condition('%s and contains(%s, %s)' % ( name, name, self.xpath_literal(value))) else: xpath.add_condition('0') return xpath class HTMLTranslator(GenericTranslator): """ Translator for (X)HTML documents. Has a more useful implementation of some pseudo-classes based on HTML-specific element names and attribute names, as described in the `HTML5 specification`_. It assumes no-quirks mode. The API is the same as :class:`GenericTranslator`. .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors :param xhtml: If false (the default), element names and attribute names are case-insensitive. """ lang_attribute = 'lang' def __init__(self, xhtml=False): self.xhtml = xhtml # Might be useful for sub-classes? if not xhtml: # See their definition in GenericTranslator. self.lower_case_element_names = True self.lower_case_attribute_names = True def xpath_checked_pseudo(self, xpath): # FIXME: is this really all the elements? return xpath.add_condition( "(@selected and name(.) = 'option') or " "(@checked " "and (name(.) = 'input' or name(.) = 'command')" "and (@type = 'checkbox' or @type = 'radio'))") def xpath_lang_function(self, xpath, function): if function.argument_types() not in (['STRING'], ['IDENT']): raise ExpressionError( "Expected a single string or ident for :lang(), got %r" % function.arguments) value = function.arguments[0].value return xpath.add_condition( "ancestor-or-self::*[@lang][1][starts-with(concat(" # XPath 1.0 has no lower-case function... "translate(@%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', " "'abcdefghijklmnopqrstuvwxyz'), " "'-'), %s)]" % (self.lang_attribute, self.xpath_literal(value.lower() + '-'))) def xpath_link_pseudo(self, xpath): return xpath.add_condition("@href and " "(name(.) = 'a' or name(.) = 'link' or name(.) = 'area')") # Links are never visited, the implementation for :visited is the same # as in GenericTranslator def xpath_disabled_pseudo(self, xpath): # http://www.w3.org/TR/html5/section-index.html#attributes-1 return xpath.add_condition(''' ( @disabled and ( (name(.) = 'input' and @type != 'hidden') or name(.) = 'button' or name(.) = 'select' or name(.) = 'textarea' or name(.) = 'command' or name(.) = 'fieldset' or name(.) = 'optgroup' or name(.) = 'option' ) ) or ( ( (name(.) = 'input' and @type != 'hidden') or name(.) = 'button' or name(.) = 'select' or name(.) = 'textarea' ) and ancestor::fieldset[@disabled] ) ''') # FIXME: in the second half, add "and is not a descendant of that # fieldset element's first legend element child, if any." def xpath_enabled_pseudo(self, xpath): # http://www.w3.org/TR/html5/section-index.html#attributes-1 return xpath.add_condition(''' ( @href and ( name(.) = 'a' or name(.) = 'link' or name(.) = 'area' ) ) or ( ( name(.) = 'command' or name(.) = 'fieldset' or name(.) = 'optgroup' ) and not(@disabled) ) or ( ( (name(.) = 'input' and @type != 'hidden') or name(.) = 'button' or name(.) = 'select' or name(.) = 'textarea' or name(.) = 'keygen' ) and not (@disabled or ancestor::fieldset[@disabled]) ) or ( name(.) = 'option' and not( @disabled or ancestor::optgroup[@disabled] ) ) ''') # FIXME: ... or "li elements that are children of menu elements, # and that have a child element that defines a command, if the first # such element's Disabled State facet is false (not disabled)". # FIXME: after ancestor::fieldset[@disabled], add "and is not a # descendant of that fieldset element's first legend element child, # if any." cssselect-1.1.0/cssselect/__init__.py0000664000372000037200000000117713523235306020454 0ustar travistravis00000000000000# -*- coding: utf-8 -*- """ CSS Selectors based on XPath ============================ This module supports selecting XML/HTML elements based on CSS selectors. See the `CSSSelector` class for details. :copyright: (c) 2007-2012 Ian Bicking and contributors. See AUTHORS for more details. :license: BSD, see LICENSE for more details. """ from cssselect.parser import (parse, Selector, FunctionalPseudoElement, SelectorError, SelectorSyntaxError) from cssselect.xpath import GenericTranslator, HTMLTranslator, ExpressionError VERSION = '1.1.0' __version__ = VERSION cssselect-1.1.0/MANIFEST.in0000664000372000037200000000015213523235306016101 0ustar travistravis00000000000000include AUTHORS CHANGES LICENSE README.rst tox.ini .coveragerc recursive-include docs * prune docs/_build cssselect-1.1.0/docs/0000775000372000037200000000000013523235335015277 5ustar travistravis00000000000000cssselect-1.1.0/docs/conf.py0000664000372000037200000001777313523235306016613 0ustar travistravis00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # cssselect documentation build configuration file, created by # sphinx-quickstart on Tue Mar 27 14:20:34 2012. # # This file is execfile()d with the current directory set to its containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import sys, os, re # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.insert(0, os.path.abspath('.')) # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. #needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.doctest'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. #source_encoding = 'utf-8-sig' # The master toctree document. master_doc = 'index' # General information about the project. project = 'cssselect' copyright = '2012-2017, Simon Sapin, Scrapy developers' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The full version, including alpha/beta/rc tags. init_py = open(os.path.join(os.path.dirname(__file__), '..', 'cssselect', '__init__.py')).read() release = re.search("VERSION = '([^']+)'", init_py).group(1) # The short X.Y version. version = release.rstrip('dev') # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. #language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. #today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = ['_build'] # The reST default role (used for this markup: `text`) to use for all documents. #default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. #add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). #add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. #show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. #modindex_common_prefix = [] # -- Options for HTML output --------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. html_theme = 'classic' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. #html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. #html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". #html_title = None # A shorter title for the navigation bar. Default is the same as html_title. #html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. #html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. #html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". #html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. #html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. #html_use_smartypants = True # Custom sidebar templates, maps document names to template names. #html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. #html_additional_pages = {} # If false, no module index is generated. #html_domain_indices = True # If false, no index is generated. #html_use_index = True # If true, the index is split into individual pages for each letter. #html_split_index = False # If true, links to the reST sources are added to the pages. #html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. #html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. #html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. #html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = None # Output file base name for HTML help builder. htmlhelp_basename = 'cssselectdoc' # -- Options for LaTeX output -------------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). #'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). #'pointsize': '10pt', # Additional stuff for the LaTeX preamble. #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ ('index', 'cssselect.tex', 'cssselect Documentation', 'Simon Sapin', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. #latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. #latex_use_parts = False # If true, show page references after internal links. #latex_show_pagerefs = False # If true, show URL addresses after external links. #latex_show_urls = False # Documents to append as an appendix to all manuals. #latex_appendices = [] # If false, no module index is generated. #latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ ('index', 'cssselect', 'cssselect Documentation', ['Simon Sapin'], 1) ] # If true, show URL addresses after external links. #man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ('index', 'cssselect', 'cssselect Documentation', 'Simon Sapin', 'cssselect', 'One line description of project.', 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. #texinfo_appendices = [] # If false, no module index is generated. #texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. #texinfo_show_urls = 'footnote' # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = {'http://docs.python.org/': None} cssselect-1.1.0/docs/index.rst0000664000372000037200000001147713523235306017150 0ustar travistravis00000000000000.. module:: cssselect .. include:: ../README.rst .. contents:: Contents :local: :depth: 1 Quickstart ========== Use :class:`HTMLTranslator` for HTML documents, :class:`GenericTranslator` for "generic" XML documents. (The former has a more useful translation for some selectors, based on HTML-specific element types or attributes.) .. sourcecode:: pycon >>> from cssselect import GenericTranslator, SelectorError >>> try: ... expression = GenericTranslator().css_to_xpath('div.content') ... except SelectorError: ... print('Invalid selector.') ... >>> print(expression) descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' content ')] The resulting expression can be used with lxml's `XPath engine`_: .. _XPath engine: http://lxml.de/xpathxslt.html#xpath .. sourcecode:: pycon >>> from lxml.etree import fromstring >>> document = fromstring(''' ...
...
text
...
... ''') >>> [e.get('id') for e in document.xpath(expression)] ['inner'] User API ======== In CSS3 Selectors terms, the top-level object is a `group of selectors`_, a sequence of comma-separated selectors. For example, ``div, h1.title + p`` is a group of two selectors. .. _group of selectors: http://www.w3.org/TR/selectors/#grouping .. autofunction:: parse .. autoclass:: Selector() :members: .. autoclass:: FunctionalPseudoElement .. autoclass:: GenericTranslator :members: css_to_xpath, selector_to_xpath .. autoclass:: HTMLTranslator Exceptions ---------- .. autoexception:: SelectorError .. autoexception:: SelectorSyntaxError .. autoexception:: ExpressionError Supported selectors =================== This library implements CSS3 selectors as described in `the W3C specification `_. In this context however, there is no interactivity or history of visited links. Therefore, these pseudo-classes are accepted but never match anything: * ``:hover`` * ``:active`` * ``:focus`` * ``:target`` * ``:visited`` Additionally, these depend on document knowledge and only have a useful implementation in :class:`HTMLTranslator`. In :class:`GenericTranslator`, they never match: * ``:link`` * ``:enabled`` * ``:disabled`` * ``:checked`` These applicable pseudo-classes are not yet implemented: * ``*:first-of-type``, ``*:last-of-type``, ``*:nth-of-type``, ``*:nth-last-of-type``, ``*:only-of-type``. All of these work when you specify an element type, but not with ``*`` On the other hand, *cssselect* supports some selectors that are not in the Level 3 specification: * The ``:contains(text)`` pseudo-class that existed in `an early draft`_ but was then removed. * The ``!=`` attribute operator. ``[foo!=bar]`` is the same as ``:not([foo=bar])`` * ``:not()`` accepts a *sequence of simple selectors*, not just single *simple selector*. For example, ``:not(a.important[rel])`` is allowed, even though the negation contains 3 *simple selectors*. * ``:scope`` allows to access immediate children of a selector: ``product.css(':scope > div::text')``, simillar to XPath ``child::div``. Must be used at the start of a selector. Simplified version of `level 4 reference`_. .. _an early draft: http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors .. _level 4 reference: https://developer.mozilla.org/en-US/docs/Web/CSS/:scope .. The following claim was copied from lxml: """ XPath has underspecified string quoting rules (there seems to be no string quoting at all), so if you use expressions that contain characters that requiring quoting you might have problems with the translation from CSS to XPath. """ It seems "string quoting" meant "quote escaping". There is indeed no quote escaping, but the xpath_literal method handles this. It should not be a problem anymore. Customizing the translation =========================== Just like :class:`HTMLTranslator` is a subclass of :class:`GenericTranslator`, you can make new sub-classes of either of them and override some methods. This enables you, for example, to customize how some pseudo-class is implemented without forking or monkey-patching cssselect. The "customization API" is the set of methods in translation classes and their signature. You can look at the `source code`_ to see how it works. However, be aware that this API is not very stable yet. It might change and break your sub-class. .. _source code: https://github.com/scrapy/cssselect/blob/master/cssselect/xpath.py Namespaces ========== In CSS you can use ``namespace-prefix|element``, similar to ``namespace-prefix:element`` in an XPath expression. In fact, it maps one-to-one. How prefixes are mapped to namespace URIs depends on the XPath implementation. .. include:: ../CHANGES cssselect-1.1.0/PKG-INFO0000664000372000037200000000515713523235335015454 0ustar travistravis00000000000000Metadata-Version: 1.2 Name: cssselect Version: 1.1.0 Summary: cssselect parses CSS3 Selectors and translates them to XPath 1.0 Home-page: https://github.com/scrapy/cssselect Author: Ian Bicking Author-email: ianb@colorstudy.com Maintainer: Paul Tremberth Maintainer-email: paul.tremberth@gmail.com License: BSD Description: =================================== cssselect: CSS Selectors for Python =================================== .. image:: https://img.shields.io/pypi/v/cssselect.svg :target: https://pypi.python.org/pypi/cssselect :alt: PyPI Version .. image:: https://img.shields.io/pypi/pyversions/cssselect.svg :target: https://pypi.python.org/pypi/cssselect :alt: Supported Python Versions .. image:: https://img.shields.io/travis/scrapy/cssselect/master.svg :target: https://travis-ci.org/scrapy/cssselect :alt: Build Status .. image:: https://img.shields.io/codecov/c/github/scrapy/cssselect/master.svg :target: https://codecov.io/github/scrapy/cssselect?branch=master :alt: Coverage report *cssselect* parses `CSS3 Selectors`_ and translate them to `XPath 1.0`_ expressions. Such expressions can be used in lxml_ or another XPath engine to find the matching elements in an XML or HTML document. This module used to live inside of lxml as ``lxml.cssselect`` before it was extracted as a stand-alone project. .. _CSS3 Selectors: https://www.w3.org/TR/css3-selectors/ .. _XPath 1.0: https://www.w3.org/TR/xpath/ .. _lxml: http://lxml.de/ Quick facts: * Free software: BSD licensed * Compatible with Python 2.7 and 3.4+ * Latest documentation `on Read the Docs `_ * Source, issues and pull requests `on GitHub `_ * Releases `on PyPI `_ * Install with ``pip install cssselect`` Platform: UNKNOWN Classifier: Development Status :: 4 - Beta Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: BSD License Classifier: Programming Language :: Python :: 2 Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.* cssselect-1.1.0/tox.ini0000664000372000037200000000022013523235306015652 0ustar travistravis00000000000000[tox] envlist = py27, py34, py35, py36 [testenv] deps= -r tests/requirements.txt commands = py.test --cov-report term --cov=cssselect cssselect-1.1.0/setup.cfg0000664000372000037200000000031513523235335016167 0ustar travistravis00000000000000[build_sphinx] source-dir = docs build-dir = docs/_build [upload_sphinx] upload-dir = docs/_build/html [tool:pytest] testpaths = tests [bdist_wheel] universal = 1 [egg_info] tag_build = tag_date = 0 cssselect-1.1.0/CHANGES0000664000372000037200000001636313523235306015351 0ustar travistravis00000000000000Changelog ========= Version 1.1.0 ------------- Released on 2019-08-09. * Support for the ``:scope`` selector, which allows to access immediate children of a selector. * Support for the ``|E`` syntax for type selectors without a namespace. * A new selector method, ``canonical``, returns the CSS expression of the selector, as a string. Version 1.0.3 ------------- Released on 2017-12-27. * Fix artifact uploads to pypi Version 1.0.2 ------------- Released on 2017-12-26. * Drop support for Python 2.6 and Python 3.3. * Fix deprecation warning in Python 3.6. * Minor cleanups. Version 1.0.1 ------------- Released on 2017-01-10. * Add support for Python 3.6. * Documentation hosted `on Read the Docs `_ Version 1.0.0 ------------- Released on 2016-10-21. * Add code coverage reports. * Fix ``:nth-*(an+b)`` pseudo-classes selectors. (except ``*:nth-child()`` which looks untranslatable to XPath 1.0.) Version 0.9.2 ------------- Released on 2016-06-15. * Distribute as universal wheel. * Add support for Python 3.3, 3.4 and 3.5. * Drop support for Python 2.5 as testing is getting difficult. * Improve tests on pseudo-elements. Version 0.9.1 ------------- Released on 2013-10-17. * **Backward incompatible change from 0.9**: :meth:`~GenericTranslator.selector_to_xpath` defaults to ignoring pseudo-elements, as it did in 0.8 and previous versions. (:meth:`~GenericTranslator.css_to_xpath` doesn’t change.) * Drop official support for Python 2.4 and 3.1, as testing was becoming difficult. Nothing will break overnight, but future releases may on may not work on these versions. Older releases will remain available on PyPI. Version 0.9 ----------- Released on 2013-10-11. Add parser support for :attr:`functional pseudo-elements `. *Update:* This version accidentally introduced a **backward incompatible** change: :meth:`~GenericTranslator.selector_to_xpath` defaults to rejecting pseudo-elements instead of ignoring them. Version 0.8 ----------- Released on 2013-03-15. Improvements: * `#22 `_ Let extended translators override what XPathExpr class is used * `#19 `_ Use the built-in ``lang()`` XPath function for implementing the ``:lang()`` pseudo-class with XML documents. This is probably faster than ``ancestor-or-self::``. Bug fixes: * `#14 `_ Fix non-ASCII pseudo-classes. (Invalid selector instead of crash.) * `#20 `_ As per the spec, elements containing only whitespace are not considered empty for the ``:empty`` pseudo-class. Version 0.7.1 ------------- Released on 2012-06-14. Code name *remember-to-test-with-tox*. 0.7 broke the parser in Python 2.4 and 2.5; the tests in 2.x. Now all is well again. Also, pseudo-elements are now correctly made lower-case. (They are supposed to be case-insensitive.) Version 0.7 ----------- Released on 2012-06-14. Bug fix release: see #2, #7 and #10 on GitHub. * The tokenizer and parser have been rewritten to be much closer to the specified grammar. In particular, non-ASCII characters and backslash-escapes are now handled correctly. * Special characters are protected in the output so that generated XPath exrpessions should always be valid * The ``~=``, ``^=`` and ``*=`` attribute operators now correctly never match when used with an empty string. Version 0.6.1 ------------- Released on 2012-04-25. Make sure that internal token objects do not "leak" into the public API and :attr:`Selector.pseudo_element` is an unicode string. Version 0.6 ----------- Released on 2012-04-24. * In ``setup.py`` use setuptools/distribute if available, but fall back on distutils. * Implement the ``:lang()`` pseudo-class, although it is only based on ``xml:lang`` or ``lang`` attributes. If the document language is known from some other meta-data (like a ``Content-Language`` HTTP header or ```` element), a workaround is to set a lang attribute on the root element. Version 0.5 ----------- Released on 2012-04-20. * Fix case sensitivity issues. * Implement :class:`HTMLTranslator` based on the `HTML5 specification`_ rather than guessing; add the ``xhtml`` parameter. * Several bug fixes and better test coverage. .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors Version 0.4 ----------- Released on 2012-04-18. * Add proper support for pseudo-elements * Add specificity calculation * Expose the :func:`parse` function and the parsed :class:`Selector` objects in the API. * Add the :meth:`~GenericTranslator.selector_to_xpath` method. Version 0.3 ----------- Released on 2012-04-17. * Fix many parsing bugs. * Rename the :class:`Translator` class to :class:`GenericTranslator` * There, implement ``:target``, ``:hover``, ``:focus``, ``:active`` ``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited`` as never matching. * Make a new HTML-specific ``HTMLTranslator`` subclass. There, implement ``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited`` as appropriate for HTML, with all links "not visited". * Remove the :func:`css_to_xpath` function. The translator classes are the new API. * Add support for ``:contains()`` back, but case-sensitive. lxml will override it to be case-insensitive for backward-compatibility. Discussion is open if anyone is interested in implementing eg. ``:target`` or ``:visited`` differently, but they can always do it in a ``Translator`` subclass. Version 0.2 ----------- Released on 2012-04-16. * Remove the ``CSSSelector`` class. (The ``css_to_xpath()`` function is now the main API.) * Remove support for the ``:contains()`` pseudo-class. These changes allow cssselect to be used without lxml. (Hey, this was the whole point of this project.) The tests still require lxml, though. The removed parts are expected to stay in lxml for backward-compatibility. ``:contains()`` only existed in an `early draft `_ of the Selectors specification, and was removed before Level 3 stabilized. Internally, it used a custom XPath extension function which can be difficult to express outside of lxml. * Separate the XPath translation from the parsed objects into a new ``Translator`` class. Subclasses of ``Translator`` can be made to change the way that some selector (eg. a pseudo-class) is implemented. Version 0.1 ----------- Released on 2012-04-13. Extract lxml.cssselect from the rest of lxml and make it a stand-alone project. Commit ``ea53ceaf7e44ba4fbb5c818ae31370932f47774e`` was taken on 2012-04-11 from the 'master' branch of lxml’s git repository. This is somewhere between versions 2.3.4 and 2.4. The commit history has been rewritten to: * Remove lxml files unrelated to cssselect * Import the early history from the 'html' branch in the old SVN repository * Fix author names in commits from SVN This project has its own import name, tests and documentation. But the code itself is unchanged and still depends on lxml. Earlier history --------------- Search for *cssselect* in `lxml’s changelog `_ cssselect-1.1.0/LICENSE0000664000372000037200000000275513523235306015363 0ustar travistravis00000000000000Copyright (c) 2007-2012 Ian Bicking and contributors. See AUTHORS for more details. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of Ian Bicking nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cssselect-1.1.0/.coveragerc0000664000372000037200000000023613523235306016467 0ustar travistravis00000000000000[run] branch = True source = cssselect [report] exclude_lines = pragma: no cover def __repr__ if sys.version_info if __name__ == '__main__': cssselect-1.1.0/cssselect.egg-info/0000775000372000037200000000000013523235335020031 5ustar travistravis00000000000000cssselect-1.1.0/cssselect.egg-info/PKG-INFO0000664000372000037200000000515713523235335021136 0ustar travistravis00000000000000Metadata-Version: 1.2 Name: cssselect Version: 1.1.0 Summary: cssselect parses CSS3 Selectors and translates them to XPath 1.0 Home-page: https://github.com/scrapy/cssselect Author: Ian Bicking Author-email: ianb@colorstudy.com Maintainer: Paul Tremberth Maintainer-email: paul.tremberth@gmail.com License: BSD Description: =================================== cssselect: CSS Selectors for Python =================================== .. image:: https://img.shields.io/pypi/v/cssselect.svg :target: https://pypi.python.org/pypi/cssselect :alt: PyPI Version .. image:: https://img.shields.io/pypi/pyversions/cssselect.svg :target: https://pypi.python.org/pypi/cssselect :alt: Supported Python Versions .. image:: https://img.shields.io/travis/scrapy/cssselect/master.svg :target: https://travis-ci.org/scrapy/cssselect :alt: Build Status .. image:: https://img.shields.io/codecov/c/github/scrapy/cssselect/master.svg :target: https://codecov.io/github/scrapy/cssselect?branch=master :alt: Coverage report *cssselect* parses `CSS3 Selectors`_ and translate them to `XPath 1.0`_ expressions. Such expressions can be used in lxml_ or another XPath engine to find the matching elements in an XML or HTML document. This module used to live inside of lxml as ``lxml.cssselect`` before it was extracted as a stand-alone project. .. _CSS3 Selectors: https://www.w3.org/TR/css3-selectors/ .. _XPath 1.0: https://www.w3.org/TR/xpath/ .. _lxml: http://lxml.de/ Quick facts: * Free software: BSD licensed * Compatible with Python 2.7 and 3.4+ * Latest documentation `on Read the Docs `_ * Source, issues and pull requests `on GitHub `_ * Releases `on PyPI `_ * Install with ``pip install cssselect`` Platform: UNKNOWN Classifier: Development Status :: 4 - Beta Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: BSD License Classifier: Programming Language :: Python :: 2 Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.* cssselect-1.1.0/cssselect.egg-info/dependency_links.txt0000664000372000037200000000000113523235335024077 0ustar travistravis00000000000000 cssselect-1.1.0/cssselect.egg-info/top_level.txt0000664000372000037200000000001213523235335022554 0ustar travistravis00000000000000cssselect cssselect-1.1.0/cssselect.egg-info/SOURCES.txt0000664000372000037200000000046213523235335021717 0ustar travistravis00000000000000.coveragerc AUTHORS CHANGES LICENSE MANIFEST.in README.rst setup.cfg setup.py tox.ini cssselect/__init__.py cssselect/parser.py cssselect/xpath.py cssselect.egg-info/PKG-INFO cssselect.egg-info/SOURCES.txt cssselect.egg-info/dependency_links.txt cssselect.egg-info/top_level.txt docs/conf.py docs/index.rstcssselect-1.1.0/README.rst0000664000372000037200000000274313523235306016042 0ustar travistravis00000000000000=================================== cssselect: CSS Selectors for Python =================================== .. image:: https://img.shields.io/pypi/v/cssselect.svg :target: https://pypi.python.org/pypi/cssselect :alt: PyPI Version .. image:: https://img.shields.io/pypi/pyversions/cssselect.svg :target: https://pypi.python.org/pypi/cssselect :alt: Supported Python Versions .. image:: https://img.shields.io/travis/scrapy/cssselect/master.svg :target: https://travis-ci.org/scrapy/cssselect :alt: Build Status .. image:: https://img.shields.io/codecov/c/github/scrapy/cssselect/master.svg :target: https://codecov.io/github/scrapy/cssselect?branch=master :alt: Coverage report *cssselect* parses `CSS3 Selectors`_ and translate them to `XPath 1.0`_ expressions. Such expressions can be used in lxml_ or another XPath engine to find the matching elements in an XML or HTML document. This module used to live inside of lxml as ``lxml.cssselect`` before it was extracted as a stand-alone project. .. _CSS3 Selectors: https://www.w3.org/TR/css3-selectors/ .. _XPath 1.0: https://www.w3.org/TR/xpath/ .. _lxml: http://lxml.de/ Quick facts: * Free software: BSD licensed * Compatible with Python 2.7 and 3.4+ * Latest documentation `on Read the Docs `_ * Source, issues and pull requests `on GitHub `_ * Releases `on PyPI `_ * Install with ``pip install cssselect`` cssselect-1.1.0/setup.py0000664000372000037200000000256113523235306016063 0ustar travistravis00000000000000# -*- coding: utf-8 -*- import re import os.path try: from setuptools import setup extra_kwargs = {'test_suite': 'cssselect.tests'} except ImportError: from distutils.core import setup extra_kwargs = {} ROOT = os.path.dirname(__file__) README = open(os.path.join(ROOT, 'README.rst')).read() INIT_PY = open(os.path.join(ROOT, 'cssselect', '__init__.py')).read() VERSION = re.search("VERSION = '([^']+)'", INIT_PY).group(1) setup( name='cssselect', version=VERSION, author='Ian Bicking', author_email='ianb@colorstudy.com', maintainer='Paul Tremberth', maintainer_email='paul.tremberth@gmail.com', description= 'cssselect parses CSS3 Selectors and translates them to XPath 1.0', long_description=README, url='https://github.com/scrapy/cssselect', license='BSD', packages=['cssselect'], python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*', classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'License :: OSI Approved :: BSD License', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', ], **extra_kwargs ) cssselect-1.1.0/AUTHORS0000664000372000037200000000025313523235306015415 0ustar travistravis00000000000000Daniel Graña Ian Bicking James Salter Laurence Rowe Mikhail Korobov Nik Nyby Paul Tremberth Simon Potter Simon Sapin Stefan Behnel Thomas Grainger Varialus Arthur Darcet