pax_global_header00006660000000000000000000000064135232537020014514gustar00rootroot0000000000000052 comment=c9901d25043df6a682bb18fa67c48ad6bf377d97 parsel-1.5.2/000077500000000000000000000000001352325370200130075ustar00rootroot00000000000000parsel-1.5.2/.bumpversion.cfg000066400000000000000000000002361352325370200161200ustar00rootroot00000000000000[bumpversion] current_version = 1.5.2 commit = True tag = True tag_name = v{new_version} [bumpversion:file:setup.py] [bumpversion:file:parsel/__init__.py] parsel-1.5.2/.coveragerc000066400000000000000000000000471352325370200151310ustar00rootroot00000000000000[run] branch = true include = parsel/* parsel-1.5.2/.gitignore000066400000000000000000000006221352325370200147770ustar00rootroot00000000000000*.py[cod] # C extensions *.so # Packages *.egg *.eggs *.egg-info dist build eggs parts bin var sdist develop-eggs .installed.cfg lib lib64 # Installer logs pip-log.txt # Unit test / coverage reports .coverage .tox nosetests.xml htmlcov .pytest_cache # Translations *.mo # Mr Developer .mr.developer.cfg .project .pydevproject # Complexity output/*.html output/*/index.html # Sphinx docs/_build parsel-1.5.2/.travis.yml000066400000000000000000000034411352325370200151220ustar00rootroot00000000000000language: python branches: only: - master - /^v\d\.\d+$/ - /^v\d\.\d+\.\d+(rc\d+|\.dev\d+)?$/ matrix: include: - python: 2.7 env: TOXENV=py27 - python: 2.7 env: TOXENV=pypy - python: 2.7 env: TOXENV=pypy3 - python: 3.4 env: TOXENV=py34 - python: 3.5 env: TOXENV=py35 - python: 3.6 env: TOXENV=py36 - python: 3.7 env: TOXENV=py37 dist: xenial sudo: true install: - | if [ "$TOXENV" = "pypy" ]; then export PYPY_VERSION="pypy-6.0.0-linux_x86_64-portable" wget "https://bitbucket.org/squeaky/portable-pypy/downloads/${PYPY_VERSION}.tar.bz2" tar -jxf ${PYPY_VERSION}.tar.bz2 virtualenv --python="$PYPY_VERSION/bin/pypy" "$HOME/virtualenvs/$PYPY_VERSION" source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate" fi if [ "$TOXENV" = "pypy3" ]; then export PYPY_VERSION="pypy3.5-6.0.0-linux_x86_64-portable" wget "https://bitbucket.org/squeaky/portable-pypy/downloads/${PYPY_VERSION}.tar.bz2" tar -jxf ${PYPY_VERSION}.tar.bz2 virtualenv --python="$PYPY_VERSION/bin/pypy3" "$HOME/virtualenvs/$PYPY_VERSION" source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate" fi - pip install -U pip tox twine wheel codecov script: tox after_success: - codecov notifications: irc: use_notice: true skip_join: true channels: - irc.freenode.org#scrapy deploy: provider: pypi distributions: sdist bdist_wheel user: scrapy password: secure: kY3UdcidDTnwfBY9gGeUDK7g9k9Weg+AEo1CJYdQ8fBYQkIQBubbTiu1UGqOb0v/btySZDidp/jsGPLxlTZ34sG9jQEiteQILHJMjLzgcNNTSBsf8VIqTxcLpxS+RgeB2MyITOvYhxNMI4ezMXkm9TH7jL60gv/4BtiMOTpFzr8= on: tags: true all_branches: true repo: scrapy/parsel condition: $TOXENV == py27 parsel-1.5.2/LICENSE000066400000000000000000000027611352325370200140220ustar00rootroot00000000000000Copyright (c) Scrapy developers. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of Scrapy nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. parsel-1.5.2/MANIFEST.in000066400000000000000000000003551352325370200145500ustar00rootroot00000000000000include AUTHORS.rst include CONTRIBUTING.rst include NEWS include LICENSE include README.rst recursive-include tests * recursive-exclude * __pycache__ recursive-exclude * *.py[co] recursive-include docs *.rst conf.py Makefile make.bat parsel-1.5.2/Makefile000066400000000000000000000036001352325370200144460ustar00rootroot00000000000000.PHONY: clean-pyc clean-build docs clean help: @echo "clean - remove all build, test, coverage and Python artifacts" @echo "clean-build - remove build artifacts" @echo "clean-pyc - remove Python file artifacts" @echo "clean-test - remove test and coverage artifacts" @echo "lint - check style with flake8" @echo "test - run tests quickly with the default Python" @echo "test-all - run tests on every Python version with tox" @echo "coverage - check code coverage quickly with the default Python" @echo "docs - generate Sphinx HTML documentation, including API docs" @echo "release - package and upload a release" @echo "dist - package" @echo "install - install the package to the active Python's site-packages" clean: clean-build clean-pyc clean-test clean-build: rm -fr build/ rm -fr dist/ rm -fr .eggs/ find . -name '*.egg-info' -exec rm -fr {} + find . -name '*.egg' -exec rm -f {} + clean-pyc: find . -name '*.pyc' -exec rm -f {} + find . -name '*.pyo' -exec rm -f {} + find . -name '*~' -exec rm -f {} + find . -name '__pycache__' -exec rm -fr {} + clean-test: rm -fr .tox/ rm -f .coverage rm -fr htmlcov/ lint: flake8 parsel tests test: nosetests --with-doctest --rednose -s -v test-all: tox coverage: coverage run --source parsel setup.py test coverage report -m coverage html python -m webbrowser htmlcov/index.html docs: ( python -c 'import sphinx_rtd_theme' 2>/dev/null || pip install sphinx_rtd_theme ) rm -f docs/parsel.rst rm -f docs/modules.rst sphinx-apidoc -o docs/ parsel $(MAKE) -C docs clean $(MAKE) -C docs html python -m webbrowser docs/_build/html/index.html servedocs: docs watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D docs/ release: clean python setup.py sdist upload python setup.py bdist_wheel upload dist: clean python setup.py sdist python setup.py bdist_wheel ls -l dist install: clean python setup.py install parsel-1.5.2/NEWS000066400000000000000000000116671352325370200135210ustar00rootroot00000000000000.. 
:changelog: History ------- 1.5.2 (2019-08-09) ~~~~~~~~~~~~~~~~~~ * ``Selector.remove_namespaces`` received a significant performance improvement * The value of ``data`` within the printable representation of a selector (``repr(selector)``) now ends in ``...`` when truncated, to make the truncation obvious. * Minor documentation improvements. 1.5.1 (2018-10-25) ~~~~~~~~~~~~~~~~~~ * ``has-class`` XPath function handles newlines and other separators in class names properly; * fixed parsing of HTML documents with null bytes; * documentation improvements; * Python 3.7 tests are run on CI; other test improvements. 1.5.0 (2018-07-04) ~~~~~~~~~~~~~~~~~~ * New ``Selector.attrib`` and ``SelectorList.attrib`` properties which make it easier to get attributes of HTML elements. * CSS selectors became faster: compilation results are cached (LRU cache is used for ``css2xpath``), so there is less overhead when the same CSS expression is used several times. * ``.get()`` and ``.getall()`` selector methods are documented and recommended over ``.extract_first()`` and ``.extract()``. * Various documentation tweaks and improvements. One more change is that ``.extract()`` and ``.extract_first()`` methods are now implemented using ``.get()`` and ``.getall()``, not the other way around, and instead of calling ``Selector.extract`` all other methods now call ``Selector.get`` internally. It can be **backwards incompatible** in case of custom Selector subclasses which override ``Selector.extract`` without doing the same for ``Selector.get``. If you have such Selector subclass, make sure ``get`` method is also overridden. For example, this:: class MySelector(parsel.Selector): def extract(self): return super().extract() + " foo" should be changed to this:: class MySelector(parsel.Selector): def get(self): return super().get() + " foo" extract = get 1.4.0 (2018-02-08) ~~~~~~~~~~~~~~~~~~ * ``Selector`` and ``SelectorList`` can't be pickled because pickling/unpickling doesn't work for ``lxml.html.HtmlElement``; parsel now raises TypeError explicitly instead of allowing pickle to silently produce wrong output. This is technically backwards-incompatible if you're using Python < 3.6. 1.3.1 (2017-12-28) ~~~~~~~~~~~~~~~~~~ * Fix artifact uploads to pypi. 1.3.0 (2017-12-28) ~~~~~~~~~~~~~~~~~~ * ``has-class`` XPath extension function; * ``parsel.xpathfuncs.set_xpathfunc`` is a simplified way to register XPath extensions; * ``Selector.remove_namespaces`` now removes namespace declarations; * Python 3.3 support is dropped; * ``make htmlview`` command for easier Parsel docs development. * CI: PyPy installation is fixed; parsel now runs tests for PyPy3 as well. 
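To illustrate ``parsel.xpathfuncs.set_xpathfunc`` from this release, here is a
minimal sketch; the ``is-empty`` function name and the sample markup are made
up for illustration and are not part of the API::

    from parsel import Selector
    from parsel.xpathfuncs import set_xpathfunc

    def is_empty(context):
        # lxml passes an evaluation context; context.context_node
        # is the element the XPath predicate is being tested against
        node = context.context_node
        return not ((node.text or '').strip() or len(node))

    set_xpathfunc('is-empty', is_empty)

    sel = Selector(text=u'<p></p><p>full</p>')
    sel.xpath('//p[is-empty()]').get()  # '<p></p>'

    set_xpathfunc('is-empty', None)  # passing None unregisters the function
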
1.2.0 (2017-05-17)
~~~~~~~~~~~~~~~~~~

* Add ``SelectorList.get`` and ``SelectorList.getall``
  methods as aliases for ``SelectorList.extract_first``
  and ``SelectorList.extract`` respectively
* Add default value parameter to ``SelectorList.re_first`` method
* Add ``Selector.re_first`` method
* Add ``replace_entities`` argument on ``.re()`` and ``.re_first()``
  to turn off replacing of character entity references
* Bug fix: detect ``None`` result from lxml parsing and fall back
  to an empty document
* Rearrange XML/HTML examples in the selectors usage docs
* Travis CI:

  * Test against Python 3.6
  * Test against PyPy using "Portable PyPy for Linux" distribution

1.1.0 (2016-11-22)
~~~~~~~~~~~~~~~~~~

* Change default HTML parser to
  `lxml.html.HTMLParser <http://lxml.de/api/lxml.html.HTMLParser-class.html>`_,
  which makes it easier to use some HTML-specific features
* Add css2xpath function to translate CSS to XPath
* Add support for ad-hoc namespaces declarations
* Add support for XPath variables
* Documentation improvements and updates

1.0.3 (2016-07-29)
~~~~~~~~~~~~~~~~~~

* Add BSD-3-Clause license file
* Re-enable PyPy tests
* Integrate py.test runs with setuptools (needed for Debian packaging)
* Changelog is now called ``NEWS``

1.0.2 (2016-04-26)
~~~~~~~~~~~~~~~~~~

* Fix bug in exception handling causing original traceback to be lost
* Added docstrings and other doc fixes

1.0.1 (2015-08-24)
~~~~~~~~~~~~~~~~~~

* Updated PyPI classifiers
* Added docstrings for csstranslator module and other doc fixes

1.0.0 (2015-08-22)
~~~~~~~~~~~~~~~~~~

* Documentation fixes

0.9.6 (2015-08-14)
~~~~~~~~~~~~~~~~~~

* Updated documentation
* Extended test coverage

0.9.5 (2015-08-11)
~~~~~~~~~~~~~~~~~~

* Support for extending SelectorList

0.9.4 (2015-08-10)
~~~~~~~~~~~~~~~~~~

* Try workaround for travis-ci/dpl#253

0.9.3 (2015-08-07)
~~~~~~~~~~~~~~~~~~

* Add base_url argument

0.9.2 (2015-08-07)
~~~~~~~~~~~~~~~~~~

* Rename module unified -> selector and promote the root attribute
* Add create_root_node function

0.9.1 (2015-08-04)
~~~~~~~~~~~~~~~~~~

* Setup Sphinx build and docs structure
* Build universal wheels
* Rename some leftovers from package extraction

0.9.0 (2015-07-30)
~~~~~~~~~~~~~~~~~~

* First release on PyPI.

parsel-1.5.2/README.rst

===============================
Parsel
===============================

.. image:: https://img.shields.io/travis/scrapy/parsel/master.svg
   :target: https://travis-ci.org/scrapy/parsel
   :alt: Build Status

.. image:: https://img.shields.io/pypi/v/parsel.svg
   :target: https://pypi.python.org/pypi/parsel
   :alt: PyPI Version

.. image:: https://img.shields.io/codecov/c/github/scrapy/parsel/master.svg
   :target: http://codecov.io/github/scrapy/parsel?branch=master
   :alt: Coverage report

Parsel is a library to extract data from HTML and XML using XPath and CSS selectors

* Free software: BSD license
* Documentation: https://parsel.readthedocs.org.

Features
--------

* Extract text using CSS or XPath selectors
* Regular expression helper methods

Example::

    >>> from parsel import Selector
    >>> sel = Selector(text=u"""

        <html>
            <body>
                <h1>Hello, Parsel!</h1>
                <ul>
                    <li><a href="http://example.com">Link 1</a></li>
                    <li><a href="http://scrapy.org">Link 2</a></li>
                </ul>
            </body>
        </html>
""") >>> >>> sel.css('h1::text').get() 'Hello, Parsel!' >>> >>> sel.css('h1::text').re('\w+') ['Hello', 'Parsel'] >>> >>> for e in sel.css('ul > li'): ... print(e.xpath('.//a/@href').get()) http://example.com http://scrapy.org parsel-1.5.2/docs/000077500000000000000000000000001352325370200137375ustar00rootroot00000000000000parsel-1.5.2/docs/Makefile000066400000000000000000000155061352325370200154060ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. PYTHON = python SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = _build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" @echo " htmlview to view the compiled HTML files in browser" clean: rm -rf $(BUILDDIR)/* html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." 
json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/parsel.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/parsel.qhc" devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/parsel" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/parsel" @echo "# devhelp" epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
htmlview: html $(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \ os.path.realpath('_build/html/index.html'))" parsel-1.5.2/docs/_static/000077500000000000000000000000001352325370200153655ustar00rootroot00000000000000parsel-1.5.2/docs/_static/selectors-sample1.html000066400000000000000000000010651352325370200216200ustar00rootroot00000000000000 Example website parsel-1.5.2/docs/conf.py000077500000000000000000000203071352325370200152430ustar00rootroot00000000000000#!/usr/bin/env python # -*- coding: utf-8 -*- # # parsel documentation build configuration file, created by # sphinx-quickstart on Tue Jul 9 22:26:36 2013. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import sys import os # If extensions (or modules to document with autodoc) are in another # directory, add these directories to sys.path here. If the directory is # relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. #sys.path.insert(0, os.path.abspath('.')) # Get the project root dir, which is the parent dir of this cwd = os.getcwd() project_root = os.path.dirname(cwd) # Insert the project root dir as the first element in the PYTHONPATH. # This lets us ensure that the source package is imported, and that its # version is used. sys.path.insert(0, project_root) import parsel # -- General configuration --------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. #needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. #source_encoding = 'utf-8-sig' # The master toctree document. master_doc = 'index' # General information about the project. project = u'Parsel' copyright = u'2015, Scrapy Project' # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout # the built documents. # # The short X.Y version. version = parsel.__version__ # The full version, including alpha/beta/rc tags. release = parsel.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. #language = None # There are two options for replacing |today|: either, you set today to # some non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. #today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = ['_build'] # The reST default role (used for this markup: `text`) to use for all # documents. #default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. #add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). #add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. 
#show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. #modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built # documents. #keep_warnings = False # -- Options for HTML output ------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the # documentation. #html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. #html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". #html_title = None # A shorter title for the navigation bar. Default is the same as # html_title. #html_short_title = None # The name of an image file (relative to this directory) to place at the # top of the sidebar. #html_logo = None # The name of an image file (within the static path) to use as favicon # of the docs. This file should be a Windows icon file (.ico) being # 16x16 or 32x32 pixels large. #html_favicon = None # Add any paths that contain custom static files (such as style sheets) # here, relative to this directory. They are copied after the builtin # static files, so a file named "default.css" will overwrite the builtin # "default.css". html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page # bottom, using the given strftime format. #html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. #html_use_smartypants = True # Custom sidebar templates, maps document names to template names. #html_sidebars = {} # Additional templates that should be rendered to pages, maps page names # to template names. #html_additional_pages = {} # If false, no module index is generated. #html_domain_indices = True # If false, no index is generated. #html_use_index = True # If true, the index is split into individual pages for each letter. #html_split_index = False # If true, links to the reST sources are added to the pages. #html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. # Default is True. #html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. # Default is True. #html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages # will contain a tag referring to it. The value of this option # must be the base URL from which the finished HTML is served. #html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = None # Output file base name for HTML help builder. htmlhelp_basename = 'parseldoc' # -- Options for LaTeX output ------------------------------------------ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). #'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). #'pointsize': '10pt', # Additional stuff for the LaTeX preamble. #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass # [howto/manual]). 
latex_documents = [ ('index', 'parsel.tex', u'Parsel Documentation', u'Scrapy Project', 'manual'), ] # The name of an image file (relative to this directory) to place at # the top of the title page. #latex_logo = None # For "manual" documents, if this is true, then toplevel headings # are parts, not chapters. #latex_use_parts = False # If true, show page references after internal links. #latex_show_pagerefs = False # If true, show URL addresses after external links. #latex_show_urls = False # Documents to append as an appendix to all manuals. #latex_appendices = [] # If false, no module index is generated. #latex_domain_indices = True # -- Options for manual page output ------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ ('index', 'parsel', u'Parsel Documentation', [u'Scrapy Project'], 1) ] # If true, show URL addresses after external links. #man_show_urls = False # -- Options for Texinfo output ---------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ('index', 'parsel', u'Parsel Documentation', u'Scrapy Project', 'parsel', 'One line description of project.', 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. #texinfo_appendices = [] # If false, no module index is generated. #texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. #texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. #texinfo_no_detailmenu = False parsel-1.5.2/docs/history.rst000066400000000000000000000000261352325370200161700ustar00rootroot00000000000000.. include:: ../NEWS parsel-1.5.2/docs/index.rst000066400000000000000000000007341352325370200156040ustar00rootroot00000000000000.. parsel documentation master file, created by sphinx-quickstart on Tue Jul 9 22:26:36 2013. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. .. include:: ../README.rst Parsel Documentation Contents ============================= Contents: .. toctree:: :maxdepth: 2 installation usage history Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` parsel-1.5.2/docs/installation.rst000066400000000000000000000005001352325370200171650ustar00rootroot00000000000000============ Installation ============ To install Parsel, we recommend you to use `pip `_:: $ pip install parsel You `probably shouldn't `_, but you can also install it with easy_install:: $ easy_install parsel parsel-1.5.2/docs/make.bat000066400000000000000000000144731352325370200153550ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=_build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . set I18NSPHINXOPTS=%SPHINXOPTS% . if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. 
htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. xml to make Docutils-native XML files echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) if "%1" == "html" ( %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\parsel.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\parsel.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf cd %BUILDDIR%/.. echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
goto end ) if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja cd %BUILDDIR%/.. echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 echo. echo.Build finished. The XML files are in %BUILDDIR%/xml. goto end ) if "%1" == "pseudoxml" ( %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml if errorlevel 1 exit /b 1 echo. echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. goto end ) :end parsel-1.5.2/docs/modules.rst000066400000000000000000000000671352325370200161440ustar00rootroot00000000000000parsel ====== .. toctree:: :maxdepth: 4 parsel parsel-1.5.2/docs/parsel.rst000066400000000000000000000011111352325370200157510ustar00rootroot00000000000000parsel package ============== Submodules ---------- parsel.csstranslator module --------------------------- .. automodule:: parsel.csstranslator :members: :undoc-members: :show-inheritance: parsel.selector module ---------------------- .. automodule:: parsel.selector :members: :undoc-members: :show-inheritance: parsel.utils module ------------------- .. automodule:: parsel.utils :members: :undoc-members: :show-inheritance: Module contents --------------- .. automodule:: parsel :members: :undoc-members: :show-inheritance: parsel-1.5.2/docs/readme.rst000066400000000000000000000000341352325370200157230ustar00rootroot00000000000000.. include:: ../README.rst parsel-1.5.2/docs/usage.rst000066400000000000000000001105031352325370200155750ustar00rootroot00000000000000.. _topics-selectors: ===== Usage ===== Getting started =============== If you already know how to write `CSS`_ or `XPath`_ expressions, using Parsel is straightforward: you just need to create a :class:`~parsel.selector.Selector` object for the HTML or XML text you want to parse, and use the available methods for selecting parts from the text and extracting data out of the result. Creating a :class:`~parsel.selector.Selector` object is simple:: >>> from parsel import Selector >>> text = u"
            <html>
                <body>
                    <h1>Hello, Parsel!</h1>
                </body>
            </html>"""
    >>> sel = Selector(text=text)

.. note::

    One important thing to note is that if you're using Python 2, make sure
    to use a ``unicode`` object for the text argument.
    :class:`~parsel.selector.Selector` expects text to be a ``unicode``
    object in Python 2 or a ``str`` object in Python 3.

Once you have created the Selector object, you can use `CSS`_ or `XPath`_
expressions to select elements::

    >>> sel.css('h1')
    [<Selector xpath='descendant-or-self::h1' data='<h1>Hello, Parsel!</h1>'>]
    >>> sel.xpath('//h1')  # the same, but now with XPath
    [<Selector xpath='//h1' data='<h1>Hello, Parsel!</h1>'>]

And extract data from those elements::

    >>> sel.css('h1::text').get()
    'Hello, Parsel!'
    >>> sel.xpath('//h1/text()').getall()
    ['Hello, Parsel!']

`XPath`_ is a language for selecting nodes in XML documents, which can also
be used with HTML. `CSS`_ is a language for applying styles to HTML
documents. It defines selectors to associate those styles with specific
HTML elements. You can use whichever language you're more comfortable with,
though you may find that in some specific cases `XPath`_ is more powerful
than `CSS`_.

.. _XPath: http://www.w3.org/TR/xpath
.. _CSS: http://www.w3.org/TR/selectors

Using selectors
===============

To explain how to use the selectors we'll use the `requests`_ library
to download an example page located in Parsel's documentation:

    http://parsel.readthedocs.org/en/latest/_static/selectors-sample1.html

.. _topics-selectors-htmlcode:

For the sake of completeness, here's its full HTML code:

.. literalinclude:: _static/selectors-sample1.html
   :language: html

.. highlight:: python

So, let's download that page and create a selector for it::

    >>> import requests
    >>> from parsel import Selector
    >>> url = 'http://parsel.readthedocs.org/en/latest/_static/selectors-sample1.html'
    >>> text = requests.get(url).text
    >>> selector = Selector(text=text)

Since we're dealing with HTML (the default type for Selector), we don't need
to specify the ``type`` argument.

So, by looking at the :ref:`HTML code <topics-selectors-htmlcode>` of that
page, let's construct an XPath for selecting the text inside the title tag::

    >>> selector.xpath('//title/text()')
    [<Selector xpath='//title/text()' data='Example website'>]

You can also ask the same thing using CSS instead::

    >>> selector.css('title::text')
    [<Selector xpath='descendant-or-self::title/text()' data='Example website'>]

To actually extract the textual data, you must call the selector ``.get()``
or ``.getall()`` methods, as follows::

    >>> selector.xpath('//title/text()').getall()
    ['Example website']
    >>> selector.xpath('//title/text()').get()
    'Example website'

``.get()`` always returns a single result; if there are several matches, the
content of the first match is returned; if there are no matches, None is
returned. ``.getall()`` returns a list with all results.

Notice that CSS selectors can select text or attribute nodes using CSS3
pseudo-elements::

    >>> selector.css('title::text').get()
    'Example website'

As you can see, the ``.xpath()`` and ``.css()`` methods return a
:class:`~parsel.selector.SelectorList` instance, which is a list of new
selectors. This API can be used for quickly selecting nested data::

    >>> selector.css('img').xpath('@src').getall()
    ['image1_thumb.jpg', 'image2_thumb.jpg', 'image3_thumb.jpg', 'image4_thumb.jpg', 'image5_thumb.jpg']

If you want to extract only the first matched element, you can call the
selector ``.get()`` (or its alias ``.extract_first()`` commonly used in
previous parsel versions)::

    >>> selector.xpath('//div[@id="images"]/a/text()').get()
    'Name: My image 1 '

It returns ``None`` if no element was found::

    >>> selector.xpath('//div[@id="not-exists"]/text()').get() is None
    True

Instead of using e.g.
``'@src'`` XPath it is possible to query for attributes using ``.attrib`` property of a :class:`~parsel.selector.Selector`:: >>> [img.attrib['src'] for img in selector.css('img')] ['image1_thumb.jpg', 'image2_thumb.jpg', 'image3_thumb.jpg', 'image4_thumb.jpg', 'image5_thumb.jpg'] As a shortcut, ``.attrib`` is also available on SelectorList directly; it returns attributes for the first matching element:: >>> selector.css('img').attrib['src'] 'image1_thumb.jpg' This is most useful when only a single result is expected, e.g. when selecting by id, or selecting unique elements on a web page:: >>> selector.css('base').attrib['href'] 'http://example.com/' Now we're going to get the base URL and some image links:: >>> selector.xpath('//base/@href').get() 'http://example.com/' >>> selector.css('base::attr(href)').get() 'http://example.com/' >>> selector.css('base').attrib['href'] 'http://example.com/' >>> selector.xpath('//a[contains(@href, "image")]/@href').getall() ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] >>> selector.css('a[href*=image]::attr(href)').getall() ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] >>> selector.xpath('//a[contains(@href, "image")]/img/@src').getall() ['image1_thumb.jpg', 'image2_thumb.jpg', 'image3_thumb.jpg', 'image4_thumb.jpg', 'image5_thumb.jpg'] >>> selector.css('a[href*=image] img::attr(src)').getall() ['image1_thumb.jpg', 'image2_thumb.jpg', 'image3_thumb.jpg', 'image4_thumb.jpg', 'image5_thumb.jpg'] .. _topics-selectors-css-extensions: Extensions to CSS Selectors --------------------------- Per W3C standards, `CSS selectors`_ do not support selecting text nodes or attribute values. But selecting these is so essential in a web scraping context that Parsel implements a couple of **non-standard pseudo-elements**: * to select text nodes, use ``::text`` * to select attribute values, use ``::attr(name)`` where *name* is the name of the attribute that you want the value of .. warning:: These pseudo-elements are Scrapy-/Parsel-specific. They will most probably not work with other libraries like `lxml`_ or `PyQuery`_. Examples: * ``title::text`` selects children text nodes of a descendant ```` element:: >>> selector.css('title::text').get() 'Example website' * ``*::text`` selects all descendant text nodes of the current selector context:: >>> selector.css('#images *::text').getall() ['\n ', 'Name: My image 1 ', '\n ', 'Name: My image 2 ', '\n ', 'Name: My image 3 ', '\n ', 'Name: My image 4 ', '\n ', 'Name: My image 5 ', '\n '] * ``a::attr(href)`` selects the *href* attribute value of descendant links:: >>> selector.css('a::attr(href)').getall() ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] .. note:: You cannot chain these pseudo-elements. But in practice it would not make much sense: text nodes do not have attributes, and attribute values are string values already and do not have children nodes. .. note:: See also: :ref:`selecting-attributes`. .. _CSS Selectors: https://www.w3.org/TR/css3-selectors/#selectors .. _topics-selectors-nesting-selectors: Nesting selectors ----------------- The selection methods (``.xpath()`` or ``.css()``) return a list of selectors of the same type, so you can call the selection methods for those selectors too. 
Here's an example:: >>> links = selector.xpath('//a[contains(@href, "image")]') >>> links.getall() ['<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>', '<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>', '<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>', '<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>', '<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>'] >>> for index, link in enumerate(links): ... args = (index, link.xpath('@href').get(), link.xpath('img/@src').get()) ... print('Link number %d points to url %r and image %r' % args) Link number 0 points to url 'image1.html' and image 'image1_thumb.jpg' Link number 1 points to url 'image2.html' and image 'image2_thumb.jpg' Link number 2 points to url 'image3.html' and image 'image3_thumb.jpg' Link number 3 points to url 'image4.html' and image 'image4_thumb.jpg' Link number 4 points to url 'image5.html' and image 'image5_thumb.jpg' .. _selecting-attributes: Selecting element attributes ---------------------------- There are several ways to get a value of an attribute. First, one can use XPath syntax:: >>> selector.xpath("//a/@href").getall() ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] XPath syntax has a few advantages: it is a standard XPath feature, and ``@attributes`` can be used in other parts of an XPath expression - e.g. it is possible to filter by attribute value. parsel also provides an extension to CSS selectors (``::attr(...)``) which allows to get attribute values:: >>> selector.css('a::attr(href)').getall() ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] In addition to that, there is a ``.attrib`` property of Selector. You can use it if you prefer to lookup attributes in Python code, without using XPaths or CSS extensions:: >>> [a.attrib['href'] for a in selector.css('a')] ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] This property is also available on SelectorList; it returns a dictionary with attributes of a first matching element. It is convenient to use when a selector is expected to give a single result (e.g. when selecting by element ID, or when selecting an unique element on a page):: >>> selector.css('base').attrib {'href': 'http://example.com/'} >>> selector.css('base').attrib['href'] 'http://example.com/' ``.attrib`` property of an empty SelectorList is empty:: >>> selector.css('foo').attrib {} Using selectors with regular expressions ---------------------------------------- :class:`~parsel.selector.Selector` also has a ``.re()`` method for extracting data using regular expressions. However, unlike using ``.xpath()`` or ``.css()`` methods, ``.re()`` returns a list of unicode strings. So you can't construct nested ``.re()`` calls. Here's an example used to extract image names from the :ref:`HTML code <topics-selectors-htmlcode>` above:: >>> selector.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)') ['My image 1', 'My image 2', 'My image 3', 'My image 4', 'My image 5'] There's an additional helper reciprocating ``.get()`` (and its alias ``.extract_first()``) for ``.re()``, named ``.re_first()``. Use it to extract just the first matching string:: >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(r'Name:\s*(.*)') 'My image 1' .. 
_topics-selectors-relative-xpaths: Working with relative XPaths ---------------------------- Keep in mind that if you are nesting selectors and use an XPath that starts with ``/``, that XPath will be absolute to the document and not relative to the selector you're calling it from. For example, suppose you want to extract all ``<p>`` elements inside ``<div>`` elements. First, you would get all ``<div>`` elements:: >>> divs = selector.xpath('//div') At first, you may be tempted to use the following approach, which is wrong, as it actually extracts all ``<p>`` elements from the document, not only those inside ``<div>`` elements:: >>> for p in divs.xpath('//p'): # this is wrong - gets all <p> from the whole document ... print(p.get()) This is the proper way to do it (note the dot prefixing the ``.//p`` XPath):: >>> for p in divs.xpath('.//p'): # extracts all <p> inside ... print(p.get()) Another common case would be to extract all direct ``<p>`` children:: >>> for p in divs.xpath('p'): ... print(p.get()) For more details about relative XPaths see the `Location Paths`_ section in the XPath specification. .. _Location Paths: https://www.w3.org/TR/xpath#location-paths Using EXSLT extensions ---------------------- Being built atop `lxml`_, parsel selectors support some `EXSLT`_ extensions and come with these pre-registered namespaces to use in XPath expressions: ====== ===================================== ======================= prefix namespace usage ====== ===================================== ======================= re \http://exslt.org/regular-expressions `regular expressions`_ set \http://exslt.org/sets `set manipulation`_ ====== ===================================== ======================= Regular expressions ~~~~~~~~~~~~~~~~~~~ The ``test()`` function, for example, can prove quite useful when XPath's ``starts-with()`` or ``contains()`` are not sufficient. Example selecting links in list item with a "class" attribute ending with a digit:: >>> from parsel import Selector >>> doc = u""" ... <div> ... <ul> ... <li class="item-0"><a href="link1.html">first item</a></li> ... <li class="item-1"><a href="link2.html">second item</a></li> ... <li class="item-inactive"><a href="link3.html">third item</a></li> ... <li class="item-1"><a href="link4.html">fourth item</a></li> ... <li class="item-0"><a href="link5.html">fifth item</a></li> ... </ul> ... </div> ... """ >>> sel = Selector(text=doc) >>> sel.xpath('//li//@href').getall() ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html'] >>> sel.xpath('//li[re:test(@class, "item-\d$")]//@href').getall() ['link1.html', 'link2.html', 'link4.html', 'link5.html'] >>> .. warning:: C library ``libxslt`` doesn't natively support EXSLT regular expressions so `lxml`_'s implementation uses hooks to Python's ``re`` module. Thus, using regexp functions in your XPath expressions may add a small performance penalty. Set operations ~~~~~~~~~~~~~~ These can be handy for excluding parts of a document tree before extracting text elements for example. Example extracting microdata (sample content taken from http://schema.org/Product) with groups of itemscopes and corresponding itemprops:: >>> doc = u""" ... <div itemscope itemtype="http://schema.org/Product"> ... <span itemprop="name">Kenmore White 17" Microwave</span> ... <img src="kenmore-microwave-17in.jpg" alt='Kenmore 17" Microwave' /> ... <div itemprop="aggregateRating" ... itemscope itemtype="http://schema.org/AggregateRating"> ... Rated <span itemprop="ratingValue">3.5</span>/5 ... 
based on <span itemprop="reviewCount">11</span> customer reviews ... </div> ... ... <div itemprop="offers" itemscope itemtype="http://schema.org/Offer"> ... <span itemprop="price">$55.00</span> ... <link itemprop="availability" href="http://schema.org/InStock" />In stock ... </div> ... ... Product description: ... <span itemprop="description">0.7 cubic feet countertop microwave. ... Has six preset cooking categories and convenience features like ... Add-A-Minute and Child Lock.</span> ... ... Customer reviews: ... ... <div itemprop="review" itemscope itemtype="http://schema.org/Review"> ... <span itemprop="name">Not a happy camper</span> - ... by <span itemprop="author">Ellie</span>, ... <meta itemprop="datePublished" content="2011-04-01">April 1, 2011 ... <div itemprop="reviewRating" itemscope itemtype="http://schema.org/Rating"> ... <meta itemprop="worstRating" content = "1"> ... <span itemprop="ratingValue">1</span>/ ... <span itemprop="bestRating">5</span>stars ... </div> ... <span itemprop="description">The lamp burned out and now I have to replace ... it. </span> ... </div> ... ... <div itemprop="review" itemscope itemtype="http://schema.org/Review"> ... <span itemprop="name">Value purchase</span> - ... by <span itemprop="author">Lucas</span>, ... <meta itemprop="datePublished" content="2011-03-25">March 25, 2011 ... <div itemprop="reviewRating" itemscope itemtype="http://schema.org/Rating"> ... <meta itemprop="worstRating" content = "1"/> ... <span itemprop="ratingValue">4</span>/ ... <span itemprop="bestRating">5</span>stars ... </div> ... <span itemprop="description">Great microwave for the price. It is small and ... fits in my apartment.</span> ... </div> ... ... ... </div> ... """ >>> sel = Selector(text=doc, type="html") >>> for scope in sel.xpath('//div[@itemscope]'): ... print("current scope:", scope.xpath('@itemtype').getall()) ... props = scope.xpath(''' ... set:difference(./descendant::*/@itemprop, ... .//*[@itemscope]/*/@itemprop)''') ... print(" properties: %s" % (props.getall())) ... print("") current scope: ['http://schema.org/Product'] properties: ['name', 'aggregateRating', 'offers', 'description', 'review', 'review'] current scope: ['http://schema.org/AggregateRating'] properties: ['ratingValue', 'reviewCount'] current scope: ['http://schema.org/Offer'] properties: ['price', 'availability'] current scope: ['http://schema.org/Review'] properties: ['name', 'author', 'datePublished', 'reviewRating', 'description'] current scope: ['http://schema.org/Rating'] properties: ['worstRating', 'ratingValue', 'bestRating'] current scope: ['http://schema.org/Review'] properties: ['name', 'author', 'datePublished', 'reviewRating', 'description'] current scope: ['http://schema.org/Rating'] properties: ['worstRating', 'ratingValue', 'bestRating'] >>> Here we first iterate over ``itemscope`` elements, and for each one, we look for all ``itemprops`` elements and exclude those that are themselves inside another ``itemscope``. .. _EXSLT: http://exslt.org/ .. _regular expressions: http://exslt.org/regexp/index.html .. _set manipulation: http://exslt.org/set/index.html .. _topics-xpath-other-extensions: Other XPath extensions ---------------------- Parsel also defines a sorely missed XPath extension function ``has-class`` that returns ``True`` for nodes that have all of the specified HTML classes:: >>> from parsel import Selector >>> sel = Selector(""" ... <p class="foo bar-baz">First</p> ... <p class="foo">Second</p> ... <p class="bar">Third</p> ... <p>Fourth</p> ... """) ... 
>>> sel.xpath('//p[has-class("foo")]')
[<Selector xpath='//p[has-class("foo")]' data='<p class="foo bar-baz">First</p>'>,
 <Selector xpath='//p[has-class("foo")]' data='<p class="foo">Second</p>'>]
>>> sel.xpath('//p[has-class("foo", "bar-baz")]')
[<Selector xpath='//p[has-class("foo", "bar-baz")]' data='<p class="foo bar-baz">First</p>'>]
>>> sel.xpath('//p[has-class("foo", "bar")]')
[]

So XPath ``//p[has-class("foo", "bar-baz")]`` is roughly equivalent to CSS
``p.foo.bar-baz``. Note that it is slower in most cases, because it's a
pure-Python function invoked for every node in question, whereas the CSS
lookup is translated into XPath and thus runs more efficiently. As a result,
performance-wise, its uses are limited to situations that are not easily
described with CSS selectors.

Parsel also simplifies adding your own XPath extensions.

.. autofunction:: parsel.xpathfuncs.set_xpathfunc

Some XPath tips
---------------

Here are some tips that you may find useful when using XPath with Parsel,
based on `this post from ScrapingHub's blog`_. If you are not very familiar
with XPath yet, you may want to take a look first at this `XPath tutorial`_.

.. _`XPath tutorial`: http://www.zvon.org/comp/r/tut-XPath_1.html
.. _`this post from ScrapingHub's blog`: https://blog.scrapinghub.com/2014/07/17/xpath-tips-from-the-web-scraping-trenches/

Using text nodes in a condition
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When you need to use the text content as an argument to an
`XPath string function`_, avoid using ``.//text()`` and use just ``.``
instead.

This is because the expression ``.//text()`` yields a collection of text
elements -- a *node-set*. And when a node-set is converted to a string, which
happens when it is passed as an argument to a string function like
``contains()`` or ``starts-with()``, it results in the text for the first
element only.

Example::

    >>> from parsel import Selector
    >>> sel = Selector(text='<a href="#">Click here to go to the <strong>Next Page</strong></a>')

Converting a *node-set* to string::

    >>> sel.xpath('//a//text()').getall()  # take a peek at the node-set
    ['Click here to go to the ', 'Next Page']
    >>> sel.xpath("string(//a[1]//text())").getall()  # convert it to string
    ['Click here to go to the ']

A *node* converted to a string, however, puts together the text of itself
plus all of its descendants::

    >>> sel.xpath("//a[1]").getall()  # select the first node
    ['<a href="#">Click here to go to the <strong>Next Page</strong></a>']
    >>> sel.xpath("string(//a[1])").getall()  # convert it to string
    ['Click here to go to the Next Page']

So, using the ``.//text()`` node-set won't select anything in this case::

    >>> sel.xpath("//a[contains(.//text(), 'Next Page')]").getall()
    []

But using ``.`` to mean the node works::

    >>> sel.xpath("//a[contains(., 'Next Page')]").getall()
    ['<a href="#">Click here to go to the <strong>Next Page</strong></a>']

.. _`XPath string function`: https://www.w3.org/TR/xpath/#section-String-Functions

Beware of the difference between //node[1] and (//node)[1]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

``//node[1]`` selects all the nodes occurring first under their respective
parents.

``(//node)[1]`` selects all the nodes in the document, and then gets only
the first of them.
Example::

    >>> from parsel import Selector
    >>> sel = Selector(text="""
    ...     <ul class="list">
    ...         <li>1</li>
    ...         <li>2</li>
    ...         <li>3</li>
    ...     </ul>
    ...     <ul class="list">
    ...         <li>4</li>
    ...         <li>5</li>
    ...         <li>6</li>
    ...     </ul>""")
    >>> xp = lambda x: sel.xpath(x).getall()

This gets all first ``<li>`` elements under their respective parents::

    >>> xp("//li[1]")
    ['<li>1</li>', '<li>4</li>']

And this gets the first ``<li>`` element in the whole document::

    >>> xp("(//li)[1]")
    ['<li>1</li>']

This gets all first ``<li>`` elements under an ``<ul>`` parent::

    >>> xp("//ul/li[1]")
    ['<li>1</li>', '<li>4</li>']

And this gets the first ``<li>`` element under an ``<ul>`` parent in the
whole document::

    >>> xp("(//ul/li)[1]")
    ['<li>1</li>']

When querying by class, consider using CSS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Because an element can contain multiple CSS classes, the XPath way to select
elements by class is rather verbose::

    *[contains(concat(' ', normalize-space(@class), ' '), ' someclass ')]

If you use ``@class='someclass'`` you may end up missing elements that have
other classes, and if you just use ``contains(@class, 'someclass')`` to make
up for that, you may end up with more elements than you want, if they have a
different class name that contains the string ``someclass``.

As it turns out, parsel selectors allow you to chain selectors, so most of the
time you can just select by class using CSS and then switch to XPath when
needed::

    >>> from parsel import Selector
    >>> sel = Selector(text='<div class="hero shout"><time datetime="2014-07-23 19:00">Special date</time></div>')
    >>> sel.css('.shout').xpath('./time/@datetime').getall()
    ['2014-07-23 19:00']

This is cleaner than using the verbose XPath trick shown above. Just remember
to use the ``.`` in the XPath expressions that will follow.

.. _old-extraction-api:

extract() and extract_first()
-----------------------------

If you're a long-time parsel (or Scrapy) user, you're probably familiar with
the ``.extract()`` and ``.extract_first()`` selector methods. These methods
are still supported by parsel, and there are no plans to deprecate them.

However, ``parsel`` usage docs are now written using the ``.get()`` and
``.getall()`` methods. We feel that these new methods result in more concise
and readable code.

The following examples show how these methods map to each other.

1. ``SelectorList.get()`` is the same as ``SelectorList.extract_first()``::

    >>> selector.css('a::attr(href)').get()
    'image1.html'
    >>> selector.css('a::attr(href)').extract_first()
    'image1.html'

2. ``SelectorList.getall()`` is the same as ``SelectorList.extract()``::

    >>> selector.css('a::attr(href)').getall()
    ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html']
    >>> selector.css('a::attr(href)').extract()
    ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html']

3. ``Selector.get()`` is the same as ``Selector.extract()``::

    >>> selector.css('a::attr(href)')[0].get()
    'image1.html'
    >>> selector.css('a::attr(href)')[0].extract()
    'image1.html'

4. For consistency, there is also ``Selector.getall()``, which returns a
   list::

    >>> selector.css('a::attr(href)')[0].getall()
    ['image1.html']

With the ``.extract()`` method it was not always obvious whether a result is a
list or not; to get a single result, either ``.extract()`` or
``.extract_first()`` needed to be called, depending on whether you had a
``Selector`` or a ``SelectorList``. So, the main difference is that the
outputs of ``.get()`` and ``.getall()`` are more predictable: ``.get()``
always returns a single result, ``.getall()`` always returns a list of all
extracted results.
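One practical consequence, shown as a quick sketch that reuses the
``selector`` from the examples above and queries an attribute the sample page
does not have: when nothing matches, ``.get()`` simply returns ``None`` (or
the ``default`` you pass), while indexing the list returned by ``.extract()``
raises::

    >>> selector.css('a::attr(title)').get() is None
    True
    >>> selector.css('a::attr(title)').extract()[0]
    Traceback (most recent call last):
        ...
    IndexError: list index out of range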
.. _topics-selectors-ref:

API reference
=============

Selector objects
----------------

.. autoclass:: parsel.selector.Selector
  :members:

SelectorList objects
--------------------

.. autoclass:: parsel.selector.SelectorList
  :members:

.. _selector-examples-html:

Working on HTML
---------------

Here are some :class:`~parsel.selector.Selector` examples to illustrate
several concepts. In all cases, we assume there is already a
:class:`~parsel.selector.Selector` instantiated with an HTML text like this::

      sel = Selector(text=html_text)

1. Select all ``<h1>`` elements from an HTML text, returning a list of
   :class:`~parsel.selector.Selector` objects (i.e. a
   :class:`~parsel.selector.SelectorList` object)::

      sel.xpath("//h1")

2. Extract the text of all ``<h1>`` elements from an HTML text, returning a
   list of unicode strings::

      sel.xpath("//h1").getall()         # this includes the h1 tag
      sel.xpath("//h1/text()").getall()  # this excludes the h1 tag

3. Iterate over all ``<p>`` tags and print their class attribute::

      for node in sel.xpath("//p"):
          print(node.attrib['class'])

.. _selector-examples-xml:

Working on XML (and namespaces)
-------------------------------

Here are some examples to illustrate concepts for
:class:`~parsel.selector.Selector` objects instantiated with an XML text like
this::

      sel = Selector(text=xml_text, type='xml')

1. Select all ``<product>`` elements from an XML text, returning a list of
   :class:`~parsel.selector.Selector` objects (i.e. a
   :class:`~parsel.selector.SelectorList` object)::

      sel.xpath("//product")

2. Extract all prices from a `Google Base XML feed`_, which requires
   registering a namespace::

      sel.register_namespace("g", "http://base.google.com/ns/1.0")
      sel.xpath("//g:price").getall()

.. _removing-namespaces:

Removing namespaces
~~~~~~~~~~~~~~~~~~~

When dealing with scraping projects, it is often quite convenient to get rid
of namespaces altogether and just work with element names, so you can write
simpler, more convenient XPaths. You can use the
:meth:`Selector.remove_namespaces` method for that.

Let's show an example that illustrates this with the Python Insider blog atom
feed.

Let's download the atom feed using `requests`_ and create a selector::

    >>> import requests
    >>> from parsel import Selector
    >>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
    >>> sel = Selector(text=text, type='xml')

This is how the file starts::

    <?xml version="1.0" encoding="UTF-8"?>
    <?xml-stylesheet ...
    <feed xmlns="http://www.w3.org/2005/Atom"
          xmlns:openSearch="http://a9.com/-/spec/opensearchrss/1.0/"
          xmlns:blogger="http://schemas.google.com/blogger/2008"
          xmlns:georss="http://www.georss.org/georss"
          xmlns:gd="http://schemas.google.com/g/2005"
          xmlns:thr="http://purl.org/syndication/thread/1.0"
          xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0">
      ...

You can see several namespace declarations, including a default
"http://www.w3.org/2005/Atom" and another one using the "gd:" prefix for
"http://schemas.google.com/g/2005".
We can try selecting all ``<link>`` objects and see that it doesn't work
(because the Atom XML namespace is obfuscating those nodes)::

    >>> sel.xpath("//link")
    []

But once we call the :meth:`Selector.remove_namespaces` method, all nodes can
be accessed directly by their names::

    >>> sel.remove_namespaces()
    >>> sel.xpath("//link")
    [<Selector xpath='//link' data='<link rel="alternate" type="text/html" h'>,
     <Selector xpath='//link' data='<link rel="next" type="application/atom+'>,
     ...

If you wonder why the namespace removal procedure isn't always called by
default, instead of having to call it manually, this is because of two
reasons, which, in order of relevance, are:

1. Removing namespaces requires iterating over and modifying all nodes in the
   document, which is a reasonably expensive operation to perform by default
   for all documents.

2. There could be some cases where using namespaces is actually required, in
   case some element names clash between namespaces. These cases are very
   rare though.

.. _Google Base XML feed: https://support.google.com/merchants/answer/160589?hl=en&ref_topic=2473799
.. _requests: http://www.python-requests.org/

Ad-hoc namespace references
~~~~~~~~~~~~~~~~~~~~~~~~~~~

:class:`~parsel.selector.Selector` objects also allow passing namespace
references along with the query, through a ``namespaces`` argument, with the
prefixes you declare being used in your XPath or CSS query.

Let's use the same Python Insider Atom feed::

    >>> import requests
    >>> from parsel import Selector
    >>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
    >>> sel = Selector(text=text, type='xml')

And try to select the links again, now using an "atom:" prefix for the "link"
node test::

    >>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"})
    [<Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/Atom'>,
     <Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/Atom'>,
     ...

You can pass several namespaces (here we're using shorter 1-letter prefixes)::

    >>> sel.xpath("//a:entry/a:author/g:image/@src",
    ...           namespaces={"a": "http://www.w3.org/2005/Atom",
    ...                       "g": "http://schemas.google.com/g/2005"}).getall()
    ['http://photos1.blogger.com/blogger/4554/1119/400/beethoven_10.jpg',
     '//lh3.googleusercontent.com/-7xisiK0EArc/AAAAAAAAAAI/AAAAAAAAAuM/-r6o6A8RKCM/s512-c/photo.jpg',
     ...

.. _topics-xpath-variables:

Variables in XPath expressions
------------------------------

XPath allows you to reference variables in your XPath expressions, using the
``$somevariable`` syntax. This is somewhat similar to parameterized queries or
prepared statements in the SQL world, where you replace some arguments in your
queries with placeholders like ``?``, which are then substituted with values
passed with the query.

Here's an example to match an element based on its normalized string-value::

    >>> str_to_match = "Name: My image 3"
    >>> selector.xpath('//a[normalize-space(.)=$match]',
    ...                match=str_to_match).get()
    '<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>'

All variable references must have a binding value when calling ``.xpath()``
(otherwise you'll get a ``ValueError: XPath error:`` exception). This is done
by passing as many named arguments as necessary.

Here's another example using a position range passed as two integers::

    >>> start, stop = 2, 4
    >>> selector.xpath('//a[position()>=$_from and position()<=$_to]',
    ...                _from=start, _to=stop).getall()
    ['<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
     '<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>',
     '<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>']

Named variables can be useful when strings need to be escaped for single or
double quote characters. The example below would be a bit tricky to get right
(or legible) without a variable reference::

    >>> html = u'''<html>
    ... <body>
    ...   <p>He said: "I don't know why, but I like mixing single and double quotes!"</p>
    ... </body>
    ... </html>'''
    >>> selector = Selector(text=html)
    >>>
    >>> selector.xpath('//p[contains(., $mystring)]',
    ...                mystring='''He said: "I don't know''').get()
    '<p>He said: "I don\'t know why, but I like mixing single and double quotes!"</p>'

Converting CSS to XPath
-----------------------

.. autofunction:: parsel.css2xpath

When you're using an API that only accepts XPath expressions, it's sometimes
useful to convert CSS to XPath. This allows you to combine the conciseness of
CSS for querying elements by class with the ease of manipulating XPath
expressions.

On those occasions, use the function :func:`~parsel.css2xpath`::

    >>> from parsel import css2xpath
    >>> css2xpath('h1.title')
    "descendant-or-self::h1[@class and contains(concat(' ', normalize-space(@class), ' '), ' title ')]"
    >>> css2xpath('.profile-data') + '//h2'
    "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' profile-data ')]//h2"

As you can see from the examples above, it returns the CSS query translated
into an XPath expression, as a string, which you can use as-is or combine to
build a more complex expression, before feeding it to a function expecting
XPath.

Similar libraries
=================

* `BeautifulSoup`_ is a very popular screen scraping library among Python
  programmers which constructs a Python object based on the structure of the
  HTML code and also deals with bad markup reasonably well.

* `lxml`_ is an XML parsing library (which also parses HTML) with a pythonic
  API based on `ElementTree`_ (lxml is not part of the Python standard
  library). Parsel uses it under the hood.

* `PyQuery`_ is a library that, like Parsel, uses `lxml`_ and `cssselect`_
  under the hood, but it offers a jQuery-like API to traverse and manipulate
  XML/HTML documents.

Parsel is built on top of the `lxml`_ library, which means they're very
similar in speed and parsing accuracy. The advantage of using Parsel over
`lxml`_ is that Parsel is simpler to use and extend, unlike the `lxml`_ API
which is much bigger, because the `lxml`_ library can be used for many other
tasks besides selecting markup documents.

.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
.. _lxml: http://lxml.de/
.. _PyQuery: https://pypi.python.org/pypi/pyquery
.. _ElementTree: https://docs.python.org/2/library/xml.etree.elementtree.html
.. _cssselect: https://pypi.python.org/pypi/cssselect/
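Worked example: registering an XPath extension
==============================================

As promised in the :func:`~parsel.xpathfuncs.set_xpathfunc` reference above,
here is a minimal sketch of registering a custom XPath function. The
``lowercase`` name and the function body are made up for illustration; the
registration and unregistration calls are the real API::

    >>> from parsel import Selector
    >>> from parsel.xpathfuncs import set_xpathfunc
    >>> def lowercase(context, s):
    ...     # `context` is the lxml evaluation context; `s` is the evaluated
    ...     # XPath argument (a string here, thanks to string(.) below).
    ...     return s.lower()
    ...
    >>> set_xpathfunc('lowercase', lowercase)
    >>> sel = Selector(text=u'<p>HELLO</p><p>world</p>')
    >>> sel.xpath('//p[lowercase(string(.)) = "hello"]/text()').get()
    'HELLO'
    >>> set_xpathfunc('lowercase', None)  # passing None unregisters it

Passing ``string(.)`` instead of ``.//text()`` keeps the argument a plain
string, as discussed in the tips above.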
parsel-1.5.2/parsel/__init__.py

"""
Parsel lets you extract text from XML/HTML documents
using XPath or CSS selectors
"""

__author__ = 'Scrapy project'
__email__ = 'info@scrapy.org'
__version__ = '1.5.2'

from parsel.selector import Selector, SelectorList  # NOQA
from parsel.csstranslator import css2xpath  # NOQA
from parsel import xpathfuncs  # NOQA

xpathfuncs.setup()

parsel-1.5.2/parsel/csstranslator.py

import six

if six.PY2:
    from functools32 import lru_cache
else:
    from functools import lru_cache

from cssselect import GenericTranslator as OriginalGenericTranslator
from cssselect import HTMLTranslator as OriginalHTMLTranslator
from cssselect.xpath import XPathExpr as OriginalXPathExpr
from cssselect.xpath import _unicode_safe_getattr, ExpressionError
from cssselect.parser import parse, FunctionalPseudoElement


class XPathExpr(OriginalXPathExpr):

    textnode = False
    attribute = None

    @classmethod
    def from_xpath(cls, xpath, textnode=False, attribute=None):
        x = cls(path=xpath.path, element=xpath.element,
                condition=xpath.condition)
        x.textnode = textnode
        x.attribute = attribute
        return x

    def __str__(self):
        path = super(XPathExpr, self).__str__()
        if self.textnode:
            if path == '*':
                path = 'text()'
            elif path.endswith('::*/*'):
                path = path[:-3] + 'text()'
            else:
                path += '/text()'

        if self.attribute is not None:
            if path.endswith('::*/*'):
                path = path[:-2]
            path += '/@%s' % self.attribute

        return path

    def join(self, combiner, other):
        super(XPathExpr, self).join(combiner, other)
        self.textnode = other.textnode
        self.attribute = other.attribute
        return self
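
# Editor's note (illustrative, not upstream code): the `textnode` and
# `attribute` flags set above survive CSS combinators because join() copies
# them from the right-hand expression. E.g. in 'div > a::attr(href)' the
# trailing '/@href' is emitted for the final 'a' step of the combined path,
# not for 'div'.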

class TranslatorMixin(object):
    """This mixin adds support for CSS pseudo-elements via dynamic dispatch.

    Currently supported pseudo-elements are ``::text`` and
    ``::attr(ATTR_NAME)``.
    """

    def xpath_element(self, selector):
        xpath = super(TranslatorMixin, self).xpath_element(selector)
        return XPathExpr.from_xpath(xpath)

    def xpath_pseudo_element(self, xpath, pseudo_element):
        """
        Dispatch method that transforms XPath to support pseudo-element
        """
        if isinstance(pseudo_element, FunctionalPseudoElement):
            method = 'xpath_%s_functional_pseudo_element' % (
                pseudo_element.name.replace('-', '_'))
            method = _unicode_safe_getattr(self, method, None)
            if not method:
                raise ExpressionError(
                    "The functional pseudo-element ::%s() is unknown"
                    % pseudo_element.name)
            xpath = method(xpath, pseudo_element)
        else:
            method = 'xpath_%s_simple_pseudo_element' % (
                pseudo_element.replace('-', '_'))
            method = _unicode_safe_getattr(self, method, None)
            if not method:
                raise ExpressionError(
                    "The pseudo-element ::%s is unknown"
                    % pseudo_element)
            xpath = method(xpath)
        return xpath

    def xpath_attr_functional_pseudo_element(self, xpath, function):
        """Support selecting attribute values using ::attr() pseudo-element
        """
        if function.argument_types() not in (['STRING'], ['IDENT']):
            raise ExpressionError(
                "Expected a single string or ident for ::attr(), got %r"
                % function.arguments)
        return XPathExpr.from_xpath(xpath,
                                    attribute=function.arguments[0].value)

    def xpath_text_simple_pseudo_element(self, xpath):
        """Support selecting text nodes using ::text pseudo-element"""
        return XPathExpr.from_xpath(xpath, textnode=True)


class GenericTranslator(TranslatorMixin, OriginalGenericTranslator):
    @lru_cache(maxsize=256)
    def css_to_xpath(self, css, prefix='descendant-or-self::'):
        return super(GenericTranslator, self).css_to_xpath(css, prefix)


class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
    @lru_cache(maxsize=256)
    def css_to_xpath(self, css, prefix='descendant-or-self::'):
        return super(HTMLTranslator, self).css_to_xpath(css, prefix)


_translator = HTMLTranslator()


def css2xpath(query):
    "Return translated XPath version of a given CSS query"
    return _translator.css_to_xpath(query)
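
# A quick illustration of the pseudo-element handling above (editor's sketch,
# not part of the distribution; the outputs follow from XPathExpr.__str__):
#
#     >>> from parsel import css2xpath
#     >>> css2xpath('a::attr(href)')
#     'descendant-or-self::a/@href'
#     >>> css2xpath('p::text')
#     'descendant-or-self::p/text()'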
""" body = text.strip().replace('\x00', '').encode('utf8') or b'<html/>' parser = parser_cls(recover=True, encoding='utf8') root = etree.fromstring(body, parser=parser, base_url=base_url) if root is None: root = etree.fromstring(b'<html/>', parser=parser, base_url=base_url) return root class SelectorList(list): """ The :class:`SelectorList` class is a subclass of the builtin ``list`` class, which provides a few additional methods. """ # __getslice__ is deprecated but `list` builtin implements it only in Py2 def __getslice__(self, i, j): o = super(SelectorList, self).__getslice__(i, j) return self.__class__(o) def __getitem__(self, pos): o = super(SelectorList, self).__getitem__(pos) return self.__class__(o) if isinstance(pos, slice) else o def __getstate__(self): raise TypeError("can't pickle SelectorList objects") def xpath(self, xpath, namespaces=None, **kwargs): """ Call the ``.xpath()`` method for each element in this list and return their results flattened as another :class:`SelectorList`. ``query`` is the same argument as the one in :meth:`Selector.xpath` ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict) for additional prefixes to those registered with ``register_namespace(prefix, uri)``. Contrary to ``register_namespace()``, these prefixes are not saved for future calls. Any additional named arguments can be used to pass values for XPath variables in the XPath expression, e.g.:: selector.xpath('//a[href=$url]', url="http://www.example.com") """ return self.__class__(flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self])) def css(self, query): """ Call the ``.css()`` method for each element in this list and return their results flattened as another :class:`SelectorList`. ``query`` is the same argument as the one in :meth:`Selector.css` """ return self.__class__(flatten([x.css(query) for x in self])) def re(self, regex, replace_entities=True): """ Call the ``.re()`` method for each element in this list and return their results flattened, as a list of unicode strings. By default, character entity references are replaced by their corresponding character (except for ``&`` and ``<``. Passing ``replace_entities`` as ``False`` switches off these replacements. """ return flatten([x.re(regex, replace_entities=replace_entities) for x in self]) def re_first(self, regex, default=None, replace_entities=True): """ Call the ``.re()`` method for the first element in this list and return the result in an unicode string. If the list is empty or the regex doesn't match anything, return the default value (``None`` if the argument is not provided). By default, character entity references are replaced by their corresponding character (except for ``&`` and ``<``. Passing ``replace_entities`` as ``False`` switches off these replacements. """ for el in iflatten(x.re(regex, replace_entities=replace_entities) for x in self): return el else: return default def getall(self): """ Call the ``.get()`` method for each element is this list and return their results flattened, as a list of unicode strings. """ return [x.get() for x in self] extract = getall def get(self, default=None): """ Return the result of ``.get()`` for the first element in this list. If the list is empty, return the default value. """ for x in self: return x.get() else: return default extract_first = get @property def attrib(self): """Return the attributes dictionary for the first element. If the list is empty, return an empty dict. 
""" for x in self: return x.attrib else: return {} class Selector(object): """ :class:`Selector` allows you to select parts of an XML or HTML text using CSS or XPath expressions and extract data from it. ``text`` is a ``unicode`` object in Python 2 or a ``str`` object in Python 3 ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default). If ``type`` is ``None``, the selector defaults to ``"html"``. """ __slots__ = ['text', 'namespaces', 'type', '_expr', 'root', '__weakref__', '_parser', '_csstranslator', '_tostring_method'] _default_type = None _default_namespaces = { "re": "http://exslt.org/regular-expressions", # supported in libxslt: # set:difference # set:has-same-node # set:intersection # set:leading # set:trailing "set": "http://exslt.org/sets" } _lxml_smart_strings = False selectorlist_cls = SelectorList def __init__(self, text=None, type=None, namespaces=None, root=None, base_url=None, _expr=None): self.type = st = _st(type or self._default_type) self._parser = _ctgroup[st]['_parser'] self._csstranslator = _ctgroup[st]['_csstranslator'] self._tostring_method = _ctgroup[st]['_tostring_method'] if text is not None: if not isinstance(text, six.text_type): raise TypeError("text argument should be of type %s" % six.text_type) root = self._get_root(text, base_url) elif root is None: raise ValueError("Selector needs either text or root argument") self.namespaces = dict(self._default_namespaces) if namespaces is not None: self.namespaces.update(namespaces) self.root = root self._expr = _expr def __getstate__(self): raise TypeError("can't pickle Selector objects") def _get_root(self, text, base_url=None): return create_root_node(text, self._parser, base_url=base_url) def xpath(self, query, namespaces=None, **kwargs): """ Find nodes matching the xpath ``query`` and return the result as a :class:`SelectorList` instance with all elements flattened. List elements implement :class:`Selector` interface too. ``query`` is a string containing the XPATH query to apply. ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict) for additional prefixes to those registered with ``register_namespace(prefix, uri)``. Contrary to ``register_namespace()``, these prefixes are not saved for future calls. Any additional named arguments can be used to pass values for XPath variables in the XPath expression, e.g.:: selector.xpath('//a[href=$url]', url="http://www.example.com") """ try: xpathev = self.root.xpath except AttributeError: return self.selectorlist_cls([]) nsp = dict(self.namespaces) if namespaces is not None: nsp.update(namespaces) try: result = xpathev(query, namespaces=nsp, smart_strings=self._lxml_smart_strings, **kwargs) except etree.XPathError as exc: msg = u"XPath error: %s in %s" % (exc, query) msg = msg if six.PY3 else msg.encode('unicode_escape') six.reraise(ValueError, ValueError(msg), sys.exc_info()[2]) if type(result) is not list: result = [result] result = [self.__class__(root=x, _expr=query, namespaces=self.namespaces, type=self.type) for x in result] return self.selectorlist_cls(result) def css(self, query): """ Apply the given CSS selector and return a :class:`SelectorList` instance. ``query`` is a string containing the CSS selector to apply. In the background, CSS queries are translated into XPath queries using `cssselect`_ library and run ``.xpath()`` method. .. 
_cssselect: https://pypi.python.org/pypi/cssselect/ """ return self.xpath(self._css2xpath(query)) def _css2xpath(self, query): return self._csstranslator.css_to_xpath(query) def re(self, regex, replace_entities=True): """ Apply the given regex and return a list of unicode strings with the matches. ``regex`` can be either a compiled regular expression or a string which will be compiled to a regular expression using ``re.compile(regex)``. By default, character entity references are replaced by their corresponding character (except for ``&`` and ``<``). Passing ``replace_entities`` as ``False`` switches off these replacements. """ return extract_regex(regex, self.get(), replace_entities=replace_entities) def re_first(self, regex, default=None, replace_entities=True): """ Apply the given regex and return the first unicode string which matches. If there is no match, return the default value (``None`` if the argument is not provided). By default, character entity references are replaced by their corresponding character (except for ``&`` and ``<``). Passing ``replace_entities`` as ``False`` switches off these replacements. """ return next(iflatten(self.re(regex, replace_entities=replace_entities)), default) def get(self): """ Serialize and return the matched nodes in a single unicode string. Percent encoded content is unquoted. """ try: return etree.tostring(self.root, method=self._tostring_method, encoding='unicode', with_tail=False) except (AttributeError, TypeError): if self.root is True: return u'1' elif self.root is False: return u'0' else: return six.text_type(self.root) extract = get def getall(self): """ Serialize and return the matched node in a 1-element list of unicode strings. """ return [self.get()] def register_namespace(self, prefix, uri): """ Register the given namespace to be used in this :class:`Selector`. Without registering namespaces you can't select or extract data from non-standard namespaces. See :ref:`selector-examples-xml`. """ self.namespaces[prefix] = uri def remove_namespaces(self): """ Remove all namespaces, allowing to traverse the document using namespace-less xpaths. See :ref:`removing-namespaces`. """ for el in self.root.iter('*'): if el.tag.startswith('{'): el.tag = el.tag.split('}', 1)[1] # loop on element attributes also for an in el.attrib.keys(): if an.startswith('{'): el.attrib[an.split('}', 1)[1]] = el.attrib.pop(an) # remove namespace declarations etree.cleanup_namespaces(self.root) @property def attrib(self): """Return the attributes dictionary for underlying element. """ return dict(self.root.attrib) def __bool__(self): """ Return ``True`` if there is any real content selected or ``False`` otherwise. In other words, the boolean value of a :class:`Selector` is given by the contents it selects. 
""" return bool(self.get()) __nonzero__ = __bool__ def __str__(self): data = repr(shorten(self.get(), width=40)) return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data) __repr__ = __str__ ����������������������������������������������������������������������������������������������������������������������������������������������������������parsel-1.5.2/parsel/utils.py������������������������������������������������������������������������0000664�0000000�0000000�00000005315�13523253702�0016013�0����������������������������������������������������������������������������������������������������ustar�00root����������������������������root����������������������������0000000�0000000������������������������������������������������������������������������������������������������������������������������������������������������������������������������import re import six from w3lib.html import replace_entities as w3lib_replace_entities def flatten(x): """flatten(sequence) -> list Returns a single, flat list which contains all elements retrieved from the sequence and all recursively contained sub-sequences (iterables). Examples: >>> [1, 2, [3,4], (5,6)] [1, 2, [3, 4], (5, 6)] >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)]) [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10] >>> flatten(["foo", "bar"]) ['foo', 'bar'] >>> flatten(["foo", ["baz", 42], "bar"]) ['foo', 'baz', 42, 'bar'] """ return list(iflatten(x)) def iflatten(x): """iflatten(sequence) -> iterator Similar to ``.flatten()``, but returns iterator instead""" for el in x: if _is_listlike(el): for el_ in flatten(el): yield el_ else: yield el def _is_listlike(x): """ >>> _is_listlike("foo") False >>> _is_listlike(5) False >>> _is_listlike(b"foo") False >>> _is_listlike([b"foo"]) True >>> _is_listlike((b"foo",)) True >>> _is_listlike({}) True >>> _is_listlike(set()) True >>> _is_listlike((x for x in range(3))) True >>> _is_listlike(six.moves.xrange(5)) True """ return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes)) def extract_regex(regex, text, replace_entities=True): """Extract a list of unicode strings from the given text/encoding using the following policies: * if the regex contains a named group called "extract" that will be returned * if the regex contains multiple numbered groups, all those will be returned (flattened) * if the regex doesn't contain any group the entire regex matching is returned """ if isinstance(regex, six.string_types): regex = re.compile(regex, re.UNICODE) if 'extract' in regex.groupindex: # named group try: extracted = regex.search(text).group('extract') except AttributeError: strings = [] else: strings = [extracted] if extracted is not None else [] else: # full regex or numbered groups strings = regex.findall(text) strings = flatten(strings) if not replace_entities: return strings return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings] def shorten(text, width, suffix='...'): """Truncate the given text to fit in the given width.""" if len(text) <= width: return text if width > len(suffix): return text[:width-len(suffix)] + suffix if width >= 0: return suffix[len(suffix)-width:] raise ValueError('width must be equal or greater than 0') 
parsel-1.5.2/parsel/xpathfuncs.py

import re

from lxml import etree
from six import string_types

from w3lib.html import HTML5_WHITESPACE

regex = '[{}]+'.format(HTML5_WHITESPACE)
replace_html5_whitespaces = re.compile(regex).sub


def set_xpathfunc(fname, func):
    """Register a custom extension function to use in XPath expressions.

    The function ``func`` registered under ``fname`` identifier will be called
    for every matching node, being passed a ``context`` parameter as well as
    any parameters passed from the corresponding XPath expression.

    If ``func`` is ``None``, the extension function will be removed.

    See more `in lxml documentation`_.

    .. _`in lxml documentation`: http://lxml.de/extensions.html#xpath-extension-functions

    """
    ns_fns = etree.FunctionNamespace(None)
    if func is not None:
        ns_fns[fname] = func
    else:
        del ns_fns[fname]


def setup():
    set_xpathfunc('has-class', has_class)
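
# Note (editor's comment, not upstream): setup() is invoked from
# parsel/__init__.py at import time, so the has-class() XPath function is
# available as soon as parsel is imported.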

def has_class(context, *classes):
    """has-class function.

    Return True if all ``classes`` are present in element's class attr.

    """
    if not context.eval_context.get('args_checked'):
        if not classes:
            raise ValueError(
                'XPath error: has-class must have at least 1 argument')
        for c in classes:
            if not isinstance(c, string_types):
                raise ValueError(
                    'XPath error: has-class arguments must be strings')
        context.eval_context['args_checked'] = True

    node_cls = context.context_node.get('class')
    if node_cls is None:
        return False
    node_cls = ' ' + node_cls + ' '
    node_cls = replace_html5_whitespaces(' ', node_cls)
    for cls in classes:
        if ' ' + cls + ' ' not in node_cls:
            return False
    return True

parsel-1.5.2/pytest.ini

[pytest]
addopts = --doctest-modules --assert=plain --ignore=setup.py

parsel-1.5.2/release.rst

Release procedures
------------------

* Update NEWS file with the release notes.
  Review changes using:
  ``restview --pypi-strict <(cat README.rst NEWS | grep -v ':changelog')``

* Run bumpversion with the proper release type

* Push code and tags to GitHub to trigger build

* Copy release notes to https://github.com/scrapy/parsel/releases

* Verify in a temporary virtualenv that ``pip install parsel`` installs the
  latest version

* Update version builds at: https://readthedocs.org/projects/parsel/versions/
  You should ensure that the previous stable version is active, and point
  stable to the new tag

parsel-1.5.2/setup.cfg

[wheel]
universal=1

[aliases]
test=pytest

parsel-1.5.2/setup.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys

from pkg_resources import parse_version
from setuptools import setup, __version__ as setuptools_version


with open('README.rst') as readme_file:
    readme = readme_file.read()

with open('NEWS') as history_file:
    history = history_file.read().replace('.. :changelog:', '')

test_requirements = [
]


def has_environment_marker_platform_impl_support():
    """Code extracted from 'pytest/setup.py'
    https://github.com/pytest-dev/pytest/blob/7538680c/setup.py#L31

    The first known release to support environment markers with range
    operators is 18.5, see:
    https://setuptools.readthedocs.io/en/latest/history.html#id235
    """
    return parse_version(setuptools_version) >= parse_version('18.5')


install_requires = [
    'w3lib>=1.19.0',
    'lxml;python_version!="3.4"',
    'lxml<=4.3.5;python_version=="3.4"',
    'six>=1.5.2',
    'cssselect>=0.9'
]
extras_require = {}

if not has_environment_marker_platform_impl_support():
    if sys.version_info[0:2] < (3, 0):
        install_requires.append("functools32")
else:
    extras_require[":python_version<'3.0'"] = ["functools32"]


setup(
    name='parsel',
    version='1.5.2',
    description="Parsel is a library to extract data from HTML and XML using XPath and CSS selectors",
    long_description=readme + '\n\n' + history,
    author="Scrapy project",
    author_email='info@scrapy.org',
    url='https://github.com/scrapy/parsel',
    packages=[
        'parsel',
    ],
    package_dir={'parsel': 'parsel'},
    include_package_data=True,
    install_requires=install_requires,
    extras_require=extras_require,
    license="BSD",
    zip_safe=False,
    keywords='parsel',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Natural Language :: English',
        'Topic :: Text Processing :: Markup',
        'Topic :: Text Processing :: Markup :: HTML',
        'Topic :: Text Processing :: Markup :: XML',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: Implementation :: CPython',
        'Programming Language :: Python :: Implementation :: PyPy',
    ],
    setup_requires=['pytest-runner', ],
    tests_require=['pytest', ],
    test_suite='tests',
)

parsel-1.5.2/tests/requirements.txt

pytest
pytest-cov
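
# Editor's note (hypothetical invocation, not part of the distribution): with
# the requirements above installed, the suite can be run from the repository
# root with `python -m pytest`; pytest.ini then adds --doctest-modules, so
# docstring examples are collected as tests while setup.py is ignored.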
parsel-1.5.2/tests/test_selector.py

# -*- coding: utf-8 -*-
import re
import weakref

import six
import unittest
import pickle

from parsel import Selector


class SelectorTestCase(unittest.TestCase):

    sscls = Selector

    def test_pickle_selector(self):
        sel = self.sscls(text=u'<html><body><p>some text</p></body></html>')
        self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel)

    def test_pickle_selector_list(self):
        sel = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
        sel_list = sel.css('li')
        empty_sel_list = sel.css('p')
        self.assertIsInstance(sel_list, self.sscls.selectorlist_cls)
        self.assertIsInstance(empty_sel_list, self.sscls.selectorlist_cls)
        self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list)
        self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), empty_sel_list)

    def test_simple_selection(self):
        """Simple selector tests"""
        body = u"<p><input name='a'value='1'/><input name='b'value='2'/></p>"
        sel = self.sscls(text=body)

        xl = sel.xpath('//input')
        self.assertEqual(2, len(xl))
        for x in xl:
            assert isinstance(x, self.sscls)

        self.assertEqual(sel.xpath('//input').extract(),
                         [x.extract() for x in sel.xpath('//input')])

        self.assertEqual([x.extract() for x in sel.xpath("//input[@name='a']/@name")],
                         [u'a'])
        self.assertEqual([x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
                         [u'12.0'])

        self.assertEqual(sel.xpath("concat('xpath', 'rules')").extract(),
                         [u'xpathrules'])
        self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
                         [u'12'])

    def test_simple_selection_with_variables(self):
        """Using XPath variables"""
        body = u"<p><input name='a' value='1'/><input name='b' value='2'/></p>"
        sel = self.sscls(text=body)

        self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)],
                         [u'a'])
        self.assertEqual([x.extract() for x in sel.xpath("//input[@name=$letter]/@value", letter='b')],
                         [u'2'])

        self.assertEqual(sel.xpath("count(//input[@value=$number or @name=$letter])",
                                   number=2, letter='a').extract(),
                         [u'2.0'])

        # you can also pass booleans
        self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
                                   cnt=2, test=True).extract(),
                         [u'1'])
        self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
                                   cnt=4, test=True).extract(),
                         [u'0'])
        self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
                                   cnt=4, test=False).extract(),
                         [u'1'])

        # for named nodes, you need to use "name()=node_name"
self.assertEqual(sel.xpath("boolean(count(//*[name()=$tag])=$cnt)=$test", tag="input", cnt=2, test=True).extract(), [u'1']) def test_simple_selection_with_variables_escape_friendly(self): """Using XPath variables with quotes that would need escaping with string formatting""" body = u"""<p>I'm mixing single and <input name='a' value='I say "Yeah!"'/> "double quotes" and I don't care :)</p>""" sel = self.sscls(text=body) t = 'I say "Yeah!"' # naive string formatting with give something like: # ValueError: XPath error: Invalid predicate in //input[@value="I say "Yeah!""]/@name self.assertRaises(ValueError, sel.xpath, '//input[@value="{}"]/@name'.format(t)) # with XPath variables, escaping is done for you self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)], [u'a']) lt = """I'm mixing single and "double quotes" and I don't care :)""" # the following gives you something like # ValueError: XPath error: Invalid predicate in //p[normalize-space()='I'm mixing single and "double quotes" and I don't care :)']//@name self.assertRaises(ValueError, sel.xpath, "//p[normalize-space()='{}']//@name".format(lt)) self.assertEqual([x.extract() for x in sel.xpath("//p[normalize-space()=$lng]//@name", lng=lt)], [u'a']) def test_accessing_attributes(self): body = u""" <html lang="en" version="1.0"> <body> <ul id="some-list" class="list-cls" class="list-cls"> <li class="item-cls" id="list-item-1"> <li class="item-cls active" id="list-item-2"> <li class="item-cls" id="list-item-3"> </ul> </body> </html> """ sel = self.sscls(text=body) self.assertEqual({'lang': 'en', 'version': '1.0'}, sel.attrib) self.assertEqual({'id': 'some-list', 'class': 'list-cls'}, sel.css('ul')[0].attrib) # for a SelectorList, bring the attributes of first-element only self.assertEqual({'id': 'some-list', 'class': 'list-cls'}, sel.css('ul').attrib) self.assertEqual({'class': 'item-cls', 'id': 'list-item-1'}, sel.css('li').attrib) self.assertEqual({}, sel.css('body').attrib) self.assertEqual({}, sel.css('non-existing-element').attrib) self.assertEqual( [{'class': 'item-cls', 'id': 'list-item-1'}, {'class': 'item-cls active', 'id': 'list-item-2'}, {'class': 'item-cls', 'id': 'list-item-3'}], [e.attrib for e in sel.css('li')]) def test_representation_slice(self): body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b') sel = self.sscls(text=body) representation = "<Selector xpath='//input/@name' data='{}...'>".format(37 * 'b') if six.PY2: representation = "<Selector xpath='//input/@name' data=u'{}...'>".format(37 * 'b') self.assertEqual( [repr(it) for it in sel.xpath('//input/@name')], [representation] ) def test_representation_unicode_query(self): body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b') representation = '<Selector xpath=\'//input[@value="©"]/@value\' data=\'©\'>' if six.PY2: representation = "<Selector xpath=u'//input[@value=\"\\xa9\"]/@value' data=u'\\xa9'>" sel = self.sscls(text=body) self.assertEqual( [repr(it) for it in sel.xpath(u'//input[@value="\xa9"]/@value')], [representation] ) def test_check_text_argument_type(self): self.assertRaisesRegexp(TypeError, 'text argument should be of type', self.sscls, b'<html/>') def test_extract_first(self): """Test if extract_first() returns first element""" body = u'<ul><li id="1">1</li><li id="2">2</li></ul>' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//ul/li/text()').extract_first(), sel.xpath('//ul/li/text()').extract()[0]) self.assertEqual(sel.xpath('//ul/li[@id="1"]/text()').extract_first(), 
sel.xpath('//ul/li[@id="1"]/text()').extract()[0]) self.assertEqual(sel.xpath('//ul/li[2]/text()').extract_first(), sel.xpath('//ul/li/text()').extract()[1]) self.assertEqual(sel.xpath('/ul/li[@id="doesnt-exist"]/text()').extract_first(), None) def test_extract_first_default(self): """Test if extract_first() returns default value when no results found""" body = u'<ul><li id="1">1</li><li id="2">2</li></ul>' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//div/text()').extract_first(default='missing'), 'missing') def test_selector_get_alias(self): """Test if get() returns extracted value on a Selector""" body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//ul/li[position()>1]')[0].get(), u'<li id="2">2</li>') self.assertEqual(sel.xpath('//ul/li[position()>1]/text()')[0].get(), u'2') def test_selector_getall_alias(self): """Test if get() returns extracted value on a Selector""" body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>' sel = self.sscls(text=body) self.assertListEqual(sel.xpath('//ul/li[position()>1]')[0].getall(), [u'<li id="2">2</li>']) self.assertListEqual(sel.xpath('//ul/li[position()>1]/text()')[0].getall(), [u'2']) def test_selectorlist_get_alias(self): """Test if get() returns first element for a selection call""" body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//ul/li').get(), u'<li id="1">1</li>') self.assertEqual(sel.xpath('//ul/li/text()').get(), u'1') def test_re_first(self): """Test if re_first() returns first matched element""" body = u'<ul><li id="1">1</li><li id="2">2</li></ul>' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//ul/li/text()').re_first(r'\d'), sel.xpath('//ul/li/text()').re(r'\d')[0]) self.assertEqual(sel.xpath('//ul/li[@id="1"]/text()').re_first(r'\d'), sel.xpath('//ul/li[@id="1"]/text()').re(r'\d')[0]) self.assertEqual(sel.xpath('//ul/li[2]/text()').re_first(r'\d'), sel.xpath('//ul/li/text()').re(r'\d')[1]) self.assertEqual(sel.xpath('/ul/li/text()').re_first(r'\w+'), None) self.assertEqual(sel.xpath('/ul/li[@id="doesnt-exist"]/text()').re_first(r'\d'), None) self.assertEqual(sel.re_first(r'id="(\d+)'), '1') self.assertEqual(sel.re_first(r'foo'), None) self.assertEqual(sel.re_first(r'foo', default='bar'), 'bar') def test_extract_first_default(self): """Test if re_first() returns default value when no results found""" body = u'<ul><li id="1">1</li><li id="2">2</li></ul>' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//div/text()').re_first(r'\w+', default='missing'), 'missing') self.assertEqual(sel.xpath('/ul/li/text()').re_first(r'\w+', default='missing'), 'missing') def test_select_unicode_query(self): body = u"<p><input name='\xa9' value='1'/></p>" sel = self.sscls(text=body) self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1']) def test_list_elements_type(self): """Test Selector returning the same type in selection methods""" text = u'<p>test<p>' assert isinstance(self.sscls(text=text).xpath("//p")[0], self.sscls) assert isinstance(self.sscls(text=text).css("p")[0], self.sscls) def test_boolean_result(self): body = u"<p><input name='a'value='1'/><input name='b'value='2'/></p>" xs = self.sscls(text=body) self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), [u'1']) self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), [u'0']) def test_differences_parsing_xml_vs_html(self): """Test that XML and 
HTML Selector's behave differently""" # some text which is parsed differently by XML and HTML flavors text = u'<div><img src="a.jpg"><p>Hello</div>' hs = self.sscls(text=text, type='html') self.assertEqual(hs.xpath("//div").extract(), [u'<div><img src="a.jpg"><p>Hello</p></div>']) xs = self.sscls(text=text, type='xml') self.assertEqual(xs.xpath("//div").extract(), [u'<div><img src="a.jpg"><p>Hello</p></img></div>']) def test_error_for_unknown_selector_type(self): self.assertRaises(ValueError, self.sscls, text=u'', type='_na_') def test_text_or_root_is_required(self): self.assertRaisesRegexp(ValueError, 'Selector needs either text or root argument', self.sscls) def test_bool(self): text = u'<a href="" >false</a><a href="nonempty">true</a>' hs = self.sscls(text=text, type='html') falsish = hs.xpath('//a/@href')[0] self.assertEqual(falsish.extract(), u'') self.assertFalse(falsish) trueish = hs.xpath('//a/@href')[1] self.assertEqual(trueish.extract(), u'nonempty') self.assertTrue(trueish) def test_slicing(self): text = u'<div><p>1</p><p>2</p><p>3</p></div>' hs = self.sscls(text=text, type='html') self.assertIsInstance(hs.css('p')[2], self.sscls) self.assertIsInstance(hs.css('p')[2:3], self.sscls.selectorlist_cls) self.assertIsInstance(hs.css('p')[:2], self.sscls.selectorlist_cls) self.assertEqual(hs.css('p')[2:3].extract(), [u'<p>3</p>']) self.assertEqual(hs.css('p')[1:3].extract(), [u'<p>2</p>', u'<p>3</p>']) def test_nested_selectors(self): """Nested selector tests""" body = u"""<body> <div class='one'> <ul> <li>one</li><li>two</li> </ul> </div> <div class='two'> <ul> <li>four</li><li>five</li><li>six</li> </ul> </div> </body>""" x = self.sscls(text=body) divtwo = x.xpath('//div[@class="two"]') self.assertEqual(divtwo.xpath("//li").extract(), ["<li>one</li>", "<li>two</li>", "<li>four</li>", "<li>five</li>", "<li>six</li>"]) self.assertEqual(divtwo.xpath("./ul/li").extract(), ["<li>four</li>", "<li>five</li>", "<li>six</li>"]) self.assertEqual(divtwo.xpath(".//li").extract(), ["<li>four</li>", "<li>five</li>", "<li>six</li>"]) self.assertEqual(divtwo.xpath("./li").extract(), []) def test_selectorlist_getall_alias(self): """Nested selector tests using getall()""" body = u"""<body> <div class='one'> <ul> <li>one</li><li>two</li> </ul> </div> <div class='two'> <ul> <li>four</li><li>five</li><li>six</li> </ul> </div> </body>""" x = self.sscls(text=body) divtwo = x.xpath('//div[@class="two"]') self.assertEqual(divtwo.xpath("//li").getall(), ["<li>one</li>", "<li>two</li>", "<li>four</li>", "<li>five</li>", "<li>six</li>"]) self.assertEqual(divtwo.xpath("./ul/li").getall(), ["<li>four</li>", "<li>five</li>", "<li>six</li>"]) self.assertEqual(divtwo.xpath(".//li").getall(), ["<li>four</li>", "<li>five</li>", "<li>six</li>"]) self.assertEqual(divtwo.xpath("./li").getall(), []) def test_mixed_nested_selectors(self): body = u'''<body> <div id=1>not<span>me</span></div> <div class="dos"><p>text</p><a href='#'>foo</a></div> </body>''' sel = self.sscls(text=body) self.assertEqual(sel.xpath('//div[@id="1"]').css('span::text').extract(), [u'me']) self.assertEqual(sel.css('#1').xpath('./span/text()').extract(), [u'me']) def test_dont_strip(self): sel = self.sscls(text=u'<div>fff: <a href="#">zzz</a></div>') self.assertEqual(sel.xpath("//text()").extract(), [u'fff: ', u'zzz']) def test_namespaces_simple(self): body = u""" <test xmlns:somens="http://scrapy.org"> <somens:a id="foo">take this</a> <a id="bar">found</a> </test> """ x = self.sscls(text=body, type='xml') x.register_namespace("somens", 
"http://scrapy.org") self.assertEqual(x.xpath("//somens:a/text()").extract(), [u'take this']) def test_namespaces_adhoc(self): body = u""" <test xmlns:somens="http://scrapy.org"> <somens:a id="foo">take this</a> <a id="bar">found</a> </test> """ x = self.sscls(text=body, type='xml') self.assertEqual(x.xpath("//somens:a/text()", namespaces={"somens": "http://scrapy.org"}).extract(), [u'take this']) def test_namespaces_adhoc_variables(self): body = u""" <test xmlns:somens="http://scrapy.org"> <somens:a id="foo">take this</a> <a id="bar">found</a> </test> """ x = self.sscls(text=body, type='xml') self.assertEqual(x.xpath("//somens:a/following-sibling::a[@id=$identifier]/text()", namespaces={"somens": "http://scrapy.org"}, identifier="bar").extract(), [u'found']) def test_namespaces_multiple(self): body = u"""<?xml version="1.0" encoding="UTF-8"?> <BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05" xmlns:b="http://somens.com" xmlns:p="http://www.scrapy.org/product" > <b:Operation>hello</b:Operation> <TestTag b:att="value"><Other>value</Other></TestTag> <p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag> </BrowseNode> """ x = self.sscls(text=body, type='xml') x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05") x.register_namespace("p", "http://www.scrapy.org/product") x.register_namespace("b", "http://somens.com") self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) self.assertEqual(x.xpath("//b:Operation/text()").extract()[0], 'hello') self.assertEqual(x.xpath("//xmlns:TestTag/@b:att").extract()[0], 'value') self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:price/text()").extract()[0], '90') self.assertEqual(x.xpath("//p:SecondTestTag").xpath("./xmlns:price/text()")[0].extract(), '90') self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 'iron') def test_namespaces_multiple_adhoc(self): body = u"""<?xml version="1.0" encoding="UTF-8"?> <BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05" xmlns:b="http://somens.com" xmlns:p="http://www.scrapy.org/product" > <b:Operation>hello</b:Operation> <TestTag b:att="value"><Other>value</Other></TestTag> <p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag> </BrowseNode> """ x = self.sscls(text=body, type='xml') x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05") self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) # "b" namespace is not declared yet self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att") # "b" namespace being passed ad-hoc self.assertEqual(x.xpath("//b:Operation/text()", namespaces={"b": "http://somens.com"}).extract()[0], 'hello') # "b" namespace declaration is not cached self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att") # "xmlns" is still defined self.assertEqual(x.xpath("//xmlns:TestTag/@b:att", namespaces={"b": "http://somens.com"}).extract()[0], 'value') # chained selectors still have knowledge of register_namespace() operations self.assertEqual(x.xpath("//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"}).xpath("./xmlns:price/text()")[0].extract(), '90') # but chained selector don't know about parent ad-hoc declarations self.assertRaises(ValueError,x.xpath("//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"}).xpath, "p:name/text()") # ad-hoc declarations need repeats when chaining 
self.assertEqual(x.xpath("//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"} ).xpath("p:name/text()", namespaces={"p": "http://www.scrapy.org/product"} ).extract_first(), 'Dried Rose') # declaring several ad-hoc namespaces self.assertEqual(x.xpath("""string( //b:Operation /following-sibling::xmlns:TestTag /following-sibling::*//p:name)""", namespaces={"b": "http://somens.com", "p": "http://www.scrapy.org/product"}).extract_first(), 'Dried Rose') # "p" prefix is not cached from previous calls self.assertRaises(ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()") x.register_namespace("p", "http://www.scrapy.org/product") self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 'iron') def test_make_links_absolute(self): text = u'<a href="file.html">link to file</a>' sel = Selector(text=text, base_url='http://example.com') sel.root.make_links_absolute() self.assertEqual(u'http://example.com/file.html', sel.xpath('//a/@href').extract_first()) def test_re(self): body = u"""<div>Name: Mary <ul> <li>Name: John</li> <li>Age: 10</li> <li>Name: Paul</li> <li>Age: 20</li> </ul> Age: 20 </div>""" x = self.sscls(text=body) name_re = re.compile(r"Name: (\w+)") self.assertEqual(x.xpath("//ul/li").re(name_re), ["John", "Paul"]) self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"), ["10", "20"]) # Test named group, hit and miss x = self.sscls(text=u'foobar') self.assertEqual(x.re('(?P<extract>foo)'), ['foo']) self.assertEqual(x.re('(?P<extract>baz)'), []) # A purposely constructed test for an edge case x = self.sscls(text=u'baz') self.assertEqual(x.re('(?P<extract>foo)|(?P<bar>baz)'), []) def test_re_replace_entities(self): body = u"""<script>{"foo":"bar & "baz""}</script>""" x = self.sscls(text=body) name_re = re.compile('{"foo":(.*)}') # by default, only & and < are preserved ; # other entities are converted expected = u'"bar & "baz""' self.assertEqual(x.xpath("//script/text()").re(name_re), [expected]) self.assertEqual(x.xpath("//script").re(name_re), [expected]) self.assertEqual(x.xpath("//script/text()")[0].re(name_re), [expected]) self.assertEqual(x.xpath("//script")[0].re(name_re), [expected]) # check that re_first() works the same way for single value output self.assertEqual(x.xpath("//script").re_first(name_re), expected) self.assertEqual(x.xpath("//script")[0].re_first(name_re), expected) # switching off replace_entities will preserve " also expected = u'"bar & "baz""' self.assertEqual(x.xpath("//script/text()").re(name_re, replace_entities=False), [expected]) self.assertEqual(x.xpath("//script")[0].re(name_re, replace_entities=False), [expected]) self.assertEqual(x.xpath("//script/text()").re_first(name_re, replace_entities=False), expected) self.assertEqual(x.xpath("//script")[0].re_first(name_re, replace_entities=False), expected) def test_re_intl(self): body = u'<div>Evento: cumplea\xf1os</div>' x = self.sscls(text=body) self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), [u'cumplea\xf1os']) def test_selector_over_text(self): hs = self.sscls(text=u'<root>lala</root>') self.assertEqual(hs.extract(), u'<html><body><root>lala</root></body></html>') xs = self.sscls(text=u'<root>lala</root>', type='xml') self.assertEqual(xs.extract(), u'<root>lala</root>') self.assertEqual(xs.xpath('.').extract(), [u'<root>lala</root>']) def test_invalid_xpath(self): "Test invalid xpath raises ValueError with the invalid xpath" x = self.sscls(text=u"<html></html>") xpath = "//test[@foo='bar]" self.assertRaisesRegexp(ValueError, re.escape(xpath), x.xpath, 
    def test_re_intl(self):
        body = u'<div>Evento: cumplea\xf1os</div>'
        x = self.sscls(text=body)
        self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), [u'cumplea\xf1os'])

    def test_selector_over_text(self):
        hs = self.sscls(text=u'<root>lala</root>')
        self.assertEqual(hs.extract(), u'<html><body><root>lala</root></body></html>')
        xs = self.sscls(text=u'<root>lala</root>', type='xml')
        self.assertEqual(xs.extract(), u'<root>lala</root>')
        self.assertEqual(xs.xpath('.').extract(), [u'<root>lala</root>'])

    def test_invalid_xpath(self):
        "Test invalid xpath raises ValueError with the invalid xpath"
        x = self.sscls(text=u"<html></html>")
        xpath = "//test[@foo='bar]"
        self.assertRaisesRegexp(ValueError, re.escape(xpath), x.xpath, xpath)

    def test_invalid_xpath_unicode(self):
        "Test *Unicode* invalid xpath raises ValueError with the invalid xpath"
        x = self.sscls(text=u"<html></html>")
        xpath = u"//test[@foo='\u0431ar]"
        encoded = xpath if six.PY3 else xpath.encode('unicode_escape')
        self.assertRaisesRegexp(ValueError, re.escape(encoded), x.xpath, xpath)

    def test_http_header_encoding_precedence(self):
        # u'\xa3'    = pound symbol in unicode
        # '\xc2\xa3' = pound symbol in utf-8 bytes
        # '\xa3'     = pound symbol in latin-1 (iso-8859-1) bytes
        text = u'''<html>
        <head><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></head>
        <body><span id="blank">\xa3</span></body></html>'''
        x = self.sscls(text=text)
        self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(), [u'\xa3'])

    def test_empty_bodies_shouldnt_raise_errors(self):
        self.sscls(text=u'').xpath('//text()').extract()

    def test_bodies_with_comments_only(self):
        sel = self.sscls(text=u'<!-- hello world -->', base_url='http://example.com')
        self.assertEqual(u'http://example.com', sel.root.base)

    def test_null_bytes_shouldnt_raise_errors(self):
        text = u'<root>pre\x00post</root>'
        self.sscls(text).xpath('//text()').extract()

    def test_replacement_char_from_badly_encoded_body(self):
        # \xe9 alone isn't a valid utf-8 sequence
        text = u'<html><p>an Jos\ufffd de</p></html>'
        self.assertEqual([u'an Jos\ufffd de'],
                         self.sscls(text).xpath('//text()').extract())

    def test_select_on_unevaluable_nodes(self):
        r = self.sscls(text=u'<span class="big">some text</span>')
        # Text node
        x1 = r.xpath('//text()')
        self.assertEqual(x1.extract(), [u'some text'])
        self.assertEqual(x1.xpath('.//b').extract(), [])
        # Tag attribute
        x1 = r.xpath('//span/@class')
        self.assertEqual(x1.extract(), [u'big'])
        self.assertEqual(x1.xpath('.//text()').extract(), [])

    def test_select_on_text_nodes(self):
        r = self.sscls(text=u'<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>')
        x1 = r.xpath("//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]")
        self.assertEqual(x1.extract(), [u'opt1'])

        x1 = r.xpath("//div/descendant::text()/preceding-sibling::b[contains(text(), 'Options')]")
        self.assertEqual(x1.extract(), [u'<b>Options:</b>'])

    @unittest.skip("Text nodes lost parent node reference in lxml")
    def test_nested_select_on_text_nodes(self):
        # FIXME: does not work with lxml backend [upstream]
        r = self.sscls(text=u'<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>')
        x1 = r.xpath("//div/descendant::text()")
        x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]")
        self.assertEqual(x2.extract(), [u'<b>Options:</b>'])

    def test_weakref_slots(self):
        """Check that classes are using slots and are weak-referenceable"""
        x = self.sscls(text=u'')
        weakref.ref(x)
        assert not hasattr(x, '__dict__'), "%s does not use __slots__" % \
            x.__class__.__name__

    def test_remove_namespaces(self):
        xml = u"""<?xml version="1.0" encoding="UTF-8"?>
        <feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
          <link type="text/html"/>
          <entry>
            <link type="text/html"/>
          </entry>
          <link type="application/atom+xml"/>
        </feed>
        """
        sel = self.sscls(text=xml, type='xml')
        self.assertEqual(len(sel.xpath("//link")), 0)
        self.assertEqual(len(sel.xpath("./namespace::*")), 3)
        sel.remove_namespaces()
        self.assertEqual(len(sel.xpath("//link")), 3)
        self.assertEqual(len(sel.xpath("./namespace::*")), 1)
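    # Editor's example (a minimal sketch, not in the original suite; the test
    # name is a hypothetical addition): after remove_namespaces(), element
    # names lose their namespaces entirely, so plain, prefix-less queries
    # start matching.
    def test_remove_namespaces_plain_query_example(self):
        xml = u'<f:root xmlns:f="http://example.com/f"><f:child/></f:root>'
        sel = self.sscls(text=xml, type='xml')
        self.assertEqual(len(sel.xpath('//f:child',
                                       namespaces={'f': 'http://example.com/f'})), 1)
        sel.remove_namespaces()
        self.assertEqual(len(sel.xpath('//child')), 1)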
xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 100 100"> <linearGradient id="gradient"> <stop class="begin" offset="0%" style="stop-color:yellow;"/> <stop class="end" offset="80%" style="stop-color:green;"/> </linearGradient> <circle cx="50" cy="50" r="30" style="fill:url(#gradient)" /> </svg> </feed> """ sel = self.sscls(text=xml, type='xml') self.assertEqual(len(sel.xpath("//link")), 0) self.assertEqual(len(sel.xpath("//stop")), 0) self.assertEqual(len(sel.xpath("./namespace::*")), 2) self.assertEqual(len(sel.xpath("//f:link", namespaces={'f': 'http://www.w3.org/2005/Atom'})), 2) self.assertEqual(len(sel.xpath("//s:stop", namespaces={'s': 'http://www.w3.org/2000/svg'})), 2) sel.remove_namespaces() self.assertEqual(len(sel.xpath("//link")), 2) self.assertEqual(len(sel.xpath("//stop")), 2) self.assertEqual(len(sel.xpath("./namespace::*")), 1) def test_remove_attributes_namespaces(self): xml = u"""<?xml version="1.0" encoding="UTF-8"?> <feed xmlns:atom="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/"> <link atom:type="text/html"/> <entry> <link atom:type="text/html"/> </entry> <link atom:type="application/atom+xml"/> </feed> """ sel = self.sscls(text=xml, type='xml') self.assertEqual(len(sel.xpath("//link/@type")), 0) sel.remove_namespaces() self.assertEqual(len(sel.xpath("//link/@type")), 3) def test_smart_strings(self): """Lxml smart strings return values""" class SmartStringsSelector(Selector): _lxml_smart_strings = True body = u"""<body> <div class='one'> <ul> <li>one</li><li>two</li> </ul> </div> <div class='two'> <ul> <li>four</li><li>five</li><li>six</li> </ul> </div> </body>""" # .getparent() is available for text nodes and attributes # only when smart_strings are on x = self.sscls(text=body) li_text = x.xpath('//li/text()') self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), li_text))) div_class = x.xpath('//div/@class') self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), div_class))) x = SmartStringsSelector(text=body) li_text = x.xpath('//li/text()') self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), li_text))) div_class = x.xpath('//div/@class') self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), div_class))) def test_xml_entity_expansion(self): malicious_xml = u'<?xml version="1.0" encoding="ISO-8859-1"?>'\ '<!DOCTYPE foo [ <!ELEMENT foo ANY > <!ENTITY xxe SYSTEM '\ '"file:///etc/passwd" >]><foo>&xxe;</foo>' sel = self.sscls(text=malicious_xml, type='xml') self.assertEqual(sel.extract(), '<foo>&xxe;</foo>') def test_configure_base_url(self): sel = self.sscls(text=u'nothing', base_url='http://example.com') self.assertEqual(u'http://example.com', sel.root.base) def test_extending_selector(self): class MySelectorList(Selector.selectorlist_cls): pass class MySelector(Selector): selectorlist_cls = MySelectorList sel = MySelector(text=u'<html><div>foo</div></html>') self.assertIsInstance(sel.xpath('//div'), MySelectorList) self.assertIsInstance(sel.xpath('//div')[0], MySelector) self.assertIsInstance(sel.css('div'), MySelectorList) self.assertIsInstance(sel.css('div')[0], MySelector) def test_replacement_null_char_from_body(self): text = u'<html>\x00<body><p>Grainy</p></body></html>' self.assertEqual(u'<html><body><p>Grainy</p></body></html>', self.sscls(text).extract()) class ExsltTestCase(unittest.TestCase): sscls = Selector def test_regexp(self): """EXSLT regular expression tests""" body = u""" <p><input name='a' value='1'/><input name='b' value='2'/></p> <div class="links"> 
<a href="/first.html">first link</a> <a href="/second.html">second link</a> <a href="http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml">EXSLT match example</a> </div> """ sel = self.sscls(text=body) # re:test() self.assertEqual( sel.xpath( '//input[re:test(@name, "[A-Z]+", "i")]').extract(), [x.extract() for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]')]) self.assertEqual( [x.extract() for x in sel.xpath( r'//a[re:test(@href, "\.html$")]/text()')], [u'first link', u'second link']) self.assertEqual( [x.extract() for x in sel.xpath( '//a[re:test(@href, "first")]/text()')], [u'first link']) self.assertEqual( [x.extract() for x in sel.xpath( '//a[re:test(@href, "second")]/text()')], [u'second link']) # re:match() is rather special: it returns a node-set of <match> nodes #[u'<match>http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml</match>', #u'<match>http</match>', #u'<match>www.bayes.co.uk</match>', #u'<match></match>', #u'<match>/xml/index.xml?/xml/utils/rechecker.xml</match>'] self.assertEqual( sel.xpath(r're:match(//a[re:test(@href, "\.xml$")]/@href,' r'"(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)")/text()').extract(), [u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml', u'http', u'www.bayes.co.uk', u'', u'/xml/index.xml?/xml/utils/rechecker.xml']) # re:replace() self.assertEqual( sel.xpath(r're:replace(//a[re:test(@href, "\.xml$")]/@href,' r'"(\w+)://(.+)(\.xml)", "","https://\2.html")').extract(), [u'https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html']) def test_set(self): """EXSLT set manipulation tests""" # microdata example from http://schema.org/Event body = u""" <div itemscope itemtype="http://schema.org/Event"> <a itemprop="url" href="nba-miami-philidelphia-game3.html"> NBA Eastern Conference First Round Playoff Tickets: <span itemprop="name"> Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1) </span> </a> <meta itemprop="startDate" content="2016-04-21T20:00"> Thu, 04/21/16 8:00 p.m. 
<div itemprop="location" itemscope itemtype="http://schema.org/Place"> <a itemprop="url" href="wells-fargo-center.html"> Wells Fargo Center </a> <div itemprop="address" itemscope itemtype="http://schema.org/PostalAddress"> <span itemprop="addressLocality">Philadelphia</span>, <span itemprop="addressRegion">PA</span> </div> </div> <div itemprop="offers" itemscope itemtype="http://schema.org/AggregateOffer"> Priced from: <span itemprop="lowPrice">$35</span> <span itemprop="offerCount">1938</span> tickets left </div> </div> """ sel = self.sscls(text=body) self.assertEqual( sel.xpath('''//div[@itemtype="http://schema.org/Event"] //@itemprop''').extract(), [u'url', u'name', u'startDate', u'location', u'url', u'address', u'addressLocality', u'addressRegion', u'offers', u'lowPrice', u'offerCount'] ) self.assertEqual(sel.xpath(''' set:difference(//div[@itemtype="http://schema.org/Event"] //@itemprop, //div[@itemtype="http://schema.org/Event"] //*[@itemscope]/*/@itemprop)''').extract(), [u'url', u'name', u'startDate', u'location', u'offers']) ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������parsel-1.5.2/tests/test_selector_csstranslator.py���������������������������������������������������0000664�0000000�0000000�00000014070�13523253702�0022366�0����������������������������������������������������������������������������������������������������ustar�00root����������������������������root����������������������������0000000�0000000������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" Selector tests for cssselect backend """ import unittest from parsel.csstranslator import HTMLTranslator from parsel import Selector from cssselect.parser import SelectorSyntaxError from cssselect.xpath import ExpressionError HTMLBODY = u''' <html> <body> <div> <a id="name-anchor" name="foo"></a> <a id="tag-anchor" rel="tag" href="http://localhost/foo">link</a> <a id="nofollow-anchor" rel="nofollow" href="https://example.org"> link</a> <p id="paragraph"> lorem ipsum text <b id="p-b">hi</b> <em id="p-em">there</em> <b id="p-b2">guy</b> <input type="checkbox" id="checkbox-unchecked" /> <input type="checkbox" id="checkbox-disabled" disabled="" /> <input type="text" id="text-checked" checked="checked" /> <input type="hidden" /> <input type="hidden" disabled="disabled" /> <input type="checkbox" id="checkbox-checked" checked="checked" /> <input type="checkbox" id="checkbox-disabled-checked" disabled="disabled" checked="checked" /> <fieldset id="fieldset" disabled="disabled"> <input type="checkbox" id="checkbox-fieldset-disabled" /> <input type="hidden" /> </fieldset> </p> <map name="dummymap"> <area shape="circle" coords="200,250,25" href="foo.html" id="area-href" /> <area shape="default" id="area-nohref" /> </map> </div> <div class="cool-footer" id="foobar-div" foobar="ab bc cde"> <span id="foobar-span">foo ter</span> </div> </body></html> ''' class TranslatorMixinTest(unittest.TestCase): tr_cls = HTMLTranslator def setUp(self): self.tr = self.tr_cls() self.c2x = self.tr.css_to_xpath def 
    def test_attr_function(self):
        cases = [
            ('::attr(name)', u'descendant-or-self::*/@name'),
            ('a::attr(href)', u'descendant-or-self::a/@href'),
            ('a ::attr(img)', u'descendant-or-self::a/descendant-or-self::*/@img'),
            ('a > ::attr(class)', u'descendant-or-self::a/*/@class'),
        ]
        for css, xpath in cases:
            self.assertEqual(self.c2x(css), xpath, css)

    def test_attr_function_exception(self):
        cases = [
            ('::attr(12)', ExpressionError),
            ('::attr(34test)', ExpressionError),
            ('::attr(@href)', SelectorSyntaxError),
        ]
        for css, exc in cases:
            self.assertRaises(exc, self.c2x, css)

    def test_text_pseudo_element(self):
        cases = [
            ('::text', u'descendant-or-self::text()'),
            ('p::text', u'descendant-or-self::p/text()'),
            ('p ::text', u'descendant-or-self::p/descendant-or-self::text()'),
            ('#id::text', u"descendant-or-self::*[@id = 'id']/text()"),
            ('p#id::text', u"descendant-or-self::p[@id = 'id']/text()"),
            ('p#id ::text', u"descendant-or-self::p[@id = 'id']/descendant-or-self::text()"),
            ('p#id > ::text', u"descendant-or-self::p[@id = 'id']/*/text()"),
            ('p#id ~ ::text', u"descendant-or-self::p[@id = 'id']/following-sibling::*/text()"),
            ('a[href]::text', u'descendant-or-self::a[@href]/text()'),
            ('a[href] ::text', u'descendant-or-self::a[@href]/descendant-or-self::text()'),
            ('p::text, a::text', u"descendant-or-self::p/text() | descendant-or-self::a/text()"),
        ]
        for css, xpath in cases:
            self.assertEqual(self.c2x(css), xpath, css)

    def test_pseudo_function_exception(self):
        cases = [
            ('::attribute(12)', ExpressionError),
            ('::text()', ExpressionError),
            ('::attr(@href)', SelectorSyntaxError),
        ]
        for css, exc in cases:
            self.assertRaises(exc, self.c2x, css)

    def test_unknown_pseudo_element(self):
        cases = [
            ('::text-node', ExpressionError),
        ]
        for css, exc in cases:
            self.assertRaises(exc, self.c2x, css)

    def test_unknown_pseudo_class(self):
        cases = [
            (':text', ExpressionError),
            (':attribute(name)', ExpressionError),
        ]
        for css, exc in cases:
            self.assertRaises(exc, self.c2x, css)


class UtilCss2XPathTest(unittest.TestCase):
    def test_css2xpath(self):
        from parsel import css2xpath
        expected_xpath = (u"descendant-or-self::*[@class and contains("
                          "concat(' ', normalize-space(@class), ' '), ' some-class ')]")
        self.assertEqual(css2xpath('.some-class'), expected_xpath)


class CSSSelectorTest(unittest.TestCase):

    sscls = Selector

    def setUp(self):
        self.sel = self.sscls(text=HTMLBODY)

    def x(self, *a, **kw):
        return [v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()]

    def test_selector_simple(self):
        for x in self.sel.css('input'):
            self.assertTrue(isinstance(x, self.sel.__class__), x)
        self.assertEqual(self.sel.css('input').extract(),
                         [x.extract() for x in self.sel.css('input')])

    def test_text_pseudo_element(self):
        self.assertEqual(self.x('#p-b2'), [u'<b id="p-b2">guy</b>'])
        self.assertEqual(self.x('#p-b2::text'), [u'guy'])
        self.assertEqual(self.x('#p-b2 ::text'), [u'guy'])
        self.assertEqual(self.x('#paragraph::text'), [u'lorem ipsum text'])
        self.assertEqual(self.x('#paragraph ::text'),
                         [u'lorem ipsum text', u'hi', u'there', u'guy'])
        self.assertEqual(self.x('p::text'), [u'lorem ipsum text'])
        self.assertEqual(self.x('p ::text'),
                         [u'lorem ipsum text', u'hi', u'there', u'guy'])

    def test_attribute_function(self):
        self.assertEqual(self.x('#p-b2::attr(id)'), [u'p-b2'])
        self.assertEqual(self.x('.cool-footer::attr(class)'), [u'cool-footer'])
        self.assertEqual(self.x('.cool-footer ::attr(id)'), [u'foobar-div', u'foobar-span'])
        self.assertEqual(self.x('map[name="dummymap"] ::attr(shape)'),
                         [u'circle', u'default'])
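    # Editor's example (not in the original suite; the test name is a
    # hypothetical addition): the pseudo-elements above combine with the
    # get() shortcut just like with extract().
    def test_pseudo_elements_with_get_example(self):
        self.assertEqual(self.sel.css('#foobar-span::text').get(), u'foo ter')
        self.assertEqual(self.sel.css('#name-anchor::attr(name)').get(), u'foo')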
    def test_nested_selector(self):
        self.assertEqual(self.sel.css('p').css('b::text').extract(),
                         [u'hi', u'guy'])
        self.assertEqual(self.sel.css('div').css('area:last-child').extract(),
                         [u'<area shape="default" id="area-nohref">'])


parsel-1.5.2/tests/test_utils.py

from parsel.utils import shorten

from pytest import mark, raises
import six


@mark.parametrize(
    'width,expected',
    (
        (-1, ValueError),
        (0, u''),
        (1, u'.'),
        (2, u'..'),
        (3, u'...'),
        (4, u'f...'),
        (5, u'fo...'),
        (6, u'foobar'),
        (7, u'foobar'),
    )
)
def test_shorten(width, expected):
    if isinstance(expected, six.string_types):
        assert shorten(u'foobar', width) == expected
    else:
        with raises(expected):
            shorten(u'foobar', width)


parsel-1.5.2/tests/test_xpathfuncs.py

# coding: utf-8
from parsel import Selector
from parsel.xpathfuncs import set_xpathfunc

import unittest


class XPathFuncsTestCase(unittest.TestCase):
    def test_has_class_simple(self):
        body = u"""
        <p class="foo bar-baz">First</p>
        <p class="foo">Second</p>
        <p class="bar">Third</p>
        <p>Fourth</p>
        """
        sel = Selector(text=body)
        self.assertEqual(
            [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')],
            [u'First', u'Second'])
        self.assertEqual(
            [x.extract() for x in sel.xpath('//p[has-class("bar")]/text()')],
            [u'Third'])
        self.assertEqual(
            [x.extract() for x in sel.xpath('//p[has-class("foo","bar")]/text()')],
            [])
        self.assertEqual(
            [x.extract() for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')],
            [u'First'])

    def test_has_class_error_no_args(self):
        body = u"""
        <p CLASS="foo">First</p>
        """
        sel = Selector(text=body)
        self.assertRaisesRegexp(
            ValueError, 'has-class must have at least 1 argument',
            sel.xpath, 'has-class()')
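    # Editor's example (a minimal sketch, not in the original suite; the test
    # name is a hypothetical addition): with several arguments, has-class()
    # only matches elements that carry *all* of the given classes.
    def test_has_class_requires_all_classes_example(self):
        body = u'<p class="foo bar">First</p>'
        sel = Selector(text=body)
        self.assertEqual(
            [x.extract() for x in sel.xpath('//p[has-class("foo", "bar")]/text()')],
            [u'First'])
        self.assertEqual(
            [x.extract() for x in sel.xpath('//p[has-class("foo", "baz")]/text()')],
            [])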
    def test_has_class_error_invalid_arg_type(self):
        body = u"""
        <p CLASS="foo">First</p>
        """
        sel = Selector(text=body)
        self.assertRaisesRegexp(
            ValueError, 'has-class arguments must be strings',
            sel.xpath, 'has-class(.)')

    def test_has_class_error_invalid_unicode(self):
        body = u"""
        <p CLASS="foo">First</p>
        """
        sel = Selector(text=body)
        self.assertRaisesRegexp(
            ValueError, 'All strings must be XML compatible',
            sel.xpath, u'has-class("héllö")'.encode('utf-8'))

    def test_has_class_unicode(self):
        body = u"""
        <p CLASS="fóó">First</p>
        """
        sel = Selector(text=body)
        self.assertEqual(
            [x.extract() for x in sel.xpath(u'//p[has-class("fóó")]/text()')],
            [u'First'])

    def test_has_class_uppercase(self):
        body = u"""
        <p CLASS="foo">First</p>
        """
        sel = Selector(text=body)
        self.assertEqual(
            [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')],
            [u'First'])

    def test_has_class_newline(self):
        body = u"""
        <p CLASS="foo
        bar">First</p>
        """
        sel = Selector(text=body)
        self.assertEqual(
            [x.extract() for x in sel.xpath(u'//p[has-class("foo")]/text()')],
            [u'First'])

    def test_has_class_tab(self):
        body = u"""
        <p CLASS="foo\tbar">First</p>
        """
        sel = Selector(text=body)
        self.assertEqual(
            [x.extract() for x in sel.xpath(u'//p[has-class("foo")]/text()')],
            [u'First'])

    def test_set_xpathfunc(self):

        def myfunc(ctx):
            myfunc.call_count += 1

        myfunc.call_count = 0

        body = u"""
        <p CLASS="foo">First</p>
        """
        sel = Selector(text=body)
        self.assertRaisesRegexp(
            ValueError, 'Unregistered function in myfunc',
            sel.xpath, 'myfunc()')

        set_xpathfunc('myfunc', myfunc)
        sel.xpath('myfunc()')
        self.assertEqual(myfunc.call_count, 1)

        set_xpathfunc('myfunc', None)
        self.assertRaisesRegexp(
            ValueError, 'Unregistered function in myfunc',
            sel.xpath, 'myfunc()')


parsel-1.5.2/tox.ini

[tox]
envlist = py27, py34, py35, py36, py37, pypy, pypy3

[testenv]
deps = -r{toxinidir}/tests/requirements.txt
commands = py.test --cov=parsel --cov-report= {posargs:parsel tests}