pax_global_header00006660000000000000000000000064146273317240014523gustar00rootroot0000000000000052 comment=e2bd0069c7790507edbc7e9b5f1985e116e2b476 itemloaders-1.3.1/000077500000000000000000000000001462733172400140355ustar00rootroot00000000000000itemloaders-1.3.1/.bumpversion.cfg000066400000000000000000000001341462733172400171430ustar00rootroot00000000000000[bumpversion] current_version = 1.3.1 commit = True tag = True [bumpversion:file:setup.py] itemloaders-1.3.1/.git-blame-ignore-revs000066400000000000000000000000761462733172400201400ustar00rootroot00000000000000# Apply black format 627f3bd9ea5210f40dbd5697eff9351bb5af019c itemloaders-1.3.1/.github/000077500000000000000000000000001462733172400153755ustar00rootroot00000000000000itemloaders-1.3.1/.github/workflows/000077500000000000000000000000001462733172400174325ustar00rootroot00000000000000itemloaders-1.3.1/.github/workflows/main.yml000066400000000000000000000034041462733172400211020ustar00rootroot00000000000000name: CI on: - pull_request - push jobs: pre-commit: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.12" - uses: pre-commit/action@v3.0.1 tests: runs-on: ubuntu-latest strategy: fail-fast: false matrix: include: - python-version: '3.12' # Keep in sync with .readthedocs.yml env: TOXENV: docs - python-version: '3.12' env: TOXENV: twinecheck - python-version: 3.8 env: TOXENV: py - python-version: 3.9 env: TOXENV: py - python-version: pypy-3.9 env: TOXENV: py - python-version: pypy-3.10 env: TOXENV: py - python-version: 3.9 env: TOXENV: extra-deps - python-version: '3.10' env: TOXENV: py - python-version: '3.11' env: TOXENV: py - python-version: '3.12' env: TOXENV: py steps: - uses: actions/checkout@v4 - name: Install system libraries if: contains(matrix.python-version, 'pypy') run: | sudo apt-get update sudo apt-get install libxml2-dev libxslt-dev - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: 
python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install --upgrade tox codecov - name: Run tests env: ${{ matrix.env }} run: tox - name: Publish coverage data uses: codecov/codecov-action@v1 itemloaders-1.3.1/.github/workflows/publish.yml000066400000000000000000000010561462733172400216250ustar00rootroot00000000000000name: Publish on PyPI on: release: types: [created] jobs: publish: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: python-version: 3.12 - name: Install dependencies run: | python -m pip install --upgrade pip pip install --upgrade build twine - name: Build run: | python -m build - name: Upload uses: pypa/gh-action-pypi-publish@v1.8.14 with: password: ${{ secrets.PYPI_TOKEN }} itemloaders-1.3.1/.gitignore000066400000000000000000000002671462733172400160320ustar00rootroot00000000000000/.vagrant /scrapy.iml *.pyc _trial_temp* dropin.cache docs/build *egg-info .tox venv .venv build dist .idea htmlcov/ .coverage .pytest_cache/ .coverage.* .cache/ # Windows Thumbs.db itemloaders-1.3.1/.pre-commit-config.yaml000066400000000000000000000011311462733172400203120ustar00rootroot00000000000000default_language_version: python: python3.12 repos: - hooks: - id: black language_version: python3 repo: https://github.com/ambv/black rev: 24.4.0 - hooks: - id: isort language_version: python3 repo: https://github.com/PyCQA/isort rev: 5.13.2 - hooks: - id: flake8 language_version: python3 additional_dependencies: - flake8-bugbear - flake8-comprehensions - flake8-debugger - flake8-docstrings - flake8-string-format repo: https://github.com/pycqa/flake8 rev: 7.0.0 itemloaders-1.3.1/.readthedocs.yml000066400000000000000000000003611462733172400171230ustar00rootroot00000000000000version: 2 formats: all sphinx: configuration: docs/conf.py build: os: ubuntu-22.04 tools: python: "3.12" # Keep in sync with .github/workflows/main.yml python: install: - 
requirements: docs/requirements.txt - path: . itemloaders-1.3.1/LICENSE000066400000000000000000000027551462733172400150530ustar00rootroot00000000000000Copyright (c) Scrapy developers. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of Scrapy nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. itemloaders-1.3.1/MANIFEST.in000066400000000000000000000000201462733172400155630ustar00rootroot00000000000000include LICENSE itemloaders-1.3.1/README.rst000066400000000000000000000051521462733172400155270ustar00rootroot00000000000000=========== itemloaders =========== .. 
image:: https://img.shields.io/pypi/v/itemloaders.svg :target: https://pypi.python.org/pypi/itemloaders :alt: PyPI Version .. image:: https://img.shields.io/pypi/pyversions/itemloaders.svg :target: https://pypi.python.org/pypi/itemloaders :alt: Supported Python Versions .. image:: https://github.com/scrapy/itemloaders/workflows/CI/badge.svg?branch=master :target: https://github.com/scrapy/itemloaders/actions?workflow=CI :alt: CI Status .. image:: https://codecov.io/github/scrapy/itemloaders/coverage.svg?branch=master :target: https://codecov.io/gh/scrapy/itemloaders :alt: Coverage report .. image:: https://readthedocs.org/projects/itemloaders/badge/?version=latest :target: https://itemloaders.readthedocs.io/en/latest/?badge=latest :alt: Documentation Status ``itemloaders`` is a library that helps you collect data from HTML and XML sources. It comes in handy to extract data from web pages, as it supports data extraction using CSS and XPath Selectors. It's specially useful when you need to standardize the data from many sources. For example, it allows you to have all your casting and parsing rules in a single place. Here is an example to get you started:: from itemloaders import ItemLoader from parsel import Selector html_data = ''' Some random product page
Some random product page

$ 100.12

''' loader = ItemLoader(selector=Selector(html_data)) loader.add_xpath('name', '//div[@class="product_name"]/text()') loader.add_xpath('name', '//div[@class="product_title"]/text()') loader.add_css('price', '#price::text') loader.add_value('last_updated', 'today') # you can also use literal values item = loader.load_item() item # {'name': ['Some random product page'], 'price': ['$ 100.12'], 'last_updated': ['today']} For more information, check out the `documentation `_. Contributing ============ All contributions are welcome! * If you want to review some code, check open `Pull Requests here `_ * If you want to submit a code change * File an `issue here `_, if there isn't one yet * Fork this repository * Create a branch to work on your changes * Run `pre-commit install` to install pre-commit hooks * Push your local branch and submit a Pull Request itemloaders-1.3.1/codecov.yml000066400000000000000000000001201462733172400161730ustar00rootroot00000000000000comment: layout: "header, diff, tree" coverage: status: project: false itemloaders-1.3.1/docs/000077500000000000000000000000001462733172400147655ustar00rootroot00000000000000itemloaders-1.3.1/docs/Makefile000066400000000000000000000053641462733172400164350ustar00rootroot00000000000000# # Makefile for Scrapy documentation [based on Python documentation Makefile] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # You can set these variables from the command line. PYTHON = python SPHINXOPTS = PAPER = SOURCES = SHELL = /bin/bash ALLSPHINXOPTS = -b $(BUILDER) -d build/doctrees \ -D latex_elements.papersize=$(PAPER) \ $(SPHINXOPTS) . 
build/$(BUILDER) $(SOURCES) .PHONY: help update build html htmlhelp clean help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " text to make plain text files" @echo " changes to make an overview over all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" @echo " watch build HTML docs, open in browser and watch for changes" build-dirs: mkdir -p build/$(BUILDER) build/doctrees build: build-dirs sphinx-build $(ALLSPHINXOPTS) @echo build-ignore-errors: build-dirs -sphinx-build $(ALLSPHINXOPTS) @echo html: BUILDER = html html: build @echo "Build finished. The HTML pages are in build/html." htmlhelp: BUILDER = htmlhelp htmlhelp: build @echo "Build finished; now you can run HTML Help Workshop with the" \ "build/htmlhelp/pydoc.hhp project file." latex: BUILDER = latex latex: build @echo "Build finished; the LaTeX files are in build/latex." @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ "run these through (pdf)latex." text: BUILDER = text text: build @echo "Build finished; the text files are in build/text." changes: BUILDER = changes changes: build @echo "The overview file is in build/changes." 
linkcheck: BUILDER = linkcheck linkcheck: build @echo "Link check complete; look for any errors in the above output " \ "or in build/$(BUILDER)/output.txt" linkfix: BUILDER = linkcheck linkfix: build-ignore-errors $(PYTHON) utils/linkfix.py @echo "Fixing redirecting links in docs has finished; check all " \ "replacements before committing them" doctest: BUILDER = doctest doctest: build @echo "Testing of doctests in the sources finished, look at the " \ "results in build/doctest/output.txt" pydoc-topics: BUILDER = pydoc-topics pydoc-topics: build @echo "Building finished; now copy build/pydoc-topics/pydoc_topics.py " \ "into the Lib/ directory" coverage: BUILDER = coverage coverage: build htmlview: html $(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \ os.path.realpath('build/html/index.html'))" clean: -rm -rf build/* watch: htmlview watchmedo shell-command -p '*.rst' -c 'make html' -R -D itemloaders-1.3.1/docs/README.rst000066400000000000000000000025311462733172400164550ustar00rootroot00000000000000:orphan: =========================================== itemloaders documentation quick start guide =========================================== This file provides a quick guide on how to compile the itemloaders documentation. Setup the environment --------------------- To compile the documentation you need Sphinx Python library. To install it and all its dependencies run the following command from this dir :: pip install -r requirements.txt Compile the documentation ------------------------- To compile the documentation (to classic HTML output) run the following command from this dir:: make html Documentation will be generated (in HTML format) inside the ``build/html`` dir. View the documentation ---------------------- To view the documentation run the following command:: make htmlview This command will fire up your default browser and open the main page of your (previously generated) HTML documentation. 
Start over ---------- To cleanup all generated documentation files and start from scratch run:: make clean Keep in mind that this command won't touch any documentation source files. Recreating documentation on the fly ----------------------------------- There is a way to recreate the doc automatically when you make changes, you need to install watchdog (``pip install watchdog``) and then use:: make watch itemloaders-1.3.1/docs/_ext/000077500000000000000000000000001462733172400157245ustar00rootroot00000000000000itemloaders-1.3.1/docs/_ext/__init__.py000066400000000000000000000000001462733172400200230ustar00rootroot00000000000000itemloaders-1.3.1/docs/_ext/github.py000066400000000000000000000014031462733172400175560ustar00rootroot00000000000000from typing import Optional from docutils import nodes from docutils.parsers.rst.roles import set_classes def setup(app): app.add_role("gh", github_role) def github_role( name, rawtext, text, lineno, inliner, options: Optional[dict] = None, content: Optional[list] = None, ): options = options or {} content = content or [] if text.isdigit(): display_text = f"#{text}" url = f"https://github.com/scrapy/itemloaders/issues/{text}" else: short_commit = text[:7] display_text = short_commit url = f"https://github.com/scrapy/itemloaders/commit/{short_commit}" set_classes(options) node = nodes.reference(rawtext, display_text, refuri=url, **options) return [node], [] itemloaders-1.3.1/docs/api-reference.rst000066400000000000000000000001511462733172400202210ustar00rootroot00000000000000.. _api-reference: API Reference ================== .. autoclass:: itemloaders.ItemLoader :members:itemloaders-1.3.1/docs/built-in-processors.rst000066400000000000000000000010501462733172400214360ustar00rootroot00000000000000.. 
_built-in-processors: Available built-in processors ============================= Even though you can use any callable function as input and output processors, ``itemloaders`` provides some commonly used processors, which are described below. Some of them, like the :class:`~itemloaders.processors.MapCompose` (which is typically used as input processor) compose the output of several functions executed in order, to produce the final parsed value. Here is a list of all built-in processors: .. automodule:: itemloaders.processors :members:itemloaders-1.3.1/docs/conf.py000066400000000000000000000161251462733172400162710ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Scrapy documentation build configuration file, created by # sphinx-quickstart on Mon Nov 24 12:02:52 2008. # # This file is execfile()d with the current directory set to its containing dir. # # The contents of this file are pickled, so don't put values in the namespace # that aren't pickleable (module imports are okay, they're removed automatically). # # All configuration values have a default; values that are commented out # serve to show the default. import sys from os import path import sphinx_rtd_theme # If your extensions are in another directory, add it here. If the directory # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. sys.path.append(path.dirname(__file__)) sys.path.insert(0, path.dirname(path.dirname(__file__))) # General configuration # --------------------- # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ "_ext.github", "sphinx.ext.autodoc", "sphinx.ext.coverage", "sphinx.ext.intersphinx", "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # The suffix of source filenames. source_suffix = ".rst" # The encoding of source files. 
# source_encoding = 'utf-8' # The master toctree document. master_doc = "index" # General information about the project. project = "itemloaders" copyright = "Zyte Group Ltd" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = "" release = "" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. language = "en" # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: # today = '' # Else, today_fmt is used as the format for a strftime call. # today_fmt = '%B %d, %Y' # List of documents that shouldn't be included in the build. # unused_docs = [] exclude_patterns = ["build"] # List of directories, relative to source directory, that shouldn't be searched # for source files. exclude_trees = [".build"] # The reST default role (used for this markup: `text`) to use for all documents. # default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. # add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). # add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. # show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # Options for HTML output # ----------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. 
# Add path to the RTD explicitly to robustify builds (otherwise might # fail in a clean Debian build env) html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # The style sheet to use for HTML and HTML Help pages. A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths # given in html_static_path. # html_style = 'scrapydoc.css' # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". # html_title = None # A shorter title for the navigation bar. Default is the same as html_title. # html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. # html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. # html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. html_last_updated_fmt = "%b %d, %Y" # Custom sidebar templates, maps document names to template names. # html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. # html_additional_pages = {} # If false, no module index is generated. # html_use_modindex = True # If false, no index is generated. # html_use_index = True # If true, the index is split into individual pages for each letter. # html_split_index = False # If true, the reST sources are included in the HTML build as _sources/. html_copy_source = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. 
The value of this option must be the # base URL from which the finished HTML is served. # html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). # html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = "itemloadersdoc" # Options for LaTeX output # ------------------------ # The paper size ('letter' or 'a4'). # latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). # latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, document class [howto/manual]). latex_documents = [ ("index", "itemloaders.tex", "itemloaders Documentation", "Zyte", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. # latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. # latex_use_parts = False # Additional stuff for the LaTeX preamble. # latex_preamble = '' # Documents to append as an appendix to all manuals. # latex_appendices = [] # If false, no module index is generated. # latex_use_modindex = True # autodocs def setup(app): app.connect("autodoc-skip-member", maybe_skip_member) def maybe_skip_member(app, what, name, obj, skip, options): if not skip: # autodocs was generating a text "alias of" for the following members # https://github.com/sphinx-doc/sphinx/issues/4422 return name in {"default_item_class", "default_selector_class"} return skip nitpicky = True intersphinx_mapping = { "parsel": ("https://parsel.readthedocs.io/en/stable/", None), "python": ("https://docs.python.org/3", None), "scrapy": ("https://docs.scrapy.org/en/latest/", None), "w3lib": ("https://w3lib.readthedocs.io/en/latest", None), } itemloaders-1.3.1/docs/declaring-loaders.rst000066400000000000000000000032411462733172400210760ustar00rootroot00000000000000.. currentmodule:: itemloaders .. 
_declaring-loaders: Declaring Item Loaders ====================== Item Loaders are declared by using a class definition syntax. Here is an example:: from itemloaders import ItemLoader from itemloaders.processors import TakeFirst, MapCompose, Join class ProductLoader(ItemLoader): default_output_processor = TakeFirst() name_in = MapCompose(str.title) name_out = Join() # using a built-in processor price_in = MapCompose(str.strip) # using a function def price_out(self, values): return float(values[0]) loader = ProductLoader() loader.add_value('name', 'plasma TV') loader.add_value('price', '999.98') loader.load_item() # {'name': 'Plasma Tv', 'price': 999.98} As you can see, input processors are declared using the ``_in`` suffix while output processors are declared using the ``_out`` suffix. And you can also declare a default input/output processors using the :attr:`ItemLoader.default_input_processor` and :attr:`ItemLoader.default_output_processor` attributes. The precedence order, for both input and output processors, is as follows: 1. Item Loader field-specific attributes: ``field_in`` and ``field_out`` (most precedence) 2. Field metadata (``input_processor`` and ``output_processor`` keys). Check out `itemadapter field metadata `_ for more information. .. versionadded:: 1.0.1 3. Item Loader defaults: :meth:`ItemLoader.default_input_processor` and :meth:`ItemLoader.default_output_processor` (least precedence) See also: :ref:`extending-loaders`. itemloaders-1.3.1/docs/extending-loaders.rst000066400000000000000000000035121462733172400211340ustar00rootroot00000000000000.. _extending-loaders: Reusing and extending Item Loaders ================================== Item Loaders are designed to ease the maintenance burden of parsing rules, without losing flexibility and, at the same time, providing a convenient mechanism for extending and overriding them. For this reason Item Loaders support traditional Python class inheritance for dealing with differences in data schemas. 
Suppose, for example, that you get some particular product names enclosed in three dashes (e.g. ``---Plasma TV---``) and you don't want to end up with those dashes in the final product names. Here's how you can remove those dashes by reusing and extending the default Product Item Loader (``ProductLoader``):: from itemloaders.processors import MapCompose from myproject.loaders import ProductLoader def strip_dashes(x): return x.strip('-') class SiteSpecificLoader(ProductLoader): name_in = MapCompose(strip_dashes, ProductLoader.name_in) Another case where extending Item Loaders can be very helpful is when you have multiple source formats, for example XML and HTML. In the XML version you may want to remove ``CDATA`` occurrences. Here's an example of how to do it:: from itemloaders.processors import MapCompose from myproject.ItemLoaders import ProductLoader from myproject.utils.xml import remove_cdata class XmlProductLoader(ProductLoader): name_in = MapCompose(remove_cdata, ProductLoader.name_in) And that's how you typically extend input/output processors. There are many other possible ways to extend, inherit and override your Item Loaders, and different Item Loaders hierarchies may fit better for different projects. ``itemloaders`` only provides the mechanism; it doesn't impose any specific organization of your Loaders collection - that's up to you and your project's needs.itemloaders-1.3.1/docs/index.rst000066400000000000000000000072721462733172400166360ustar00rootroot00000000000000.. currentmodule:: itemloaders .. _topics-index: ============ itemloaders ============ ``itemloaders`` provide a convenient mechanism for populating data records. Its design provides a flexible, efficient and easy mechanism for extending and overriding different field parsing rules, either by raw data, or by source format (HTML, XML, etc) without becoming a nightmare to maintain. To install ``itemloaders``, run:: pip install itemloaders .. 
note:: Under the hood, ``itemloaders`` uses `itemadapter `_ as a common interface. This means you can use any of the types supported by ``itemadapter`` here. .. warning:: ``dataclasses`` and ``attrs`` support is still experimental. Please, refer to :attr:`~ItemLoader.default_item_class` in the :ref:`api-reference` for more information. Getting Started with ``itemloaders`` ==================================== To use an Item Loader, you must first instantiate it. You can either instantiate it with a dict-like object (`item`) or without one, in which case an `item` is automatically instantiated in the Item Loader ``__init__`` method using the `item` class specified in the :attr:`ItemLoader.default_item_class` attribute. Then, you start collecting values into the Item Loader, typically using CSS or XPath Selectors. You can add more than one value to the same item field; the Item Loader will know how to "join" those values later using a proper processing function. .. note:: Collected data is stored internally as lists, allowing to add several values to the same field. If an ``item`` argument is passed when creating a loader, each of the item's values will be stored as-is if it's already an iterable, or wrapped with a list if it's a single value. Here is a typical Item Loader usage:: from itemloaders import ItemLoader from parsel import Selector html_data = ''' Some random product page
Some random product page

$ 100.12

''' l = ItemLoader(selector=Selector(html_data)) l.add_xpath('name', '//div[@class="product_name"]/text()') l.add_xpath('name', '//div[@class="product_title"]/text()') l.add_css('price', '#price::text') l.add_value('last_updated', 'today') # you can also use literal values item = l.load_item() item # {'name': ['Some random product page'], 'price': ['$ 100.12'], 'last_updated': ['today']} By quickly looking at that code, we can see the ``name`` field is being extracted from two different XPath locations in the page: 1. ``//div[@class="product_name"]`` 2. ``//div[@class="product_title"]`` In other words, data is being collected by extracting it from two XPath locations, using the :meth:`~ItemLoader.add_xpath` method. This is the data that will be assigned to the ``name`` field later. Afterwards, similar calls are used for ``price`` field using a CSS selector with the :meth:`~ItemLoader.add_css` method, and finally the ``last_update`` field is populated directly with a literal value (``today``) using a different method: :meth:`~ItemLoader.add_value`. Finally, when all data is collected, the :meth:`ItemLoader.load_item` method is called which actually returns the item populated with the data previously extracted and collected with the :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`, and :meth:`~ItemLoader.add_value` calls. Contents -------- .. toctree:: declaring-loaders processors loaders-context nested-loaders extending-loaders built-in-processors api-reference release-notes itemloaders-1.3.1/docs/loaders-context.rst000066400000000000000000000033371462733172400206400ustar00rootroot00000000000000.. currentmodule:: itemloaders .. _loaders-context: Item Loader Context =================== The Item Loader Context is a mechanism that allows to change the input/ouput processors behavior. It's just a ``dict`` of arbitrary key/values which is shared among all processors. 
By default, the context contains the ``selector`` and any other `keyword arguments` sent to the Loaders's ``__init__``. The context can be passed when declaring, instantiating or using Item Loader. For example, suppose you have a function ``parse_length`` which receives a text value and extracts a length from it:: def parse_length(text, loader_context): unit = loader_context.get('unit', 'm') # ... length parsing code goes here ... return parsed_length By accepting a ``loader_context`` argument the function is explicitly telling the Item Loader that it's able to receive an Item Loader context, so the Item Loader passes the currently active context when calling it, and the processor function (``parse_length`` in this case) can thus use them. There are several ways to modify Item Loader context values: 1. By modifying the currently active Item Loader context (:attr:`~ItemLoader.context` attribute):: loader = ItemLoader(product) loader.context['unit'] = 'cm' 2. On Item Loader instantiation (the keyword arguments of Item Loader ``__init__`` method are stored in the Item Loader context):: loader = ItemLoader(product, unit='cm') 3. On Item Loader declaration, for those input/output processors that support instantiating them with an Item Loader context. :class:`~processors.MapCompose` is one of them:: class ProductLoader(ItemLoader): length_out = MapCompose(parse_length, unit='cm') itemloaders-1.3.1/docs/nested-loaders.rst000066400000000000000000000031021462733172400204240ustar00rootroot00000000000000.. _nested-loaders: Nested Loaders ============== When parsing related values from a subsection of a document, it can be useful to create nested loaders. Imagine you're extracting details from a footer of a page that looks something like: Example:: Without nested loaders, you need to specify the full xpath (or css) for each value that you wish to extract. 
Example:: loader = ItemLoader() # load stuff not in the footer loader.add_xpath('social', '//footer/a[@class = "social"]/@href') loader.add_xpath('email', '//footer/a[@class = "email"]/@href') loader.load_item() Instead, you can create a nested loader with the footer selector and add values relative to the footer. The functionality is the same but you avoid repeating the footer selector. Example:: loader = ItemLoader() # load stuff not in the footer footer_loader = loader.nested_xpath('//footer') footer_loader.add_xpath('social', 'a[@class = "social"]/@href') footer_loader.add_xpath('email', 'a[@class = "email"]/@href') # no need to call footer_loader.load_item() loader.load_item() You can nest loaders arbitrarily and they work with either xpath or css selectors. As a general guideline, use nested loaders when they make your code simpler but do not go overboard with nesting or your parser can become difficult to read. itemloaders-1.3.1/docs/processors.rst000066400000000000000000000067511462733172400177320ustar00rootroot00000000000000.. currentmodule:: itemloaders .. _processors: Input and Output processors =========================== An Item Loader contains one input processor and one output processor for each (item) field. The input processor processes the extracted data as soon as it's received (through the :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css` or :meth:`~ItemLoader.add_value` methods) and the result of the input processor is collected and kept inside the ItemLoader. After collecting all data, the :meth:`ItemLoader.load_item` method is called to populate and get the populated item object. That's when the output processor is called with the data previously collected (and processed using the input processor). The result of the output processor is the final value that gets assigned to the item. 
Let's see an example to illustrate how the input and output processors are called for a particular field (the same applies for any other field):: l = ItemLoader(selector=some_selector) l.add_xpath('name', xpath1) # (1) l.add_xpath('name', xpath2) # (2) l.add_css('name', css) # (3) l.add_value('name', 'test') # (4) return l.load_item() # (5) So what happens is: 1. Data from ``xpath1`` is extracted, and passed through the *input processor* of the ``name`` field. The result of the input processor is collected and kept in the Item Loader (but not yet assigned to the item). 2. Data from ``xpath2`` is extracted, and passed through the same *input processor* used in (1). The result of the input processor is appended to the data collected in (1) (if any). 3. This case is similar to the previous ones, except that the data is extracted from the ``css`` CSS selector, and passed through the same *input processor* used in (1) and (2). The result of the input processor is appended to the data collected in (1) and (2) (if any). 4. This case is also similar to the previous ones, except that the value to be collected is assigned directly, instead of being extracted from a XPath expression or a CSS selector. However, the value is still passed through the input processors. In this case, since the value is not iterable it is converted to an iterable of a single element before passing it to the input processor, because input processor always receive iterables. 5. The data collected in steps (1), (2), (3) and (4) is passed through the *output processor* of the ``name`` field. The result of the output processor is the value assigned to the ``name`` field in the item. It's worth noticing that processors are just callable objects, which are called with the data to be parsed, and return a parsed value. So you can use any function as input or output processor. The only requirement is that they must accept one (and only one) positional argument, which will be an iterable. .. 
note:: Both input and output processors must receive an iterable as their first argument. The output of those functions can be anything. The result of input processors will be appended to an internal list (in the Loader) containing the collected values (for that field). The result of the output processors is the value that will be finally assigned to the item. The other thing you need to keep in mind is that the values returned by input processors are collected internally (in lists) and then passed to output processors to populate the fields. Last, but not least, ``itemloaders`` comes with some :ref:`commonly used processors ` built-in for convenience. itemloaders-1.3.1/docs/release-notes.rst000066400000000000000000000077371462733172400203030ustar00rootroot00000000000000.. currentmodule:: itemloaders .. _release-notes: Release notes ============= .. _release-1.3.1: itemloaders 1.3.1 (2024-06-03) ------------------------------ - Fixed an error when using nested loaders with empty matches that was introduced in 1.3.0 (:gh:`88`) .. _release-1.3.0: itemloaders 1.3.0 (2024-05-30) ------------------------------ - Added support for method chaining to the ``add_*`` and ``replace_*`` methods, so you can now write code such as ``loader.add_xpath("name", "//body/text()").add_value("url", "http://example.com")`` (:gh:`81`) - Added type hints and ``py.typed`` (:gh:`80`, :gh:`83`) - Made the docs builds reproducible (:gh:`82`) .. _release-1.2.0: itemloaders 1.2.0 (2024-04-18) ------------------------------ - Added official support for Python 3.12 and PyPy 3.10 (:gh:`75`) - Removed official support for Python 3.7 (:gh:`72`) - Improved performance of ``itemloaders.utils.arg_to_iter`` (:gh:`51`) - Fixed test expectations on recent Python versions (:gh:`77`) - Improved CI (:gh:`78`) .. 
_release-1.1.0: itemloaders 1.1.0 (2023-04-21) ------------------------------ - Added JMESPath support (:meth:`ItemLoader.add_jmes` etc.), requiring Parsel 1.8.1+ (:gh:`68`) - Added official support for Python 3.11 (:gh:`59`) - Removed official support for Python 3.6 (:gh:`61`) - Internal code cleanup (:gh:`65`, :gh:`66`) - Added ``pre-commit`` support and applied changes from ``black`` and ``flake8`` (:gh:`70`). - Improved CI (:gh:`60`) .. _release-1.0.6: itemloaders 1.0.6 (2022-08-29) ------------------------------ - Fixes a regression introduced in 1.0.5 that would cause the ``re`` parameter of :meth:`ItemLoader.add_xpath` and similar methods to be passed to lxml, which would trigger an exception when the value of ``re`` was a compiled pattern and not a string (:gh:`56`) .. _release-1.0.5: itemloaders 1.0.5 (2022-08-25) ------------------------------ - Allow additional args to be passed when calling :meth:`ItemLoader.add_xpath` (:gh:`48`) - Fixed missing space in an exception message (:gh:`47`) - Updated company name in author and copyright sections (:gh:`42`) - Added official support for Python 3.9 and improved PyPy compatibility (:gh:`44`) - Added official support for Python 3.10 (:gh:`53`) .. _release-1.0.4: itemloaders 1.0.4 (2020-11-12) ------------------------------ - When adding a :class:`scrapy.item.scrapy.Item` object as a value into an :class:`ItemLoader` object, that item is now added *as is*, instead of becoming a :class:`list` of keys from its :attr:`scrapy.item.scrapy.Item.fields` (:gh:`28`, :gh:`29`) - Increased test coverage (:gh:`27`) .. _release-1.0.3: itemloaders 1.0.3 (2020-09-09) ------------------------------ - Calls to :meth:`ItemLoader.get_output_value` no longer affect the output of :meth:`ItemLoader.load_item` (:gh:`21`, :gh:`22`) - Fixed some documentation links (:gh:`19`, :gh:`23`) - Fixed some test warnings (:gh:`24`) .. 
_release-1.0.2: itemloaders 1.0.2 (2020-08-05) ------------------------------ - Included the license file in the source releases (:gh:`13`) - Cleaned up some remnants of Python 2 (:gh:`16`, :gh:`17`) .. _release-1.0.1: itemloaders 1.0.1 (2020-07-02) ------------------------------ - Extended item type support to all item types supported by itemadapter_ (:gh:`13`) - :ref:`Input and output processors ` defined in item field metadata are now taken into account (:gh:`13`) - Lowered some minimum dependency versions (:gh:`10`): - :doc:`parsel `: 1.5.2 → 1.5.0 - :doc:`w3lib `: 1.21.0 → 1.17.0 - Improved the README file (:gh:`9`) - Improved continuous integration (:gh:`e62d95b`) .. _release-1.0.0: itemloaders 1.0.0 (2020-05-18) ------------------------------ - Initial release, based on a part of the :doc:`Scrapy ` code base. .. _itemadapter: https://github.com/scrapy/itemadapter#itemadapter itemloaders-1.3.1/docs/requirements.txt000066400000000000000000000000421462733172400202450ustar00rootroot00000000000000Sphinx>=3.0 sphinx_rtd_theme>=0.4 itemloaders-1.3.1/itemloaders/000077500000000000000000000000001462733172400163455ustar00rootroot00000000000000itemloaders-1.3.1/itemloaders/__init__.py000066400000000000000000000550101462733172400204570ustar00rootroot00000000000000""" Item Loader See documentation in docs/topics/loaders.rst """ from __future__ import annotations from contextlib import suppress from typing import ( TYPE_CHECKING, Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Pattern, Union, ) from itemadapter import ItemAdapter from parsel import Selector from parsel.utils import extract_regex, flatten from itemloaders.common import wrap_loader_context from itemloaders.processors import Identity from itemloaders.utils import arg_to_iter if TYPE_CHECKING: # typing.Self requires Python 3.11 from typing_extensions import Self def unbound_method(method: Callable[..., Any]) -> Callable[..., Any]: """ Allow to use single-argument functions as input or output 
processors (no need to define an unused first 'self' argument) """ with suppress(AttributeError): if "." not in method.__qualname__: return method.__func__ # type: ignore[attr-defined, no-any-return] return method class ItemLoader: """ Return a new Item Loader for populating the given item. If no item is given, one is instantiated automatically using the class in :attr:`default_item_class`. When instantiated with a :param ``selector`` parameter the :class:`ItemLoader` class provides convenient mechanisms for extracting data from web pages using parsel_ selectors. :param item: The item instance to populate using subsequent calls to :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`, :meth:`~ItemLoader.add_jmes` or :meth:`~ItemLoader.add_value`. :type item: :class:`dict` object :param selector: The selector to extract data from, when using the :meth:`add_xpath` (resp. :meth:`add_css`, :meth:`add_jmes`) or :meth:`replace_xpath` (resp. :meth:`replace_css`, :meth:`replace_jmes`) method. :type selector: :class:`~parsel.selector.Selector` object The item, selector and the remaining keyword arguments are assigned to the Loader context (accessible through the :attr:`context` attribute). .. attribute:: item The item object being parsed by this Item Loader. This is mostly used as a property so when attempting to override this value, you may want to check out :attr:`default_item_class` first. .. attribute:: context The currently active :ref:`Context ` of this Item Loader. Refer to for more information about the Loader Context. .. attribute:: default_item_class An Item class (or factory), used to instantiate items when not given in the ``__init__`` method. .. warning:: Currently, this factory/class needs to be callable/instantiated without any arguments. 
If you are using ``dataclasses``, please consider the following alternative:: from dataclasses import dataclass, field from typing import Optional @dataclass class Product: name: Optional[str] = field(default=None) price: Optional[float] = field(default=None) .. attribute:: default_input_processor The default input processor to use for those fields which don't specify one. .. attribute:: default_output_processor The default output processor to use for those fields which don't specify one. .. attribute:: selector The :class:`~parsel.selector.Selector` object to extract data from. It's the selector given in the ``__init__`` method. This attribute is meant to be read-only. .. _parsel: https://parsel.readthedocs.io/en/latest/ """ default_item_class: type = dict default_input_processor: Callable[..., Any] = Identity() default_output_processor: Callable[..., Any] = Identity() def __init__( self, item: Any = None, selector: Optional[Selector] = None, parent: Optional[ItemLoader] = None, **context: Any, ): self.selector: Optional[Selector] = selector context.update(selector=selector) if item is None: item = self.default_item_class() self._local_item = item context["item"] = item self.context: MutableMapping[str, Any] = context self.parent: Optional[ItemLoader] = parent self._local_values: Dict[str, List[Any]] = {} # values from initial item for field_name, value in ItemAdapter(item).items(): self._values.setdefault(field_name, []) self._values[field_name] += arg_to_iter(value) @property def _values(self) -> Dict[str, List[Any]]: if self.parent is not None: return self.parent._values else: return self._local_values @property def item(self) -> Any: if self.parent is not None: return self.parent.item else: return self._local_item def nested_xpath(self, xpath: str, **context: Any) -> Self: """ Create a nested loader with an xpath selector. The supplied selector is applied relative to selector associated with this :class:`ItemLoader`. 
The nested loader shares the item with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected. """ self._check_selector_method() assert self.selector is not None selector = self.selector.xpath(xpath) context.update(selector=selector) subloader = self.__class__(item=self.item, parent=self, **context) return subloader def nested_css(self, css: str, **context: Any) -> Self: """ Create a nested loader with a css selector. The supplied selector is applied relative to selector associated with this :class:`ItemLoader`. The nested loader shares the item with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected. """ self._check_selector_method() assert self.selector is not None selector = self.selector.css(css) context.update(selector=selector) subloader = self.__class__(item=self.item, parent=self, **context) return subloader def add_value( self, field_name: Optional[str], value: Any, *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Process and then add the given ``value`` for the given field. The value is first passed through :meth:`get_value` by giving the ``processors`` and ``kwargs``, and then passed through the :ref:`field input processor ` and its result appended to the data collected for that field. If the field already contains collected data, the new data is added. The given ``field_name`` can be ``None``, in which case values for multiple fields may be added. And the processed value should be a dict with field_name mapped to values. :returns: The current ItemLoader instance for method chaining. 
:rtype: ItemLoader Examples:: loader.add_value('name', 'Color TV') loader.add_value('colours', ['white', 'blue']) loader.add_value('length', '100') loader.add_value('name', 'name: foo', TakeFirst(), re='name: (.+)') loader.add_value(None, {'name': 'foo', 'sex': 'male'}) """ value = self.get_value(value, *processors, re=re, **kw) if value is None: return if not field_name: for k, v in value.items(): self._add_value(k, v) else: self._add_value(field_name, value) return self def replace_value( self, field_name: Optional[str], value: Any, *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`add_value` but replaces the collected data with the new value instead of adding it. :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader """ value = self.get_value(value, *processors, re=re, **kw) if value is None: return if not field_name: for k, v in value.items(): self._replace_value(k, v) else: self._replace_value(field_name, value) return self def _add_value(self, field_name: str, value: Any) -> None: value = arg_to_iter(value) processed_value = self._process_input_value(field_name, value) if processed_value: self._values.setdefault(field_name, []) self._values[field_name] += arg_to_iter(processed_value) def _replace_value(self, field_name: str, value: Any) -> None: self._values.pop(field_name, None) self._add_value(field_name, value) def get_value( self, value: Any, *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Any: """ Process the given ``value`` by the given ``processors`` and keyword arguments. 
Available keyword arguments: :param re: a regular expression to use for extracting data from the given value using :func:`~parsel.utils.extract_regex` method, applied before processors :type re: str or typing.Pattern[str] Examples: >>> from itemloaders import ItemLoader >>> from itemloaders.processors import TakeFirst >>> loader = ItemLoader() >>> loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)') 'FOO' """ if re: value = arg_to_iter(value) value = flatten(extract_regex(re, x) for x in value) for proc in processors: if value is None: break _proc = proc proc = wrap_loader_context(proc, self.context) try: value = proc(value) except Exception as e: raise ValueError( "Error with processor %s value=%r error='%s: %s'" % (_proc.__class__.__name__, value, type(e).__name__, str(e)) ) from e return value def load_item(self) -> Any: """ Populate the item with the data collected so far, and return it. The data collected is first passed through the :ref:`output processors ` to get the final value to assign to each item field. """ adapter = ItemAdapter(self.item) for field_name in tuple(self._values): value = self.get_output_value(field_name) if value is not None: adapter[field_name] = value return adapter.item def get_output_value(self, field_name: str) -> Any: """ Return the collected values parsed using the output processor, for the given field. This method doesn't populate or modify the item at all. 
""" proc = self.get_output_processor(field_name) proc = wrap_loader_context(proc, self.context) value = self._values.get(field_name, []) try: return proc(value) except Exception as e: raise ValueError( "Error with output processor: field=%r value=%r error='%s: %s'" % (field_name, value, type(e).__name__, str(e)) ) from e def get_collected_values(self, field_name: str) -> List[Any]: """Return the collected values for the given field.""" return self._values.get(field_name, []) def get_input_processor(self, field_name: str) -> Callable[..., Any]: proc = getattr(self, "%s_in" % field_name, None) if not proc: proc = self._get_item_field_attr( field_name, "input_processor", self.default_input_processor ) return unbound_method(proc) def get_output_processor(self, field_name: str) -> Callable[..., Any]: proc = getattr(self, "%s_out" % field_name, None) if not proc: proc = self._get_item_field_attr( field_name, "output_processor", self.default_output_processor ) return unbound_method(proc) def _get_item_field_attr( self, field_name: str, key: Any, default: Any = None ) -> Any: field_meta = ItemAdapter(self.item).get_field_meta(field_name) return field_meta.get(key, default) def _process_input_value(self, field_name: str, value: Any) -> Any: proc = self.get_input_processor(field_name) _proc = proc proc = wrap_loader_context(proc, self.context) try: return proc(value) except Exception as e: raise ValueError( "Error with input processor %s: field=%r value=%r " "error='%s: %s'" % ( _proc.__class__.__name__, field_name, value, type(e).__name__, str(e), ) ) from e def _check_selector_method(self) -> None: if self.selector is None: raise RuntimeError( "To use XPath or CSS selectors, %s " "must be instantiated with a selector" % self.__class__.__name__ ) def add_xpath( self, field_name: Optional[str], xpath: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`ItemLoader.add_value` but 
receives an XPath instead of a value, which is used to extract a list of strings from the selector associated with this :class:`ItemLoader`. See :meth:`get_xpath` for ``kwargs``. :param xpath: the XPath to extract data from :type xpath: str :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader Examples:: # HTML snippet:

Color TV

loader.add_xpath('name', '//p[@class="product-name"]') # HTML snippet:

the price is $1200

loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)') """ values = self._get_xpathvalues(xpath, **kw) return self.add_value(field_name, values, *processors, re=re, **kw) def replace_xpath( self, field_name: Optional[str], xpath: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`add_xpath` but replaces collected data instead of adding it. :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader """ values = self._get_xpathvalues(xpath, **kw) return self.replace_value(field_name, values, *processors, re=re, **kw) def get_xpath( self, xpath: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Any: """ Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a value, which is used to extract a list of unicode strings from the selector associated with this :class:`ItemLoader`. :param xpath: the XPath to extract data from :type xpath: str :param re: a regular expression to use for extracting data from the selected XPath region :type re: str or typing.Pattern[str] Examples:: # HTML snippet:

Color TV

loader.get_xpath('//p[@class="product-name"]') # HTML snippet:

the price is $1200

loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)') """ values = self._get_xpathvalues(xpath, **kw) return self.get_value(values, *processors, re=re, **kw) def _get_xpathvalues( self, xpaths: Union[str, Iterable[str]], **kw: Any ) -> List[Any]: self._check_selector_method() assert self.selector is not None xpaths = arg_to_iter(xpaths) return flatten(self.selector.xpath(xpath, **kw).getall() for xpath in xpaths) def add_css( self, field_name: Optional[str], css: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`ItemLoader.add_value` but receives a CSS selector instead of a value, which is used to extract a list of unicode strings from the selector associated with this :class:`ItemLoader`. See :meth:`get_css` for ``kwargs``. :param css: the CSS selector to extract data from :type css: str :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader Examples:: # HTML snippet:

Color TV

loader.add_css('name', 'p.product-name') # HTML snippet:

the price is $1200

loader.add_css('price', 'p#price', re='the price is (.*)') """ values = self._get_cssvalues(css) return self.add_value(field_name, values, *processors, re=re, **kw) def replace_css( self, field_name: Optional[str], css: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`add_css` but replaces collected data instead of adding it. :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader """ values = self._get_cssvalues(css) return self.replace_value(field_name, values, *processors, re=re, **kw) def get_css( self, css: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Any: """ Similar to :meth:`ItemLoader.get_value` but receives a CSS selector instead of a value, which is used to extract a list of unicode strings from the selector associated with this :class:`ItemLoader`. :param css: the CSS selector to extract data from :type css: str :param re: a regular expression to use for extracting data from the selected CSS region :type re: str or typing.Pattern[str] Examples:: # HTML snippet:

Color TV

loader.get_css('p.product-name') # HTML snippet:

the price is $1200

loader.get_css('p#price', TakeFirst(), re='the price is (.*)') """ values = self._get_cssvalues(css) return self.get_value(values, *processors, re=re, **kw) def _get_cssvalues(self, csss: Union[str, Iterable[str]]) -> List[Any]: self._check_selector_method() assert self.selector is not None csss = arg_to_iter(csss) return flatten(self.selector.css(css).getall() for css in csss) def add_jmes( self, field_name: Optional[str], jmes: str, *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`ItemLoader.add_value` but receives a JMESPath selector instead of a value, which is used to extract a list of unicode strings from the selector associated with this :class:`ItemLoader`. See :meth:`get_jmes` for ``kwargs``. :param jmes: the JMESPath selector to extract data from :type jmes: str :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader Examples:: # HTML snippet: {"name": "Color TV"} loader.add_jmes('name') # HTML snippet: {"price": the price is $1200"} loader.add_jmes('price', TakeFirst(), re='the price is (.*)') """ values = self._get_jmesvalues(jmes) return self.add_value(field_name, values, *processors, re=re, **kw) def replace_jmes( self, field_name: Optional[str], jmes: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`add_jmes` but replaces collected data instead of adding it. :returns: The current ItemLoader instance for method chaining. 
:rtype: ItemLoader """ values = self._get_jmesvalues(jmes) return self.replace_value(field_name, values, *processors, re=re, **kw) def get_jmes( self, jmes: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Any: """ Similar to :meth:`ItemLoader.get_value` but receives a JMESPath selector instead of a value, which is used to extract a list of unicode strings from the selector associated with this :class:`ItemLoader`. :param jmes: the JMESPath selector to extract data from :type jmes: str :param re: a regular expression to use for extracting data from the selected JMESPath :type re: str or typing.Pattern Examples:: # HTML snippet: {"name": "Color TV"} loader.get_jmes('name') # HTML snippet: {"price": the price is $1200"} loader.get_jmes('price', TakeFirst(), re='the price is (.*)') """ values = self._get_jmesvalues(jmes) return self.get_value(values, *processors, re=re, **kw) def _get_jmesvalues(self, jmess: Union[str, Iterable[str]]) -> List[Any]: self._check_selector_method() assert self.selector is not None jmess = arg_to_iter(jmess) if not hasattr(self.selector, "jmespath"): raise AttributeError( "Please install parsel >= 1.8.1 to get jmespath support" ) return flatten(self.selector.jmespath(jmes).getall() for jmes in jmess) itemloaders-1.3.1/itemloaders/common.py000066400000000000000000000011161462733172400202060ustar00rootroot00000000000000"""Common functions used in Item Loaders code""" from functools import partial from typing import Any, Callable, MutableMapping from itemloaders.utils import get_func_args def wrap_loader_context( function: Callable[..., Any], context: MutableMapping[str, Any] ) -> Callable[..., Any]: """Wrap functions that receive loader_context to contain the context "pre-loaded" and expose a interface that receives only one argument """ if "loader_context" in get_func_args(function): return partial(function, loader_context=context) else: return function 
itemloaders-1.3.1/itemloaders/processors.py000066400000000000000000000215321462733172400211240ustar00rootroot00000000000000""" This module provides some commonly used processors for Item Loaders. See documentation in docs/topics/loaders.rst """ from collections import ChainMap from typing import Any, Callable, Iterable, List, MutableMapping, Optional from itemloaders.common import wrap_loader_context from itemloaders.utils import arg_to_iter class MapCompose: """ A processor which is constructed from the composition of the given functions, similar to the :class:`Compose` processor. The difference with this processor is the way internal results are passed among functions, which is as follows: The input value of this processor is *iterated* and the first function is applied to each element. The results of these function calls (one for each element) are concatenated to construct a new iterable, which is then used to apply the second function, and so on, until the last function is applied to each value of the list of values collected so far. The output values of the last function are concatenated together to produce the output of this processor. Each particular function can return a value or a list of values, which is flattened with the list of values returned by the same function applied to the other input values. The functions can also return ``None`` in which case the output of that function is ignored for further processing over the chain. This processor provides a convenient way to compose functions that only work with single values (instead of iterables). For this reason the :class:`MapCompose` processor is typically used as input processor, since data is often extracted using the :meth:`~parsel.selector.Selector.extract` method of `parsel selectors`_, which returns a list of unicode strings. The example below should clarify how it works: >>> def filter_world(x): ... return None if x == 'world' else x ... 
>>> from itemloaders.processors import MapCompose >>> proc = MapCompose(filter_world, str.upper) >>> proc(['hello', 'world', 'this', 'is', 'something']) ['HELLO', 'THIS', 'IS', 'SOMETHING'] As with the Compose processor, functions can receive Loader contexts, and ``__init__`` method keyword arguments are used as default context values. See :class:`Compose` processor for more info. .. _`parsel selectors`: https://parsel.readthedocs.io/en/latest/parsel.html#parsel.selector.Selector.extract """ # noqa def __init__(self, *functions: Callable[..., Any], **default_loader_context: Any): self.functions = functions self.default_loader_context = default_loader_context def __call__( self, value: Any, loader_context: Optional[MutableMapping[str, Any]] = None ) -> Iterable[Any]: values = arg_to_iter(value) context: MutableMapping[str, Any] if loader_context: context = ChainMap(loader_context, self.default_loader_context) else: context = self.default_loader_context wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions] for func in wrapped_funcs: next_values: List[Any] = [] for v in values: try: next_values += arg_to_iter(func(v)) except Exception as e: raise ValueError( "Error in MapCompose with " "%s value=%r error='%s: %s'" % (str(func), value, type(e).__name__, str(e)) ) from e values = next_values return values class Compose: """ A processor which is constructed from the composition of the given functions. This means that each input value of this processor is passed to the first function, and the result of that function is passed to the second function, and so on, until the last function returns the output value of this processor. By default, stop process on ``None`` value. This behaviour can be changed by passing keyword argument ``stop_on_none=False``. 
Example: >>> from itemloaders.processors import Compose >>> proc = Compose(lambda v: v[0], str.upper) >>> proc(['hello', 'world']) 'HELLO' Each function can optionally receive a ``loader_context`` parameter. For those which do, this processor will pass the currently active :ref:`Loader context ` through that parameter. The keyword arguments passed in the ``__init__`` method are used as the default Loader context values passed to each function call. However, the final Loader context values passed to functions are overridden with the currently active Loader context accessible through the :attr:`ItemLoader.context ` attribute. """ def __init__(self, *functions: Callable[..., Any], **default_loader_context: Any): self.functions = functions self.stop_on_none = default_loader_context.get("stop_on_none", True) self.default_loader_context = default_loader_context def __call__( self, value: Any, loader_context: Optional[MutableMapping[str, Any]] = None ) -> Any: context: MutableMapping[str, Any] if loader_context: context = ChainMap(loader_context, self.default_loader_context) else: context = self.default_loader_context wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions] for func in wrapped_funcs: if value is None and self.stop_on_none: break try: value = func(value) except Exception as e: raise ValueError( "Error in Compose with " "%s value=%r error='%s: %s'" % (str(func), value, type(e).__name__, str(e)) ) from e return value class TakeFirst: """ Returns the first non-null/non-empty value from the values received, so it's typically used as an output processor to single-valued fields. It doesn't receive any ``__init__`` method arguments, nor does it accept Loader contexts. 
Example: >>> from itemloaders.processors import TakeFirst >>> proc = TakeFirst() >>> proc(['', 'one', 'two', 'three']) 'one' """ def __call__(self, values: Any) -> Any: for value in values: if value is not None and value != "": return value class Identity: """ The simplest processor, which doesn't do anything. It returns the original values unchanged. It doesn't receive any ``__init__`` method arguments, nor does it accept Loader contexts. Example: >>> from itemloaders.processors import Identity >>> proc = Identity() >>> proc(['one', 'two', 'three']) ['one', 'two', 'three'] """ def __call__(self, values: Any) -> Any: return values class SelectJmes: """ Query the input string for the jmespath (given at instantiation), and return the answer Requires : jmespath(https://github.com/jmespath/jmespath) Note: SelectJmes accepts only one input element at a time. Example: >>> from itemloaders.processors import SelectJmes, Compose, MapCompose >>> proc = SelectJmes("foo") #for direct use on lists and dictionaries >>> proc({'foo': 'bar'}) 'bar' >>> proc({'foo': {'bar': 'baz'}}) {'bar': 'baz'} Working with Json: >>> import json >>> proc_single_json_str = Compose(json.loads, SelectJmes("foo")) >>> proc_single_json_str('{"foo": "bar"}') 'bar' >>> proc_json_list = Compose(json.loads, MapCompose(SelectJmes('foo'))) >>> proc_json_list('[{"foo":"bar"}, {"baz":"tar"}]') ['bar'] """ def __init__(self, json_path: str): self.json_path: str = json_path import jmespath.parser self.compiled_path: jmespath.parser.ParsedResult = jmespath.compile( self.json_path ) def __call__(self, value: Any) -> Any: """Query value for the jmespath query and return answer :param value: a data structure (dict, list) to extract from :return: Element extracted according to jmespath query """ return self.compiled_path.search(value) class Join: """ Returns the values joined with the separator given in the ``__init__`` method, which defaults to ``' '``. It doesn't accept Loader contexts. 
When using the default separator, this processor is equivalent to the function: ``' '.join`` Examples: >>> from itemloaders.processors import Join >>> proc = Join() >>> proc(['one', 'two', 'three']) 'one two three' >>> proc = Join('
') >>> proc(['one', 'two', 'three']) 'one
two
def get_func_args(func: Callable[..., Any], stripself: bool = False) -> List[str]:
    """Return the argument name list of a callable object.

    :param func: the callable to inspect
    :param stripself: when true, drop a leading ``self`` from the result
    :return: the parameter names reported by :func:`inspect.signature`, in
        declaration order; an empty list if no signature can be obtained
    :raises TypeError: if *func* is not callable

    For :class:`functools.partial` objects, parameters that were bound by
    keyword are left out of the result. Parameters bound positionally need
    no special handling: ``inspect.signature`` already omits them.
    """
    if not callable(func):
        raise TypeError(f"func must be callable, got {type(func).__name__!r}")

    try:
        sig = inspect.signature(func)
    except ValueError:
        # Some callables (e.g. certain builtins) expose no signature.
        return []

    if isinstance(func, partial):
        # Only compare parameter names against the *names* bound by keyword.
        # ``func.args`` holds bound positional *values*, so matching names
        # against it would wrongly drop a parameter whenever a bound value
        # happens to equal a remaining parameter's name.
        bound_keywords = func.keywords or {}
        args = [name for name in sig.parameters if name not in bound_keywords]
    else:
        args = list(sig.parameters)

    if stripself and args and args[0] == "self":
        args = args[1:]
    return args
from setuptools import find_packages, setup

# Read the long description with an explicit encoding so the build does not
# depend on the platform's default locale.
with open("README.rst", encoding="utf-8") as f:
    long_description = f.read()


setup(
    name="itemloaders",
    version="1.3.1",
    url="https://github.com/scrapy/itemloaders",
    project_urls={
        "Documentation": "https://itemloaders.readthedocs.io/",
        "Source": "https://github.com/scrapy/itemloaders",
    },
    description="Base library for scrapy's ItemLoader",
    long_description=long_description,
    long_description_content_type="text/x-rst",
    author="Zyte",
    author_email="opensource@zyte.com",
    license="BSD",
    packages=find_packages(exclude=("tests", "tests.*")),
    package_data={
        # The key must name the package that ships the ``py.typed`` marker
        # (itemloaders/py.typed); a key for a package that is not part of
        # this distribution is silently ignored by setuptools.
        "itemloaders": ["py.typed"],
    },
    include_package_data=True,
    zip_safe=False,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: Implementation :: CPython",
        "Programming Language :: Python :: Implementation :: PyPy",
    ],
    python_requires=">=3.8",
    install_requires=[
        # before updating these versions, be sure they are not higher than
        # scrapy's requirements
        "w3lib>=1.17.0",
        "parsel>=1.5.0",
        "jmespath>=0.9.5",
        "itemadapter>=0.1.0",
    ],
    # extras_require=extras_require,
)
def processor_with_args(value, other=None, loader_context=None):
    """Test processor that lets the loader context override its input.

    :param value: the value being processed
    :param other: unused; present so the processor has an extra argument
    :param loader_context: optional mapping; when it contains ``"key"``,
        the value stored under that key is returned instead of *value*
    :return: ``loader_context["key"]`` if available, otherwise *value*
    """
    # ``loader_context`` defaults to None, so guard before the membership
    # test; ``"key" in None`` would raise TypeError.
    if loader_context is not None and "key" in loader_context:
        return loader_context["key"]
    return value
il.add_value("name", "") il.add_value("price", ["0"]) assert il.load_item() == {"name": "", "price": 0.0} il.replace_value("sku", [valid_fragment], re=sku_re) self.assertEqual(il.load_item()["sku"], "1234") def test_self_referencing_loader(self): class MyLoader(ItemLoader): url_out = TakeFirst() def img_url_out(self, values): return (self.get_output_value("url") or "") + values[0] il = MyLoader(item={}) il.add_value("url", "http://example.com/") il.add_value("img_url", "1234.png") assert il.load_item() == { "url": "http://example.com/", "img_url": "http://example.com/1234.png", } il = MyLoader(item={}) il.add_value("img_url", "1234.png") assert il.load_item() == {"img_url": "1234.png"} def test_add_value(self): il = CustomItemLoader() il.add_value("name", "marta") assert il.get_collected_values("name") == ["Marta"] assert il.get_output_value("name") == ["Marta"] il.add_value("name", "pepe") assert il.get_collected_values("name") == ["Marta", "Pepe"] assert il.get_output_value("name") == ["Marta", "Pepe"] # test add object value il.add_value("summary", {"key": 1}) assert il.get_collected_values("summary") == [{"key": 1}] il.add_value(None, "Jim", lambda x: {"name": x}) assert il.get_collected_values("name") == ["Marta", "Pepe", "Jim"] def test_add_zero(self): il = ItemLoader() il.add_value("name", 0) assert il.get_collected_values("name") == [0] def test_add_none(self): il = ItemLoader() il.add_value("name", None) assert il.get_collected_values("name") == [] def test_replace_value(self): il = CustomItemLoader() il.replace_value("name", "marta") self.assertEqual(il.get_collected_values("name"), ["Marta"]) self.assertEqual(il.get_output_value("name"), ["Marta"]) il.replace_value("name", "pepe") self.assertEqual(il.get_collected_values("name"), ["Pepe"]) self.assertEqual(il.get_output_value("name"), ["Pepe"]) il.replace_value(None, "Jim", lambda x: {"name": x}) self.assertEqual(il.get_collected_values("name"), ["Jim"]) def test_replace_value_none(self): il = 
CustomItemLoader() il.replace_value("name", None) self.assertEqual(il.get_collected_values("name"), []) il.replace_value("name", "marta") self.assertEqual(il.get_collected_values("name"), ["Marta"]) il.replace_value( "name", None ) # when replacing with `None` nothing should happen self.assertEqual(il.get_collected_values("name"), ["Marta"]) def test_get_value(self): il = ItemLoader() self.assertEqual("FOO", il.get_value(["foo", "bar"], TakeFirst(), str.upper)) self.assertEqual( ["foo", "bar"], il.get_value(["name:foo", "name:bar"], re="name:(.*)$") ) self.assertEqual( "foo", il.get_value(["name:foo", "name:bar"], TakeFirst(), re="name:(.*)$") ) self.assertEqual( None, il.get_value(["foo", "bar"], TakeFirst(), re="name:(.*)$") ) self.assertEqual(None, il.get_value(None, TakeFirst())) il.add_value("name", ["name:foo", "name:bar"], TakeFirst(), re="name:(.*)$") self.assertEqual(["foo"], il.get_collected_values("name")) il.replace_value("name", "name:bar", re="name:(.*)$") self.assertEqual(["bar"], il.get_collected_values("name")) def test_iter_on_input_processor_input(self): class NameFirstItemLoader(ItemLoader): name_in = TakeFirst() il = NameFirstItemLoader() il.add_value("name", "marta") self.assertEqual(il.get_collected_values("name"), ["marta"]) il = NameFirstItemLoader() il.add_value("name", ["marta", "jose"]) self.assertEqual(il.get_collected_values("name"), ["marta"]) il = NameFirstItemLoader() il.replace_value("name", "marta") self.assertEqual(il.get_collected_values("name"), ["marta"]) il = NameFirstItemLoader() il.replace_value("name", ["marta", "jose"]) self.assertEqual(il.get_collected_values("name"), ["marta"]) il = NameFirstItemLoader() il.add_value("name", "marta") il.add_value("name", ["jose", "pedro"]) self.assertEqual(il.get_collected_values("name"), ["marta", "jose"]) def test_map_compose_filter(self): def filter_world(x): return None if x == "world" else x proc = MapCompose(filter_world, str.upper) self.assertEqual( proc(["hello", "world", 
"this", "is", "scrapy"]), ["HELLO", "THIS", "IS", "SCRAPY"], ) def test_map_compose_filter_multil(self): class CustomItemLoader(ItemLoader): name_in = MapCompose(lambda v: v.title(), lambda v: v[:-1]) il = CustomItemLoader() il.add_value("name", "marta") self.assertEqual(il.get_output_value("name"), ["Mart"]) item = il.load_item() self.assertEqual(item["name"], ["Mart"]) def test_default_input_processor(self): il = DefaultedItemLoader() il.add_value("name", "marta") self.assertEqual(il.get_output_value("name"), ["mart"]) def test_inherited_default_input_processor(self): class InheritDefaultedItemLoader(DefaultedItemLoader): pass il = InheritDefaultedItemLoader() il.add_value("name", "marta") self.assertEqual(il.get_output_value("name"), ["mart"]) def test_input_processor_inheritance(self): class ChildItemLoader(CustomItemLoader): url_in = MapCompose(lambda v: v.lower()) il = ChildItemLoader() il.add_value("url", "HTTP://scrapy.ORG") self.assertEqual(il.get_output_value("url"), ["http://scrapy.org"]) il.add_value("name", "marta") self.assertEqual(il.get_output_value("name"), ["Marta"]) class ChildChildItemLoader(ChildItemLoader): url_in = MapCompose(lambda v: v.upper()) summary_in = MapCompose(lambda v: v) il = ChildChildItemLoader() il.add_value("url", "http://scrapy.org") self.assertEqual(il.get_output_value("url"), ["HTTP://SCRAPY.ORG"]) il.add_value("name", "marta") self.assertEqual(il.get_output_value("name"), ["Marta"]) def test_empty_map_compose(self): class IdentityDefaultedItemLoader(DefaultedItemLoader): name_in = MapCompose() il = IdentityDefaultedItemLoader() il.add_value("name", "marta") self.assertEqual(il.get_output_value("name"), ["marta"]) def test_identity_input_processor(self): class IdentityDefaultedItemLoader(DefaultedItemLoader): name_in = Identity() il = IdentityDefaultedItemLoader() il.add_value("name", "marta") self.assertEqual(il.get_output_value("name"), ["marta"]) def test_extend_custom_input_processors(self): class 
ChildItemLoader(CustomItemLoader): name_in = MapCompose(CustomItemLoader.name_in, str.swapcase) il = ChildItemLoader() il.add_value("name", "marta") self.assertEqual(il.get_output_value("name"), ["mARTA"]) def test_extend_default_input_processors(self): class ChildDefaultedItemLoader(DefaultedItemLoader): name_in = MapCompose( DefaultedItemLoader.default_input_processor, str.swapcase ) il = ChildDefaultedItemLoader() il.add_value("name", "marta") self.assertEqual(il.get_output_value("name"), ["MART"]) def test_output_processor_using_function(self): il = CustomItemLoader() il.add_value("name", ["mar", "ta"]) self.assertEqual(il.get_output_value("name"), ["Mar", "Ta"]) class TakeFirstItemLoader(CustomItemLoader): name_out = " ".join il = TakeFirstItemLoader() il.add_value("name", ["mar", "ta"]) self.assertEqual(il.get_output_value("name"), "Mar Ta") def test_output_processor_error(self): class CustomItemLoader(ItemLoader): name_out = MapCompose(float) il = CustomItemLoader() il.add_value("name", ["$10"]) try: float("$10") except Exception as e: expected_exc_str = str(e) exc = None try: il.load_item() except Exception as e: exc = e assert isinstance(exc, ValueError) s = str(exc) assert "name" in s, s assert "$10" in s, s assert "ValueError" in s, s assert expected_exc_str in s, s def test_output_processor_using_classes(self): il = CustomItemLoader() il.add_value("name", ["mar", "ta"]) self.assertEqual(il.get_output_value("name"), ["Mar", "Ta"]) class TakeFirstItemLoader1(CustomItemLoader): name_out = Join() il = TakeFirstItemLoader1() il.add_value("name", ["mar", "ta"]) self.assertEqual(il.get_output_value("name"), "Mar Ta") class TakeFirstItemLoader2(CustomItemLoader): name_out = Join("
") il = TakeFirstItemLoader2() il.add_value("name", ["mar", "ta"]) self.assertEqual(il.get_output_value("name"), "Mar
Ta") def test_default_output_processor(self): il = CustomItemLoader() il.add_value("name", ["mar", "ta"]) self.assertEqual(il.get_output_value("name"), ["Mar", "Ta"]) class LalaItemLoader(CustomItemLoader): default_output_processor = Identity() il = LalaItemLoader() il.add_value("name", ["mar", "ta"]) self.assertEqual(il.get_output_value("name"), ["Mar", "Ta"]) def test_loader_context_on_declaration(self): class ChildItemLoader(CustomItemLoader): url_in = MapCompose(processor_with_args, key="val") il = ChildItemLoader() il.add_value("url", "text") self.assertEqual(il.get_output_value("url"), ["val"]) il.replace_value("url", "text2") self.assertEqual(il.get_output_value("url"), ["val"]) def test_loader_context_on_instantiation(self): class ChildItemLoader(CustomItemLoader): url_in = MapCompose(processor_with_args) il = ChildItemLoader(key="val") il.add_value("url", "text") self.assertEqual(il.get_output_value("url"), ["val"]) il.replace_value("url", "text2") self.assertEqual(il.get_output_value("url"), ["val"]) def test_loader_context_on_assign(self): class ChildItemLoader(CustomItemLoader): url_in = MapCompose(processor_with_args) il = ChildItemLoader() il.context["key"] = "val" il.add_value("url", "text") self.assertEqual(il.get_output_value("url"), ["val"]) il.replace_value("url", "text2") self.assertEqual(il.get_output_value("url"), ["val"]) def test_item_passed_to_input_processor_functions(self): def processor(value, loader_context): return loader_context["item"]["name"] class ChildItemLoader(CustomItemLoader): url_in = MapCompose(processor) it = {"name": "marta"} il = ChildItemLoader(item=it) il.add_value("url", "text") self.assertEqual(il.get_output_value("url"), ["marta"]) il.replace_value("url", "text2") self.assertEqual(il.get_output_value("url"), ["marta"]) # def test_add_value_on_unknown_field(self): # il = CustomItemLoader() # self.assertRaises(KeyError, il.add_value, 'wrong_field', ['lala', 'lolo']) def test_compose_processor(self): class 
CustomItemLoader(ItemLoader): name_out = Compose(lambda v: v[0], lambda v: v.title(), lambda v: v[:-1]) il = CustomItemLoader() il.add_value("name", ["marta", "other"]) self.assertEqual(il.get_output_value("name"), "Mart") item = il.load_item() self.assertEqual(item["name"], "Mart") def test_partial_processor(self): def join(values, sep=None, loader_context=None, ignored=None): if sep is not None: return sep.join(values) elif loader_context and "sep" in loader_context: return loader_context["sep"].join(values) else: return "".join(values) class CustomItemLoader(ItemLoader): name_out = Compose(partial(join, sep="+")) url_out = Compose(partial(join, loader_context={"sep": "."})) summary_out = Compose(partial(join, ignored="foo")) il = CustomItemLoader() il.add_value("name", ["rabbit", "hole"]) il.add_value("url", ["rabbit", "hole"]) il.add_value("summary", ["rabbit", "hole"]) item = il.load_item() self.assertEqual(item["name"], "rabbit+hole") self.assertEqual(item["url"], "rabbit.hole") self.assertEqual(item["summary"], "rabbithole") def test_error_input_processor(self): class CustomItemLoader(ItemLoader): name_in = MapCompose(float) il = CustomItemLoader() self.assertRaises(ValueError, il.add_value, "name", ["marta", "other"]) def test_error_output_processor(self): class CustomItemLoader(ItemLoader): name_out = Compose(Join(), float) il = CustomItemLoader() il.add_value("name", "marta") with self.assertRaises(ValueError): il.load_item() def test_error_processor_as_argument(self): il = CustomItemLoader() self.assertRaises( ValueError, il.add_value, "name", ["marta", "other"], Compose(float) ) def test_get_unset_value(self): loader = ItemLoader() self.assertEqual(loader.load_item(), {}) self.assertEqual(loader.get_output_value("foo"), []) self.assertEqual(loader.load_item(), {}) class BaseNoInputReprocessingLoader(ItemLoader): title_in = MapCompose(str.upper) title_out = TakeFirst() class NoInputReprocessingDictLoader(BaseNoInputReprocessingLoader): default_item_class 
= dict class NoInputReprocessingFromDictTest(unittest.TestCase): """ Loaders initialized from loaded items must not reprocess fields (dict instances) """ def test_avoid_reprocessing_with_initial_values_single(self): il = NoInputReprocessingDictLoader(item={"title": "foo"}) il_loaded = il.load_item() self.assertEqual(il_loaded, {"title": "foo"}) self.assertEqual( NoInputReprocessingDictLoader(item=il_loaded).load_item(), {"title": "foo"} ) def test_avoid_reprocessing_with_initial_values_list(self): il = NoInputReprocessingDictLoader(item={"title": ["foo", "bar"]}) il_loaded = il.load_item() self.assertEqual(il_loaded, {"title": "foo"}) self.assertEqual( NoInputReprocessingDictLoader(item=il_loaded).load_item(), {"title": "foo"} ) def test_avoid_reprocessing_without_initial_values_single(self): il = NoInputReprocessingDictLoader() il.add_value("title", "foo") il_loaded = il.load_item() self.assertEqual(il_loaded, {"title": "FOO"}) self.assertEqual( NoInputReprocessingDictLoader(item=il_loaded).load_item(), {"title": "FOO"} ) def test_avoid_reprocessing_without_initial_values_list(self): il = NoInputReprocessingDictLoader() il.add_value("title", ["foo", "bar"]) il_loaded = il.load_item() self.assertEqual(il_loaded, {"title": "FOO"}) self.assertEqual( NoInputReprocessingDictLoader(item=il_loaded).load_item(), {"title": "FOO"} ) itemloaders-1.3.1/tests/test_loader_initialization.py000066400000000000000000000111151462733172400231640ustar00rootroot00000000000000import unittest from typing import Any, Protocol from itemloaders import ItemLoader class InitializationTestProtocol(Protocol): item_class: Any def assertEqual(self, first: Any, second: Any, msg: Any = ...) -> None: ... def assertIsInstance(self, obj: object, cls: type, msg: Any = None) -> None: ... 
class InitializationTestMixin: item_class: Any = None def test_keep_single_value(self: InitializationTestProtocol) -> None: """Loaded item should contain values from the initial item""" input_item = self.item_class(name="foo") il = ItemLoader(item=input_item) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) self.assertEqual(dict(loaded_item), {"name": ["foo"]}) def test_keep_list(self: InitializationTestProtocol) -> None: """Loaded item should contain values from the initial item""" input_item = self.item_class(name=["foo", "bar"]) il = ItemLoader(item=input_item) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) self.assertEqual(dict(loaded_item), {"name": ["foo", "bar"]}) def test_add_value_singlevalue_singlevalue( self: InitializationTestProtocol, ) -> None: """Values added after initialization should be appended""" input_item = self.item_class(name="foo") il = ItemLoader(item=input_item) il.add_value("name", "bar") loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) self.assertEqual(dict(loaded_item), {"name": ["foo", "bar"]}) def test_add_value_singlevalue_list(self: InitializationTestProtocol) -> None: """Values added after initialization should be appended""" input_item = self.item_class(name="foo") il = ItemLoader(item=input_item) il.add_value("name", ["item", "loader"]) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) self.assertEqual(dict(loaded_item), {"name": ["foo", "item", "loader"]}) def test_add_value_list_singlevalue(self: InitializationTestProtocol) -> None: """Values added after initialization should be appended""" input_item = self.item_class(name=["foo", "bar"]) il = ItemLoader(item=input_item) il.add_value("name", "qwerty") loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) self.assertEqual(dict(loaded_item), {"name": ["foo", "bar", "qwerty"]}) def test_add_value_list_list(self: 
InitializationTestProtocol) -> None: """Values added after initialization should be appended""" input_item = self.item_class(name=["foo", "bar"]) il = ItemLoader(item=input_item) il.add_value("name", ["item", "loader"]) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) self.assertEqual(dict(loaded_item), {"name": ["foo", "bar", "item", "loader"]}) def test_get_output_value_singlevalue(self: InitializationTestProtocol) -> None: """Getting output value must not remove value from item""" input_item = self.item_class(name="foo") il = ItemLoader(item=input_item) self.assertEqual(il.get_output_value("name"), ["foo"]) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) self.assertEqual(loaded_item, {"name": ["foo"]}) def test_get_output_value_list(self: InitializationTestProtocol) -> None: """Getting output value must not remove value from item""" input_item = self.item_class(name=["foo", "bar"]) il = ItemLoader(item=input_item) self.assertEqual(il.get_output_value("name"), ["foo", "bar"]) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) self.assertEqual(loaded_item, {"name": ["foo", "bar"]}) def test_values_single(self: InitializationTestProtocol) -> None: """Values from initial item must be added to loader._values""" input_item = self.item_class(name="foo") il = ItemLoader(item=input_item) self.assertEqual(il._values.get("name"), ["foo"]) def test_values_list(self: InitializationTestProtocol) -> None: """Values from initial item must be added to loader._values""" input_item = self.item_class(name=["foo", "bar"]) il = ItemLoader(item=input_item) self.assertEqual(il._values.get("name"), ["foo", "bar"]) class InitializationFromDictTest(InitializationTestMixin, unittest.TestCase): item_class = dict itemloaders-1.3.1/tests/test_nested_items.py000066400000000000000000000024451462733172400213000ustar00rootroot00000000000000import unittest from typing import Any from itemloaders import 
ItemLoader class NestedItemTest(unittest.TestCase): """Test that adding items as values works as expected.""" def _test_item(self, item: Any) -> None: il = ItemLoader() il.add_value("item_list", item) self.assertEqual(il.load_item(), {"item_list": [item]}) def test_attrs(self): try: import attr except ImportError: self.skipTest("Cannot import attr") @attr.s class TestItem: foo = attr.ib() self._test_item(TestItem(foo="bar")) def test_dataclass(self): try: from dataclasses import dataclass except ImportError: self.skipTest("Cannot import dataclasses.dataclass") @dataclass class TestItem: foo: str self._test_item(TestItem(foo="bar")) def test_dict(self): self._test_item({"foo": "bar"}) def test_scrapy_item(self): try: from scrapy import Field, Item except ImportError: self.skipTest("Cannot import Field or Item from scrapy") # needs py.typed in Scrapy class TestItem(Item): # type: ignore[misc] foo = Field() self._test_item(TestItem(foo="bar")) itemloaders-1.3.1/tests/test_nested_loader.py000066400000000000000000000105731462733172400214260ustar00rootroot00000000000000import unittest from parsel import Selector from itemloaders import ItemLoader class SubselectorLoaderTest(unittest.TestCase): selector = Selector( text="""
marta

paragraph

""" ) def test_nested_xpath(self): loader = ItemLoader(selector=self.selector) nl = loader.nested_xpath("//header") nl.add_xpath("name", "div/text()") nl.add_css("name_div", "#id") assert nl.selector nl.add_value("name_value", nl.selector.xpath('div[@id = "id"]/text()').getall()) self.assertEqual(loader.get_output_value("name"), ["marta"]) self.assertEqual( loader.get_output_value("name_div"), ['
marta
'] ) self.assertEqual(loader.get_output_value("name_value"), ["marta"]) self.assertEqual(loader.get_output_value("name"), nl.get_output_value("name")) self.assertEqual( loader.get_output_value("name_div"), nl.get_output_value("name_div") ) self.assertEqual( loader.get_output_value("name_value"), nl.get_output_value("name_value") ) def test_nested_css(self): loader = ItemLoader(selector=self.selector) nl = loader.nested_css("header") nl.add_xpath("name", "div/text()") nl.add_css("name_div", "#id") assert nl.selector nl.add_value("name_value", nl.selector.xpath('div[@id = "id"]/text()').getall()) self.assertEqual(loader.get_output_value("name"), ["marta"]) self.assertEqual( loader.get_output_value("name_div"), ['
marta
'] ) self.assertEqual(loader.get_output_value("name_value"), ["marta"]) self.assertEqual(loader.get_output_value("name"), nl.get_output_value("name")) self.assertEqual( loader.get_output_value("name_div"), nl.get_output_value("name_div") ) self.assertEqual( loader.get_output_value("name_value"), nl.get_output_value("name_value") ) def test_nested_replace(self): loader = ItemLoader(selector=self.selector) nl1 = loader.nested_xpath("//footer") nl2 = nl1.nested_xpath("a") loader.add_xpath("url", "//footer/a/@href") self.assertEqual(loader.get_output_value("url"), ["http://www.scrapy.org"]) nl1.replace_xpath("url", "img/@src") self.assertEqual(loader.get_output_value("url"), ["/images/logo.png"]) nl2.replace_xpath("url", "@href") self.assertEqual(loader.get_output_value("url"), ["http://www.scrapy.org"]) def test_nested_ordering(self): loader = ItemLoader(selector=self.selector) nl1 = loader.nested_xpath("//footer") nl2 = nl1.nested_xpath("a") nl1.add_xpath("url", "img/@src") loader.add_xpath("url", "//footer/a/@href") nl2.add_xpath("url", "text()") loader.add_xpath("url", "//footer/a/@href") self.assertEqual( loader.get_output_value("url"), [ "/images/logo.png", "http://www.scrapy.org", "homepage", "http://www.scrapy.org", ], ) def test_nested_load_item(self): loader = ItemLoader(selector=self.selector) nl1 = loader.nested_xpath("//footer") nl2 = nl1.nested_xpath("img") loader.add_xpath("name", "//header/div/text()") nl1.add_xpath("url", "a/@href") nl2.add_xpath("image", "@src") item = loader.load_item() assert item is loader.item assert item is nl1.item assert item is nl2.item self.assertEqual(item["name"], ["marta"]) self.assertEqual(item["url"], ["http://www.scrapy.org"]) self.assertEqual(item["image"], ["/images/logo.png"]) def test_nested_empty_selector(self): loader = ItemLoader(selector=self.selector) nested_xpath = loader.nested_xpath("//bar") assert isinstance(nested_xpath, ItemLoader) nested_xpath.add_xpath("foo", "./foo") nested_css = 
class TestOutputProcessorDict(unittest.TestCase):
    """Output processing when the loader creates its own dict-based item."""

    def test_output_processor(self):
        class TempDict(Dict[str, Any]):
            """Dict item that always carries a default ``temp`` entry."""

            def __init__(self, *args, **kwargs):
                # Forward the arguments untouched. Passing ``self`` as an
                # extra positional argument would make any real positional
                # argument fail, since ``dict`` accepts at most one.
                super().__init__(*args, **kwargs)
                self.setdefault("temp", 0.3)

        class TempLoader(ItemLoader):
            default_item_class = TempDict
            default_input_processor = Identity()
            default_output_processor = Compose(TakeFirst())

        loader = TempLoader()
        item = loader.load_item()
        # The pre-populated default must survive loading and be reduced to a
        # single value by the output processor.
        self.assertIsInstance(item, TempDict)
        self.assertEqual(dict(item), {"temp": 0.3})
class SelectJmesTestCase(unittest.TestCase):
    """Table-driven checks for the ``SelectJmes`` processor."""

    # case name -> (jmespath expression, input data, expected result)
    test_list_equals = {
        "simple": ("foo.bar", {"foo": {"bar": "baz"}}, "baz"),
        "invalid": ("foo.bar.baz", {"foo": {"bar": "baz"}}, None),
        "top_level": ("foo", {"foo": {"bar": "baz"}}, {"bar": "baz"}),
        "double_vs_single_quote_string": ("foo.bar", {"foo": {"bar": "baz"}}, "baz"),
        "dict": (
            "foo.bar[*].name",
            {"foo": {"bar": [{"name": "one"}, {"name": "two"}]}},
            ["one", "two"],
        ),
        "list": ("[1]", [1, 2], 2),
    }

    def test_output(self):
        """Each expression applied to its input must yield the expected value."""
        for key, (query, data, expected) in self.test_list_equals.items():
            processor = SelectJmes(query)
            test = processor(data)
            self.assertEqual(
                test, expected, msg=f"test {key!r} got {test} expected {expected}"
            )
import MagicMock from parsel import Selector from itemloaders import ItemLoader from itemloaders.processors import MapCompose, TakeFirst class CustomItemLoader(ItemLoader): name_in = MapCompose(lambda v: v.title()) class SelectortemLoaderTest(unittest.TestCase): selector = Selector( text="""
marta

paragraph

homepage Scrapy """ ) jmes_selector = Selector( text=""" { "name": "marta", "description": "paragraph", "website": { "url": "http://www.scrapy.org", "name": "homepage" }, "logo": "/images/logo.png" } """ ) def test_init_method(self): loader = CustomItemLoader() self.assertEqual(loader.selector, None) def test_init_method_errors(self): loader = CustomItemLoader() self.assertRaises(RuntimeError, loader.add_xpath, "url", "//a/@href") self.assertRaises(RuntimeError, loader.replace_xpath, "url", "//a/@href") self.assertRaises(RuntimeError, loader.get_xpath, "//a/@href") self.assertRaises(RuntimeError, loader.add_css, "name", "#name::text") self.assertRaises(RuntimeError, loader.replace_css, "name", "#name::text") self.assertRaises(RuntimeError, loader.get_css, "#name::text") def test_init_method_with_selector(self): loader = CustomItemLoader(selector=self.selector) self.assertTrue(loader.selector) loader.add_xpath("name", "//div/text()") self.assertEqual(loader.get_output_value("name"), ["Marta"]) def test_init_method_with_selector_css(self): loader = CustomItemLoader(selector=self.selector) self.assertTrue(loader.selector) loader.add_css("name", "div::text") self.assertEqual(loader.get_output_value("name"), ["Marta"]) loader.add_css("url", "a::attr(href)") self.assertEqual(loader.get_output_value("url"), ["http://www.scrapy.org"]) # combining/accumulating CSS selectors and XPath expressions loader.add_xpath("name", "//div/text()") self.assertEqual(loader.get_output_value("name"), ["Marta", "Marta"]) loader.add_xpath("url", "//img/@src") self.assertEqual( loader.get_output_value("url"), ["http://www.scrapy.org", "/images/logo.png"], ) def test_add_xpath_re(self): loader = CustomItemLoader(selector=self.selector) loader.add_xpath("name", "//div/text()", re="ma") self.assertEqual(loader.get_output_value("name"), ["Ma"]) loader = CustomItemLoader(selector=self.selector) loader.add_xpath("name", "//div/text()", re=re.compile("ma")) 
self.assertEqual(loader.get_output_value("name"), ["Ma"]) def test_add_xpath_variables(self): loader = CustomItemLoader(selector=self.selector) loader.add_xpath("name", "id($id)/text()", id="id") self.assertEqual(loader.get_output_value("name"), ["Marta"]) loader = CustomItemLoader(selector=self.selector) loader.add_xpath("name", "id($id)/text()", id="id2") self.assertEqual(loader.get_output_value("name"), []) def test_replace_xpath(self): loader = CustomItemLoader(selector=self.selector) self.assertTrue(loader.selector) loader.add_xpath("name", "//div/text()") self.assertEqual(loader.get_output_value("name"), ["Marta"]) loader.replace_xpath("name", "//p/text()") self.assertEqual(loader.get_output_value("name"), ["Paragraph"]) loader.replace_xpath("name", ["//p/text()", "//div/text()"]) self.assertEqual(loader.get_output_value("name"), ["Paragraph", "Marta"]) def test_get_xpath(self): loader = CustomItemLoader(selector=self.selector) self.assertEqual(loader.get_xpath("//p/text()"), ["paragraph"]) self.assertEqual(loader.get_xpath("//p/text()", TakeFirst()), "paragraph") self.assertEqual(loader.get_xpath("//p/text()", TakeFirst(), re="pa"), "pa") self.assertEqual( loader.get_xpath(["//p/text()", "//div/text()"]), ["paragraph", "marta"] ) def test_replace_xpath_multi_fields(self): loader = CustomItemLoader(selector=self.selector) loader.add_xpath(None, "//div/text()", TakeFirst(), lambda x: {"name": x}) self.assertEqual(loader.get_output_value("name"), ["Marta"]) loader.replace_xpath(None, "//p/text()", TakeFirst(), lambda x: {"name": x}) self.assertEqual(loader.get_output_value("name"), ["Paragraph"]) def test_replace_xpath_re(self): loader = CustomItemLoader(selector=self.selector) self.assertTrue(loader.selector) loader.add_xpath("name", "//div/text()") self.assertEqual(loader.get_output_value("name"), ["Marta"]) loader.replace_xpath("name", "//div/text()", re="ma") self.assertEqual(loader.get_output_value("name"), ["Ma"]) def test_add_css_re(self): loader = 
CustomItemLoader(selector=self.selector) loader.add_css("name", "div::text", re="ma") self.assertEqual(loader.get_output_value("name"), ["Ma"]) loader.add_css("url", "a::attr(href)", re="http://(.+)") self.assertEqual(loader.get_output_value("url"), ["www.scrapy.org"]) loader = CustomItemLoader(selector=self.selector) loader.add_css("name", "div::text", re=re.compile("ma")) self.assertEqual(loader.get_output_value("name"), ["Ma"]) loader.add_css("url", "a::attr(href)", re=re.compile("http://(.+)")) self.assertEqual(loader.get_output_value("url"), ["www.scrapy.org"]) def test_replace_css(self): loader = CustomItemLoader(selector=self.selector) self.assertTrue(loader.selector) loader.add_css("name", "div::text") self.assertEqual(loader.get_output_value("name"), ["Marta"]) loader.replace_css("name", "p::text") self.assertEqual(loader.get_output_value("name"), ["Paragraph"]) loader.replace_css("name", ["p::text", "div::text"]) self.assertEqual(loader.get_output_value("name"), ["Paragraph", "Marta"]) loader.add_css("url", "a::attr(href)", re="http://(.+)") self.assertEqual(loader.get_output_value("url"), ["www.scrapy.org"]) loader.replace_css("url", "img::attr(src)") self.assertEqual(loader.get_output_value("url"), ["/images/logo.png"]) def test_get_css(self): loader = CustomItemLoader(selector=self.selector) self.assertEqual(loader.get_css("p::text"), ["paragraph"]) self.assertEqual(loader.get_css("p::text", TakeFirst()), "paragraph") self.assertEqual(loader.get_css("p::text", TakeFirst(), re="pa"), "pa") self.assertEqual( loader.get_css(["p::text", "div::text"]), ["paragraph", "marta"] ) self.assertEqual( loader.get_css(["a::attr(href)", "img::attr(src)"]), ["http://www.scrapy.org", "/images/logo.png"], ) def test_replace_css_multi_fields(self): loader = CustomItemLoader(selector=self.selector) loader.add_css(None, "div::text", TakeFirst(), lambda x: {"name": x}) self.assertEqual(loader.get_output_value("name"), ["Marta"]) loader.replace_css(None, "p::text", 
TakeFirst(), lambda x: {"name": x}) self.assertEqual(loader.get_output_value("name"), ["Paragraph"]) loader.add_css(None, "a::attr(href)", TakeFirst(), lambda x: {"url": x}) self.assertEqual(loader.get_output_value("url"), ["http://www.scrapy.org"]) loader.replace_css(None, "img::attr(src)", TakeFirst(), lambda x: {"url": x}) self.assertEqual(loader.get_output_value("url"), ["/images/logo.png"]) def test_replace_css_re(self): loader = CustomItemLoader(selector=self.selector) self.assertTrue(loader.selector) loader.add_css("url", "a::attr(href)") self.assertEqual(loader.get_output_value("url"), ["http://www.scrapy.org"]) loader.replace_css("url", "a::attr(href)", re=r"http://www\.(.+)") self.assertEqual(loader.get_output_value("url"), ["scrapy.org"]) def test_jmes_not_installed(self): selector = MagicMock(spec=Selector) del selector.jmespath loader = CustomItemLoader(selector=selector) with self.assertRaises(AttributeError) as err: loader.add_jmes("name", "name", re="ma") self.assertEqual( str(err.exception), "Please install parsel >= 1.8.1 to get jmespath support" ) def test_add_jmes_re(self): loader = CustomItemLoader(selector=self.jmes_selector) loader.add_jmes("name", "name", re="ma") self.assertEqual(loader.get_output_value("name"), ["Ma"]) loader.add_jmes("url", "website.url", re="http://(.+)") self.assertEqual(loader.get_output_value("url"), ["www.scrapy.org"]) loader = CustomItemLoader(selector=self.jmes_selector) loader.add_jmes("name", "name", re=re.compile("ma")) self.assertEqual(loader.get_output_value("name"), ["Ma"]) loader.add_jmes("url", "website.url", re=re.compile("http://(.+)")) self.assertEqual(loader.get_output_value("url"), ["www.scrapy.org"]) def test_get_jmes(self): loader = CustomItemLoader(selector=self.jmes_selector) self.assertEqual(loader.get_jmes("description"), ["paragraph"]) self.assertEqual(loader.get_jmes("description", TakeFirst()), "paragraph") self.assertEqual(loader.get_jmes("description", TakeFirst(), re="pa"), "pa") 
self.assertEqual( loader.get_jmes(["description", "name"]), ["paragraph", "marta"] ) self.assertEqual( loader.get_jmes(["website.url", "logo"]), ["http://www.scrapy.org", "/images/logo.png"], ) def test_replace_jmes(self): loader = CustomItemLoader(selector=self.jmes_selector) self.assertTrue(loader.selector) loader.add_jmes("name", "name") self.assertEqual(loader.get_output_value("name"), ["Marta"]) loader.replace_jmes("name", "description") self.assertEqual(loader.get_output_value("name"), ["Paragraph"]) loader.replace_jmes("name", ["description", "name"]) self.assertEqual(loader.get_output_value("name"), ["Paragraph", "Marta"]) loader.add_jmes("url", "website.url", re="http://(.+)") self.assertEqual(loader.get_output_value("url"), ["www.scrapy.org"]) loader.replace_jmes("url", "logo") self.assertEqual(loader.get_output_value("url"), ["/images/logo.png"]) def test_replace_jmes_multi_fields(self): loader = CustomItemLoader(selector=self.jmes_selector) loader.add_jmes(None, "name", TakeFirst(), lambda x: {"name": x}) self.assertEqual(loader.get_output_value("name"), ["Marta"]) loader.replace_jmes(None, "description", TakeFirst(), lambda x: {"name": x}) self.assertEqual(loader.get_output_value("name"), ["Paragraph"]) loader.add_jmes(None, "website.url", TakeFirst(), lambda x: {"url": x}) self.assertEqual(loader.get_output_value("url"), ["http://www.scrapy.org"]) loader.replace_jmes(None, "logo", TakeFirst(), lambda x: {"url": x}) self.assertEqual(loader.get_output_value("url"), ["/images/logo.png"]) def test_replace_jmes_re(self): loader = CustomItemLoader(selector=self.jmes_selector) self.assertTrue(loader.selector) loader.add_jmes("url", "website.url") self.assertEqual(loader.get_output_value("url"), ["http://www.scrapy.org"]) loader.replace_jmes("url", "website.url", re=r"http://www\.(.+)") self.assertEqual(loader.get_output_value("url"), ["scrapy.org"]) def test_fluent_interface(self): loader = ItemLoader(selector=self.selector) item = ( loader.add_xpath("name", 
"//body/text()") .replace_xpath("name", "//div/text()") .add_css("description", "div::text") .replace_css("description", "p::text") .add_value("url", "http://example.com") .replace_value("url", "http://foo") .load_item() ) self.assertEqual( item, {"name": ["marta"], "description": ["paragraph"], "url": ["http://foo"]}, ) itemloaders-1.3.1/tests/test_utils_misc.py000066400000000000000000000015571462733172400207730ustar00rootroot00000000000000import unittest from itemloaders.utils import arg_to_iter class UtilsMiscTestCase(unittest.TestCase): def test_arg_to_iter(self): assert hasattr(arg_to_iter(None), "__iter__") assert hasattr(arg_to_iter(100), "__iter__") assert hasattr(arg_to_iter("lala"), "__iter__") assert hasattr(arg_to_iter([1, 2, 3]), "__iter__") assert hasattr(arg_to_iter(letter for letter in "abcd"), "__iter__") self.assertEqual(list(arg_to_iter(None)), []) self.assertEqual(list(arg_to_iter("lala")), ["lala"]) self.assertEqual(list(arg_to_iter(100)), [100]) self.assertEqual(list(arg_to_iter(letter for letter in "abc")), ["a", "b", "c"]) self.assertEqual(list(arg_to_iter([1, 2, 3])), [1, 2, 3]) self.assertEqual(list(arg_to_iter({"a": 1})), [{"a": 1}]) if __name__ == "__main__": unittest.main() itemloaders-1.3.1/tests/test_utils_python.py000066400000000000000000000041411462733172400213510ustar00rootroot00000000000000import functools import operator import platform import unittest from typing import Any from itemloaders.utils import get_func_args class UtilsPythonTestCase(unittest.TestCase): def test_get_func_args(self): def f1(a, b, c): pass def f2(a, b=None, c=None): pass def f3(a, b=None, *, c=None): pass class A: def __init__(self, a: Any, b: Any, c: Any): pass def method(self, a, b, c): pass class Callable: def __call__(self, a, b, c): pass a = A(1, 2, 3) cal = Callable() partial_f1 = functools.partial(f1, None) partial_f2 = functools.partial(f1, b=None) partial_f3 = functools.partial(partial_f2, None) self.assertEqual(get_func_args(f1), ["a", "b", 
"c"]) self.assertEqual(get_func_args(f2), ["a", "b", "c"]) self.assertEqual(get_func_args(f3), ["a", "b", "c"]) self.assertEqual(get_func_args(A), ["a", "b", "c"]) self.assertEqual(get_func_args(a.method), ["a", "b", "c"]) self.assertEqual(get_func_args(partial_f1), ["b", "c"]) self.assertEqual(get_func_args(partial_f2), ["a", "c"]) self.assertEqual(get_func_args(partial_f3), ["c"]) self.assertEqual(get_func_args(cal), ["a", "b", "c"]) self.assertEqual(get_func_args(object), []) self.assertEqual(get_func_args(str.split, stripself=True), ["sep", "maxsplit"]) self.assertEqual(get_func_args(" ".join, stripself=True), ["iterable"]) if platform.python_implementation() == "CPython": # This didn't work on older versions of CPython: https://github.com/python/cpython/issues/86951 self.assertIn( get_func_args(operator.itemgetter(2), stripself=True), [[], ["args", "kwargs"]], ) elif platform.python_implementation() == "PyPy": self.assertEqual( get_func_args(operator.itemgetter(2), stripself=True), ["obj"] ) if __name__ == "__main__": unittest.main() itemloaders-1.3.1/tox.ini000066400000000000000000000020151462733172400153460ustar00rootroot00000000000000[tox] envlist = py38,py39,py310,py311,py312 [testenv] deps = pytest pytest-cov commands = py.test \ --cov-report=term --cov-report=html --cov-report= --cov=itemloaders \ --doctest-modules \ {posargs:itemloaders tests} [testenv:extra-deps] deps = {[testenv]deps} attrs scrapy [testenv:pypy3] basepython = pypy3 [docs] changedir = docs deps = -rdocs/requirements.txt setenv = READTHEDOCS_PROJECT=itemloaders READTHEDOCS_VERSION=master [testenv:docs] basepython = python3 changedir = {[docs]changedir} deps = {[docs]deps} setenv = {[docs]setenv} commands = sphinx-build -W -b html . 
{envtmpdir}/html [testenv:twinecheck] basepython = python3 deps = twine==5.0.0 build==1.2.1 commands = python -m build --sdist twine check dist/* [testenv:typing] basepython = python3 deps = mypy==1.10.0 types-attrs==19.1.0 types-jmespath==1.0.2.20240106 commands = mypy --strict --ignore-missing-imports --implicit-reexport {posargs:itemloaders tests}