pax_global_header 0000666 0000000 0000000 00000000064 14627331724 0014523 g ustar 00root root 0000000 0000000 52 comment=e2bd0069c7790507edbc7e9b5f1985e116e2b476
itemloaders-1.3.1/ 0000775 0000000 0000000 00000000000 14627331724 0014035 5 ustar 00root root 0000000 0000000 itemloaders-1.3.1/.bumpversion.cfg 0000664 0000000 0000000 00000000134 14627331724 0017143 0 ustar 00root root 0000000 0000000 [bumpversion]
current_version = 1.3.1
commit = True
tag = True
[bumpversion:file:setup.py]
itemloaders-1.3.1/.git-blame-ignore-revs 0000664 0000000 0000000 00000000076 14627331724 0020140 0 ustar 00root root 0000000 0000000 # Apply black format
627f3bd9ea5210f40dbd5697eff9351bb5af019c
itemloaders-1.3.1/.github/ 0000775 0000000 0000000 00000000000 14627331724 0015375 5 ustar 00root root 0000000 0000000 itemloaders-1.3.1/.github/workflows/ 0000775 0000000 0000000 00000000000 14627331724 0017432 5 ustar 00root root 0000000 0000000 itemloaders-1.3.1/.github/workflows/main.yml 0000664 0000000 0000000 00000003404 14627331724 0021102 0 ustar 00root root 0000000 0000000 name: CI
on:
- pull_request
- push
jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- uses: pre-commit/action@v3.0.1
tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- python-version: '3.12' # Keep in sync with .readthedocs.yml
env:
TOXENV: docs
- python-version: '3.12'
env:
TOXENV: twinecheck
- python-version: 3.8
env:
TOXENV: py
- python-version: 3.9
env:
TOXENV: py
- python-version: pypy-3.9
env:
TOXENV: py
- python-version: pypy-3.10
env:
TOXENV: py
- python-version: 3.9
env:
TOXENV: extra-deps
- python-version: '3.10'
env:
TOXENV: py
- python-version: '3.11'
env:
TOXENV: py
- python-version: '3.12'
env:
TOXENV: py
steps:
- uses: actions/checkout@v4
- name: Install system libraries
if: contains(matrix.python-version, 'pypy')
run: |
sudo apt-get update
sudo apt-get install libxml2-dev libxslt-dev
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade tox codecov
- name: Run tests
env: ${{ matrix.env }}
run: tox
- name: Publish coverage data
uses: codecov/codecov-action@v1
itemloaders-1.3.1/.github/workflows/publish.yml 0000664 0000000 0000000 00000001056 14627331724 0021625 0 ustar 00root root 0000000 0000000 name: Publish on PyPI
on:
release:
types: [created]
jobs:
publish:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: 3.12
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade build twine
- name: Build
run: |
python -m build
- name: Upload
uses: pypa/gh-action-pypi-publish@v1.8.14
with:
password: ${{ secrets.PYPI_TOKEN }}
itemloaders-1.3.1/.gitignore 0000664 0000000 0000000 00000000267 14627331724 0016032 0 ustar 00root root 0000000 0000000 /.vagrant
/scrapy.iml
*.pyc
_trial_temp*
dropin.cache
docs/build
*egg-info
.tox
venv
.venv
build
dist
.idea
htmlcov/
.coverage
.pytest_cache/
.coverage.*
.cache/
# Windows
Thumbs.db
itemloaders-1.3.1/.pre-commit-config.yaml 0000664 0000000 0000000 00000001131 14627331724 0020312 0 ustar 00root root 0000000 0000000 default_language_version:
python: python3.12
repos:
- hooks:
- id: black
language_version: python3
repo: https://github.com/ambv/black
rev: 24.4.0
- hooks:
- id: isort
language_version: python3
repo: https://github.com/PyCQA/isort
rev: 5.13.2
- hooks:
- id: flake8
language_version: python3
additional_dependencies:
- flake8-bugbear
- flake8-comprehensions
- flake8-debugger
- flake8-docstrings
- flake8-string-format
repo: https://github.com/pycqa/flake8
rev: 7.0.0
itemloaders-1.3.1/.readthedocs.yml 0000664 0000000 0000000 00000000361 14627331724 0017123 0 ustar 00root root 0000000 0000000 version: 2
formats: all
sphinx:
configuration: docs/conf.py
build:
os: ubuntu-22.04
tools:
python: "3.12" # Keep in sync with .github/workflows/main.yml
python:
install:
- requirements: docs/requirements.txt
- path: .
itemloaders-1.3.1/LICENSE 0000664 0000000 0000000 00000002755 14627331724 0015053 0 ustar 00root root 0000000 0000000 Copyright (c) Scrapy developers.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions, and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of Scrapy nor the names of its contributors may be used
to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
itemloaders-1.3.1/MANIFEST.in 0000664 0000000 0000000 00000000020 14627331724 0015563 0 ustar 00root root 0000000 0000000 include LICENSE
itemloaders-1.3.1/README.rst 0000664 0000000 0000000 00000005152 14627331724 0015527 0 ustar 00root root 0000000 0000000 ===========
itemloaders
===========
.. image:: https://img.shields.io/pypi/v/itemloaders.svg
:target: https://pypi.python.org/pypi/itemloaders
:alt: PyPI Version
.. image:: https://img.shields.io/pypi/pyversions/itemloaders.svg
:target: https://pypi.python.org/pypi/itemloaders
:alt: Supported Python Versions
.. image:: https://github.com/scrapy/itemloaders/workflows/CI/badge.svg?branch=master
:target: https://github.com/scrapy/itemloaders/actions?workflow=CI
:alt: CI Status
.. image:: https://codecov.io/github/scrapy/itemloaders/coverage.svg?branch=master
:target: https://codecov.io/gh/scrapy/itemloaders
:alt: Coverage report
.. image:: https://readthedocs.org/projects/itemloaders/badge/?version=latest
:target: https://itemloaders.readthedocs.io/en/latest/?badge=latest
:alt: Documentation Status
``itemloaders`` is a library that helps you collect data from HTML and XML sources.
It comes in handy to extract data from web pages, as it supports
data extraction using CSS and XPath Selectors.
It's especially useful when you need to standardize the data from many sources.
For example, it allows you to have all your casting and parsing rules in a
single place.
Here is an example to get you started::
from itemloaders import ItemLoader
from parsel import Selector
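html_data = '''
<!DOCTYPE html>
<html>
    <head>
        <title>Some random product page</title>
    </head>
    <body>
        <div class="product_name">Some random product page</div>
        <p id="price">$ 100.12</p>
    </body>
</html>
'''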
loader = ItemLoader(selector=Selector(html_data))
loader.add_xpath('name', '//div[@class="product_name"]/text()')
loader.add_xpath('name', '//div[@class="product_title"]/text()')
loader.add_css('price', '#price::text')
loader.add_value('last_updated', 'today') # you can also use literal values
item = loader.load_item()
item
# {'name': ['Some random product page'], 'price': ['$ 100.12'], 'last_updated': ['today']}
For more information, check out the `documentation <https://itemloaders.readthedocs.io/en/latest/>`_.
Contributing
============
All contributions are welcome!
* If you want to review some code, check open
`Pull Requests here <https://github.com/scrapy/itemloaders/pulls>`_
* If you want to submit a code change
* File an `issue here <https://github.com/scrapy/itemloaders/issues>`_, if there isn't one yet
* Fork this repository
* Create a branch to work on your changes
* Run `pre-commit install` to install pre-commit hooks
* Push your local branch and submit a Pull Request
itemloaders-1.3.1/codecov.yml 0000664 0000000 0000000 00000000120 14627331724 0016173 0 ustar 00root root 0000000 0000000 comment:
layout: "header, diff, tree"
coverage:
status:
project: false
itemloaders-1.3.1/docs/ 0000775 0000000 0000000 00000000000 14627331724 0014765 5 ustar 00root root 0000000 0000000 itemloaders-1.3.1/docs/Makefile 0000664 0000000 0000000 00000005364 14627331724 0016435 0 ustar 00root root 0000000 0000000 #
# Makefile for Scrapy documentation [based on Python documentation Makefile]
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# You can set these variables from the command line.
PYTHON = python
SPHINXOPTS =
PAPER =
SOURCES =
SHELL = /bin/bash
ALLSPHINXOPTS = -b $(BUILDER) -d build/doctrees \
-D latex_elements.papersize=$(PAPER) \
$(SPHINXOPTS) . build/$(BUILDER) $(SOURCES)
.PHONY: help update build html htmlhelp clean
help:
@echo "Please use \`make ' where is one of"
@echo " html to make standalone HTML files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " text to make plain text files"
@echo " changes to make an overview over all changed/added/deprecated items"
@echo " linkcheck to check all external links for integrity"
@echo " watch build HTML docs, open in browser and watch for changes"
build-dirs:
mkdir -p build/$(BUILDER) build/doctrees
build: build-dirs
sphinx-build $(ALLSPHINXOPTS)
@echo
build-ignore-errors: build-dirs
-sphinx-build $(ALLSPHINXOPTS)
@echo
html: BUILDER = html
html: build
@echo "Build finished. The HTML pages are in build/html."
htmlhelp: BUILDER = htmlhelp
htmlhelp: build
@echo "Build finished; now you can run HTML Help Workshop with the" \
"build/htmlhelp/pydoc.hhp project file."
latex: BUILDER = latex
latex: build
@echo "Build finished; the LaTeX files are in build/latex."
@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
"run these through (pdf)latex."
text: BUILDER = text
text: build
@echo "Build finished; the text files are in build/text."
changes: BUILDER = changes
changes: build
@echo "The overview file is in build/changes."
linkcheck: BUILDER = linkcheck
linkcheck: build
@echo "Link check complete; look for any errors in the above output " \
"or in build/$(BUILDER)/output.txt"
linkfix: BUILDER = linkcheck
linkfix: build-ignore-errors
$(PYTHON) utils/linkfix.py
@echo "Fixing redirecting links in docs has finished; check all " \
"replacements before committing them"
doctest: BUILDER = doctest
doctest: build
@echo "Testing of doctests in the sources finished, look at the " \
"results in build/doctest/output.txt"
pydoc-topics: BUILDER = pydoc-topics
pydoc-topics: build
@echo "Building finished; now copy build/pydoc-topics/pydoc_topics.py " \
"into the Lib/ directory"
coverage: BUILDER = coverage
coverage: build
htmlview: html
$(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \
os.path.realpath('build/html/index.html'))"
clean:
-rm -rf build/*
watch: htmlview
watchmedo shell-command -p '*.rst' -c 'make html' -R -D
itemloaders-1.3.1/docs/README.rst 0000664 0000000 0000000 00000002531 14627331724 0016455 0 ustar 00root root 0000000 0000000 :orphan:
===========================================
itemloaders documentation quick start guide
===========================================
This file provides a quick guide on how to compile the itemloaders documentation.
Setup the environment
---------------------
To compile the documentation you need the Sphinx Python library. To install it
and all its dependencies run the following command from this dir
::
pip install -r requirements.txt
Compile the documentation
-------------------------
To compile the documentation (to classic HTML output) run the following command
from this dir::
make html
Documentation will be generated (in HTML format) inside the ``build/html`` dir.
View the documentation
----------------------
To view the documentation run the following command::
make htmlview
This command will fire up your default browser and open the main page of your
(previously generated) HTML documentation.
Start over
----------
To clean up all generated documentation files and start from scratch run::
make clean
Keep in mind that this command won't touch any documentation source files.
Recreating documentation on the fly
-----------------------------------
To recreate the documentation automatically whenever you make changes, you
need to install watchdog (``pip install watchdog``) and then use::
make watch
itemloaders-1.3.1/docs/_ext/ 0000775 0000000 0000000 00000000000 14627331724 0015724 5 ustar 00root root 0000000 0000000 itemloaders-1.3.1/docs/_ext/__init__.py 0000664 0000000 0000000 00000000000 14627331724 0020023 0 ustar 00root root 0000000 0000000 itemloaders-1.3.1/docs/_ext/github.py 0000664 0000000 0000000 00000001403 14627331724 0017556 0 ustar 00root root 0000000 0000000 from typing import Optional
from docutils import nodes
from docutils.parsers.rst.roles import set_classes
def setup(app):
    """Sphinx extension entry point: register the ``gh`` role on ``app``."""
    role_name = "gh"
    app.add_role(role_name, github_role)
def github_role(
    name,
    rawtext,
    text,
    lineno,
    inliner,
    options: Optional[dict] = None,
    content: Optional[list] = None,
):
    """Build a reference node that links to scrapy/itemloaders on GitHub.

    When ``text`` consists only of digits it is treated as an issue number:
    the link label is ``#<text>`` and the target is the matching ``/issues/``
    URL. Any other ``text`` is treated as a commit hash; only its first seven
    characters are used, both as the label and in the ``/commit/`` URL.

    ``name``, ``lineno`` and ``inliner`` are accepted but not used here.

    Returns a ``([node], [])`` tuple: a one-element list holding the
    reference node, followed by an empty list.
    """
    # Falsy values (None or empty containers) are replaced by fresh objects.
    options = options if options else {}
    content = content if content else []

    if not text.isdigit():
        short_commit = text[:7]
        display_text = short_commit
        url = f"https://github.com/scrapy/itemloaders/commit/{short_commit}"
    else:
        display_text = f"#{text}"
        url = f"https://github.com/scrapy/itemloaders/issues/{text}"

    # NOTE(review): docutils helper; presumably normalises a "class" option
    # into "classes" in place -- confirm against the docutils documentation.
    set_classes(options)
    node = nodes.reference(rawtext, display_text, refuri=url, **options)
    return [node], []
itemloaders-1.3.1/docs/api-reference.rst 0000664 0000000 0000000 00000000151 14627331724 0020221 0 ustar 00root root 0000000 0000000 .. _api-reference:
API Reference
==================
.. autoclass:: itemloaders.ItemLoader
:members: itemloaders-1.3.1/docs/built-in-processors.rst 0000664 0000000 0000000 00000001050 14627331724 0021436 0 ustar 00root root 0000000 0000000 .. _built-in-processors:
Available built-in processors
=============================
Even though you can use any callable function as input and output processors,
``itemloaders`` provides some commonly used processors, which are described
below.
Some of them, like the :class:`~itemloaders.processors.MapCompose` (which is
typically used as input processor) compose the output of several functions
executed in order, to produce the final parsed value.
Here is a list of all built-in processors:
.. automodule:: itemloaders.processors
:members: itemloaders-1.3.1/docs/conf.py 0000664 0000000 0000000 00000016125 14627331724 0016271 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
#
# Scrapy documentation build configuration file, created by
# sphinx-quickstart on Mon Nov 24 12:02:52 2008.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# The contents of this file are pickled, so don't put values in the namespace
# that aren't pickleable (module imports are okay, they're removed automatically).
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
from os import path
import sphinx_rtd_theme
# If your extensions are in another directory, add it here. If the directory
# is relative to the documentation root, use os.path.abspath to make it
# absolute, like shown here.
sys.path.append(path.dirname(__file__))
sys.path.insert(0, path.dirname(path.dirname(__file__)))
# General configuration
# ---------------------
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
"_ext.github",
"sphinx.ext.autodoc",
"sphinx.ext.coverage",
"sphinx.ext.intersphinx",
"sphinx.ext.viewcode",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The suffix of source filenames.
source_suffix = ".rst"
# The encoding of source files.
# source_encoding = 'utf-8'
# The master toctree document.
master_doc = "index"
# General information about the project.
project = "itemloaders"
copyright = "Zyte Group Ltd"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = ""
release = ""
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
language = "en"
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
# today = ''
# Else, today_fmt is used as the format for a strftime call.
# today_fmt = '%B %d, %Y'
# List of documents that shouldn't be included in the build.
# unused_docs = []
exclude_patterns = ["build"]
# List of directories, relative to source directory, that shouldn't be searched
# for source files.
exclude_trees = [".build"]
# The reST default role (used for this markup: `text`) to use for all documents.
# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"
# Options for HTML output
# -----------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
# Add path to the RTD explicitly to robustify builds (otherwise might
# fail in a clean Debian build env)
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# The style sheet to use for HTML and HTML Help pages. A file of that name
# must exist either in Sphinx' static/ path, or in one of the custom paths
# given in html_static_path.
# html_style = 'scrapydoc.css'
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
# html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
# html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
html_last_updated_fmt = "%b %d, %Y"
# Custom sidebar templates, maps document names to template names.
# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
# html_additional_pages = {}
# If false, no module index is generated.
# html_use_modindex = True
# If false, no index is generated.
# html_use_index = True
# If true, the index is split into individual pages for each letter.
# html_split_index = False
# If true, the reST sources are included in the HTML build as _sources/.
html_copy_source = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
# html_use_opensearch = ''
# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
# html_file_suffix = ''
# Output file base name for HTML help builder.
htmlhelp_basename = "itemloadersdoc"
# Options for LaTeX output
# ------------------------
# The paper size ('letter' or 'a4').
# latex_paper_size = 'letter'
# The font size ('10pt', '11pt' or '12pt').
# latex_font_size = '10pt'
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, document class [howto/manual]).
latex_documents = [
("index", "itemloaders.tex", "itemloaders Documentation", "Zyte", "manual"),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
# latex_use_parts = False
# Additional stuff for the LaTeX preamble.
# latex_preamble = ''
# Documents to append as an appendix to all manuals.
# latex_appendices = []
# If false, no module index is generated.
# latex_use_modindex = True
# autodocs
def setup(app):
    """Connect ``maybe_skip_member`` to the ``autodoc-skip-member`` event."""
    event_name = "autodoc-skip-member"
    app.connect(event_name, maybe_skip_member)
def maybe_skip_member(app, what, name, obj, skip, options):
    """Decide whether a member should be skipped by autodoc.

    A truthy ``skip`` is returned unchanged. Otherwise the result is ``True``
    only for ``default_item_class`` and ``default_selector_class`` and
    ``False`` for every other ``name``.

    ``app``, ``what``, ``obj`` and ``options`` are accepted but not used.
    """
    if skip:
        return skip
    # autodoc was rendering an "alias of" text for these members, see
    # https://github.com/sphinx-doc/sphinx/issues/4422
    aliased_members = {"default_item_class", "default_selector_class"}
    return name in aliased_members
# Sphinx "nitpicky" setting is switched on for this documentation build.
nitpicky = True

# intersphinx targets: project name -> (base URL, inventory location).
# ``None`` as the second element is kept exactly as configured for every entry.
intersphinx_mapping = {
    "parsel": (
        "https://parsel.readthedocs.io/en/stable/",
        None,
    ),
    "python": (
        "https://docs.python.org/3",
        None,
    ),
    "scrapy": (
        "https://docs.scrapy.org/en/latest/",
        None,
    ),
    "w3lib": (
        "https://w3lib.readthedocs.io/en/latest",
        None,
    ),
}
itemloaders-1.3.1/docs/declaring-loaders.rst 0000664 0000000 0000000 00000003241 14627331724 0021076 0 ustar 00root root 0000000 0000000 .. currentmodule:: itemloaders
.. _declaring-loaders:
Declaring Item Loaders
======================
Item Loaders are declared by using a class definition syntax. Here is an example::
from itemloaders import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose, Join
class ProductLoader(ItemLoader):
default_output_processor = TakeFirst()
name_in = MapCompose(str.title)
name_out = Join()
# using a built-in processor
price_in = MapCompose(str.strip)
# using a function
def price_out(self, values):
return float(values[0])
loader = ProductLoader()
loader.add_value('name', 'plasma TV')
loader.add_value('price', '999.98')
loader.load_item()
# {'name': 'Plasma Tv', 'price': 999.98}
As you can see, input processors are declared using the ``_in`` suffix while
output processors are declared using the ``_out`` suffix. And you can also
declare default input/output processors using the
:attr:`ItemLoader.default_input_processor` and
:attr:`ItemLoader.default_output_processor` attributes.
The precedence order, for both input and output processors, is as follows:
1. Item Loader field-specific attributes: ``field_in`` and ``field_out`` (most
precedence)
2. Field metadata (``input_processor`` and ``output_processor`` keys).
Check out `itemadapter field metadata
<https://github.com/scrapy/itemadapter#metadata-support>`_ for more
information.
.. versionadded:: 1.0.1
3. Item Loader defaults: :attr:`ItemLoader.default_input_processor` and
:attr:`ItemLoader.default_output_processor` (least precedence)
See also: :ref:`extending-loaders`.
itemloaders-1.3.1/docs/extending-loaders.rst 0000664 0000000 0000000 00000003512 14627331724 0021134 0 ustar 00root root 0000000 0000000 .. _extending-loaders:
Reusing and extending Item Loaders
==================================
Item Loaders are designed to ease the maintenance burden of parsing rules,
without losing flexibility and, at the same time, providing a convenient
mechanism for extending and overriding them. For this reason Item Loaders
support traditional Python class inheritance for dealing with differences
in data schemas.
Suppose, for example, that you get some particular product names enclosed in
three dashes (e.g. ``---Plasma TV---``) and you don't want to end up with
those dashes in the final product names.
Here's how you can remove those dashes by reusing and extending the default
Product Item Loader (``ProductLoader``)::
from itemloaders.processors import MapCompose
from myproject.loaders import ProductLoader
def strip_dashes(x):
return x.strip('-')
class SiteSpecificLoader(ProductLoader):
name_in = MapCompose(strip_dashes, ProductLoader.name_in)
Another case where extending Item Loaders can be very helpful is when you have
multiple source formats, for example XML and HTML. In the XML version you may
want to remove ``CDATA`` occurrences. Here's an example of how to do it::
from itemloaders.processors import MapCompose
from myproject.loaders import ProductLoader
from myproject.utils.xml import remove_cdata
class XmlProductLoader(ProductLoader):
name_in = MapCompose(remove_cdata, ProductLoader.name_in)
And that's how you typically extend input/output processors.
There are many other possible ways to extend, inherit and override your Item
Loaders, and different Item Loaders hierarchies may fit better for different
projects. ``itemloaders`` only provides the mechanism; it doesn't impose any specific
organization of your Loaders collection - that's up to you and your project's
needs. itemloaders-1.3.1/docs/index.rst 0000664 0000000 0000000 00000007272 14627331724 0016636 0 ustar 00root root 0000000 0000000 .. currentmodule:: itemloaders
.. _topics-index:
============
itemloaders
============
``itemloaders`` provides a convenient mechanism for populating data records.
Its design provides a flexible, efficient and easy mechanism
for extending and overriding different field parsing rules, either by raw data,
or by source format (HTML, XML, etc) without becoming a nightmare to maintain.
To install ``itemloaders``, run::
pip install itemloaders
.. note:: Under the hood, ``itemloaders`` uses
`itemadapter <https://github.com/scrapy/itemadapter>`_ as a common interface.
This means you can use any of the types supported by ``itemadapter`` here.
.. warning:: ``dataclasses`` and ``attrs`` support is still experimental.
Please, refer to :attr:`~ItemLoader.default_item_class` in the
:ref:`api-reference` for more information.
Getting Started with ``itemloaders``
====================================
To use an Item Loader, you must first instantiate it. You can either
instantiate it with a dict-like object (`item`) or without one, in
which case an `item` is automatically instantiated in the Item Loader ``__init__`` method
using the `item` class specified in the :attr:`ItemLoader.default_item_class`
attribute.
Then, you start collecting values into the Item Loader, typically using
CSS or XPath Selectors. You can add more than one value to
the same item field; the Item Loader will know how to "join" those values later
using a proper processing function.
.. note:: Collected data is stored internally as lists,
allowing several values to be added to the same field.
If an ``item`` argument is passed when creating a loader,
each of the item's values will be stored as-is if it's already
an iterable, or wrapped with a list if it's a single value.
Here is a typical Item Loader usage::
from itemloaders import ItemLoader
from parsel import Selector
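html_data = '''
<!DOCTYPE html>
<html>
    <head>
        <title>Some random product page</title>
    </head>
    <body>
        <div class="product_name">Some random product page</div>
        <p id="price">$ 100.12</p>
    </body>
</html>
'''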
l = ItemLoader(selector=Selector(html_data))
l.add_xpath('name', '//div[@class="product_name"]/text()')
l.add_xpath('name', '//div[@class="product_title"]/text()')
l.add_css('price', '#price::text')
l.add_value('last_updated', 'today') # you can also use literal values
item = l.load_item()
item
# {'name': ['Some random product page'], 'price': ['$ 100.12'], 'last_updated': ['today']}
By quickly looking at that code, we can see the ``name`` field is being
extracted from two different XPath locations in the page:
1. ``//div[@class="product_name"]``
2. ``//div[@class="product_title"]``
In other words, data is being collected by extracting it from two XPath
locations, using the :meth:`~ItemLoader.add_xpath` method. This is the
data that will be assigned to the ``name`` field later.
Afterwards, similar calls are used for ``price`` field using a CSS selector with
the :meth:`~ItemLoader.add_css` method, and finally the ``last_updated`` field is
populated directly with a literal value
(``today``) using a different method: :meth:`~ItemLoader.add_value`.
Finally, when all data is collected, the :meth:`ItemLoader.load_item` method is
called which actually returns the item populated with the data
previously extracted and collected with the :meth:`~ItemLoader.add_xpath`,
:meth:`~ItemLoader.add_css`, and :meth:`~ItemLoader.add_value` calls.
Contents
--------
.. toctree::
declaring-loaders
processors
loaders-context
nested-loaders
extending-loaders
built-in-processors
api-reference
release-notes
itemloaders-1.3.1/docs/loaders-context.rst 0000664 0000000 0000000 00000003337 14627331724 0020640 0 ustar 00root root 0000000 0000000 .. currentmodule:: itemloaders
.. _loaders-context:
Item Loader Context
===================
The Item Loader Context is a mechanism that allows changing the behavior of the input/output processors.
It's just a ``dict`` of arbitrary key/values which is shared among all processors.
By default, the context contains the ``selector`` and any other `keyword arguments`
sent to the Loader's ``__init__``.
The context can be passed when declaring, instantiating or using an Item Loader.
For example, suppose you have a function ``parse_length`` which receives a text
value and extracts a length from it::
def parse_length(text, loader_context):
unit = loader_context.get('unit', 'm')
# ... length parsing code goes here ...
return parsed_length
By accepting a ``loader_context`` argument the function is explicitly telling
the Item Loader that it's able to receive an Item Loader context, so the Item
Loader passes the currently active context when calling it, and the processor
function (``parse_length`` in this case) can thus use them.
There are several ways to modify Item Loader context values:
1. By modifying the currently active Item Loader context
(:attr:`~ItemLoader.context` attribute)::
loader = ItemLoader(product)
loader.context['unit'] = 'cm'
2. On Item Loader instantiation (the keyword arguments of Item Loader
``__init__`` method are stored in the Item Loader context)::
loader = ItemLoader(product, unit='cm')
3. On Item Loader declaration, for those input/output processors that support
instantiating them with an Item Loader context. :class:`~processors.MapCompose` is one of
them::
class ProductLoader(ItemLoader):
length_out = MapCompose(parse_length, unit='cm')
itemloaders-1.3.1/docs/nested-loaders.rst 0000664 0000000 0000000 00000003102 14627331724 0020424 0 ustar 00root root 0000000 0000000 .. _nested-loaders:
Nested Loaders
==============
When parsing related values from a subsection of a document, it can be
useful to create nested loaders. Imagine you're extracting details from
a footer of a page that looks something like:
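Example::

    <footer>
        <a class="social" href="https://facebook.com/whatever">Like Us</a>
        <a class="social" href="https://twitter.com/whatever">Follow Us</a>
        <a class="email" href="mailto:whatever@example.com">Email Us</a>
    </footer>
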
Without nested loaders, you need to specify the full xpath (or css) for each value
that you wish to extract.
Example::
loader = ItemLoader()
# load stuff not in the footer
loader.add_xpath('social', '//footer/a[@class = "social"]/@href')
loader.add_xpath('email', '//footer/a[@class = "email"]/@href')
loader.load_item()
Instead, you can create a nested loader with the footer selector and add values
relative to the footer. The functionality is the same but you avoid repeating
the footer selector.
Example::
loader = ItemLoader()
# load stuff not in the footer
footer_loader = loader.nested_xpath('//footer')
footer_loader.add_xpath('social', 'a[@class = "social"]/@href')
footer_loader.add_xpath('email', 'a[@class = "email"]/@href')
# no need to call footer_loader.load_item()
loader.load_item()
You can nest loaders arbitrarily and they work with either xpath or css selectors.
As a general guideline, use nested loaders when they make your code simpler but do
not go overboard with nesting or your parser can become difficult to read.
itemloaders-1.3.1/docs/processors.rst 0000664 0000000 0000000 00000006751 14627331724 0017732 0 ustar 00root root 0000000 0000000 .. currentmodule:: itemloaders
.. _processors:
Input and Output processors
===========================
An Item Loader contains one input processor and one output processor for each
(item) field. The input processor processes the extracted data as soon as it's
received (through the :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css` or
:meth:`~ItemLoader.add_value` methods) and the result of the input processor is
collected and kept inside the ItemLoader. After collecting all data, the
:meth:`ItemLoader.load_item` method is called to populate and get the populated
item object. That's when the output processor is
called with the data previously collected (and processed using the input
processor). The result of the output processor is the final value that gets
assigned to the item.
Let's see an example to illustrate how the input and output processors are
called for a particular field (the same applies for any other field)::
l = ItemLoader(selector=some_selector)
l.add_xpath('name', xpath1) # (1)
l.add_xpath('name', xpath2) # (2)
l.add_css('name', css) # (3)
l.add_value('name', 'test') # (4)
return l.load_item() # (5)
So what happens is:
1. Data from ``xpath1`` is extracted, and passed through the *input processor* of
the ``name`` field. The result of the input processor is collected and kept in
the Item Loader (but not yet assigned to the item).
2. Data from ``xpath2`` is extracted, and passed through the same *input
processor* used in (1). The result of the input processor is appended to the
data collected in (1) (if any).
3. This case is similar to the previous ones, except that the data is extracted
from the ``css`` CSS selector, and passed through the same *input
processor* used in (1) and (2). The result of the input processor is appended to the
data collected in (1) and (2) (if any).
4. This case is also similar to the previous ones, except that the value to be
collected is assigned directly, instead of being extracted from an XPath
expression or a CSS selector.
However, the value is still passed through the input processors. In this
case, since the value is not iterable it is converted to an iterable of a
single element before passing it to the input processor, because input
processors always receive iterables.
5. The data collected in steps (1), (2), (3) and (4) is passed through
the *output processor* of the ``name`` field.
The result of the output processor is the value assigned to the ``name``
field in the item.
It's worth noticing that processors are just callable objects, which are called
with the data to be parsed, and return a parsed value. So you can use any
function as input or output processor. The only requirement is that they must
accept one (and only one) positional argument, which will be an iterable.
.. note:: Both input and output processors must receive an iterable as their
first argument. The output of those functions can be anything. The result of
input processors will be appended to an internal list (in the Loader)
containing the collected values (for that field). The result of the output
processors is the value that will be finally assigned to the item.
The other thing you need to keep in mind is that the values returned by input
processors are collected internally (in lists) and then passed to output
processors to populate the fields.
Last, but not least, ``itemloaders`` comes with some :ref:`commonly used processors
<built-in-processors>` built-in for convenience.
itemloaders-1.3.1/docs/release-notes.rst 0000664 0000000 0000000 00000007737 14627331724 0020303 0 ustar 00root root 0000000 0000000 .. currentmodule:: itemloaders
.. _release-notes:
Release notes
=============
.. _release-1.3.1:
itemloaders 1.3.1 (2024-06-03)
------------------------------
- Fixed an error when using nested loaders with empty matches that was
introduced in 1.3.0 (:gh:`88`)
.. _release-1.3.0:
itemloaders 1.3.0 (2024-05-30)
------------------------------
- Added support for method chaining to the ``add_*`` and ``replace_*``
methods, so you can now write code such as
``loader.add_xpath("name", "//body/text()").add_value("url", "http://example.com")``
(:gh:`81`)
- Added type hints and ``py.typed`` (:gh:`80`, :gh:`83`)
- Made the docs builds reproducible (:gh:`82`)
.. _release-1.2.0:
itemloaders 1.2.0 (2024-04-18)
------------------------------
- Added official support for Python 3.12 and PyPy 3.10 (:gh:`75`)
- Removed official support for Python 3.7 (:gh:`72`)
- Improved performance of ``itemloaders.utils.arg_to_iter`` (:gh:`51`)
- Fixed test expectations on recent Python versions (:gh:`77`)
- Improved CI (:gh:`78`)
.. _release-1.1.0:
itemloaders 1.1.0 (2023-04-21)
------------------------------
- Added JMESPath support (:meth:`ItemLoader.add_jmes` etc.), requiring Parsel
1.8.1+ (:gh:`68`)
- Added official support for Python 3.11 (:gh:`59`)
- Removed official support for Python 3.6 (:gh:`61`)
- Internal code cleanup (:gh:`65`, :gh:`66`)
- Added ``pre-commit`` support and applied changes from ``black`` and
``flake8`` (:gh:`70`).
- Improved CI (:gh:`60`)
.. _release-1.0.6:
itemloaders 1.0.6 (2022-08-29)
------------------------------
- Fixes a regression introduced in 1.0.5 that would cause the ``re`` parameter of
:meth:`ItemLoader.add_xpath` and similar methods to be passed to lxml, which
would trigger an exception when the value of ``re`` was a compiled pattern and
not a string (:gh:`56`)
.. _release-1.0.5:
itemloaders 1.0.5 (2022-08-25)
------------------------------
- Allow additional args to be passed when calling :meth:`ItemLoader.add_xpath` (:gh:`48`)
- Fixed missing space in an exception message (:gh:`47`)
- Updated company name in author and copyright sections (:gh:`42`)
- Added official support for Python 3.9 and improved PyPy compatibility (:gh:`44`)
- Added official support for Python 3.10 (:gh:`53`)
.. _release-1.0.4:
itemloaders 1.0.4 (2020-11-12)
------------------------------
- When adding a :class:`scrapy.item.scrapy.Item` object as a value into an
:class:`ItemLoader` object, that item is now added *as is*, instead of
becoming a :class:`list` of keys from its :attr:`scrapy.item.scrapy.Item.fields`
(:gh:`28`, :gh:`29`)
- Increased test coverage (:gh:`27`)
.. _release-1.0.3:
itemloaders 1.0.3 (2020-09-09)
------------------------------
- Calls to :meth:`ItemLoader.get_output_value` no longer affect the output of
:meth:`ItemLoader.load_item` (:gh:`21`, :gh:`22`)
- Fixed some documentation links (:gh:`19`, :gh:`23`)
- Fixed some test warnings (:gh:`24`)
.. _release-1.0.2:
itemloaders 1.0.2 (2020-08-05)
------------------------------
- Included the license file in the source releases (:gh:`13`)
- Cleaned up some remnants of Python 2 (:gh:`16`, :gh:`17`)
.. _release-1.0.1:
itemloaders 1.0.1 (2020-07-02)
------------------------------
- Extended item type support to all item types supported by itemadapter_
(:gh:`13`)
- :ref:`Input and output processors <processors>` defined in item
field metadata are now taken into account (:gh:`13`)
- Lowered some minimum dependency versions (:gh:`10`):
- :doc:`parsel <parsel:index>`: 1.5.2 → 1.5.0
- :doc:`w3lib <w3lib:index>`: 1.21.0 → 1.17.0
- Improved the README file (:gh:`9`)
- Improved continuous integration (:gh:`e62d95b`)
.. _release-1.0.0:
itemloaders 1.0.0 (2020-05-18)
------------------------------
- Initial release, based on a part of the :doc:`Scrapy <scrapy:index>` code base.
.. _itemadapter: https://github.com/scrapy/itemadapter#itemadapter
itemloaders-1.3.1/docs/requirements.txt 0000664 0000000 0000000 00000000042 14627331724 0020245 0 ustar 00root root 0000000 0000000 Sphinx>=3.0
sphinx_rtd_theme>=0.4
itemloaders-1.3.1/itemloaders/ 0000775 0000000 0000000 00000000000 14627331724 0016345 5 ustar 00root root 0000000 0000000 itemloaders-1.3.1/itemloaders/__init__.py 0000664 0000000 0000000 00000055010 14627331724 0020457 0 ustar 00root root 0000000 0000000 """
Item Loader
See documentation in docs/topics/loaders.rst
"""
from __future__ import annotations
from contextlib import suppress
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
MutableMapping,
Optional,
Pattern,
Union,
)
from itemadapter import ItemAdapter
from parsel import Selector
from parsel.utils import extract_regex, flatten
from itemloaders.common import wrap_loader_context
from itemloaders.processors import Identity
from itemloaders.utils import arg_to_iter
if TYPE_CHECKING:
# typing.Self requires Python 3.11
from typing_extensions import Self
def unbound_method(method: Callable[..., Any]) -> Callable[..., Any]:
    """
    Return the plain function behind ``method`` when it was defined outside
    of a class.

    This lets single-argument functions be assigned as input or output
    processors without having to declare an unused leading ``self``
    parameter.

    If ``method`` has a ``__qualname__`` without a dot and exposes a
    ``__func__`` attribute, that underlying function is returned. In every
    other case (dotted qualified name, or either attribute missing)
    ``method`` is returned unchanged.
    """
    try:
        qualified_name = method.__qualname__
        if "." not in qualified_name:
            # Objects without ``__func__`` raise AttributeError here and
            # fall through to the unchanged return below.
            return method.__func__  # type: ignore[attr-defined, no-any-return]
    except AttributeError:
        pass
    return method
class ItemLoader:
    """
    Return a new Item Loader for populating the given item. If no item is
    given, one is instantiated automatically using the class in
    :attr:`default_item_class`.

    When instantiated with a ``selector`` parameter the :class:`ItemLoader` class
    provides convenient mechanisms for extracting data from web pages
    using parsel_ selectors.

    :param item: The item instance to populate using subsequent calls to
        :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
        :meth:`~ItemLoader.add_jmes` or :meth:`~ItemLoader.add_value`.
    :type item: :class:`dict` object

    :param selector: The selector to extract data from, when using the
        :meth:`add_xpath` (resp. :meth:`add_css`, :meth:`add_jmes`) or :meth:`replace_xpath`
        (resp. :meth:`replace_css`, :meth:`replace_jmes`) method.
    :type selector: :class:`~parsel.selector.Selector` object

    The item, selector and the remaining keyword arguments are
    assigned to the Loader context (accessible through the :attr:`context` attribute).

    .. attribute:: item

        The item object being parsed by this Item Loader.
        This is mostly used as a property so when attempting to override this
        value, you may want to check out :attr:`default_item_class` first.

    .. attribute:: context

        The currently active :ref:`Context <loaders-context>` of this Item Loader.
        Refer to :ref:`loaders-context` for more information about the Loader Context.

    .. attribute:: default_item_class

        An Item class (or factory), used to instantiate items when not given in
        the ``__init__`` method.

        .. warning:: Currently, this factory/class needs to be
            callable/instantiated without any arguments.
            If you are using ``dataclasses``, please consider the following
            alternative::

                from dataclasses import dataclass, field
                from typing import Optional

                @dataclass
                class Product:
                    name: Optional[str] = field(default=None)
                    price: Optional[float] = field(default=None)

    .. attribute:: default_input_processor

        The default input processor to use for those fields which don't specify
        one.

    .. attribute:: default_output_processor

        The default output processor to use for those fields which don't specify
        one.

    .. attribute:: selector

        The :class:`~parsel.selector.Selector` object to extract data from.
        It's the selector given in the ``__init__`` method.
        This attribute is meant to be read-only.

    .. _parsel: https://parsel.readthedocs.io/en/latest/
    """

    default_item_class: type = dict
    default_input_processor: Callable[..., Any] = Identity()
    default_output_processor: Callable[..., Any] = Identity()

    def __init__(
        self,
        item: Any = None,
        selector: Optional[Selector] = None,
        parent: Optional[ItemLoader] = None,
        **context: Any,
    ):
        self.selector: Optional[Selector] = selector
        context.update(selector=selector)
        if item is None:
            item = self.default_item_class()
        self._local_item = item
        context["item"] = item
        self.context: MutableMapping[str, Any] = context
        self.parent: Optional[ItemLoader] = parent
        self._local_values: Dict[str, List[Any]] = {}
        # values from initial item; ``_values`` resolves to the parent's
        # storage when this loader is nested
        for field_name, value in ItemAdapter(item).items():
            self._values.setdefault(field_name, [])
            self._values[field_name] += arg_to_iter(value)

    @property
    def _values(self) -> Dict[str, List[Any]]:
        """Collected values; delegated to the parent loader when nested."""
        if self.parent is not None:
            return self.parent._values
        else:
            return self._local_values

    @property
    def item(self) -> Any:
        """Item being populated; delegated to the parent loader when nested."""
        if self.parent is not None:
            return self.parent.item
        else:
            return self._local_item

    def nested_xpath(self, xpath: str, **context: Any) -> Self:
        """
        Create a nested loader with an xpath selector.
        The supplied selector is applied relative to selector associated
        with this :class:`ItemLoader`. The nested loader shares the item
        with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
        :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.
        """
        self._check_selector_method()
        assert self.selector is not None
        selector = self.selector.xpath(xpath)
        context.update(selector=selector)
        subloader = self.__class__(item=self.item, parent=self, **context)
        return subloader

    def nested_css(self, css: str, **context: Any) -> Self:
        """
        Create a nested loader with a css selector.
        The supplied selector is applied relative to selector associated
        with this :class:`ItemLoader`. The nested loader shares the item
        with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
        :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.
        """
        self._check_selector_method()
        assert self.selector is not None
        selector = self.selector.css(css)
        context.update(selector=selector)
        subloader = self.__class__(item=self.item, parent=self, **context)
        return subloader

    def add_value(
        self,
        field_name: Optional[str],
        value: Any,
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Process and then add the given ``value`` for the given field.

        The value is first passed through :meth:`get_value` by giving the
        ``processors`` and ``kwargs``, and then passed through the
        :ref:`field input processor <processors>` and its result
        appended to the data collected for that field. If the field already
        contains collected data, the new data is added.

        The given ``field_name`` can be ``None``, in which case values for
        multiple fields may be added. And the processed value should be a dict
        with field_name mapped to values.

        If the processed value is ``None`` nothing is added, but the loader is
        still returned so that chained calls keep working.

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader

        Examples::

            loader.add_value('name', 'Color TV')
            loader.add_value('colours', ['white', 'blue'])
            loader.add_value('length', '100')
            loader.add_value('name', 'name: foo', TakeFirst(), re='name: (.+)')
            loader.add_value(None, {'name': 'foo', 'sex': 'male'})
        """
        value = self.get_value(value, *processors, re=re, **kw)
        if value is None:
            # Returning the loader (not None) keeps method chaining intact.
            return self
        if not field_name:
            for k, v in value.items():
                self._add_value(k, v)
        else:
            self._add_value(field_name, value)
        return self

    def replace_value(
        self,
        field_name: Optional[str],
        value: Any,
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`add_value` but replaces the collected data with the
        new value instead of adding it.

        If the processed value is ``None`` the collected data is left
        untouched, and the loader is still returned so that chained calls
        keep working.

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader
        """
        value = self.get_value(value, *processors, re=re, **kw)
        if value is None:
            # Returning the loader (not None) keeps method chaining intact.
            return self
        if not field_name:
            for k, v in value.items():
                self._replace_value(k, v)
        else:
            self._replace_value(field_name, value)
        return self

    def _add_value(self, field_name: str, value: Any) -> None:
        """Run ``value`` through the input processor and collect the result."""
        value = arg_to_iter(value)
        processed_value = self._process_input_value(field_name, value)
        # Falsy results (None, empty list, ...) are not collected.
        if processed_value:
            self._values.setdefault(field_name, [])
            self._values[field_name] += arg_to_iter(processed_value)

    def _replace_value(self, field_name: str, value: Any) -> None:
        """Drop the data collected for ``field_name``, then add ``value``."""
        self._values.pop(field_name, None)
        self._add_value(field_name, value)

    def get_value(
        self,
        value: Any,
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Any:
        """
        Process the given ``value`` by the given ``processors`` and keyword
        arguments.

        Available keyword arguments:

        :param re: a regular expression to use for extracting data from the
            given value using :func:`~parsel.utils.extract_regex` method,
            applied before processors
        :type re: str or typing.Pattern[str]

        :raises ValueError: if any of the processors raises an exception.

        Examples:

        >>> from itemloaders import ItemLoader
        >>> from itemloaders.processors import TakeFirst
        >>> loader = ItemLoader()
        >>> loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)')
        'FOO'
        """
        if re:
            value = arg_to_iter(value)
            value = flatten(extract_regex(re, x) for x in value)

        for proc in processors:
            # Stop the chain as soon as a processor yields None.
            if value is None:
                break
            # Keep the unwrapped processor for the error message.
            _proc = proc
            proc = wrap_loader_context(proc, self.context)
            try:
                value = proc(value)
            except Exception as e:
                raise ValueError(
                    "Error with processor %s value=%r error='%s: %s'"
                    % (_proc.__class__.__name__, value, type(e).__name__, str(e))
                ) from e
        return value

    def load_item(self) -> Any:
        """
        Populate the item with the data collected so far, and return it. The
        data collected is first passed through the :ref:`output processors
        <processors>` to get the final value to assign to each item field.
        """
        adapter = ItemAdapter(self.item)
        # Iterate over a snapshot of the collected field names.
        for field_name in tuple(self._values):
            value = self.get_output_value(field_name)
            if value is not None:
                adapter[field_name] = value

        return adapter.item

    def get_output_value(self, field_name: str) -> Any:
        """
        Return the collected values parsed using the output processor, for the
        given field. This method doesn't populate or modify the item at all.

        :raises ValueError: if the output processor raises an exception.
        """
        proc = self.get_output_processor(field_name)
        proc = wrap_loader_context(proc, self.context)
        value = self._values.get(field_name, [])
        try:
            return proc(value)
        except Exception as e:
            raise ValueError(
                "Error with output processor: field=%r value=%r error='%s: %s'"
                % (field_name, value, type(e).__name__, str(e))
            ) from e

    def get_collected_values(self, field_name: str) -> List[Any]:
        """Return the collected values for the given field."""
        return self._values.get(field_name, [])

    def get_input_processor(self, field_name: str) -> Callable[..., Any]:
        """
        Return the input processor for ``field_name``.

        Lookup order: the ``<field_name>_in`` attribute of the loader, then
        the ``input_processor`` key of the item field metadata, then
        :attr:`default_input_processor`.
        """
        proc = getattr(self, "%s_in" % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(
                field_name, "input_processor", self.default_input_processor
            )
        return unbound_method(proc)

    def get_output_processor(self, field_name: str) -> Callable[..., Any]:
        """
        Return the output processor for ``field_name``.

        Lookup order: the ``<field_name>_out`` attribute of the loader, then
        the ``output_processor`` key of the item field metadata, then
        :attr:`default_output_processor`.
        """
        proc = getattr(self, "%s_out" % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(
                field_name, "output_processor", self.default_output_processor
            )
        return unbound_method(proc)

    def _get_item_field_attr(
        self, field_name: str, key: Any, default: Any = None
    ) -> Any:
        """Return ``key`` from the metadata of ``field_name``, or ``default``."""
        field_meta = ItemAdapter(self.item).get_field_meta(field_name)
        return field_meta.get(key, default)

    def _process_input_value(self, field_name: str, value: Any) -> Any:
        """
        Apply the input processor of ``field_name`` to ``value``.

        :raises ValueError: if the input processor raises an exception.
        """
        proc = self.get_input_processor(field_name)
        # Keep the unwrapped processor for the error message.
        _proc = proc
        proc = wrap_loader_context(proc, self.context)
        try:
            return proc(value)
        except Exception as e:
            raise ValueError(
                "Error with input processor %s: field=%r value=%r "
                "error='%s: %s'"
                % (
                    _proc.__class__.__name__,
                    field_name,
                    value,
                    type(e).__name__,
                    str(e),
                )
            ) from e

    def _check_selector_method(self) -> None:
        """Raise :exc:`RuntimeError` if the loader has no selector."""
        if self.selector is None:
            raise RuntimeError(
                "To use XPath or CSS selectors, %s "
                "must be instantiated with a selector" % self.__class__.__name__
            )

    def add_xpath(
        self,
        field_name: Optional[str],
        xpath: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
        value, which is used to extract a list of strings from the
        selector associated with this :class:`ItemLoader`.

        See :meth:`get_xpath` for ``kwargs``.

        :param xpath: the XPath to extract data from
        :type xpath: str

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_xpath('name', '//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        return self.add_value(field_name, values, *processors, re=re, **kw)

    def replace_xpath(
        self,
        field_name: Optional[str],
        xpath: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`add_xpath` but replaces collected data instead of adding it.

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader
        """
        values = self._get_xpathvalues(xpath, **kw)
        return self.replace_value(field_name, values, *processors, re=re, **kw)

    def get_xpath(
        self,
        xpath: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Any:
        """
        Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
        value, which is used to extract a list of unicode strings from the
        selector associated with this :class:`ItemLoader`.

        :param xpath: the XPath to extract data from
        :type xpath: str

        :param re: a regular expression to use for extracting data from the
            selected XPath region
        :type re: str or typing.Pattern[str]

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.get_xpath('//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        return self.get_value(values, *processors, re=re, **kw)

    def _get_xpathvalues(
        self, xpaths: Union[str, Iterable[str]], **kw: Any
    ) -> List[Any]:
        """Return the flattened matches of every XPath in ``xpaths``."""
        self._check_selector_method()
        assert self.selector is not None
        xpaths = arg_to_iter(xpaths)
        return flatten(self.selector.xpath(xpath, **kw).getall() for xpath in xpaths)

    def add_css(
        self,
        field_name: Optional[str],
        css: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        See :meth:`get_css` for ``kwargs``.

        :param css: the CSS selector to extract data from
        :type css: str

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_css('name', 'p.product-name')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_css('price', 'p#price', re='the price is (.*)')
        """
        values = self._get_cssvalues(css)
        return self.add_value(field_name, values, *processors, re=re, **kw)

    def replace_css(
        self,
        field_name: Optional[str],
        css: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`add_css` but replaces collected data instead of adding it.

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader
        """
        values = self._get_cssvalues(css)
        return self.replace_value(field_name, values, *processors, re=re, **kw)

    def get_css(
        self,
        css: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Any:
        """
        Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        :param css: the CSS selector to extract data from
        :type css: str

        :param re: a regular expression to use for extracting data from the
            selected CSS region
        :type re: str or typing.Pattern[str]

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.get_css('p.product-name')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
        """
        values = self._get_cssvalues(css)
        return self.get_value(values, *processors, re=re, **kw)

    def _get_cssvalues(self, csss: Union[str, Iterable[str]]) -> List[Any]:
        """Return the flattened matches of every CSS selector in ``csss``."""
        self._check_selector_method()
        assert self.selector is not None
        csss = arg_to_iter(csss)
        return flatten(self.selector.css(css).getall() for css in csss)

    def add_jmes(
        self,
        field_name: Optional[str],
        jmes: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`ItemLoader.add_value` but receives a JMESPath selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        See :meth:`get_jmes` for ``kwargs``.

        :param jmes: the JMESPath selector to extract data from
        :type jmes: str

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader

        Examples::

            # JSON snippet: {"name": "Color TV"}
            loader.add_jmes('name', 'name')
            # JSON snippet: {"price": "the price is $1200"}
            loader.add_jmes('price', 'price', TakeFirst(), re='the price is (.*)')
        """
        values = self._get_jmesvalues(jmes)
        return self.add_value(field_name, values, *processors, re=re, **kw)

    def replace_jmes(
        self,
        field_name: Optional[str],
        jmes: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`add_jmes` but replaces collected data instead of adding it.

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader
        """
        values = self._get_jmesvalues(jmes)
        return self.replace_value(field_name, values, *processors, re=re, **kw)

    def get_jmes(
        self,
        jmes: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Any:
        """
        Similar to :meth:`ItemLoader.get_value` but receives a JMESPath selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        :param jmes: the JMESPath selector to extract data from
        :type jmes: str

        :param re: a regular expression to use for extracting data from the
            selected JMESPath
        :type re: str or typing.Pattern[str]

        Examples::

            # JSON snippet: {"name": "Color TV"}
            loader.get_jmes('name')
            # JSON snippet: {"price": "the price is $1200"}
            loader.get_jmes('price', TakeFirst(), re='the price is (.*)')
        """
        values = self._get_jmesvalues(jmes)
        return self.get_value(values, *processors, re=re, **kw)

    def _get_jmesvalues(self, jmess: Union[str, Iterable[str]]) -> List[Any]:
        """
        Return the flattened matches of every JMESPath expression in ``jmess``.

        :raises AttributeError: if the selector has no ``jmespath`` method.
        """
        self._check_selector_method()
        assert self.selector is not None
        jmess = arg_to_iter(jmess)
        if not hasattr(self.selector, "jmespath"):
            raise AttributeError(
                "Please install parsel >= 1.8.1 to get jmespath support"
            )
        return flatten(self.selector.jmespath(jmes).getall() for jmes in jmess)
itemloaders-1.3.1/itemloaders/common.py 0000664 0000000 0000000 00000001116 14627331724 0020206 0 ustar 00root root 0000000 0000000 """Common functions used in Item Loaders code"""
from functools import partial
from typing import Any, Callable, MutableMapping
from itemloaders.utils import get_func_args
def wrap_loader_context(
    function: Callable[..., Any], context: MutableMapping[str, Any]
) -> Callable[..., Any]:
    """Bind the loader context to functions that ask for it.

    If ``function`` declares a ``loader_context`` argument, return a
    :func:`functools.partial` of it with ``loader_context`` already set to
    ``context``. Otherwise ``function`` is returned untouched.
    """
    accepts_context = "loader_context" in get_func_args(function)
    if not accepts_context:
        return function
    return partial(function, loader_context=context)
itemloaders-1.3.1/itemloaders/processors.py 0000664 0000000 0000000 00000021532 14627331724 0021124 0 ustar 00root root 0000000 0000000 """
This module provides some commonly used processors for Item Loaders.
See documentation in docs/topics/loaders.rst
"""
from collections import ChainMap
from typing import Any, Callable, Iterable, List, MutableMapping, Optional
from itemloaders.common import wrap_loader_context
from itemloaders.utils import arg_to_iter
class MapCompose:
    """
    A processor built from a chain of functions that are each applied
    element by element, similar to the :class:`Compose` processor. The
    difference lies in how intermediate results travel along the chain:

    The input value of this processor is *iterated* and the first function is
    applied to every element. The results of those calls (one per element)
    are concatenated into a new list, which is then fed element by element to
    the second function, and so on until the last function has been applied.
    The concatenated results of the last function are the output of this
    processor.

    Each function may return a single value or a list of values; lists are
    flattened into the results produced for the other input values. A
    function may also return ``None``, in which case that result is dropped
    from further processing along the chain.

    This processor is a convenient way to compose functions that only work
    on single values (instead of iterables). For this reason the
    :class:`MapCompose` processor is typically used as input processor, since
    data is often extracted using the
    :meth:`~parsel.selector.Selector.extract` method of `parsel selectors`_,
    which returns a list of unicode strings.

    The example below should clarify how it works:

    >>> def filter_world(x):
    ...     return None if x == 'world' else x
    ...
    >>> from itemloaders.processors import MapCompose
    >>> proc = MapCompose(filter_world, str.upper)
    >>> proc(['hello', 'world', 'this', 'is', 'something'])
    ['HELLO', 'THIS', 'IS', 'SOMETHING']

    As with the Compose processor, functions can receive Loader contexts, and
    ``__init__`` method keyword arguments are used as default context values.
    See :class:`Compose` processor for more info.

    .. _`parsel selectors`: https://parsel.readthedocs.io/en/latest/parsel.html#parsel.selector.Selector.extract
    """  # noqa

    def __init__(self, *functions: Callable[..., Any], **default_loader_context: Any):
        self.functions = functions
        self.default_loader_context = default_loader_context

    def __call__(
        self, value: Any, loader_context: Optional[MutableMapping[str, Any]] = None
    ) -> Iterable[Any]:
        current = arg_to_iter(value)
        # Values of the active loader context shadow the defaults given at
        # construction time.
        context: MutableMapping[str, Any] = (
            ChainMap(loader_context, self.default_loader_context)
            if loader_context
            else self.default_loader_context
        )
        stages = [wrap_loader_context(f, context) for f in self.functions]
        for stage in stages:
            collected: List[Any] = []
            for element in current:
                try:
                    collected.extend(arg_to_iter(stage(element)))
                except Exception as exc:
                    raise ValueError(
                        "Error in MapCompose with "
                        "%s value=%r error='%s: %s'"
                        % (str(stage), value, type(exc).__name__, str(exc))
                    ) from exc
            current = collected
        return current
class Compose:
    """
    A processor built from the composition of the given functions.

    The input value is handed to the first function, its result to the second
    function, and so on; whatever the last function returns is the output of
    the processor.

    Processing stops as soon as the current value is ``None``. Pass the
    keyword argument ``stop_on_none=False`` to keep calling the remaining
    functions instead.

    Example:

    >>> from itemloaders.processors import Compose
    >>> proc = Compose(lambda v: v[0], str.upper)
    >>> proc(['hello', 'world'])
    'HELLO'

    Each function can optionally receive a ``loader_context`` parameter. For
    those which do, this processor passes the currently active Loader context
    through that parameter.

    The keyword arguments given to ``__init__`` are the default Loader
    context values. Values from the currently active Loader context take
    precedence over those defaults.
    """

    def __init__(self, *functions: Callable[..., Any], **default_loader_context: Any):
        self.functions = functions
        self.default_loader_context = default_loader_context
        # ``stop_on_none`` is read from the keyword arguments and also stays
        # part of the default context.
        self.stop_on_none = default_loader_context.get("stop_on_none", True)

    def __call__(
        self, value: Any, loader_context: Optional[MutableMapping[str, Any]] = None
    ) -> Any:
        """Feed *value* through the chain and return the final result.

        :raises ValueError: when a function raises; the original exception is
            chained as the cause
        """
        defaults = self.default_loader_context
        context: MutableMapping[str, Any] = (
            ChainMap(loader_context, defaults) if loader_context else defaults
        )

        for func in [wrap_loader_context(f, context) for f in self.functions]:
            if self.stop_on_none and value is None:
                return value
            try:
                value = func(value)
            except Exception as e:
                raise ValueError(
                    "Error in Compose with "
                    "%s value=%r error='%s: %s'"
                    % (str(func), value, type(e).__name__, str(e))
                ) from e
        return value
class TakeFirst:
    """
    Return the first value that is neither ``None`` nor the empty string.

    It is typically used as an output processor for single-valued fields.
    It takes no ``__init__`` arguments and does not accept Loader contexts.
    When no value qualifies, ``None`` is returned.

    Example:

    >>> from itemloaders.processors import TakeFirst
    >>> proc = TakeFirst()
    >>> proc(['', 'one', 'two', 'three'])
    'one'
    """

    def __call__(self, values: Any) -> Any:
        candidates = (v for v in values if v is not None and v != "")
        return next(candidates, None)
class Identity:
    """
    The no-op processor: it hands back the very object it was given.

    It takes no ``__init__`` arguments and does not accept Loader contexts.

    Example:

    >>> from itemloaders.processors import Identity
    >>> proc = Identity()
    >>> proc(['one', 'two', 'three'])
    ['one', 'two', 'three']
    """

    def __call__(self, values: Any) -> Any:
        """Return *values* untouched."""
        return values
class SelectJmes:
    """
    Run the JMESPath query given at instantiation against the input and
    return the result.

    Requires : jmespath(https://github.com/jmespath/jmespath)

    Note: SelectJmes accepts only one input element at a time.

    Example:

    >>> from itemloaders.processors import SelectJmes, Compose, MapCompose
    >>> proc = SelectJmes("foo") #for direct use on lists and dictionaries
    >>> proc({'foo': 'bar'})
    'bar'
    >>> proc({'foo': {'bar': 'baz'}})
    {'bar': 'baz'}

    Working with Json:

    >>> import json
    >>> proc_single_json_str = Compose(json.loads, SelectJmes("foo"))
    >>> proc_single_json_str('{"foo": "bar"}')
    'bar'
    >>> proc_json_list = Compose(json.loads, MapCompose(SelectJmes('foo')))
    >>> proc_json_list('[{"foo":"bar"}, {"baz":"tar"}]')
    ['bar']
    """

    def __init__(self, json_path: str):
        """Keep *json_path* and compile it once.

        ``jmespath`` is imported here rather than at module level.
        """
        self.json_path: str = json_path

        import jmespath.parser

        expression = self.json_path
        self.compiled_path: jmespath.parser.ParsedResult = jmespath.compile(
            expression
        )

    def __call__(self, value: Any) -> Any:
        """Query value for the jmespath query and return answer

        :param value: a data structure (dict, list) to extract from
        :return: Element extracted according to jmespath query
        """
        compiled = self.compiled_path
        return compiled.search(value)
class Join:
    """
    Join the values with the separator given to ``__init__`` (a single space
    by default). It does not accept Loader contexts.

    With the default separator this processor behaves like ``' '.join``.

    Examples:

    >>> from itemloaders.processors import Join
    >>> proc = Join()
    >>> proc(['one', 'two', 'three'])
    'one two three'
    >>> proc = Join(', ')
    >>> proc(['one', 'two', 'three'])
    'one, two, three'
    """

    def __init__(self, separator: str = " "):
        self.separator = separator

    def __call__(self, values: Any) -> str:
        """Return *values* concatenated with the configured separator."""
        glue = self.separator
        return glue.join(values)
itemloaders-1.3.1/itemloaders/py.typed 0000664 0000000 0000000 00000000000 14627331724 0020032 0 ustar 00root root 0000000 0000000 itemloaders-1.3.1/itemloaders/utils.py 0000664 0000000 0000000 00000003015 14627331724 0020056 0 ustar 00root root 0000000 0000000 """
Copy/paste from scrapy source at the moment, to ensure tests are working.
Refactoring to come later
"""
import inspect
from functools import partial
from typing import Any, Callable, Generator, Iterable, List
def arg_to_iter(arg: Any) -> Iterable[Any]:
    """Normalise *arg* into something that can be iterated over.

    * a list, a tuple or a generator is handed back unchanged;
    * ``None`` becomes an empty list;
    * any other object is wrapped in a one-element list, i.e. ``[arg]``.
    """
    if isinstance(arg, (list, tuple, Generator)):
        return arg
    return [] if arg is None else [arg]
def get_func_args(func: Callable[..., Any], stripself: bool = False) -> List[str]:
    """Return the argument name list of a callable object.

    :param func: the callable to inspect
    :param stripself: when true, drop a leading ``self`` from the result
    :return: parameter names in signature order; an empty list when
        ``inspect.signature`` cannot produce a signature for *func*. For a
        ``functools.partial`` object, parameters already bound by the partial
        are left out.
    :raises TypeError: if *func* is not callable
    """
    if not callable(func):
        raise TypeError(f"func must be callable, got {type(func).__name__!r}")

    try:
        sig = inspect.signature(func)
    except ValueError:
        # No introspectable signature is available for this callable.
        return []

    # ``inspect.signature`` already removes the parameters a partial bound
    # positionally, so only keyword-bound ones (still listed in the
    # signature) have to be filtered out. Comparing parameter names against
    # the positional *values* in ``func.args`` would wrongly drop a parameter
    # whose name happens to equal one of those values.
    bound_keywords = func.keywords if isinstance(func, partial) else {}
    args = [name for name in sig.parameters if name not in bound_keywords]

    if stripself and args and args[0] == "self":
        args = args[1:]
    return args
itemloaders-1.3.1/setup.cfg 0000664 0000000 0000000 00000000403 14627331724 0015653 0 ustar 00root root 0000000 0000000 [flake8]
ignore = E266, E501, E704, W503
max-line-length = 100
select = B,C,E,F,W,T4,B9
exclude = .git,__pycache__,.venv
[isort]
profile = black
[mypy]
[mypy-tests.*]
# Allow test functions to be untyped
allow_untyped_defs = true
check_untyped_defs = true
itemloaders-1.3.1/setup.py 0000664 0000000 0000000 00000003336 14627331724 0015554 0 ustar 00root root 0000000 0000000 from setuptools import find_packages, setup
# Read the long description with an explicit encoding so the build does not
# depend on the platform's default locale encoding.
with open("README.rst", encoding="utf-8") as f:
    long_description = f.read()


setup(
    name="itemloaders",
    version="1.3.1",
    url="https://github.com/scrapy/itemloaders",
    project_urls={
        "Documentation": "https://itemloaders.readthedocs.io/",
        "Source": "https://github.com/scrapy/itemloaders",
    },
    description="Base library for scrapy's ItemLoader",
    long_description=long_description,
    long_description_content_type="text/x-rst",
    author="Zyte",
    author_email="opensource@zyte.com",
    license="BSD",
    packages=find_packages(exclude=("tests", "tests.*")),
    package_data={
        # The ``py.typed`` marker ships inside the ``itemloaders`` package,
        # so the key must name that package for the marker to be installed.
        "itemloaders": ["py.typed"],
    },
    include_package_data=True,
    zip_safe=False,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: Implementation :: CPython",
        "Programming Language :: Python :: Implementation :: PyPy",
    ],
    python_requires=">=3.8",
    install_requires=[
        # before updating these versions, be sure they are not higher than
        # scrapy's requirements
        "w3lib>=1.17.0",
        "parsel>=1.5.0",
        "jmespath>=0.9.5",
        "itemadapter>=0.1.0",
    ],
    # extras_require=extras_require,
)
itemloaders-1.3.1/tests/ 0000775 0000000 0000000 00000000000 14627331724 0015177 5 ustar 00root root 0000000 0000000 itemloaders-1.3.1/tests/__init__.py 0000664 0000000 0000000 00000000000 14627331724 0017276 0 ustar 00root root 0000000 0000000 itemloaders-1.3.1/tests/test_base_loader.py 0000664 0000000 0000000 00000042610 14627331724 0021053 0 ustar 00root root 0000000 0000000 import unittest
from functools import partial
from itemloaders import ItemLoader
from itemloaders.processors import Compose, Identity, Join, MapCompose, TakeFirst
class CustomItemLoader(ItemLoader):
    """Loader whose ``name`` input processor calls ``.title()`` on each value."""

    name_in = MapCompose(lambda v: v.title())
class DefaultedItemLoader(ItemLoader):
    """Loader whose default input processor drops the last element of each value."""

    default_input_processor = MapCompose(lambda v: v[:-1])
# test processors
def processor_with_args(value, other=None, loader_context=None):
    """Return ``loader_context["key"]`` when present, otherwise *value*.

    :param value: the value being processed
    :param other: unused by this helper
    :param loader_context: mapping to look ``"key"`` up in; ``None`` (the
        default) is treated as "no context" instead of raising ``TypeError``
    """
    if loader_context is not None and "key" in loader_context:
        return loader_context["key"]
    return value
class BasicItemLoaderTest(unittest.TestCase):
    """Tests for ``ItemLoader`` and the bundled processors using ``dict`` items."""

    def test_load_item_using_default_loader(self):
        """The item passed in is returned, with every field stored as a list."""
        initial = {"summary": "lala"}
        loader = ItemLoader(item=initial)
        loader.add_value("name", "marta")
        item = loader.load_item()
        assert item is initial
        assert item["summary"] == ["lala"]
        assert item["name"] == ["marta"]

    def test_load_item_using_custom_loader(self):
        """A field-specific input processor is applied to added values."""
        loader = CustomItemLoader()
        loader.add_value("name", "marta")
        item = loader.load_item()
        assert item["name"] == ["Marta"]

    def test_load_item_ignore_none_field_values(self):
        """Fields whose output is ``None`` are left out; empty values are kept."""

        def validate_sku(value):
            # Let's assume a SKU is only digits.
            return value if value.isdigit() else None

        class MyLoader(ItemLoader):
            name_out = Compose(lambda vs: vs[0])  # take first which allows empty values
            price_out = Compose(TakeFirst(), float)
            sku_out = Compose(TakeFirst(), validate_sku)

        valid_fragment = "SKU: 1234"
        invalid_fragment = "SKU: not available"
        sku_re = "SKU: (.+)"

        loader = MyLoader(item={})
        # Should not return "sku: None".
        loader.add_value("sku", [invalid_fragment], re=sku_re)
        # Should not ignore empty values.
        loader.add_value("name", "")
        loader.add_value("price", ["0"])
        assert loader.load_item() == {"name": "", "price": 0.0}

        loader.replace_value("sku", [valid_fragment], re=sku_re)
        self.assertEqual(loader.load_item()["sku"], "1234")

    def test_self_referencing_loader(self):
        """An output processor method may read another field's output value."""

        class MyLoader(ItemLoader):
            url_out = TakeFirst()

            def img_url_out(self, values):
                return (self.get_output_value("url") or "") + values[0]

        loader = MyLoader(item={})
        loader.add_value("url", "http://example.com/")
        loader.add_value("img_url", "1234.png")
        assert loader.load_item() == {
            "url": "http://example.com/",
            "img_url": "http://example.com/1234.png",
        }

        loader = MyLoader(item={})
        loader.add_value("img_url", "1234.png")
        assert loader.load_item() == {"img_url": "1234.png"}

    def test_add_value(self):
        """``add_value`` appends to the values collected so far."""
        loader = CustomItemLoader()
        loader.add_value("name", "marta")
        assert loader.get_collected_values("name") == ["Marta"]
        assert loader.get_output_value("name") == ["Marta"]

        loader.add_value("name", "pepe")
        assert loader.get_collected_values("name") == ["Marta", "Pepe"]
        assert loader.get_output_value("name") == ["Marta", "Pepe"]

        # test add object value
        loader.add_value("summary", {"key": 1})
        assert loader.get_collected_values("summary") == [{"key": 1}]

        loader.add_value(None, "Jim", lambda x: {"name": x})
        assert loader.get_collected_values("name") == ["Marta", "Pepe", "Jim"]

    def test_add_zero(self):
        """A falsy value such as ``0`` is still collected."""
        loader = ItemLoader()
        loader.add_value("name", 0)
        assert loader.get_collected_values("name") == [0]

    def test_add_none(self):
        """Adding ``None`` collects nothing."""
        loader = ItemLoader()
        loader.add_value("name", None)
        assert loader.get_collected_values("name") == []

    def test_replace_value(self):
        """``replace_value`` discards what was collected before."""
        loader = CustomItemLoader()
        loader.replace_value("name", "marta")
        self.assertEqual(loader.get_collected_values("name"), ["Marta"])
        self.assertEqual(loader.get_output_value("name"), ["Marta"])

        loader.replace_value("name", "pepe")
        self.assertEqual(loader.get_collected_values("name"), ["Pepe"])
        self.assertEqual(loader.get_output_value("name"), ["Pepe"])

        loader.replace_value(None, "Jim", lambda x: {"name": x})
        self.assertEqual(loader.get_collected_values("name"), ["Jim"])

    def test_replace_value_none(self):
        """Replacing with ``None`` leaves the collected values untouched."""
        loader = CustomItemLoader()
        loader.replace_value("name", None)
        self.assertEqual(loader.get_collected_values("name"), [])

        loader.replace_value("name", "marta")
        self.assertEqual(loader.get_collected_values("name"), ["Marta"])

        # when replacing with `None` nothing should happen
        loader.replace_value("name", None)
        self.assertEqual(loader.get_collected_values("name"), ["Marta"])

    def test_get_value(self):
        """``get_value`` applies the given processors and the ``re`` argument."""
        loader = ItemLoader()
        self.assertEqual(
            "FOO", loader.get_value(["foo", "bar"], TakeFirst(), str.upper)
        )
        self.assertEqual(
            ["foo", "bar"],
            loader.get_value(["name:foo", "name:bar"], re="name:(.*)$"),
        )
        self.assertEqual(
            "foo",
            loader.get_value(["name:foo", "name:bar"], TakeFirst(), re="name:(.*)$"),
        )
        self.assertEqual(
            None, loader.get_value(["foo", "bar"], TakeFirst(), re="name:(.*)$")
        )
        self.assertEqual(None, loader.get_value(None, TakeFirst()))

        loader.add_value(
            "name", ["name:foo", "name:bar"], TakeFirst(), re="name:(.*)$"
        )
        self.assertEqual(["foo"], loader.get_collected_values("name"))

        loader.replace_value("name", "name:bar", re="name:(.*)$")
        self.assertEqual(["bar"], loader.get_collected_values("name"))

    def test_iter_on_input_processor_input(self):
        """Input processors receive an iterable even for single values."""

        class NameFirstItemLoader(ItemLoader):
            name_in = TakeFirst()

        loader = NameFirstItemLoader()
        loader.add_value("name", "marta")
        self.assertEqual(loader.get_collected_values("name"), ["marta"])

        loader = NameFirstItemLoader()
        loader.add_value("name", ["marta", "jose"])
        self.assertEqual(loader.get_collected_values("name"), ["marta"])

        loader = NameFirstItemLoader()
        loader.replace_value("name", "marta")
        self.assertEqual(loader.get_collected_values("name"), ["marta"])

        loader = NameFirstItemLoader()
        loader.replace_value("name", ["marta", "jose"])
        self.assertEqual(loader.get_collected_values("name"), ["marta"])

        loader = NameFirstItemLoader()
        loader.add_value("name", "marta")
        loader.add_value("name", ["jose", "pedro"])
        self.assertEqual(loader.get_collected_values("name"), ["marta", "jose"])

    def test_map_compose_filter(self):
        """A function returning ``None`` drops that value from the chain."""

        def filter_world(x):
            return None if x == "world" else x

        proc = MapCompose(filter_world, str.upper)
        self.assertEqual(
            proc(["hello", "world", "this", "is", "scrapy"]),
            ["HELLO", "THIS", "IS", "SCRAPY"],
        )

    def test_map_compose_filter_multil(self):
        """Several ``MapCompose`` functions are applied one after another."""

        class CustomItemLoader(ItemLoader):
            name_in = MapCompose(lambda v: v.title(), lambda v: v[:-1])

        loader = CustomItemLoader()
        loader.add_value("name", "marta")
        self.assertEqual(loader.get_output_value("name"), ["Mart"])
        item = loader.load_item()
        self.assertEqual(item["name"], ["Mart"])

    def test_default_input_processor(self):
        """The default input processor is used for fields without their own."""
        loader = DefaultedItemLoader()
        loader.add_value("name", "marta")
        self.assertEqual(loader.get_output_value("name"), ["mart"])

    def test_inherited_default_input_processor(self):
        """Subclasses inherit the default input processor."""

        class InheritDefaultedItemLoader(DefaultedItemLoader):
            pass

        loader = InheritDefaultedItemLoader()
        loader.add_value("name", "marta")
        self.assertEqual(loader.get_output_value("name"), ["mart"])

    def test_input_processor_inheritance(self):
        """Field input processors are inherited and can be overridden."""

        class ChildItemLoader(CustomItemLoader):
            url_in = MapCompose(lambda v: v.lower())

        loader = ChildItemLoader()
        loader.add_value("url", "HTTP://scrapy.ORG")
        self.assertEqual(loader.get_output_value("url"), ["http://scrapy.org"])
        loader.add_value("name", "marta")
        self.assertEqual(loader.get_output_value("name"), ["Marta"])

        class ChildChildItemLoader(ChildItemLoader):
            url_in = MapCompose(lambda v: v.upper())
            summary_in = MapCompose(lambda v: v)

        loader = ChildChildItemLoader()
        loader.add_value("url", "http://scrapy.org")
        self.assertEqual(loader.get_output_value("url"), ["HTTP://SCRAPY.ORG"])
        loader.add_value("name", "marta")
        self.assertEqual(loader.get_output_value("name"), ["Marta"])

    def test_empty_map_compose(self):
        """An empty ``MapCompose`` passes values through unchanged."""

        class IdentityDefaultedItemLoader(DefaultedItemLoader):
            name_in = MapCompose()

        loader = IdentityDefaultedItemLoader()
        loader.add_value("name", "marta")
        self.assertEqual(loader.get_output_value("name"), ["marta"])

    def test_identity_input_processor(self):
        """``Identity`` as a field input processor overrides the default one."""

        class IdentityDefaultedItemLoader(DefaultedItemLoader):
            name_in = Identity()

        loader = IdentityDefaultedItemLoader()
        loader.add_value("name", "marta")
        self.assertEqual(loader.get_output_value("name"), ["marta"])

    def test_extend_custom_input_processors(self):
        """A parent's field processor can be reused inside a new chain."""

        class ChildItemLoader(CustomItemLoader):
            name_in = MapCompose(CustomItemLoader.name_in, str.swapcase)

        loader = ChildItemLoader()
        loader.add_value("name", "marta")
        self.assertEqual(loader.get_output_value("name"), ["mARTA"])

    def test_extend_default_input_processors(self):
        """A parent's default processor can be reused inside a new chain."""

        class ChildDefaultedItemLoader(DefaultedItemLoader):
            name_in = MapCompose(
                DefaultedItemLoader.default_input_processor, str.swapcase
            )

        loader = ChildDefaultedItemLoader()
        loader.add_value("name", "marta")
        self.assertEqual(loader.get_output_value("name"), ["MART"])

    def test_output_processor_using_function(self):
        """A plain callable can serve as an output processor."""
        loader = CustomItemLoader()
        loader.add_value("name", ["mar", "ta"])
        self.assertEqual(loader.get_output_value("name"), ["Mar", "Ta"])

        class TakeFirstItemLoader(CustomItemLoader):
            name_out = " ".join

        loader = TakeFirstItemLoader()
        loader.add_value("name", ["mar", "ta"])
        self.assertEqual(loader.get_output_value("name"), "Mar Ta")

    def test_output_processor_error(self):
        """A failing output processor raises a descriptive ``ValueError``."""

        class CustomItemLoader(ItemLoader):
            name_out = MapCompose(float)

        loader = CustomItemLoader()
        loader.add_value("name", ["$10"])
        try:
            float("$10")
        except Exception as e:
            expected_exc_str = str(e)

        exc = None
        try:
            loader.load_item()
        except Exception as e:
            exc = e
        assert isinstance(exc, ValueError)
        s = str(exc)
        assert "name" in s, s
        assert "$10" in s, s
        assert "ValueError" in s, s
        assert expected_exc_str in s, s

    def test_output_processor_using_classes(self):
        """Processor instances can serve as output processors."""
        loader = CustomItemLoader()
        loader.add_value("name", ["mar", "ta"])
        self.assertEqual(loader.get_output_value("name"), ["Mar", "Ta"])

        class TakeFirstItemLoader1(CustomItemLoader):
            name_out = Join()

        loader = TakeFirstItemLoader1()
        loader.add_value("name", ["mar", "ta"])
        self.assertEqual(loader.get_output_value("name"), "Mar Ta")

        class TakeFirstItemLoader2(CustomItemLoader):
            name_out = Join(" ")

        loader = TakeFirstItemLoader2()
        loader.add_value("name", ["mar", "ta"])
        self.assertEqual(loader.get_output_value("name"), "Mar Ta")

    def test_default_output_processor(self):
        """Setting ``default_output_processor`` to ``Identity`` keeps the list."""
        loader = CustomItemLoader()
        loader.add_value("name", ["mar", "ta"])
        self.assertEqual(loader.get_output_value("name"), ["Mar", "Ta"])

        class LalaItemLoader(CustomItemLoader):
            default_output_processor = Identity()

        loader = LalaItemLoader()
        loader.add_value("name", ["mar", "ta"])
        self.assertEqual(loader.get_output_value("name"), ["Mar", "Ta"])

    def test_loader_context_on_declaration(self):
        """Context values given to the processor reach its functions."""

        class ChildItemLoader(CustomItemLoader):
            url_in = MapCompose(processor_with_args, key="val")

        loader = ChildItemLoader()
        loader.add_value("url", "text")
        self.assertEqual(loader.get_output_value("url"), ["val"])
        loader.replace_value("url", "text2")
        self.assertEqual(loader.get_output_value("url"), ["val"])

    def test_loader_context_on_instantiation(self):
        """Context values given to the loader constructor reach the functions."""

        class ChildItemLoader(CustomItemLoader):
            url_in = MapCompose(processor_with_args)

        loader = ChildItemLoader(key="val")
        loader.add_value("url", "text")
        self.assertEqual(loader.get_output_value("url"), ["val"])
        loader.replace_value("url", "text2")
        self.assertEqual(loader.get_output_value("url"), ["val"])

    def test_loader_context_on_assign(self):
        """Context values assigned on ``loader.context`` reach the functions."""

        class ChildItemLoader(CustomItemLoader):
            url_in = MapCompose(processor_with_args)

        loader = ChildItemLoader()
        loader.context["key"] = "val"
        loader.add_value("url", "text")
        self.assertEqual(loader.get_output_value("url"), ["val"])
        loader.replace_value("url", "text2")
        self.assertEqual(loader.get_output_value("url"), ["val"])

    def test_item_passed_to_input_processor_functions(self):
        """The item being loaded is available as ``loader_context["item"]``."""

        def processor(value, loader_context):
            return loader_context["item"]["name"]

        class ChildItemLoader(CustomItemLoader):
            url_in = MapCompose(processor)

        seed = {"name": "marta"}
        loader = ChildItemLoader(item=seed)
        loader.add_value("url", "text")
        self.assertEqual(loader.get_output_value("url"), ["marta"])
        loader.replace_value("url", "text2")
        self.assertEqual(loader.get_output_value("url"), ["marta"])

    # def test_add_value_on_unknown_field(self):
    #     il = CustomItemLoader()
    #     self.assertRaises(KeyError, il.add_value, 'wrong_field', ['lala', 'lolo'])

    def test_compose_processor(self):
        """``Compose`` chains its functions over the whole list of values."""

        class CustomItemLoader(ItemLoader):
            name_out = Compose(lambda v: v[0], lambda v: v.title(), lambda v: v[:-1])

        loader = CustomItemLoader()
        loader.add_value("name", ["marta", "other"])
        self.assertEqual(loader.get_output_value("name"), "Mart")
        item = loader.load_item()
        self.assertEqual(item["name"], "Mart")

    def test_partial_processor(self):
        """``functools.partial`` objects work as processor functions."""

        def join(values, sep=None, loader_context=None, ignored=None):
            if sep is not None:
                return sep.join(values)
            elif loader_context and "sep" in loader_context:
                return loader_context["sep"].join(values)
            else:
                return "".join(values)

        class CustomItemLoader(ItemLoader):
            name_out = Compose(partial(join, sep="+"))
            url_out = Compose(partial(join, loader_context={"sep": "."}))
            summary_out = Compose(partial(join, ignored="foo"))

        loader = CustomItemLoader()
        loader.add_value("name", ["rabbit", "hole"])
        loader.add_value("url", ["rabbit", "hole"])
        loader.add_value("summary", ["rabbit", "hole"])
        item = loader.load_item()
        self.assertEqual(item["name"], "rabbit+hole")
        self.assertEqual(item["url"], "rabbit.hole")
        self.assertEqual(item["summary"], "rabbithole")

    def test_error_input_processor(self):
        """A failing input processor surfaces as ``ValueError`` in ``add_value``."""

        class CustomItemLoader(ItemLoader):
            name_in = MapCompose(float)

        loader = CustomItemLoader()
        self.assertRaises(ValueError, loader.add_value, "name", ["marta", "other"])

    def test_error_output_processor(self):
        """A failing output processor surfaces as ``ValueError`` in ``load_item``."""

        class CustomItemLoader(ItemLoader):
            name_out = Compose(Join(), float)

        loader = CustomItemLoader()
        loader.add_value("name", "marta")
        with self.assertRaises(ValueError):
            loader.load_item()

    def test_error_processor_as_argument(self):
        """A failing processor passed to ``add_value`` raises ``ValueError``."""
        loader = CustomItemLoader()
        self.assertRaises(
            ValueError, loader.add_value, "name", ["marta", "other"], Compose(float)
        )

    def test_get_unset_value(self):
        """Reading the output of an unset field does not add it to the item."""
        loader = ItemLoader()
        self.assertEqual(loader.load_item(), {})
        self.assertEqual(loader.get_output_value("foo"), [])
        self.assertEqual(loader.load_item(), {})
class BaseNoInputReprocessingLoader(ItemLoader):
    """Loader that upper-cases ``title`` on input and outputs its first value."""

    title_out = TakeFirst()
    title_in = MapCompose(str.upper)
class NoInputReprocessingDictLoader(BaseNoInputReprocessingLoader):
    """Variant of the base loader that sets ``default_item_class`` to ``dict``."""

    default_item_class = dict
class NoInputReprocessingFromDictTest(unittest.TestCase):
    """
    Loaders initialized from loaded items must not reprocess fields (dict instances)
    """

    def test_avoid_reprocessing_with_initial_values_single(self):
        """An initial single value is not run through the input processor."""
        loader = NoInputReprocessingDictLoader(item={"title": "foo"})
        loaded = loader.load_item()
        self.assertEqual(loaded, {"title": "foo"})
        reloaded = NoInputReprocessingDictLoader(item=loaded).load_item()
        self.assertEqual(reloaded, {"title": "foo"})

    def test_avoid_reprocessing_with_initial_values_list(self):
        """An initial list is not run through the input processor."""
        loader = NoInputReprocessingDictLoader(item={"title": ["foo", "bar"]})
        loaded = loader.load_item()
        self.assertEqual(loaded, {"title": "foo"})
        reloaded = NoInputReprocessingDictLoader(item=loaded).load_item()
        self.assertEqual(reloaded, {"title": "foo"})

    def test_avoid_reprocessing_without_initial_values_single(self):
        """A value added once is processed once, even after reloading."""
        loader = NoInputReprocessingDictLoader()
        loader.add_value("title", "foo")
        loaded = loader.load_item()
        self.assertEqual(loaded, {"title": "FOO"})
        reloaded = NoInputReprocessingDictLoader(item=loaded).load_item()
        self.assertEqual(reloaded, {"title": "FOO"})

    def test_avoid_reprocessing_without_initial_values_list(self):
        """A list added once is processed once, even after reloading."""
        loader = NoInputReprocessingDictLoader()
        loader.add_value("title", ["foo", "bar"])
        loaded = loader.load_item()
        self.assertEqual(loaded, {"title": "FOO"})
        reloaded = NoInputReprocessingDictLoader(item=loaded).load_item()
        self.assertEqual(reloaded, {"title": "FOO"})
itemloaders-1.3.1/tests/test_loader_initialization.py 0000664 0000000 0000000 00000011115 14627331724 0023164 0 ustar 00root root 0000000 0000000 import unittest
from typing import Any, Protocol
from itemloaders import ItemLoader
class InitializationTestProtocol(Protocol):
    """Structural type for ``self`` in the initialization test mixin.

    It declares the ``item_class`` attribute and the two assertion methods
    that the mixin's tests call.
    """

    item_class: Any

    def assertEqual(self, first: Any, second: Any, msg: Any = ...) -> None:
        ...

    def assertIsInstance(self, obj: object, cls: type, msg: Any = None) -> None:
        ...
class InitializationTestMixin:
    """Tests for loaders created from an existing item.

    Concrete test cases set ``item_class`` to the item type under test.
    """

    item_class: Any = None

    def test_keep_single_value(self: InitializationTestProtocol) -> None:
        """Loaded item should contain values from the initial item"""
        seed = self.item_class(name="foo")
        loader = ItemLoader(item=seed)
        loaded = loader.load_item()
        self.assertIsInstance(loaded, self.item_class)
        self.assertEqual(dict(loaded), {"name": ["foo"]})

    def test_keep_list(self: InitializationTestProtocol) -> None:
        """Loaded item should contain values from the initial item"""
        seed = self.item_class(name=["foo", "bar"])
        loader = ItemLoader(item=seed)
        loaded = loader.load_item()
        self.assertIsInstance(loaded, self.item_class)
        self.assertEqual(dict(loaded), {"name": ["foo", "bar"]})

    def test_add_value_singlevalue_singlevalue(
        self: InitializationTestProtocol,
    ) -> None:
        """Values added after initialization should be appended"""
        seed = self.item_class(name="foo")
        loader = ItemLoader(item=seed)
        loader.add_value("name", "bar")
        loaded = loader.load_item()
        self.assertIsInstance(loaded, self.item_class)
        self.assertEqual(dict(loaded), {"name": ["foo", "bar"]})

    def test_add_value_singlevalue_list(self: InitializationTestProtocol) -> None:
        """Values added after initialization should be appended"""
        seed = self.item_class(name="foo")
        loader = ItemLoader(item=seed)
        loader.add_value("name", ["item", "loader"])
        loaded = loader.load_item()
        self.assertIsInstance(loaded, self.item_class)
        self.assertEqual(dict(loaded), {"name": ["foo", "item", "loader"]})

    def test_add_value_list_singlevalue(self: InitializationTestProtocol) -> None:
        """Values added after initialization should be appended"""
        seed = self.item_class(name=["foo", "bar"])
        loader = ItemLoader(item=seed)
        loader.add_value("name", "qwerty")
        loaded = loader.load_item()
        self.assertIsInstance(loaded, self.item_class)
        self.assertEqual(dict(loaded), {"name": ["foo", "bar", "qwerty"]})

    def test_add_value_list_list(self: InitializationTestProtocol) -> None:
        """Values added after initialization should be appended"""
        seed = self.item_class(name=["foo", "bar"])
        loader = ItemLoader(item=seed)
        loader.add_value("name", ["item", "loader"])
        loaded = loader.load_item()
        self.assertIsInstance(loaded, self.item_class)
        self.assertEqual(dict(loaded), {"name": ["foo", "bar", "item", "loader"]})

    def test_get_output_value_singlevalue(self: InitializationTestProtocol) -> None:
        """Getting output value must not remove value from item"""
        seed = self.item_class(name="foo")
        loader = ItemLoader(item=seed)
        self.assertEqual(loader.get_output_value("name"), ["foo"])
        loaded = loader.load_item()
        self.assertIsInstance(loaded, self.item_class)
        self.assertEqual(loaded, {"name": ["foo"]})

    def test_get_output_value_list(self: InitializationTestProtocol) -> None:
        """Getting output value must not remove value from item"""
        seed = self.item_class(name=["foo", "bar"])
        loader = ItemLoader(item=seed)
        self.assertEqual(loader.get_output_value("name"), ["foo", "bar"])
        loaded = loader.load_item()
        self.assertIsInstance(loaded, self.item_class)
        self.assertEqual(loaded, {"name": ["foo", "bar"]})

    def test_values_single(self: InitializationTestProtocol) -> None:
        """Values from initial item must be added to loader._values"""
        seed = self.item_class(name="foo")
        loader = ItemLoader(item=seed)
        self.assertEqual(loader._values.get("name"), ["foo"])

    def test_values_list(self: InitializationTestProtocol) -> None:
        """Values from initial item must be added to loader._values"""
        seed = self.item_class(name=["foo", "bar"])
        loader = ItemLoader(item=seed)
        self.assertEqual(loader._values.get("name"), ["foo", "bar"])
class InitializationFromDictTest(InitializationTestMixin, unittest.TestCase):
    """Run the initialization tests with ``dict`` as the item class."""

    item_class = dict
itemloaders-1.3.1/tests/test_nested_items.py 0000664 0000000 0000000 00000002445 14627331724 0021300 0 ustar 00root root 0000000 0000000 import unittest
from typing import Any
from itemloaders import ItemLoader
class NestedItemTest(unittest.TestCase):
    """Test that adding items as values works as expected."""

    def _test_item(self, item: Any) -> None:
        """Add *item* as a field value and check it is loaded as one element."""
        loader = ItemLoader()
        loader.add_value("item_list", item)
        self.assertEqual(loader.load_item(), {"item_list": [item]})

    def test_attrs(self):
        """An ``attrs`` instance is kept as a single value."""
        try:
            import attr
        except ImportError:
            self.skipTest("Cannot import attr")

        @attr.s
        class TestItem:
            foo = attr.ib()

        self._test_item(TestItem(foo="bar"))

    def test_dataclass(self):
        """A dataclass instance is kept as a single value."""
        try:
            from dataclasses import dataclass
        except ImportError:
            self.skipTest("Cannot import dataclasses.dataclass")

        @dataclass
        class TestItem:
            foo: str

        self._test_item(TestItem(foo="bar"))

    def test_dict(self):
        """A ``dict`` is kept as a single value."""
        self._test_item({"foo": "bar"})

    def test_scrapy_item(self):
        """A Scrapy ``Item`` is kept as a single value."""
        try:
            from scrapy import Field, Item
        except ImportError:
            self.skipTest("Cannot import Field or Item from scrapy")

        # needs py.typed in Scrapy
        class TestItem(Item):  # type: ignore[misc]
            foo = Field()

        self._test_item(TestItem(foo="bar"))
itemloaders-1.3.1/tests/test_nested_loader.py 0000664 0000000 0000000 00000010573 14627331724 0021426 0 ustar 00root root 0000000 0000000 import unittest
from parsel import Selector
from itemloaders import ItemLoader
class SubselectorLoaderTest(unittest.TestCase):
selector = Selector(
text="""